diff --git a/.gitignore b/.gitignore index 99a7d61d6..b59adcbe5 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,5 @@ TARGETS -build_config.mk +make_config.mk *.a *.arc @@ -20,6 +20,7 @@ build_config.mk *.d-e *.o-* *.swp +*~ ldb manifest_dump @@ -28,8 +29,22 @@ util/build_version.cc build_tools/VALGRIND_LOGS/ coverage/COVERAGE_REPORT .gdbhistory +package/ .phutil_module_cache +unity tags + +java/out +java/target +java/test-libs java/*.log java/include/org_rocksdb_*.h + +.idea/ +*.iml + unity.cc +java/crossbuild/.vagrant +.vagrant/ +java/**.asc +java/javadoc diff --git a/.travis.yml b/.travis.yml index 66f37a5d2..ad2129d27 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,6 @@ before_install: - sudo dpkg -i libgflags-dev_2.0-1_amd64.deb # Lousy hack to disable use and testing of fallocate, which doesn't behave quite # as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment. - - sed -i "s/fallocate(/HACK_NO_fallocate(/" build_tools/build_detect_platform -script: make check -j8 +script: OPT=-DTRAVIS make unity && make clean && OPT=-DTRAVIS make check notifications: email: false diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 000000000..e644f5530 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,11 @@ +Facebook Inc. +Facebook Engineering Team + +Google Inc. +# Initial version authors: +Jeffrey Dean +Sanjay Ghemawat + +# Partial list of contributors: +Kevin Regan +Johan Bilien diff --git a/HISTORY.md b/HISTORY.md index 420377cbf..bef3e1ff1 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,78 @@ # Rocksdb Change Log -### Unreleased +### Unreleased Features +* Changed the LRU caching algorithm so that referenced blocks (by iterators) are never evicted +* By default we now optimize the compilation for the compilation platform (using -march=native). If you want to build portable binary, use 'PORTABLE=1' before the make command. 
+* We now allow level-compaction to place files in different paths by + specifying them in db_paths along with the target_size. + Lower numbered levels will be placed earlier in the db_paths and higher + numbered levels will be placed later in the db_paths vector. +* Potentially big performance improvements if you're using RocksDB with lots of column families (100-1000) +* Added BlockBasedTableOptions.format_version option, which allows user to specify which version of block based table he wants. As a general guidline, newer versions have more features, but might not be readable by older versions of RocksDB. +* Added new block based table format (version 2), which you can enable by setting BlockBasedTableOptions.format_version = 2. This format changes how we encode size information in compressed blocks and should help with memory allocations if you're using Zlib or BZip2 compressions. +* GetThreadStatus() is now able to report compaction activity. +* MemEnv (env that stores data in memory) is now available in default library build. You can create it by calling NewMemEnv(). +* Add SliceTransform.SameResultWhenAppended() to help users determine it is safe to apply prefix bloom/hash. +* Block based table now makes use of prefix bloom filter if it is a full fulter. + +### Public API changes +* Deprecated skip_log_error_on_recovery option +* Logger method logv with log level parameter is now virtual + +### 3.9.0 (12/8/2014) + +### New Features +* Add rocksdb::GetThreadList(), which in the future will return the current status of all + rocksdb-related threads. We will have more code instruments in the following RocksDB + releases. +* Change convert function in rocksdb/utilities/convenience.h to return Status instead of boolean. + Also add support for nested options in convert function + +### Public API changes +* New API to create a checkpoint added. Given a directory name, creates a new + database which is an image of the existing database. 
+* New API LinkFile added to Env. If you implement your own Env class, an + implementation of the API LinkFile will have to be provided. +* MemTableRep takes MemTableAllocator instead of Arena + +### Improvements +* RocksDBLite library now becomes smaller and will be compiled with -fno-exceptions flag. + +## 3.8.0 (11/14/2014) + +### Public API changes +* BackupEngine::NewBackupEngine() was deprecated; please use BackupEngine::Open() from now on. +* BackupableDB/RestoreBackupableDB have new GarbageCollect() methods, which will clean up files from corrupt and obsolete backups. +* BackupableDB/RestoreBackupableDB have new GetCorruptedBackups() methods which list corrupt backups. + +### Cleanup +* Bunch of code cleanup, some extra warnings turned on (-Wshadow, -Wshorten-64-to-32, -Wnon-virtual-dtor) + +### New features +* CompactFiles and EventListener, although they are still in experimental state +* Full ColumnFamily support in RocksJava. + +## 3.7.0 (11/6/2014) +### Public API changes +* Introduce SetOptions() API to allow adjusting a subset of options dynamically online +* Introduce 4 new convenient functions for converting Options from string: GetColumnFamilyOptionsFromMap(), GetColumnFamilyOptionsFromString(), GetDBOptionsFromMap(), GetDBOptionsFromString() +* Remove WriteBatchWithIndex.Delete() overloads using SliceParts +* When opening a DB, if options.max_background_compactions is larger than the existing low pri pool of options.env, it will enlarge it. Similarly, options.max_background_flushes is larger than the existing high pri pool of options.env, it will enlarge it. + +## 3.6.0 (10/7/2014) +### Disk format changes +* If you're using RocksDB on ARM platforms and you're using default bloom filter, there is a disk format change you need to be aware of. There are three steps you need to do when you convert to new release: 1. turn off filter policy, 2. compact the whole database, 3. 
turn on filter policy + +### Behavior changes +* We have refactored our system of stalling writes. Any stall-related statistics' meanings are changed. Instead of per-write stall counts, we now count stalls per-epoch, where epochs are periods between flushes and compactions. You'll find more information in our Tuning Perf Guide once we release RocksDB 3.6. +* When disableDataSync=true, we no longer sync the MANIFEST file. +* Add identity_as_first_hash property to CuckooTable. SST file needs to be rebuilt to be opened by reader properly. + +### Public API changes +* Change target_file_size_base type to uint64_t from int. +* Remove allow_thread_local. This feature was proved to be stable, so we are turning it always-on. + +## 3.5.0 (9/3/2014) ### New Features * Add include/utilities/write_batch_with_index.h, providing a utilitiy class to query data out of WriteBatch when building it. * Move BlockBasedTable related options to BlockBasedTableOptions from Options. Change corresponding JNI interface. Options affected include: @@ -11,15 +83,12 @@ ### Public API changes * The Prefix Extractor used with V2 compaction filters is now passed user key to SliceTransform::Transform instead of unparsed RocksDB key. - ------ Past Releases ----- - - ## 3.4.0 (8/18/2014) ### New Features * Support Multiple DB paths in universal style compactions * Add feature of storing plain table index and bloom filter in SST file. * CompactRange() will never output compacted files to level 0. This used to be the case when all the compaction input files were at level 0. +* Added iterate_upper_bound to define the extent upto which the forward iterator will return entries. This will prevent iterating over delete markers and overwritten entries for edge cases where you want to break out the iterator anyways. This may improve perfomance in case there are a large number of delete markers or overwritten entries. 
### Public API changes * DBOptions.db_paths now is a vector of a DBPath structure which indicates both of path and target size diff --git a/INSTALL.md b/INSTALL.md index 8cf66e6ab..330f8bcbd 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -2,7 +2,7 @@ RocksDB's library should be able to compile without any dependency installed, although we recommend installing some compression libraries (see below). -We do depend on newer gcc with C++11 support. +We do depend on newer gcc/clang with C++11 support. There are few options when compiling RocksDB: @@ -15,6 +15,10 @@ There are few options when compiling RocksDB: * `make all` will compile our static library, and all our tools and unit tests. Our tools depend on gflags. You will need to have gflags installed to run `make all`. +* By default the binary we produce is optimized for the platform you're compiling on +(-march=native). If you want to build a portable binary, add 'PORTABLE=1' before +your make commands, like this: `PORTABLE=1 make static_lib` + ## Dependencies * You can link RocksDB with following compression libraries: @@ -72,13 +76,7 @@ depend on gflags. You will need to have gflags installed to run `make all`. * Install via [homebrew](http://brew.sh/). * If you're first time developer in MacOS, you still need to run: `xcode-select --install` in your command line. * run `brew tap homebrew/dupes; brew install gcc47 --use-llvm` to install gcc 4.7 (or higher). - * Install zlib, bzip2 and snappy libraries for compression. - * Install gflags. We have included a script - `build_tools/mac-install-gflags.sh`, which should automatically install it (execute this file instead of runing using "source" command). - If you installed gflags by other means (for example, `brew install gflags`), - please set `LIBRARY_PATH` and `CPATH` accordingly. - * Please note that some of the optimizations/features are disabled in OSX. - We did not run any production workloads on it. 
+ * run `brew install rocksdb` * **iOS**: - * Run: `TARGET_OS=IOS make static_lib` + * Run: `TARGET_OS=IOS make static_lib`. When building the project which uses rocksdb iOS library, make sure to define two important pre-processing macros: `ROCKSDB_LITE` and `IOS_CROSS_COMPILE`. diff --git a/Makefile b/Makefile index c75274cd0..82b0e0a35 100644 --- a/Makefile +++ b/Makefile @@ -3,12 +3,19 @@ # found in the LICENSE file. See the AUTHORS file for names of contributors. # Inherit some settings from environment variables, if available -INSTALL_PATH ?= $(CURDIR) #----------------------------------------------- +CFLAGS += ${EXTRA_CFLAGS} +CXXFLAGS += ${EXTRA_CXXFLAGS} +LDFLAGS += $(EXTRA_LDFLAGS) +MACHINE ?= $(shell uname -m) + ifneq ($(MAKECMDGOALS),dbg) -OPT += -O2 -fno-omit-frame-pointer -momit-leaf-frame-pointer +OPT += -O2 -fno-omit-frame-pointer +ifneq ($(MACHINE),ppc64) # ppc64 doesn't support -momit-leaf-frame-pointer +OPT += -momit-leaf-frame-pointer +endif else # intentionally left blank endif @@ -24,9 +31,9 @@ endif #----------------------------------------------- # detect what platform we're building on -$(shell (export ROCKSDB_ROOT="$(CURDIR)"; "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/build_config.mk")) +$(shell (export ROCKSDB_ROOT="$(CURDIR)"; "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk")) # this file is generated by the previous line to set build flags and sources -include build_config.mk +include make_config.mk ifneq ($(PLATFORM), IOS) CFLAGS += -g @@ -36,35 +43,71 @@ else OPT += -DNDEBUG endif +ifneq ($(filter -DROCKSDB_LITE,$(OPT)),) + # found + CFLAGS += -fno-exceptions + CXXFLAGS += -fno-exceptions +endif + # ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc. 
ifdef COMPILE_WITH_ASAN - # ASAN compile flags + DISABLE_JEMALLOC=1 EXEC_LDFLAGS += -fsanitize=address PLATFORM_CCFLAGS += -fsanitize=address PLATFORM_CXXFLAGS += -fsanitize=address -else - # if we're not compiling with ASAN, use jemalloc +endif + +# TSAN doesn't work well with jemalloc. If we're compiling with TSAN, we should use regular malloc. +ifdef COMPILE_WITH_TSAN + DISABLE_JEMALLOC=1 + EXEC_LDFLAGS += -fsanitize=thread -pie + PLATFORM_CCFLAGS += -fsanitize=thread -fPIC -DROCKSDB_TSAN_RUN + PLATFORM_CXXFLAGS += -fsanitize=thread -fPIC -DROCKSDB_TSAN_RUN +endif + +ifndef DISABLE_JEMALLOC EXEC_LDFLAGS := $(JEMALLOC_LIB) $(EXEC_LDFLAGS) PLATFORM_CXXFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC endif -WARNING_FLAGS = -Wall -Werror -Wsign-compare +#------------------------------------------------- +# make install related stuff +INSTALL_PATH ?= /usr/local + +uninstall: + @rm -rf $(INSTALL_PATH)/include/rocksdb + @rm -rf $(INSTALL_PATH)/lib/$(LIBRARY) + @rm -rf $(INSTALL_PATH)/lib/$(SHARED) + +install: + @install -d $(INSTALL_PATH)/lib + @for header_dir in `find "include/rocksdb" -type d`; do \ + install -d $(INSTALL_PATH)/$$header_dir; \ + done + @for header in `find "include/rocksdb" -type f -name *.h`; do \ + install -C -m 644 $$header $(INSTALL_PATH)/$$header; \ + done + @[ ! -e $(LIBRARY) ] || install -C -m 644 $(LIBRARY) $(INSTALL_PATH)/lib + @[ ! -e $(SHARED) ] || install -C -m 644 $(SHARED) $(INSTALL_PATH)/lib +#------------------------------------------------- + +WARNING_FLAGS = -Wall -Werror -Wsign-compare -Wshadow CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) -CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual +CXXFLAGS += $(WARNING_FLAGS) -I. 
-I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual -Wnon-virtual-dtor LDFLAGS += $(PLATFORM_LDFLAGS) LIBOBJECTS = $(SOURCES:.cc=.o) -LIBOBJECTS += $(SOURCESCPP:.cpp=.o) -MEMENVOBJECTS = $(MEMENV_SOURCES:.cc=.o) +MOCKOBJECTS = $(MOCK_SOURCES:.cc=.o) TESTUTIL = ./util/testutil.o -TESTHARNESS = ./util/testharness.o $(TESTUTIL) +TESTHARNESS = ./util/testharness.o $(TESTUTIL) $(MOCKOBJECTS) BENCHHARNESS = ./util/benchharness.o VALGRIND_ERROR = 2 VALGRIND_DIR = build_tools/VALGRIND_LOGS VALGRIND_VER := $(join $(VALGRIND_VER),valgrind) + VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full TESTS = \ @@ -85,22 +128,26 @@ TESTS = \ coding_test \ corruption_test \ crc32c_test \ + slice_transform_test \ dbformat_test \ env_test \ - blob_store_test \ + fault_injection_test \ filelock_test \ filename_test \ - filter_block_test \ + block_based_filter_block_test \ + full_filter_block_test \ histogram_test \ log_test \ manual_compaction_test \ memenv_test \ + mock_env_test \ merge_test \ + merger_test \ redis_test \ reduce_levels_test \ plain_table_db_test \ + comparator_db_test \ prefix_test \ - simple_table_db_test \ skiplist_test \ stringappend_test \ ttl_test \ @@ -110,8 +157,11 @@ TESTS = \ spatial_db_test \ version_edit_test \ version_set_test \ + compaction_picker_test \ + version_builder_test \ file_indexer_test \ - write_batch_test\ + write_batch_test \ + write_controller_test\ deletefile_test \ table_test \ thread_local_test \ @@ -121,7 +171,14 @@ TESTS = \ cuckoo_table_builder_test \ cuckoo_table_reader_test \ cuckoo_table_db_test \ - write_batch_with_index_test + flush_job_test \ + wal_manager_test \ + listener_test \ + compaction_job_test \ + thread_list_test \ + sst_dump_test + +SUBSET := $(shell echo $(TESTS) |sed s/^.*$(ROCKSDBTESTS_START)/$(ROCKSDBTESTS_START)/) TOOLS = \ sst_dump \ @@ -129,10 +186,9 @@ TOOLS = \ db_stress \ ldb \ db_repl_stress \ - options_test \ - blob_store_bench + options_test \ -PROGRAMS = db_bench signal_test 
table_reader_bench log_and_apply_bench $(TOOLS) +PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench cache_bench perf_context_test memtablerep_bench $(TOOLS) # The library name is configurable since we are maintaining libraries of both # debug/release mode. @@ -140,7 +196,10 @@ ifeq ($(LIBNAME),) LIBNAME=librocksdb endif LIBRARY = ${LIBNAME}.a -MEMENVLIBRARY = libmemenv.a + +ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) +ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) +ROCKSDB_PATCH = $(shell egrep "ROCKSDB_PATCH.[0-9]" include/rocksdb/version.h | cut -d ' ' -f 3) default: all @@ -153,29 +212,33 @@ ifneq ($(PLATFORM_SHARED_VERSIONED),true) SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT) SHARED2 = $(SHARED1) SHARED3 = $(SHARED1) +SHARED4 = $(SHARED1) SHARED = $(SHARED1) else -# Update db.h if you change these. -SHARED_MAJOR = 3 -SHARED_MINOR = 4 +SHARED_MAJOR = $(ROCKSDB_MAJOR) +SHARED_MINOR = $(ROCKSDB_MINOR) +SHARED_PATCH = $(ROCKSDB_PATCH) SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT) SHARED2 = $(SHARED1).$(SHARED_MAJOR) SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) -SHARED = $(SHARED1) $(SHARED2) $(SHARED3) -$(SHARED1): $(SHARED3) - ln -fs $(SHARED3) $(SHARED1) -$(SHARED2): $(SHARED3) - ln -fs $(SHARED3) $(SHARED2) +SHARED4 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR).$(SHARED_PATCH) +SHARED = $(SHARED1) $(SHARED2) $(SHARED3) $(SHARED4) +$(SHARED1): $(SHARED4) + ln -fs $(SHARED4) $(SHARED1) +$(SHARED2): $(SHARED4) + ln -fs $(SHARED4) $(SHARED2) +$(SHARED3): $(SHARED4) + ln -fs $(SHARED4) $(SHARED3) endif -$(SHARED3): +$(SHARED4): $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) $(LDFLAGS) -o $@ endif # PLATFORM_SHARED_EXT -.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ +.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests package \ release tags valgrind_check 
whitebox_crash_test format static_lib shared_lib all \ - dbg + dbg rocksdbjavastatic rocksdbjava install uninstall analyze all: $(LIBRARY) $(PROGRAMS) $(TESTS) @@ -201,6 +264,10 @@ check: $(TESTS) ldb for t in $(TESTS); do echo "***** Running $$t"; ./$$t || exit 1; done python tools/ldb_test.py +check_some: $(SUBSET) ldb + for t in $(SUBSET); do echo "***** Running $$t"; ./$$t || exit 1; done + python tools/ldb_test.py + ldb_tests: ldb python tools/ldb_test.py @@ -236,6 +303,10 @@ valgrind_check: all $(PROGRAMS) $(TESTS) echo $$t $$((etime - stime)) >> $(VALGRIND_DIR)/valgrind_tests_times; \ done +analyze: + $(MAKE) clean + $(CLANG_SCAN_BUILD) --use-analyzer=$(CLANG_ANALYZER) -o $(CURDIR)/scan_build_report $(MAKE) all -j32 + unity.cc: $(shell (export ROCKSDB_ROOT="$(CURDIR)"; "$(CURDIR)/build_tools/unity" "$(CURDIR)/unity.cc")) @@ -243,10 +314,11 @@ unity: unity.cc unity.o $(CXX) unity.o $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) clean: - -rm -f $(PROGRAMS) $(TESTS) $(LIBRARY) $(SHARED) $(MEMENVLIBRARY) build_config.mk unity.cc + -rm -f $(PROGRAMS) $(TESTS) $(LIBRARY) $(SHARED) make_config.mk unity.cc -rm -rf ios-x86/* ios-arm/* - -find . -name "*.[od]" -exec rm {} \; + -find . -name "*.[oda]" -exec rm {} \; -find . -type f -regex ".*\.\(\(gcda\)\|\(gcno\)\)" -exec rm {} \; + -rm -rf bzip2* snappy* zlib* tags: ctags * -R cscope -b `find . -name '*.cc'` `find . 
-name '*.h'` @@ -254,6 +326,9 @@ tags: format: build_tools/format-diff.sh +package: + bash build_tools/make_package.sh $(SHARED_MAJOR).$(SHARED_MINOR) + # --------------------------------------------------------------------------- # Unit tests and tools # --------------------------------------------------------------------------- @@ -264,6 +339,12 @@ $(LIBRARY): $(LIBOBJECTS) db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +cache_bench: util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) + $(CXX) util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +memtablerep_bench: db/memtablerep_bench.o $(LIBOBJECTS) $(TESTUTIL) + $(CXX) db/memtablerep_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + block_hash_index_test: table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) @@ -276,9 +357,6 @@ db_sanity_test: tools/db_sanity_test.o $(LIBOBJECTS) $(TESTUTIL) db_repl_stress: tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) $(CXX) tools/db_repl_stress.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -blob_store_bench: tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL) - $(CXX) tools/blob_store_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) - signal_test: util/signal_test.o $(LIBOBJECTS) $(CXX) util/signal_test.o $(LIBOBJECTS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) @@ -309,9 +387,6 @@ cache_test: util/cache_test.o $(LIBOBJECTS) $(TESTHARNESS) coding_test: util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/coding_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -blob_store_test: util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(TESTUTIL) - $(CXX) 
util/blob_store_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o$@ $(LDFLAGS) $(COVERAGEFLAGS) - stringappend_test: utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) utilities/merge_operators/string_append/stringappend_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) @@ -333,6 +408,10 @@ corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +slice_transform_test: util/slice_transform_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/slice_transform_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + + db_test: db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) @@ -345,8 +424,8 @@ log_write_bench: util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS) plain_table_db_test: db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/plain_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -simple_table_db_test: db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) db/simple_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +comparator_db_test: db/comparator_db_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/comparator_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) table_reader_bench: table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) table/table_reader_bench.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -pg @@ -378,20 +457,35 @@ ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) write_batch_with_index_test: utilities/write_batch_with_index/write_batch_with_index_test.o $(LIBOBJECTS) 
$(TESTHARNESS) $(CXX) utilities/write_batch_with_index/write_batch_with_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +flush_job_test: db/flush_job_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/flush_job_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +compaction_job_test: db/compaction_job_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/compaction_job_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +wal_manager_test: db/wal_manager_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/wal_manager_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) env_test: util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +fault_injection_test: db/fault_injection_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/fault_injection_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + rate_limiter_test: util/rate_limiter_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/rate_limiter_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +block_based_filter_block_test: table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + 
+full_filter_block_test: table/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) table/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) @@ -411,6 +505,12 @@ version_edit_test: db/version_edit_test.o $(LIBOBJECTS) $(TESTHARNESS) version_set_test: db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/version_set_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +compaction_picker_test: db/compaction_picker_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/compaction_picker_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +version_builder_test: db/version_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/version_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + file_indexer_test : db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/file_indexer_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) @@ -420,9 +520,15 @@ reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS) write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +write_controller_test: db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +merger_test: table/merger_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) table/merger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) 
$(COVERAGEFLAGS) + deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) @@ -438,15 +544,26 @@ cuckoo_table_reader_test: table/cuckoo_table_reader_test.o $(LIBOBJECTS) $(TESTH cuckoo_table_db_test: db/cuckoo_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) db/cuckoo_table_db_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -options_test: util/options_test.o $(LIBOBJECTS) $(TESTHARNESS) +listener_test: db/listener_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) db/listener_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +thread_list_test: util/thread_list_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/thread_list_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +compactor_test: utilities/compaction/compactor_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) utilities/compaction/compactor_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +options_test: util/options_test.o util/options_helper.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/options_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -$(MEMENVLIBRARY) : $(MEMENVOBJECTS) - rm -f $@ - $(AR) -rs $@ $(MEMENVOBJECTS) +sst_dump_test: util/sst_dump_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/sst_dump_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) -memenv_test : helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS) - $(CXX) helpers/memenv/memenv_test.o $(MEMENVOBJECTS) $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) +memenv_test : util/memenv_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(CXX) util/memenv_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) + +mock_env_test : util/mock_env_test.o $(LIBOBJECTS) 
$(TESTHARNESS) + $(CXX) util/mock_env_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) manual_compaction_test: util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(CXX) util/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) @@ -475,24 +592,83 @@ ldb: tools/ldb.o $(LIBOBJECTS) JNI_NATIVE_SOURCES = ./java/rocksjni/*.cc JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux -ROCKSDBJNILIB = librocksdbjni.so -ROCKSDB_JAR = rocksdbjni.jar +ARCH := $(shell getconf LONG_BIT) +ROCKSDBJNILIB = librocksdbjni-linux$(ARCH).so +ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar +ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar +ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar +ROCKSDB_SOURCES_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar ifeq ($(PLATFORM), OS_MACOSX) -ROCKSDBJNILIB = librocksdbjni.jnilib -JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/ +ROCKSDBJNILIB = librocksdbjni-osx.jnilib +ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar +ifneq ("$(wildcard $(JAVA_HOME)/include/darwin)","") + JAVA_INCLUDE = -I$(JAVA_HOME)/include -I $(JAVA_HOME)/include/darwin +else + JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/ +endif endif +libz.a: + -rm -rf zlib-1.2.8 + curl -O http://zlib.net/zlib-1.2.8.tar.gz + tar xvzf zlib-1.2.8.tar.gz + cd zlib-1.2.8 && CFLAGS='-fPIC' ./configure --static && make + cp zlib-1.2.8/libz.a . + +libbz2.a: + -rm -rf bzip2-1.0.6 + curl -O http://www.bzip.org/1.0.6/bzip2-1.0.6.tar.gz + tar xvzf bzip2-1.0.6.tar.gz + cd bzip2-1.0.6 && make CFLAGS='-fPIC -Wall -Winline -O2 -g -D_FILE_OFFSET_BITS=64' + cp bzip2-1.0.6/libbz2.a . 
+ +libsnappy.a: + -rm -rf snappy-1.1.1 + curl -O https://snappy.googlecode.com/files/snappy-1.1.1.tar.gz + tar xvzf snappy-1.1.1.tar.gz + cd snappy-1.1.1 && ./configure --with-pic --enable-static + cd snappy-1.1.1 && make + cp snappy-1.1.1/.libs/libsnappy.a . + + +rocksdbjavastatic: libz.a libbz2.a libsnappy.a + OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j + cd java;$(MAKE) javalib; + rm -f ./java/target/$(ROCKSDBJNILIB) + $(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(COVERAGEFLAGS) libz.a libbz2.a libsnappy.a + cd java/target;strip -S -x $(ROCKSDBJNILIB) + cd java;jar -cf target/$(ROCKSDB_JAR) HISTORY*.md + cd java/target;jar -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB) + cd java/target/classes;jar -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class + cd java/target/apidocs;jar -cf ../$(ROCKSDB_JAVADOCS_JAR) * + cd java/src/main/java;jar -cf ../../../target/$(ROCKSDB_SOURCES_JAR) org + +rocksdbjavastaticrelease: rocksdbjavastatic + cd java/crossbuild && vagrant destroy -f && vagrant up linux32 && vagrant halt linux32 && vagrant up linux64 && vagrant halt linux64 + cd java;jar -cf target/$(ROCKSDB_JAR_ALL) HISTORY*.md + cd java/target;jar -uf $(ROCKSDB_JAR_ALL) librocksdbjni-*.so librocksdbjni-*.jnilib + cd java/target/classes;jar -uf ../$(ROCKSDB_JAR_ALL) org/rocksdb/*.class org/rocksdb/util/*.class + +rocksdbjavastaticpublish: rocksdbjavastaticrelease + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-javadoc.jar -Dclassifier=javadoc + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom 
-Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar -Dclassifier=sources + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux64.jar -Dclassifier=linux64 + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux32.jar -Dclassifier=linux32 + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar -Dclassifier=osx + mvn gpg:sign-and-deploy-file -Durl=https://oss.sonatype.org/service/local/staging/deploy/maven2/ -DrepositoryId=sonatype-nexus-staging -DpomFile=java/rocksjni.pom -Dfile=java/target/rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar + rocksdbjava: OPT="-fPIC -DNDEBUG -O2" $(MAKE) $(LIBRARY) -j32 - cd java;$(MAKE) java; - rm -f ./java/$(ROCKSDBJNILIB) - $(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC -o ./java/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(JAVA_LDFLAGS) $(COVERAGEFLAGS) - cd java;jar -cf $(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class HISTORY*.md $(ROCKSDBJNILIB) + cd java;$(MAKE) javalib; + rm -f ./java/target/$(ROCKSDBJNILIB) + $(CXX) $(CXXFLAGS) -I./java/. 
$(JAVA_INCLUDE) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) $(LIBOBJECTS) $(JAVA_LDFLAGS) $(COVERAGEFLAGS) + cd java;jar -cf target/$(ROCKSDB_JAR) HISTORY*.md + cd java/target;jar -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB) + cd java/target/classes;jar -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class jclean: cd java;$(MAKE) clean; - rm -f $(ROCKSDBJNILIB) jtest: cd java;$(MAKE) sample;$(MAKE) test; @@ -565,8 +741,12 @@ ifneq ($(MAKECMDGOALS),clean) ifneq ($(MAKECMDGOALS),format) ifneq ($(MAKECMDGOALS),jclean) ifneq ($(MAKECMDGOALS),jtest) +ifneq ($(MAKECMDGOALS),package) +ifneq ($(MAKECMDGOALS),analyze) -include $(DEPFILES) endif endif endif endif +endif +endif diff --git a/README.md b/README.md index bda801fd7..916bdecde 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![Build Status](https://travis-ci.org/facebook/rocksdb.svg?branch=master)](https://travis-ci.org/facebook/rocksdb) RocksDB is developed and maintained by Facebook Database Engineering Team. -It is built on on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com) +It is built on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com) and Jeff Dean (jeff@google.com) This code is a library that forms the core building block for a fast diff --git a/Vagrantfile b/Vagrantfile new file mode 100644 index 000000000..cdee5db53 --- /dev/null +++ b/Vagrantfile @@ -0,0 +1,16 @@ +Vagrant.configure("2") do |config| + + config.vm.provider "virtualbox" do |v| + v.memory = 4096 + v.cpus = 2 + end + + config.vm.define "ubuntu14" do |box| + box.vm.box = "ubuntu/trusty64" + end + + config.vm.define "centos65" do |box| + box.vm.box = "chef/centos-6.5" + end + +end diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 3389d2851..9c8bd7d2a 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -32,7 +32,7 @@ # 2. 
Once install, add the include path/lib path for gflags to CPATH and # LIBRARY_PATH respectively. If installed with default mode, the # lib and include path will be /usr/local/lib and /usr/local/include -# Mac user can do this by running build_tools/mac-install-gflags.sh +# Mac user can do this by having brew installed and running brew install gflags OUTPUT=$1 if test -z "$OUTPUT"; then @@ -46,18 +46,15 @@ PLATFORM_CXXFLAGS="-std=c++11" COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX" # Default to fbcode gcc on internal fb machines -if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then +if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then FBCODE_BUILD="true" - if [ -z "$USE_CLANG" ]; then - CENTOS_VERSION=`rpm -q --qf "%{VERSION}" \ - $(rpm -q --whatprovides redhat-release)` - if [ "$CENTOS_VERSION" = "6" ]; then - source "$PWD/build_tools/fbcode.gcc481.sh" - else - source "$PWD/build_tools/fbcode.gcc471.sh" - fi + # If we're compiling with TSAN we need pic build + PIC_BUILD=$COMPILE_WITH_TSAN + if [ -z "$ROCKSDB_FBCODE_BUILD_WITH_481" ]; then + source "$PWD/build_tools/fbcode_config.sh" else - source "$PWD/build_tools/fbcode.clang31.sh" + # we need this to build with MySQL. Don't use for other purposes. + source "$PWD/build_tools/fbcode_config4.8.1.sh" fi fi @@ -78,6 +75,14 @@ if test -z "$TARGET_OS"; then TARGET_OS=`uname -s` fi +if test -z "$CLANG_SCAN_BUILD"; then + CLANG_SCAN_BUILD=scan-build +fi + +if test -z "$CLANG_ANALYZER"; then + CLANG_ANALYZER=$(which clang++) +fi + COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS}" CROSS_COMPILE= PLATFORM_CCFLAGS= @@ -164,7 +169,7 @@ if test -z "$DO_NOT_RUN_BUILD_DETECT_VERSION"; then "$PWD/build_tools/build_detect_version" fi -# We want to make a list of all cc files within util, db, table, and helpers +# We want to make a list of all cc files within util, db and table # except for the test and benchmark files. 
By default, find will output a list # of all files matching either rule, so we need to append -print to make the # prune take effect. @@ -173,36 +178,27 @@ DIRS="util db table utilities" set -f # temporarily disable globbing so that our patterns arent expanded PRUNE_TEST="-name *test*.cc -prune" PRUNE_BENCH="-name *bench*.cc -prune" -PORTABLE_FILES=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cc' -print | sort | tr "\n" " "` -PORTABLE_CPP=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o -name '*.cpp' -print | sort | tr "\n" " "` +PRUNE_MOCK="-name *mock*.cc -prune" +PORTABLE_FILES=`cd "$ROCKSDB_ROOT"; find $DIRS $PRUNE_TEST -o $PRUNE_BENCH -o $PRUNE_MOCK -o -name '*.cc' -print | sort | tr "\n" " "` +MOCK_SOURCES=`cd "$ROCKSDB_ROOT"; find $DIRS -name '*mock*.cc' -print | grep -v "test" | sort | tr "\n" " "` set +f # re-enable globbing # The sources consist of the portable files, plus the platform-specific port # file. echo "SOURCES=$PORTABLE_FILES $GENERIC_PORT_FILES $PORT_FILES" >> "$OUTPUT" -echo "SOURCESCPP=$PORTABLE_CPP" >> "$OUTPUT" -echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> "$OUTPUT" +echo "MOCK_SOURCES=$MOCK_SOURCES" >> "$OUTPUT" if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then # Cross-compiling; do not try any compilation tests. # Also don't need any compilation tests if compiling on fbcode true else - # If -std=c++0x works, use . Otherwise use port_posix.h. - $CXX $CFLAGS -std=c++0x -x c++ - -o /dev/null 2>/dev/null < - int main() {} -EOF - if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_ATOMIC_PRESENT" - fi - # Test whether fallocate is available $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < int main() { int fd = open("/dev/null", 0); - fallocate(fd, 0, 0, 1024); + fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, 1024); } EOF if [ "$?" 
= 0 ]; then @@ -302,6 +298,14 @@ EOF fi fi +# Test whether -Wshorten-64-to-32 is available +$CXX $CFLAGS -x c++ - -o /dev/null -Wshorten-64-to-32 2>/dev/null <> "$OUTPUT" echo "CXX=$CXX" >> "$OUTPUT" echo "PLATFORM=$PLATFORM" >> "$OUTPUT" @@ -341,3 +353,8 @@ echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> "$OUTPUT" echo "EXEC_LDFLAGS=$EXEC_LDFLAGS" >> "$OUTPUT" echo "JEMALLOC_INCLUDE=$JEMALLOC_INCLUDE" >> "$OUTPUT" echo "JEMALLOC_LIB=$JEMALLOC_LIB" >> "$OUTPUT" +echo "ROCKSDB_MAJOR=$ROCKSDB_MAJOR" >> "$OUTPUT" +echo "ROCKSDB_MINOR=$ROCKSDB_MINOR" >> "$OUTPUT" +echo "ROCKSDB_PATCH=$ROCKSDB_PATCH" >> "$OUTPUT" +echo "CLANG_SCAN_BUILD=$CLANG_SCAN_BUILD" >> "$OUTPUT" +echo "CLANG_ANALYZER=$CLANG_ANALYZER" >> "$OUTPUT" diff --git a/build_tools/fbcode.clang31.sh b/build_tools/fbcode.clang31.sh deleted file mode 100644 index 25a2ca72f..000000000 --- a/build_tools/fbcode.clang31.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/sh -# -# Set environment variables so that we can compile leveldb using -# fbcode settings. 
It uses the latest g++ compiler and also -# uses jemalloc - -TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f -TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native" -TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1" -TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3 -GLIBC_RUNTIME_PATH=/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1 - -# location of libgcc -LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include" -LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/libs" - -# location of glibc -GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include" -GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib" - -# location of snappy headers and libraries -SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include" -SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a" - -# location of zlib headers and libraries -ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include" -ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a" - -# location of gflags headers and libraries -GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include" -GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a" - -# location of bzip headers and libraries -BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/include" -BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/lib/libbz2.a" - -# location of gflags headers and libraries -GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include" -GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a" - -# use Intel SSE support for checksum calculations -export USE_SSE=" -msse -msse4.2 " - -CC="$TOOLCHAIN_EXECUTABLES/clang/clang-3.2/0b7c69d/bin/clang $CLANG_INCLUDES" -CXX="$TOOLCHAIN_EXECUTABLES/clang/clang-3.2/0b7c69d/bin/clang++ $CLANG_INCLUDES $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE 
$BZIP_INCLUDE $GFLAGS_INCLUDE" -AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar -RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib - -CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin -nostdlib " -CFLAGS+=" -nostdinc -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1 " -CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/x86_64-facebook-linux " -CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include/c++/4.7.1/backward " -CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include " -CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include " -CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/clang/clang-3.2/0b7c69d/lib/clang/3.2/include " -CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include/linux " -CFLAGS+=" -isystem $TOOLCHAIN_LIB_BASE/kernel-headers/kernel-headers-3.2.18_70_fbk11_00129_gc8882d0/da39a3e/include " -CFLAGS+=" -Wall -Wno-sign-compare -Wno-unused-variable -Winvalid-pch -Wno-deprecated -Woverloaded-virtual" -CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE" -CXXFLAGS="$CFLAGS -nostdinc++" - -CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC" - -EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a" -EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a" -EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS" -EXEC_LDFLAGS+=" -Wl,--dynamic-linker,$GLIBC_RUNTIME_PATH/lib/ld-linux-x86-64.so.2" -EXEC_LDFLAGS+=" -B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin" - -PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS " - -EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $GFLAGS_LIBS" - -export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED diff --git a/build_tools/fbcode.gcc471.sh 
b/build_tools/fbcode.gcc471.sh deleted file mode 100644 index c971cda5b..000000000 --- a/build_tools/fbcode.gcc471.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/sh -# -# Set environment variables so that we can compile leveldb using -# fbcode settings. It uses the latest g++ compiler and also -# uses jemalloc - -TOOLCHAIN_REV=fbe3b095a4cc4a3713730050d182b7b4a80c342f -TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native" -TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.7.1-glibc-2.14.1" -TOOL_JEMALLOC=jemalloc-3.3.1/9202ce3 - -# location of libhdfs libraries -if test "$USE_HDFS"; then - JAVA_HOME="/usr/local/jdk-6u22-64" - JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux" - GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.7.1-glibc-2.14.1" - HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 " - HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib " - HDFSLIB+=" -ldl -lverify -ljava -ljvm " -fi - -# location of libgcc -LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/include" -LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.7.1/afc21dc/libs" - -# location of glibc -GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/include" -GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.14.1/99df8fc/lib" - -# location of snappy headers and libraries -SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/include" -SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/7518bbe/lib/libsnappy.a" - -# location of zlib headers and libraries -ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/include" -ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/91ddd43/lib/libz.a" - -# location of bzip headers and libraries -BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/include" -BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/91ddd43/lib/libbz2.a" - -# location of gflags headers and libraries -GFLAGS_INCLUDE=" -I 
$TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/include" -GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/91ddd43/lib/libgflags.a" - -# use Intel SSE support for checksum calculations -export USE_SSE=" -msse -msse4.2 " - -CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/gcc" -CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1-glibc-2.14.1/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $GFLAGS_INCLUDE" -AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar -RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib - -CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic" -CFLAGS+=" -I $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/include -DHAVE_JEMALLOC" -CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE" -CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT" -CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2" - -EXEC_LDFLAGS=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/$TOOL_JEMALLOC/lib/libjemalloc.a" -EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/350336c/lib/libunwind.a" -EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS" - -PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS " - -EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $GFLAGS_LIBS" - -VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/91ddd43/bin/" - -export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER diff --git a/build_tools/fbcode.gcc481.sh b/build_tools/fbcode.gcc481.sh deleted file mode 100644 index 5426e3f9a..000000000 --- a/build_tools/fbcode.gcc481.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/bin/sh -# -# Set environment variables so that we can compile rocksdb using -# fbcode settings. 
It uses the latest g++ compiler and also -# uses jemalloc - -TOOLCHAIN_REV=53dc1fe83f84e9145b9ffb81b81aa7f6a49c87cc -CENTOS_VERSION=`rpm -q --qf "%{VERSION}" $(rpm -q --whatprovides redhat-release)` -if [ "$CENTOS_VERSION" = "6" ]; then - TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos6-native" -else - TOOLCHAIN_EXECUTABLES="/mnt/gvfs/third-party/$TOOLCHAIN_REV/centos5.2-native" -fi -TOOLCHAIN_LIB_BASE="/mnt/gvfs/third-party/$TOOLCHAIN_REV/gcc-4.8.1-glibc-2.17" - -# location of libhdfs libraries -if test "$USE_HDFS"; then - JAVA_HOME="/usr/local/jdk-6u22-64" - JINCLUDE="-I$JAVA_HOME/include -I$JAVA_HOME/include/linux" - GLIBC_RUNTIME_PATH="/usr/local/fbcode/gcc-4.8.1-glibc-2.17" - HDFSLIB=" -Wl,--no-whole-archive hdfs/libhdfs.a -L$JAVA_HOME/jre/lib/amd64 " - HDFSLIB+=" -L$JAVA_HOME/jre/lib/amd64/server -L$GLIBC_RUNTIME_PATH/lib " - HDFSLIB+=" -ldl -lverify -ljava -ljvm " -fi - -# location of libgcc -LIBGCC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/include" -LIBGCC_LIBS=" -L $TOOLCHAIN_LIB_BASE/libgcc/libgcc-4.8.1/8aac7fc/libs" - -# location of glibc -GLIBC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/include" -GLIBC_LIBS=" -L $TOOLCHAIN_LIB_BASE/glibc/glibc-2.17/99df8fc/lib" - -# location of snappy headers and libraries -SNAPPY_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/include" -SNAPPY_LIBS=" $TOOLCHAIN_LIB_BASE/snappy/snappy-1.0.3/43d84e2/lib/libsnappy.a" - -# location of zlib headers and libraries -ZLIB_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/include" -ZLIB_LIBS=" $TOOLCHAIN_LIB_BASE/zlib/zlib-1.2.5/c3f970a/lib/libz.a" - -# location of bzip headers and libraries -BZIP_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/include" -BZIP_LIBS=" $TOOLCHAIN_LIB_BASE/bzip2/bzip2-1.0.6/c3f970a/lib/libbz2.a" - -LZ4_REV=065ec7e38fe83329031f6668c43bef83eff5808b -LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/include" -LZ4_LIBS=" 
/mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/lib/liblz4.a" - -# location of gflags headers and libraries -GFLAGS_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/include" -GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/lib/libgflags.a" - -# location of jemalloc -JEMALLOC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/include/" -JEMALLOC_LIB=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/lib/libjemalloc.a" - -# location of numa -NUMA_REV=829d10dac0230f99cd7e1778869d2adf3da24b65 -NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/include/" -NUMA_LIB=" /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/lib/libnuma.a" - -# use Intel SSE support for checksum calculations -export USE_SSE=" -msse -msse4.2 " - -CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/gcc" -CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE" -AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar -RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib - -CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic" -CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE" -CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT" -CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DNUMA" - -EXEC_LDFLAGS="-Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so" -EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/675d945/lib/libunwind.a" -EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS $NUMA_LIB" - -PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS " - -EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS" - 
-VALGRIND_VER="$TOOLCHAIN_LIB_BASE/valgrind/valgrind-3.8.1/c3f970a/bin/" - -export CC CXX AR RANLIB CFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE diff --git a/build_tools/fbcode_config.sh b/build_tools/fbcode_config.sh new file mode 100644 index 000000000..2b40e3412 --- /dev/null +++ b/build_tools/fbcode_config.sh @@ -0,0 +1,125 @@ +#!/bin/sh +# +# Set environment variables so that we can compile rocksdb using +# fbcode settings. It uses the latest g++ and clang compilers and also +# uses jemalloc +# Environment variables that change the behavior of this script: +# PIC_BUILD -- if true, it will only take pic versions of libraries from fbcode. libraries that don't have pic variant will not be included + +CFLAGS="" + +# location of libgcc +LIBGCC_BASE="/mnt/gvfs/third-party2/libgcc/0473c80518a10d6efcbe24c5eeca3fb4ec9b519c/4.9.x/gcc-4.9-glibc-2.20/e1a7e4e" +LIBGCC_INCLUDE="$LIBGCC_BASE/include" +LIBGCC_LIBS=" -L $LIBGCC_BASE/libs" + +# location of glibc +GLIBC_REV=7397bed99280af5d9543439cdb7d018af7542720 +GLIBC_INCLUDE="/mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.20/gcc-4.9-glibc-2.20/99df8fc/include" +GLIBC_LIBS=" -L /mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.20/gcc-4.9-glibc-2.20/99df8fc/lib" + +# location of snappy headers and libraries +SNAPPY_INCLUDE=" -I /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/include/" +if test -z $PIC_BUILD; then + SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/lib/libsnappy.a" +else + SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/lib/libsnappy_pic.a" +fi +CFLAGS+=" -DSNAPPY" + +if test -z $PIC_BUILD; then + # location of zlib headers and libraries + ZLIB_INCLUDE=" -I /mnt/gvfs/third-party2/zlib/feb983d9667f4cf5e9da07ce75abc824764b67a1/1.2.8/gcc-4.9-glibc-2.20/4230243/include/" + ZLIB_LIBS=" 
/mnt/gvfs/third-party2/zlib/feb983d9667f4cf5e9da07ce75abc824764b67a1/1.2.8/gcc-4.9-glibc-2.20/4230243/lib/libz.a" + CFLAGS+=" -DZLIB" + + # location of bzip headers and libraries + BZIP_INCLUDE=" -I /mnt/gvfs/third-party2/bzip2/af004cceebb2dfd173ca29933ea5915e727aad2f/1.0.6/gcc-4.9-glibc-2.20/4230243/include/" + BZIP_LIBS=" /mnt/gvfs/third-party2/bzip2/af004cceebb2dfd173ca29933ea5915e727aad2f/1.0.6/gcc-4.9-glibc-2.20/4230243/lib/libbz2.a" + CFLAGS+=" -DBZIP2" + + LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/79d2943e2dd7208a3e0b06cf95e9f85f05fe9e1b/r124/gcc-4.9-glibc-2.20/4230243/include/" + LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/79d2943e2dd7208a3e0b06cf95e9f85f05fe9e1b/r124/gcc-4.9-glibc-2.20/4230243/lib/liblz4.a" + CFLAGS+=" -DLZ4" +fi + +# location of gflags headers and libraries +GFLAGS_INCLUDE=" -I /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/include/" +if test -z $PIC_BUILD; then + GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/lib/libgflags.a" +else + GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/lib/libgflags_pic.a" +fi +CFLAGS+=" -DGFLAGS=google" + +# location of jemalloc +JEMALLOC_INCLUDE=" -I /mnt/gvfs/third-party2/jemalloc/bcd68e5e419efa4e61b9486d6854564d6d75a0b5/3.6.0/gcc-4.9-glibc-2.20/2aafc78/include/" +JEMALLOC_LIB=" -Wl,--whole-archive /mnt/gvfs/third-party2/jemalloc/bcd68e5e419efa4e61b9486d6854564d6d75a0b5/3.6.0/gcc-4.9-glibc-2.20/2aafc78/lib/libjemalloc.a" + +if test -z $PIC_BUILD; then + # location of numa + NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/bbefc39ecbf31d0ca184168eb613ef8d397790ee/2.0.8/gcc-4.9-glibc-2.20/4230243/include/" + NUMA_LIB=" /mnt/gvfs/third-party2/numa/bbefc39ecbf31d0ca184168eb613ef8d397790ee/2.0.8/gcc-4.9-glibc-2.20/4230243/lib/libnuma.a" + CFLAGS+=" -DNUMA" + + # location of libunwind + 
LIBUNWIND="/mnt/gvfs/third-party2/libunwind/1de3b75e0afedfe5585b231bbb340ec7a1542335/1.1/gcc-4.9-glibc-2.20/34235e8/lib/libunwind.a" +fi + +# use Intel SSE support for checksum calculations +export USE_SSE=1 + +BINUTILS="/mnt/gvfs/third-party2/binutils/0b6ad0c88ddd903333a48ae8bff134efac468e4a/2.25/centos6-native/da39a3e/bin" +AR="$BINUTILS/ar" + +DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE" + +GCC_BASE="/mnt/gvfs/third-party2/gcc/1c67a0b88f64d4d9ced0382d141c76aaa7d62fba/4.9.x/centos6-native/1317bc4" +STDLIBS="-L $GCC_BASE/lib64" + +CLANG_BASE="/mnt/gvfs/third-party2/clang/290704c112bf894bf4a30d7bbd1be81e34998473/dev" +CLANG_ANALYZER="$CLANG_BASE/centos6-native/af4b1a0/bin/clang++" +CLANG_SCAN_BUILD="$CLANG_BASE/src/clang/tools/scan-build/scan-build" + +if [ -z "$USE_CLANG" ]; then + # gcc + CC="$GCC_BASE/bin/gcc" + CXX="$GCC_BASE/bin/g++" + + CFLAGS+=" -B$BINUTILS/gold" + CFLAGS+=" -isystem $GLIBC_INCLUDE" + CFLAGS+=" -isystem $LIBGCC_INCLUDE" +else + # clang + CLANG_INCLUDE="$CLANG_BASE/gcc-4.9-glibc-2.20/74c386f/lib/clang/dev/include/" + CC="$CLANG_BASE/centos6-native/af4b1a0/bin/clang" + CXX="$CLANG_BASE/centos6-native/af4b1a0/bin/clang++" + + KERNEL_HEADERS_INCLUDE="/mnt/gvfs/third-party2/kernel-headers/ffd14f660a43c4b92717986b1bba66722ef089d0/3.2.18_70_fbk11_00129_gc8882d0/gcc-4.9-glibc-2.20/da39a3e/include" + + CFLAGS+=" -B$BINUTILS/gold -nostdinc -nostdlib" + CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.9.x " + CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.9.x/x86_64-facebook-linux " + CFLAGS+=" -isystem $GLIBC_INCLUDE" + CFLAGS+=" -isystem $LIBGCC_INCLUDE" + CFLAGS+=" -isystem $CLANG_INCLUDE" + CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux " + CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE " + CXXFLAGS="-nostdinc++" +fi + +CFLAGS+=" $DEPS_INCLUDE" +CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT" +CXXFLAGS+=" $CFLAGS" + +EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS 
$GFLAGS_LIBS $NUMA_LIB" +EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.9-glibc-2.20/lib/ld.so" +EXEC_LDFLAGS+=" -Wl,--no-whole-archive $LIBUNWIND" + +PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++" + +EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS" + +VALGRIND_VER="/mnt/gvfs/third-party2/valgrind/6c45ef049cbf11c2df593addb712cd891049e737/3.10.0/gcc-4.9-glibc-2.20/4230243/bin/" + +export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD diff --git a/build_tools/fbcode_config4.8.1.sh b/build_tools/fbcode_config4.8.1.sh new file mode 100644 index 000000000..7c1ff5147 --- /dev/null +++ b/build_tools/fbcode_config4.8.1.sh @@ -0,0 +1,105 @@ +#!/bin/sh +# +# Set environment variables so that we can compile rocksdb using +# fbcode settings. It uses the latest g++ compiler and also +# uses jemalloc + +# location of libgcc +LIBGCC_BASE="/mnt/gvfs/third-party2/libgcc/7712e757d7355cb51292454ee0b7b46a467fdfed/4.8.1/gcc-4.8.1-glibc-2.17/8aac7fc" +LIBGCC_INCLUDE="$LIBGCC_BASE/include" +LIBGCC_LIBS=" -L $LIBGCC_BASE/libs" + +# location of glibc +GLIBC_REV=6e40560b4e0b6d690fd1cf8c7a43ad7452b04cfa +GLIBC_INCLUDE="/mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.17/gcc-4.8.1-glibc-2.17/99df8fc/include" +GLIBC_LIBS=" -L /mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.17/gcc-4.8.1-glibc-2.17/99df8fc/lib" + +# location of snappy headers and libraries +SNAPPY_INCLUDE=" -I /mnt/gvfs/third-party2/snappy/aef17f6c0b44b4fe408bd06f67c93701ab0a6ceb/1.0.3/gcc-4.8.1-glibc-2.17/43d84e2/include" +SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/aef17f6c0b44b4fe408bd06f67c93701ab0a6ceb/1.0.3/gcc-4.8.1-glibc-2.17/43d84e2/lib/libsnappy.a" + +# location of zlib headers and libraries +ZLIB_INCLUDE=" -I /mnt/gvfs/third-party2/zlib/25c6216928b4d77b59ddeca0990ff6fe9ac16b81/1.2.5/gcc-4.8.1-glibc-2.17/c3f970a/include" +ZLIB_LIBS=" 
/mnt/gvfs/third-party2/zlib/25c6216928b4d77b59ddeca0990ff6fe9ac16b81/1.2.5/gcc-4.8.1-glibc-2.17/c3f970a/lib/libz.a" + +# location of bzip headers and libraries +BZIP_INCLUDE=" -I /mnt/gvfs/third-party2/bzip2/c9ef7629c2aa0024f7a416e87602f06eb88f5eac/1.0.6/gcc-4.8.1-glibc-2.17/c3f970a/include/" +BZIP_LIBS=" /mnt/gvfs/third-party2/bzip2/c9ef7629c2aa0024f7a416e87602f06eb88f5eac/1.0.6/gcc-4.8.1-glibc-2.17/c3f970a/lib/libbz2.a" + +LZ4_REV=065ec7e38fe83329031f6668c43bef83eff5808b +LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/include" +LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/$LZ4_REV/r108/gcc-4.8.1-glibc-2.17/c3f970a/lib/liblz4.a" + +# location of gflags headers and libraries +GFLAGS_INCLUDE=" -I /mnt/gvfs/third-party2/gflags/1ad047a6e6f6673991918ecadc670868205a243a/1.6/gcc-4.8.1-glibc-2.17/c3f970a/include/" +GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/1ad047a6e6f6673991918ecadc670868205a243a/1.6/gcc-4.8.1-glibc-2.17/c3f970a/lib/libgflags.a" + +# location of jemalloc +JEMALLOC_INCLUDE=" -I /mnt/gvfs/third-party2/jemalloc/c60d854f7824f334195fe7fd34b2bc9057e3c1f9/3.6.0/gcc-4.8.1-glibc-2.17/4d53c6f/include" +JEMALLOC_LIB=" -Wl,--whole-archive /mnt/gvfs/third-party2/jemalloc/c60d854f7824f334195fe7fd34b2bc9057e3c1f9/3.6.0/gcc-4.8.1-glibc-2.17/4d53c6f/lib/libjemalloc.a" + +# location of numa +NUMA_REV=829d10dac0230f99cd7e1778869d2adf3da24b65 +NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/include/" +NUMA_LIB=" /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/lib/libnuma.a" + +# location of libunwind +LIBUNWIND_REV=2c060e64064559905d46fd194000d61592087bdc +LIBUNWIND="/mnt/gvfs/third-party2/libunwind/$LIBUNWIND_REV/1.1/gcc-4.8.1-glibc-2.17/675d945/lib/libunwind.a" + +# use Intel SSE support for checksum calculations +export USE_SSE=1 + +BINUTILS="/mnt/gvfs/third-party2/binutils/2aff2e7b474cd3e6ab23495ad1224b7d214b9f8e/2.21.1/centos6-native/da39a3e/bin" +AR="$BINUTILS/ar" + 
+DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE" + +GCC_BASE="/mnt/gvfs/third-party2/gcc/1ec615e23800f0815d474478ba476a0adc3fe788/4.8.1/centos6-native/cc6c9dc" +STDLIBS="-L $GCC_BASE/lib64" + +if [ -z "$USE_CLANG" ]; then + # gcc + CC="$GCC_BASE/bin/gcc" + CXX="$GCC_BASE/bin/g++" + + CFLAGS="-B$BINUTILS/gold -m64 -mtune=generic" + CFLAGS+=" -isystem $GLIBC_INCLUDE" + CFLAGS+=" -isystem $LIBGCC_INCLUDE" +else + # clang + CLANG_BASE="/mnt/gvfs/third-party2/clang/9ab68376f938992c4eb5946ca68f90c3185cffc8/3.4" + CLANG_INCLUDE="$CLANG_BASE/gcc-4.8.1-glibc-2.17/fb0f730/lib/clang/3.4/include" + CC="$CLANG_BASE/centos6-native/9cefd8a/bin/clang" + CXX="$CLANG_BASE/centos6-native/9cefd8a/bin/clang++" + + KERNEL_HEADERS_INCLUDE="/mnt/gvfs/third-party2/kernel-headers/a683ed7135276731065a9d76d3016c9731f4e2f9/3.2.18_70_fbk11_00129_gc8882d0/gcc-4.8.1-glibc-2.17/da39a3e/include/" + + CFLAGS="-B$BINUTILS/gold -nostdinc -nostdlib" + CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.8.1 " + CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.8.1/x86_64-facebook-linux " + CFLAGS+=" -isystem $GLIBC_INCLUDE" + CFLAGS+=" -isystem $LIBGCC_INCLUDE" + CFLAGS+=" -isystem $CLANG_INCLUDE" + CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux " + CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE " + CXXFLAGS="-nostdinc++" +fi + +CFLAGS+=" $DEPS_INCLUDE" +CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT" +CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DNUMA" +CXXFLAGS+=" $CFLAGS" + +EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS $NUMA_LIB" +EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so" +EXEC_LDFLAGS+=" -Wl,--no-whole-archive $LIBUNWIND" + +PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++" + +EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS" + +VALGRIND_REV=b2a9f85e4b70cd03abc85a7f3027fbc4cef35bd0 
+VALGRIND_VER="/mnt/gvfs/third-party2/valgrind/$VALGRIND_REV/3.8.1/gcc-4.8.1-glibc-2.17/c3f970a/bin/" + +export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE diff --git a/build_tools/mac-install-gflags.sh b/build_tools/mac-install-gflags.sh deleted file mode 100755 index a245a26a8..000000000 --- a/build_tools/mac-install-gflags.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/sh -# Install gflags for mac developers. - -set -e - -DIR=`mktemp -d /tmp/rocksdb_gflags_XXXX` - -cd $DIR -wget https://gflags.googlecode.com/files/gflags-2.0.tar.gz -tar xvfz gflags-2.0.tar.gz -cd gflags-2.0 - -./configure -make -make install - -# Add include/lib path for g++ -echo 'export LIBRARY_PATH+=":/usr/local/lib"' >> ~/.bash_profile -echo 'export CPATH+=":/usr/local/include"' >> ~/.bash_profile - -echo "" -echo "-----------------------------------------------------------------------------" -echo "| Installation Completed |" -echo "-----------------------------------------------------------------------------" -echo "Please run \`. ~/.bash_profile\` to be able to compile with gflags" diff --git a/build_tools/make_package.sh b/build_tools/make_package.sh new file mode 100755 index 000000000..2ca28023d --- /dev/null +++ b/build_tools/make_package.sh @@ -0,0 +1,116 @@ +#/usr/bin/env bash + +set -e + +function log() { + echo "[+] $1" +} + +function fatal() { + echo "[!] $1" + exit 1 +} + +function platform() { + local __resultvar=$1 + if [[ -f "/etc/yum.conf" ]]; then + eval $__resultvar="centos" + elif [[ -f "/etc/dpkg/dpkg.cfg" ]]; then + eval $__resultvar="ubuntu" + else + fatal "Unknwon operating system" + fi +} +platform OS + +function package() { + if [[ $OS = "ubuntu" ]]; then + if dpkg --get-selections | grep --quiet $1; then + log "$1 is already installed. skipping." + else + apt-get install $@ -y + fi + elif [[ $OS = "centos" ]]; then + if rpm -qa | grep --quiet $1; then + log "$1 is already installed. skipping." 
+ else + yum install $@ -y + fi + fi +} + +function detect_fpm_output() { + if [[ $OS = "ubuntu" ]]; then + export FPM_OUTPUT=deb + elif [[ $OS = "centos" ]]; then + export FPM_OUTPUT=rpm + fi +} +detect_fpm_output + +function gem_install() { + if gem list | grep --quiet $1; then + log "$1 is already installed. skipping." + else + gem install $@ + fi +} + +function main() { + if [[ $# -ne 1 ]]; then + fatal "Usage: $0 " + else + log "using rocksdb version: $1" + fi + + if [[ -d /vagrant ]]; then + if [[ $OS = "ubuntu" ]]; then + package g++-4.7 + export CXX=g++-4.7 + + # the deb would depend on libgflags2, but the static lib is the only thing + # installed by make install + package libgflags-dev + + package ruby-all-dev + elif [[ $OS = "centos" ]]; then + pushd /etc/yum.repos.d + if [[ ! -f /etc/yum.repos.d/devtools-1.1.repo ]]; then + wget http://people.centos.org/tru/devtools-1.1/devtools-1.1.repo + fi + package devtoolset-1.1-gcc --enablerepo=testing-1.1-devtools-6 + package devtoolset-1.1-gcc-c++ --enablerepo=testing-1.1-devtools-6 + export CC=/opt/centos/devtoolset-1.1/root/usr/bin/gcc + export CPP=/opt/centos/devtoolset-1.1/root/usr/bin/cpp + export CXX=/opt/centos/devtoolset-1.1/root/usr/bin/c++ + export PATH=$PATH:/opt/centos/devtoolset-1.1/root/usr/bin + popd + if ! rpm -qa | grep --quiet gflags; then + rpm -i https://github.com/schuhschuh/gflags/releases/download/v2.1.0/gflags-devel-2.1.0-1.amd64.rpm + fi + + package ruby + package ruby-devel + package rubygems + package rpm-build + fi + fi + gem_install fpm + + make static_lib + make install INSTALL_PATH=package + fpm \ + -s dir \ + -t $FPM_OUTPUT \ + -n rocksdb \ + -v $1 \ + --prefix /usr \ + --url http://rocksdb.org/ \ + -m rocksdb@fb.com \ + --license BSD \ + --vendor Facebook \ + --description "RocksDB is an embeddable persistent key-value store for fast storage." 
\ + package +} + +main $@ diff --git a/build_tools/regression_build_test.sh b/build_tools/regression_build_test.sh index 5e335afde..ee2d334f0 100755 --- a/build_tools/regression_build_test.sh +++ b/build_tools/regression_build_test.sh @@ -344,6 +344,38 @@ common_in_mem_args="--db=/dev/shm/rocksdb \ --threads=32 \ --writes_per_second=81920 > ${STAT_FILE}.seekwhilewriting_in_ram +# measure fillseq with bunch of column families +./db_bench \ + --benchmarks=fillseq \ + --num_column_families=500 \ + --write_buffer_size=1048576 \ + --db=$DATA_DIR \ + --use_existing_db=0 \ + --num=$NUM \ + --writes=$NUM \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 > ${STAT_FILE}.fillseq_lots_column_families + +# measure overwrite performance with bunch of column families +./db_bench \ + --benchmarks=overwrite \ + --num_column_families=500 \ + --write_buffer_size=1048576 \ + --db=$DATA_DIR \ + --use_existing_db=1 \ + --num=$NUM \ + --writes=$((NUM / 10)) \ + --open_files=55000 \ + --statistics=1 \ + --histogram=1 \ + --disable_data_sync=1 \ + --disable_wal=1 \ + --sync=0 \ + --threads=8 > ${STAT_FILE}.overwrite_lots_column_families # send data to ods function send_to_ods { @@ -392,3 +424,5 @@ send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadr send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting send_benchmark_to_ods readwhilewriting readwhilewriting_in_ram ${STAT_FILE}.readwhilewriting_in_ram send_benchmark_to_ods seekrandomwhilewriting seekwhilewriting_in_ram ${STAT_FILE}.seekwhilewriting_in_ram +send_benchmark_to_ods fillseq fillseq_lots_column_families ${STAT_FILE}.fillseq_lots_column_families +send_benchmark_to_ods overwrite overwrite_lots_column_families ${STAT_FILE}.overwrite_lots_column_families diff --git a/build_tools/unity b/build_tools/unity index 477b8f7fb..8138de542 100755 --- a/build_tools/unity +++ b/build_tools/unity @@ -54,7 +54,7 @@ case 
"$TARGET_OS" in exit 1 esac -# We want to make a list of all cc files within util, db, table, and helpers +# We want to make a list of all cc files within util, db and table # except for the test and benchmark files. By default, find will output a list # of all files matching either rule, so we need to append -print to make the # prune take effect. diff --git a/build_tools/version.sh b/build_tools/version.sh new file mode 100755 index 000000000..c5a8595fb --- /dev/null +++ b/build_tools/version.sh @@ -0,0 +1,14 @@ +#!/bin/sh +if [ "$#" = "0" ]; then + echo "Usage: $0 major|minor|patch" + exit 1 +fi +if [ "$1" = "major" ]; then + cat include/rocksdb/version.h | grep MAJOR | head -n1 | awk '{print $3}' +fi +if [ "$1" = "minor" ]; then + cat include/rocksdb/version.h | grep MINOR | head -n1 | awk '{print $3}' +fi +if [ "$1" = "patch" ]; then + cat include/rocksdb/version.h | grep PATCH | head -n1 | awk '{print $3}' +fi diff --git a/coverage/coverage_test.sh b/coverage/coverage_test.sh index 08dbd05a5..4d8052c9e 100755 --- a/coverage/coverage_test.sh +++ b/coverage/coverage_test.sh @@ -11,8 +11,8 @@ fi ROOT=".." 
# Fetch right version of gcov if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then - source $ROOT/build_tools/fbcode.gcc471.sh - GCOV=$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.7.1/cc6c9dc/bin/gcov + source $ROOT/build_tools/fbcode_config.sh + GCOV=$GCC_BASE/bin/gcov else GCOV=$(which gcov) fi diff --git a/db/builder.cc b/db/builder.cc index 1084f0413..5d3273e78 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -26,21 +26,24 @@ namespace rocksdb { class TableFactory; -TableBuilder* NewTableBuilder(const Options& options, +TableBuilder* NewTableBuilder(const ImmutableCFOptions& ioptions, const InternalKeyComparator& internal_comparator, WritableFile* file, - CompressionType compression_type) { - return options.table_factory->NewTableBuilder(options, internal_comparator, - file, compression_type); + const CompressionType compression_type, + const CompressionOptions& compression_opts) { + return ioptions.table_factory->NewTableBuilder( + ioptions, internal_comparator, file, compression_type, compression_opts); } -Status BuildTable(const std::string& dbname, Env* env, const Options& options, - const EnvOptions& soptions, TableCache* table_cache, +Status BuildTable(const std::string& dbname, Env* env, + const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, TableCache* table_cache, Iterator* iter, FileMetaData* meta, const InternalKeyComparator& internal_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, const CompressionType compression, + const CompressionOptions& compression_opts, const Env::IOPriority io_priority) { Status s; meta->fd.file_size = 0; @@ -50,33 +53,36 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options, // If the sequence number of the smallest entry in the memtable is // smaller than the most recent snapshot, then we do not trigger // removal of duplicate/deleted keys as part of this builder. 
- bool purge = options.purge_redundant_kvs_while_flush; + bool purge = ioptions.purge_redundant_kvs_while_flush; if (earliest_seqno_in_memtable <= newest_snapshot) { purge = false; } - std::string fname = TableFileName(options.db_paths, meta->fd.GetNumber(), + std::string fname = TableFileName(ioptions.db_paths, meta->fd.GetNumber(), meta->fd.GetPathId()); if (iter->Valid()) { unique_ptr file; - s = env->NewWritableFile(fname, &file, soptions); + s = env->NewWritableFile(fname, &file, env_options); if (!s.ok()) { return s; } file->SetIOPriority(io_priority); - TableBuilder* builder = - NewTableBuilder(options, internal_comparator, file.get(), compression); + TableBuilder* builder = NewTableBuilder( + ioptions, internal_comparator, file.get(), + compression, compression_opts); - // the first key is the smallest key - Slice key = iter->key(); - meta->smallest.DecodeFrom(key); - meta->smallest_seqno = GetInternalKeySeqno(key); - meta->largest_seqno = meta->smallest_seqno; + { + // the first key is the smallest key + Slice key = iter->key(); + meta->smallest.DecodeFrom(key); + meta->smallest_seqno = GetInternalKeySeqno(key); + meta->largest_seqno = meta->smallest_seqno; + } MergeHelper merge(internal_comparator.user_comparator(), - options.merge_operator.get(), options.info_log.get(), - options.min_partial_merge_operands, + ioptions.merge_operator, ioptions.info_log, + ioptions.min_partial_merge_operands, true /* internal key corruption is not ok */); if (purge) { @@ -196,12 +202,12 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options, delete builder; // Finish and check for file errors - if (s.ok() && !options.disableDataSync) { - if (options.use_fsync) { - StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS); + if (s.ok() && !ioptions.disable_data_sync) { + if (ioptions.use_fsync) { + StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS); s = file->Fsync(); } else { - StopWatch sw(env, options.statistics.get(), 
TABLE_SYNC_MICROS); + StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS); s = file->Sync(); } } @@ -211,7 +217,7 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options, if (s.ok()) { // Verify that the table is usable - Iterator* it = table_cache->NewIterator(ReadOptions(), soptions, + Iterator* it = table_cache->NewIterator(ReadOptions(), env_options, internal_comparator, meta->fd); s = it->status(); delete it; diff --git a/db/builder.h b/db/builder.h index f57501abd..cf3ebd1ae 100644 --- a/db/builder.h +++ b/db/builder.h @@ -11,6 +11,7 @@ #include "rocksdb/status.h" #include "rocksdb/types.h" #include "rocksdb/options.h" +#include "rocksdb/immutable_options.h" namespace rocksdb { @@ -26,8 +27,10 @@ class TableBuilder; class WritableFile; extern TableBuilder* NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type); + const ImmutableCFOptions& options, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts); // Build a Table file from the contents of *iter. The generated file // will be named according to number specified in meta. On success, the rest of @@ -35,13 +38,15 @@ extern TableBuilder* NewTableBuilder( // If no data is present in *iter, meta->file_size will be set to // zero, and no Table file will be produced. 
extern Status BuildTable(const std::string& dbname, Env* env, - const Options& options, const EnvOptions& soptions, + const ImmutableCFOptions& options, + const EnvOptions& env_options, TableCache* table_cache, Iterator* iter, FileMetaData* meta, const InternalKeyComparator& internal_comparator, const SequenceNumber newest_snapshot, const SequenceNumber earliest_seqno_in_memtable, const CompressionType compression, + const CompressionOptions& compression_opts, const Env::IOPriority io_priority = Env::IO_HIGH); } // namespace rocksdb diff --git a/db/c.cc b/db/c.cc index 3114f3500..55afad94e 100644 --- a/db/c.cc +++ b/db/c.cc @@ -29,6 +29,7 @@ #include "rocksdb/statistics.h" #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" +#include "rocksdb/utilities/backupable_db.h" using rocksdb::Cache; using rocksdb::ColumnFamilyDescriptor; @@ -56,6 +57,7 @@ using rocksdb::NewBloomFilterPolicy; using rocksdb::NewLRUCache; using rocksdb::Options; using rocksdb::BlockBasedTableOptions; +using rocksdb::CuckooTableOptions; using rocksdb::RandomAccessFile; using rocksdb::Range; using rocksdb::ReadOptions; @@ -68,21 +70,32 @@ using rocksdb::WritableFile; using rocksdb::WriteBatch; using rocksdb::WriteOptions; using rocksdb::LiveFileMetaData; +using rocksdb::BackupEngine; +using rocksdb::BackupableDBOptions; +using rocksdb::BackupInfo; +using rocksdb::RestoreOptions; using std::shared_ptr; extern "C" { struct rocksdb_t { DB* rep; }; +struct rocksdb_backup_engine_t { BackupEngine* rep; }; +struct rocksdb_backup_engine_info_t { std::vector rep; }; +struct rocksdb_restore_options_t { RestoreOptions rep; }; struct rocksdb_iterator_t { Iterator* rep; }; struct rocksdb_writebatch_t { WriteBatch rep; }; struct rocksdb_snapshot_t { const Snapshot* rep; }; struct rocksdb_flushoptions_t { FlushOptions rep; }; struct rocksdb_fifo_compaction_options_t { CompactionOptionsFIFO rep; }; -struct rocksdb_readoptions_t { ReadOptions rep; }; +struct rocksdb_readoptions_t { + ReadOptions rep; 
+ Slice upper_bound; // stack variable to set pointer to in ReadOptions +}; struct rocksdb_writeoptions_t { WriteOptions rep; }; struct rocksdb_options_t { Options rep; }; struct rocksdb_block_based_table_options_t { BlockBasedTableOptions rep; }; +struct rocksdb_cuckoo_table_options_t { CuckooTableOptions rep; }; struct rocksdb_seqfile_t { SequentialFile* rep; }; struct rocksdb_randomfile_t { RandomAccessFile* rep; }; struct rocksdb_writablefile_t { WritableFile* rep; }; @@ -118,7 +131,7 @@ struct rocksdb_compactionfilter_t : public CompactionFilter { const Slice& existing_value, std::string* new_value, bool* value_changed) const { - char* c_new_value = NULL; + char* c_new_value = nullptr; size_t new_value_length = 0; unsigned char c_value_changed = 0; unsigned char result = (*filter_)( @@ -385,11 +398,9 @@ struct rocksdb_mergeoperator_t : public MergeOperator { unsigned char success; size_t new_value_len; char* tmp_new_value = (*full_merge_)( - state_, - key.data(), key.size(), - existing_value_data, existing_value_len, - &operand_pointers[0], &operand_sizes[0], n, - &success, &new_value_len); + state_, key.data(), key.size(), existing_value_data, existing_value_len, + &operand_pointers[0], &operand_sizes[0], static_cast(n), &success, + &new_value_len); new_value->assign(tmp_new_value, new_value_len); if (delete_value_ != nullptr) { @@ -417,7 +428,7 @@ struct rocksdb_mergeoperator_t : public MergeOperator { size_t new_value_len; char* tmp_new_value = (*partial_merge_)( state_, key.data(), key.size(), &operand_pointers[0], &operand_sizes[0], - operand_count, &success, &new_value_len); + static_cast(operand_count), &success, &new_value_len); new_value->assign(tmp_new_value, new_value_len); if (delete_value_ != nullptr) { @@ -524,6 +535,85 @@ rocksdb_t* rocksdb_open_for_read_only( return result; } +rocksdb_backup_engine_t* rocksdb_backup_engine_open( + const rocksdb_options_t* options, const char* path, char** errptr) { + BackupEngine* be; + if (SaveError(errptr, 
BackupEngine::Open(options->rep.env, + BackupableDBOptions(path), &be))) { + return nullptr; + } + rocksdb_backup_engine_t* result = new rocksdb_backup_engine_t; + result->rep = be; + return result; +} + +void rocksdb_backup_engine_create_new_backup(rocksdb_backup_engine_t* be, + rocksdb_t* db, char** errptr) { + SaveError(errptr, be->rep->CreateNewBackup(db->rep)); +} + +rocksdb_restore_options_t* rocksdb_restore_options_create() { + return new rocksdb_restore_options_t; +} + +void rocksdb_restore_options_destroy(rocksdb_restore_options_t* opt) { + delete opt; +} + +void rocksdb_restore_options_set_keep_log_files(rocksdb_restore_options_t* opt, + int v) { + opt->rep.keep_log_files = v; +} + +void rocksdb_backup_engine_restore_db_from_latest_backup( + rocksdb_backup_engine_t* be, const char* db_dir, const char* wal_dir, + const rocksdb_restore_options_t* restore_options, char** errptr) { + SaveError(errptr, be->rep->RestoreDBFromLatestBackup(std::string(db_dir), + std::string(wal_dir), + restore_options->rep)); +} + +const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info( + rocksdb_backup_engine_t* be) { + rocksdb_backup_engine_info_t* result = new rocksdb_backup_engine_info_t; + be->rep->GetBackupInfo(&result->rep); + return result; +} + +int rocksdb_backup_engine_info_count(const rocksdb_backup_engine_info_t* info) { + return static_cast(info->rep.size()); +} + +const int64_t rocksdb_backup_engine_info_timestamp( + const rocksdb_backup_engine_info_t* info, int index) { + return info->rep[index].timestamp; +} + +const uint32_t rocksdb_backup_engine_info_backup_id( + const rocksdb_backup_engine_info_t* info, int index) { + return info->rep[index].backup_id; +} + +const uint64_t rocksdb_backup_engine_info_size( + const rocksdb_backup_engine_info_t* info, int index) { + return info->rep[index].size; +} + +const uint32_t rocksdb_backup_engine_info_number_files( + const rocksdb_backup_engine_info_t* info, int index) { + return 
info->rep[index].number_files; +} + +void rocksdb_backup_engine_info_destroy( + const rocksdb_backup_engine_info_t* info) { + delete info; +} + +void rocksdb_backup_engine_close(rocksdb_backup_engine_t* be) { + delete be->rep; + delete be; +} + void rocksdb_close(rocksdb_t* db) { delete db->rep; delete db; @@ -1123,6 +1213,51 @@ void rocksdb_options_set_block_based_table_factory( } +rocksdb_cuckoo_table_options_t* +rocksdb_cuckoo_options_create() { + return new rocksdb_cuckoo_table_options_t; +} + +void rocksdb_cuckoo_options_destroy( + rocksdb_cuckoo_table_options_t* options) { + delete options; +} + +void rocksdb_cuckoo_options_set_hash_ratio( + rocksdb_cuckoo_table_options_t* options, double v) { + options->rep.hash_table_ratio = v; +} + +void rocksdb_cuckoo_options_set_max_search_depth( + rocksdb_cuckoo_table_options_t* options, uint32_t v) { + options->rep.max_search_depth = v; +} + +void rocksdb_cuckoo_options_set_cuckoo_block_size( + rocksdb_cuckoo_table_options_t* options, uint32_t v) { + options->rep.cuckoo_block_size = v; +} + +void rocksdb_cuckoo_options_set_identity_as_first_hash( + rocksdb_cuckoo_table_options_t* options, unsigned char v) { + options->rep.identity_as_first_hash = v; +} + +void rocksdb_cuckoo_options_set_use_module_hash( + rocksdb_cuckoo_table_options_t* options, unsigned char v) { + options->rep.use_module_hash = v; +} + +void rocksdb_options_set_cuckoo_table_factory( + rocksdb_options_t *opt, + rocksdb_cuckoo_table_options_t* table_options) { + if (table_options) { + opt->rep.table_factory.reset( + rocksdb::NewCuckooTableFactory(table_options->rep)); + } +} + + rocksdb_options_t* rocksdb_options_create() { return new rocksdb_options_t; } @@ -1216,6 +1351,11 @@ void rocksdb_options_set_info_log_level( opt->rep.info_log_level = static_cast(v); } +void rocksdb_options_set_db_write_buffer_size(rocksdb_options_t* opt, + size_t s) { + opt->rep.db_write_buffer_size = s; +} + void rocksdb_options_set_write_buffer_size(rocksdb_options_t* opt, 
size_t s) { opt->rep.write_buffer_size = s; } @@ -1224,6 +1364,10 @@ void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) { opt->rep.max_open_files = n; } +void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt, uint64_t n) { + opt->rep.max_total_wal_size = n; +} + void rocksdb_options_set_target_file_size_base( rocksdb_options_t* opt, uint64_t n) { opt->rep.target_file_size_base = n; @@ -1355,8 +1499,8 @@ void rocksdb_options_set_purge_redundant_kvs_while_flush( opt->rep.purge_redundant_kvs_while_flush = v; } -void rocksdb_options_set_allow_os_buffer( - rocksdb_options_t* opt, unsigned char v) { +void rocksdb_options_set_allow_os_buffer(rocksdb_options_t* opt, + unsigned char v) { opt->rep.allow_os_buffer = v; } @@ -1581,11 +1725,6 @@ void rocksdb_options_set_bloom_locality( opt->rep.bloom_locality = v; } -void rocksdb_options_set_allow_thread_local( - rocksdb_options_t* opt, unsigned char v) { - opt->rep.allow_thread_local = v; -} - void rocksdb_options_set_inplace_update_support( rocksdb_options_t* opt, unsigned char v) { opt->rep.inplace_update_support = v; @@ -1844,6 +1983,19 @@ void rocksdb_readoptions_set_snapshot( opt->rep.snapshot = (snap ? 
snap->rep : nullptr); } +void rocksdb_readoptions_set_iterate_upper_bound( + rocksdb_readoptions_t* opt, + const char* key, size_t keylen) { + if (key == nullptr) { + opt->upper_bound = Slice(); + opt->rep.iterate_upper_bound = nullptr; + + } else { + opt->upper_bound = Slice(key, keylen); + opt->rep.iterate_upper_bound = &opt->upper_bound; + } +} + void rocksdb_readoptions_set_read_tier( rocksdb_readoptions_t* opt, int v) { opt->rep.read_tier = static_cast(v); @@ -2039,7 +2191,7 @@ void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level int rocksdb_livefiles_count( const rocksdb_livefiles_t* lf) { - return lf->rep.size(); + return static_cast(lf->rep.size()); } const char* rocksdb_livefiles_name( diff --git a/db/c_test.c b/db/c_test.c index 171fd6d5c..b8f0ea186 100644 --- a/db/c_test.c +++ b/db/c_test.c @@ -10,9 +10,11 @@ #include #include #include +#include const char* phase = ""; static char dbname[200]; +static char dbbackupname[200]; static void StartPhase(const char* name) { fprintf(stderr, "=== Test %s\n", name); @@ -132,7 +134,7 @@ static void CmpDestroy(void* arg) { } static int CmpCompare(void* arg, const char* a, size_t alen, const char* b, size_t blen) { - int n = (alen < blen) ? alen : blen; + size_t n = (alen < blen) ? 
alen : blen; int r = memcmp(a, b, n); if (r == 0) { if (alen < blen) r = -1; @@ -346,6 +348,11 @@ int main(int argc, char** argv) { GetTempDir(), ((int) geteuid())); + snprintf(dbbackupname, sizeof(dbbackupname), + "%s/rocksdb_c_test-%d-backup", + GetTempDir(), + ((int) geteuid())); + StartPhase("create_objects"); cmp = rocksdb_comparator_create(NULL, CmpDestroy, CmpCompare, CmpName); env = rocksdb_create_default_env(); @@ -396,6 +403,41 @@ int main(int argc, char** argv) { CheckNoError(err); CheckGet(db, roptions, "foo", "hello"); + StartPhase("backup_and_restore"); + { + rocksdb_destroy_db(options, dbbackupname, &err); + CheckNoError(err); + + rocksdb_backup_engine_t *be = rocksdb_backup_engine_open(options, dbbackupname, &err); + CheckNoError(err); + + rocksdb_backup_engine_create_new_backup(be, db, &err); + CheckNoError(err); + + rocksdb_delete(db, woptions, "foo", 3, &err); + CheckNoError(err); + + rocksdb_close(db); + + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + + rocksdb_restore_options_t *restore_options = rocksdb_restore_options_create(); + rocksdb_restore_options_set_keep_log_files(restore_options, 0); + rocksdb_backup_engine_restore_db_from_latest_backup(be, dbname, dbname, restore_options, &err); + CheckNoError(err); + rocksdb_restore_options_destroy(restore_options); + + rocksdb_options_set_error_if_exists(options, 0); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + rocksdb_options_set_error_if_exists(options, 1); + + CheckGet(db, roptions, "foo", "hello"); + + rocksdb_backup_engine_close(be); + } + StartPhase("compactall"); rocksdb_compact_range(db, NULL, 0, NULL, 0); CheckGet(db, roptions, "foo", "hello"); @@ -576,37 +618,39 @@ int main(int argc, char** argv) { StartPhase("compaction_filter"); { - rocksdb_options_t* options = rocksdb_options_create(); - rocksdb_options_set_create_if_missing(options, 1); + rocksdb_options_t* options_with_filter = rocksdb_options_create(); + 
rocksdb_options_set_create_if_missing(options_with_filter, 1); rocksdb_compactionfilter_t* cfilter; cfilter = rocksdb_compactionfilter_create(NULL, CFilterDestroy, CFilterFilter, CFilterName); // Create new database rocksdb_close(db); - rocksdb_destroy_db(options, dbname, &err); - rocksdb_options_set_compaction_filter(options, cfilter); - db = CheckCompaction(db, options, roptions, woptions); + rocksdb_destroy_db(options_with_filter, dbname, &err); + rocksdb_options_set_compaction_filter(options_with_filter, cfilter); + db = CheckCompaction(db, options_with_filter, roptions, woptions); - rocksdb_options_set_compaction_filter(options, NULL); + rocksdb_options_set_compaction_filter(options_with_filter, NULL); rocksdb_compactionfilter_destroy(cfilter); - rocksdb_options_destroy(options); + rocksdb_options_destroy(options_with_filter); } StartPhase("compaction_filter_factory"); { - rocksdb_options_t* options = rocksdb_options_create(); - rocksdb_options_set_create_if_missing(options, 1); + rocksdb_options_t* options_with_filter_factory = rocksdb_options_create(); + rocksdb_options_set_create_if_missing(options_with_filter_factory, 1); rocksdb_compactionfilterfactory_t* factory; factory = rocksdb_compactionfilterfactory_create( NULL, CFilterFactoryDestroy, CFilterCreate, CFilterFactoryName); // Create new database rocksdb_close(db); - rocksdb_destroy_db(options, dbname, &err); - rocksdb_options_set_compaction_filter_factory(options, factory); - db = CheckCompaction(db, options, roptions, woptions); - - rocksdb_options_set_compaction_filter_factory(options, NULL); - rocksdb_options_destroy(options); + rocksdb_destroy_db(options_with_filter_factory, dbname, &err); + rocksdb_options_set_compaction_filter_factory(options_with_filter_factory, + factory); + db = CheckCompaction(db, options_with_filter_factory, roptions, woptions); + + rocksdb_options_set_compaction_filter_factory( + options_with_filter_factory, NULL); + rocksdb_options_destroy(options_with_filter_factory); } 
StartPhase("compaction_filter_v2"); @@ -799,8 +843,87 @@ int main(int argc, char** argv) { rocksdb_iter_get_error(iter, &err); CheckNoError(err); rocksdb_iter_destroy(iter); + + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); } + StartPhase("cuckoo_options"); + { + rocksdb_cuckoo_table_options_t* cuckoo_options; + cuckoo_options = rocksdb_cuckoo_options_create(); + rocksdb_cuckoo_options_set_hash_ratio(cuckoo_options, 0.5); + rocksdb_cuckoo_options_set_max_search_depth(cuckoo_options, 200); + rocksdb_cuckoo_options_set_cuckoo_block_size(cuckoo_options, 10); + rocksdb_cuckoo_options_set_identity_as_first_hash(cuckoo_options, 1); + rocksdb_cuckoo_options_set_use_module_hash(cuckoo_options, 0); + rocksdb_options_set_cuckoo_table_factory(options, cuckoo_options); + + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + + rocksdb_cuckoo_options_destroy(cuckoo_options); + } + + StartPhase("iterate_upper_bound"); + { + // Create new empty database + rocksdb_close(db); + rocksdb_destroy_db(options, dbname, &err); + CheckNoError(err); + + rocksdb_options_set_prefix_extractor(options, NULL); + db = rocksdb_open(options, dbname, &err); + CheckNoError(err); + + rocksdb_put(db, woptions, "a", 1, "0", 1, &err); CheckNoError(err); + rocksdb_put(db, woptions, "foo", 3, "bar", 3, &err); CheckNoError(err); + rocksdb_put(db, woptions, "foo1", 4, "bar1", 4, &err); CheckNoError(err); + rocksdb_put(db, woptions, "g1", 2, "0", 1, &err); CheckNoError(err); + + // testing basic case with no iterate_upper_bound and no prefix_extractor + { + rocksdb_readoptions_set_iterate_upper_bound(roptions, NULL, 0); + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + + rocksdb_iter_seek(iter, "foo", 3); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "foo", "bar"); + + rocksdb_iter_next(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "foo1", "bar1"); + + rocksdb_iter_next(iter); + CheckCondition(rocksdb_iter_valid(iter)); + 
CheckIter(iter, "g1", "0"); + + rocksdb_iter_destroy(iter); + } + + // testing iterate_upper_bound and forward iterator + // to make sure it stops at bound + { + // iterate_upper_bound points beyond the last expected entry + rocksdb_readoptions_set_iterate_upper_bound(roptions, "foo2", 4); + + rocksdb_iterator_t* iter = rocksdb_create_iterator(db, roptions); + + rocksdb_iter_seek(iter, "foo", 3); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "foo", "bar"); + + rocksdb_iter_next(iter); + CheckCondition(rocksdb_iter_valid(iter)); + CheckIter(iter, "foo1", "bar1"); + + rocksdb_iter_next(iter); + // should stop here... + CheckCondition(!rocksdb_iter_valid(iter)); + + rocksdb_iter_destroy(iter); + } + } StartPhase("cleanup"); rocksdb_close(db); diff --git a/db/column_family.cc b/db/column_family.cc index b1c9ba7e8..ea3e617e2 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -9,24 +9,65 @@ #include "db/column_family.h" +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include #include #include #include #include +#include "db/compaction_picker.h" #include "db/db_impl.h" +#include "db/job_context.h" #include "db/version_set.h" +#include "db/writebuffer.h" #include "db/internal_stats.h" -#include "db/compaction_picker.h" +#include "db/job_context.h" #include "db/table_properties_collector.h" +#include "db/version_set.h" +#include "db/write_controller.h" #include "util/autovector.h" #include "util/hash_skiplist_rep.h" +#include "util/options_helper.h" namespace rocksdb { -ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd, - DBImpl* db, port::Mutex* mutex) - : cfd_(cfd), db_(db), mutex_(mutex) { +namespace { +// This function computes the amount of time in microseconds by which a write +// should be delayed based on the number of level-0 files according to the +// following formula: +// if n < bottom, return 0; +// if n >= top, return 1000; +// otherwise, let r = (n - bottom) / +// (top - bottom) +// and 
return r^2 * 1000. +// The goal of this formula is to gradually increase the rate at which writes +// are slowed. We also tried linear delay (r * 1000), but it seemed to do +// slightly worse. There is no other particular reason for choosing quadratic. +uint64_t SlowdownAmount(int n, double bottom, double top) { + uint64_t delay; + if (n >= top) { + delay = 1000; + } else if (n < bottom) { + delay = 0; + } else { + // If we are here, we know that: + // level0_start_slowdown <= n < level0_slowdown + // since the previous two conditions are false. + double how_much = static_cast(n - bottom) / (top - bottom); + delay = std::max(how_much * how_much * 1000, 100.0); + } + assert(delay <= 1000); + return delay; +} +} // namespace + +ColumnFamilyHandleImpl::ColumnFamilyHandleImpl( + ColumnFamilyData* column_family_data, DBImpl* db, InstrumentedMutex* mutex) + : cfd_(column_family_data), db_(db), mutex_(mutex) { if (cfd_ != nullptr) { cfd_->Ref(); } @@ -34,21 +75,29 @@ ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd, ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() { if (cfd_ != nullptr) { - DBImpl::DeletionState deletion_state; + JobContext job_context; mutex_->Lock(); if (cfd_->Unref()) { delete cfd_; } - db_->FindObsoleteFiles(deletion_state, false, true); + db_->FindObsoleteFiles(&job_context, false, true); mutex_->Unlock(); - if (deletion_state.HaveSomethingToDelete()) { - db_->PurgeObsoleteFiles(deletion_state); + if (job_context.HaveSomethingToDelete()) { + db_->PurgeObsoleteFiles(job_context); } } } uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); } +const std::string& ColumnFamilyHandleImpl::GetName() const { + return cfd()->GetName(); +} + +const Comparator* ColumnFamilyHandleImpl::user_comparator() const { + return cfd()->user_comparator(); +} + ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp, const ColumnFamilyOptions& src) { ColumnFamilyOptions result = src; @@ -132,7 +181,7 @@ SuperVersion* 
SuperVersion::Ref() { bool SuperVersion::Unref() { // fetch_sub returns the previous value of ref - uint32_t previous_refs = refs.fetch_sub(1, std::memory_order_relaxed); + uint32_t previous_refs = refs.fetch_sub(1); assert(previous_refs > 0); return previous_refs == 1; } @@ -174,20 +223,22 @@ void SuperVersionUnrefHandle(void* ptr) { } } // anonymous namespace -ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, - Version* dummy_versions, Cache* table_cache, - const ColumnFamilyOptions& options, - const DBOptions* db_options, - const EnvOptions& storage_options, - ColumnFamilySet* column_family_set) +ColumnFamilyData::ColumnFamilyData( + uint32_t id, const std::string& name, Version* _dummy_versions, + Cache* _table_cache, WriteBuffer* write_buffer, + const ColumnFamilyOptions& cf_options, const DBOptions* db_options, + const EnvOptions& env_options, ColumnFamilySet* column_family_set) : id_(id), name_(name), - dummy_versions_(dummy_versions), + dummy_versions_(_dummy_versions), current_(nullptr), refs_(0), dropped_(false), - internal_comparator_(options.comparator), - options_(*db_options, SanitizeOptions(&internal_comparator_, options)), + internal_comparator_(cf_options.comparator), + options_(*db_options, SanitizeOptions(&internal_comparator_, cf_options)), + ioptions_(options_), + mutable_cf_options_(options_, ioptions_), + write_buffer_(write_buffer), mem_(nullptr), imm_(options_.min_write_buffer_number_to_merge), super_version_(nullptr), @@ -196,48 +247,69 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, next_(nullptr), prev_(nullptr), log_number_(0), - need_slowdown_for_num_level0_files_(false), - column_family_set_(column_family_set) { + column_family_set_(column_family_set), + pending_flush_(false), + pending_compaction_(false) { Ref(); - // if dummy_versions is nullptr, then this is a dummy column family. 
- if (dummy_versions != nullptr) { + // if _dummy_versions is nullptr, then this is a dummy column family. + if (_dummy_versions != nullptr) { internal_stats_.reset( - new InternalStats(options_.num_levels, db_options->env, this)); - table_cache_.reset(new TableCache(&options_, storage_options, table_cache)); - if (options_.compaction_style == kCompactionStyleUniversal) { + new InternalStats(ioptions_.num_levels, db_options->env, this)); + table_cache_.reset(new TableCache(ioptions_, env_options, _table_cache)); + if (ioptions_.compaction_style == kCompactionStyleLevel) { compaction_picker_.reset( - new UniversalCompactionPicker(&options_, &internal_comparator_)); - } else if (options_.compaction_style == kCompactionStyleLevel) { + new LevelCompactionPicker(ioptions_, &internal_comparator_)); +#ifndef ROCKSDB_LITE + } else if (ioptions_.compaction_style == kCompactionStyleUniversal) { compaction_picker_.reset( - new LevelCompactionPicker(&options_, &internal_comparator_)); + new UniversalCompactionPicker(ioptions_, &internal_comparator_)); + } else if (ioptions_.compaction_style == kCompactionStyleFIFO) { + compaction_picker_.reset( + new FIFOCompactionPicker(ioptions_, &internal_comparator_)); + } else if (ioptions_.compaction_style == kCompactionStyleNone) { + compaction_picker_.reset(new NullCompactionPicker( + ioptions_, &internal_comparator_)); + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, + "Column family %s does not use any background compaction. " + "Compactions can only be done via CompactFiles\n", + GetName().c_str()); +#endif // !ROCKSDB_LITE } else { - assert(options_.compaction_style == kCompactionStyleFIFO); + Log(InfoLogLevel::ERROR_LEVEL, ioptions_.info_log, + "Unable to recognize the specified compaction style %d. 
" + "Column family %s will use kCompactionStyleLevel.\n", + ioptions_.compaction_style, GetName().c_str()); compaction_picker_.reset( - new FIFOCompactionPicker(&options_, &internal_comparator_)); + new LevelCompactionPicker(ioptions_, &internal_comparator_)); } - Log(options_.info_log, "Options for column family \"%s\":\n", - name.c_str()); - const ColumnFamilyOptions* cf_options = &options_; - cf_options->Dump(options_.info_log.get()); + if (column_family_set_->NumberOfColumnFamilies() < 10) { + Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log, + "--------------- Options for column family [%s]:\n", name.c_str()); + options_.Dump(ioptions_.info_log); + } else { + Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log, + "\t(skipping printing options)\n"); + } } - RecalculateWriteStallConditions(); + RecalculateWriteStallConditions(mutable_cf_options_); } // DB mutex held ColumnFamilyData::~ColumnFamilyData() { - assert(refs_ == 0); + assert(refs_.load(std::memory_order_relaxed) == 0); // remove from linked list auto prev = prev_; auto next = next_; prev->next_ = next; next->prev_ = prev; - // it's nullptr for dummy CFD - if (column_family_set_ != nullptr) { - // remove from column_family_set + if (!dropped_ && column_family_set_ != nullptr) { + // If it's dropped, it's already removed from column family set + // If column_family_set_ == nullptr, this is dummy CFD and not in + // ColumnFamilySet column_family_set_->RemoveColumnFamily(this); } @@ -245,6 +317,11 @@ ColumnFamilyData::~ColumnFamilyData() { current_->Unref(); } + // It would be wrong if this ColumnFamilyData is in flush_queue_ or + // compaction_queue_ and we destroyed it + assert(!pending_flush_); + assert(!pending_compaction_); + if (super_version_ != nullptr) { // Release SuperVersion reference kept in ThreadLocalPtr. // This must be done outside of mutex_ since unref handler can lock mutex. 
@@ -262,8 +339,9 @@ ColumnFamilyData::~ColumnFamilyData() { if (dummy_versions_ != nullptr) { // List must be empty - assert(dummy_versions_->next_ == dummy_versions_); - delete dummy_versions_; + assert(dummy_versions_->TEST_Next() == dummy_versions_); + bool deleted __attribute__((unused)) = dummy_versions_->Unref(); + assert(deleted); } if (mem_ != nullptr) { @@ -276,90 +354,146 @@ ColumnFamilyData::~ColumnFamilyData() { } } -void ColumnFamilyData::RecalculateWriteStallConditions() { - need_wait_for_num_memtables_ = - (imm()->size() == options()->max_write_buffer_number - 1); +void ColumnFamilyData::SetDropped() { + // can't drop default CF + assert(id_ != 0); + dropped_ = true; + write_controller_token_.reset(); - if (current_ != nullptr) { - need_wait_for_num_level0_files_ = - (current_->NumLevelFiles(0) >= options()->level0_stop_writes_trigger); - } else { - need_wait_for_num_level0_files_ = false; - } - - RecalculateWriteStallRateLimitsConditions(); + // remove from column_family_set + column_family_set_->RemoveColumnFamily(this); } -void ColumnFamilyData::RecalculateWriteStallRateLimitsConditions() { +void ColumnFamilyData::RecalculateWriteStallConditions( + const MutableCFOptions& mutable_cf_options) { if (current_ != nullptr) { - exceeds_hard_rate_limit_ = - (options()->hard_rate_limit > 1.0 && - current_->MaxCompactionScore() > options()->hard_rate_limit); - - exceeds_soft_rate_limit_ = - (options()->soft_rate_limit > 0.0 && - current_->MaxCompactionScore() > options()->soft_rate_limit); - } else { - exceeds_hard_rate_limit_ = false; - exceeds_soft_rate_limit_ = false; + auto* vstorage = current_->storage_info(); + const double score = vstorage->max_compaction_score(); + const int max_level = vstorage->max_compaction_score_level(); + + auto write_controller = column_family_set_->write_controller_; + + if (imm()->size() >= mutable_cf_options.max_write_buffer_number) { + write_controller_token_ = write_controller->GetStopToken(); + 
internal_stats_->AddCFStats(InternalStats::MEMTABLE_COMPACTION, 1); + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, + "[%s] Stopping writes because we have %d immutable memtables " + "(waiting for flush), max_write_buffer_number is set to %d", + name_.c_str(), imm()->size(), + mutable_cf_options.max_write_buffer_number); + } else if (vstorage->NumLevelFiles(0) >= + mutable_cf_options.level0_stop_writes_trigger) { + write_controller_token_ = write_controller->GetStopToken(); + internal_stats_->AddCFStats(InternalStats::LEVEL0_NUM_FILES, 1); + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, + "[%s] Stopping writes because we have %d level-0 files", + name_.c_str(), vstorage->NumLevelFiles(0)); + } else if (mutable_cf_options.level0_slowdown_writes_trigger >= 0 && + vstorage->NumLevelFiles(0) >= + mutable_cf_options.level0_slowdown_writes_trigger) { + uint64_t slowdown = + SlowdownAmount(vstorage->NumLevelFiles(0), + mutable_cf_options.level0_slowdown_writes_trigger, + mutable_cf_options.level0_stop_writes_trigger); + write_controller_token_ = write_controller->GetDelayToken(slowdown); + internal_stats_->AddCFStats(InternalStats::LEVEL0_SLOWDOWN, slowdown); + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, + "[%s] Stalling writes because we have %d level-0 files (%" PRIu64 + "us)", + name_.c_str(), vstorage->NumLevelFiles(0), slowdown); + } else if (mutable_cf_options.hard_rate_limit > 1.0 && + score > mutable_cf_options.hard_rate_limit) { + uint64_t kHardLimitSlowdown = 1000; + write_controller_token_ = + write_controller->GetDelayToken(kHardLimitSlowdown); + internal_stats_->RecordLevelNSlowdown(max_level, kHardLimitSlowdown, + false); + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, + "[%s] Stalling writes because we hit hard limit on level %d. 
" + "(%" PRIu64 "us)", + name_.c_str(), max_level, kHardLimitSlowdown); + } else if (mutable_cf_options.soft_rate_limit > 0.0 && + score > mutable_cf_options.soft_rate_limit) { + uint64_t slowdown = SlowdownAmount(score, + mutable_cf_options.soft_rate_limit, + mutable_cf_options.hard_rate_limit); + write_controller_token_ = write_controller->GetDelayToken(slowdown); + internal_stats_->RecordLevelNSlowdown(max_level, slowdown, true); + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, + "[%s] Stalling writes because we hit soft limit on level %d (%" PRIu64 + "us)", + name_.c_str(), max_level, slowdown); + } else { + write_controller_token_.reset(); + } } } const EnvOptions* ColumnFamilyData::soptions() const { - return &(column_family_set_->storage_options_); + return &(column_family_set_->env_options_); } -void ColumnFamilyData::SetCurrent(Version* current) { - current_ = current; - need_slowdown_for_num_level0_files_ = - (options_.level0_slowdown_writes_trigger >= 0 && - current_->NumLevelFiles(0) >= options_.level0_slowdown_writes_trigger); +void ColumnFamilyData::SetCurrent(Version* current_version) { + current_ = current_version; } -void ColumnFamilyData::CreateNewMemtable() { +MemTable* ColumnFamilyData::ConstructNewMemtable( + const MutableCFOptions& mutable_cf_options) { assert(current_ != nullptr); + return new MemTable(internal_comparator_, ioptions_, + mutable_cf_options, write_buffer_); +} + +void ColumnFamilyData::CreateNewMemtable( + const MutableCFOptions& mutable_cf_options) { if (mem_ != nullptr) { delete mem_->Unref(); } - mem_ = new MemTable(internal_comparator_, options_); + SetMemtable(ConstructNewMemtable(mutable_cf_options)); mem_->Ref(); } -Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) { - auto result = compaction_picker_->PickCompaction(current_, log_buffer); - RecalculateWriteStallRateLimitsConditions(); +bool ColumnFamilyData::NeedsCompaction() const { + return 
compaction_picker_->NeedsCompaction(current_->storage_info()); +} + +Compaction* ColumnFamilyData::PickCompaction( + const MutableCFOptions& mutable_options, LogBuffer* log_buffer) { + auto* result = compaction_picker_->PickCompaction( + GetName(), mutable_options, current_->storage_info(), log_buffer); + if (result != nullptr) { + result->SetInputVersion(current_); + } return result; } -Compaction* ColumnFamilyData::CompactRange(int input_level, int output_level, - uint32_t output_path_id, - const InternalKey* begin, - const InternalKey* end, - InternalKey** compaction_end) { - return compaction_picker_->CompactRange(current_, input_level, output_level, - output_path_id, begin, end, - compaction_end); +Compaction* ColumnFamilyData::CompactRange( + const MutableCFOptions& mutable_cf_options, + int input_level, int output_level, uint32_t output_path_id, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end) { + auto* result = compaction_picker_->CompactRange( + GetName(), mutable_cf_options, current_->storage_info(), input_level, + output_level, output_path_id, begin, end, compaction_end); + if (result != nullptr) { + result->SetInputVersion(current_); + } + return result; } SuperVersion* ColumnFamilyData::GetReferencedSuperVersion( - port::Mutex* db_mutex) { + InstrumentedMutex* db_mutex) { SuperVersion* sv = nullptr; - if (LIKELY(column_family_set_->db_options_->allow_thread_local)) { - sv = GetThreadLocalSuperVersion(db_mutex); - sv->Ref(); - if (!ReturnThreadLocalSuperVersion(sv)) { - sv->Unref(); - } - } else { - db_mutex->Lock(); - sv = super_version_->Ref(); - db_mutex->Unlock(); + sv = GetThreadLocalSuperVersion(db_mutex); + sv->Ref(); + if (!ReturnThreadLocalSuperVersion(sv)) { + sv->Unref(); } return sv; } SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion( - port::Mutex* db_mutex) { + InstrumentedMutex* db_mutex) { SuperVersion* sv = nullptr; // The SuperVersion is cached in thread local storage to avoid acquiring // 
mutex when SuperVersion does not change since the last use. When a new @@ -382,11 +516,11 @@ SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion( sv = static_cast(ptr); if (sv == SuperVersion::kSVObsolete || sv->version_number != super_version_number_.load()) { - RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_ACQUIRES); + RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_ACQUIRES); SuperVersion* sv_to_delete = nullptr; if (sv && sv->Unref()) { - RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_CLEANUPS); + RecordTick(ioptions_.statistics, NUMBER_SUPERVERSION_CLEANUPS); db_mutex->Lock(); // NOTE: underlying resources held by superversion (sst files) might // not be released until the next background job. @@ -422,20 +556,68 @@ bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) { return false; } +void ColumnFamilyData::NotifyOnCompactionCompleted( + DB* db, Compaction* c, const Status& status) { +#ifndef ROCKSDB_LITE + auto listeners = ioptions()->listeners; + CompactionJobInfo info; + info.cf_name = c->column_family_data()->GetName(); + info.status = status; + info.output_level = c->output_level(); + for (const auto fmd : *c->inputs(c->level())) { + info.input_files.push_back( + TableFileName(options_.db_paths, + fmd->fd.GetNumber(), + fmd->fd.GetPathId())); + } + for (const auto newf : c->edit()->GetNewFiles()) { + info.input_files.push_back( + TableFileName(options_.db_paths, + newf.second.fd.GetNumber(), + newf.second.fd.GetPathId())); + } + for (auto listener : listeners) { + listener->OnCompactionCompleted(db, info); + } +#endif // ROCKSDB_LITE +} + +void ColumnFamilyData::NotifyOnFlushCompleted( + DB* db, const std::string& file_path, + bool triggered_flush_slowdown, + bool triggered_flush_stop) { + +#ifndef ROCKSDB_LITE + auto listeners = ioptions()->listeners; + for (auto listener : listeners) { + listener->OnFlushCompleted( + db, GetName(), file_path, + // Use path 0 as fulled memtables are first flushed into 
path 0. + triggered_flush_slowdown, triggered_flush_stop); + } +#endif // ROCKSDB_LITE +} + +SuperVersion* ColumnFamilyData::InstallSuperVersion( + SuperVersion* new_superversion, InstrumentedMutex* db_mutex) { + db_mutex->AssertHeld(); + return InstallSuperVersion(new_superversion, db_mutex, mutable_cf_options_); +} + SuperVersion* ColumnFamilyData::InstallSuperVersion( - SuperVersion* new_superversion, port::Mutex* db_mutex) { + SuperVersion* new_superversion, InstrumentedMutex* db_mutex, + const MutableCFOptions& mutable_cf_options) { new_superversion->db_mutex = db_mutex; + new_superversion->mutable_cf_options = mutable_cf_options; new_superversion->Init(mem_, imm_.current(), current_); SuperVersion* old_superversion = super_version_; super_version_ = new_superversion; ++super_version_number_; super_version_->version_number = super_version_number_; // Reset SuperVersions cached in thread local storage - if (column_family_set_->db_options_->allow_thread_local) { - ResetThreadLocalSuperVersions(); - } + ResetThreadLocalSuperVersions(); - RecalculateWriteStallConditions(); + RecalculateWriteStallConditions(mutable_cf_options); if (old_superversion != nullptr && old_superversion->Unref()) { old_superversion->Cleanup(); @@ -460,20 +642,37 @@ void ColumnFamilyData::ResetThreadLocalSuperVersions() { } } +#ifndef ROCKSDB_LITE +Status ColumnFamilyData::SetOptions( + const std::unordered_map& options_map) { + MutableCFOptions new_mutable_cf_options; + Status s = GetMutableOptionsFromStrings(mutable_cf_options_, options_map, + &new_mutable_cf_options); + if (s.ok()) { + mutable_cf_options_ = new_mutable_cf_options; + mutable_cf_options_.RefreshDerivedOptions(ioptions_); + } + return s; +} +#endif // ROCKSDB_LITE + ColumnFamilySet::ColumnFamilySet(const std::string& dbname, const DBOptions* db_options, - const EnvOptions& storage_options, - Cache* table_cache) + const EnvOptions& env_options, + Cache* table_cache, + WriteBuffer* write_buffer, + WriteController* 
write_controller) : max_column_family_(0), - dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr, + dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr, nullptr, ColumnFamilyOptions(), db_options, - storage_options_, nullptr)), + env_options, nullptr)), default_cfd_cache_(nullptr), db_name_(dbname), db_options_(db_options), - storage_options_(storage_options), + env_options_(env_options), table_cache_(table_cache), - spin_lock_(ATOMIC_FLAG_INIT) { + write_buffer_(write_buffer), + write_controller_(write_controller) { // initialize linked list dummy_cfd_->prev_ = dummy_cfd_; dummy_cfd_->next_ = dummy_cfd_; @@ -530,18 +729,17 @@ size_t ColumnFamilySet::NumberOfColumnFamilies() const { return column_families_.size(); } -// under a DB mutex +// under a DB mutex AND write thread ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( const std::string& name, uint32_t id, Version* dummy_versions, const ColumnFamilyOptions& options) { assert(column_families_.find(name) == column_families_.end()); ColumnFamilyData* new_cfd = - new ColumnFamilyData(id, name, dummy_versions, table_cache_, options, - db_options_, storage_options_, this); - Lock(); + new ColumnFamilyData(id, name, dummy_versions, table_cache_, + write_buffer_, options, db_options_, + env_options_, this); column_families_.insert({name, id}); column_family_data_.insert({id, new_cfd}); - Unlock(); max_column_family_ = std::max(max_column_family_, id); // add to linked list new_cfd->next_ = dummy_cfd_; @@ -555,19 +753,11 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily( return new_cfd; } -void ColumnFamilySet::Lock() { - // spin lock - while (spin_lock_.test_and_set(std::memory_order_acquire)) { - } -} - -void ColumnFamilySet::Unlock() { spin_lock_.clear(std::memory_order_release); } - // REQUIRES: DB mutex held void ColumnFamilySet::FreeDeadColumnFamilies() { autovector to_delete; for (auto cfd = dummy_cfd_->next_; cfd != dummy_cfd_; cfd = cfd->next_) { - if (cfd->refs_ == 0) { + if 
(cfd->refs_.load(std::memory_order_relaxed) == 0) { to_delete.push_back(cfd); } } @@ -577,25 +767,21 @@ void ColumnFamilySet::FreeDeadColumnFamilies() { } } -// under a DB mutex +// under a DB mutex AND from a write thread void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) { auto cfd_iter = column_family_data_.find(cfd->GetID()); assert(cfd_iter != column_family_data_.end()); - Lock(); column_family_data_.erase(cfd_iter); column_families_.erase(cfd->GetName()); - Unlock(); } +// under a DB mutex OR from a write thread bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) { if (column_family_id == 0) { // optimization for common case current_ = column_family_set_->GetDefault(); } else { - // maybe outside of db mutex, should lock - column_family_set_->Lock(); current_ = column_family_set_->GetColumnFamily(column_family_id); - column_family_set_->Unlock(); } handle_.SetCFD(current_); return current_ != nullptr; @@ -611,16 +797,18 @@ MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const { return current_->mem(); } -const Options* ColumnFamilyMemTablesImpl::GetOptions() const { - assert(current_ != nullptr); - return current_->options(); -} - ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() { assert(current_ != nullptr); return &handle_; } +void ColumnFamilyMemTablesImpl::CheckMemtableFull() { + if (current_ != nullptr && current_->mem()->ShouldScheduleFlush()) { + flush_scheduler_->ScheduleFlush(current_); + current_->mem()->MarkFlushScheduled(); + } +} + uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) { uint32_t column_family_id = 0; if (column_family != nullptr) { @@ -630,4 +818,13 @@ uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) { return column_family_id; } +const Comparator* GetColumnFamilyUserComparator( + ColumnFamilyHandle* column_family) { + if (column_family != nullptr) { + auto cfh = reinterpret_cast(column_family); + return cfh->user_comparator(); + } + return nullptr; +} + } // 
namespace rocksdb diff --git a/db/column_family.h b/db/column_family.h index 33bceadc6..84b01dc71 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -19,7 +19,11 @@ #include "rocksdb/env.h" #include "db/memtable_list.h" #include "db/write_batch_internal.h" +#include "db/write_controller.h" #include "db/table_cache.h" +#include "db/flush_scheduler.h" +#include "util/instrumented_mutex.h" +#include "util/mutable_cf_options.h" #include "util/thread_local.h" namespace rocksdb { @@ -35,6 +39,8 @@ class InternalStats; class ColumnFamilyData; class DBImpl; class LogBuffer; +class InstrumentedMutex; +class InstrumentedMutexLock; // ColumnFamilyHandleImpl is the class that clients use to access different // column families. It has non-trivial destructor, which gets called when client @@ -42,17 +48,20 @@ class LogBuffer; class ColumnFamilyHandleImpl : public ColumnFamilyHandle { public: // create while holding the mutex - ColumnFamilyHandleImpl(ColumnFamilyData* cfd, DBImpl* db, port::Mutex* mutex); + ColumnFamilyHandleImpl( + ColumnFamilyData* cfd, DBImpl* db, InstrumentedMutex* mutex); // destroy without mutex virtual ~ColumnFamilyHandleImpl(); virtual ColumnFamilyData* cfd() const { return cfd_; } + virtual const Comparator* user_comparator() const; virtual uint32_t GetID() const; + virtual const std::string& GetName() const override; private: ColumnFamilyData* cfd_; DBImpl* db_; - port::Mutex* mutex_; + InstrumentedMutex* mutex_; }; // Does not ref-count ColumnFamilyData @@ -66,7 +75,7 @@ class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl { ColumnFamilyHandleInternal() : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr) {} - void SetCFD(ColumnFamilyData* cfd) { internal_cfd_ = cfd; } + void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; } virtual ColumnFamilyData* cfd() const override { return internal_cfd_; } private: @@ -78,6 +87,7 @@ struct SuperVersion { MemTable* mem; MemTableListVersion* imm; Version* current; + MutableCFOptions 
mutable_cf_options; std::atomic refs; // We need to_delete because during Cleanup(), imm->Unref() returns // all memtables that we need to free through this vector. We then @@ -85,7 +95,7 @@ struct SuperVersion { autovector to_delete; // Version number of the current SuperVersion uint64_t version_number; - port::Mutex* db_mutex; + InstrumentedMutex* db_mutex; // should be called outside the mutex SuperVersion() = default; @@ -117,8 +127,7 @@ extern ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp, class ColumnFamilySet; -// This class keeps all the data that a column family needs. It's mosly dumb and -// used just to provide access to metadata. +// This class keeps all the data that a column family needs. // Most methods require DB mutex held, unless otherwise noted class ColumnFamilyData { public: @@ -129,17 +138,24 @@ class ColumnFamilyData { // thread-safe const std::string& GetName() const { return name_; } - void Ref() { ++refs_; } + // Ref() can only be called whily holding a DB mutex or during a + // single-threaded write. + void Ref() { refs_.fetch_add(1, std::memory_order_relaxed); } // will just decrease reference count to 0, but will not delete it. returns // true if the ref count was decreased to zero. 
in that case, it can be - // deleted by the caller immediatelly, or later, by calling + // deleted by the caller immediately, or later, by calling // FreeDeadColumnFamilies() + // Unref() can only be called while holding a DB mutex bool Unref() { - assert(refs_ > 0); - return --refs_ == 0; + int old_refs = refs_.fetch_sub(1, std::memory_order_relaxed); + assert(old_refs > 0); + return old_refs == 1; } - // This can only be called from single-threaded VersionSet::LogAndApply() + // SetDropped() can only be called under following conditions: + // 1) Holding a DB mutex, + // 2) from single-threaded write thread, AND + // 3) from single-threaded VersionSet::LogAndApply() // After dropping column family no other operation on that column family // will be executed. All the files and memory will be, however, kept around // until client drops the column family handle. That way, client can still @@ -147,27 +163,42 @@ class ColumnFamilyData { // Column family can be dropped and still alive. In that state: // *) Column family is not included in the iteration. // *) Compaction and flush is not executed on the dropped column family. - // *) Client can continue writing and reading from column family. However, all - // writes stay in the current memtable. + // *) Client can continue reading from column family. 
Writes will fail unless + // WriteOptions::ignore_missing_column_families is true // When the dropped column family is unreferenced, then we: // *) delete all memory associated with that column family // *) delete all the files associated with that column family - void SetDropped() { - // can't drop default CF - assert(id_ != 0); - dropped_ = true; - } + void SetDropped(); bool IsDropped() const { return dropped_; } // thread-safe - int NumberLevels() const { return options_.num_levels; } + int NumberLevels() const { return ioptions_.num_levels; } void SetLogNumber(uint64_t log_number) { log_number_ = log_number; } uint64_t GetLogNumber() const { return log_number_; } - // thread-safe + // !!! To be deprecated! Please don't not use this function anymore! const Options* options() const { return &options_; } + + // thread-safe const EnvOptions* soptions() const; + const ImmutableCFOptions* ioptions() const { return &ioptions_; } + // REQUIRES: DB mutex held + // This returns the MutableCFOptions used by current SuperVersion + // You shoul use this API to reference MutableCFOptions most of the time. + const MutableCFOptions* GetCurrentMutableCFOptions() const { + return &(super_version_->mutable_cf_options); + } + // REQUIRES: DB mutex held + // This returns the latest MutableCFOptions, which may be not in effect yet. 
+ const MutableCFOptions* GetLatestMutableCFOptions() const { + return &mutable_cf_options_; + } +#ifndef ROCKSDB_LITE + // REQUIRES: DB mutex held + Status SetOptions( + const std::unordered_map& options_map); +#endif // ROCKSDB_LITE InternalStats* internal_stats() { return internal_stats_.get(); } @@ -175,18 +206,25 @@ class ColumnFamilyData { MemTable* mem() { return mem_; } Version* current() { return current_; } Version* dummy_versions() { return dummy_versions_; } - void SetMemtable(MemTable* new_mem) { mem_ = new_mem; } void SetCurrent(Version* current); - void CreateNewMemtable(); + MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options); + void SetMemtable(MemTable* new_mem) { mem_ = new_mem; } + void CreateNewMemtable(const MutableCFOptions& mutable_cf_options); TableCache* table_cache() const { return table_cache_.get(); } // See documentation in compaction_picker.h - Compaction* PickCompaction(LogBuffer* log_buffer); - Compaction* CompactRange(int input_level, int output_level, - uint32_t output_path_id, const InternalKey* begin, - const InternalKey* end, - InternalKey** compaction_end); + // REQUIRES: DB mutex held + bool NeedsCompaction() const; + // REQUIRES: DB mutex held + Compaction* PickCompaction(const MutableCFOptions& mutable_options, + LogBuffer* log_buffer); + // REQUIRES: DB mutex held + Compaction* CompactRange( + const MutableCFOptions& mutable_cf_options, + int input_level, int output_level, uint32_t output_path_id, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end); CompactionPicker* compaction_picker() { return compaction_picker_.get(); } // thread-safe @@ -201,11 +239,11 @@ class ColumnFamilyData { SuperVersion* GetSuperVersion() { return super_version_; } // thread-safe // Return a already referenced SuperVersion to be used safely. 
- SuperVersion* GetReferencedSuperVersion(port::Mutex* db_mutex); + SuperVersion* GetReferencedSuperVersion(InstrumentedMutex* db_mutex); // thread-safe // Get SuperVersion stored in thread local storage. If it does not exist, // get a reference from a current SuperVersion. - SuperVersion* GetThreadLocalSuperVersion(port::Mutex* db_mutex); + SuperVersion* GetThreadLocalSuperVersion(InstrumentedMutex* db_mutex); // Try to return SuperVersion back to thread local storage. Retrun true on // success and false on failure. It fails when the thread local storage // contains anything other than SuperVersion::kSVInUse flag. @@ -218,40 +256,35 @@ class ColumnFamilyData { // if its reference count is zero and needs deletion or nullptr if not // As argument takes a pointer to allocated SuperVersion to enable // the clients to allocate SuperVersion outside of mutex. + // IMPORTANT: Only call this from DBImpl::InstallSuperVersion() SuperVersion* InstallSuperVersion(SuperVersion* new_superversion, - port::Mutex* db_mutex); + InstrumentedMutex* db_mutex, + const MutableCFOptions& mutable_cf_options); + SuperVersion* InstallSuperVersion(SuperVersion* new_superversion, + InstrumentedMutex* db_mutex); void ResetThreadLocalSuperVersions(); - // A Flag indicating whether write needs to slowdown because of there are - // too many number of level0 files. 
- bool NeedSlowdownForNumLevel0Files() const { - return need_slowdown_for_num_level0_files_; - } - - bool NeedWaitForNumLevel0Files() const { - return need_wait_for_num_level0_files_; - } - - bool NeedWaitForNumMemtables() const { - return need_wait_for_num_memtables_; - } + void NotifyOnCompactionCompleted(DB* db, Compaction* c, const Status& status); - bool ExceedsSoftRateLimit() const { - return exceeds_soft_rate_limit_; - } + void NotifyOnFlushCompleted( + DB* db, const std::string& file_path, + bool triggered_flush_slowdown, + bool triggered_flush_stop); - bool ExceedsHardRateLimit() const { - return exceeds_hard_rate_limit_; - } + // Protected by DB mutex + void set_pending_flush(bool value) { pending_flush_ = value; } + void set_pending_compaction(bool value) { pending_compaction_ = value; } + bool pending_flush() { return pending_flush_; } + bool pending_compaction() { return pending_compaction_; } private: friend class ColumnFamilySet; ColumnFamilyData(uint32_t id, const std::string& name, Version* dummy_versions, Cache* table_cache, + WriteBuffer* write_buffer, const ColumnFamilyOptions& options, - const DBOptions* db_options, - const EnvOptions& storage_options, + const DBOptions* db_options, const EnvOptions& env_options, ColumnFamilySet* column_family_set); // Recalculate some small conditions, which are changed only during @@ -259,25 +292,29 @@ class ColumnFamilyData { // recalculation of compaction score. These values are used in // DBImpl::MakeRoomForWrite function to decide, if it need to make // a write stall - void RecalculateWriteStallConditions(); - void RecalculateWriteStallRateLimitsConditions(); + void RecalculateWriteStallConditions( + const MutableCFOptions& mutable_cf_options); uint32_t id_; const std::string name_; Version* dummy_versions_; // Head of circular doubly-linked list of versions. 
Version* current_; // == dummy_versions->prev_ - int refs_; // outstanding references to ColumnFamilyData + std::atomic refs_; // outstanding references to ColumnFamilyData bool dropped_; // true if client dropped it const InternalKeyComparator internal_comparator_; - Options const options_; + const Options options_; + const ImmutableCFOptions ioptions_; + MutableCFOptions mutable_cf_options_; std::unique_ptr table_cache_; std::unique_ptr internal_stats_; + WriteBuffer* write_buffer_; + MemTable* mem_; MemTableList imm_; SuperVersion* super_version_; @@ -301,46 +338,38 @@ class ColumnFamilyData { // recovered from uint64_t log_number_; - // A flag indicating whether we should delay writes because - // we have too many level 0 files - bool need_slowdown_for_num_level0_files_; - - // These 4 variables are updated only after compaction, - // adding new memtable, flushing memtables to files - // and/or add recalculation of compaction score. - // That's why theirs values are cached in ColumnFamilyData. - // Recalculation is made by RecalculateWriteStallConditions and - // RecalculateWriteStallRateLimitsConditions function. They are used - // in DBImpl::MakeRoomForWrite function to decide, if it need - // to sleep during write operation - bool need_wait_for_num_memtables_; - - bool need_wait_for_num_level0_files_; - - bool exceeds_hard_rate_limit_; - - bool exceeds_soft_rate_limit_; - // An object that keeps all the compaction stats // and picks the next compaction std::unique_ptr compaction_picker_; ColumnFamilySet* column_family_set_; + + std::unique_ptr write_controller_token_; + + // If true --> this ColumnFamily is currently present in DBImpl::flush_queue_ + bool pending_flush_; + + // If true --> this ColumnFamily is currently present in + // DBImpl::compaction_queue_ + bool pending_compaction_; }; // ColumnFamilySet has interesting thread-safety requirements -// * CreateColumnFamily() or RemoveColumnFamily() -- need to protect by DB -// mutex. 
Inside, column_family_data_ and column_families_ will be protected -// by Lock() and Unlock(). CreateColumnFamily() should ONLY be called from -// VersionSet::LogAndApply() in the normal runtime. It is also called -// during Recovery and in DumpManifest(). RemoveColumnFamily() is called -// from ColumnFamilyData destructor +// * CreateColumnFamily() or RemoveColumnFamily() -- need to be protected by DB +// mutex AND executed in the write thread. +// CreateColumnFamily() should ONLY be called from VersionSet::LogAndApply() AND +// single-threaded write thread. It is also called during Recovery and in +// DumpManifest(). +// RemoveColumnFamily() is only called from SetDropped(). DB mutex needs to be +// held and it needs to be executed from the write thread. SetDropped() also +// guarantees that it will be called only from single-threaded LogAndApply(), +// but this condition is not that important. // * Iteration -- hold DB mutex, but you can release it in the body of // iteration. If you release DB mutex in body, reference the column // family before the mutex and unreference after you unlock, since the column // family might get dropped when the DB mutex is released // * GetDefault() -- thread safe -// * GetColumnFamily() -- either inside of DB mutex or call Lock() <-> Unlock() +// * GetColumnFamily() -- either inside of DB mutex or from a write thread // * GetNextColumnFamilyID(), GetMaxColumnFamily(), UpdateMaxColumnFamily(), // NumberOfColumnFamilies -- inside of DB mutex class ColumnFamilySet { @@ -354,7 +383,8 @@ class ColumnFamilySet { // dummy is never dead or dropped, so this will never be infinite do { current_ = current_->next_; - } while (current_->refs_ == 0 || current_->IsDropped()); + } while (current_->refs_.load(std::memory_order_relaxed) == 0 || + current_->IsDropped()); return *this; } bool operator!=(const iterator& other) { @@ -367,7 +397,8 @@ class ColumnFamilySet { }; ColumnFamilySet(const std::string& dbname, const DBOptions* db_options, - 
const EnvOptions& storage_options, Cache* table_cache); + const EnvOptions& env_options, Cache* table_cache, + WriteBuffer* write_buffer, WriteController* write_controller); ~ColumnFamilySet(); ColumnFamilyData* GetDefault() const; @@ -390,9 +421,6 @@ class ColumnFamilySet { iterator begin() { return iterator(dummy_cfd_->next_); } iterator end() { return iterator(dummy_cfd_); } - void Lock(); - void Unlock(); - // REQUIRES: DB mutex held // Don't call while iterating over ColumnFamilySet void FreeDeadColumnFamilies(); @@ -404,9 +432,12 @@ class ColumnFamilySet { void RemoveColumnFamily(ColumnFamilyData* cfd); // column_families_ and column_family_data_ need to be protected: - // * when mutating: 1. DB mutex locked first, 2. spinlock locked second - // * when reading, either: 1. lock DB mutex, or 2. lock spinlock - // (if both, respect the ordering to avoid deadlock!) + // * when mutating both conditions have to be satisfied: + // 1. DB mutex locked + // 2. thread currently in single-threaded write thread + // * when reading, at least one condition needs to be satisfied: + // 1. DB mutex locked + // 2. 
accessed from a single-threaded write thread std::unordered_map column_families_; std::unordered_map column_family_data_; @@ -420,41 +451,52 @@ class ColumnFamilySet { const std::string db_name_; const DBOptions* const db_options_; - const EnvOptions storage_options_; + const EnvOptions env_options_; Cache* table_cache_; - std::atomic_flag spin_lock_; + WriteBuffer* write_buffer_; + WriteController* write_controller_; }; // We use ColumnFamilyMemTablesImpl to provide WriteBatch a way to access // memtables of different column families (specified by ID in the write batch) class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { public: - explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set) - : column_family_set_(column_family_set), current_(nullptr) {} + explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set, + FlushScheduler* flush_scheduler) + : column_family_set_(column_family_set), + current_(nullptr), + flush_scheduler_(flush_scheduler) {} // sets current_ to ColumnFamilyData with column_family_id // returns false if column family doesn't exist + // REQUIRES: under a DB mutex OR from a write thread bool Seek(uint32_t column_family_id) override; // Returns log number of the selected column family + // REQUIRES: under a DB mutex OR from a write thread uint64_t GetLogNumber() const override; // REQUIRES: Seek() called first + // REQUIRES: under a DB mutex OR from a write thread virtual MemTable* GetMemTable() const override; - // Returns options for selected column family - // REQUIRES: Seek() called first - virtual const Options* GetOptions() const override; - // Returns column family handle for the selected column family + // REQUIRES: under a DB mutex OR from a write thread virtual ColumnFamilyHandle* GetColumnFamilyHandle() override; + // REQUIRES: under a DB mutex OR from a write thread + virtual void CheckMemtableFull() override; + private: ColumnFamilySet* column_family_set_; ColumnFamilyData* current_; + 
FlushScheduler* flush_scheduler_; ColumnFamilyHandleInternal handle_; }; extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family); +extern const Comparator* GetColumnFamilyUserComparator( + ColumnFamilyHandle* column_family); + } // namespace rocksdb diff --git a/db/column_family_test.cc b/db/column_family_test.cc index ac3435593..209f7b528 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -133,7 +133,7 @@ class ColumnFamilyTest { void CreateColumnFamilies( const std::vector& cfs, const std::vector options = {}) { - int cfi = handles_.size(); + int cfi = static_cast(handles_.size()); handles_.resize(cfi + cfs.size()); names_.resize(cfi + cfs.size()); for (size_t i = 0; i < cfs.size(); ++i) { @@ -218,7 +218,7 @@ class ColumnFamilyTest { int NumTableFilesAtLevel(int level, int cf) { return GetProperty(cf, - "rocksdb.num-files-at-level" + std::to_string(level)); + "rocksdb.num-files-at-level" + ToString(level)); } // Return spread of files per level @@ -231,7 +231,7 @@ class ColumnFamilyTest { snprintf(buf, sizeof(buf), "%s%d", (level ? 
"," : ""), f); result += buf; if (f > 0) { - last_non_zero_offset = result.size(); + last_non_zero_offset = static_cast(result.size()); } } result.resize(last_non_zero_offset); @@ -287,8 +287,8 @@ class ColumnFamilyTest { assert(num_per_cf.size() == handles_.size()); for (size_t i = 0; i < num_per_cf.size(); ++i) { - ASSERT_EQ(num_per_cf[i], - GetProperty(i, "rocksdb.num-immutable-mem-table")); + ASSERT_EQ(num_per_cf[i], GetProperty(static_cast(i), + "rocksdb.num-immutable-mem-table")); } } @@ -387,7 +387,7 @@ TEST(ColumnFamilyTest, DropTest) { Open({"default"}); CreateColumnFamiliesAndReopen({"pikachu"}); for (int i = 0; i < 100; ++i) { - ASSERT_OK(Put(1, std::to_string(i), "bar" + std::to_string(i))); + ASSERT_OK(Put(1, ToString(i), "bar" + ToString(i))); } ASSERT_OK(Flush(1)); @@ -408,9 +408,15 @@ TEST(ColumnFamilyTest, WriteBatchFailure) { Open(); CreateColumnFamiliesAndReopen({"one", "two"}); WriteBatch batch; + batch.Put(handles_[0], Slice("existing"), Slice("column-family")); batch.Put(handles_[1], Slice("non-existing"), Slice("column-family")); ASSERT_OK(db_->Write(WriteOptions(), &batch)); DropColumnFamilies({1}); + WriteOptions woptions_ignore_missing_cf; + woptions_ignore_missing_cf.ignore_missing_column_families = true; + batch.Put(handles_[0], Slice("still here"), Slice("column-family")); + ASSERT_OK(db_->Write(woptions_ignore_missing_cf, &batch)); + ASSERT_EQ("column-family", Get(0, "still here")); Status s = db_->Write(WriteOptions(), &batch); ASSERT_TRUE(s.IsInvalidArgument()); Close(); @@ -523,8 +529,28 @@ TEST(ColumnFamilyTest, FlushTest) { ASSERT_OK(Put(1, "mirko", "v3")); ASSERT_OK(Put(0, "foo", "v2")); ASSERT_OK(Put(2, "fodor", "v5")); - for (int i = 0; i < 3; ++i) { - Flush(i); + + for (int j = 0; j < 2; j++) { + ReadOptions ro; + std::vector iterators; + // Hold super version. 
+ if (j == 0) { + ASSERT_OK(db_->NewIterators(ro, handles_, &iterators)); + } + + for (int i = 0; i < 3; ++i) { + uint64_t max_total_in_memory_state = + dbfull()->TEST_max_total_in_memory_state(); + Flush(i); + ASSERT_EQ(dbfull()->TEST_max_total_in_memory_state(), + max_total_in_memory_state); + } + ASSERT_OK(Put(1, "foofoo", "bar")); + ASSERT_OK(Put(0, "foofoo", "bar")); + + for (auto* it : iterators) { + delete it; + } } Reopen(); @@ -705,6 +731,27 @@ TEST(ColumnFamilyTest, DifferentWriteBufferSizes) { Close(); } +TEST(ColumnFamilyTest, MemtableNotSupportSnapshot) { + Open(); + auto* s1 = dbfull()->GetSnapshot(); + ASSERT_TRUE(s1 != nullptr); + dbfull()->ReleaseSnapshot(s1); + + // Add a column family that doesn't support snapshot + ColumnFamilyOptions first; + first.memtable_factory.reset(NewHashCuckooRepFactory(1024 * 1024)); + CreateColumnFamilies({"first"}, {first}); + auto* s2 = dbfull()->GetSnapshot(); + ASSERT_TRUE(s2 == nullptr); + + // Add a column family that supports snapshot. Snapshot stays not supported. 
+ ColumnFamilyOptions second; + CreateColumnFamilies({"second"}, {second}); + auto* s3 = dbfull()->GetSnapshot(); + ASSERT_TRUE(s3 == nullptr); + Close(); +} + TEST(ColumnFamilyTest, DifferentMergeOperators) { Open(); CreateColumnFamilies({"first", "second"}); @@ -768,14 +815,14 @@ TEST(ColumnFamilyTest, DifferentCompactionStyles) { for (int i = 0; i < one.level0_file_num_compaction_trigger - 1; ++i) { PutRandomData(1, 11, 10000); WaitForFlush(1); - ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(1)); + ASSERT_EQ(ToString(i + 1), FilesPerLevel(1)); } // SETUP column family "two" -- level style with 4 levels for (int i = 0; i < two.level0_file_num_compaction_trigger - 1; ++i) { PutRandomData(2, 15, 10000); WaitForFlush(2); - ASSERT_EQ(std::to_string(i + 1), FilesPerLevel(2)); + ASSERT_EQ(ToString(i + 1), FilesPerLevel(2)); } // TRIGGER compaction "one" @@ -910,11 +957,11 @@ TEST(ColumnFamilyTest, DontRollEmptyLogs) { CreateColumnFamiliesAndReopen({"one", "two", "three", "four"}); for (size_t i = 0; i < handles_.size(); ++i) { - PutRandomData(i, 10, 100); + PutRandomData(static_cast(i), 10, 100); } int num_writable_file_start = env_->GetNumberOfNewWritableFileCalls(); // this will trigger the flushes - for (size_t i = 0; i <= 4; ++i) { + for (int i = 0; i <= 4; ++i) { ASSERT_OK(Flush(i)); } diff --git a/db/compaction.cc b/db/compaction.cc index 0bffa0162..56be34ef3 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -9,7 +9,10 @@ #include "db/compaction.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include @@ -26,7 +29,17 @@ uint64_t TotalFileSize(const std::vector& files) { return sum; } -Compaction::Compaction(Version* input_version, int start_level, int out_level, +void Compaction::SetInputVersion(Version* _input_version) { + input_version_ = _input_version; + cfd_ = input_version_->cfd(); + + cfd_->Ref(); + input_version_->Ref(); + edit_ = new VersionEdit(); + edit_->SetColumnFamily(cfd_->GetID()); +} + 
+Compaction::Compaction(int number_levels, int start_level, int out_level, uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes, uint32_t output_path_id, @@ -36,9 +49,10 @@ Compaction::Compaction(Version* input_version, int start_level, int out_level, output_level_(out_level), max_output_file_size_(target_file_size), max_grandparent_overlap_bytes_(max_grandparent_overlap_bytes), - input_version_(input_version), - number_levels_(input_version_->NumberLevels()), - cfd_(input_version_->cfd_), + input_version_(nullptr), + edit_(nullptr), + number_levels_(number_levels), + cfd_(nullptr), output_path_id_(output_path_id), output_compression_(output_compression), seek_compaction_(seek_compaction), @@ -53,11 +67,6 @@ Compaction::Compaction(Version* input_version, int start_level, int out_level, is_full_compaction_(false), is_manual_compaction_(false), level_ptrs_(std::vector(number_levels_)) { - - cfd_->Ref(); - input_version_->Ref(); - edit_ = new VersionEdit(); - edit_->SetColumnFamily(cfd_->GetID()); for (int i = 0; i < number_levels_; i++) { level_ptrs_[i] = 0; } @@ -69,6 +78,38 @@ Compaction::Compaction(Version* input_version, int start_level, int out_level, } } +Compaction::Compaction(VersionStorageInfo* vstorage, + const autovector& _inputs, + int _start_level, int _output_level, + uint64_t _max_grandparent_overlap_bytes, + const CompactionOptions& _options, + bool _deletion_compaction) + : start_level_(_start_level), + output_level_(_output_level), + max_output_file_size_(_options.output_file_size_limit), + max_grandparent_overlap_bytes_(_max_grandparent_overlap_bytes), + input_version_(nullptr), + number_levels_(vstorage->num_levels()), + cfd_(nullptr), + output_compression_(_options.compression), + seek_compaction_(false), + deletion_compaction_(_deletion_compaction), + inputs_(_inputs), + grandparent_index_(0), + seen_key_(false), + overlapped_bytes_(0), + base_index_(-1), + parent_index_(-1), + score_(0), + bottommost_level_(false), + 
is_full_compaction_(false), + is_manual_compaction_(false), + level_ptrs_(std::vector(number_levels_)) { + for (int i = 0; i < number_levels_; i++) { + level_ptrs_[i] = 0; + } +} + Compaction::~Compaction() { delete edit_; if (input_version_ != nullptr) { @@ -83,8 +124,9 @@ Compaction::~Compaction() { void Compaction::GenerateFileLevels() { input_levels_.resize(num_input_levels()); - for (int which = 0; which < num_input_levels(); which++) { - DoGenerateFileLevel(&input_levels_[which], inputs_[which].files, &arena_); + for (size_t which = 0; which < num_input_levels(); which++) { + DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files, + &arena_); } } @@ -98,26 +140,29 @@ bool Compaction::IsTrivialMove() const { num_input_levels() == 2 && num_input_files(0) == 1 && num_input_files(1) == 0 && + input(0, 0)->fd.GetPathId() == GetOutputPathId() && TotalFileSize(grandparents_) <= max_grandparent_overlap_bytes_); } -void Compaction::AddInputDeletions(VersionEdit* edit) { - for (int which = 0; which < num_input_levels(); which++) { +void Compaction::AddInputDeletions(VersionEdit* out_edit) { + for (size_t which = 0; which < num_input_levels(); which++) { for (size_t i = 0; i < inputs_[which].size(); i++) { - edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber()); + out_edit->DeleteFile(level(which), inputs_[which][i]->fd.GetNumber()); } } } bool Compaction::KeyNotExistsBeyondOutputLevel(const Slice& user_key) { - assert(cfd_->options()->compaction_style != kCompactionStyleFIFO); - if (cfd_->options()->compaction_style == kCompactionStyleUniversal) { + assert(input_version_ != nullptr); + assert(cfd_->ioptions()->compaction_style != kCompactionStyleFIFO); + if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) { return bottommost_level_; } // Maybe use binary search to find right entry instead of linear search? 
const Comparator* user_cmp = cfd_->user_comparator(); for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) { - const std::vector& files = input_version_->files_[lvl]; + const std::vector& files = + input_version_->storage_info()->LevelFiles(lvl); for (; level_ptrs_[lvl] < files.size(); ) { FileMetaData* f = files[level_ptrs_[lvl]]; if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { @@ -163,7 +208,7 @@ bool Compaction::ShouldStopBefore(const Slice& internal_key) { // Mark (or clear) each file that is being compacted void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) { - for (int i = 0; i < num_input_levels(); i++) { + for (size_t i = 0; i < num_input_levels(); i++) { for (unsigned int j = 0; j < inputs_[i].size(); j++) { assert(mark_as_compacted ? !inputs_[i][j]->being_compacted : inputs_[i][j]->being_compacted); @@ -173,9 +218,9 @@ void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) { } // Is this compaction producing files at the bottommost level? -void Compaction::SetupBottomMostLevel(bool is_manual) { - assert(cfd_->options()->compaction_style != kCompactionStyleFIFO); - if (cfd_->options()->compaction_style == kCompactionStyleUniversal) { +void Compaction::SetupBottomMostLevel(VersionStorageInfo* vstorage, + bool is_manual, bool level0_only) { + if (level0_only) { // If universal compaction style is used and manual // compaction is occuring, then we are guaranteed that // all files will be picked in a single compaction @@ -190,32 +235,20 @@ void Compaction::SetupBottomMostLevel(bool is_manual) { bottommost_level_ = true; // checks whether there are files living beyond the output_level. 
for (int i = output_level_ + 1; i < number_levels_; i++) { - if (input_version_->NumLevelFiles(i) > 0) { + if (vstorage->NumLevelFiles(i) > 0) { bottommost_level_ = false; break; } } } -void Compaction::ReleaseInputs() { - if (input_version_ != nullptr) { - input_version_->Unref(); - input_version_ = nullptr; - } - if (cfd_ != nullptr) { - if (cfd_->Unref()) { - delete cfd_; - } - cfd_ = nullptr; - } -} - void Compaction::ReleaseCompactionFiles(Status status) { cfd_->compaction_picker()->ReleaseCompactionFiles(this, status); } void Compaction::ResetNextCompactionIndex() { - input_version_->ResetNextCompactionIndex(start_level_); + assert(input_version_ != nullptr); + input_version_->storage_info()->ResetNextCompactionIndex(start_level_); } namespace { @@ -248,14 +281,15 @@ void Compaction::Summary(char* output, int len) { return; } - for (int level = 0; level < num_input_levels(); ++level) { - if (level > 0) { + for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) { + if (level_iter > 0) { write += snprintf(output + write, len - write, "], ["); if (write < 0 || write >= len) { return; } } - write += InputSummary(inputs_[level].files, output + write, len - write); + write += + InputSummary(inputs_[level_iter].files, output + write, len - write); if (write < 0 || write >= len) { return; } @@ -264,15 +298,15 @@ void Compaction::Summary(char* output, int len) { snprintf(output + write, len - write, "]"); } -uint64_t Compaction::OutputFilePreallocationSize() { +uint64_t Compaction::OutputFilePreallocationSize( + const MutableCFOptions& mutable_options) { uint64_t preallocation_size = 0; - if (cfd_->options()->compaction_style == kCompactionStyleLevel) { - preallocation_size = - cfd_->compaction_picker()->MaxFileSizeForLevel(output_level()); + if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { + preallocation_size = mutable_options.MaxFileSizeForLevel(output_level()); } else { - for (int level = 0; level < num_input_levels(); 
++level) { - for (const auto& f : inputs_[level].files) { + for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) { + for (const auto& f : inputs_[level_iter].files) { preallocation_size += f->fd.GetFileSize(); } } @@ -282,4 +316,15 @@ uint64_t Compaction::OutputFilePreallocationSize() { return preallocation_size * 1.1; } +Compaction* Compaction::TEST_NewCompaction( + int num_levels, int start_level, int out_level, uint64_t target_file_size, + uint64_t max_grandparent_overlap_bytes, uint32_t output_path_id, + CompressionType output_compression, bool seek_compaction, + bool deletion_compaction) { + return new Compaction(num_levels, start_level, out_level, target_file_size, + max_grandparent_overlap_bytes, output_path_id, + output_compression, seek_compaction, + deletion_compaction); +} + } // namespace rocksdb diff --git a/db/compaction.h b/db/compaction.h index 6000f636b..37e38532b 100644 --- a/db/compaction.h +++ b/db/compaction.h @@ -10,6 +10,7 @@ #pragma once #include "util/arena.h" #include "util/autovector.h" +#include "util/mutable_cf_options.h" #include "db/version_set.h" namespace rocksdb { @@ -22,15 +23,23 @@ struct CompactionInputFiles { inline bool empty() const { return files.empty(); } inline size_t size() const { return files.size(); } inline void clear() { files.clear(); } - inline FileMetaData* operator[](int i) const { return files[i]; } + inline FileMetaData* operator[](size_t i) const { return files[i]; } }; class Version; class ColumnFamilyData; +class VersionStorageInfo; // A Compaction encapsulates information about a compaction. 
class Compaction { public: + Compaction(VersionStorageInfo* input_version, + const autovector& inputs, + int start_level, int output_level, + uint64_t max_grandparent_overlap_bytes, + const CompactionOptions& options, + bool deletion_compaction); + // No copying allowed Compaction(const Compaction&) = delete; void operator=(const Compaction&) = delete; @@ -39,7 +48,7 @@ class Compaction { // Returns the level associated to the specified compaction input level. // If compaction_input_level is not specified, then input_level is set to 0. - int level(int compaction_input_level = 0) const { + int level(size_t compaction_input_level = 0) const { return inputs_[compaction_input_level].level; } @@ -47,7 +56,7 @@ class Compaction { int output_level() const { return output_level_; } // Returns the number of input levels in this compaction. - int num_input_levels() const { return inputs_.size(); } + size_t num_input_levels() const { return inputs_.size(); } // Return the object that holds the edits to the descriptor done // by this compaction. @@ -57,7 +66,7 @@ class Compaction { // compaction input level. // The function will return 0 if when "compaction_input_level" < 0 // or "compaction_input_level" >= "num_input_levels()". - int num_input_files(size_t compaction_input_level) const { + size_t num_input_files(size_t compaction_input_level) const { if (compaction_input_level < inputs_.size()) { return inputs_[compaction_input_level].size(); } @@ -74,7 +83,7 @@ class Compaction { // specified compaction input level. // REQUIREMENT: "compaction_input_level" must be >= 0 and // < "input_levels()" - FileMetaData* input(size_t compaction_input_level, int i) const { + FileMetaData* input(size_t compaction_input_level, size_t i) const { assert(compaction_input_level < inputs_.size()); return inputs_[compaction_input_level][i]; } @@ -88,8 +97,8 @@ class Compaction { return &inputs_[compaction_input_level].files; } - // Returns the FileLevel of the specified compaction input level. 
- FileLevel* input_levels(int compaction_input_level) { + // Returns the LevelFilesBrief of the specified compaction input level. + LevelFilesBrief* input_levels(size_t compaction_input_level) { return &input_levels_[compaction_input_level]; } @@ -110,7 +119,7 @@ class Compaction { // moving a single input file to the next level (no merging or splitting) bool IsTrivialMove() const; - // If true, then the comaction can be done by simply deleting input files. + // If true, then the compaction can be done by simply deleting input files. bool IsDeletionCompaction() const { return deletion_compaction_; } @@ -126,10 +135,6 @@ class Compaction { // before processing "internal_key". bool ShouldStopBefore(const Slice& internal_key); - // Release the input version for the compaction, once the compaction - // is successful. - void ReleaseInputs(); - // Clear all files to indicate that they are not being compacted // Delete this compaction from the list of running compactions. void ReleaseCompactionFiles(Status status); @@ -151,10 +156,38 @@ class Compaction { // Was this compaction triggered manually by the client? bool IsManualCompaction() { return is_manual_compaction_; } + void SetOutputPathId(uint32_t path_id) { output_path_id_ = path_id; } + + // Return the MutableCFOptions that should be used throughout the compaction + // procedure + const MutableCFOptions* mutable_cf_options() { return &mutable_cf_options_; } + // Returns the size in bytes that the output file should be preallocated to. // In level compaction, that is max_file_size_. In universal compaction, that // is the sum of all input file sizes. 
- uint64_t OutputFilePreallocationSize(); + uint64_t OutputFilePreallocationSize(const MutableCFOptions& mutable_options); + + void SetInputVersion(Version* input_version); + + // mark (or clear) all files that are being compacted + void MarkFilesBeingCompacted(bool mark_as_compacted); + + // Initialize whether the compaction is producing files at the + // bottommost level. + // + // @see BottomMostLevel() + void SetupBottomMostLevel(VersionStorageInfo* vstorage, bool is_manual, + bool level0_only); + + static Compaction* TEST_NewCompaction( + int num_levels, int start_level, int out_level, uint64_t target_file_size, + uint64_t max_grandparent_overlap_bytes, uint32_t output_path_id, + CompressionType output_compression, bool seek_compaction = false, + bool deletion_compaction = false); + + CompactionInputFiles* TEST_GetInputFiles(int l) { + return &inputs_[l]; + } private: friend class CompactionPicker; @@ -162,7 +195,7 @@ class Compaction { friend class FIFOCompactionPicker; friend class LevelCompactionPicker; - Compaction(Version* input_version, int start_level, int out_level, + Compaction(int num_levels, int start_level, int out_level, uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes, uint32_t output_path_id, CompressionType output_compression, bool seek_compaction = false, bool deletion_compaction = false); @@ -171,6 +204,7 @@ class Compaction { const int output_level_; // levels to which output files are stored uint64_t max_output_file_size_; uint64_t max_grandparent_overlap_bytes_; + MutableCFOptions mutable_cf_options_; Version* input_version_; VersionEdit* edit_; int number_levels_; @@ -187,7 +221,7 @@ class Compaction { autovector inputs_; // A copy of inputs_, organized more closely in memory - autovector input_levels_; + autovector input_levels_; // State used to check for number of of overlapping grandparent files // (grandparent == "output_level_ + 1") @@ -217,15 +251,6 @@ class Compaction { // records indices for all levels beyond 
"output_level_". std::vector level_ptrs_; - // mark (or clear) all files that are being compacted - void MarkFilesBeingCompacted(bool mark_as_compacted); - - // Initialize whether the compaction is producing files at the - // bottommost level. - // - // @see BottomMostLevel() - void SetupBottomMostLevel(bool is_manual); - // In case of compaction error, reset the nextIndex that is used // to pick up the next file to be compacted from files_by_size_ void ResetNextCompactionIndex(); diff --git a/db/compaction_job.cc b/db/compaction_job.cc new file mode 100644 index 000000000..775dcebec --- /dev/null +++ b/db/compaction_job.cc @@ -0,0 +1,1095 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/compaction_job.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include +#include +#include +#include +#include + +#include "db/builder.h" +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/merge_helper.h" +#include "db/memtable_list.h" +#include "db/merge_context.h" +#include "db/version_set.h" +#include "port/port.h" +#include "port/likely.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "table/block.h" +#include "table/block_based_table_factory.h" +#include "table/merger.h" +#include "table/table_builder.h" +#include "table/two_level_iterator.h" +#include "util/coding.h" +#include "util/logging.h" +#include "util/log_buffer.h" +#include "util/mutexlock.h" +#include "util/perf_context_imp.h" +#include "util/iostats_context_imp.h" +#include "util/stop_watch.h" +#include "util/sync_point.h" +#include "util/thread_status_util.h" + +namespace rocksdb { + +struct CompactionJob::CompactionState { + Compaction* const compaction; + + // If there were two snapshots with seq numbers s1 and + // s2 and s1 < s2, and if we find two instances of a key k1 then lies + // entirely within s1 and s2, then the earlier version of k1 can be safely + // deleted because that version is not visible in any snapshot. 
+ std::vector existing_snapshots; + + // Files produced by compaction + struct Output { + uint64_t number; + uint32_t path_id; + uint64_t file_size; + InternalKey smallest, largest; + SequenceNumber smallest_seqno, largest_seqno; + }; + std::vector outputs; + + // State kept for output being generated + std::unique_ptr outfile; + std::unique_ptr builder; + + uint64_t total_bytes; + + Output* current_output() { return &outputs[outputs.size() - 1]; } + + explicit CompactionState(Compaction* c) + : compaction(c), + total_bytes(0), + num_input_records(0), + num_output_records(0) {} + + // Create a client visible context of this compaction + CompactionFilter::Context GetFilterContextV1() { + CompactionFilter::Context context; + context.is_full_compaction = compaction->IsFullCompaction(); + context.is_manual_compaction = compaction->IsManualCompaction(); + return context; + } + + // Create a client visible context of this compaction + CompactionFilterContext GetFilterContext() { + CompactionFilterContext context; + context.is_full_compaction = compaction->IsFullCompaction(); + context.is_manual_compaction = compaction->IsManualCompaction(); + return context; + } + + std::vector key_str_buf_; + std::vector existing_value_str_buf_; + // new_value_buf_ will only be appended if a value changes + std::vector new_value_buf_; + // if values_changed_buf_[i] is true + // new_value_buf_ will add a new entry with the changed value + std::vector value_changed_buf_; + // to_delete_buf_[i] is true iff key_buf_[i] is deleted + std::vector to_delete_buf_; + + std::vector other_key_str_buf_; + std::vector other_value_str_buf_; + + std::vector combined_key_buf_; + std::vector combined_value_buf_; + + std::string cur_prefix_; + + uint64_t num_input_records; + uint64_t num_output_records; + + // Buffers the kv-pair that will be run through compaction filter V2 + // in the future. 
+ void BufferKeyValueSlices(const Slice& key, const Slice& value) { + key_str_buf_.emplace_back(key.ToString()); + existing_value_str_buf_.emplace_back(value.ToString()); + } + + // Buffers the kv-pair that will not be run through compaction filter V2 + // in the future. + void BufferOtherKeyValueSlices(const Slice& key, const Slice& value) { + other_key_str_buf_.emplace_back(key.ToString()); + other_value_str_buf_.emplace_back(value.ToString()); + } + + // Add a kv-pair to the combined buffer + void AddToCombinedKeyValueSlices(const Slice& key, const Slice& value) { + // The real strings are stored in the batch buffers + combined_key_buf_.emplace_back(key); + combined_value_buf_.emplace_back(value); + } + + // Merging the two buffers + void MergeKeyValueSliceBuffer(const InternalKeyComparator* comparator) { + size_t i = 0; + size_t j = 0; + size_t total_size = key_str_buf_.size() + other_key_str_buf_.size(); + combined_key_buf_.reserve(total_size); + combined_value_buf_.reserve(total_size); + + while (i + j < total_size) { + int comp_res = 0; + if (i < key_str_buf_.size() && j < other_key_str_buf_.size()) { + comp_res = comparator->Compare(key_str_buf_[i], other_key_str_buf_[j]); + } else if (i >= key_str_buf_.size() && j < other_key_str_buf_.size()) { + comp_res = 1; + } else if (j >= other_key_str_buf_.size() && i < key_str_buf_.size()) { + comp_res = -1; + } + if (comp_res > 0) { + AddToCombinedKeyValueSlices(other_key_str_buf_[j], + other_value_str_buf_[j]); + j++; + } else if (comp_res < 0) { + AddToCombinedKeyValueSlices(key_str_buf_[i], + existing_value_str_buf_[i]); + i++; + } + } + } + + void CleanupBatchBuffer() { + to_delete_buf_.clear(); + key_str_buf_.clear(); + existing_value_str_buf_.clear(); + new_value_buf_.clear(); + value_changed_buf_.clear(); + + to_delete_buf_.shrink_to_fit(); + key_str_buf_.shrink_to_fit(); + existing_value_str_buf_.shrink_to_fit(); + new_value_buf_.shrink_to_fit(); + value_changed_buf_.shrink_to_fit(); + + 
other_key_str_buf_.clear(); + other_value_str_buf_.clear(); + other_key_str_buf_.shrink_to_fit(); + other_value_str_buf_.shrink_to_fit(); + } + + void CleanupMergedBuffer() { + combined_key_buf_.clear(); + combined_value_buf_.clear(); + combined_key_buf_.shrink_to_fit(); + combined_value_buf_.shrink_to_fit(); + } +}; + +CompactionJob::CompactionJob( + Compaction* compaction, const DBOptions& db_options, + const MutableCFOptions& mutable_cf_options, const EnvOptions& env_options, + VersionSet* versions, std::atomic* shutting_down, + LogBuffer* log_buffer, Directory* db_directory, Directory* output_directory, + Statistics* stats, SnapshotList* snapshots, bool is_snapshot_supported, + std::shared_ptr table_cache, + std::function yield_callback) + : compact_(new CompactionState(compaction)), + compaction_stats_(1), + db_options_(db_options), + mutable_cf_options_(mutable_cf_options), + env_options_(env_options), + env_(db_options.env), + versions_(versions), + shutting_down_(shutting_down), + log_buffer_(log_buffer), + db_directory_(db_directory), + output_directory_(output_directory), + stats_(stats), + snapshots_(snapshots), + is_snapshot_supported_(is_snapshot_supported), + table_cache_(std::move(table_cache)), + yield_callback_(std::move(yield_callback)) {} + +void CompactionJob::Prepare() { + compact_->CleanupBatchBuffer(); + compact_->CleanupMergedBuffer(); + + // Generate file_levels_ for compaction berfore making Iterator + compact_->compaction->GenerateFileLevels(); + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + assert(cfd != nullptr); + LogToBuffer( + log_buffer_, "[%s] Compacting %d@%d + %d@%d files, score %.2f", + cfd->GetName().c_str(), compact_->compaction->num_input_files(0), + compact_->compaction->level(), compact_->compaction->num_input_files(1), + compact_->compaction->output_level(), compact_->compaction->score()); + char scratch[2345]; + compact_->compaction->Summary(scratch, sizeof(scratch)); + LogToBuffer(log_buffer_, 
"[%s] Compaction start summary: %s\n", + cfd->GetName().c_str(), scratch); + + assert(cfd->current()->storage_info()->NumLevelFiles( + compact_->compaction->level()) > 0); + assert(compact_->builder == nullptr); + assert(!compact_->outfile); + + visible_at_tip_ = 0; + latest_snapshot_ = 0; + // TODO(icanadi) move snapshots_ out of CompactionJob + snapshots_->getAll(compact_->existing_snapshots); + if (compact_->existing_snapshots.size() == 0) { + // optimize for fast path if there are no snapshots + visible_at_tip_ = versions_->LastSequence(); + earliest_snapshot_ = visible_at_tip_; + } else { + latest_snapshot_ = compact_->existing_snapshots.back(); + // Add the current seqno as the 'latest' virtual + // snapshot to the end of this list. + compact_->existing_snapshots.push_back(versions_->LastSequence()); + earliest_snapshot_ = compact_->existing_snapshots[0]; + } + + // Is this compaction producing files at the bottommost level? + bottommost_level_ = compact_->compaction->BottomMostLevel(); +} + +Status CompactionJob::Run() { + log_buffer_->FlushBufferToLog(); + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + ThreadStatusUtil::SetColumnFamily(cfd); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); +#ifndef NDEBUG + ThreadStatusUtil::TEST_OperationDelay(ThreadStatus::OP_COMPACTION); +#endif + + const uint64_t start_micros = env_->NowMicros(); + std::unique_ptr<Iterator> input( + versions_->MakeInputIterator(compact_->compaction)); + input->SeekToFirst(); + + Status status; + ParsedInternalKey ikey; + std::unique_ptr<CompactionFilterV2> compaction_filter_from_factory_v2 = + nullptr; + auto context = compact_->GetFilterContext(); + compaction_filter_from_factory_v2 = + cfd->ioptions()->compaction_filter_factory_v2->CreateCompactionFilterV2( + context); + auto compaction_filter_v2 = compaction_filter_from_factory_v2.get(); + + int64_t imm_micros = 0; // Micros spent doing imm_ compactions + if (!compaction_filter_v2) { + status = 
ProcessKeyValueCompaction(&imm_micros, input.get(), false); + } else { + // temp_backup_input always point to the start of the current buffer + // temp_backup_input = backup_input; + // iterate through input, + // 1) buffer ineligible keys and value keys into 2 separate buffers; + // 2) send value_buffer to compaction filter and alternate the values; + // 3) merge value_buffer with ineligible_value_buffer; + // 4) run the modified "compaction" using the old for loop. + bool prefix_initialized = false; + shared_ptr backup_input( + versions_->MakeInputIterator(compact_->compaction)); + backup_input->SeekToFirst(); + while (backup_input->Valid() && + !shutting_down_->load(std::memory_order_acquire) && + !cfd->IsDropped()) { + // FLUSH preempts compaction + // TODO(icanadi) this currently only checks if flush is necessary on + // compacting column family. we should also check if flush is necessary on + // other column families, too + + imm_micros += yield_callback_(); + + Slice key = backup_input->key(); + Slice value = backup_input->value(); + + if (!ParseInternalKey(key, &ikey)) { + // log error + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "[%s] Failed to parse key: %s", + cfd->GetName().c_str(), key.ToString().c_str()); + continue; + } else { + const SliceTransform* transformer = + cfd->ioptions()->compaction_filter_factory_v2->GetPrefixExtractor(); + const auto key_prefix = transformer->Transform(ikey.user_key); + if (!prefix_initialized) { + compact_->cur_prefix_ = key_prefix.ToString(); + prefix_initialized = true; + } + // If the prefix remains the same, keep buffering + if (key_prefix.compare(Slice(compact_->cur_prefix_)) == 0) { + // Apply the compaction filter V2 to all the kv pairs sharing + // the same prefix + if (ikey.type == kTypeValue && + (visible_at_tip_ || ikey.sequence > latest_snapshot_)) { + // Buffer all keys sharing the same prefix for CompactionFilterV2 + // Iterate through keys to check prefix + compact_->BufferKeyValueSlices(key, 
value); + } else { + // buffer ineligible keys + compact_->BufferOtherKeyValueSlices(key, value); + } + backup_input->Next(); + continue; + // finish changing values for eligible keys + } else { + // Now prefix changes, this batch is done. + // Call compaction filter on the buffered values to change the value + if (compact_->key_str_buf_.size() > 0) { + CallCompactionFilterV2(compaction_filter_v2); + } + compact_->cur_prefix_ = key_prefix.ToString(); + } + } + + // Merge this batch of data (values + ineligible keys) + compact_->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); + + // Done buffering for the current prefix. Spit it out to disk + // Now just iterate through all the kv-pairs + status = ProcessKeyValueCompaction(&imm_micros, input.get(), true); + + if (!status.ok()) { + break; + } + + // After writing the kv-pairs, we can safely remove the reference + // to the string buffer and clean them up + compact_->CleanupBatchBuffer(); + compact_->CleanupMergedBuffer(); + // Buffer the key that triggers the mismatch in prefix + if (ikey.type == kTypeValue && + (visible_at_tip_ || ikey.sequence > latest_snapshot_)) { + compact_->BufferKeyValueSlices(key, value); + } else { + compact_->BufferOtherKeyValueSlices(key, value); + } + backup_input->Next(); + if (!backup_input->Valid()) { + // If this is the single last value, we need to merge it. 
+ if (compact_->key_str_buf_.size() > 0) { + CallCompactionFilterV2(compaction_filter_v2); + } + compact_->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); + + status = ProcessKeyValueCompaction(&imm_micros, input.get(), true); + if (!status.ok()) { + break; + } + + compact_->CleanupBatchBuffer(); + compact_->CleanupMergedBuffer(); + } + } // done processing all prefix batches + // finish the last batch + if (status.ok()) { + if (compact_->key_str_buf_.size() > 0) { + CallCompactionFilterV2(compaction_filter_v2); + } + compact_->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); + status = ProcessKeyValueCompaction(&imm_micros, input.get(), true); + } + } // checking for compaction filter v2 + + if (status.ok() && + (shutting_down_->load(std::memory_order_acquire) || cfd->IsDropped())) { + status = Status::ShutdownInProgress( + "Database shutdown or Column family drop during compaction"); + } + if (status.ok() && compact_->builder != nullptr) { + status = FinishCompactionOutputFile(input.get()); + } + if (status.ok()) { + status = input->status(); + } + input.reset(); + + if (output_directory_ && !db_options_.disableDataSync) { + output_directory_->Fsync(); + } + + compaction_stats_.micros = env_->NowMicros() - start_micros - imm_micros; + compaction_stats_.files_in_leveln = + static_cast(compact_->compaction->num_input_files(0)); + compaction_stats_.files_in_levelnp1 = + static_cast(compact_->compaction->num_input_files(1)); + MeasureTime(stats_, COMPACTION_TIME, compaction_stats_.micros); + + size_t num_output_files = compact_->outputs.size(); + if (compact_->builder != nullptr) { + // An error occurred so ignore the last output. 
+ assert(num_output_files > 0); + --num_output_files; + } + compaction_stats_.files_out_levelnp1 = static_cast(num_output_files); + + for (size_t i = 0; i < compact_->compaction->num_input_files(0); i++) { + compaction_stats_.bytes_readn += + compact_->compaction->input(0, i)->fd.GetFileSize(); + compaction_stats_.num_input_records += + static_cast(compact_->compaction->input(0, i)->num_entries); + } + + for (size_t i = 0; i < compact_->compaction->num_input_files(1); i++) { + compaction_stats_.bytes_readnp1 += + compact_->compaction->input(1, i)->fd.GetFileSize(); + } + + for (size_t i = 0; i < num_output_files; i++) { + compaction_stats_.bytes_written += compact_->outputs[i].file_size; + } + if (compact_->num_input_records > compact_->num_output_records) { + compaction_stats_.num_dropped_records += + compact_->num_input_records - compact_->num_output_records; + compact_->num_input_records = compact_->num_output_records = 0; + } + + RecordCompactionIOStats(); + + LogFlush(db_options_.info_log); + ThreadStatusUtil::ResetThreadStatus(); + return status; +} + +void CompactionJob::Install(Status* status, InstrumentedMutex* db_mutex) { + db_mutex->AssertHeld(); + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + cfd->internal_stats()->AddCompactionStats( + compact_->compaction->output_level(), compaction_stats_); + + if (status->ok()) { + *status = InstallCompactionResults(db_mutex); + } + VersionStorageInfo::LevelSummaryStorage tmp; + const auto& stats = compaction_stats_; + LogToBuffer(log_buffer_, + "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, " + "files in(%d, %d) out(%d) " + "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " + "write-amplify(%.1f) %s, records in: %d, records dropped: %d\n", + cfd->GetName().c_str(), + cfd->current()->storage_info()->LevelSummary(&tmp), + (stats.bytes_readn + stats.bytes_readnp1) / + static_cast(stats.micros), + stats.bytes_written / static_cast(stats.micros), + 
compact_->compaction->output_level(), stats.files_in_leveln, + stats.files_in_levelnp1, stats.files_out_levelnp1, + stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0, + stats.bytes_written / 1048576.0, + (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) / + static_cast(stats.bytes_readn), + stats.bytes_written / static_cast(stats.bytes_readn), + status->ToString().c_str(), stats.num_input_records, + stats.num_dropped_records); + + CleanupCompaction(*status); +} + +Status CompactionJob::ProcessKeyValueCompaction(int64_t* imm_micros, + Iterator* input, + bool is_compaction_v2) { + size_t combined_idx = 0; + Status status; + std::string compaction_filter_value; + ParsedInternalKey ikey; + IterKey current_user_key; + bool has_current_user_key = false; + IterKey delete_key; + SequenceNumber last_sequence_for_key __attribute__((unused)) = + kMaxSequenceNumber; + SequenceNumber visible_in_snapshot = kMaxSequenceNumber; + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + MergeHelper merge(cfd->user_comparator(), cfd->ioptions()->merge_operator, + db_options_.info_log.get(), + cfd->ioptions()->min_partial_merge_operands, + false /* internal key corruption is expected */); + auto compaction_filter = cfd->ioptions()->compaction_filter; + std::unique_ptr compaction_filter_from_factory = nullptr; + if (!compaction_filter) { + auto context = compact_->GetFilterContextV1(); + compaction_filter_from_factory = + cfd->ioptions()->compaction_filter_factory->CreateCompactionFilter( + context); + compaction_filter = compaction_filter_from_factory.get(); + } + + int64_t key_drop_user = 0; + int64_t key_drop_newer_entry = 0; + int64_t key_drop_obsolete = 0; + int64_t loop_cnt = 0; + while (input->Valid() && !shutting_down_->load(std::memory_order_acquire) && + !cfd->IsDropped() && status.ok()) { + compact_->num_input_records++; + if (++loop_cnt > 1000) { + if (key_drop_user > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_USER, 
key_drop_user); + key_drop_user = 0; + } + if (key_drop_newer_entry > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY, + key_drop_newer_entry); + key_drop_newer_entry = 0; + } + if (key_drop_obsolete > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, key_drop_obsolete); + key_drop_obsolete = 0; + } + RecordCompactionIOStats(); + loop_cnt = 0; + } + // FLUSH preempts compaction + // TODO(icanadi) this currently only checks if flush is necessary on + // compacting column family. we should also check if flush is necessary on + // other column families, too + (*imm_micros) += yield_callback_(); + + Slice key; + Slice value; + // If is_compaction_v2 is on, kv-pairs are reset to the prefix batch. + // This prefix batch should contain results after calling + // compaction_filter_v2. + // + // If is_compaction_v2 is off, this function will go through all the + // kv-pairs in input. + if (!is_compaction_v2) { + key = input->key(); + value = input->value(); + } else { + if (combined_idx >= compact_->combined_key_buf_.size()) { + break; + } + assert(combined_idx < compact_->combined_key_buf_.size()); + key = compact_->combined_key_buf_[combined_idx]; + value = compact_->combined_value_buf_[combined_idx]; + + ++combined_idx; + } + + if (compact_->compaction->ShouldStopBefore(key) && + compact_->builder != nullptr) { + status = FinishCompactionOutputFile(input); + if (!status.ok()) { + break; + } + } + + // Handle key/value, add to state, etc. + bool drop = false; + bool current_entry_is_merging = false; + if (!ParseInternalKey(key, &ikey)) { + // Do not hide error keys + // TODO: error key stays in db forever? Figure out the intention/rationale + // v10 error v8 : we cannot hide v8 even though it's pretty obvious. 
+ current_user_key.Clear(); + has_current_user_key = false; + last_sequence_for_key = kMaxSequenceNumber; + visible_in_snapshot = kMaxSequenceNumber; + } else { + if (!has_current_user_key || + cfd->user_comparator()->Compare(ikey.user_key, + current_user_key.GetKey()) != 0) { + // First occurrence of this user key + current_user_key.SetKey(ikey.user_key); + has_current_user_key = true; + last_sequence_for_key = kMaxSequenceNumber; + visible_in_snapshot = kMaxSequenceNumber; + // apply the compaction filter to the first occurrence of the user key + if (compaction_filter && !is_compaction_v2 && ikey.type == kTypeValue && + (visible_at_tip_ || ikey.sequence > latest_snapshot_)) { + // If the user has specified a compaction filter and the sequence + // number is greater than any external snapshot, then invoke the + // filter. + // If the return value of the compaction filter is true, replace + // the entry with a delete marker. + bool value_changed = false; + compaction_filter_value.clear(); + bool to_delete = compaction_filter->Filter( + compact_->compaction->level(), ikey.user_key, value, + &compaction_filter_value, &value_changed); + if (to_delete) { + // make a copy of the original key and convert it to a delete + delete_key.SetInternalKey(ExtractUserKey(key), ikey.sequence, + kTypeDeletion); + // anchor the key again + key = delete_key.GetKey(); + // needed because ikey is backed by key + ParseInternalKey(key, &ikey); + // no value associated with delete + value.clear(); + ++key_drop_user; + } else if (value_changed) { + value = compaction_filter_value; + } + } + } + + // If there are no snapshots, then this kv affect visibility at tip. + // Otherwise, search though all existing snapshots to find + // the earlist snapshot that is affected by this kv. + SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot + SequenceNumber visible = + visible_at_tip_ + ? visible_at_tip_ + : is_snapshot_supported_ + ? 
findEarliestVisibleSnapshot(ikey.sequence, + compact_->existing_snapshots, + &prev_snapshot) + : 0; + + if (visible_in_snapshot == visible) { + // If the earliest snapshot is which this key is visible in + // is the same as the visibily of a previous instance of the + // same key, then this kv is not visible in any snapshot. + // Hidden by an newer entry for same user key + // TODO: why not > ? + assert(last_sequence_for_key >= ikey.sequence); + drop = true; // (A) + ++key_drop_newer_entry; + } else if (ikey.type == kTypeDeletion && + ikey.sequence <= earliest_snapshot_ && + compact_->compaction->KeyNotExistsBeyondOutputLevel( + ikey.user_key)) { + // For this user key: + // (1) there is no data in higher levels + // (2) data in lower levels will have larger sequence numbers + // (3) data in layers that are being compacted here and have + // smaller sequence numbers will be dropped in the next + // few iterations of this loop (by rule (A) above). + // Therefore this deletion marker is obsolete and can be dropped. + drop = true; + ++key_drop_obsolete; + } else if (ikey.type == kTypeMerge) { + if (!merge.HasOperator()) { + LogToBuffer(log_buffer_, "Options::merge_operator is null."); + status = Status::InvalidArgument( + "merge_operator is not properly initialized."); + break; + } + // We know the merge type entry is not hidden, otherwise we would + // have hit (A) + // We encapsulate the merge related state machine in a different + // object to minimize change to the existing flow. Turn out this + // logic could also be nicely re-used for memtable flush purge + // optimization in BuildTable. 
+ int steps = 0; + merge.MergeUntil(input, prev_snapshot, bottommost_level_, + db_options_.statistics.get(), &steps); + // Skip the Merge ops + combined_idx = combined_idx - 1 + steps; + + current_entry_is_merging = true; + if (merge.IsSuccess()) { + // Successfully found Put/Delete/(end-of-key-range) while merging + // Get the merge result + key = merge.key(); + ParseInternalKey(key, &ikey); + value = merge.value(); + } else { + // Did not find a Put/Delete/(end-of-key-range) while merging + // We now have some stack of merge operands to write out. + // NOTE: key,value, and ikey are now referring to old entries. + // These will be correctly set below. + assert(!merge.keys().empty()); + assert(merge.keys().size() == merge.values().size()); + + // Hack to make sure last_sequence_for_key is correct + ParseInternalKey(merge.keys().front(), &ikey); + } + } + + last_sequence_for_key = ikey.sequence; + visible_in_snapshot = visible; + } + + if (!drop) { + // We may write a single key (e.g.: for Put/Delete or successful merge). + // Or we may instead have to write a sequence/list of keys. + // We have to write a sequence iff we have an unsuccessful merge + bool has_merge_list = current_entry_is_merging && !merge.IsSuccess(); + const std::deque<std::string>* keys = nullptr; + const std::deque<std::string>* values = nullptr; + std::deque<std::string>::const_reverse_iterator key_iter; + std::deque<std::string>::const_reverse_iterator value_iter; + if (has_merge_list) { + keys = &merge.keys(); + values = &merge.values(); + key_iter = keys->rbegin(); // The back (*rbegin()) is the first key + value_iter = values->rbegin(); + + key = Slice(*key_iter); + value = Slice(*value_iter); + } + + // If we have a list of keys to write, traverse the list. + // If we have a single key to write, simply write that key. + while (true) { + // Invariant: key,value,ikey will always be the next entry to write + char* kptr = (char*)key.data(); + std::string kstr; + + // Zeroing out the sequence number leads to better compression. 
+ // If this is the bottommost level (no files in lower levels) + // and the earliest snapshot is larger than this seqno + // then we can squash the seqno to zero. + if (bottommost_level_ && ikey.sequence < earliest_snapshot_ && + ikey.type != kTypeMerge) { + assert(ikey.type != kTypeDeletion); + // make a copy because updating in place would cause problems + // with the priority queue that is managing the input key iterator + kstr.assign(key.data(), key.size()); + kptr = (char*)kstr.c_str(); + UpdateInternalKey(kptr, key.size(), (uint64_t)0, ikey.type); + } + + Slice newkey(kptr, key.size()); + assert((key.clear(), 1)); // we do not need 'key' anymore + + // Open output file if necessary + if (compact_->builder == nullptr) { + status = OpenCompactionOutputFile(); + if (!status.ok()) { + break; + } + } + + SequenceNumber seqno = GetInternalKeySeqno(newkey); + if (compact_->builder->NumEntries() == 0) { + compact_->current_output()->smallest.DecodeFrom(newkey); + compact_->current_output()->smallest_seqno = seqno; + } else { + compact_->current_output()->smallest_seqno = + std::min(compact_->current_output()->smallest_seqno, seqno); + } + compact_->current_output()->largest.DecodeFrom(newkey); + compact_->builder->Add(newkey, value); + compact_->num_output_records++, + compact_->current_output()->largest_seqno = + std::max(compact_->current_output()->largest_seqno, seqno); + + // Close output file if it is big enough + if (compact_->builder->FileSize() >= + compact_->compaction->MaxOutputFileSize()) { + status = FinishCompactionOutputFile(input); + if (!status.ok()) { + break; + } + } + + // If we have a list of entries, move to next element + // If we only had one entry, then break the loop. 
+ if (has_merge_list) { + ++key_iter; + ++value_iter; + + // If at end of list + if (key_iter == keys->rend() || value_iter == values->rend()) { + // Sanity Check: if one ends, then both end + assert(key_iter == keys->rend() && value_iter == values->rend()); + break; + } + + // Otherwise not at end of list. Update key, value, and ikey. + key = Slice(*key_iter); + value = Slice(*value_iter); + ParseInternalKey(key, &ikey); + + } else { + // Only had one item to begin with (Put/Delete) + break; + } + } // while (true) + } // if (!drop) + + // MergeUntil has moved input to the next entry + if (!current_entry_is_merging) { + input->Next(); + } + } + if (key_drop_user > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_USER, key_drop_user); + } + if (key_drop_newer_entry > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY, key_drop_newer_entry); + } + if (key_drop_obsolete > 0) { + RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, key_drop_obsolete); + } + RecordCompactionIOStats(); + + return status; +} + +void CompactionJob::CallCompactionFilterV2( + CompactionFilterV2* compaction_filter_v2) { + if (compact_ == nullptr || compaction_filter_v2 == nullptr) { + return; + } + + // Assemble slice vectors for user keys and existing values. + // We also keep track of our parsed internal key structs because + // we may need to access the sequence number in the event that + // keys are garbage collected during the filter process. + std::vector ikey_buf; + std::vector user_key_buf; + std::vector existing_value_buf; + + for (const auto& key : compact_->key_str_buf_) { + ParsedInternalKey ikey; + ParseInternalKey(Slice(key), &ikey); + ikey_buf.emplace_back(ikey); + user_key_buf.emplace_back(ikey.user_key); + } + for (const auto& value : compact_->existing_value_str_buf_) { + existing_value_buf.emplace_back(Slice(value)); + } + + // If the user has specified a compaction filter and the sequence + // number is greater than any external snapshot, then invoke the + // filter. 
+ // If the return value of the compaction filter is true, replace + // the entry with a delete marker. + compact_->to_delete_buf_ = compaction_filter_v2->Filter( + compact_->compaction->level(), user_key_buf, existing_value_buf, + &compact_->new_value_buf_, &compact_->value_changed_buf_); + + // new_value_buf_.size() <= to_delete__buf_.size(). "=" iff all + // kv-pairs in this compaction run needs to be deleted. + assert(compact_->to_delete_buf_.size() == compact_->key_str_buf_.size()); + assert(compact_->to_delete_buf_.size() == + compact_->existing_value_str_buf_.size()); + assert(compact_->value_changed_buf_.empty() || + compact_->to_delete_buf_.size() == + compact_->value_changed_buf_.size()); + + int new_value_idx = 0; + for (unsigned int i = 0; i < compact_->to_delete_buf_.size(); ++i) { + if (compact_->to_delete_buf_[i]) { + // update the string buffer directly + // the Slice buffer points to the updated buffer + UpdateInternalKey(&compact_->key_str_buf_[i][0], + compact_->key_str_buf_[i].size(), ikey_buf[i].sequence, + kTypeDeletion); + + // no value associated with delete + compact_->existing_value_str_buf_[i].clear(); + RecordTick(stats_, COMPACTION_KEY_DROP_USER); + } else if (!compact_->value_changed_buf_.empty() && + compact_->value_changed_buf_[i]) { + compact_->existing_value_str_buf_[i] = + compact_->new_value_buf_[new_value_idx++]; + } + } // for +} + +Status CompactionJob::FinishCompactionOutputFile(Iterator* input) { + assert(compact_ != nullptr); + assert(compact_->outfile); + assert(compact_->builder != nullptr); + + const uint64_t output_number = compact_->current_output()->number; + const uint32_t output_path_id = compact_->current_output()->path_id; + assert(output_number != 0); + + // Check for iterator errors + Status s = input->status(); + const uint64_t current_entries = compact_->builder->NumEntries(); + if (s.ok()) { + s = compact_->builder->Finish(); + } else { + compact_->builder->Abandon(); + } + const uint64_t current_bytes = 
compact_->builder->FileSize(); + compact_->current_output()->file_size = current_bytes; + compact_->total_bytes += current_bytes; + compact_->builder.reset(); + + // Finish and check for file errors + if (s.ok() && !db_options_.disableDataSync) { + if (db_options_.use_fsync) { + StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); + s = compact_->outfile->Fsync(); + } else { + StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); + s = compact_->outfile->Sync(); + } + } + if (s.ok()) { + s = compact_->outfile->Close(); + } + compact_->outfile.reset(); + + if (s.ok() && current_entries > 0) { + // Verify that the table is usable + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + FileDescriptor fd(output_number, output_path_id, current_bytes); + Iterator* iter = cfd->table_cache()->NewIterator( + ReadOptions(), env_options_, cfd->internal_comparator(), fd); + s = iter->status(); + delete iter; + if (s.ok()) { + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "[%s] Generated table #%" PRIu64 ": %" PRIu64 + " keys, %" PRIu64 " bytes", cfd->GetName().c_str(), + output_number, current_entries, current_bytes); + } + } + return s; +} + +Status CompactionJob::InstallCompactionResults(InstrumentedMutex* db_mutex) { + db_mutex->AssertHeld(); + + // paranoia: verify that the files that we started with + // still exist in the current version and in the same original level. + // This ensures that a concurrent compaction did not erroneously + // pick the same files to compact_. 
+ if (!versions_->VerifyCompactionFileConsistency(compact_->compaction)) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "[%s] Compaction %d@%d + %d@%d files aborted", + compact_->compaction->column_family_data()->GetName().c_str(), + compact_->compaction->num_input_files(0), compact_->compaction->level(), + compact_->compaction->num_input_files(1), + compact_->compaction->output_level()); + return Status::Corruption("Compaction input files inconsistent"); + } + + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "[%s] Compacted %d@%d + %d@%d files => %" PRIu64 " bytes", + compact_->compaction->column_family_data()->GetName().c_str(), + compact_->compaction->num_input_files(0), + compact_->compaction->level(), + compact_->compaction->num_input_files(1), + compact_->compaction->output_level(), + compact_->total_bytes); + + // Add compaction outputs + compact_->compaction->AddInputDeletions(compact_->compaction->edit()); + for (size_t i = 0; i < compact_->outputs.size(); i++) { + const CompactionState::Output& out = compact_->outputs[i]; + compact_->compaction->edit()->AddFile( + compact_->compaction->output_level(), out.number, out.path_id, + out.file_size, out.smallest, out.largest, out.smallest_seqno, + out.largest_seqno); + } + return versions_->LogAndApply( + compact_->compaction->column_family_data(), mutable_cf_options_, + compact_->compaction->edit(), db_mutex, db_directory_); +} + +// Given a sequence number, return the sequence number of the +// earliest snapshot that this sequence number is visible in. +// The snapshots themselves are arranged in ascending order of +// sequence numbers. +// Employ a sequential search because the total number of +// snapshots are typically small. 
+inline SequenceNumber CompactionJob::findEarliestVisibleSnapshot( + SequenceNumber in, const std::vector& snapshots, + SequenceNumber* prev_snapshot) { + assert(snapshots.size()); + SequenceNumber prev __attribute__((unused)) = 0; + for (const auto cur : snapshots) { + assert(prev <= cur); + if (cur >= in) { + *prev_snapshot = prev; + return cur; + } + prev = cur; // assignment + assert(prev); + } + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "CompactionJob is not able to find snapshot" + " with SeqId later than %" PRIu64 + ": current MaxSeqId is %" PRIu64 "", + in, snapshots[snapshots.size() - 1]); + assert(0); + return 0; +} + +void CompactionJob::RecordCompactionIOStats() { + RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read)); + IOSTATS_RESET(bytes_read); + RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written)); + IOSTATS_RESET(bytes_written); +} + +Status CompactionJob::OpenCompactionOutputFile() { + assert(compact_ != nullptr); + assert(compact_->builder == nullptr); + // no need to lock because VersionSet::next_file_number_ is atomic + uint64_t file_number = versions_->NewFileNumber(); + // Make the output file + std::string fname = TableFileName(db_options_.db_paths, file_number, + compact_->compaction->GetOutputPathId()); + Status s = env_->NewWritableFile(fname, &compact_->outfile, env_options_); + + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "[%s] OpenCompactionOutputFiles for table #%" PRIu64 + " fails at NewWritableFile with status %s", + compact_->compaction->column_family_data()->GetName().c_str(), + file_number, s.ToString().c_str()); + LogFlush(db_options_.info_log); + return s; + } + CompactionState::Output out; + out.number = file_number; + out.path_id = compact_->compaction->GetOutputPathId(); + out.smallest.Clear(); + out.largest.Clear(); + out.smallest_seqno = out.largest_seqno = 0; + + compact_->outputs.push_back(out); + compact_->outfile->SetIOPriority(Env::IO_LOW); + 
compact_->outfile->SetPreallocationBlockSize(static_cast<size_t>( + compact_->compaction->OutputFilePreallocationSize(mutable_cf_options_))); + + ColumnFamilyData* cfd = compact_->compaction->column_family_data(); + compact_->builder.reset(NewTableBuilder( + *cfd->ioptions(), cfd->internal_comparator(), compact_->outfile.get(), + compact_->compaction->OutputCompressionType(), + cfd->ioptions()->compression_opts)); + LogFlush(db_options_.info_log); + return s; +} + +void CompactionJob::CleanupCompaction(const Status& status) { + if (compact_->builder != nullptr) { + // May happen if we get a shutdown call in the middle of compaction + compact_->builder->Abandon(); + compact_->builder.reset(); + } else { + assert(!status.ok() || compact_->outfile == nullptr); + } + for (size_t i = 0; i < compact_->outputs.size(); i++) { + const CompactionState::Output& out = compact_->outputs[i]; + + // If this file was inserted into the table cache then remove + // them here because this compaction was not committed. + if (!status.ok()) { + TableCache::Evict(table_cache_.get(), out.number); + } + } + delete compact_; + compact_ = nullptr; +} + +} // namespace rocksdb diff --git a/db/compaction_job.h b/db/compaction_job.h new file mode 100644 index 000000000..cc31ece87 --- /dev/null +++ b/db/compaction_job.h @@ -0,0 +1,127 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/log_writer.h" +#include "db/snapshot.h" +#include "db/column_family.h" +#include "db/version_edit.h" +#include "db/memtable_list.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/transaction_log.h" +#include "util/autovector.h" +#include "util/stop_watch.h" +#include "util/thread_local.h" +#include "util/scoped_arena_iterator.h" +#include "db/internal_stats.h" +#include "db/write_controller.h" +#include "db/flush_scheduler.h" +#include "db/write_thread.h" +#include "db/job_context.h" + +namespace rocksdb { + +class MemTable; +class TableCache; +class Version; +class VersionEdit; +class VersionSet; +class Arena; + +class CompactionJob { + public: + // TODO(icanadi) make effort to reduce number of parameters here + // IMPORTANT: mutable_cf_options needs to be alive while CompactionJob is + // alive + CompactionJob(Compaction* compaction, const DBOptions& db_options, + const MutableCFOptions& mutable_cf_options, + const EnvOptions& env_options, VersionSet* versions, + std::atomic* shutting_down, LogBuffer* log_buffer, + Directory* db_directory, Directory* output_directory, + Statistics* stats, SnapshotList* snapshot_list, + bool is_snapshot_supported, std::shared_ptr table_cache, + std::function yield_callback); + + ~CompactionJob() { assert(compact_ == nullptr); } + + // no copy/move + CompactionJob(CompactionJob&& job) = delete; + CompactionJob(const CompactionJob& job) = delete; + CompactionJob& operator=(const CompactionJob& job) = delete; + + // REQUIRED: mutex held + void Prepare(); + // REQUIRED mutex not held + Status Run(); + // REQUIRED: mutex held + // status is the return of Run() + void Install(Status* status, InstrumentedMutex* db_mutex); + + private: + void AllocateCompactionOutputFileNumbers(); + 
// Call compaction filter if is_compaction_v2 is not true. Then iterate + // through input and compact the kv-pairs + Status ProcessKeyValueCompaction(int64_t* imm_micros, Iterator* input, + bool is_compaction_v2); + // Call compaction_filter_v2->Filter() on kv-pairs in compact + void CallCompactionFilterV2(CompactionFilterV2* compaction_filter_v2); + Status FinishCompactionOutputFile(Iterator* input); + Status InstallCompactionResults(InstrumentedMutex* db_mutex); + SequenceNumber findEarliestVisibleSnapshot( + SequenceNumber in, const std::vector& snapshots, + SequenceNumber* prev_snapshot); + void RecordCompactionIOStats(); + Status OpenCompactionOutputFile(); + void CleanupCompaction(const Status& status); + + // CompactionJob state + struct CompactionState; + CompactionState* compact_; + + bool bottommost_level_; + SequenceNumber earliest_snapshot_; + SequenceNumber visible_at_tip_; + SequenceNumber latest_snapshot_; + + InternalStats::CompactionStats compaction_stats_; + + // DBImpl state + const DBOptions& db_options_; + const MutableCFOptions& mutable_cf_options_; + const EnvOptions& env_options_; + Env* env_; + VersionSet* versions_; + std::atomic* shutting_down_; + LogBuffer* log_buffer_; + Directory* db_directory_; + Directory* output_directory_; + Statistics* stats_; + SnapshotList* snapshots_; + bool is_snapshot_supported_; + std::shared_ptr table_cache_; + + // yield callback + std::function yield_callback_; +}; + +} // namespace rocksdb diff --git a/db/compaction_job_test.cc b/db/compaction_job_test.cc new file mode 100644 index 000000000..2a089dc57 --- /dev/null +++ b/db/compaction_job_test.cc @@ -0,0 +1,183 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include +#include + +#include "db/compaction_job.h" +#include "db/column_family.h" +#include "db/version_set.h" +#include "db/writebuffer.h" +#include "rocksdb/cache.h" +#include "rocksdb/options.h" +#include "rocksdb/db.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "table/mock_table.h" + +namespace rocksdb { + +// TODO(icanadi) Make it simpler once we mock out VersionSet +class CompactionJobTest { + public: + CompactionJobTest() + : env_(Env::Default()), + dbname_(test::TmpDir() + "/compaction_job_test"), + mutable_cf_options_(Options(), ImmutableCFOptions(Options())), + table_cache_(NewLRUCache(50000, 16, 8)), + write_buffer_(db_options_.db_write_buffer_size), + versions_(new VersionSet(dbname_, &db_options_, env_options_, + table_cache_.get(), &write_buffer_, + &write_controller_)), + shutting_down_(false), + mock_table_factory_(new mock::MockTableFactory()) { + ASSERT_OK(env_->CreateDirIfMissing(dbname_)); + db_options_.db_paths.emplace_back(dbname_, + std::numeric_limits::max()); + NewDB(); + std::vector column_families; + cf_options_.table_factory = mock_table_factory_; + column_families.emplace_back(kDefaultColumnFamilyName, cf_options_); + + + ASSERT_OK(versions_->Recover(column_families, false)); + } + + std::string GenerateFileName(uint64_t file_number) { + FileMetaData meta; + std::vector db_paths; + db_paths.emplace_back(dbname_, std::numeric_limits::max()); + meta.fd = FileDescriptor(file_number, 0, 0); + return TableFileName(db_paths, meta.fd.GetNumber(), meta.fd.GetPathId()); + } + + // returns expected result after compaction + mock::MockFileContents CreateTwoFiles() { + mock::MockFileContents expected_results; + const int kKeysPerFile = 10000; + SequenceNumber sequence_number = 0; + for (int i = 0; i < 2; ++i) { + mock::MockFileContents contents; + SequenceNumber smallest_seqno = 0, largest_seqno = 0; + InternalKey smallest, largest; + for (int k = 0; k < kKeysPerFile; ++k) { + auto key = ToString(i * (kKeysPerFile / 
2) + k); + auto value = ToString(i * kKeysPerFile + k); + InternalKey internal_key(key, ++sequence_number, kTypeValue); + if (k == 0) { + smallest = internal_key; + smallest_seqno = sequence_number; + } else if (k == kKeysPerFile - 1) { + largest = internal_key; + largest_seqno = sequence_number; + } + std::pair key_value( + {internal_key.Encode().ToString(), value}); + contents.insert(key_value); + if (i == 1 || k < kKeysPerFile / 2) { + expected_results.insert(key_value); + } + } + + uint64_t file_number = versions_->NewFileNumber(); + ASSERT_OK(mock_table_factory_->CreateMockTable( + env_, GenerateFileName(file_number), std::move(contents))); + + VersionEdit edit; + edit.AddFile(0, file_number, 0, 10, smallest, largest, smallest_seqno, + largest_seqno); + + mutex_.Lock(); + versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), + mutable_cf_options_, &edit, &mutex_); + mutex_.Unlock(); + } + versions_->SetLastSequence(sequence_number); + return expected_results; + } + + void NewDB() { + VersionEdit new_db; + new_db.SetLogNumber(0); + new_db.SetNextFile(2); + new_db.SetLastSequence(0); + + const std::string manifest = DescriptorFileName(dbname_, 1); + unique_ptr file; + Status s = env_->NewWritableFile( + manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + ASSERT_OK(s); + { + log::Writer log(std::move(file)); + std::string record; + new_db.EncodeTo(&record); + s = log.AddRecord(record); + } + ASSERT_OK(s); + // Make "CURRENT" file that points to the new manifest file. 
+ s = SetCurrentFile(env_, dbname_, 1, nullptr); + } + + Env* env_; + std::string dbname_; + EnvOptions env_options_; + MutableCFOptions mutable_cf_options_; + std::shared_ptr table_cache_; + WriteController write_controller_; + DBOptions db_options_; + ColumnFamilyOptions cf_options_; + WriteBuffer write_buffer_; + std::unique_ptr versions_; + InstrumentedMutex mutex_; + std::atomic shutting_down_; + std::shared_ptr mock_table_factory_; +}; + +TEST(CompactionJobTest, Simple) { + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + + auto expected_results = CreateTwoFiles(); + + auto files = cfd->current()->storage_info()->LevelFiles(0); + ASSERT_EQ(2U, files.size()); + + std::unique_ptr compaction(Compaction::TEST_NewCompaction( + 7, 0, 1, 1024 * 1024, 10, 0, kNoCompression)); + compaction->SetInputVersion(cfd->current()); + + auto compaction_input_files = compaction->TEST_GetInputFiles(0); + compaction_input_files->level = 0; + compaction_input_files->files.push_back(files[0]); + compaction_input_files->files.push_back(files[1]); + + SnapshotList snapshots; + int yield_callback_called = 0; + std::function yield_callback = [&]() { + yield_callback_called++; + return 0; + }; + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); + mutex_.Lock(); + CompactionJob compaction_job(compaction.get(), db_options_, + *cfd->GetLatestMutableCFOptions(), env_options_, + versions_.get(), &shutting_down_, &log_buffer, + nullptr, nullptr, nullptr, &snapshots, true, + table_cache_, std::move(yield_callback)); + compaction_job.Prepare(); + mutex_.Unlock(); + ASSERT_OK(compaction_job.Run()); + mutex_.Lock(); + Status s; + compaction_job.Install(&s, &mutex_); + ASSERT_OK(s); + mutex_.Unlock(); + + mock_table_factory_->AssertLatestFile(expected_results); + ASSERT_EQ(yield_callback_called, 20000); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/db/compaction_picker.cc 
b/db/compaction_picker.cc index e05d07776..f74e63436 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -9,15 +9,21 @@ #include "db/compaction_picker.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include +#include #include "db/filename.h" #include "util/log_buffer.h" #include "util/statistics.h" +#include "util/string_util.h" namespace rocksdb { +namespace { uint64_t TotalCompensatedFileSize(const std::vector& files) { uint64_t sum = 0; for (size_t i = 0; i < files.size() && files[i]; i++) { @@ -26,93 +32,45 @@ uint64_t TotalCompensatedFileSize(const std::vector& files) { return sum; } -namespace { // Determine compression type, based on user options, level of the output // file and whether compression is disabled. // If enable_compression is false, then compression is always disabled no // matter what the values of the other two parameters are. // Otherwise, the compression type is determined based on options and level. -CompressionType GetCompressionType(const Options& options, int level, - const bool enable_compression = true) { +CompressionType GetCompressionType( + const ImmutableCFOptions& ioptions, int level, + const bool enable_compression = true) { if (!enable_compression) { // disable compression return kNoCompression; } // If the use has specified a different compression level for each level, - // then pick the compresison for that level. - if (!options.compression_per_level.empty()) { - const int n = options.compression_per_level.size() - 1; + // then pick the compression for that level. + if (!ioptions.compression_per_level.empty()) { + const int n = static_cast(ioptions.compression_per_level.size()) - 1; // It is possible for level_ to be -1; in that case, we use level // 0's compression. This occurs mostly in backwards compatibility // situations when the builder doesn't know what level the file - // belongs to. Likewise, if level_ is beyond the end of the + // belongs to. 
Likewise, if level is beyond the end of the // specified compression levels, use the last value. - return options.compression_per_level[std::max(0, std::min(level, n))]; + return ioptions.compression_per_level[std::max(0, std::min(level, n))]; } else { - return options.compression; + return ioptions.compression; } } -// Multiple two operands. If they overflow, return op1. -uint64_t MultiplyCheckOverflow(uint64_t op1, int op2) { - if (op1 == 0) { - return 0; - } - if (op2 <= 0) { - return op1; - } - uint64_t casted_op2 = (uint64_t) op2; - if (std::numeric_limits::max() / op1 < casted_op2) { - return op1; - } - return op1 * casted_op2; -} } // anonymous namespace -CompactionPicker::CompactionPicker(const Options* options, +CompactionPicker::CompactionPicker(const ImmutableCFOptions& ioptions, const InternalKeyComparator* icmp) - : compactions_in_progress_(options->num_levels), - options_(options), - num_levels_(options->num_levels), + : ioptions_(ioptions), + compactions_in_progress_(ioptions_.num_levels), icmp_(icmp) { - - max_file_size_.reset(new uint64_t[NumberLevels()]); - level_max_bytes_.reset(new uint64_t[NumberLevels()]); - int target_file_size_multiplier = options_->target_file_size_multiplier; - int max_bytes_multiplier = options_->max_bytes_for_level_multiplier; - for (int i = 0; i < NumberLevels(); i++) { - if (i == 0 && options_->compaction_style == kCompactionStyleUniversal) { - max_file_size_[i] = ULLONG_MAX; - level_max_bytes_[i] = options_->max_bytes_for_level_base; - } else if (i > 1) { - max_file_size_[i] = MultiplyCheckOverflow(max_file_size_[i - 1], - target_file_size_multiplier); - level_max_bytes_[i] = MultiplyCheckOverflow( - MultiplyCheckOverflow(level_max_bytes_[i - 1], max_bytes_multiplier), - options_->max_bytes_for_level_multiplier_additional[i - 1]); - } else { - max_file_size_[i] = options_->target_file_size_base; - level_max_bytes_[i] = options_->max_bytes_for_level_base; - } - } } CompactionPicker::~CompactionPicker() {} -void 
CompactionPicker::SizeBeingCompacted(std::vector& sizes) { - for (int level = 0; level < NumberLevels() - 1; level++) { - uint64_t total = 0; - for (auto c : compactions_in_progress_[level]) { - assert(c->level() == level); - for (int i = 0; i < c->num_input_files(0); i++) { - total += c->input(0, i)->compensated_file_size; - } - } - sizes[level] = total; - } -} - // Clear all files to indicate that they are not being compacted // Delete this compaction from the list of running compactions. void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) { @@ -123,26 +81,6 @@ void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) { } } -uint64_t CompactionPicker::MaxFileSizeForLevel(int level) const { - assert(level >= 0); - assert(level < NumberLevels()); - return max_file_size_[level]; -} - -uint64_t CompactionPicker::MaxGrandParentOverlapBytes(int level) { - uint64_t result = MaxFileSizeForLevel(level); - result *= options_->max_grandparent_overlap_factor; - return result; -} - -double CompactionPicker::MaxBytesForLevel(int level) { - // Note: the result for level zero is not really used since we set - // the level-0 compaction threshold based on number of files. - assert(level >= 0); - assert(level < NumberLevels()); - return level_max_bytes_[level]; -} - void CompactionPicker::GetRange(const std::vector& inputs, InternalKey* smallest, InternalKey* largest) { assert(!inputs.empty()); @@ -172,7 +110,9 @@ void CompactionPicker::GetRange(const std::vector& inputs1, GetRange(all, smallest, largest); } -bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) { +bool CompactionPicker::ExpandWhileOverlapping(const std::string& cf_name, + VersionStorageInfo* vstorage, + Compaction* c) { assert(c != nullptr); // If inputs are empty then there is nothing to expand. 
if (c->inputs_[0].empty()) { @@ -199,9 +139,9 @@ bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) { old_size = c->inputs_[0].size(); GetRange(c->inputs_[0].files, &smallest, &largest); c->inputs_[0].clear(); - c->input_version_->GetOverlappingInputs( - level, &smallest, &largest, &c->inputs_[0].files, - hint_index, &hint_index); + vstorage->GetOverlappingInputs(level, &smallest, &largest, + &c->inputs_[0].files, hint_index, + &hint_index); } while(c->inputs_[0].size() > old_size); // Get the new range @@ -211,29 +151,30 @@ bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) { // compaction, then we must drop/cancel this compaction. int parent_index = -1; if (c->inputs_[0].empty()) { - Log(options_->info_log, + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, "[%s] ExpandWhileOverlapping() failure because zero input files", - c->column_family_data()->GetName().c_str()); + cf_name.c_str()); } if (c->inputs_[0].empty() || FilesInCompaction(c->inputs_[0].files) || (c->level() != c->output_level() && - ParentRangeInCompaction(c->input_version_, &smallest, &largest, level, + ParentRangeInCompaction(vstorage, &smallest, &largest, level, &parent_index))) { c->inputs_[0].clear(); c->inputs_[1].clear(); + if (!c->inputs_[0].empty()) { + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, + "[%s] ExpandWhileOverlapping() failure because some of the necessary" + " compaction input files are currently being compacted.", + c->column_family_data()->GetName().c_str()); + } return false; } return true; } -uint64_t CompactionPicker::ExpandedCompactionByteSizeLimit(int level) { - uint64_t result = MaxFileSizeForLevel(level); - result *= options_->expanded_compaction_factor; - return result; -} - // Returns true if any one of specified files are being compacted -bool CompactionPicker::FilesInCompaction(std::vector& files) { +bool CompactionPicker::FilesInCompaction( + const std::vector& files) { for (unsigned int i = 0; i < files.size(); i++) { if 
(files[i]->being_compacted) { return true; @@ -242,16 +183,99 @@ bool CompactionPicker::FilesInCompaction(std::vector& files) { return false; } +Compaction* CompactionPicker::FormCompaction( + const CompactionOptions& compact_options, + const autovector& input_files, + int output_level, VersionStorageInfo* vstorage, + const MutableCFOptions& mutable_cf_options) const { + uint64_t max_grandparent_overlap_bytes = + output_level + 1 < vstorage->num_levels() ? + mutable_cf_options.MaxGrandParentOverlapBytes(output_level + 1) : + std::numeric_limits::max(); + assert(input_files.size()); + auto c = new Compaction(vstorage, input_files, + input_files[0].level, output_level, + max_grandparent_overlap_bytes, + compact_options, false); + c->mutable_cf_options_ = mutable_cf_options; + c->MarkFilesBeingCompacted(true); + + // TODO(yhchiang): complete the SetBottomMostLevel as follows + // If there is no any key of the range in DB that is older than the + // range to compact, it is bottom most. For leveled compaction, + // if number-of_level-1 is empty, and output is going to number-of_level-2, + // it is also bottom-most. On the other hand, if number of level=1 ( + // something like universal), the compaction is only "bottom-most" if + // the oldest file is involved. 
+ c->SetupBottomMostLevel( + vstorage, + (output_level == vstorage->num_levels() - 1), + (output_level == 0)); + return c; +} + +Status CompactionPicker::GetCompactionInputsFromFileNumbers( + autovector* input_files, + std::unordered_set* input_set, + const VersionStorageInfo* vstorage, + const CompactionOptions& compact_options) const { + if (input_set->size() == 0U) { + return Status::InvalidArgument( + "Compaction must include at least one file."); + } + assert(input_files); + + autovector matched_input_files; + matched_input_files.resize(vstorage->num_levels()); + int first_non_empty_level = -1; + int last_non_empty_level = -1; + // TODO(yhchiang): use a lazy-initialized mapping from + // file_number to FileMetaData in Version. + for (int level = 0; level < vstorage->num_levels(); ++level) { + for (auto file : vstorage->LevelFiles(level)) { + auto iter = input_set->find(file->fd.GetNumber()); + if (iter != input_set->end()) { + matched_input_files[level].files.push_back(file); + input_set->erase(iter); + last_non_empty_level = level; + if (first_non_empty_level == -1) { + first_non_empty_level = level; + } + } + } + } + + if (!input_set->empty()) { + std::string message( + "Cannot find matched SST files for the following file numbers:"); + for (auto fn : *input_set) { + message += " "; + message += ToString(fn); + } + return Status::InvalidArgument(message); + } + + for (int level = first_non_empty_level; + level <= last_non_empty_level; ++level) { + matched_input_files[level].level = level; + input_files->emplace_back(std::move(matched_input_files[level])); + } + + return Status::OK(); +} + + + // Returns true if any one of the parent files are being compacted -bool CompactionPicker::ParentRangeInCompaction(Version* version, +bool CompactionPicker::ParentRangeInCompaction(VersionStorageInfo* vstorage, const InternalKey* smallest, const InternalKey* largest, int level, int* parent_index) { std::vector inputs; assert(level + 1 < NumberLevels()); - 
version->GetOverlappingInputs(level + 1, smallest, largest, &inputs, - *parent_index, parent_index); + vstorage->GetOverlappingInputs(level + 1, smallest, largest, &inputs, + *parent_index, parent_index); return FilesInCompaction(inputs); } @@ -259,7 +283,9 @@ bool CompactionPicker::ParentRangeInCompaction(Version* version, // Will also attempt to expand "level" if that doesn't expand "level+1" // or cause "level" to include a file for compaction that has an overlapping // user-key with another file. -void CompactionPicker::SetupOtherInputs(Compaction* c) { +void CompactionPicker::SetupOtherInputs( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, Compaction* c) { // If inputs are empty, then there is nothing to expand. // If both input and output levels are the same, no need to consider // files at level "level+1" @@ -274,10 +300,9 @@ void CompactionPicker::SetupOtherInputs(Compaction* c) { GetRange(c->inputs_[0].files, &smallest, &largest); // Populate the set of next-level files (inputs_[1]) to include in compaction - c->input_version_->GetOverlappingInputs( - level + 1, &smallest, &largest, - &c->inputs_[1].files, c->parent_index_, - &c->parent_index_); + vstorage->GetOverlappingInputs(level + 1, &smallest, &largest, + &c->inputs_[1].files, c->parent_index_, + &c->parent_index_); // Get entire range covered by compaction InternalKey all_start, all_limit; @@ -290,31 +315,30 @@ void CompactionPicker::SetupOtherInputs(Compaction* c) { // can happen when one user key spans multiple files. 
if (!c->inputs_[1].empty()) { std::vector expanded0; - c->input_version_->GetOverlappingInputs( - level, &all_start, &all_limit, &expanded0, c->base_index_, nullptr); + vstorage->GetOverlappingInputs(level, &all_start, &all_limit, &expanded0, + c->base_index_, nullptr); const uint64_t inputs0_size = TotalCompensatedFileSize(c->inputs_[0].files); const uint64_t inputs1_size = TotalCompensatedFileSize(c->inputs_[1].files); const uint64_t expanded0_size = TotalCompensatedFileSize(expanded0); - uint64_t limit = ExpandedCompactionByteSizeLimit(level); + uint64_t limit = mutable_cf_options.ExpandedCompactionByteSizeLimit(level); if (expanded0.size() > c->inputs_[0].size() && inputs1_size + expanded0_size < limit && !FilesInCompaction(expanded0) && - !c->input_version_->HasOverlappingUserKey(&expanded0, level)) { + !vstorage->HasOverlappingUserKey(&expanded0, level)) { InternalKey new_start, new_limit; GetRange(expanded0, &new_start, &new_limit); std::vector expanded1; - c->input_version_->GetOverlappingInputs(level + 1, &new_start, &new_limit, - &expanded1, c->parent_index_, - &c->parent_index_); + vstorage->GetOverlappingInputs(level + 1, &new_start, &new_limit, + &expanded1, c->parent_index_, + &c->parent_index_); if (expanded1.size() == c->inputs_[1].size() && !FilesInCompaction(expanded1)) { - Log(options_->info_log, + Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log, "[%s] Expanding@%d %zu+%zu (%" PRIu64 "+%" PRIu64 " bytes) to %zu+%zu (%" PRIu64 "+%" PRIu64 "bytes)\n", - c->column_family_data()->GetName().c_str(), level, - c->inputs_[0].size(), c->inputs_[1].size(), inputs0_size, - inputs1_size, expanded0.size(), expanded1.size(), expanded0_size, - inputs1_size); + cf_name.c_str(), level, c->inputs_[0].size(), c->inputs_[1].size(), + inputs0_size, inputs1_size, expanded0.size(), expanded1.size(), + expanded0_size, inputs1_size); smallest = new_start; largest = new_limit; c->inputs_[0].files = expanded0; @@ -328,30 +352,30 @@ void 
CompactionPicker::SetupOtherInputs(Compaction* c) { // Compute the set of grandparent files that overlap this compaction // (parent == level+1; grandparent == level+2) if (level + 2 < NumberLevels()) { - c->input_version_->GetOverlappingInputs(level + 2, &all_start, &all_limit, - &c->grandparents_); + vstorage->GetOverlappingInputs(level + 2, &all_start, &all_limit, + &c->grandparents_); } } -Compaction* CompactionPicker::CompactRange(Version* version, int input_level, - int output_level, - uint32_t output_path_id, - const InternalKey* begin, - const InternalKey* end, - InternalKey** compaction_end) { +Compaction* CompactionPicker::CompactRange( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, int input_level, int output_level, + uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end) { // CompactionPickerFIFO has its own implementation of compact range - assert(options_->compaction_style != kCompactionStyleFIFO); + assert(ioptions_.compaction_style != kCompactionStyleFIFO); std::vector inputs; bool covering_the_whole_range = true; // All files are 'overlapping' in universal style compaction. // We have to compact the entire range in one shot. - if (options_->compaction_style == kCompactionStyleUniversal) { + if (ioptions_.compaction_style == kCompactionStyleUniversal) { begin = nullptr; end = nullptr; } - version->GetOverlappingInputs(input_level, begin, end, &inputs); + + vstorage->GetOverlappingInputs(input_level, begin, end, &inputs); if (inputs.empty()) { return nullptr; } @@ -361,8 +385,8 @@ Compaction* CompactionPicker::CompactRange(Version* version, int input_level, // and we must not pick one file and drop another older file if the // two files overlap. 
if (input_level > 0) { - const uint64_t limit = - MaxFileSizeForLevel(input_level) * options_->source_compaction_factor; + const uint64_t limit = mutable_cf_options.MaxFileSizeForLevel(input_level) * + mutable_cf_options.source_compaction_factor; uint64_t total = 0; for (size_t i = 0; i + 1 < inputs.size(); ++i) { uint64_t s = inputs[i]->compensated_file_size; @@ -375,22 +399,22 @@ Compaction* CompactionPicker::CompactRange(Version* version, int input_level, } } } - assert(output_path_id < static_cast(options_->db_paths.size())); + assert(output_path_id < static_cast(ioptions_.db_paths.size())); Compaction* c = new Compaction( - version, input_level, output_level, MaxFileSizeForLevel(output_level), - MaxGrandParentOverlapBytes(input_level), output_path_id, - GetCompressionType(*options_, output_level)); + vstorage->num_levels(), input_level, output_level, + mutable_cf_options.MaxFileSizeForLevel(output_level), + mutable_cf_options.MaxGrandParentOverlapBytes(input_level), + output_path_id, GetCompressionType(ioptions_, output_level)); c->inputs_[0].files = inputs; - if (ExpandWhileOverlapping(c) == false) { + if (ExpandWhileOverlapping(cf_name, vstorage, c) == false) { delete c; - Log(options_->info_log, - "[%s] Could not compact due to expansion failure.\n", - version->cfd_->GetName().c_str()); + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, + "[%s] Could not compact due to expansion failure.\n", cf_name.c_str()); return nullptr; } - SetupOtherInputs(c); + SetupOtherInputs(cf_name, mutable_cf_options, vstorage, c); if (covering_the_whole_range) { *compaction_end = nullptr; @@ -402,35 +426,274 @@ Compaction* CompactionPicker::CompactRange(Version* version, int input_level, c->MarkFilesBeingCompacted(true); // Is this compaction creating a file at the bottommost level - c->SetupBottomMostLevel(true); + c->SetupBottomMostLevel( + vstorage, true, ioptions_.compaction_style == kCompactionStyleUniversal); c->is_manual_compaction_ = true; + c->mutable_cf_options_ = 
mutable_cf_options; return c; } -Compaction* LevelCompactionPicker::PickCompaction(Version* version, - LogBuffer* log_buffer) { +#ifndef ROCKSDB_LITE +namespace { +// Test whether two files have overlapping key-ranges. +bool HaveOverlappingKeyRanges( + const Comparator* c, + const SstFileMetaData& a, const SstFileMetaData& b) { + if (c->Compare(a.smallestkey, b.smallestkey) >= 0) { + if (c->Compare(a.smallestkey, b.largestkey) <= 0) { + // b.smallestkey <= a.smallestkey <= b.largestkey + return true; + } + } else if (c->Compare(a.largestkey, b.smallestkey) >= 0) { + // a.smallestkey < b.smallestkey <= a.largestkey + return true; + } + if (c->Compare(a.largestkey, b.largestkey) <= 0) { + if (c->Compare(a.largestkey, b.smallestkey) >= 0) { + // b.smallestkey <= a.largestkey <= b.largestkey + return true; + } + } else if (c->Compare(a.smallestkey, b.largestkey) <= 0) { + // a.smallestkey <= b.largestkey < a.largestkey + return true; + } + return false; +} +} // namespace + +Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels( + std::unordered_set* input_files, + const ColumnFamilyMetaData& cf_meta, + const int output_level) const { + auto& levels = cf_meta.levels; + auto comparator = icmp_->user_comparator(); + + // TODO(yhchiang): If there is any input files of L1 or up and there + // is at least one L0 files. All L0 files older than the L0 file needs + // to be included. Otherwise, it is a false conditoin + + // TODO(yhchiang): add is_adjustable to CompactionOptions + + // the smallest and largest key of the current compaction input + std::string smallestkey; + std::string largestkey; + // a flag for initializing smallest and largest key + bool is_first = false; + const int kNotFound = -1; + + // For each level, it does the following things: + // 1. Find the first and the last compaction input files + // in the current level. + // 2. Include all files between the first and the last + // compaction input files. + // 3. Update the compaction key-range. 
+ // 4. For all remaining levels, include files that have + // overlapping key-range with the compaction key-range. + for (int l = 0; l <= output_level; ++l) { + auto& current_files = levels[l].files; + int first_included = static_cast(current_files.size()); + int last_included = kNotFound; + + // identify the first and the last compaction input files + // in the current level. + for (size_t f = 0; f < current_files.size(); ++f) { + if (input_files->find(TableFileNameToNumber(current_files[f].name)) != + input_files->end()) { + first_included = std::min(first_included, static_cast(f)); + last_included = std::max(last_included, static_cast(f)); + if (is_first == false) { + smallestkey = current_files[f].smallestkey; + largestkey = current_files[f].largestkey; + is_first = true; + } + } + } + if (last_included == kNotFound) { + continue; + } + + if (l != 0) { + // expend the compaction input of the current level if it + // has overlapping key-range with other non-compaction input + // files in the same level. + while (first_included > 0) { + if (comparator->Compare( + current_files[first_included - 1].largestkey, + current_files[first_included].smallestkey) < 0) { + break; + } + first_included--; + } + + while (last_included < static_cast(current_files.size()) - 1) { + if (comparator->Compare( + current_files[last_included + 1].smallestkey, + current_files[last_included].largestkey) > 0) { + break; + } + last_included++; + } + } + + // include all files between the first and the last compaction input files. 
+ for (int f = first_included; f <= last_included; ++f) { + if (current_files[f].being_compacted) { + return Status::Aborted( + "Necessary compaction input file " + current_files[f].name + + " is currently being compacted."); + } + input_files->insert( + TableFileNameToNumber(current_files[f].name)); + } + + // update smallest and largest key + if (l == 0) { + for (int f = first_included; f <= last_included; ++f) { + if (comparator->Compare( + smallestkey, current_files[f].smallestkey) > 0) { + smallestkey = current_files[f].smallestkey; + } + if (comparator->Compare( + largestkey, current_files[f].largestkey) < 0) { + largestkey = current_files[f].largestkey; + } + } + } else { + if (comparator->Compare( + smallestkey, current_files[first_included].smallestkey) > 0) { + smallestkey = current_files[first_included].smallestkey; + } + if (comparator->Compare( + largestkey, current_files[last_included].largestkey) < 0) { + largestkey = current_files[last_included].largestkey; + } + } + + SstFileMetaData aggregated_file_meta; + aggregated_file_meta.smallestkey = smallestkey; + aggregated_file_meta.largestkey = largestkey; + + // For all lower levels, include all overlapping files. 
+ for (int m = l + 1; m <= output_level; ++m) { + for (auto& next_lv_file : levels[m].files) { + if (HaveOverlappingKeyRanges( + comparator, aggregated_file_meta, next_lv_file)) { + if (next_lv_file.being_compacted) { + return Status::Aborted( + "File " + next_lv_file.name + + " that has overlapping key range with one of the compaction " + " input file is currently being compacted."); + } + input_files->insert( + TableFileNameToNumber(next_lv_file.name)); + } + } + } + } + return Status::OK(); +} + +Status CompactionPicker::SanitizeCompactionInputFiles( + std::unordered_set* input_files, + const ColumnFamilyMetaData& cf_meta, + const int output_level) const { + assert(static_cast(cf_meta.levels.size()) - 1 == + cf_meta.levels[cf_meta.levels.size() - 1].level); + if (output_level >= static_cast(cf_meta.levels.size())) { + return Status::InvalidArgument( + "Output level for column family " + cf_meta.name + + " must between [0, " + + ToString(cf_meta.levels[cf_meta.levels.size() - 1].level) + + "]."); + } + + if (output_level > MaxOutputLevel()) { + return Status::InvalidArgument( + "Exceed the maximum output level defined by " + "the current compaction algorithm --- " + + ToString(MaxOutputLevel())); + } + + if (output_level < 0) { + return Status::InvalidArgument( + "Output level cannot be negative."); + } + + if (input_files->size() == 0) { + return Status::InvalidArgument( + "A compaction must contain at least one file."); + } + + Status s = SanitizeCompactionInputFilesForAllLevels( + input_files, cf_meta, output_level); + + if (!s.ok()) { + return s; + } + + // for all input files, check whether the file number matches + // any currently-existing files. 
+ for (auto file_num : *input_files) { + bool found = false; + for (auto level_meta : cf_meta.levels) { + for (auto file_meta : level_meta.files) { + if (file_num == TableFileNameToNumber(file_meta.name)) { + if (file_meta.being_compacted) { + return Status::Aborted( + "Specified compaction input file " + + MakeTableFileName("", file_num) + + " is already being compacted."); + } + found = true; + break; + } + } + if (found) { + break; + } + } + if (!found) { + return Status::InvalidArgument( + "Specified compaction input file " + + MakeTableFileName("", file_num) + + " does not exist in column family " + cf_meta.name + "."); + } + } + + return Status::OK(); +} +#endif // !ROCKSDB_LITE + +bool LevelCompactionPicker::NeedsCompaction(const VersionStorageInfo* vstorage) + const { + for (int i = 0; i <= vstorage->MaxInputLevel(); i++) { + if (vstorage->CompactionScore(i) >= 1) { + return true; + } + } + return false; +} + +Compaction* LevelCompactionPicker::PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer) { Compaction* c = nullptr; int level = -1; - // Compute the compactions needed. It is better to do it here - // and also in LogAndApply(), otherwise the values could be stale. - std::vector size_being_compacted(NumberLevels() - 1); - SizeBeingCompacted(size_being_compacted); - version->ComputeCompactionScore(size_being_compacted); - // We prefer compactions triggered by too much data in a level over // the compactions triggered by seeks. // // Find the compactions by size on all levels. 
for (int i = 0; i < NumberLevels() - 1; i++) { - assert(i == 0 || - version->compaction_score_[i] <= version->compaction_score_[i - 1]); - level = version->compaction_level_[i]; - if ((version->compaction_score_[i] >= 1)) { - c = PickCompactionBySize(version, level, version->compaction_score_[i]); - if (c == nullptr || ExpandWhileOverlapping(c) == false) { + double score = vstorage->CompactionScore(i); + level = vstorage->CompactionScoreLevel(i); + assert(i == 0 || score <= vstorage->CompactionScore(i - 1)); + if ((score >= 1)) { + c = PickCompactionBySize(mutable_cf_options, vstorage, level, score); + if (c == nullptr || + ExpandWhileOverlapping(cf_name, vstorage, c) == false) { delete c; c = nullptr; } else { @@ -453,14 +716,14 @@ Compaction* LevelCompactionPicker::PickCompaction(Version* version, // c->inputs_[0] earlier and replace it with an overlapping set // which will include the picked file. c->inputs_[0].clear(); - c->input_version_->GetOverlappingInputs(0, &smallest, &largest, - &c->inputs_[0].files); + vstorage->GetOverlappingInputs(0, &smallest, &largest, + &c->inputs_[0].files); // If we include more L0 files in the same compaction run it can // cause the 'smallest' and 'largest' key to get extended to a // larger range. 
So, re-invoke GetRange to get the new key range GetRange(c->inputs_[0].files, &smallest, &largest); - if (ParentRangeInCompaction(c->input_version_, &smallest, &largest, level, + if (ParentRangeInCompaction(vstorage, &smallest, &largest, level, &c->parent_index_)) { delete c; return nullptr; @@ -469,23 +732,73 @@ Compaction* LevelCompactionPicker::PickCompaction(Version* version, } // Setup "level+1" files (inputs_[1]) - SetupOtherInputs(c); + SetupOtherInputs(cf_name, mutable_cf_options, vstorage, c); // mark all the files that are being compacted c->MarkFilesBeingCompacted(true); // Is this compaction creating a file at the bottommost level - c->SetupBottomMostLevel(false); + c->SetupBottomMostLevel(vstorage, false, false); // remember this currently undergoing compaction compactions_in_progress_[level].insert(c); + c->mutable_cf_options_ = mutable_cf_options; + + // Creating a compaction influences the compaction score because the score + // takes running compactions into account (by skipping files that are already + // being compacted). 
Since we just changed compaction score, we recalculate it + // here + { // this piece of code recomputes compaction score + CompactionOptionsFIFO dummy_compaction_options_fifo; + vstorage->ComputeCompactionScore(mutable_cf_options, + dummy_compaction_options_fifo); + } + return c; } -Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version, - int level, - double score) { +/* + * Find the optimal path to place a file + * Given a level, finds the path where levels up to it will fit in levels + * up to and including this path + */ +uint32_t LevelCompactionPicker::GetPathId( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, int level) { + uint32_t p = 0; + assert(!ioptions.db_paths.empty()); + + // size remaining in the most recent path + uint64_t current_path_size = ioptions.db_paths[0].target_size; + + uint64_t level_size; + int cur_level = 0; + + level_size = mutable_cf_options.max_bytes_for_level_base; + + // Last path is the fallback + while (p < ioptions.db_paths.size() - 1) { + if (level_size <= current_path_size) { + if (cur_level == level) { + // Does desired level fit in this path? + return p; + } else { + current_path_size -= level_size; + level_size *= mutable_cf_options.max_bytes_for_level_multiplier; + cur_level++; + continue; + } + } + p++; + current_path_size = ioptions.db_paths[p].target_size; + } + return p; +} + +Compaction* LevelCompactionPicker::PickCompactionBySize( + const MutableCFOptions& mutable_cf_options, VersionStorageInfo* vstorage, + int level, double score) { Compaction* c = nullptr; // level 0 files are overlapping. 
So we cannot pick more @@ -498,29 +811,30 @@ Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version, assert(level >= 0); assert(level + 1 < NumberLevels()); - c = new Compaction(version, level, level + 1, MaxFileSizeForLevel(level + 1), - MaxGrandParentOverlapBytes(level), 0, - GetCompressionType(*options_, level + 1)); + c = new Compaction(vstorage->num_levels(), level, level + 1, + mutable_cf_options.MaxFileSizeForLevel(level + 1), + mutable_cf_options.MaxGrandParentOverlapBytes(level), + GetPathId(ioptions_, mutable_cf_options, level + 1), + GetCompressionType(ioptions_, level + 1)); c->score_ = score; // Pick the largest file in this level that is not already // being compacted - std::vector& file_size = c->input_version_->files_by_size_[level]; + const std::vector& file_size = vstorage->FilesBySize(level); + const std::vector& level_files = vstorage->LevelFiles(level); // record the first file that is not yet compacted int nextIndex = -1; - for (unsigned int i = c->input_version_->next_file_to_compact_by_size_[level]; + for (unsigned int i = vstorage->NextCompactionIndex(level); i < file_size.size(); i++) { int index = file_size[i]; - FileMetaData* f = c->input_version_->files_[level][index]; + FileMetaData* f = level_files[index]; - // Check to verify files are arranged in descending compensated size. assert((i == file_size.size() - 1) || - (i >= Version::number_of_files_to_sort_ - 1) || + (i >= VersionStorageInfo::kNumberFilesToSort - 1) || (f->compensated_file_size >= - c->input_version_->files_[level][file_size[i + 1]]-> - compensated_file_size)); + level_files[file_size[i + 1]]->compensated_file_size)); // do not pick a file to compact if it is being compacted // from n-1 level. @@ -536,8 +850,8 @@ Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version, // Do not pick this file if its parents at level+1 are being compacted. 
// Maybe we can avoid redoing this work in SetupOtherInputs int parent_index = -1; - if (ParentRangeInCompaction(c->input_version_, &f->smallest, &f->largest, - level, &parent_index)) { + if (ParentRangeInCompaction(vstorage, &f->smallest, &f->largest, level, + &parent_index)) { continue; } c->inputs_[0].files.push_back(f); @@ -552,68 +866,80 @@ Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version, } // store where to start the iteration in the next call to PickCompaction - version->next_file_to_compact_by_size_[level] = nextIndex; + vstorage->SetNextCompactionIndex(level, nextIndex); return c; } +#ifndef ROCKSDB_LITE +bool UniversalCompactionPicker::NeedsCompaction( + const VersionStorageInfo* vstorage) const { + const int kLevel0 = 0; + return vstorage->CompactionScore(kLevel0) >= 1; +} + // Universal style of compaction. Pick files that are contiguous in // time-range to compact. // -Compaction* UniversalCompactionPicker::PickCompaction(Version* version, - LogBuffer* log_buffer) { - int level = 0; - double score = version->compaction_score_[0]; - - if ((version->files_[level].size() < - (unsigned int)options_->level0_file_num_compaction_trigger)) { - LogToBuffer(log_buffer, "[%s] Universal: nothing to do\n", - version->cfd_->GetName().c_str()); +Compaction* UniversalCompactionPicker::PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer) { + const int kLevel0 = 0; + double score = vstorage->CompactionScore(kLevel0); + const std::vector& level_files = vstorage->LevelFiles(kLevel0); + + if ((level_files.size() < + (unsigned int)mutable_cf_options.level0_file_num_compaction_trigger)) { + LogToBuffer(log_buffer, "[%s] Universal: nothing to do\n", cf_name.c_str()); return nullptr; } - Version::FileSummaryStorage tmp; - LogToBuffer(log_buffer, "[%s] Universal: candidate files(%zu): %s\n", - version->cfd_->GetName().c_str(), version->files_[level].size(), - 
version->LevelFileSummary(&tmp, 0)); + VersionStorageInfo::FileSummaryStorage tmp; + LogToBuffer(log_buffer, 3072, "[%s] Universal: candidate files(%zu): %s\n", + cf_name.c_str(), level_files.size(), + vstorage->LevelFileSummary(&tmp, kLevel0)); // Check for size amplification first. Compaction* c; - if ((c = PickCompactionUniversalSizeAmp(version, score, log_buffer)) != - nullptr) { + if ((c = PickCompactionUniversalSizeAmp(cf_name, mutable_cf_options, vstorage, + score, log_buffer)) != nullptr) { LogToBuffer(log_buffer, "[%s] Universal: compacting for size amp\n", - version->cfd_->GetName().c_str()); + cf_name.c_str()); } else { // Size amplification is within limits. Try reducing read // amplification while maintaining file size ratios. - unsigned int ratio = options_->compaction_options_universal.size_ratio; + unsigned int ratio = ioptions_.compaction_options_universal.size_ratio; - if ((c = PickCompactionUniversalReadAmp(version, score, ratio, UINT_MAX, + if ((c = PickCompactionUniversalReadAmp(cf_name, mutable_cf_options, + vstorage, score, ratio, UINT_MAX, log_buffer)) != nullptr) { LogToBuffer(log_buffer, "[%s] Universal: compacting for size ratio\n", - version->cfd_->GetName().c_str()); + cf_name.c_str()); } else { // Size amplification and file size ratios are within configured limits. // If max read amplification is exceeding configured limits, then force // compaction without looking at filesize ratios and try to reduce // the number of files to fewer than level0_file_num_compaction_trigger. 
- unsigned int num_files = version->files_[level].size() - - options_->level0_file_num_compaction_trigger; + unsigned int num_files = + static_cast(level_files.size()) - + mutable_cf_options.level0_file_num_compaction_trigger; if ((c = PickCompactionUniversalReadAmp( - version, score, UINT_MAX, num_files, log_buffer)) != nullptr) { - LogToBuffer(log_buffer, "[%s] Universal: compacting for file num\n", - version->cfd_->GetName().c_str()); + cf_name, mutable_cf_options, vstorage, score, UINT_MAX, + num_files, log_buffer)) != nullptr) { + LogToBuffer(log_buffer, + "[%s] Universal: compacting for file num -- %u\n", + cf_name.c_str(), num_files); } } } if (c == nullptr) { return nullptr; } - assert(c->inputs_[0].size() > 1); + assert(c->inputs_[kLevel0].size() > 1); // validate that all the chosen files are non overlapping in time FileMetaData* newerfile __attribute__((unused)) = nullptr; - for (unsigned int i = 0; i < c->inputs_[0].size(); i++) { - FileMetaData* f = c->inputs_[0][i]; + for (unsigned int i = 0; i < c->inputs_[kLevel0].size(); i++) { + FileMetaData* f = c->inputs_[kLevel0][i]; assert (f->smallest_seqno <= f->largest_seqno); assert(newerfile == nullptr || newerfile->smallest_seqno > f->largest_seqno); @@ -621,29 +947,29 @@ Compaction* UniversalCompactionPicker::PickCompaction(Version* version, } // Is the earliest file part of this compaction? 
- FileMetaData* last_file = c->input_version_->files_[level].back(); - c->bottommost_level_ = c->inputs_[0].files.back() == last_file; + FileMetaData* last_file = level_files.back(); + c->bottommost_level_ = c->inputs_[kLevel0].files.back() == last_file; // update statistics - MeasureTime(options_->statistics.get(), - NUM_FILES_IN_SINGLE_COMPACTION, c->inputs_[0].size()); + MeasureTime(ioptions_.statistics, + NUM_FILES_IN_SINGLE_COMPACTION, c->inputs_[kLevel0].size()); // mark all the files that are being compacted c->MarkFilesBeingCompacted(true); // remember this currently undergoing compaction - compactions_in_progress_[level].insert(c); + compactions_in_progress_[kLevel0].insert(c); // Record whether this compaction includes all sst files. // For now, it is only relevant in universal compaction mode. - c->is_full_compaction_ = - (c->inputs_[0].size() == c->input_version_->files_[0].size()); + c->is_full_compaction_ = (c->inputs_[kLevel0].size() == level_files.size()); + c->mutable_cf_options_ = mutable_cf_options; return c; } -uint32_t UniversalCompactionPicker::GetPathId(const Options& options, - uint64_t file_size) { +uint32_t UniversalCompactionPicker::GetPathId( + const ImmutableCFOptions& ioptions, uint64_t file_size) { // Two conditions need to be satisfied: // (1) the target path needs to be able to hold the file's size // (2) Total size left in this and previous paths need to be not @@ -659,11 +985,12 @@ uint32_t UniversalCompactionPicker::GetPathId(const Options& options, // considered in this algorithm. So the target size can be violated in // that case. We need to improve it. 
uint64_t accumulated_size = 0; - uint64_t future_size = - file_size * (100 - options.compaction_options_universal.size_ratio) / 100; + uint64_t future_size = file_size * + (100 - ioptions.compaction_options_universal.size_ratio) / 100; uint32_t p = 0; - for (; p < options.db_paths.size() - 1; p++) { - uint64_t target_size = options.db_paths[p].target_size; + assert(!ioptions.db_paths.empty()); + for (; p < ioptions.db_paths.size() - 1; p++) { + uint64_t target_size = ioptions.db_paths[p].target_size; if (target_size > file_size && accumulated_size + (target_size - file_size) > future_size) { return p; @@ -678,17 +1005,18 @@ uint32_t UniversalCompactionPicker::GetPathId(const Options& options, // the next file in time order. // Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( - Version* version, double score, unsigned int ratio, + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, double score, unsigned int ratio, unsigned int max_number_of_files_to_compact, LogBuffer* log_buffer) { - int level = 0; + const int kLevel0 = 0; unsigned int min_merge_width = - options_->compaction_options_universal.min_merge_width; + ioptions_.compaction_options_universal.min_merge_width; unsigned int max_merge_width = - options_->compaction_options_universal.max_merge_width; + ioptions_.compaction_options_universal.max_merge_width; // The files are sorted from newest first to oldest last. 
- const auto& files = version->files_[level]; + const auto& files = vstorage->LevelFiles(kLevel0); FileMetaData* f = nullptr; bool done = false; @@ -715,7 +1043,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( } LogToBuffer(log_buffer, "[%s] Universal: file %" PRIu64 "[%d] being compacted, skipping", - version->cfd_->GetName().c_str(), f->fd.GetNumber(), loop); + cf_name.c_str(), f->fd.GetNumber(), loop); f = nullptr; } @@ -727,14 +1055,14 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( FormatFileNumber(f->fd.GetNumber(), f->fd.GetPathId(), file_num_buf, sizeof(file_num_buf)); LogToBuffer(log_buffer, "[%s] Universal: Possible candidate file %s[%d].", - version->cfd_->GetName().c_str(), file_num_buf, loop); + cf_name.c_str(), file_num_buf, loop); } // Check if the suceeding files need compaction. for (unsigned int i = loop + 1; candidate_count < max_files_to_compact && i < files.size(); i++) { - FileMetaData* f = files[i]; - if (f->being_compacted) { + FileMetaData* suceeding_file = files[i]; + if (suceeding_file->being_compacted) { break; } // Pick files if the total/last candidate file size (increased by the @@ -743,24 +1071,25 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( // default kCompactionStopStyleTotalSize; with // kCompactionStopStyleSimilarSize, it's simply the size of the last // picked file. - uint64_t sz = (candidate_size * (100L + ratio)) /100; - if (sz < f->fd.GetFileSize()) { + double sz = candidate_size * (100.0 + ratio) / 100.0; + if (sz < static_cast(suceeding_file->fd.GetFileSize())) { break; } - if (options_->compaction_options_universal.stop_style == kCompactionStopStyleSimilarSize) { + if (ioptions_.compaction_options_universal.stop_style == + kCompactionStopStyleSimilarSize) { // Similar-size stopping rule: also check the last picked file isn't // far larger than the next candidate file. 
- sz = (f->fd.GetFileSize() * (100L + ratio)) / 100; - if (sz < candidate_size) { + sz = (suceeding_file->fd.GetFileSize() * (100.0 + ratio)) / 100.0; + if (sz < static_cast(candidate_size)) { // If the small file we've encountered begins a run of similar-size // files, we'll pick them up on a future iteration of the outer // loop. If it's some lonely straggler, it'll eventually get picked // by the last-resort read amp strategy which disregards size ratios. break; } - candidate_size = f->compensated_file_size; - } else { // default kCompactionStopStyleTotalSize - candidate_size += f->compensated_file_size; + candidate_size = suceeding_file->compensated_file_size; + } else { // default kCompactionStopStyleTotalSize + candidate_size += suceeding_file->compensated_file_size; } candidate_count++; } @@ -773,13 +1102,14 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( } else { for (unsigned int i = loop; i < loop + candidate_count && i < files.size(); i++) { - FileMetaData* f = files[i]; + FileMetaData* skipping_file = files[i]; LogToBuffer(log_buffer, "[%s] Universal: Skipping file %" PRIu64 "[%d] with size %" PRIu64 " (compensated size %" PRIu64 ") %d\n", - version->cfd_->GetName().c_str(), f->fd.GetNumber(), i, - f->fd.GetFileSize(), f->compensated_file_size, - f->being_compacted); + cf_name.c_str(), f->fd.GetNumber(), i, + skipping_file->fd.GetFileSize(), + skipping_file->compensated_file_size, + skipping_file->being_compacted); } } } @@ -791,12 +1121,11 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( // size ratio of compression. 
bool enable_compression = true; int ratio_to_compress = - options_->compaction_options_universal.compression_size_percent; + ioptions_.compaction_options_universal.compression_size_percent; if (ratio_to_compress >= 0) { - uint64_t total_size = version->NumLevelBytes(level); + uint64_t total_size = vstorage->NumLevelBytes(kLevel0); uint64_t older_file_size = 0; - for (unsigned int i = files.size() - 1; - i >= first_index_after; i--) { + for (size_t i = files.size() - 1; i >= first_index_after; i--) { older_file_size += files[i]->fd.GetFileSize(); if (older_file_size * 100L >= total_size * (long) ratio_to_compress) { enable_compression = false; @@ -809,24 +1138,26 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( for (unsigned int i = 0; i < first_index_after; i++) { estimated_total_size += files[i]->fd.GetFileSize(); } - uint32_t path_id = GetPathId(*options_, estimated_total_size); + uint32_t path_id = GetPathId(ioptions_, estimated_total_size); Compaction* c = new Compaction( - version, level, level, MaxFileSizeForLevel(level), LLONG_MAX, path_id, - GetCompressionType(*options_, level, enable_compression)); + vstorage->num_levels(), kLevel0, kLevel0, + mutable_cf_options.MaxFileSizeForLevel(kLevel0), LLONG_MAX, path_id, + GetCompressionType(ioptions_, kLevel0, enable_compression)); c->score_ = score; for (unsigned int i = start_index; i < first_index_after; i++) { - FileMetaData* f = c->input_version_->files_[level][i]; - c->inputs_[0].files.push_back(f); + FileMetaData* picking_file = files[i]; + c->inputs_[0].files.push_back(picking_file); char file_num_buf[kFormatFileNumberBufSize]; - FormatFileNumber(f->fd.GetNumber(), f->fd.GetPathId(), file_num_buf, - sizeof(file_num_buf)); + FormatFileNumber(picking_file->fd.GetNumber(), picking_file->fd.GetPathId(), + file_num_buf, sizeof(file_num_buf)); LogToBuffer(log_buffer, "[%s] Universal: Picking file %s[%d] " "with size %" PRIu64 " (compensated size %" PRIu64 ")\n", - 
version->cfd_->GetName().c_str(), file_num_buf, i, - f->fd.GetFileSize(), f->compensated_file_size); + cf_name.c_str(), file_num_buf, i, + picking_file->fd.GetFileSize(), + picking_file->compensated_file_size); } return c; } @@ -838,15 +1169,16 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( // min_merge_width and max_merge_width). // Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( - Version* version, double score, LogBuffer* log_buffer) { - int level = 0; + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, double score, LogBuffer* log_buffer) { + const int kLevel = 0; // percentage flexibilty while reducing size amplification - uint64_t ratio = options_->compaction_options_universal. + uint64_t ratio = ioptions_.compaction_options_universal. max_size_amplification_percent; // The files are sorted from newest first to oldest last. - const auto& files = version->files_[level]; + const auto& files = vstorage->LevelFiles(kLevel); unsigned int candidate_count = 0; uint64_t candidate_size = 0; @@ -864,10 +1196,11 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( FormatFileNumber(f->fd.GetNumber(), f->fd.GetPathId(), file_num_buf, sizeof(file_num_buf)); LogToBuffer(log_buffer, "[%s] Universal: skipping file %s[%d] compacted %s", - version->cfd_->GetName().c_str(), file_num_buf, loop, + cf_name.c_str(), file_num_buf, loop, " cannot be a candidate to reduce size amp.\n"); f = nullptr; } + if (f == nullptr) { return nullptr; // no candidate files } @@ -876,19 +1209,18 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( FormatFileNumber(f->fd.GetNumber(), f->fd.GetPathId(), file_num_buf, sizeof(file_num_buf)); LogToBuffer(log_buffer, "[%s] Universal: First candidate file %s[%d] %s", - version->cfd_->GetName().c_str(), file_num_buf, start_index, + cf_name.c_str(), file_num_buf, start_index, " to reduce size amp.\n"); // keep 
adding up all the remaining files for (unsigned int loop = start_index; loop < files.size() - 1; loop++) { f = files[loop]; if (f->being_compacted) { - char file_num_buf[kFormatFileNumberBufSize]; FormatFileNumber(f->fd.GetNumber(), f->fd.GetPathId(), file_num_buf, sizeof(file_num_buf)); LogToBuffer( log_buffer, "[%s] Universal: Possible candidate file %s[%d] %s.", - version->cfd_->GetName().c_str(), file_num_buf, loop, + cf_name.c_str(), file_num_buf, loop, " is already being compacted. No size amp reduction possible.\n"); return nullptr; } @@ -908,59 +1240,69 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( log_buffer, "[%s] Universal: size amp not needed. newer-files-total-size %" PRIu64 "earliest-file-size %" PRIu64, - version->cfd_->GetName().c_str(), candidate_size, earliest_file_size); + cf_name.c_str(), candidate_size, earliest_file_size); return nullptr; } else { LogToBuffer( log_buffer, "[%s] Universal: size amp needed. newer-files-total-size %" PRIu64 "earliest-file-size %" PRIu64, - version->cfd_->GetName().c_str(), candidate_size, earliest_file_size); + cf_name.c_str(), candidate_size, earliest_file_size); } - assert(start_index >= 0 && start_index < files.size() - 1); + assert(start_index < files.size() - 1); // Estimate total file size uint64_t estimated_total_size = 0; for (unsigned int loop = start_index; loop < files.size(); loop++) { estimated_total_size += files[loop]->fd.GetFileSize(); } - uint32_t path_id = GetPathId(*options_, estimated_total_size); + uint32_t path_id = GetPathId(ioptions_, estimated_total_size); // create a compaction request // We always compact all the files, so always compress. 
Compaction* c = - new Compaction(version, level, level, MaxFileSizeForLevel(level), - LLONG_MAX, path_id, GetCompressionType(*options_, level)); + new Compaction(vstorage->num_levels(), kLevel, kLevel, + mutable_cf_options.MaxFileSizeForLevel(kLevel), LLONG_MAX, + path_id, GetCompressionType(ioptions_, kLevel)); c->score_ = score; for (unsigned int loop = start_index; loop < files.size(); loop++) { - f = c->input_version_->files_[level][loop]; + f = files[loop]; c->inputs_[0].files.push_back(f); LogToBuffer(log_buffer, - "[%s] Universal: size amp picking file %" PRIu64 "[%d] " - "with size %" PRIu64 " (compensated size %" PRIu64 ")", - version->cfd_->GetName().c_str(), - f->fd.GetNumber(), loop, - f->fd.GetFileSize(), f->compensated_file_size); + "[%s] Universal: size amp picking file %" PRIu64 + "[%d] " + "with size %" PRIu64 " (compensated size %" PRIu64 ")", + cf_name.c_str(), f->fd.GetNumber(), loop, f->fd.GetFileSize(), + f->compensated_file_size); } return c; } -Compaction* FIFOCompactionPicker::PickCompaction(Version* version, - LogBuffer* log_buffer) { - assert(version->NumberLevels() == 1); +bool FIFOCompactionPicker::NeedsCompaction(const VersionStorageInfo* vstorage) + const { + const int kLevel0 = 0; + return vstorage->CompactionScore(kLevel0) >= 1; +} + +Compaction* FIFOCompactionPicker::PickCompaction( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer) { + assert(vstorage->num_levels() == 1); + const int kLevel0 = 0; + const std::vector& level_files = vstorage->LevelFiles(kLevel0); uint64_t total_size = 0; - for (const auto& file : version->files_[0]) { - total_size += file->compensated_file_size; + for (const auto& file : level_files) { + total_size += file->fd.file_size; } - if (total_size <= options_->compaction_options_fifo.max_table_files_size || - version->files_[0].size() == 0) { + if (total_size <= ioptions_.compaction_options_fifo.max_table_files_size || + 
level_files.size() == 0) { // total size not exceeded LogToBuffer(log_buffer, "[%s] FIFO compaction: nothing to do. Total size %" PRIu64 ", max size %" PRIu64 "\n", - version->cfd_->GetName().c_str(), total_size, - options_->compaction_options_fifo.max_table_files_size); + cf_name.c_str(), total_size, + ioptions_.compaction_options_fifo.max_table_files_size); return nullptr; } @@ -968,15 +1310,14 @@ Compaction* FIFOCompactionPicker::PickCompaction(Version* version, LogToBuffer(log_buffer, "[%s] FIFO compaction: Already executing compaction. No need " "to run parallel compactions since compactions are very fast", - version->cfd_->GetName().c_str()); + cf_name.c_str()); return nullptr; } - Compaction* c = new Compaction(version, 0, 0, 0, 0, 0, kNoCompression, false, + Compaction* c = new Compaction(1, 0, 0, 0, 0, 0, kNoCompression, false, true /* is deletion compaction */); // delete old files (FIFO) - for (auto ritr = version->files_[0].rbegin(); - ritr != version->files_[0].rend(); ++ritr) { + for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) { auto f = *ritr; total_size -= f->compensated_file_size; c->inputs_[0].files.push_back(f); @@ -984,33 +1325,37 @@ Compaction* FIFOCompactionPicker::PickCompaction(Version* version, AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize)); LogToBuffer(log_buffer, "[%s] FIFO compaction: picking file %" PRIu64 " with size %s for deletion", - version->cfd_->GetName().c_str(), f->fd.GetNumber(), tmp_fsize); - if (total_size <= options_->compaction_options_fifo.max_table_files_size) { + cf_name.c_str(), f->fd.GetNumber(), tmp_fsize); + if (total_size <= ioptions_.compaction_options_fifo.max_table_files_size) { break; } } c->MarkFilesBeingCompacted(true); compactions_in_progress_[0].insert(c); - + c->mutable_cf_options_ = mutable_cf_options; return c; } Compaction* FIFOCompactionPicker::CompactRange( - Version* version, int input_level, int output_level, + const std::string& cf_name, const 
MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, int input_level, int output_level, uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, InternalKey** compaction_end) { assert(input_level == 0); assert(output_level == 0); *compaction_end = nullptr; - LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_->info_log.get()); - Compaction* c = PickCompaction(version, &log_buffer); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.info_log); + Compaction* c = + PickCompaction(cf_name, mutable_cf_options, vstorage, &log_buffer); if (c != nullptr) { - assert(output_path_id < static_cast(options_->db_paths.size())); + assert(output_path_id < static_cast(ioptions_.db_paths.size())); c->output_path_id_ = output_path_id; } log_buffer.FlushBufferToLog(); return c; } +#endif // !ROCKSDB_LITE + } // namespace rocksdb diff --git a/db/compaction_picker.h b/db/compaction_picker.h index c1e27c471..7cc58d66b 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -8,32 +8,43 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once +#include +#include +#include +#include + #include "db/version_set.h" #include "db/compaction.h" #include "rocksdb/status.h" #include "rocksdb/options.h" #include "rocksdb/env.h" +#include "util/mutable_cf_options.h" #include #include #include +#include namespace rocksdb { class LogBuffer; class Compaction; -class Version; +class VersionStorageInfo; +struct CompactionInputFiles; class CompactionPicker { public: - CompactionPicker(const Options* options, const InternalKeyComparator* icmp); + CompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp); virtual ~CompactionPicker(); // Pick level and inputs for a new compaction. // Returns nullptr if there is no compaction to be done. // Otherwise returns a pointer to a heap-allocated object that // describes the compaction. Caller should delete the result. 
- virtual Compaction* PickCompaction(Version* version, + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer) = 0; // Return a compaction object for compacting the range [begin,end] in @@ -47,36 +58,59 @@ class CompactionPicker { // compaction_end will be set to nullptr. // Client is responsible for compaction_end storage -- when called, // *compaction_end should point to valid InternalKey! - virtual Compaction* CompactRange(Version* version, int input_level, - int output_level, uint32_t output_path_id, - const InternalKey* begin, - const InternalKey* end, - InternalKey** compaction_end); + virtual Compaction* CompactRange( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, int input_level, int output_level, + uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end); // Given the current number of levels, returns the lowest allowed level // for compaction input. virtual int MaxInputLevel(int current_num_levels) const = 0; - // Free up the files that participated in a compaction - void ReleaseCompactionFiles(Compaction* c, Status status); - - // Return the total amount of data that is undergoing - // compactions per level - void SizeBeingCompacted(std::vector& sizes); + // The maximum allowed output level. Default value is NumberLevels() - 1. + virtual int MaxOutputLevel() const { + return NumberLevels() - 1; + } - // Returns maximum total overlap bytes with grandparent - // level (i.e., level+2) before we stop building a single - // file in level->level+1 compaction. - uint64_t MaxGrandParentOverlapBytes(int level); + virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const = 0; + + // Sanitize the input set of compaction input files. 
+ // When the input parameters do not describe a valid compaction, the + // function will try to fix the input_files by adding necessary + // files. If it's not possible to conver an invalid input_files + // into a valid one by adding more files, the function will return a + // non-ok status with specific reason. +#ifndef ROCKSDB_LITE + Status SanitizeCompactionInputFiles( + std::unordered_set* input_files, + const ColumnFamilyMetaData& cf_meta, + const int output_level) const; +#endif // ROCKSDB_LITE - // Returns maximum total bytes of data on a given level. - double MaxBytesForLevel(int level); + // Free up the files that participated in a compaction + void ReleaseCompactionFiles(Compaction* c, Status status); - // Get the max file size in a given level. - uint64_t MaxFileSizeForLevel(int level) const; + // Returns true if any one of the specified files are being compacted + bool FilesInCompaction(const std::vector& files); + + // Takes a list of CompactionInputFiles and returns a Compaction object. + Compaction* FormCompaction( + const CompactionOptions& compact_options, + const autovector& input_files, + int output_level, VersionStorageInfo* vstorage, + const MutableCFOptions& mutable_cf_options) const; + + // Converts a set of compaction input file numbers into + // a list of CompactionInputFiles. + Status GetCompactionInputsFromFileNumbers( + autovector* input_files, + std::unordered_set* input_set, + const VersionStorageInfo* vstorage, + const CompactionOptions& compact_options) const; protected: - int NumberLevels() const { return num_levels_; } + int NumberLevels() const { return ioptions_.num_levels; } // Stores the minimal range that covers all entries in inputs in // *smallest, *largest. @@ -101,110 +135,179 @@ class CompactionPicker { // populated. // // Will return false if it is impossible to apply this compaction. 
- bool ExpandWhileOverlapping(Compaction* c); - - uint64_t ExpandedCompactionByteSizeLimit(int level); - - // Returns true if any one of the specified files are being compacted - bool FilesInCompaction(std::vector& files); + bool ExpandWhileOverlapping(const std::string& cf_name, + VersionStorageInfo* vstorage, Compaction* c); // Returns true if any one of the parent files are being compacted - bool ParentRangeInCompaction(Version* version, const InternalKey* smallest, + bool ParentRangeInCompaction(VersionStorageInfo* vstorage, + const InternalKey* smallest, const InternalKey* largest, int level, int* index); - void SetupOtherInputs(Compaction* c); + void SetupOtherInputs(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, Compaction* c); + + const ImmutableCFOptions& ioptions_; + + // A helper function to SanitizeCompactionInputFiles() that + // sanitizes "input_files" by adding necessary files. +#ifndef ROCKSDB_LITE + virtual Status SanitizeCompactionInputFilesForAllLevels( + std::unordered_set* input_files, + const ColumnFamilyMetaData& cf_meta, + const int output_level) const; +#endif // ROCKSDB_LITE // record all the ongoing compactions for all levels std::vector> compactions_in_progress_; - // Per-level target file size. - std::unique_ptr max_file_size_; + const InternalKeyComparator* const icmp_; +}; + +class LevelCompactionPicker : public CompactionPicker { + public: + LevelCompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, + LogBuffer* log_buffer) override; - // Per-level max bytes - std::unique_ptr level_max_bytes_; + // Returns current_num_levels - 2, meaning the last level cannot be + // compaction input level. 
+ virtual int MaxInputLevel(int current_num_levels) const override { + return current_num_levels - 2; + } - const Options* const options_; + virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const + override; - private: - int num_levels_; + // Pick a path ID to place a newly generated file, with its level + static uint32_t GetPathId(const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, + int level); - const InternalKeyComparator* const icmp_; + private: + // For the specfied level, pick a compaction. + // Returns nullptr if there is no compaction to be done. + // If level is 0 and there is already a compaction on that level, this + // function will return nullptr. + Compaction* PickCompactionBySize(const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, int level, + double score); }; +#ifndef ROCKSDB_LITE class UniversalCompactionPicker : public CompactionPicker { public: - UniversalCompactionPicker(const Options* options, + UniversalCompactionPicker(const ImmutableCFOptions& ioptions, const InternalKeyComparator* icmp) - : CompactionPicker(options, icmp) {} - virtual Compaction* PickCompaction(Version* version, + : CompactionPicker(ioptions, icmp) {} + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, LogBuffer* log_buffer) override; - // The maxinum allowed input level. Always return 0. + // The maxinum allowed input level. Always returns 0. virtual int MaxInputLevel(int current_num_levels) const override { return 0; } + // The maximum allowed output level. Always returns 0. 
+ virtual int MaxOutputLevel() const override { + return 0; + } + + virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const + override; + private: // Pick Universal compaction to limit read amplification - Compaction* PickCompactionUniversalReadAmp(Version* version, double score, - unsigned int ratio, - unsigned int num_files, - LogBuffer* log_buffer); + Compaction* PickCompactionUniversalReadAmp( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, double score, unsigned int ratio, + unsigned int num_files, LogBuffer* log_buffer); // Pick Universal compaction to limit space amplification. - Compaction* PickCompactionUniversalSizeAmp(Version* version, double score, - LogBuffer* log_buffer); + Compaction* PickCompactionUniversalSizeAmp( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, double score, LogBuffer* log_buffer); // Pick a path ID to place a newly generated file, with its estimated file // size. - static uint32_t GetPathId(const Options& options, uint64_t file_size); + static uint32_t GetPathId(const ImmutableCFOptions& ioptions, + uint64_t file_size); }; -class LevelCompactionPicker : public CompactionPicker { +class FIFOCompactionPicker : public CompactionPicker { public: - LevelCompactionPicker(const Options* options, - const InternalKeyComparator* icmp) - : CompactionPicker(options, icmp) {} - virtual Compaction* PickCompaction(Version* version, + FIFOCompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp) + : CompactionPicker(ioptions, icmp) {} + + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* version, LogBuffer* log_buffer) override; - // Returns current_num_levels - 2, meaning the last level cannot be - // compaction input level. 
+ virtual Compaction* CompactRange( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, int input_level, int output_level, + uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end) override; + + // The maxinum allowed input level. Always returns 0. virtual int MaxInputLevel(int current_num_levels) const override { - return current_num_levels - 2; + return 0; } - private: - // For the specfied level, pick a compaction. - // Returns nullptr if there is no compaction to be done. - // If level is 0 and there is already a compaction on that level, this - // function will return nullptr. - Compaction* PickCompactionBySize(Version* version, int level, double score); + // The maximum allowed output level. Always returns 0. + virtual int MaxOutputLevel() const override { + return 0; + } + + virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const + override; }; -class FIFOCompactionPicker : public CompactionPicker { +class NullCompactionPicker : public CompactionPicker { public: - FIFOCompactionPicker(const Options* options, - const InternalKeyComparator* icmp) - : CompactionPicker(options, icmp) {} + NullCompactionPicker(const ImmutableCFOptions& ioptions, + const InternalKeyComparator* icmp) : + CompactionPicker(ioptions, icmp) {} + virtual ~NullCompactionPicker() {} + + // Always return "nullptr" + Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, + LogBuffer* log_buffer) override { + return nullptr; + } - virtual Compaction* PickCompaction(Version* version, - LogBuffer* log_buffer) override; + // Always return "nullptr" + Compaction* CompactRange( + const std::string& cf_name, const MutableCFOptions& mutable_cf_options, + VersionStorageInfo* vstorage, int input_level, int output_level, + uint32_t output_path_id, const InternalKey* begin, const InternalKey* end, + 
InternalKey** compaction_end) override { + return nullptr; + } - virtual Compaction* CompactRange(Version* version, int input_level, - int output_level, uint32_t output_path_id, - const InternalKey* begin, - const InternalKey* end, - InternalKey** compaction_end) override; + // Given the current number of levels, returns the highest allowed level + // for compaction input. + virtual int MaxInputLevel(int current_num_levels) const { + return current_num_levels - 2; + } - // The maxinum allowed input level. Always return 0. - virtual int MaxInputLevel(int current_num_levels) const override { - return 0; + // Always returns false. + virtual bool NeedsCompaction(const VersionStorageInfo* vstorage) const + override { + return false; } }; - -// Utility function -extern uint64_t TotalCompensatedFileSize(const std::vector& files); +#endif // !ROCKSDB_LITE } // namespace rocksdb diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc new file mode 100644 index 000000000..ca7ba014f --- /dev/null +++ b/db/compaction_picker_test.cc @@ -0,0 +1,259 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "db/compaction_picker.h" +#include +#include +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class CountingLogger : public Logger { + public: + using Logger::Logv; + virtual void Logv(const char* format, va_list ap) override { log_count++; } + size_t log_count; +}; + +class CompactionPickerTest { + public: + const Comparator* ucmp_; + InternalKeyComparator icmp_; + Options options_; + ImmutableCFOptions ioptions_; + MutableCFOptions mutable_cf_options_; + LevelCompactionPicker level_compaction_picker; + std::string cf_name_; + CountingLogger logger_; + LogBuffer log_buffer_; + uint32_t file_num_; + CompactionOptionsFIFO fifo_options_; + std::unique_ptr vstorage_; + std::vector> files_; + + CompactionPickerTest() + : ucmp_(BytewiseComparator()), + icmp_(ucmp_), + ioptions_(options_), + mutable_cf_options_(options_, ioptions_), + level_compaction_picker(ioptions_, &icmp_), + cf_name_("dummy"), + log_buffer_(InfoLogLevel::INFO_LEVEL, &logger_), + file_num_(1), + vstorage_(nullptr) { + fifo_options_.max_table_files_size = 1; + mutable_cf_options_.RefreshDerivedOptions(ioptions_); + ioptions_.db_paths.emplace_back("dummy", + std::numeric_limits::max()); + } + + ~CompactionPickerTest() { + } + + void NewVersionStorage(int num_levels, CompactionStyle style) { + DeleteVersionStorage(); + options_.num_levels = num_levels; + vstorage_.reset(new VersionStorageInfo( + &icmp_, ucmp_, options_.num_levels, style, nullptr)); + } + + void DeleteVersionStorage() { + vstorage_.reset(); + files_.clear(); + } + + void Add(int level, uint32_t file_number, const char* smallest, + const char* largest, uint64_t file_size = 0, uint32_t path_id = 0, + SequenceNumber smallest_seq = 100, + SequenceNumber largest_seq = 100) { + assert(level < vstorage_->num_levels()); + FileMetaData* f = new FileMetaData; + f->fd = FileDescriptor(file_number, path_id, file_size); + f->smallest = InternalKey(smallest, smallest_seq, 
kTypeValue); + f->largest = InternalKey(largest, largest_seq, kTypeValue); + f->compensated_file_size = file_size; + f->refs = 0; + vstorage_->AddFile(level, f); + files_.emplace_back(f); + } + + void UpdateVersionStorageInfo() { + vstorage_->UpdateFilesBySize(); + vstorage_->UpdateNumNonEmptyLevels(); + vstorage_->GenerateFileIndexer(); + vstorage_->GenerateLevelFilesBrief(); + vstorage_->ComputeCompactionScore(mutable_cf_options_, fifo_options_); + vstorage_->SetFinalized(); + } +}; + +TEST(CompactionPickerTest, Empty) { + NewVersionStorage(6, kCompactionStyleLevel); + UpdateVersionStorageInfo(); + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() == nullptr); +} + +TEST(CompactionPickerTest, Single) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + Add(0, 1U, "p", "q"); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() == nullptr); +} + +TEST(CompactionPickerTest, Level0Trigger) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.level0_file_num_compaction_trigger = 2; + Add(0, 1U, "150", "200"); + Add(0, 2U, "200", "250"); + + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(2U, compaction->num_input_files(0)); + ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber()); +} + +TEST(CompactionPickerTest, Level1Trigger) { + NewVersionStorage(6, kCompactionStyleLevel); + Add(1, 66U, "150", "200", 1000000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr 
compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST(CompactionPickerTest, Level1Trigger2) { + NewVersionStorage(6, kCompactionStyleLevel); + Add(1, 66U, "150", "200", 1000000001U); + Add(1, 88U, "201", "300", 1000000000U); + Add(2, 6U, "150", "179", 1000000000U); + Add(2, 7U, "180", "220", 1000000000U); + Add(2, 8U, "221", "300", 1000000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(2U, compaction->num_input_files(1)); + ASSERT_EQ(66U, compaction->input(0, 0)->fd.GetNumber()); + ASSERT_EQ(6U, compaction->input(1, 0)->fd.GetNumber()); + ASSERT_EQ(7U, compaction->input(1, 1)->fd.GetNumber()); +} + +TEST(CompactionPickerTest, LevelMaxScore) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.target_file_size_base = 10000000; + mutable_cf_options_.target_file_size_multiplier = 10; + Add(0, 1U, "150", "200", 1000000000U); + // Level 1 score 1.2 + Add(1, 66U, "150", "200", 6000000U); + Add(1, 88U, "201", "300", 6000000U); + // Level 2 score 1.8. File 7 is the largest. 
Should be picked + Add(2, 6U, "150", "179", 60000000U); + Add(2, 7U, "180", "220", 60000001U); + Add(2, 8U, "221", "300", 60000000U); + // Level 3 score slightly larger than 1 + Add(3, 26U, "150", "170", 260000000U); + Add(3, 27U, "171", "179", 260000000U); + Add(3, 28U, "191", "220", 260000000U); + Add(3, 29U, "221", "300", 260000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST(CompactionPickerTest, NeedsCompactionLevel) { + const int kLevels = 6; + const int kFileCount = 20; + for (int level = 0; level < kLevels - 1; ++level) { + uint64_t file_size = + mutable_cf_options_.MaxBytesForLevel(level) * 2 / kFileCount; + for (int file_count = 1; file_count <= kFileCount; ++file_count) { + // start a brand new version in each test. + NewVersionStorage(kLevels, kCompactionStyleLevel); + for (int i = 0; i < file_count; ++i) { + Add(level, i, ToString((i + 100) * 1000).c_str(), + ToString((i + 100) * 1000 + 999).c_str(), + file_size, 0, i * 100, i * 100 + 99); + } + UpdateVersionStorageInfo(); + ASSERT_EQ(vstorage_->CompactionScoreLevel(0), level); + ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()), + vstorage_->CompactionScore(0) >= 1); + // release the version storage + DeleteVersionStorage(); + } + } +} + +TEST(CompactionPickerTest, NeedsCompactionUniversal) { + NewVersionStorage(1, kCompactionStyleUniversal); + UniversalCompactionPicker universal_compaction_picker( + ioptions_, &icmp_); + // must return false when there's no files. + ASSERT_EQ(universal_compaction_picker.NeedsCompaction(vstorage_.get()), + false); + + // verify the trigger given different number of L0 files. 
+ for (int i = 1; + i <= mutable_cf_options_.level0_file_num_compaction_trigger * 2; ++i) { + Add(0, i, ToString((i + 100) * 1000).c_str(), + ToString((i + 100) * 1000 + 999).c_str(), 1000000, 0, i * 100, + i * 100 + 99); + ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()), + vstorage_->CompactionScore(0) >= 1); + } +} + +TEST(CompactionPickerTest, NeedsCompactionFIFO) { + NewVersionStorage(1, kCompactionStyleFIFO); + const int kFileCount = + mutable_cf_options_.level0_file_num_compaction_trigger * 3; + const uint64_t kFileSize = 100000; + const uint64_t kMaxSize = kFileSize * kFileCount / 2; + + fifo_options_.max_table_files_size = kMaxSize; + ioptions_.compaction_options_fifo = fifo_options_; + FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_); + + // must return false when there's no files. + ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), false); + + // verify whether compaction is needed based on the current + // size of L0 files. + uint64_t current_size = 0; + for (int i = 1; i <= kFileCount; ++i) { + Add(0, i, ToString((i + 100) * 1000).c_str(), + ToString((i + 100) * 1000 + 999).c_str(), + kFileSize, 0, i * 100, i * 100 + 99); + current_size += kFileSize; + ASSERT_EQ(level_compaction_picker.NeedsCompaction(vstorage_.get()), + vstorage_->CompactionScore(0) >= 1); + } +} + + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc new file mode 100644 index 000000000..325017224 --- /dev/null +++ b/db/comparator_db_test.cc @@ -0,0 +1,434 @@ +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "util/hash.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "utilities/merge_operators.h" + +using std::unique_ptr; + +namespace rocksdb { +namespace { + +static const Comparator* comparator; + +// A comparator for std::map, using comparator +struct MapComparator { + bool operator()(const std::string& a, const std::string& b) const { + return comparator->Compare(a, b) < 0; + } +}; + +typedef std::map KVMap; + +class KVIter : public Iterator { + public: + explicit KVIter(const KVMap* map) : map_(map), iter_(map_->end()) {} + virtual bool Valid() const { return iter_ != map_->end(); } + virtual void SeekToFirst() { iter_ = map_->begin(); } + virtual void SeekToLast() { + if (map_->empty()) { + iter_ = map_->end(); + } else { + iter_ = map_->find(map_->rbegin()->first); + } + } + virtual void Seek(const Slice& k) { iter_ = map_->lower_bound(k.ToString()); } + virtual void Next() { ++iter_; } + virtual void Prev() { + if (iter_ == map_->begin()) { + iter_ = map_->end(); + return; + } + --iter_; + } + + virtual Slice key() const { return iter_->first; } + virtual Slice value() const { return iter_->second; } + virtual Status status() const { return Status::OK(); } + + private: + const KVMap* const map_; + KVMap::const_iterator iter_; +}; + +void AssertItersEqual(Iterator* iter1, Iterator* iter2) { + ASSERT_EQ(iter1->Valid(), iter2->Valid()); + if (iter1->Valid()) { + ASSERT_EQ(iter1->key().ToString(), iter2->key().ToString()); + ASSERT_EQ(iter1->value().ToString(), iter2->value().ToString()); + } +} + +// Measuring operations on DB (expect to be empty). 
+// source_strings are candidate keys +void DoRandomIteraratorTest(DB* db, std::vector source_strings, + Random* rnd, int num_writes, int num_iter_ops, + int num_trigger_flush) { + KVMap map; + + for (int i = 0; i < num_writes; i++) { + if (num_trigger_flush > 0 && i != 0 && i % num_trigger_flush == 0) { + db->Flush(FlushOptions()); + } + + int type = rnd->Uniform(2); + int index = rnd->Uniform(static_cast(source_strings.size())); + auto& key = source_strings[index]; + switch (type) { + case 0: + // put + map[key] = key; + ASSERT_OK(db->Put(WriteOptions(), key, key)); + break; + case 1: + // delete + if (map.find(key) != map.end()) { + map.erase(key); + } + ASSERT_OK(db->Delete(WriteOptions(), key)); + break; + default: + assert(false); + } + } + + std::unique_ptr iter(db->NewIterator(ReadOptions())); + std::unique_ptr result_iter(new KVIter(&map)); + + bool is_valid = false; + for (int i = 0; i < num_iter_ops; i++) { + // Random walk and make sure iter and result_iter returns the + // same key and value + int type = rnd->Uniform(6); + ASSERT_OK(iter->status()); + switch (type) { + case 0: + // Seek to First + iter->SeekToFirst(); + result_iter->SeekToFirst(); + break; + case 1: + // Seek to last + iter->SeekToLast(); + result_iter->SeekToLast(); + break; + case 2: { + // Seek to random key + auto key_idx = rnd->Uniform(static_cast(source_strings.size())); + auto key = source_strings[key_idx]; + iter->Seek(key); + result_iter->Seek(key); + break; + } + case 3: + // Next + if (is_valid) { + iter->Next(); + result_iter->Next(); + } else { + continue; + } + break; + case 4: + // Prev + if (is_valid) { + iter->Prev(); + result_iter->Prev(); + } else { + continue; + } + break; + default: { + assert(type == 5); + auto key_idx = rnd->Uniform(static_cast(source_strings.size())); + auto key = source_strings[key_idx]; + std::string result; + auto status = db->Get(ReadOptions(), key, &result); + if (map.find(key) == map.end()) { + ASSERT_TRUE(status.IsNotFound()); + } else { 
+ ASSERT_EQ(map[key], result); + } + break; + } + } + AssertItersEqual(iter.get(), result_iter.get()); + is_valid = iter->Valid(); + } +} + +class DoubleComparator : public Comparator { + public: + DoubleComparator() {} + + virtual const char* Name() const { return "DoubleComparator"; } + + virtual int Compare(const Slice& a, const Slice& b) const { + double da = std::stod(a.ToString()); + double db = std::stod(b.ToString()); + if (da == db) { + return a.compare(b); + } else if (da > db) { + return 1; + } else { + return -1; + } + } + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const {} + + virtual void FindShortSuccessor(std::string* key) const {} +}; + +class HashComparator : public Comparator { + public: + HashComparator() {} + + virtual const char* Name() const { return "HashComparator"; } + + virtual int Compare(const Slice& a, const Slice& b) const { + uint32_t ha = Hash(a.data(), a.size(), 66); + uint32_t hb = Hash(b.data(), b.size(), 66); + if (ha == hb) { + return a.compare(b); + } else if (ha > hb) { + return 1; + } else { + return -1; + } + } + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const {} + + virtual void FindShortSuccessor(std::string* key) const {} +}; + +class TwoStrComparator : public Comparator { + public: + TwoStrComparator() {} + + virtual const char* Name() const { return "TwoStrComparator"; } + + virtual int Compare(const Slice& a, const Slice& b) const { + assert(a.size() >= 2); + assert(b.size() >= 2); + size_t size_a1 = static_cast(a[0]); + size_t size_b1 = static_cast(b[0]); + size_t size_a2 = static_cast(a[1]); + size_t size_b2 = static_cast(b[1]); + assert(size_a1 + size_a2 + 2 == a.size()); + assert(size_b1 + size_b2 + 2 == b.size()); + + Slice a1 = Slice(a.data() + 2, size_a1); + Slice b1 = Slice(b.data() + 2, size_b1); + Slice a2 = Slice(a.data() + 2 + size_a1, size_a2); + Slice b2 = Slice(b.data() + 2 + size_b1, size_b2); + + if (a1 != b1) { + return 
a1.compare(b1); + } + return a2.compare(b2); + } + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const {} + + virtual void FindShortSuccessor(std::string* key) const {} +}; +} // namespace + +class ComparatorDBTest { + private: + std::string dbname_; + Env* env_; + DB* db_; + Options last_options_; + std::unique_ptr comparator_guard; + + public: + ComparatorDBTest() : env_(Env::Default()), db_(nullptr) { + comparator = BytewiseComparator(); + dbname_ = test::TmpDir() + "/comparator_db_test"; + ASSERT_OK(DestroyDB(dbname_, last_options_)); + } + + ~ComparatorDBTest() { + delete db_; + ASSERT_OK(DestroyDB(dbname_, last_options_)); + comparator = BytewiseComparator(); + } + + DB* GetDB() { return db_; } + + void SetOwnedComparator(const Comparator* cmp) { + comparator_guard.reset(cmp); + comparator = cmp; + last_options_.comparator = cmp; + } + + // Return the current option configuration. + Options* GetOptions() { return &last_options_; } + + void DestroyAndReopen() { + // Destroy using last options + Destroy(); + ASSERT_OK(TryReopen()); + } + + void Destroy() { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, last_options_)); + } + + Status TryReopen() { + delete db_; + db_ = nullptr; + last_options_.create_if_missing = true; + + return DB::Open(last_options_, dbname_, &db_); + } +}; + +TEST(ComparatorDBTest, Bytewise) { + for (int rand_seed = 301; rand_seed < 306; rand_seed++) { + DestroyAndReopen(); + Random rnd(rand_seed); + DoRandomIteraratorTest(GetDB(), + {"a", "b", "c", "d", "e", "f", "g", "h", "i"}, &rnd, + 8, 100, 3); + } +} + +TEST(ComparatorDBTest, SimpleSuffixReverseComparator) { + SetOwnedComparator(new test::SimpleSuffixReverseComparator()); + + for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { + Options* opt = GetOptions(); + opt->comparator = comparator; + DestroyAndReopen(); + Random rnd(rnd_seed); + + std::vector source_strings; + std::vector source_prefixes; + // Randomly generate 5 prefixes + for 
(int i = 0; i < 5; i++) { + source_prefixes.push_back(test::RandomHumanReadableString(&rnd, 8)); + } + for (int j = 0; j < 20; j++) { + int prefix_index = rnd.Uniform(static_cast(source_prefixes.size())); + std::string key = source_prefixes[prefix_index] + + test::RandomHumanReadableString(&rnd, rnd.Uniform(8)); + source_strings.push_back(key); + } + + DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 30, 600, 66); + } +} + +TEST(ComparatorDBTest, Uint64Comparator) { + SetOwnedComparator(test::Uint64Comparator()); + + for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { + Options* opt = GetOptions(); + opt->comparator = comparator; + DestroyAndReopen(); + Random rnd(rnd_seed); + Random64 rnd64(rnd_seed); + + std::vector source_strings; + // Randomly generate source keys + for (int i = 0; i < 100; i++) { + uint64_t r = rnd64.Next(); + std::string str; + str.resize(8); + memcpy(&str[0], static_cast(&r), 8); + source_strings.push_back(str); + } + + DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66); + } +} + +TEST(ComparatorDBTest, DoubleComparator) { + SetOwnedComparator(new DoubleComparator()); + + for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { + Options* opt = GetOptions(); + opt->comparator = comparator; + DestroyAndReopen(); + Random rnd(rnd_seed); + + std::vector source_strings; + // Randomly generate source keys + for (int i = 0; i < 100; i++) { + uint32_t r = rnd.Next(); + uint32_t divide_order = rnd.Uniform(8); + double to_divide = 1.0; + for (uint32_t j = 0; j < divide_order; j++) { + to_divide *= 10.0; + } + source_strings.push_back(ToString(r / to_divide)); + } + + DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66); + } +} + +TEST(ComparatorDBTest, HashComparator) { + SetOwnedComparator(new HashComparator()); + + for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { + Options* opt = GetOptions(); + opt->comparator = comparator; + DestroyAndReopen(); + Random rnd(rnd_seed); + + std::vector source_strings; + 
// Randomly generate source keys + for (int i = 0; i < 100; i++) { + source_strings.push_back(test::RandomKey(&rnd, 8)); + } + + DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66); + } +} + +TEST(ComparatorDBTest, TwoStrComparator) { + SetOwnedComparator(new TwoStrComparator()); + + for (int rnd_seed = 301; rnd_seed < 316; rnd_seed++) { + Options* opt = GetOptions(); + opt->comparator = comparator; + DestroyAndReopen(); + Random rnd(rnd_seed); + + std::vector source_strings; + // Randomly generate source keys + for (int i = 0; i < 100; i++) { + std::string str; + uint32_t size1 = rnd.Uniform(8); + uint32_t size2 = rnd.Uniform(8); + str.append(1, static_cast(size1)); + str.append(1, static_cast(size2)); + str.append(test::RandomKey(&rnd, size1)); + str.append(test::RandomKey(&rnd, size2)); + source_strings.push_back(str); + } + + DoRandomIteraratorTest(GetDB(), source_strings, &rnd, 200, 1000, 66); + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 7a1a5221b..2cea9da65 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -115,8 +115,8 @@ class CorruptionTest { continue; } missed += (key - next_expected); - next_expected = key + 1; - if (iter->value() != Value(key, &value_space)) { + next_expected = static_cast(key + 1); + if (iter->value() != Value(static_cast(key), &value_space)) { bad_values++; } else { correct++; @@ -131,7 +131,7 @@ class CorruptionTest { ASSERT_GE(max_expected, correct); } - void CorruptFile(const std::string fname, int offset, int bytes_to_corrupt) { + void CorruptFile(const std::string& fname, int offset, int bytes_to_corrupt) { struct stat sbuf; if (stat(fname.c_str(), &sbuf) != 0) { const char* msg = strerror(errno); @@ -143,14 +143,14 @@ class CorruptionTest { if (-offset > sbuf.st_size) { offset = 0; } else { - offset = sbuf.st_size + offset; + offset = static_cast(sbuf.st_size + offset); 
} } if (offset > sbuf.st_size) { - offset = sbuf.st_size; + offset = static_cast(sbuf.st_size); } if (offset + bytes_to_corrupt > sbuf.st_size) { - bytes_to_corrupt = sbuf.st_size - offset; + bytes_to_corrupt = static_cast(sbuf.st_size - offset); } // Do it @@ -177,7 +177,7 @@ class CorruptionTest { type == filetype && static_cast(number) > picked_number) { // Pick latest file fname = dbname_ + "/" + filenames[i]; - picked_number = number; + picked_number = static_cast(number); } } ASSERT_TRUE(!fname.empty()) << filetype; @@ -231,7 +231,9 @@ TEST(CorruptionTest, Recovery) { Check(100, 100); Corrupt(kLogFile, 19, 1); // WriteBatch tag for first record Corrupt(kLogFile, log::kBlockSize + 1000, 1); // Somewhere in second block - Reopen(); + ASSERT_TRUE(!TryReopen().ok()); + options_.paranoid_checks = false; + Reopen(&options_); // The 64 records in the first two log blocks are completely lost. Check(36, 36); @@ -246,7 +248,8 @@ TEST(CorruptionTest, RecoverWriteError) { TEST(CorruptionTest, NewFileErrorDuringWrite) { // Do enough writing to force minor compaction env_.writable_file_error_ = true; - const int num = 3 + (Options().write_buffer_size / kValueSize); + const int num = + static_cast(3 + (Options().write_buffer_size / kValueSize)); std::string value_storage; Status s; bool failed = false; @@ -332,6 +335,9 @@ TEST(CorruptionTest, CorruptedDescriptor) { } TEST(CorruptionTest, CompactionInputError) { + Options options; + options.max_background_flushes = 0; + Reopen(&options); Build(10); DBImpl* dbi = reinterpret_cast(db_); dbi->TEST_FlushMemTable(); @@ -351,6 +357,7 @@ TEST(CorruptionTest, CompactionInputErrorParanoid) { options.paranoid_checks = true; options.write_buffer_size = 131072; options.max_write_buffer_number = 2; + options.max_background_flushes = 0; Reopen(&options); DBImpl* dbi = reinterpret_cast(db_); diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index c1e59b1b5..a35eba270 100644 --- a/db/cuckoo_table_db_test.cc +++ 
b/db/cuckoo_table_db_test.cc @@ -92,7 +92,7 @@ class CuckooTableDBTest { // Return spread of files per level std::string FilesPerLevel() { std::string result; - int last_non_zero_offset = 0; + size_t last_non_zero_offset = 0; for (int level = 0; level < db_->NumberLevels(); level++) { int f = NumTableFilesAtLevel(level); char buf[100]; @@ -218,6 +218,7 @@ TEST(CuckooTableDBTest, Uint64Comparator) { // Add more keys. ASSERT_OK(Delete(Uint64Key(2))); // Delete. + dbfull()->TEST_FlushMemTable(); ASSERT_OK(Put(Uint64Key(3), "v0")); // Update. ASSERT_OK(Put(Uint64Key(4), "v4")); dbfull()->TEST_FlushMemTable(); @@ -227,28 +228,25 @@ TEST(CuckooTableDBTest, Uint64Comparator) { ASSERT_EQ("v4", Get(Uint64Key(4))); } -TEST(CuckooTableDBTest, CompactionTrigger) { +TEST(CuckooTableDBTest, CompactionIntoMultipleFiles) { + // Create a big L0 file and check it compacts into multiple files in L1. Options options = CurrentOptions(); - options.write_buffer_size = 100 << 10; // 100KB - options.level0_file_num_compaction_trigger = 2; + options.write_buffer_size = 270 << 10; + // Two SST files should be created, each containing 14 keys. + // Number of buckets will be 16. Total size ~156 KB. 
+ options.target_file_size_base = 160 << 10; Reopen(&options); - // Write 11 values, each 10016 B - for (int idx = 0; idx < 11; ++idx) { + // Write 28 values, each 10016 B ~ 10KB + for (int idx = 0; idx < 28; ++idx) { ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx))); } dbfull()->TEST_WaitForFlushMemTable(); ASSERT_EQ("1", FilesPerLevel()); - // Generate one more file in level-0, and should trigger level-0 compaction - for (int idx = 11; idx < 22; ++idx) { - ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx))); - } - dbfull()->TEST_WaitForFlushMemTable(); dbfull()->TEST_CompactRange(0, nullptr, nullptr); - ASSERT_EQ("0,2", FilesPerLevel()); - for (int idx = 0; idx < 22; ++idx) { + for (int idx = 0; idx < 28; ++idx) { ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx))); } } diff --git a/db/db_bench.cc b/db/db_bench.cc index 2f88e81ff..34cf6e025 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -7,7 +7,9 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif #ifndef GFLAGS #include @@ -37,12 +39,13 @@ int main() { #include "rocksdb/memtablerep.h" #include "rocksdb/write_batch.h" #include "rocksdb/slice.h" +#include "rocksdb/filter_policy.h" #include "rocksdb/slice_transform.h" -#include "rocksdb/statistics.h" #include "rocksdb/perf_context.h" #include "port/port.h" #include "port/stack_trace.h" #include "util/crc32c.h" +#include "util/compression.h" #include "util/histogram.h" #include "util/mutexlock.h" #include "util/random.h" @@ -84,7 +87,8 @@ DEFINE_string(benchmarks, "xxhash," "compress," "uncompress," - "acquireload,", + "acquireload," + "fillseekseq,", "Comma-separated list of operations to run in the specified order" "Actual benchmarks:\n" @@ -127,6 +131,8 @@ DEFINE_string(benchmarks, "\tcrc32c -- repeated crc32c of 4K of data\n" "\txxhash -- repeated xxHash of 4K of data\n" "\tacquireload -- load N*1000 times\n" + "\tfillseekseq -- write N values in sequential key, then read " + "them by seeking to each key\n" "Meta operations:\n" "\tcompact -- Compact the entire DB\n" "\tstats -- Print DB stats\n" @@ -148,6 +154,13 @@ DEFINE_int64(merge_keys, -1, "If negative, there will be FLAGS_num keys."); DEFINE_int32(num_column_families, 1, "Number of Column Families to use."); +DEFINE_int32( + num_hot_column_families, 0, + "Number of Hot Column Families. If more than 0, only write to this " + "number of column families. After finishing all the writes to them, " + "create new set of column families and insert to them. Only used " + "when num_column_families > 1."); + DEFINE_int64(reads, -1, "Number of read operations to do. " "If negative, do FLAGS_num reads."); @@ -163,8 +176,14 @@ DEFINE_int32(duration, 0, "Time in seconds for the random-ops tests to run." 
DEFINE_int32(value_size, 100, "Size of each value"); +DEFINE_int32(seek_nexts, 0, + "How many times to call Next() after Seek() in " + "fillseekseq and seekrandom"); + DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator"); +DEFINE_int64(batch_size, 1, "Batch size"); + static bool ValidateKeySize(const char* flagname, int32_t value) { return true; } @@ -187,6 +206,9 @@ DEFINE_bool(enable_numa, false, "CPU and memory of same node. Use \"$numactl --hardware\" command " "to see NUMA memory architecture."); +DEFINE_int64(db_write_buffer_size, rocksdb::Options().db_write_buffer_size, + "Number of bytes to buffer in all memtables before compacting"); + DEFINE_int64(write_buffer_size, rocksdb::Options().write_buffer_size, "Number of bytes to buffer in memtable before compacting"); @@ -240,7 +262,8 @@ DEFINE_int32(universal_compression_size_percent, -1, DEFINE_int64(cache_size, -1, "Number of bytes to use as a cache of uncompressed" "data. Negative means use default settings."); -DEFINE_int32(block_size, rocksdb::BlockBasedTableOptions().block_size, +DEFINE_int32(block_size, + static_cast(rocksdb::BlockBasedTableOptions().block_size), "Number of bytes in a block."); DEFINE_int32(block_restart_interval, @@ -305,7 +328,7 @@ DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL"); DEFINE_int32(num_levels, 7, "The total number of levels"); -DEFINE_int32(target_file_size_base, 2 * 1048576, "Target file size at level-1"); +DEFINE_int64(target_file_size_base, 2 * 1048576, "Target file size at level-1"); DEFINE_int32(target_file_size_multiplier, 1, "A multiplier to compute target level-N file size (N >= 2)"); @@ -352,9 +375,8 @@ DEFINE_int32(deletepercent, 2, "Percentage of deletes out of reads/writes/" "deletepercent), so deletepercent must be smaller than (100 - " "FLAGS_readwritepercent)"); -DEFINE_uint64(delete_obsolete_files_period_micros, 0, "Option to delete " - "obsolete files periodically. 
0 means that obsolete files are" - " deleted after every compaction run."); +DEFINE_uint64(delete_obsolete_files_period_micros, 0, + "Ignored. Left here for backward compatibility"); namespace { enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { @@ -376,6 +398,16 @@ enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { fprintf(stdout, "Cannot parse compression type '%s'\n", ctype); return rocksdb::kSnappyCompression; //default value } + +std::string ColumnFamilyName(size_t i) { + if (i == 0) { + return rocksdb::kDefaultColumnFamilyName; + } else { + char name[100]; + snprintf(name, sizeof(name), "column_family_name_%06zu", i); + return std::string(name); + } +} } // namespace DEFINE_string(compression_type, "snappy", @@ -461,6 +493,7 @@ DEFINE_int32(source_compaction_factor, 1, "Cap the size of data in level-K for" DEFINE_uint64(wal_ttl_seconds, 0, "Set the TTL for the WAL Files in seconds."); DEFINE_uint64(wal_size_limit_MB, 0, "Set the size limit for the WAL Files" " in MB."); +DEFINE_uint64(max_total_wal_size, 0, "Set total max WAL size"); DEFINE_bool(bufferedio, rocksdb::EnvOptions().use_os_buffer, "Allow buffered io using OS buffers"); @@ -512,6 +545,9 @@ DEFINE_int64(keys_per_prefix, 0, "control average number of keys generated " "i.e. use the prefix comes with the generated random number."); DEFINE_bool(enable_io_prio, false, "Lower the background flush/compaction " "threads' IO priority"); +DEFINE_bool(identity_as_first_hash, false, "the first hash function of cuckoo " + "table becomes an identity function. This is only valid when key " + "is 8 bytes"); enum RepFactory { kSkipList, @@ -551,11 +587,18 @@ DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table."); DEFINE_bool(use_hash_search, false, "if use kHashSearch " "instead of kBinarySearch. 
" "This is valid if only we use BlockTable"); - +DEFINE_bool(use_block_based_filter, false, "if use kBlockBasedFilter " + "instead of kFullFilter for filter block. " + "This is valid if only we use BlockTable"); DEFINE_string(merge_operator, "", "The merge operator to use with the database." "If a new merge operator is specified, be sure to use fresh" " database The possible merge operators are defined in" " utilities/merge_operators.h"); +DEFINE_int32(skip_list_lookahead, 0, "Used with skip_list memtablerep; try " + "linear search first for this many steps from the previous " + "position"); +DEFINE_bool(report_file_operations, false, "if report number of file " + "operations"); static const bool FLAGS_soft_rate_limit_dummy __attribute__((unused)) = RegisterFlagValidator(&FLAGS_soft_rate_limit, &ValidateRateLimit); @@ -587,6 +630,131 @@ static const bool FLAGS_table_cache_numshardbits_dummy __attribute__((unused)) = namespace rocksdb { +namespace { +struct ReportFileOpCounters { + std::atomic open_counter_; + std::atomic read_counter_; + std::atomic append_counter_; + std::atomic bytes_read_; + std::atomic bytes_written_; +}; + +// A special Env to records and report file operations in db_bench +class ReportFileOpEnv : public EnvWrapper { + public: + explicit ReportFileOpEnv(Env* base) : EnvWrapper(base) { reset(); } + + void reset() { + counters_.open_counter_ = 0; + counters_.read_counter_ = 0; + counters_.append_counter_ = 0; + counters_.bytes_read_ = 0; + counters_.bytes_written_ = 0; + } + + Status NewSequentialFile(const std::string& f, unique_ptr* r, + const EnvOptions& soptions) { + class CountingFile : public SequentialFile { + private: + unique_ptr target_; + ReportFileOpCounters* counters_; + + public: + CountingFile(unique_ptr&& target, + ReportFileOpCounters* counters) + : target_(std::move(target)), counters_(counters) {} + + virtual Status Read(size_t n, Slice* result, char* scratch) { + counters_->read_counter_.fetch_add(1, 
std::memory_order_relaxed); + Status rv = target_->Read(n, result, scratch); + counters_->bytes_read_.fetch_add(result->size(), + std::memory_order_relaxed); + return rv; + } + + virtual Status Skip(uint64_t n) { return target_->Skip(n); } + }; + + Status s = target()->NewSequentialFile(f, r, soptions); + if (s.ok()) { + counters()->open_counter_.fetch_add(1, std::memory_order_relaxed); + r->reset(new CountingFile(std::move(*r), counters())); + } + return s; + } + + Status NewRandomAccessFile(const std::string& f, + unique_ptr* r, + const EnvOptions& soptions) { + class CountingFile : public RandomAccessFile { + private: + unique_ptr target_; + ReportFileOpCounters* counters_; + + public: + CountingFile(unique_ptr&& target, + ReportFileOpCounters* counters) + : target_(std::move(target)), counters_(counters) {} + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + counters_->read_counter_.fetch_add(1, std::memory_order_relaxed); + Status rv = target_->Read(offset, n, result, scratch); + counters_->bytes_read_.fetch_add(result->size(), + std::memory_order_relaxed); + return rv; + } + }; + + Status s = target()->NewRandomAccessFile(f, r, soptions); + if (s.ok()) { + counters()->open_counter_.fetch_add(1, std::memory_order_relaxed); + r->reset(new CountingFile(std::move(*r), counters())); + } + return s; + } + + Status NewWritableFile(const std::string& f, unique_ptr* r, + const EnvOptions& soptions) { + class CountingFile : public WritableFile { + private: + unique_ptr target_; + ReportFileOpCounters* counters_; + + public: + CountingFile(unique_ptr&& target, + ReportFileOpCounters* counters) + : target_(std::move(target)), counters_(counters) {} + + Status Append(const Slice& data) { + counters_->append_counter_.fetch_add(1, std::memory_order_relaxed); + Status rv = target_->Append(data); + counters_->bytes_written_.fetch_add(data.size(), + std::memory_order_relaxed); + return rv; + } + + Status Close() { return target_->Close(); 
} + Status Flush() { return target_->Flush(); } + Status Sync() { return target_->Sync(); } + }; + + Status s = target()->NewWritableFile(f, r, soptions); + if (s.ok()) { + counters()->open_counter_.fetch_add(1, std::memory_order_relaxed); + r->reset(new CountingFile(std::move(*r), counters())); + } + return s; + } + + // getter + ReportFileOpCounters* counters() { return &counters_; } + + private: + ReportFileOpCounters counters_; +}; + +} // namespace + // Helper for quickly generating random data. class RandomGenerator { private: @@ -627,6 +795,55 @@ static void AppendWithSpace(std::string* str, Slice msg) { str->append(msg.data(), msg.size()); } +struct DBWithColumnFamilies { + std::vector cfh; + DB* db; + std::atomic num_created; // Need to be updated after all the + // new entries in cfh are set. + size_t num_hot; // Number of column families to be queried at each moment. + // After each CreateNewCf(), another num_hot number of new + // Column families will be created and used to be queried. + port::Mutex create_cf_mutex; // Only one thread can execute CreateNewCf() + + DBWithColumnFamilies() : db(nullptr) { + cfh.clear(); + } + + DBWithColumnFamilies(const DBWithColumnFamilies& other) + : cfh(other.cfh), + db(other.db), + num_created(other.num_created.load()), + num_hot(other.num_hot) {} + + ColumnFamilyHandle* GetCfh(int64_t rand_num) { + assert(num_hot > 0); + return cfh[num_created.load(std::memory_order_acquire) - num_hot + + rand_num % num_hot]; + } + + // stage: assume CF from 0 to stage * num_hot has be created. Need to create + // stage * num_hot + 1 to stage * (num_hot + 1). + void CreateNewCf(ColumnFamilyOptions options, int64_t stage) { + MutexLock l(&create_cf_mutex); + if ((stage + 1) * num_hot <= num_created) { + // Already created. 
+ return; + } + auto new_num_created = num_created + num_hot; + assert(new_num_created <= cfh.size()); + for (size_t i = num_created; i < new_num_created; i++) { + Status s = + db->CreateColumnFamily(options, ColumnFamilyName(i), &(cfh[i])); + if (!s.ok()) { + fprintf(stderr, "create column family error: %s\n", + s.ToString().c_str()); + abort(); + } + } + num_created.store(new_num_created, std::memory_order_release); + } +}; + class Stats { private: int id_; @@ -690,7 +907,7 @@ class Stats { void SetId(int id) { id_ = id; } void SetExcludeFromMerge() { exclude_from_merge_ = true; } - void FinishedOps(DB* db, int64_t num_ops) { + void FinishedOps(DBWithColumnFamilies* db_with_cfh, DB* db, int64_t num_ops) { if (FLAGS_histogram) { double now = FLAGS_env->NowMicros(); double micros = now - last_op_finish_; @@ -730,8 +947,17 @@ class Stats { if (FLAGS_stats_per_interval) { std::string stats; - if (db && db->GetProperty("rocksdb.stats", &stats)) + + if (db_with_cfh && db_with_cfh->num_created.load()) { + for (size_t i = 0; i < db_with_cfh->num_created.load(); ++i) { + if (db->GetProperty(db_with_cfh->cfh[i], "rocksdb.cfstats", + &stats)) + fprintf(stderr, "%s\n", stats.c_str()); + } + + } else if (db && db->GetProperty("rocksdb.stats", &stats)) { fprintf(stderr, "%s\n", stats.c_str()); + } } fflush(stderr); @@ -774,6 +1000,21 @@ class Stats { if (FLAGS_histogram) { fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str()); } + if (FLAGS_report_file_operations) { + ReportFileOpEnv* env = static_cast(FLAGS_env); + ReportFileOpCounters* counters = env->counters(); + fprintf(stdout, "Num files opened: %d\n", + counters->open_counter_.load(std::memory_order_relaxed)); + fprintf(stdout, "Num Read(): %d\n", + counters->read_counter_.load(std::memory_order_relaxed)); + fprintf(stdout, "Num Append(): %d\n", + counters->append_counter_.load(std::memory_order_relaxed)); + fprintf(stdout, "Num bytes read: %" PRIu64 "\n", + 
counters->bytes_read_.load(std::memory_order_relaxed)); + fprintf(stdout, "Num bytes written: %" PRIu64 "\n", + counters->bytes_written_.load(std::memory_order_relaxed)); + env->reset(); + } fflush(stdout); } }; @@ -813,13 +1054,16 @@ struct ThreadState { class Duration { public: - Duration(int max_seconds, int64_t max_ops) { + Duration(int max_seconds, int64_t max_ops, int64_t ops_per_stage = 0) { max_seconds_ = max_seconds; max_ops_= max_ops; + ops_per_stage_ = (ops_per_stage > 0) ? ops_per_stage : max_ops; ops_ = 0; start_at_ = FLAGS_env->NowMicros(); } + int64_t GetStage() { return std::min(ops_, max_ops_ - 1) / ops_per_stage_; } + bool Done(int64_t increment) { if (increment <= 0) increment = 1; // avoid Done(0) and infinite loops ops_ += increment; @@ -840,6 +1084,7 @@ class Duration { private: int max_seconds_; int64_t max_ops_; + int64_t ops_per_stage_; int64_t ops_; double start_at_; }; @@ -850,13 +1095,6 @@ class Benchmark { std::shared_ptr compressed_cache_; std::shared_ptr filter_policy_; const SliceTransform* prefix_extractor_; - struct DBWithColumnFamilies { - std::vector cfh; - DB* db; - DBWithColumnFamilies() : db(nullptr) { - cfh.clear(); - } - }; DBWithColumnFamilies db_; std::vector multi_dbs_; int64_t num_; @@ -866,10 +1104,12 @@ class Benchmark { int64_t keys_per_prefix_; int64_t entries_per_batch_; WriteOptions write_options_; + Options open_options_; // keep options around to properly destroy db later int64_t reads_; int64_t writes_; int64_t readwrites_; int64_t merge_keys_; + bool report_file_operations_; bool SanityCheck() { if (FLAGS_compression_ratio > 1) { @@ -974,28 +1214,28 @@ class Benchmark { text[len] = '\0'; switch (FLAGS_compression_type_e) { case kSnappyCompression: - result = port::Snappy_Compress(Options().compression_opts, text, - strlen(text), &compressed); + result = Snappy_Compress(Options().compression_opts, text, + strlen(text), &compressed); name = "Snappy"; break; case kZlibCompression: - result = 
port::Zlib_Compress(Options().compression_opts, text, - strlen(text), &compressed); + result = Zlib_Compress(Options().compression_opts, 2, text, + strlen(text), &compressed); name = "Zlib"; break; case kBZip2Compression: - result = port::BZip2_Compress(Options().compression_opts, text, - strlen(text), &compressed); + result = BZip2_Compress(Options().compression_opts, 2, text, + strlen(text), &compressed); name = "BZip2"; break; case kLZ4Compression: - result = port::LZ4_Compress(Options().compression_opts, text, - strlen(text), &compressed); + result = LZ4_Compress(Options().compression_opts, 2, text, + strlen(text), &compressed); name = "LZ4"; break; case kLZ4HCCompression: - result = port::LZ4HC_Compress(Options().compression_opts, text, - strlen(text), &compressed); + result = LZ4HC_Compress(Options().compression_opts, 2, text, + strlen(text), &compressed); name = "LZ4HC"; break; case kNoCompression: @@ -1020,7 +1260,7 @@ class Benchmark { while (start < s.size() && isspace(s[start])) { start++; } - unsigned int limit = s.size(); + unsigned int limit = static_cast(s.size()); while (limit > start && isspace(s[limit-1])) { limit--; } @@ -1074,9 +1314,9 @@ class Benchmark { (FLAGS_cache_numshardbits >= 1 ? NewLRUCache(FLAGS_compressed_cache_size, FLAGS_cache_numshardbits) : NewLRUCache(FLAGS_compressed_cache_size)) : nullptr), - filter_policy_(FLAGS_bloom_bits >= 0 - ? NewBloomFilterPolicy(FLAGS_bloom_bits) - : nullptr), + filter_policy_(FLAGS_bloom_bits >= 0 ? + NewBloomFilterPolicy(FLAGS_bloom_bits, FLAGS_use_block_based_filter) + : nullptr), prefix_extractor_(NewFixedPrefixTransform(FLAGS_prefix_size)), num_(FLAGS_num), value_size_(FLAGS_value_size), @@ -1089,7 +1329,18 @@ class Benchmark { readwrites_((FLAGS_writes < 0 && FLAGS_reads < 0)? FLAGS_num : ((FLAGS_writes > FLAGS_reads) ? FLAGS_writes : FLAGS_reads) ), - merge_keys_(FLAGS_merge_keys < 0 ? FLAGS_num : FLAGS_merge_keys) { + merge_keys_(FLAGS_merge_keys < 0 ? 
FLAGS_num : FLAGS_merge_keys), + report_file_operations_(FLAGS_report_file_operations) { + if (report_file_operations_) { + if (!FLAGS_hdfs.empty()) { + fprintf(stderr, + "--hdfs and --report_file_operations cannot be enabled " + "at the same time"); + exit(1); + } + FLAGS_env = new ReportFileOpEnv(rocksdb::Env::Default()); + } + if (FLAGS_prefix_size > FLAGS_key_size) { fprintf(stderr, "prefix size is larger than key size"); exit(1); @@ -1103,11 +1354,17 @@ class Benchmark { } } if (!FLAGS_use_existing_db) { - DestroyDB(FLAGS_db, Options()); + Options options; + if (!FLAGS_wal_dir.empty()) { + options.wal_dir = FLAGS_wal_dir; + } + DestroyDB(FLAGS_db, options); } } ~Benchmark() { + std::for_each(db_.cfh.begin(), db_.cfh.end(), + [](ColumnFamilyHandle* cfh) { delete cfh; }); delete db_.db; delete prefix_extractor_; } @@ -1164,17 +1421,7 @@ class Benchmark { } std::string GetDbNameForMultiple(std::string base_name, size_t id) { - return base_name + std::to_string(id); - } - - std::string ColumnFamilyName(int i) { - if (i == 0) { - return kDefaultColumnFamilyName; - } else { - char name[100]; - snprintf(name, sizeof(name), "column_family_name_%06d", i); - return std::string(name); - } + return base_name + ToString(id); } void Run() { @@ -1182,7 +1429,7 @@ class Benchmark { exit(1); } PrintHeader(); - Open(); + Open(&open_options_); const char* benchmarks = FLAGS_benchmarks.c_str(); while (benchmarks != nullptr) { const char* sep = strchr(benchmarks, ','); @@ -1201,7 +1448,7 @@ class Benchmark { writes_ = (FLAGS_writes < 0 ? 
FLAGS_num : FLAGS_writes); value_size_ = FLAGS_value_size; key_size_ = FLAGS_key_size; - entries_per_batch_ = 1; + entries_per_batch_ = FLAGS_batch_size; write_options_ = WriteOptions(); if (FLAGS_sync) { write_options_.sync = true; @@ -1253,7 +1500,11 @@ class Benchmark { method = &Benchmark::ReadReverse; } else if (name == Slice("readrandom")) { method = &Benchmark::ReadRandom; + } else if (name == Slice("readrandomfast")) { + method = &Benchmark::ReadRandomFast; } else if (name == Slice("multireadrandom")) { + fprintf(stderr, "entries_per_batch = %" PRIi64 "\n", + entries_per_batch_); method = &Benchmark::MultiReadRandom; } else if (name == Slice("readmissing")) { ++key_size_; @@ -1300,6 +1551,8 @@ class Benchmark { method = &Benchmark::MergeRandom; } else if (name == Slice("randomwithverify")) { method = &Benchmark::RandomWithVerify; + } else if (name == Slice("fillseekseq")) { + method = &Benchmark::WriteSeqSeekSeq; } else if (name == Slice("compact")) { method = &Benchmark::Compact; } else if (name == Slice("crc32c")) { @@ -1332,18 +1585,20 @@ class Benchmark { method = nullptr; } else { if (db_.db != nullptr) { + std::for_each(db_.cfh.begin(), db_.cfh.end(), + [](ColumnFamilyHandle* cfh) { delete cfh; }); delete db_.db; db_.db = nullptr; db_.cfh.clear(); - DestroyDB(FLAGS_db, Options()); + DestroyDB(FLAGS_db, open_options_); } for (size_t i = 0; i < multi_dbs_.size(); i++) { delete multi_dbs_[i].db; - DestroyDB(GetDbNameForMultiple(FLAGS_db, i), Options()); + DestroyDB(GetDbNameForMultiple(FLAGS_db, i), open_options_); } multi_dbs_.clear(); } - Open(); + Open(&open_options_); // use open_options for the last accessed } if (method != nullptr) { @@ -1463,7 +1718,7 @@ class Benchmark { uint32_t crc = 0; while (bytes < 500 * 1048576) { crc = crc32c::Value(data.data(), size); - thread->stats.FinishedOps(nullptr, 1); + thread->stats.FinishedOps(nullptr, nullptr, 1); bytes += size; } // Print so result is not dead @@ -1482,7 +1737,7 @@ class Benchmark { unsigned 
int xxh32 = 0; while (bytes < 500 * 1048576) { xxh32 = XXH32(data.data(), size, 0); - thread->stats.FinishedOps(nullptr, 1); + thread->stats.FinishedOps(nullptr, nullptr, 1); bytes += size; } // Print so result is not dead @@ -1494,16 +1749,16 @@ class Benchmark { void AcquireLoad(ThreadState* thread) { int dummy; - port::AtomicPointer ap(&dummy); + std::atomic ap(&dummy); int count = 0; void *ptr = nullptr; thread->stats.AddMessage("(each op is 1000 loads)"); while (count < 100000) { for (int i = 0; i < 1000; i++) { - ptr = ap.Acquire_Load(); + ptr = ap.load(std::memory_order_acquire); } count++; - thread->stats.FinishedOps(nullptr, 1); + thread->stats.FinishedOps(nullptr, nullptr, 1); } if (ptr == nullptr) exit(1); // Disable unused variable warning. } @@ -1520,31 +1775,31 @@ class Benchmark { while (ok && bytes < int64_t(1) << 30) { switch (FLAGS_compression_type_e) { case rocksdb::kSnappyCompression: - ok = port::Snappy_Compress(Options().compression_opts, input.data(), - input.size(), &compressed); + ok = Snappy_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); break; case rocksdb::kZlibCompression: - ok = port::Zlib_Compress(Options().compression_opts, input.data(), - input.size(), &compressed); + ok = Zlib_Compress(Options().compression_opts, 2, input.data(), + input.size(), &compressed); break; case rocksdb::kBZip2Compression: - ok = port::BZip2_Compress(Options().compression_opts, input.data(), - input.size(), &compressed); + ok = BZip2_Compress(Options().compression_opts, 2, input.data(), + input.size(), &compressed); break; case rocksdb::kLZ4Compression: - ok = port::LZ4_Compress(Options().compression_opts, input.data(), - input.size(), &compressed); + ok = LZ4_Compress(Options().compression_opts, 2, input.data(), + input.size(), &compressed); break; case rocksdb::kLZ4HCCompression: - ok = port::LZ4HC_Compress(Options().compression_opts, input.data(), - input.size(), &compressed); + ok = 
LZ4HC_Compress(Options().compression_opts, 2, input.data(), + input.size(), &compressed); break; default: ok = false; } produced += compressed.size(); bytes += input.size(); - thread->stats.FinishedOps(nullptr, 1); + thread->stats.FinishedOps(nullptr, nullptr, 1); } if (!ok) { @@ -1566,24 +1821,24 @@ class Benchmark { bool ok; switch (FLAGS_compression_type_e) { case rocksdb::kSnappyCompression: - ok = port::Snappy_Compress(Options().compression_opts, input.data(), - input.size(), &compressed); + ok = Snappy_Compress(Options().compression_opts, input.data(), + input.size(), &compressed); break; case rocksdb::kZlibCompression: - ok = port::Zlib_Compress(Options().compression_opts, input.data(), - input.size(), &compressed); + ok = Zlib_Compress(Options().compression_opts, 2, input.data(), + input.size(), &compressed); break; case rocksdb::kBZip2Compression: - ok = port::BZip2_Compress(Options().compression_opts, input.data(), - input.size(), &compressed); + ok = BZip2_Compress(Options().compression_opts, 2, input.data(), + input.size(), &compressed); break; case rocksdb::kLZ4Compression: - ok = port::LZ4_Compress(Options().compression_opts, input.data(), - input.size(), &compressed); + ok = LZ4_Compress(Options().compression_opts, 2, input.data(), + input.size(), &compressed); break; case rocksdb::kLZ4HCCompression: - ok = port::LZ4HC_Compress(Options().compression_opts, input.data(), - input.size(), &compressed); + ok = LZ4HC_Compress(Options().compression_opts, 2, input.data(), + input.size(), &compressed); break; default: ok = false; @@ -1597,27 +1852,27 @@ class Benchmark { case rocksdb::kSnappyCompression: // allocate here to make comparison fair uncompressed = new char[input.size()]; - ok = port::Snappy_Uncompress(compressed.data(), compressed.size(), - uncompressed); + ok = Snappy_Uncompress(compressed.data(), compressed.size(), + uncompressed); break; case rocksdb::kZlibCompression: - uncompressed = port::Zlib_Uncompress( - compressed.data(), 
compressed.size(), &decompress_size); + uncompressed = Zlib_Uncompress(compressed.data(), compressed.size(), + &decompress_size, 2); ok = uncompressed != nullptr; break; case rocksdb::kBZip2Compression: - uncompressed = port::BZip2_Uncompress( - compressed.data(), compressed.size(), &decompress_size); + uncompressed = BZip2_Uncompress(compressed.data(), compressed.size(), + &decompress_size, 2); ok = uncompressed != nullptr; break; case rocksdb::kLZ4Compression: - uncompressed = port::LZ4_Uncompress( - compressed.data(), compressed.size(), &decompress_size); + uncompressed = LZ4_Uncompress(compressed.data(), compressed.size(), + &decompress_size, 2); ok = uncompressed != nullptr; break; case rocksdb::kLZ4HCCompression: - uncompressed = port::LZ4_Uncompress( - compressed.data(), compressed.size(), &decompress_size); + uncompressed = LZ4_Uncompress(compressed.data(), compressed.size(), + &decompress_size, 2); ok = uncompressed != nullptr; break; default: @@ -1625,7 +1880,7 @@ class Benchmark { } delete[] uncompressed; bytes += input.size(); - thread->stats.FinishedOps(nullptr, 1); + thread->stats.FinishedOps(nullptr, nullptr, 1); } if (!ok) { @@ -1635,11 +1890,14 @@ class Benchmark { } } - void Open() { + void Open(Options* opts) { + Options& options = *opts; + assert(db_.db == nullptr); - Options options; + options.create_if_missing = !FLAGS_use_existing_db; options.create_missing_column_families = FLAGS_num_column_families > 1; + options.db_write_buffer_size = FLAGS_db_write_buffer_size; options.write_buffer_size = FLAGS_write_buffer_size; options.max_write_buffer_number = FLAGS_max_write_buffer_number; options.min_write_buffer_number_to_merge = @@ -1684,12 +1942,14 @@ class Benchmark { exit(1); } switch (FLAGS_rep_factory) { - case kPrefixHash: - options.memtable_factory.reset(NewHashSkipListRepFactory( - FLAGS_hash_bucket_count)); - break; case kSkipList: - // no need to do anything + options.memtable_factory.reset(new SkipListFactory( + 
FLAGS_skip_list_lookahead)); + break; +#ifndef ROCKSDB_LITE + case kPrefixHash: + options.memtable_factory.reset( + NewHashSkipListRepFactory(FLAGS_hash_bucket_count)); break; case kHashLinkedList: options.memtable_factory.reset(NewHashLinkListRepFactory( @@ -1704,8 +1964,14 @@ class Benchmark { options.memtable_factory.reset(NewHashCuckooRepFactory( options.write_buffer_size, FLAGS_key_size + FLAGS_value_size)); break; +#else + default: + fprintf(stderr, "Only skip list is supported in lite mode\n"); + exit(1); +#endif // ROCKSDB_LITE } if (FLAGS_use_plain_table) { +#ifndef ROCKSDB_LITE if (FLAGS_rep_factory != kPrefixHash && FLAGS_rep_factory != kHashLinkedList) { fprintf(stderr, "Waring: plain table is used with skipList\n"); @@ -1726,13 +1992,25 @@ class Benchmark { plain_table_options.hash_table_ratio = 0.75; options.table_factory = std::shared_ptr( NewPlainTableFactory(plain_table_options)); +#else + fprintf(stderr, "Plain table is not supported in lite mode\n"); + exit(1); +#endif // ROCKSDB_LITE } else if (FLAGS_use_cuckoo_table) { +#ifndef ROCKSDB_LITE if (FLAGS_cuckoo_hash_ratio > 1 || FLAGS_cuckoo_hash_ratio < 0) { fprintf(stderr, "Invalid cuckoo_hash_ratio\n"); exit(1); } + rocksdb::CuckooTableOptions table_options; + table_options.hash_table_ratio = FLAGS_cuckoo_hash_ratio; + table_options.identity_as_first_hash = FLAGS_identity_as_first_hash; options.table_factory = std::shared_ptr( - NewCuckooTableFactory(FLAGS_cuckoo_hash_ratio)); + NewCuckooTableFactory(table_options)); +#else + fprintf(stderr, "Cuckoo table is not supported in lite mode\n"); + exit(1); +#endif // ROCKSDB_LITE } else { BlockBasedTableOptions block_based_options; if (FLAGS_use_hash_search) { @@ -1753,6 +2031,7 @@ class Benchmark { block_based_options.block_size = FLAGS_block_size; block_based_options.block_restart_interval = FLAGS_block_restart_interval; block_based_options.filter_policy = filter_policy_; + block_based_options.format_version = 2; options.table_factory.reset( 
NewBlockBasedTableFactory(block_based_options)); } @@ -1775,6 +2054,8 @@ class Benchmark { options.compression_opts.level = FLAGS_compression_level; options.WAL_ttl_seconds = FLAGS_wal_ttl_seconds; options.WAL_size_limit_MB = FLAGS_wal_size_limit_MB; + options.max_total_wal_size = FLAGS_max_total_wal_size; + if (FLAGS_min_level_to_compress >= 0) { assert(FLAGS_min_level_to_compress <= FLAGS_num_levels); options.compression_per_level.resize(FLAGS_num_levels); @@ -1786,8 +2067,6 @@ class Benchmark { options.compression_per_level[i] = FLAGS_compression_type_e; } } - options.delete_obsolete_files_period_micros = - FLAGS_delete_obsolete_files_period_micros; options.soft_rate_limit = FLAGS_soft_rate_limit; options.hard_rate_limit = FLAGS_hard_rate_limit; options.rate_limit_delay_max_milliseconds = @@ -1858,9 +2137,15 @@ class Benchmark { Status s; // Open with column families if necessary. if (FLAGS_num_column_families > 1) { - db->cfh.resize(FLAGS_num_column_families); + size_t num_hot = FLAGS_num_column_families; + if (FLAGS_num_hot_column_families > 0 && + FLAGS_num_hot_column_families < FLAGS_num_column_families) { + num_hot = FLAGS_num_hot_column_families; + } else { + FLAGS_num_hot_column_families = FLAGS_num_column_families; + } std::vector column_families; - for (int i = 0; i < FLAGS_num_column_families; i++) { + for (size_t i = 0; i < num_hot; i++) { column_families.push_back(ColumnFamilyDescriptor( ColumnFamilyName(i), ColumnFamilyOptions(options))); } @@ -1870,6 +2155,10 @@ class Benchmark { } else { s = DB::Open(options, db_name, column_families, &db->cfh, &db->db); } + db->cfh.resize(FLAGS_num_column_families); + db->num_created = num_hot; + db->num_hot = num_hot; + } else if (FLAGS_readonly) { s = DB::OpenForReadOnly(options, db_name, &db->db); } else { @@ -1914,8 +2203,9 @@ class Benchmark { for (uint64_t i = 0; i < num_; ++i) { values_[i] = i; } - std::shuffle(values_.begin(), values_.end(), - std::default_random_engine(FLAGS_seed)); + std::shuffle( + 
values_.begin(), values_.end(), + std::default_random_engine(static_cast(FLAGS_seed))); } } @@ -1965,9 +2255,18 @@ class Benchmark { num_key_gens = multi_dbs_.size(); } std::vector> key_gens(num_key_gens); - Duration duration(test_duration, num_ops * num_key_gens); + int64_t max_ops = num_ops * num_key_gens; + int64_t ops_per_stage = max_ops; + if (FLAGS_num_column_families > 1 && FLAGS_num_hot_column_families > 0) { + ops_per_stage = (max_ops - 1) / (FLAGS_num_column_families / + FLAGS_num_hot_column_families) + + 1; + } + + Duration duration(test_duration, max_ops, ops_per_stage); for (size_t i = 0; i < num_key_gens; i++) { - key_gens[i].reset(new KeyGenerator(&(thread->rand), write_mode, num_ops)); + key_gens[i].reset(new KeyGenerator(&(thread->rand), write_mode, num_ops, + ops_per_stage)); } if (num_ != FLAGS_num) { @@ -1983,7 +2282,18 @@ class Benchmark { Slice key = AllocateKey(); std::unique_ptr key_guard(key.data()); + int64_t stage = 0; while (!duration.Done(entries_per_batch_)) { + if (duration.GetStage() != stage) { + stage = duration.GetStage(); + if (db_.db != nullptr) { + db_.CreateNewCf(open_options_, stage); + } else { + for (auto& db : multi_dbs_) { + db.CreateNewCf(open_options_, stage); + } + } + } size_t id = thread->rand.Next() % num_key_gens; DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(id); batch.Clear(); @@ -1996,13 +2306,14 @@ class Benchmark { // We use same rand_num as seed for key and column family so that we // can deterministically find the cfh corresponding to a particular // key while reading the key. 
- batch.Put(db_with_cfh->cfh[rand_num % db_with_cfh->cfh.size()], - key, gen.Generate(value_size_)); + batch.Put(db_with_cfh->GetCfh(rand_num), key, + gen.Generate(value_size_)); } bytes += value_size_ + key_size_; } s = db_with_cfh->db->Write(write_options_, &batch); - thread->stats.FinishedOps(db_with_cfh->db, entries_per_batch_); + thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, + entries_per_batch_); if (!s.ok()) { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); exit(1); @@ -2022,12 +2333,15 @@ class Benchmark { } void ReadSequential(ThreadState* thread, DB* db) { - Iterator* iter = db->NewIterator(ReadOptions(FLAGS_verify_checksum, true)); + ReadOptions options(FLAGS_verify_checksum, true); + options.tailing = FLAGS_use_tailing_iterator; + + Iterator* iter = db->NewIterator(options); int64_t i = 0; int64_t bytes = 0; for (iter->SeekToFirst(); i < reads_ && iter->Valid(); iter->Next()) { bytes += iter->key().size() + iter->value().size(); - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); ++i; } delete iter; @@ -2050,13 +2364,61 @@ class Benchmark { int64_t bytes = 0; for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) { bytes += iter->key().size() + iter->value().size(); - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); ++i; } delete iter; thread->stats.AddBytes(bytes); } + void ReadRandomFast(ThreadState* thread) { + int64_t read = 0; + int64_t found = 0; + int64_t nonexist = 0; + ReadOptions options(FLAGS_verify_checksum, true); + Slice key = AllocateKey(); + std::unique_ptr key_guard(key.data()); + std::string value; + DB* db = SelectDBWithCfh(thread)->db; + + int64_t pot = 1; + while (pot < FLAGS_num) { + pot <<= 1; + } + + Duration duration(FLAGS_duration, reads_); + do { + for (int i = 0; i < 100; ++i) { + int64_t key_rand = thread->rand.Next() & (pot - 1); + GenerateKeyFromInt(key_rand, FLAGS_num, &key); + ++read; + auto status = db->Get(options, key, &value); 
+ if (status.ok()) { + ++found; + } else if (!status.IsNotFound()) { + fprintf(stderr, "Get returned an error: %s\n", + status.ToString().c_str()); + abort(); + } + if (key_rand >= FLAGS_num) { + ++nonexist; + } + } + thread->stats.FinishedOps(nullptr, db, 100); + } while (!duration.Done(100)); + + char msg[100]; + snprintf(msg, sizeof(msg), "(%" PRIu64 " of %" PRIu64 " found, " + "issued %" PRIu64 " non-exist keys)\n", + found, read, nonexist); + + thread->stats.AddMessage(msg); + + if (FLAGS_perf_level > 0) { + thread->stats.AddMessage(perf_context.ToString()); + } + } + void ReadRandom(ThreadState* thread) { int64_t read = 0; int64_t found = 0; @@ -2076,15 +2438,18 @@ class Benchmark { read++; Status s; if (FLAGS_num_column_families > 1) { - s = db_with_cfh->db->Get(options, - db_with_cfh->cfh[key_rand % db_with_cfh->cfh.size()], key, &value); + s = db_with_cfh->db->Get(options, db_with_cfh->GetCfh(key_rand), key, + &value); } else { s = db_with_cfh->db->Get(options, key, &value); } if (s.ok()) { found++; + } else if (!s.IsNotFound()) { + fprintf(stderr, "Get returned an error: %s\n", s.ToString().c_str()); + abort(); } - thread->stats.FinishedOps(db_with_cfh->db, 1); + thread->stats.FinishedOps(db_with_cfh, db_with_cfh->db, 1); } char msg[100]; @@ -2124,9 +2489,13 @@ class Benchmark { for (int64_t i = 0; i < entries_per_batch_; ++i) { if (statuses[i].ok()) { ++found; + } else if (!statuses[i].IsNotFound()) { + fprintf(stderr, "MultiGet returned an error: %s\n", + statuses[i].ToString().c_str()); + abort(); } } - thread->stats.FinishedOps(db, entries_per_batch_); + thread->stats.FinishedOps(nullptr, db, entries_per_batch_); } for (auto& k : keys) { delete k.data(); @@ -2145,7 +2514,7 @@ class Benchmark { DB* db = SelectDB(thread); Iterator* iter = db->NewIterator(options); delete iter; - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); } } @@ -2178,6 +2547,7 @@ class Benchmark { std::unique_ptr key_guard(key.data()); Duration 
duration(FLAGS_duration, reads_); + char value_buffer[256]; while (!duration.Done(1)) { if (!FLAGS_use_tailing_iterator && FLAGS_iter_refresh_interval_us >= 0) { uint64_t now = FLAGS_env->NowMicros(); @@ -2209,7 +2579,17 @@ class Benchmark { if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) { found++; } - thread->stats.FinishedOps(db_.db, 1); + + for (int j = 0; j < FLAGS_seek_nexts && iter_to_use->Valid(); ++j) { + // Copy out iterator's value to make sure we read them. + Slice value = iter_to_use->value(); + memcpy(value_buffer, value.data(), + std::min(value.size(), sizeof(value_buffer))); + iter_to_use->Next(); + assert(iter_to_use->status().ok()); + } + + thread->stats.FinishedOps(&db_, db_.db, 1); } delete single_iter; for (auto iter : multi_iters) { @@ -2249,7 +2629,7 @@ class Benchmark { batch.Delete(key); } auto s = db->Write(write_options_, &batch); - thread->stats.FinishedOps(db, entries_per_batch_); + thread->stats.FinishedOps(nullptr, db, entries_per_batch_); if (!s.ok()) { fprintf(stderr, "del error: %s\n", s.ToString().c_str()); exit(1); @@ -2309,7 +2689,7 @@ class Benchmark { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); exit(1); } - thread->stats.FinishedOps(db_.db, 1); + thread->stats.FinishedOps(&db_, db_.db, 1); ++num_writes; if (writes_per_second_by_10 && num_writes >= writes_per_second_by_10) { @@ -2469,7 +2849,7 @@ class Benchmark { deletes_done++; } - thread->stats.FinishedOps(db_.db, 1); + thread->stats.FinishedOps(&db_, db_.db, 1); } char msg[100]; snprintf(msg, sizeof(msg), @@ -2527,7 +2907,7 @@ class Benchmark { put_weight--; writes_done++; } - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); } char msg[100]; snprintf(msg, sizeof(msg), "( reads:%" PRIu64 " writes:%" PRIu64 \ @@ -2552,8 +2932,13 @@ class Benchmark { DB* db = SelectDB(thread); GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key); - if (db->Get(options, key, &value).ok()) { - found++; + auto status = 
db->Get(options, key, &value); + if (status.ok()) { + ++found; + } else if (!status.IsNotFound()) { + fprintf(stderr, "Get returned an error: %s\n", + status.ToString().c_str()); + abort(); } Status s = db->Put(write_options_, key, gen.Generate(value_size_)); @@ -2561,7 +2946,7 @@ class Benchmark { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); exit(1); } - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); } char msg[100]; snprintf(msg, sizeof(msg), @@ -2586,9 +2971,13 @@ class Benchmark { DB* db = SelectDB(thread); GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key); - // Get the existing value - if (db->Get(options, key, &value).ok()) { - found++; + auto status = db->Get(options, key, &value); + if (status.ok()) { + ++found; + } else if (!status.IsNotFound()) { + fprintf(stderr, "Get returned an error: %s\n", + status.ToString().c_str()); + abort(); } else { // If not existing, then just assume an empty string of data value.clear(); @@ -2608,7 +2997,7 @@ class Benchmark { fprintf(stderr, "put error: %s\n", s.ToString().c_str()); exit(1); } - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); } char msg[100]; @@ -2644,7 +3033,7 @@ class Benchmark { fprintf(stderr, "merge error: %s\n", s.ToString().c_str()); exit(1); } - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); } // Print some statistics @@ -2705,7 +3094,7 @@ class Benchmark { } - thread->stats.FinishedOps(db, 1); + thread->stats.FinishedOps(nullptr, db, 1); } char msg[100]; @@ -2716,6 +3105,36 @@ class Benchmark { thread->stats.AddMessage(msg); } + void WriteSeqSeekSeq(ThreadState* thread) { + writes_ = FLAGS_num; + DoWrite(thread, SEQUENTIAL); + // exclude writes from the ops/sec calculation + thread->stats.Start(thread->tid); + + DB* db = SelectDB(thread); + std::unique_ptr iter( + db->NewIterator(ReadOptions(FLAGS_verify_checksum, true))); + + Slice key = AllocateKey(); + for (int64_t i = 
0; i < FLAGS_num; ++i) { + GenerateKeyFromInt(i, FLAGS_num, &key); + iter->Seek(key); + assert(iter->Valid() && iter->key() == key); + thread->stats.FinishedOps(nullptr, db, 1); + + for (int j = 0; j < FLAGS_seek_nexts && i + 1 < FLAGS_num; ++j) { + iter->Next(); + GenerateKeyFromInt(++i, FLAGS_num, &key); + assert(iter->Valid() && iter->key() == key); + thread->stats.FinishedOps(nullptr, db, 1); + } + + iter->Seek(key); + assert(iter->Valid() && iter->key() == key); + thread->stats.FinishedOps(nullptr, db, 1); + } + } + void Compact(ThreadState* thread) { DB* db = SelectDB(thread); db->CompactRange(nullptr, nullptr); @@ -2755,8 +3174,8 @@ int main(int argc, char** argv) { dbstats = rocksdb::CreateDBStatistics(); } - std::vector fanout = - rocksdb::stringSplit(FLAGS_max_bytes_for_level_multiplier_additional, ','); + std::vector fanout = rocksdb::StringSplit( + FLAGS_max_bytes_for_level_multiplier_additional, ','); for (unsigned int j= 0; j < fanout.size(); j++) { FLAGS_max_bytes_for_level_multiplier_additional_v.push_back( std::stoi(fanout[j])); @@ -2787,6 +3206,9 @@ int main(int argc, char** argv) { // The number of background threads should be at least as much the // max number of concurrent compactions. 
FLAGS_env->SetBackgroundThreads(FLAGS_max_background_compactions); + FLAGS_env->SetBackgroundThreads(FLAGS_max_background_flushes, + rocksdb::Env::Priority::HIGH); + // Choose a location for the test database if none given with --db= if (FLAGS_db.empty()) { std::string default_db_path; diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index 4185a40ca..4011b4652 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -9,29 +9,35 @@ #ifndef ROCKSDB_LITE +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include #include #include "db/db_impl.h" #include "db/filename.h" +#include "db/job_context.h" #include "db/version_set.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "port/port.h" #include "util/mutexlock.h" #include "util/sync_point.h" +#include "util/file_util.h" namespace rocksdb { Status DBImpl::DisableFileDeletions() { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); ++disable_delete_obsolete_files_; if (disable_delete_obsolete_files_ == 1) { - Log(options_.info_log, "File Deletions Disabled"); + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "File Deletions Disabled"); } else { - Log(options_.info_log, + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "File Deletions Disabled, but already disabled. 
Counter: %d", disable_delete_obsolete_files_); } @@ -39,10 +45,10 @@ Status DBImpl::DisableFileDeletions() { } Status DBImpl::EnableFileDeletions(bool force) { - DeletionState deletion_state; + JobContext job_context; bool should_purge_files = false; { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); if (force) { // if force, we need to enable file deletions right away disable_delete_obsolete_files_ = 0; @@ -50,19 +56,21 @@ Status DBImpl::EnableFileDeletions(bool force) { --disable_delete_obsolete_files_; } if (disable_delete_obsolete_files_ == 0) { - Log(options_.info_log, "File Deletions Enabled"); + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "File Deletions Enabled"); should_purge_files = true; - FindObsoleteFiles(deletion_state, true); + FindObsoleteFiles(&job_context, true); } else { - Log(options_.info_log, + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "File Deletions Enable, but not really enabled. Counter: %d", disable_delete_obsolete_files_); } } if (should_purge_files) { - PurgeObsoleteFiles(deletion_state); + PurgeObsoleteFiles(job_context); } - LogFlush(options_.info_log); + job_context.Clean(); + LogFlush(db_options_.info_log); return Status::OK(); } @@ -95,8 +103,8 @@ Status DBImpl::GetLiveFiles(std::vector& ret, if (!status.ok()) { mutex_.Unlock(); - Log(options_.info_log, "Cannot Flush data %s\n", - status.ToString().c_str()); + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Cannot Flush data %s\n", status.ToString().c_str()); return status; } } @@ -117,65 +125,17 @@ Status DBImpl::GetLiveFiles(std::vector& ret, } ret.push_back(CurrentFileName("")); - ret.push_back(DescriptorFileName("", versions_->ManifestFileNumber())); + ret.push_back(DescriptorFileName("", versions_->manifest_file_number())); // find length of manifest file while holding the mutex lock - *manifest_file_size = versions_->ManifestFileSize(); + *manifest_file_size = versions_->manifest_file_size(); mutex_.Unlock(); return Status::OK(); } 
Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) { - // First get sorted files in db dir, then get sorted files from archived - // dir, to avoid a race condition where a log file is moved to archived - // dir in between. - Status s; - // list wal files in main db dir. - VectorLogPtr logs; - s = GetSortedWalsOfType(options_.wal_dir, logs, kAliveLogFile); - if (!s.ok()) { - return s; - } - - // Reproduce the race condition where a log file is moved - // to archived dir, between these two sync points, used in - // (DBTest,TransactionLogIteratorRace) - TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:1"); - TEST_SYNC_POINT("DBImpl::GetSortedWalFiles:2"); - - files.clear(); - // list wal files in archive dir. - std::string archivedir = ArchivalDirectory(options_.wal_dir); - if (env_->FileExists(archivedir)) { - s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile); - if (!s.ok()) { - return s; - } - } - - uint64_t latest_archived_log_number = 0; - if (!files.empty()) { - latest_archived_log_number = files.back()->LogNumber(); - Log(options_.info_log, "Latest Archived log: %" PRIu64, - latest_archived_log_number); - } - - files.reserve(files.size() + logs.size()); - for (auto& log : logs) { - if (log->LogNumber() > latest_archived_log_number) { - files.push_back(std::move(log)); - } else { - // When the race condition happens, we could see the - // same log in both db dir and archived dir. Simply - // ignore the one in db dir. Note that, if we read - // archived dir first, we would have missed the log file. 
- Log(options_.info_log, "%s already moved to archive", - log->PathName().c_str()); - } - } - - return s; + return wal_manager_.GetSortedWalFiles(files); } } diff --git a/db/db_impl.cc b/db/db_impl.cc index 9cb09d719..570928b1e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -9,7 +9,10 @@ #include "db/db_impl.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include @@ -24,9 +27,12 @@ #include #include "db/builder.h" +#include "db/flush_job.h" +#include "db/compaction_job.h" #include "db/db_iter.h" #include "db/dbformat.h" #include "db/filename.h" +#include "db/job_context.h" #include "db/log_reader.h" #include "db/log_writer.h" #include "db/memtable.h" @@ -38,6 +44,7 @@ #include "db/forward_iterator.h" #include "db/transaction_log_impl.h" #include "db/version_set.h" +#include "db/writebuffer.h" #include "db/write_batch_internal.h" #include "port/port.h" #include "rocksdb/cache.h" @@ -46,6 +53,7 @@ #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" +#include "rocksdb/version.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" @@ -58,6 +66,8 @@ #include "util/autovector.h" #include "util/build_version.h" #include "util/coding.h" +#include "util/db_info_dumper.h" +#include "util/file_util.h" #include "util/hash_skiplist_rep.h" #include "util/hash_linklist_rep.h" #include "util/logging.h" @@ -67,30 +77,20 @@ #include "util/iostats_context_imp.h" #include "util/stop_watch.h" #include "util/sync_point.h" +#include "util/string_util.h" +#include "util/thread_status_updater.h" +#include "util/thread_status_util.h" namespace rocksdb { const std::string kDefaultColumnFamilyName("default"); -void DumpLeveldbBuildVersion(Logger * log); - -// Information kept for every waiting writer -struct DBImpl::Writer { - Status status; - WriteBatch* batch; - bool sync; - bool disableWAL; - bool in_batch_group; - bool done; - uint64_t timeout_hint_us; - port::CondVar cv; - - 
explicit Writer(port::Mutex* mu) : cv(mu) { } -}; +void DumpRocksDBBuildVersion(Logger * log); struct DBImpl::WriteContext { autovector superversions_to_free_; autovector logs_to_free_; + bool schedule_bg_work_ = false; ~WriteContext() { for (auto& sv : superversions_to_free_) { @@ -102,148 +102,6 @@ struct DBImpl::WriteContext { } }; -struct DBImpl::CompactionState { - Compaction* const compaction; - - // If there were two snapshots with seq numbers s1 and - // s2 and s1 < s2, and if we find two instances of a key k1 then lies - // entirely within s1 and s2, then the earlier version of k1 can be safely - // deleted because that version is not visible in any snapshot. - std::vector existing_snapshots; - - // Files produced by compaction - struct Output { - uint64_t number; - uint32_t path_id; - uint64_t file_size; - InternalKey smallest, largest; - SequenceNumber smallest_seqno, largest_seqno; - }; - std::vector outputs; - std::list allocated_file_numbers; - - // State kept for output being generated - unique_ptr outfile; - unique_ptr builder; - - uint64_t total_bytes; - - Output* current_output() { return &outputs[outputs.size()-1]; } - - explicit CompactionState(Compaction* c) - : compaction(c), - total_bytes(0) { - } - - // Create a client visible context of this compaction - CompactionFilter::Context GetFilterContextV1() { - CompactionFilter::Context context; - context.is_full_compaction = compaction->IsFullCompaction(); - context.is_manual_compaction = compaction->IsManualCompaction(); - return context; - } - - // Create a client visible context of this compaction - CompactionFilterContext GetFilterContext() { - CompactionFilterContext context; - context.is_full_compaction = compaction->IsFullCompaction(); - context.is_manual_compaction = compaction->IsManualCompaction(); - return context; - } - - std::vector key_str_buf_; - std::vector existing_value_str_buf_; - // new_value_buf_ will only be appended if a value changes - std::vector new_value_buf_; - // if 
values_changed_buf_[i] is true - // new_value_buf_ will add a new entry with the changed value - std::vector value_changed_buf_; - // to_delete_buf_[i] is true iff key_buf_[i] is deleted - std::vector to_delete_buf_; - - std::vector other_key_str_buf_; - std::vector other_value_str_buf_; - - std::vector combined_key_buf_; - std::vector combined_value_buf_; - - std::string cur_prefix_; - - // Buffers the kv-pair that will be run through compaction filter V2 - // in the future. - void BufferKeyValueSlices(const Slice& key, const Slice& value) { - key_str_buf_.emplace_back(key.ToString()); - existing_value_str_buf_.emplace_back(value.ToString()); - } - - // Buffers the kv-pair that will not be run through compaction filter V2 - // in the future. - void BufferOtherKeyValueSlices(const Slice& key, const Slice& value) { - other_key_str_buf_.emplace_back(key.ToString()); - other_value_str_buf_.emplace_back(value.ToString()); - } - - // Add a kv-pair to the combined buffer - void AddToCombinedKeyValueSlices(const Slice& key, const Slice& value) { - // The real strings are stored in the batch buffers - combined_key_buf_.emplace_back(key); - combined_value_buf_.emplace_back(value); - } - - // Merging the two buffers - void MergeKeyValueSliceBuffer(const InternalKeyComparator* comparator) { - size_t i = 0; - size_t j = 0; - size_t total_size = key_str_buf_.size() + other_key_str_buf_.size(); - combined_key_buf_.reserve(total_size); - combined_value_buf_.reserve(total_size); - - while (i + j < total_size) { - int comp_res = 0; - if (i < key_str_buf_.size() && j < other_key_str_buf_.size()) { - comp_res = comparator->Compare(key_str_buf_[i], other_key_str_buf_[j]); - } else if (i >= key_str_buf_.size() && j < other_key_str_buf_.size()) { - comp_res = 1; - } else if (j >= other_key_str_buf_.size() && i < key_str_buf_.size()) { - comp_res = -1; - } - if (comp_res > 0) { - AddToCombinedKeyValueSlices(other_key_str_buf_[j], other_value_str_buf_[j]); - j++; - } else if (comp_res < 
0) { - AddToCombinedKeyValueSlices(key_str_buf_[i], existing_value_str_buf_[i]); - i++; - } - } - } - - void CleanupBatchBuffer() { - to_delete_buf_.clear(); - key_str_buf_.clear(); - existing_value_str_buf_.clear(); - new_value_buf_.clear(); - value_changed_buf_.clear(); - - to_delete_buf_.shrink_to_fit(); - key_str_buf_.shrink_to_fit(); - existing_value_str_buf_.shrink_to_fit(); - new_value_buf_.shrink_to_fit(); - value_changed_buf_.shrink_to_fit(); - - other_key_str_buf_.clear(); - other_value_str_buf_.clear(); - other_key_str_buf_.shrink_to_fit(); - other_value_str_buf_.shrink_to_fit(); - } - - void CleanupMergedBuffer() { - combined_key_buf_.clear(); - combined_value_buf_.clear(); - combined_key_buf_.shrink_to_fit(); - combined_value_buf_.shrink_to_fit(); - } -}; - Options SanitizeOptions(const std::string& dbname, const InternalKeyComparator* icmp, const Options& src) { @@ -268,6 +126,10 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { result.info_log = nullptr; } } + result.env->IncBackgroundThreadsIfNeeded(src.max_background_compactions, + Env::Priority::LOW); + result.env->IncBackgroundThreadsIfNeeded(src.max_background_flushes, + Env::Priority::HIGH); if (!result.rate_limiter) { if (result.bytes_per_sync == 0) { @@ -290,12 +152,14 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { return result; } -Status SanitizeDBOptionsByCFOptions( - DBOptions* db_opts, +namespace { + +Status SanitizeOptionsByTable( + const DBOptions& db_opts, const std::vector& column_families) { Status s; for (auto cf : column_families) { - s = cf.options.table_factory->SanitizeDBOptions(db_opts); + s = cf.options.table_factory->SanitizeOptions(db_opts, cf.options); if (!s.ok()) { return s; } @@ -303,25 +167,24 @@ Status SanitizeDBOptionsByCFOptions( return Status::OK(); } -namespace { -CompressionType GetCompressionFlush(const Options& options) { +CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions) { // 
Compressing memtable flushes might not help unless the sequential load // optimization is used for leveled compaction. Otherwise the CPU and // latency overhead is not offset by saving much space. bool can_compress; - if (options.compaction_style == kCompactionStyleUniversal) { + if (ioptions.compaction_style == kCompactionStyleUniversal) { can_compress = - (options.compaction_options_universal.compression_size_percent < 0); + (ioptions.compaction_options_universal.compression_size_percent < 0); } else { // For leveled compress when min_level_to_compress == 0. - can_compress = options.compression_per_level.empty() || - options.compression_per_level[0] != kNoCompression; + can_compress = ioptions.compression_per_level.empty() || + ioptions.compression_per_level[0] != kNoCompression; } if (can_compress) { - return options.compression; + return ioptions.compression; } else { return kNoCompression; } @@ -331,62 +194,73 @@ CompressionType GetCompressionFlush(const Options& options) { DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) : env_(options.env), dbname_(dbname), - options_(SanitizeOptions(dbname, options)), - stats_(options_.statistics.get()), + db_options_(SanitizeOptions(dbname, options)), + stats_(db_options_.statistics.get()), db_lock_(nullptr), - mutex_(options.use_adaptive_mutex), - shutting_down_(nullptr), + mutex_(stats_, env_, + DB_MUTEX_WAIT_MICROS, + options.use_adaptive_mutex), + shutting_down_(false), bg_cv_(&mutex_), logfile_number_(0), + log_dir_synced_(false), log_empty_(true), default_cf_handle_(nullptr), total_log_size_(0), max_total_in_memory_state_(0), - tmp_batch_(), - bg_schedule_needed_(false), + is_snapshot_supported_(true), + write_buffer_(options.db_write_buffer_size), + unscheduled_flushes_(0), + unscheduled_compactions_(0), bg_compaction_scheduled_(0), bg_manual_only_(0), bg_flush_scheduled_(0), manual_compaction_(nullptr), disable_delete_obsolete_files_(0), - 
delete_obsolete_files_last_run_(options.env->NowMicros()), - purge_wal_files_last_run_(0), + delete_obsolete_files_next_run_( + options.env->NowMicros() + + db_options_.delete_obsolete_files_period_micros), last_stats_dump_time_microsec_(0), - default_interval_to_delete_obsolete_WAL_(600), flush_on_destroy_(false), - delayed_writes_(0), - storage_options_(options), + env_options_(options), +#ifndef ROCKSDB_LITE + wal_manager_(db_options_, env_options_), +#endif // ROCKSDB_LITE bg_work_gate_closed_(false), refitting_level_(false), - opened_successfully_(false) { + opened_successfully_(false), + notifying_events_(0) { env_->GetAbsolutePath(dbname, &db_absolute_path_); // Reserve ten files or so for other uses and give the rest to TableCache. // Give a large number for setting of "infinite" open files. - const int table_cache_size = - (options_.max_open_files == -1) ? 4194304 : options_.max_open_files - 10; + const int table_cache_size = (db_options_.max_open_files == -1) ? + 4194304 : db_options_.max_open_files - 10; // Reserve ten files or so for other uses and give the rest to TableCache. 
table_cache_ = - NewLRUCache(table_cache_size, options_.table_cache_numshardbits, - options_.table_cache_remove_scan_count_limit); + NewLRUCache(table_cache_size, db_options_.table_cache_numshardbits, + db_options_.table_cache_remove_scan_count_limit); - versions_.reset( - new VersionSet(dbname_, &options_, storage_options_, table_cache_.get())); - column_family_memtables_.reset( - new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet())); + versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, + table_cache_.get(), &write_buffer_, + &write_controller_)); + column_family_memtables_.reset(new ColumnFamilyMemTablesImpl( + versions_->GetColumnFamilySet(), &flush_scheduler_)); - DumpLeveldbBuildVersion(options_.info_log.get()); - DumpDBFileSummary(options_, dbname_); - options_.Dump(options_.info_log.get()); + DumpRocksDBBuildVersion(db_options_.info_log.get()); + DumpDBFileSummary(db_options_, dbname_); + db_options_.Dump(db_options_.info_log.get()); - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); } DBImpl::~DBImpl() { + EraseThreadStatusDbInfo(); mutex_.Lock(); + if (flush_on_destroy_) { for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->mem()->GetFirstSequenceNumber() != 0) { + if (!cfd->mem()->IsEmpty()) { cfd->Ref(); mutex_.Unlock(); FlushMemTable(cfd, FlushOptions()); @@ -398,10 +272,25 @@ DBImpl::~DBImpl() { } // Wait for background work to finish - shutting_down_.Release_Store(this); // Any non-nullptr value is ok - while (bg_compaction_scheduled_ || bg_flush_scheduled_) { + shutting_down_.store(true, std::memory_order_release); + while (bg_compaction_scheduled_ || bg_flush_scheduled_ || notifying_events_) { bg_cv_.Wait(); } + listeners_.clear(); + flush_scheduler_.Clear(); + + while (!flush_queue_.empty()) { + auto cfd = PopFirstFromFlushQueue(); + if (cfd->Unref()) { + delete cfd; + } + } + while (!compaction_queue_.empty()) { + auto cfd = PopFirstFromCompactionQueue(); + if (cfd->Unref()) { + delete cfd; + } + 
} if (default_cf_handle_ != nullptr) { // we need to delete handle outside of lock because it does its own locking @@ -410,25 +299,24 @@ DBImpl::~DBImpl() { mutex_.Lock(); } - if (options_.allow_thread_local) { - // Clean up obsolete files due to SuperVersion release. - // (1) Need to delete to obsolete files before closing because RepairDB() - // scans all existing files in the file system and builds manifest file. - // Keeping obsolete files confuses the repair process. - // (2) Need to check if we Open()/Recover() the DB successfully before - // deleting because if VersionSet recover fails (may be due to corrupted - // manifest file), it is not able to identify live files correctly. As a - // result, all "live" files can get deleted by accident. However, corrupted - // manifest is recoverable by RepairDB(). - if (opened_successfully_) { - DeletionState deletion_state; - FindObsoleteFiles(deletion_state, true); - // manifest number starting from 2 - deletion_state.manifest_file_number = 1; - if (deletion_state.HaveSomethingToDelete()) { - PurgeObsoleteFiles(deletion_state); - } + // Clean up obsolete files due to SuperVersion release. + // (1) Need to delete to obsolete files before closing because RepairDB() + // scans all existing files in the file system and builds manifest file. + // Keeping obsolete files confuses the repair process. + // (2) Need to check if we Open()/Recover() the DB successfully before + // deleting because if VersionSet recover fails (may be due to corrupted + // manifest file), it is not able to identify live files correctly. As a + // result, all "live" files can get deleted by accident. However, corrupted + // manifest is recoverable by RepairDB(). 
+ if (opened_successfully_) { + JobContext job_context; + FindObsoleteFiles(&job_context, true); + // manifest number starting from 2 + job_context.manifest_file_number = 1; + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); } + job_context.Clean(); } // versions need to be destroyed before table_cache since it can hold @@ -439,7 +327,7 @@ DBImpl::~DBImpl() { env_->UnlockFile(db_lock_); } - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); } Status DBImpl::NewDB() { @@ -448,24 +336,28 @@ Status DBImpl::NewDB() { new_db.SetNextFile(2); new_db.SetLastSequence(0); - Log(options_.info_log, "Creating manifest 1 \n"); + Log(InfoLogLevel::INFO_LEVEL, + db_options_.info_log, "Creating manifest 1 \n"); const std::string manifest = DescriptorFileName(dbname_, 1); unique_ptr file; Status s = env_->NewWritableFile( - manifest, &file, env_->OptimizeForManifestWrite(storage_options_)); + manifest, &file, env_->OptimizeForManifestWrite(env_options_)); if (!s.ok()) { return s; } - file->SetPreallocationBlockSize(options_.manifest_preallocation_size); + file->SetPreallocationBlockSize(db_options_.manifest_preallocation_size); { log::Writer log(std::move(file)); std::string record; new_db.EncodeTo(&record); s = log.AddRecord(record); + if (s.ok()) { + s = SyncManifest(env_, &db_options_, log.file()); + } } if (s.ok()) { // Make "CURRENT" file that points to the new manifest file. 
- s = SetCurrentFile(env_, dbname_, 1, db_directory_.get()); + s = SetCurrentFile(env_, dbname_, 1, directories_.GetDbDir()); } else { env_->DeleteFile(manifest); } @@ -473,38 +365,39 @@ Status DBImpl::NewDB() { } void DBImpl::MaybeIgnoreError(Status* s) const { - if (s->ok() || options_.paranoid_checks) { + if (s->ok() || db_options_.paranoid_checks) { // No change needed } else { - Log(options_.info_log, "Ignoring error %s", s->ToString().c_str()); + Log(InfoLogLevel::WARN_LEVEL, + db_options_.info_log, "Ignoring error %s", s->ToString().c_str()); *s = Status::OK(); } } const Status DBImpl::CreateArchivalDirectory() { - if (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0) { - std::string archivalPath = ArchivalDirectory(options_.wal_dir); + if (db_options_.WAL_ttl_seconds > 0 || db_options_.WAL_size_limit_MB > 0) { + std::string archivalPath = ArchivalDirectory(db_options_.wal_dir); return env_->CreateDirIfMissing(archivalPath); } return Status::OK(); } void DBImpl::PrintStatistics() { - auto dbstats = options_.statistics.get(); + auto dbstats = db_options_.statistics.get(); if (dbstats) { - Log(options_.info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "STATISTCS:\n %s", dbstats->ToString().c_str()); } } void DBImpl::MaybeDumpStats() { - if (options_.stats_dump_period_sec == 0) return; + if (db_options_.stats_dump_period_sec == 0) return; const uint64_t now_micros = env_->NowMicros(); if (last_stats_dump_time_microsec_ + - options_.stats_dump_period_sec * 1000000 + db_options_.stats_dump_period_sec * 1000000 <= now_micros) { // Multiple threads could race in here simultaneously. 
// However, the last one will update last_stats_dump_time_microsec_ @@ -520,7 +413,7 @@ void DBImpl::MaybeDumpStats() { GetPropertyType("rocksdb.dbstats", &tmp1, &tmp2); std::string stats; { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); for (auto cfd : *versions_->GetColumnFamilySet()) { cfd->internal_stats()->GetStringProperty(cf_property_type, "rocksdb.cfstats", &stats); @@ -528,21 +421,25 @@ void DBImpl::MaybeDumpStats() { default_cf_internal_stats_->GetStringProperty(db_property_type, "rocksdb.dbstats", &stats); } - Log(options_.info_log, "------- DUMPING STATS -------"); - Log(options_.info_log, "%s", stats.c_str()); + Log(InfoLogLevel::INFO_LEVEL, + db_options_.info_log, "------- DUMPING STATS -------"); + Log(InfoLogLevel::INFO_LEVEL, + db_options_.info_log, "%s", stats.c_str()); PrintStatistics(); } } -// Returns the list of live files in 'sst_live' and the list -// of all files in the filesystem in 'candidate_files'. +// * Returns the list of live files in 'sst_live' +// If it's doing full scan: +// * Returns the list of all files in the filesystem in +// 'full_scan_candidate_files'. +// Otherwise, gets obsolete files from VersionSet. 
// no_full_scan = true -- never do the full scan using GetChildren() // force = false -- don't force the full scan, except every -// options_.delete_obsolete_files_period_micros +// db_options_.delete_obsolete_files_period_micros // force = true -- force the full scan -void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, - bool force, +void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, bool no_full_scan) { mutex_.AssertHeld(); @@ -556,82 +453,82 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, // logic for figurint out if we're doing the full scan if (no_full_scan) { doing_the_full_scan = false; - } else if (force || options_.delete_obsolete_files_period_micros == 0) { + } else if (force || db_options_.delete_obsolete_files_period_micros == 0) { doing_the_full_scan = true; } else { const uint64_t now_micros = env_->NowMicros(); - if (delete_obsolete_files_last_run_ + - options_.delete_obsolete_files_period_micros < now_micros) { + if (delete_obsolete_files_next_run_ < now_micros) { doing_the_full_scan = true; - delete_obsolete_files_last_run_ = now_micros; + delete_obsolete_files_next_run_ = + now_micros + db_options_.delete_obsolete_files_period_micros; } } + // don't delete files that might be currently written to from compaction + // threads + if (!pending_outputs_.empty()) { + job_context->min_pending_output = *pending_outputs_.begin(); + } else { + // delete all of them + job_context->min_pending_output = std::numeric_limits::max(); + } + // get obsolete files - versions_->GetObsoleteFiles(&deletion_state.sst_delete_files); + versions_->GetObsoleteFiles(&job_context->sst_delete_files, + job_context->min_pending_output); // store the current filenum, lognum, etc - deletion_state.manifest_file_number = versions_->ManifestFileNumber(); - deletion_state.pending_manifest_file_number = - versions_->PendingManifestFileNumber(); - deletion_state.log_number = versions_->MinLogNumber(); - deletion_state.prev_log_number = 
versions_->PrevLogNumber(); - - if (!doing_the_full_scan && !deletion_state.HaveSomethingToDelete()) { - // avoid filling up sst_live if we're sure that we - // are not going to do the full scan and that we don't have - // anything to delete at the moment - return; - } - - // don't delete live files - for (auto pair : pending_outputs_) { - deletion_state.sst_live.emplace_back(pair.first, pair.second, 0); - } - /* deletion_state.sst_live.insert(pending_outputs_.begin(), - pending_outputs_.end());*/ - versions_->AddLiveFiles(&deletion_state.sst_live); + job_context->manifest_file_number = versions_->manifest_file_number(); + job_context->pending_manifest_file_number = + versions_->pending_manifest_file_number(); + job_context->log_number = versions_->MinLogNumber(); + job_context->prev_log_number = versions_->prev_log_number(); + versions_->AddLiveFiles(&job_context->sst_live); if (doing_the_full_scan) { - for (uint32_t path_id = 0; path_id < options_.db_paths.size(); path_id++) { + for (uint32_t path_id = 0; path_id < db_options_.db_paths.size(); + path_id++) { // set of all files in the directory. We'll exclude files that are still // alive in the subsequent processings. 
std::vector files; - env_->GetChildren(options_.db_paths[path_id].path, + env_->GetChildren(db_options_.db_paths[path_id].path, &files); // Ignore errors for (std::string file : files) { - deletion_state.candidate_files.emplace_back(file, path_id); + // TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes + job_context->full_scan_candidate_files.emplace_back("/" + file, + path_id); } } //Add log files in wal_dir - if (options_.wal_dir != dbname_) { + if (db_options_.wal_dir != dbname_) { std::vector log_files; - env_->GetChildren(options_.wal_dir, &log_files); // Ignore errors + env_->GetChildren(db_options_.wal_dir, &log_files); // Ignore errors for (std::string log_file : log_files) { - deletion_state.candidate_files.emplace_back(log_file, 0); + job_context->full_scan_candidate_files.emplace_back(log_file, 0); } } // Add info log files in db_log_dir - if (!options_.db_log_dir.empty() && options_.db_log_dir != dbname_) { + if (!db_options_.db_log_dir.empty() && db_options_.db_log_dir != dbname_) { std::vector info_log_files; - env_->GetChildren(options_.db_log_dir, &info_log_files); // Ignore errors + // Ignore errors + env_->GetChildren(db_options_.db_log_dir, &info_log_files); for (std::string log_file : info_log_files) { - deletion_state.candidate_files.emplace_back(log_file, 0); + job_context->full_scan_candidate_files.emplace_back(log_file, 0); } } } } namespace { -bool CompareCandidateFile(const rocksdb::DBImpl::CandidateFileInfo& first, - const rocksdb::DBImpl::CandidateFileInfo& second) { +bool CompareCandidateFile(const JobContext::CandidateFileInfo& first, + const JobContext::CandidateFileInfo& second) { if (first.file_name > second.file_name) { return true; } else if (first.file_name < second.file_name) { return false; } else { - return (first.path_id > first.path_id); + return (first.path_id > second.path_id); } } }; // namespace @@ -640,7 +537,7 @@ bool CompareCandidateFile(const rocksdb::DBImpl::CandidateFileInfo& first, // belong to 
live files are posibly removed. Also, removes all the // files in sst_delete_files and log_delete_files. // It is not necessary to hold the mutex when invoking this method. -void DBImpl::PurgeObsoleteFiles(DeletionState& state) { +void DBImpl::PurgeObsoleteFiles(const JobContext& state) { // we'd better have sth to delete assert(state.HaveSomethingToDelete()); @@ -654,15 +551,14 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { // Now, convert live list to an unordered map, WITHOUT mutex held; // set is slow. std::unordered_map sst_live_map; - for (FileDescriptor& fd : state.sst_live) { + for (const FileDescriptor& fd : state.sst_live) { sst_live_map[fd.GetNumber()] = &fd; } - auto& candidate_files = state.candidate_files; - candidate_files.reserve( - candidate_files.size() + - state.sst_delete_files.size() + - state.log_delete_files.size()); + auto candidate_files = state.full_scan_candidate_files; + candidate_files.reserve(candidate_files.size() + + state.sst_delete_files.size() + + state.log_delete_files.size()); // We may ignore the dbname when generating the file names. 
const char* kDumbDbName = ""; for (auto file : state.sst_delete_files) { @@ -686,7 +582,7 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { candidate_files.end()); std::vector old_info_log_files; - InfoLogPrefix info_log_prefix(!options_.db_log_dir.empty(), dbname_); + InfoLogPrefix info_log_prefix(!db_options_.db_log_dir.empty(), dbname_); for (const auto& candidate_file : candidate_files) { std::string to_delete = candidate_file.file_name; uint32_t path_id = candidate_file.path_id; @@ -709,7 +605,10 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { keep = (number >= state.manifest_file_number); break; case kTableFile: - keep = (sst_live_map.find(number) != sst_live_map.end()); + // If the second condition is not there, this makes + // DontDeletePendingOutputs fail + keep = (sst_live_map.find(number) != sst_live_map.end()) || + number >= state.min_pending_output; break; case kTempFile: // Any temp files that are currently being written to must @@ -742,377 +641,125 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { if (type == kTableFile) { // evict from cache TableCache::Evict(table_cache_.get(), number); - fname = TableFileName(options_.db_paths, number, path_id); + fname = TableFileName(db_options_.db_paths, number, path_id); } else { - fname = - ((type == kLogFile) ? options_.wal_dir : dbname_) + "/" + to_delete; + fname = ((type == kLogFile) ? 
+ db_options_.wal_dir : dbname_) + "/" + to_delete; } - if (type == kLogFile && - (options_.WAL_ttl_seconds > 0 || options_.WAL_size_limit_MB > 0)) { - auto archived_log_name = ArchivedLogFileName(options_.wal_dir, number); - // The sync point below is used in (DBTest,TransactionLogIteratorRace) - TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:1"); - Status s = env_->RenameFile(fname, archived_log_name); - // The sync point below is used in (DBTest,TransactionLogIteratorRace) - TEST_SYNC_POINT("DBImpl::PurgeObsoleteFiles:2"); - Log(options_.info_log, - "Move log file %s to %s -- %s\n", - fname.c_str(), archived_log_name.c_str(), s.ToString().c_str()); +#ifdef ROCKSDB_LITE + Status s = env_->DeleteFile(fname); + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "Delete %s type=%d #%" PRIu64 " -- %s\n", + fname.c_str(), type, number, s.ToString().c_str()); +#else // not ROCKSDB_LITE + if (type == kLogFile && (db_options_.WAL_ttl_seconds > 0 || + db_options_.WAL_size_limit_MB > 0)) { + wal_manager_.ArchiveWALFile(fname, number); } else { Status s = env_->DeleteFile(fname); - Log(options_.info_log, "Delete %s type=%d #%" PRIu64 " -- %s\n", + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "Delete %s type=%d #%" PRIu64 " -- %s\n", fname.c_str(), type, number, s.ToString().c_str()); } +#endif // ROCKSDB_LITE } // Delete old info log files. size_t old_info_log_file_count = old_info_log_files.size(); - if (old_info_log_file_count >= options_.keep_log_file_num) { + if (old_info_log_file_count >= db_options_.keep_log_file_num) { std::sort(old_info_log_files.begin(), old_info_log_files.end()); - size_t end = old_info_log_file_count - options_.keep_log_file_num; + size_t end = old_info_log_file_count - db_options_.keep_log_file_num; for (unsigned int i = 0; i <= end; i++) { std::string& to_delete = old_info_log_files.at(i); - std::string full_path_to_delete = - (options_.db_log_dir.empty() ? 
dbname_ : options_.db_log_dir) + "/" + - to_delete; - Log(options_.info_log, "Delete info log file %s\n", + std::string full_path_to_delete = (db_options_.db_log_dir.empty() ? + dbname_ : db_options_.db_log_dir) + "/" + to_delete; + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Delete info log file %s\n", full_path_to_delete.c_str()); Status s = env_->DeleteFile(full_path_to_delete); if (!s.ok()) { - Log(options_.info_log, "Delete info log file %s FAILED -- %s\n", + Log(InfoLogLevel::ERROR_LEVEL, + db_options_.info_log, "Delete info log file %s FAILED -- %s\n", to_delete.c_str(), s.ToString().c_str()); } } } - PurgeObsoleteWALFiles(); - LogFlush(options_.info_log); +#ifndef ROCKSDB_LITE + wal_manager_.PurgeObsoleteWALFiles(); +#endif // ROCKSDB_LITE + LogFlush(db_options_.info_log); } void DBImpl::DeleteObsoleteFiles() { mutex_.AssertHeld(); - DeletionState deletion_state; - FindObsoleteFiles(deletion_state, true); - if (deletion_state.HaveSomethingToDelete()) { - PurgeObsoleteFiles(deletion_state); - } -} - -#ifndef ROCKSDB_LITE -// 1. Go through all archived files and -// a. if ttl is enabled, delete outdated files -// b. if archive size limit is enabled, delete empty files, -// compute file number and size. -// 2. If size limit is enabled: -// a. compute how many files should be deleted -// b. get sorted non-empty archived logs -// c. 
delete what should be deleted -void DBImpl::PurgeObsoleteWALFiles() { - bool const ttl_enabled = options_.WAL_ttl_seconds > 0; - bool const size_limit_enabled = options_.WAL_size_limit_MB > 0; - if (!ttl_enabled && !size_limit_enabled) { - return; - } - - int64_t current_time; - Status s = env_->GetCurrentTime(¤t_time); + JobContext job_context; + FindObsoleteFiles(&job_context, true); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + } + job_context.Clean(); +} + +Status DBImpl::Directories::CreateAndNewDirectory( + Env* env, const std::string& dirname, + std::unique_ptr* directory) const { + // We call CreateDirIfMissing() as the directory may already exist (if we + // are reopening a DB), when this happens we don't want creating the + // directory to cause an error. However, we need to check if creating the + // directory fails or else we may get an obscure message about the lock + // file not existing. One real-world example of this occurring is if + // env->CreateDirIfMissing() doesn't create intermediate directories, e.g. + // when dbname_ is "dir/db" but when "dir" doesn't exist. + Status s = env->CreateDirIfMissing(dirname); if (!s.ok()) { - Log(options_.info_log, "Can't get current time: %s", s.ToString().c_str()); - assert(false); - return; - } - uint64_t const now_seconds = static_cast(current_time); - uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled) ? 
- options_.WAL_ttl_seconds / 2 : default_interval_to_delete_obsolete_WAL_; - - if (purge_wal_files_last_run_ + time_to_check > now_seconds) { - return; + return s; } + return env->NewDirectory(dirname, directory); +} - purge_wal_files_last_run_ = now_seconds; - - std::string archival_dir = ArchivalDirectory(options_.wal_dir); - std::vector files; - s = env_->GetChildren(archival_dir, &files); +Status DBImpl::Directories::SetDirectories( + Env* env, const std::string& dbname, const std::string& wal_dir, + const std::vector& data_paths) { + Status s = CreateAndNewDirectory(env, dbname, &db_dir_); if (!s.ok()) { - Log(options_.info_log, "Can't get archive files: %s", s.ToString().c_str()); - assert(false); - return; - } - - size_t log_files_num = 0; - uint64_t log_file_size = 0; - - for (auto& f : files) { - uint64_t number; - FileType type; - if (ParseFileName(f, &number, &type) && type == kLogFile) { - std::string const file_path = archival_dir + "/" + f; - if (ttl_enabled) { - uint64_t file_m_time; - Status const s = env_->GetFileModificationTime(file_path, - &file_m_time); - if (!s.ok()) { - Log(options_.info_log, "Can't get file mod time: %s: %s", - file_path.c_str(), s.ToString().c_str()); - continue; - } - if (now_seconds - file_m_time > options_.WAL_ttl_seconds) { - Status const s = env_->DeleteFile(file_path); - if (!s.ok()) { - Log(options_.info_log, "Can't delete file: %s: %s", - file_path.c_str(), s.ToString().c_str()); - continue; - } else { - MutexLock l(&read_first_record_cache_mutex_); - read_first_record_cache_.erase(number); - } - continue; - } - } - - if (size_limit_enabled) { - uint64_t file_size; - Status const s = env_->GetFileSize(file_path, &file_size); - if (!s.ok()) { - Log(options_.info_log, "Can't get file size: %s: %s", - file_path.c_str(), s.ToString().c_str()); - return; - } else { - if (file_size > 0) { - log_file_size = std::max(log_file_size, file_size); - ++log_files_num; - } else { - Status s = env_->DeleteFile(file_path); - if 
(!s.ok()) { - Log(options_.info_log, "Can't delete file: %s: %s", - file_path.c_str(), s.ToString().c_str()); - continue; - } else { - MutexLock l(&read_first_record_cache_mutex_); - read_first_record_cache_.erase(number); - } - } - } - } - } - } - - if (0 == log_files_num || !size_limit_enabled) { - return; - } - - size_t const files_keep_num = options_.WAL_size_limit_MB * - 1024 * 1024 / log_file_size; - if (log_files_num <= files_keep_num) { - return; - } - - size_t files_del_num = log_files_num - files_keep_num; - VectorLogPtr archived_logs; - GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); - - if (files_del_num > archived_logs.size()) { - Log(options_.info_log, "Trying to delete more archived log files than " - "exist. Deleting all"); - files_del_num = archived_logs.size(); + return s; } - - for (size_t i = 0; i < files_del_num; ++i) { - std::string const file_path = archived_logs[i]->PathName(); - Status const s = DeleteFile(file_path); + if (!wal_dir.empty() && dbname != wal_dir) { + s = CreateAndNewDirectory(env, wal_dir, &wal_dir_); if (!s.ok()) { - Log(options_.info_log, "Can't delete file: %s: %s", - file_path.c_str(), s.ToString().c_str()); - continue; - } else { - MutexLock l(&read_first_record_cache_mutex_); - read_first_record_cache_.erase(archived_logs[i]->LogNumber()); + return s; } } -} - -namespace { -struct CompareLogByPointer { - bool operator()(const unique_ptr& a, const unique_ptr& b) { - LogFileImpl* a_impl = dynamic_cast(a.get()); - LogFileImpl* b_impl = dynamic_cast(b.get()); - return *a_impl < *b_impl; - } -}; -} - -Status DBImpl::GetSortedWalsOfType(const std::string& path, - VectorLogPtr& log_files, - WalFileType log_type) { - std::vector all_files; - const Status status = env_->GetChildren(path, &all_files); - if (!status.ok()) { - return status; - } - log_files.reserve(all_files.size()); - for (const auto& f : all_files) { - uint64_t number; - FileType type; - if (ParseFileName(f, &number, &type) && type == 
kLogFile) { - SequenceNumber sequence; - Status s = ReadFirstRecord(log_type, number, &sequence); - if (!s.ok()) { - return s; - } - if (sequence == 0) { - // empty file - continue; - } - // Reproduce the race condition where a log file is moved - // to archived dir, between these two sync points, used in - // (DBTest,TransactionLogIteratorRace) - TEST_SYNC_POINT("DBImpl::GetSortedWalsOfType:1"); - TEST_SYNC_POINT("DBImpl::GetSortedWalsOfType:2"); - - uint64_t size_bytes; - s = env_->GetFileSize(LogFileName(path, number), &size_bytes); - // re-try in case the alive log file has been moved to archive. - if (!s.ok() && log_type == kAliveLogFile && - env_->FileExists(ArchivedLogFileName(path, number))) { - s = env_->GetFileSize(ArchivedLogFileName(path, number), &size_bytes); - } + data_dirs_.clear(); + for (auto& p : data_paths) { + const std::string db_path = p.path; + if (db_path == dbname) { + data_dirs_.emplace_back(nullptr); + } else { + std::unique_ptr path_directory; + s = CreateAndNewDirectory(env, db_path, &path_directory); if (!s.ok()) { return s; } - - log_files.push_back(std::move(unique_ptr( - new LogFileImpl(number, log_type, sequence, size_bytes)))); - } - } - CompareLogByPointer compare_log_files; - std::sort(log_files.begin(), log_files.end(), compare_log_files); - return status; -} - -Status DBImpl::RetainProbableWalFiles(VectorLogPtr& all_logs, - const SequenceNumber target) { - int64_t start = 0; // signed to avoid overflow when target is < first file. - int64_t end = static_cast(all_logs.size()) - 1; - // Binary Search. avoid opening all files. - while (end >= start) { - int64_t mid = start + (end - start) / 2; // Avoid overflow. - SequenceNumber current_seq_num = all_logs.at(mid)->StartSequence(); - if (current_seq_num == target) { - end = mid; - break; - } else if (current_seq_num < target) { - start = mid + 1; - } else { - end = mid - 1; + data_dirs_.emplace_back(path_directory.release()); } } - // end could be -ve. 
- size_t start_index = std::max(static_cast(0), end); - // The last wal file is always included - all_logs.erase(all_logs.begin(), all_logs.begin() + start_index); + assert(data_dirs_.size() == data_paths.size()); return Status::OK(); } -Status DBImpl::ReadFirstRecord(const WalFileType type, const uint64_t number, - SequenceNumber* sequence) { - if (type != kAliveLogFile && type != kArchivedLogFile) { - return Status::NotSupported("File Type Not Known " + std::to_string(type)); - } - { - MutexLock l(&read_first_record_cache_mutex_); - auto itr = read_first_record_cache_.find(number); - if (itr != read_first_record_cache_.end()) { - *sequence = itr->second; - return Status::OK(); - } - } - Status s; - if (type == kAliveLogFile) { - std::string fname = LogFileName(options_.wal_dir, number); - s = ReadFirstLine(fname, sequence); - if (env_->FileExists(fname) && !s.ok()) { - // return any error that is not caused by non-existing file - return s; - } - } - - if (type == kArchivedLogFile || !s.ok()) { - // check if the file got moved to archive. - std::string archived_file = ArchivedLogFileName(options_.wal_dir, number); - s = ReadFirstLine(archived_file, sequence); - } - - if (s.ok() && *sequence != 0) { - MutexLock l(&read_first_record_cache_mutex_); - read_first_record_cache_.insert({number, *sequence}); - } - return s; -} - -// the function returns status.ok() and sequence == 0 if the file exists, but is -// empty -Status DBImpl::ReadFirstLine(const std::string& fname, - SequenceNumber* sequence) { - struct LogReporter : public log::Reader::Reporter { - Env* env; - Logger* info_log; - const char* fname; - - Status* status; - bool ignore_error; // true if options_.paranoid_checks==false - virtual void Corruption(size_t bytes, const Status& s) { - Log(info_log, "%s%s: dropping %d bytes; %s", - (this->ignore_error ? 
"(ignoring error) " : ""), fname, - static_cast(bytes), s.ToString().c_str()); - if (this->status->ok()) { - // only keep the first error - *this->status = s; - } - } - }; - - unique_ptr file; - Status status = env_->NewSequentialFile(fname, &file, storage_options_); - - if (!status.ok()) { - return status; - } - - LogReporter reporter; - reporter.env = env_; - reporter.info_log = options_.info_log.get(); - reporter.fname = fname.c_str(); - reporter.status = &status; - reporter.ignore_error = !options_.paranoid_checks; - log::Reader reader(std::move(file), &reporter, true /*checksum*/, - 0 /*initial_offset*/); - std::string scratch; - Slice record; - - if (reader.ReadRecord(&record, &scratch) && - (status.ok() || !options_.paranoid_checks)) { - if (record.size() < 12) { - reporter.Corruption(record.size(), - Status::Corruption("log record too small")); - // TODO read record's till the first no corrupt entry? - } else { - WriteBatch batch; - WriteBatchInternal::SetContents(&batch, record); - *sequence = WriteBatchInternal::Sequence(&batch); - return Status::OK(); - } +Directory* DBImpl::Directories::GetDataDir(size_t path_id) { + assert(path_id < data_dirs_.size()); + Directory* ret_dir = data_dirs_[path_id].get(); + if (ret_dir == nullptr) { + // Should use db_dir_ + return db_dir_.get(); } - - // ReadRecord returns false on EOF, which means that the log file is empty. we - // return status.ok() in that case and set sequence number to 0 - *sequence = 0; - return status; + return ret_dir; } -#endif // ROCKSDB_LITE - Status DBImpl::Recover( const std::vector& column_families, bool read_only, bool error_if_log_file_exist) { @@ -1121,26 +768,8 @@ Status DBImpl::Recover( bool is_new_db = false; assert(db_lock_ == nullptr); if (!read_only) { - // We call CreateDirIfMissing() as the directory may already exist (if we - // are reopening a DB), when this happens we don't want creating the - // directory to cause an error. 
However, we need to check if creating the - // directory fails or else we may get an obscure message about the lock - // file not existing. One real-world example of this occurring is if - // env->CreateDirIfMissing() doesn't create intermediate directories, e.g. - // when dbname_ is "dir/db" but when "dir" doesn't exist. - Status s = env_->CreateDirIfMissing(dbname_); - if (!s.ok()) { - return s; - } - - for (auto& db_path : options_.db_paths) { - s = env_->CreateDirIfMissing(db_path.path); - if (!s.ok()) { - return s; - } - } - - s = env_->NewDirectory(dbname_, &db_directory_); + Status s = directories_.SetDirectories(env_, dbname_, db_options_.wal_dir, + db_options_.db_paths); if (!s.ok()) { return s; } @@ -1151,7 +780,7 @@ Status DBImpl::Recover( } if (!env_->FileExists(CurrentFileName(dbname_))) { - if (options_.create_if_missing) { + if (db_options_.create_if_missing) { s = NewDB(); is_new_db = true; if (!s.ok()) { @@ -1162,7 +791,7 @@ Status DBImpl::Recover( dbname_, "does not exist (create_if_missing is false)"); } } else { - if (options_.error_if_exists) { + if (db_options_.error_if_exists) { return Status::InvalidArgument( dbname_, "exists (error_if_exists is true)"); } @@ -1177,7 +806,7 @@ Status DBImpl::Recover( } Status s = versions_->Recover(column_families, read_only); - if (options_.paranoid_checks && s.ok()) { + if (db_options_.paranoid_checks && s.ok()) { s = CheckConsistency(); } if (s.ok()) { @@ -1192,13 +821,13 @@ Status DBImpl::Recover( // descriptor (new log files may have been added by the previous // incarnation without registering them in the descriptor). // - // Note that PrevLogNumber() is no longer used, but we pay + // Note that prev_log_number() is no longer used, but we pay // attention to it in case we are recovering a database // produced by an older version of rocksdb. 
const uint64_t min_log = versions_->MinLogNumber(); - const uint64_t prev_log = versions_->PrevLogNumber(); + const uint64_t prev_log = versions_->prev_log_number(); std::vector filenames; - s = env_->GetChildren(options_.wal_dir, &filenames); + s = env_->GetChildren(db_options_.wal_dir, &filenames); if (!s.ok()) { return s; } @@ -1225,36 +854,42 @@ Status DBImpl::Recover( "flag but a log file already exists"); } - // Recover in the order in which the logs were generated - std::sort(logs.begin(), logs.end()); - for (const auto& log : logs) { - // The previous incarnation may not have written any MANIFEST - // records after allocating this log number. So we manually - // update the file number allocation counter in VersionSet. - versions_->MarkFileNumberUsed(log); - s = RecoverLogFile(log, &max_sequence, read_only); + if (!logs.empty()) { + // Recover in the order in which the logs were generated + std::sort(logs.begin(), logs.end()); + s = RecoverLogFiles(logs, &max_sequence, read_only); + if (!s.ok()) { + // Clear memtables if recovery failed + for (auto cfd : *versions_->GetColumnFamilySet()) { + cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions()); + } + } } SetTickerCount(stats_, SEQUENCE_NUMBER, versions_->LastSequence()); } + // Initial value + max_total_in_memory_state_ = 0; for (auto cfd : *versions_->GetColumnFamilySet()) { - max_total_in_memory_state_ += cfd->options()->write_buffer_size * - cfd->options()->max_write_buffer_number; + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + max_total_in_memory_state_ += mutable_cf_options->write_buffer_size * + mutable_cf_options->max_write_buffer_number; } return s; } -Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, - bool read_only) { +// REQUIRES: log_numbers are sorted in ascending order +Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, + SequenceNumber* max_sequence, bool read_only) { struct LogReporter : public log::Reader::Reporter { Env* 
env; Logger* info_log; const char* fname; - Status* status; // nullptr if options_.paranoid_checks==false or - // options_.skip_log_error_on_recovery==true + Status* status; // nullptr if db_options_.paranoid_checks==false virtual void Corruption(size_t bytes, const Status& s) { - Log(info_log, "%s%s: dropping %d bytes; %s", + Log(InfoLogLevel::WARN_LEVEL, + info_log, "%s%s: dropping %d bytes; %s", (this->status == nullptr ? "(ignoring error) " : ""), fname, static_cast(bytes), s.ToString().c_str()); if (this->status != nullptr && this->status->ok()) *this->status = s; @@ -1262,7 +897,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, }; mutex_.AssertHeld(); - + Status status; std::unordered_map version_edits; // no need to refcount because iteration is under mutex for (auto cfd : *versions_->GetColumnFamilySet()) { @@ -1271,61 +906,78 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, version_edits.insert({cfd->GetID(), edit}); } - // Open the log file - std::string fname = LogFileName(options_.wal_dir, log_number); - unique_ptr file; - Status status = env_->NewSequentialFile(fname, &file, storage_options_); - if (!status.ok()) { - MaybeIgnoreError(&status); - return status; - } - - // Create the log reader. - LogReporter reporter; - reporter.env = env_; - reporter.info_log = options_.info_log.get(); - reporter.fname = fname.c_str(); - reporter.status = (options_.paranoid_checks && - !options_.skip_log_error_on_recovery ? &status : nullptr); - // We intentially make log::Reader do checksumming even if - // paranoid_checks==false so that corruptions cause entire commits - // to be skipped instead of propagating bad information (like overly - // large sequence numbers). 
- log::Reader reader(std::move(file), &reporter, true/*checksum*/, - 0/*initial_offset*/); - Log(options_.info_log, "Recovering log #%" PRIu64 "", log_number); - - // Read all the records and add to a memtable - std::string scratch; - Slice record; - WriteBatch batch; - while (reader.ReadRecord(&record, &scratch)) { - if (record.size() < 12) { - reporter.Corruption( - record.size(), Status::Corruption("log record too small")); - continue; + for (auto log_number : log_numbers) { + // The previous incarnation may not have written any MANIFEST + // records after allocating this log number. So we manually + // update the file number allocation counter in VersionSet. + versions_->MarkFileNumberUsedDuringRecovery(log_number); + // Open the log file + std::string fname = LogFileName(db_options_.wal_dir, log_number); + unique_ptr file; + status = env_->NewSequentialFile(fname, &file, env_options_); + if (!status.ok()) { + MaybeIgnoreError(&status); + if (!status.ok()) { + return status; + } else { + // Fail with one log file, but that's ok. + // Try next one. + continue; + } } - WriteBatchInternal::SetContents(&batch, record); - status = WriteBatchInternal::InsertInto( - &batch, column_family_memtables_.get(), true, log_number); + // Create the log reader. + LogReporter reporter; + reporter.env = env_; + reporter.info_log = db_options_.info_log.get(); + reporter.fname = fname.c_str(); + reporter.status = (db_options_.paranoid_checks) ? &status : nullptr; + // We intentially make log::Reader do checksumming even if + // paranoid_checks==false so that corruptions cause entire commits + // to be skipped instead of propagating bad information (like overly + // large sequence numbers). 
+ log::Reader reader(std::move(file), &reporter, true /*checksum*/, + 0 /*initial_offset*/); + Log(InfoLogLevel::INFO_LEVEL, + db_options_.info_log, "Recovering log #%" PRIu64 "", log_number); + + // Read all the records and add to a memtable + std::string scratch; + Slice record; + WriteBatch batch; + while (reader.ReadRecord(&record, &scratch) && status.ok()) { + if (record.size() < 12) { + reporter.Corruption(record.size(), + Status::Corruption("log record too small")); + continue; + } + WriteBatchInternal::SetContents(&batch, record); - MaybeIgnoreError(&status); - if (!status.ok()) { - return status; - } - const SequenceNumber last_seq = - WriteBatchInternal::Sequence(&batch) + - WriteBatchInternal::Count(&batch) - 1; - if (last_seq > *max_sequence) { - *max_sequence = last_seq; - } + // If column family was not found, it might mean that the WAL write + // batch references to the column family that was dropped after the + // insert. We don't want to fail the whole write batch in that case -- + // we just ignore the update. 
+ // That's why we set ignore missing column families to true + status = WriteBatchInternal::InsertInto( + &batch, column_family_memtables_.get(), true, log_number); - if (!read_only) { - // no need to refcount since client still doesn't have access - // to the DB and can not drop column families while we iterate - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->mem()->ShouldFlush()) { + MaybeIgnoreError(&status); + if (!status.ok()) { + return status; + } + const SequenceNumber last_seq = WriteBatchInternal::Sequence(&batch) + + WriteBatchInternal::Count(&batch) - 1; + if (last_seq > *max_sequence) { + *max_sequence = last_seq; + } + + if (!read_only) { + // we can do this because this is called before client has access to the + // DB and there is only a single thread operating on DB + ColumnFamilyData* cfd; + + while ((cfd = flush_scheduler_.GetNextColumnFamily()) != nullptr) { + cfd->Unref(); // If this asserts, it means that InsertInto failed in // filtering updates to already-flushed column families assert(cfd->GetLogNumber() <= log_number); @@ -1333,33 +985,38 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, assert(iter != version_edits.end()); VersionEdit* edit = &iter->second; status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit); - // we still want to clear the memtable, even if the recovery failed - cfd->CreateNewMemtable(); if (!status.ok()) { // Reflect errors immediately so that conditions like full // file-systems cause the DB::Open() to fail. 
return status; } + cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions()); } } } - } - if (versions_->LastSequence() < *max_sequence) { - versions_->SetLastSequence(*max_sequence); + if (!status.ok()) { + return status; + } + + flush_scheduler_.Clear(); + if (versions_->LastSequence() < *max_sequence) { + versions_->SetLastSequence(*max_sequence); + } } if (!read_only) { // no need to refcount since client still doesn't have access // to the DB and can not drop column families while we iterate + auto max_log_number = log_numbers.back(); for (auto cfd : *versions_->GetColumnFamilySet()) { auto iter = version_edits.find(cfd->GetID()); assert(iter != version_edits.end()); VersionEdit* edit = &iter->second; - if (cfd->GetLogNumber() > log_number) { + if (cfd->GetLogNumber() > max_log_number) { // Column family cfd has already flushed the data - // from log_number. Memtable has to be empty because + // from all logs. Memtable has to be empty because // we filter the updates based on log_number // (in WriteBatch::InsertInto) assert(cfd->mem()->GetFirstSequenceNumber() == 0); @@ -1370,28 +1027,30 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, // flush the final memtable (if non-empty) if (cfd->mem()->GetFirstSequenceNumber() != 0) { status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit); - } - // we still want to clear the memtable, even if the recovery failed - cfd->CreateNewMemtable(); - if (!status.ok()) { - return status; + if (!status.ok()) { + // Recovery failed + break; + } + cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions()); } // write MANIFEST with update - // writing log number in the manifest means that any log file + // writing log_number in the manifest means that any log file // with number strongly less than (log_number + 1) is already // recovered and should be ignored on next reincarnation. 
- // Since we already recovered log_number, we want all logs - // with numbers `<= log_number` (includes this one) to be ignored - edit->SetLogNumber(log_number + 1); + // Since we already recovered max_log_number, we want all logs + // with numbers `<= max_log_number` (includes this one) to be ignored + edit->SetLogNumber(max_log_number + 1); // we must mark the next log number as used, even though it's // not actually used. that is because VersionSet assumes // VersionSet::next_file_number_ always to be strictly greater than any // log number - versions_->MarkFileNumberUsed(log_number + 1); - status = versions_->LogAndApply(cfd, edit, &mutex_); + versions_->MarkFileNumberUsedDuringRecovery(max_log_number + 1); + status = versions_->LogAndApply( + cfd, *cfd->GetLatestMutableCFOptions(), edit, &mutex_); if (!status.ok()) { - return status; + // Recovery failed + break; } } } @@ -1405,34 +1064,40 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, const uint64_t start_micros = env_->NowMicros(); FileMetaData meta; meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); - pending_outputs_[meta.fd.GetNumber()] = 0; // path 0 for level 0 file. 
+ auto pending_outputs_inserted_elem = + CaptureCurrentFileNumberInPendingOutputs(); ReadOptions ro; ro.total_order_seek = true; - Iterator* iter = mem->NewIterator(ro); - const SequenceNumber newest_snapshot = snapshots_.GetNewest(); - const SequenceNumber earliest_seqno_in_memtable = - mem->GetFirstSequenceNumber(); - Log(options_.info_log, "[%s] Level-0 table #%" PRIu64 ": started", - cfd->GetName().c_str(), meta.fd.GetNumber()); - + Arena arena; Status s; { - mutex_.Unlock(); - s = BuildTable(dbname_, env_, *cfd->options(), storage_options_, - cfd->table_cache(), iter, &meta, cfd->internal_comparator(), - newest_snapshot, earliest_seqno_in_memtable, - GetCompressionFlush(*cfd->options()), Env::IO_HIGH); - LogFlush(options_.info_log); - mutex_.Lock(); - } + ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); + const SequenceNumber newest_snapshot = snapshots_.GetNewest(); + const SequenceNumber earliest_seqno_in_memtable = + mem->GetFirstSequenceNumber(); + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "[%s] [WriteLevel0TableForRecovery]" + " Level-0 table #%" PRIu64 ": started", + cfd->GetName().c_str(), meta.fd.GetNumber()); - Log(options_.info_log, - "[%s] Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s", - cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), - s.ToString().c_str()); - delete iter; + { + mutex_.Unlock(); + s = BuildTable( + dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(), + iter.get(), &meta, cfd->internal_comparator(), newest_snapshot, + earliest_seqno_in_memtable, GetCompressionFlush(*cfd->ioptions()), + cfd->ioptions()->compression_opts, Env::IO_HIGH); + LogFlush(db_options_.info_log); + mutex_.Lock(); + } - pending_outputs_.erase(meta.fd.GetNumber()); + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "[%s] [WriteLevel0TableForRecovery]" + " Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s", + cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), + s.ToString().c_str()); + 
} + ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); // Note that if file_size is zero, the file has been deleted and // should not be added to the manifest. @@ -1454,183 +1119,97 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, return s; } -Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, - autovector& mems, VersionEdit* edit, - uint64_t* filenumber, LogBuffer* log_buffer) { +Status DBImpl::FlushMemTableToOutputFile( + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + bool* madeProgress, JobContext* job_context, LogBuffer* log_buffer) { mutex_.AssertHeld(); - const uint64_t start_micros = env_->NowMicros(); - FileMetaData meta; + assert(cfd->imm()->size() != 0); + assert(cfd->imm()->IsFlushPending()); - meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); - *filenumber = meta.fd.GetNumber(); - pending_outputs_[meta.fd.GetNumber()] = 0; // path 0 for level 0 file. - - const SequenceNumber newest_snapshot = snapshots_.GetNewest(); - const SequenceNumber earliest_seqno_in_memtable = - mems[0]->GetFirstSequenceNumber(); - Version* base = cfd->current(); - base->Ref(); // it is likely that we do not need this reference - Status s; - { - mutex_.Unlock(); - log_buffer->FlushBufferToLog(); - std::vector memtables; - ReadOptions ro; - ro.total_order_seek = true; - for (MemTable* m : mems) { - Log(options_.info_log, - "[%s] Flushing memtable with next log file: %" PRIu64 "\n", - cfd->GetName().c_str(), m->GetNextLogNumber()); - memtables.push_back(m->NewIterator(ro)); - } - Iterator* iter = NewMergingIterator(&cfd->internal_comparator(), - &memtables[0], memtables.size()); - Log(options_.info_log, "[%s] Level-0 flush table #%" PRIu64 ": started", - cfd->GetName().c_str(), meta.fd.GetNumber()); - - s = BuildTable(dbname_, env_, *cfd->options(), storage_options_, - cfd->table_cache(), iter, &meta, cfd->internal_comparator(), - newest_snapshot, earliest_seqno_in_memtable, - 
GetCompressionFlush(*cfd->options()), Env::IO_HIGH); - LogFlush(options_.info_log); - delete iter; - Log(options_.info_log, - "[%s] Level-0 flush table #%" PRIu64 ": %" PRIu64 " bytes %s", - cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), - s.ToString().c_str()); - - if (!options_.disableDataSync) { - db_directory_->Fsync(); - } - mutex_.Lock(); - } - base->Unref(); - - // re-acquire the most current version - base = cfd->current(); - - // There could be multiple threads writing to its own level-0 file. - // The pending_outputs cannot be cleared here, otherwise this newly - // created file might not be considered as a live-file by another - // compaction thread that is concurrently deleting obselete files. - // The pending_outputs can be cleared only after the new version is - // committed so that other threads can recognize this file as a - // valid one. - // pending_outputs_.erase(meta.number); - - // Note that if file_size is zero, the file has been deleted and - // should not be added to the manifest. - int level = 0; - if (s.ok() && meta.fd.GetFileSize() > 0) { - const Slice min_user_key = meta.smallest.user_key(); - const Slice max_user_key = meta.largest.user_key(); - // if we have more than 1 background thread, then we cannot - // insert files directly into higher levels because some other - // threads could be concurrently producing compacted files for - // that key range. 
- if (base != nullptr && options_.max_background_compactions <= 1 && - cfd->options()->compaction_style == kCompactionStyleLevel) { - level = base->PickLevelForMemTableOutput(min_user_key, max_user_key); - } - edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), - meta.fd.GetFileSize(), meta.smallest, meta.largest, - meta.smallest_seqno, meta.largest_seqno); - } - - InternalStats::CompactionStats stats(1); - stats.micros = env_->NowMicros() - start_micros; - stats.bytes_written = meta.fd.GetFileSize(); - cfd->internal_stats()->AddCompactionStats(level, stats); - cfd->internal_stats()->AddCFStats( - InternalStats::BYTES_FLUSHED, meta.fd.GetFileSize()); - RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize()); - return s; -} - -Status DBImpl::FlushMemTableToOutputFile(ColumnFamilyData* cfd, - bool* madeProgress, - DeletionState& deletion_state, - LogBuffer* log_buffer) { - mutex_.AssertHeld(); - assert(cfd->imm()->size() != 0); - assert(cfd->imm()->IsFlushPending()); + FlushJob flush_job(dbname_, cfd, db_options_, mutable_cf_options, + env_options_, versions_.get(), &mutex_, &shutting_down_, + snapshots_.GetNewest(), job_context, log_buffer, + directories_.GetDbDir(), directories_.GetDataDir(0U), + GetCompressionFlush(*cfd->ioptions()), stats_); - // Save the contents of the earliest memtable as a new Table uint64_t file_number; - autovector mems; - cfd->imm()->PickMemtablesToFlush(&mems); - if (mems.empty()) { - LogToBuffer(log_buffer, "[%s] Nothing in memtable to flush", - cfd->GetName().c_str()); - return Status::OK(); - } - - // record the logfile_number_ before we release the mutex - // entries mems are (implicitly) sorted in ascending order by their created - // time. We will use the first memtable's `edit` to keep the meta info for - // this flush. 
- MemTable* m = mems[0]; - VersionEdit* edit = m->GetEdits(); - edit->SetPrevLogNumber(0); - // SetLogNumber(log_num) indicates logs with number smaller than log_num - // will no longer be picked up for recovery. - edit->SetLogNumber(mems.back()->GetNextLogNumber()); - edit->SetColumnFamily(cfd->GetID()); - - // This will release and re-acquire the mutex. - Status s = WriteLevel0Table(cfd, mems, edit, &file_number, log_buffer); - - if (s.ok() && shutting_down_.Acquire_Load() && cfd->IsDropped()) { - s = Status::ShutdownInProgress( - "Database shutdown or Column family drop during flush"); - } - - if (!s.ok()) { - cfd->imm()->RollbackMemtableFlush(mems, file_number, &pending_outputs_); - } else { - // Replace immutable memtable with the generated Table - s = cfd->imm()->InstallMemtableFlushResults( - cfd, mems, versions_.get(), &mutex_, options_.info_log.get(), - file_number, &pending_outputs_, &deletion_state.memtables_to_free, - db_directory_.get(), log_buffer); - } + Status s = flush_job.Run(&file_number); if (s.ok()) { - InstallSuperVersion(cfd, deletion_state); + InstallSuperVersionBackground(cfd, job_context, mutable_cf_options); if (madeProgress) { *madeProgress = 1; } - Version::LevelSummaryStorage tmp; + VersionStorageInfo::LevelSummaryStorage tmp; LogToBuffer(log_buffer, "[%s] Level summary: %s\n", cfd->GetName().c_str(), - cfd->current()->LevelSummary(&tmp)); + cfd->current()->storage_info()->LevelSummary(&tmp)); if (disable_delete_obsolete_files_ == 0) { // add to deletion state while (alive_log_files_.size() && alive_log_files_.begin()->number < versions_->MinLogNumber()) { const auto& earliest = *alive_log_files_.begin(); - deletion_state.log_delete_files.push_back(earliest.number); + job_context->log_delete_files.push_back(earliest.number); total_log_size_ -= earliest.size; alive_log_files_.pop_front(); } } } - if (!s.ok() && !s.IsShutdownInProgress() && options_.paranoid_checks && + if (!s.ok() && !s.IsShutdownInProgress() && 
db_options_.paranoid_checks && bg_error_.ok()) { // if a bad error happened (not ShutdownInProgress) and paranoid_checks is // true, mark DB read-only bg_error_ = s; } RecordFlushIOStats(); +#ifndef ROCKSDB_LITE + if (s.ok()) { + // may temporarily unlock and lock the mutex. + NotifyOnFlushCompleted(cfd, file_number, mutable_cf_options); + } +#endif // ROCKSDB_LITE return s; } +void DBImpl::NotifyOnFlushCompleted( + ColumnFamilyData* cfd, uint64_t file_number, + const MutableCFOptions& mutable_cf_options) { +#ifndef ROCKSDB_LITE + if (cfd->ioptions()->listeners.size() == 0U) { + return; + } + mutex_.AssertHeld(); + if (shutting_down_.load(std::memory_order_acquire)) { + return; + } + bool triggered_flush_slowdown = + (cfd->current()->storage_info()->NumLevelFiles(0) >= + mutable_cf_options.level0_slowdown_writes_trigger); + bool triggered_flush_stop = + (cfd->current()->storage_info()->NumLevelFiles(0) >= + mutable_cf_options.level0_stop_writes_trigger); + notifying_events_++; + // release lock while notifying events + mutex_.Unlock(); + // TODO(yhchiang): make db_paths dynamic. + cfd->NotifyOnFlushCompleted( + this, MakeTableFileName(db_options_.db_paths[0].path, file_number), + triggered_flush_slowdown, + triggered_flush_stop); + mutex_.Lock(); + notifying_events_--; + assert(notifying_events_ >= 0); + // no need to signal bg_cv_ as it will be signaled at the end of the + // flush process. 
+#endif // ROCKSDB_LITE +} + Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end, bool reduce_level, int target_level, uint32_t target_path_id) { - if (target_path_id >= options_.db_paths.size()) { + if (target_path_id >= db_options_.db_paths.size()) { return Status::InvalidArgument("Invalid target path ID"); } @@ -1639,16 +1218,16 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, Status s = FlushMemTable(cfd, FlushOptions()); if (!s.ok()) { - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); return s; } int max_level_with_files = 0; { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); Version* base = cfd->current(); for (int level = 1; level < cfd->NumberLevels(); level++) { - if (base->OverlapInLevel(level, begin, end)) { + if (base->storage_info()->OverlapInLevel(level, begin, end)) { max_level_with_files = level; } } @@ -1658,8 +1237,8 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, // bottom-most level, the output level will be the same as input one. // level 0 can never be the bottommost level (i.e. 
if all files are in level // 0, we will compact to level 1) - if (cfd->options()->compaction_style == kCompactionStyleUniversal || - cfd->options()->compaction_style == kCompactionStyleFIFO || + if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO || (level == max_level_with_files && level > 0)) { s = RunManualCompaction(cfd, level, level, target_path_id, begin, end); } else { @@ -1667,7 +1246,7 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, end); } if (!s.ok()) { - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); return s; } } @@ -1675,10 +1254,10 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, if (reduce_level) { s = ReFitLevel(cfd, max_level_with_files, target_level); } - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); // an automatic compaction that has been scheduled might have been // preempted by the manual compactions. Need to schedule it back. 
MaybeScheduleFlushOrCompaction(); @@ -1687,17 +1266,249 @@ Status DBImpl::CompactRange(ColumnFamilyHandle* column_family, return s; } +Status DBImpl::CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, + const int output_level, const int output_path_id) { +#ifdef ROCKSDB_LITE + // not supported in lite version + return Status::NotSupported("Not supported in ROCKSDB LITE"); +#else + InstrumentedMutexLock l(&mutex_); + if (column_family == nullptr) { + return Status::InvalidArgument("ColumnFamilyHandle must be non-null."); + } + + auto cfd = reinterpret_cast(column_family)->cfd(); + assert(cfd); + // TODO(yhchiang): use superversion + cfd->Ref(); + auto version = cfd->current(); + version->Ref(); + auto s = CompactFilesImpl(compact_options, cfd, version, + input_file_names, output_level, output_path_id); + // TODO(yhchiang): unref could move into CompactFilesImpl(). Otherwise, + // FindObsoleteFiles might never able to find any file to delete. + version->Unref(); + // TODO(yhchiang): cfd should be deleted after its last reference. + cfd->Unref(); + return s; +#endif // ROCKSDB_LITE +} + +#ifndef ROCKSDB_LITE +Status DBImpl::CompactFilesImpl( + const CompactionOptions& compact_options, ColumnFamilyData* cfd, + Version* version, const std::vector& input_file_names, + const int output_level, int output_path_id) { + mutex_.AssertHeld(); + + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); + + if (shutting_down_.load(std::memory_order_acquire)) { + return Status::ShutdownInProgress(); + } + + std::unordered_set input_set; + for (auto file_name : input_file_names) { + input_set.insert(TableFileNameToNumber(file_name)); + } + + ColumnFamilyMetaData cf_meta; + // TODO(yhchiang): can directly use version here if none of the + // following functions call is pluggable to external developers. 
+ version->GetColumnFamilyMetaData(&cf_meta); + + if (output_path_id < 0) { + if (db_options_.db_paths.size() == 1U) { + output_path_id = 0; + } else { + return Status::NotSupported( + "Automatic output path selection is not " + "yet supported in CompactFiles()"); + } + } + + Status s = cfd->compaction_picker()->SanitizeCompactionInputFiles( + &input_set, cf_meta, output_level); + if (!s.ok()) { + return s; + } + + autovector input_files; + s = cfd->compaction_picker()->GetCompactionInputsFromFileNumbers( + &input_files, &input_set, version->storage_info(), compact_options); + if (!s.ok()) { + return s; + } + + for (auto inputs : input_files) { + if (cfd->compaction_picker()->FilesInCompaction(inputs.files)) { + return Status::Aborted( + "Some of the necessary compaction input " + "files are already being compacted"); + } + } + + // At this point, CompactFiles will be run. + bg_compaction_scheduled_++; + + unique_ptr c; + assert(cfd->compaction_picker()); + c.reset(cfd->compaction_picker()->FormCompaction( + compact_options, input_files, + output_level, version->storage_info(), + *cfd->GetLatestMutableCFOptions())); + assert(c); + c->SetInputVersion(version); + c->SetOutputPathId(static_cast(output_path_id)); + // deletion compaction currently not allowed in CompactFiles. 
+ assert(!c->IsDeletionCompaction()); + + JobContext job_context(true); + auto yield_callback = [&]() { + return CallFlushDuringCompaction(c->column_family_data(), + *c->mutable_cf_options(), &job_context, + &log_buffer); + }; + CompactionJob compaction_job( + c.get(), db_options_, *c->mutable_cf_options(), env_options_, + versions_.get(), &shutting_down_, &log_buffer, directories_.GetDbDir(), + directories_.GetDataDir(c->GetOutputPathId()), stats_, &snapshots_, + is_snapshot_supported_, table_cache_, std::move(yield_callback)); + compaction_job.Prepare(); + + mutex_.Unlock(); + Status status = compaction_job.Run(); + mutex_.Lock(); + compaction_job.Install(&status, &mutex_); + if (status.ok()) { + InstallSuperVersionBackground(c->column_family_data(), &job_context, + *c->mutable_cf_options()); + } + c->ReleaseCompactionFiles(s); + c.reset(); + + if (status.ok()) { + // Done + } else if (status.IsShutdownInProgress()) { + // Ignore compaction errors found during shutting down + } else { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "Compaction error: %s", + status.ToString().c_str()); + if (db_options_.paranoid_checks && bg_error_.ok()) { + bg_error_ = status; + } + } + + // If !s.ok(), this means that Compaction failed. In that case, we want + // to delete all obsolete files we might have created and we force + // FindObsoleteFiles(). This is because job_context does not + // catch all created files if compaction failed. + // TODO(yhchiang): write an unit-test to make sure files are actually + // deleted after CompactFiles. 
+ FindObsoleteFiles(&job_context, !s.ok()); + + // delete unnecessary files if any, this is done outside the mutex + if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { + mutex_.Unlock(); + // Have to flush the info logs before bg_compaction_scheduled_-- + // because if bg_flush_scheduled_ becomes 0 and the lock is + // released, the deconstructor of DB can kick in and destroy all the + // states of DB so info_log might not be available after that point. + // It also applies to access other states that DB owns. + log_buffer.FlushBufferToLog(); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); + } + job_context.Clean(); + mutex_.Lock(); + } + + bg_compaction_scheduled_--; + + return status; +} +#endif // ROCKSDB_LITE + +void DBImpl::NotifyOnCompactionCompleted( + ColumnFamilyData* cfd, Compaction *c, const Status &st) { +#ifndef ROCKSDB_LITE + if (cfd->ioptions()->listeners.size() == 0U) { + return; + } + mutex_.AssertHeld(); + if (shutting_down_.load(std::memory_order_acquire)) { + return; + } + notifying_events_++; + // release lock while notifying events + mutex_.Unlock(); + cfd->NotifyOnCompactionCompleted(this, c, st); + mutex_.Lock(); + notifying_events_--; + assert(notifying_events_ >= 0); + // no need to signal bg_cv_ as it will be signaled at the end of the + // flush process. 
+#endif // ROCKSDB_LITE +} + +Status DBImpl::SetOptions(ColumnFamilyHandle* column_family, + const std::unordered_map& options_map) { +#ifdef ROCKSDB_LITE + return Status::NotSupported("Not supported in ROCKSDB LITE"); +#else + auto* cfd = reinterpret_cast(column_family)->cfd(); + if (options_map.empty()) { + Log(InfoLogLevel::WARN_LEVEL, + db_options_.info_log, "SetOptions() on column family [%s], empty input", + cfd->GetName().c_str()); + return Status::InvalidArgument("empty input"); + } + + MutableCFOptions new_options; + Status s; + { + InstrumentedMutexLock l(&mutex_); + s = cfd->SetOptions(options_map); + if (s.ok()) { + new_options = *cfd->GetLatestMutableCFOptions(); + } + } + + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "SetOptions() on column family [%s], inputs:", + cfd->GetName().c_str()); + for (const auto& o : options_map) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "%s: %s\n", o.first.c_str(), o.second.c_str()); + } + if (s.ok()) { + Log(InfoLogLevel::INFO_LEVEL, + db_options_.info_log, "[%s] SetOptions succeeded", + cfd->GetName().c_str()); + new_options.Dump(db_options_.info_log.get()); + } else { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "[%s] SetOptions failed", cfd->GetName().c_str()); + } + return s; +#endif // ROCKSDB_LITE +} + // return the same level if it cannot be moved -int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level) { +int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, + const MutableCFOptions& mutable_cf_options, int level) { mutex_.AssertHeld(); - Version* current = cfd->current(); + const auto* vstorage = cfd->current()->storage_info(); int minimum_level = level; for (int i = level - 1; i > 0; --i) { // stop if level i is not empty - if (current->NumLevelFiles(i) > 0) break; + if (vstorage->NumLevelFiles(i) > 0) break; // stop if level i is too small (cannot fit the level files) - if (cfd->compaction_picker()->MaxBytesForLevel(i) < - 
current->NumLevelBytes(level)) { + if (mutable_cf_options.MaxBytesForLevel(i) < + vstorage->NumLevelBytes(level)) { break; } @@ -1717,7 +1528,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { // only allow one thread refitting if (refitting_level_) { mutex_.Unlock(); - Log(options_.info_log, "ReFitLevel: another thread is refitting"); + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "[ReFitLevel] another thread is refitting"); delete new_superversion; return Status::NotSupported("another thread is refitting"); } @@ -1726,45 +1538,53 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { // wait for all background threads to stop bg_work_gate_closed_ = true; while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_) { - Log(options_.info_log, - "RefitLevel: waiting for background threads to stop: %d %d", + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "[RefitLevel] waiting for background threads to stop: %d %d", bg_compaction_scheduled_, bg_flush_scheduled_); bg_cv_.Wait(); } + const MutableCFOptions mutable_cf_options = + *cfd->GetLatestMutableCFOptions(); // move to a smaller level int to_level = target_level; if (target_level < 0) { - to_level = FindMinimumEmptyLevelFitting(cfd, level); + to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level); } assert(to_level <= level); Status status; if (to_level < level) { - Log(options_.info_log, "[%s] Before refitting:\n%s", cfd->GetName().c_str(), - cfd->current()->DebugString().data()); + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "[%s] Before refitting:\n%s", + cfd->GetName().c_str(), cfd->current()->DebugString().data()); VersionEdit edit; edit.SetColumnFamily(cfd->GetID()); - for (const auto& f : cfd->current()->files_[level]) { + for (const auto& f : cfd->current()->storage_info()->LevelFiles(level)) { edit.DeleteFile(level, f->fd.GetNumber()); edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(), 
f->fd.GetFileSize(), f->smallest, f->largest, f->smallest_seqno, f->largest_seqno); } - Log(options_.info_log, "[%s] Apply version edit:\n%s", + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), edit.DebugString().data()); - status = versions_->LogAndApply(cfd, &edit, &mutex_, db_directory_.get()); - superversion_to_free = cfd->InstallSuperVersion(new_superversion, &mutex_); + status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, &mutex_, + directories_.GetDbDir()); + superversion_to_free = InstallSuperVersion( + cfd, new_superversion, mutable_cf_options); new_superversion = nullptr; - Log(options_.info_log, "[%s] LogAndApply: %s\n", cfd->GetName().c_str(), + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "[%s] LogAndApply: %s\n", cfd->GetName().c_str(), status.ToString().data()); if (status.ok()) { - Log(options_.info_log, "[%s] After refitting:\n%s", + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "[%s] After refitting:\n%s", cfd->GetName().c_str(), cfd->current()->DebugString().data()); } } @@ -1785,18 +1605,22 @@ int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) { int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* column_family) { auto cfh = reinterpret_cast(column_family); - return cfh->cfd()->options()->max_mem_compaction_level; + InstrumentedMutexLock l(&mutex_); + return cfh->cfd()->GetSuperVersion()-> + mutable_cf_options.max_mem_compaction_level; } int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) { auto cfh = reinterpret_cast(column_family); - return cfh->cfd()->options()->level0_stop_writes_trigger; + InstrumentedMutexLock l(&mutex_); + return cfh->cfd()->GetSuperVersion()-> + mutable_cf_options.level0_stop_writes_trigger; } -Status DBImpl::Flush(const FlushOptions& options, +Status DBImpl::Flush(const FlushOptions& flush_options, ColumnFamilyHandle* column_family) { auto cfh = reinterpret_cast(column_family); - return 
FlushMemTable(cfh->cfd(), options); + return FlushMemTable(cfh->cfd(), flush_options); } SequenceNumber DBImpl::GetLatestSequenceNumber() const { @@ -1820,23 +1644,23 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, // For universal compaction, we enforce every manual compaction to compact // all files. if (begin == nullptr || - cfd->options()->compaction_style == kCompactionStyleUniversal || - cfd->options()->compaction_style == kCompactionStyleFIFO) { + cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { manual.begin = nullptr; } else { begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek); manual.begin = &begin_storage; } if (end == nullptr || - cfd->options()->compaction_style == kCompactionStyleUniversal || - cfd->options()->compaction_style == kCompactionStyleFIFO) { + cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { manual.end = nullptr; } else { end_storage = InternalKey(*end, 0, static_cast(0)); manual.end = &end_storage; } - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); // When a manual compaction arrives, temporarily disable scheduling of // non-manual compactions and wait until the number of scheduled compaction @@ -1854,24 +1678,27 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, ++bg_manual_only_; while (bg_compaction_scheduled_ > 0) { - Log(options_.info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "[%s] Manual compaction waiting for all other scheduled background " "compactions to finish", cfd->GetName().c_str()); bg_cv_.Wait(); } - Log(options_.info_log, "[%s] Manual compaction starting", + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "[%s] Manual compaction starting", cfd->GetName().c_str()); - while (!manual.done && !shutting_down_.Acquire_Load() && bg_error_.ok()) { + // We don't 
check bg_error_ here, because if we get the error in compaction, + // the compaction will set manual.status to bg_error_ and set manual.done to + // true. + while (!manual.done) { assert(bg_manual_only_ > 0); if (manual_compaction_ != nullptr) { // Running either this or some other manual compaction bg_cv_.Wait(); } else { manual_compaction_ = &manual; - assert(bg_compaction_scheduled_ == 0); bg_compaction_scheduled_++; env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW); } @@ -1884,35 +1711,34 @@ Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level, } Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, - const FlushOptions& options) { - Writer w(&mutex_); - w.batch = nullptr; - w.sync = false; - w.disableWAL = false; - w.in_batch_group = false; - w.done = false; - w.timeout_hint_us = kNoTimeOut; - + const FlushOptions& flush_options) { Status s; { WriteContext context; - MutexLock guard_lock(&mutex_); - s = BeginWrite(&w, 0); + InstrumentedMutexLock guard_lock(&mutex_); + + if (cfd->imm()->size() == 0 && cfd->mem()->IsEmpty()) { + // Nothing to flush + return Status::OK(); + } + + WriteThread::Writer w(&mutex_); + s = write_thread_.EnterWriteThread(&w, 0); assert(s.ok() && !w.done); // No timeout and nobody should do our job // SetNewMemtableAndNewLogFile() will release and reacquire mutex // during execution s = SetNewMemtableAndNewLogFile(cfd, &context); + write_thread_.ExitWriteThread(&w, &w, s); + cfd->imm()->FlushRequested(); - MaybeScheduleFlushOrCompaction(); - assert(!writers_.empty()); - assert(writers_.front() == &w); - EndWrite(&w, &w, s); + // schedule flush + SchedulePendingFlush(cfd); + MaybeScheduleFlushOrCompaction(); } - - if (s.ok() && options.wait) { + if (s.ok() && flush_options.wait) { // Wait until the compaction completes s = WaitForFlushMemTable(cfd); } @@ -1922,7 +1748,7 @@ Status DBImpl::FlushMemTable(ColumnFamilyData* cfd, Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) { Status s; // Wait 
until the compaction completes - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); while (cfd->imm()->size() > 0 && bg_error_.ok()) { bg_cv_.Wait(); } @@ -1934,63 +1760,95 @@ Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) { void DBImpl::MaybeScheduleFlushOrCompaction() { mutex_.AssertHeld(); - bg_schedule_needed_ = false; if (bg_work_gate_closed_) { - // gate closed for backgrond work - } else if (shutting_down_.Acquire_Load()) { + // gate closed for background work + return; + } else if (shutting_down_.load(std::memory_order_acquire)) { // DB is being deleted; no more background compactions - } else { - bool is_flush_pending = false; - // no need to refcount since we're under a mutex - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->imm()->IsFlushPending()) { - is_flush_pending = true; - } - } - if (is_flush_pending) { - // memtable flush needed - if (bg_flush_scheduled_ < options_.max_background_flushes) { - bg_flush_scheduled_++; - env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH); - } else if (options_.max_background_flushes > 0) { - bg_schedule_needed_ = true; - } - } - bool is_compaction_needed = false; - // no need to refcount since we're under a mutex - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (cfd->current()->NeedsCompaction()) { - is_compaction_needed = true; - break; - } - } + return; + } else if (bg_manual_only_) { + // manual only + return; + } - // Schedule BGWorkCompaction if there's a compaction pending (or a memtable - // flush, but the HIGH pool is not enabled) - // Do it only if max_background_compactions hasn't been reached and - // bg_manual_only_ == 0 - if (!bg_manual_only_ && - (is_compaction_needed || - (is_flush_pending && options_.max_background_flushes == 0))) { - if (bg_compaction_scheduled_ < options_.max_background_compactions) { - bg_compaction_scheduled_++; - env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW); - } else { - bg_schedule_needed_ = true; - 
} + while (unscheduled_flushes_ > 0 && + bg_flush_scheduled_ < db_options_.max_background_flushes) { + unscheduled_flushes_--; + bg_flush_scheduled_++; + env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH); + } + + if (db_options_.max_background_flushes == 0 && + bg_compaction_scheduled_ < db_options_.max_background_compactions && + unscheduled_flushes_ > 0) { + // special case where flush is executed by compaction thread + // (if max_background_flushes == 0). + // Compaction thread will execute all the flushes + unscheduled_flushes_ = 0; + if (unscheduled_compactions_ > 0) { + // bg compaction will execute one compaction + unscheduled_compactions_--; } + bg_compaction_scheduled_++; + env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW); + } + + while (bg_compaction_scheduled_ < db_options_.max_background_compactions && + unscheduled_compactions_ > 0) { + bg_compaction_scheduled_++; + unscheduled_compactions_--; + env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW); } } -void DBImpl::RecordFlushIOStats() { - RecordTick(stats_, FLUSH_WRITE_BYTES, IOSTATS(bytes_written)); - IOSTATS_RESET(bytes_written); +void DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) { + assert(!cfd->pending_compaction()); + cfd->Ref(); + compaction_queue_.push_back(cfd); + cfd->set_pending_compaction(true); +} + +ColumnFamilyData* DBImpl::PopFirstFromCompactionQueue() { + assert(!compaction_queue_.empty()); + auto cfd = *compaction_queue_.begin(); + compaction_queue_.pop_front(); + assert(cfd->pending_compaction()); + cfd->set_pending_compaction(false); + return cfd; +} + +void DBImpl::AddToFlushQueue(ColumnFamilyData* cfd) { + assert(!cfd->pending_flush()); + cfd->Ref(); + flush_queue_.push_back(cfd); + cfd->set_pending_flush(true); } -void DBImpl::RecordCompactionIOStats() { - RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read)); - IOSTATS_RESET(bytes_read); - RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written)); 
+ColumnFamilyData* DBImpl::PopFirstFromFlushQueue() { + assert(!flush_queue_.empty()); + auto cfd = *flush_queue_.begin(); + flush_queue_.pop_front(); + assert(cfd->pending_flush()); + cfd->set_pending_flush(false); + return cfd; +} + +void DBImpl::SchedulePendingFlush(ColumnFamilyData* cfd) { + if (!cfd->pending_flush() && cfd->imm()->IsFlushPending()) { + AddToFlushQueue(cfd); + ++unscheduled_flushes_; + } +} + +void DBImpl::SchedulePendingCompaction(ColumnFamilyData* cfd) { + if (!cfd->pending_compaction() && cfd->NeedsCompaction()) { + AddToCompactionQueue(cfd); + ++unscheduled_compactions_; + } +} + +void DBImpl::RecordFlushIOStats() { + RecordTick(stats_, FLUSH_WRITE_BYTES, IOSTATS(bytes_written)); IOSTATS_RESET(bytes_written); } @@ -2004,49 +1862,67 @@ void DBImpl::BGWorkCompaction(void* db) { reinterpret_cast(db)->BackgroundCallCompaction(); } -Status DBImpl::BackgroundFlush(bool* madeProgress, - DeletionState& deletion_state, +Status DBImpl::BackgroundFlush(bool* madeProgress, JobContext* job_context, LogBuffer* log_buffer) { mutex_.AssertHeld(); - // call_status is failure if at least one flush was a failure. even if - // flushing one column family reports a failure, we will continue flushing - // other column families. however, call_status will be a failure in that case. 
- Status call_status; - // refcounting in iteration - for (auto cfd : *versions_->GetColumnFamilySet()) { - cfd->Ref(); - Status flush_status; - while (flush_status.ok() && cfd->imm()->IsFlushPending()) { - LogToBuffer( - log_buffer, - "BackgroundCallFlush doing FlushMemTableToOutputFile with column " - "family [%s], flush slots available %d", - cfd->GetName().c_str(), - options_.max_background_flushes - bg_flush_scheduled_); - flush_status = FlushMemTableToOutputFile(cfd, madeProgress, - deletion_state, log_buffer); + + if (!bg_error_.ok()) { + return bg_error_; + } + + ColumnFamilyData* cfd = nullptr; + while (!flush_queue_.empty()) { + // This cfd is already referenced + auto first_cfd = PopFirstFromFlushQueue(); + + if (first_cfd->IsDropped() || !first_cfd->imm()->IsFlushPending()) { + // can't flush this CF, try next one + if (first_cfd->Unref()) { + delete first_cfd; + } + continue; } - if (call_status.ok() && !flush_status.ok()) { - call_status = flush_status; + + // found a flush! 
+ cfd = first_cfd; + break; + } + + Status status; + if (cfd != nullptr) { + const MutableCFOptions mutable_cf_options = + *cfd->GetLatestMutableCFOptions(); + LogToBuffer( + log_buffer, + "Calling FlushMemTableToOutputFile with column " + "family [%s], flush slots available %d, compaction slots available %d", + cfd->GetName().c_str(), + db_options_.max_background_flushes - bg_flush_scheduled_, + db_options_.max_background_compactions - bg_compaction_scheduled_); + status = FlushMemTableToOutputFile(cfd, mutable_cf_options, madeProgress, + job_context, log_buffer); + if (cfd->Unref()) { + delete cfd; } - cfd->Unref(); } - versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); - return call_status; + return status; } void DBImpl::BackgroundCallFlush() { bool madeProgress = false; - DeletionState deletion_state(true); + JobContext job_context(true); assert(bg_flush_scheduled_); - LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_.info_log.get()); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); + + auto pending_outputs_inserted_elem = + CaptureCurrentFileNumberInPendingOutputs(); Status s; - if (!shutting_down_.Acquire_Load()) { - s = BackgroundFlush(&madeProgress, deletion_state, &log_buffer); + if (!shutting_down_.load(std::memory_order_acquire)) { + s = BackgroundFlush(&madeProgress, &job_context, &log_buffer); if (!s.ok()) { // Wait a little bit before retrying background compaction in // case this is an environmental problem and we do not want to @@ -2056,22 +1932,24 @@ void DBImpl::BackgroundCallFlush() { default_cf_internal_stats_->BumpAndGetBackgroundErrorCount(); bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); - Log(options_.info_log, + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, "Waiting after background flush error: %s" "Accumulated background error counts: %" PRIu64, s.ToString().c_str(), error_cnt); 
log_buffer.FlushBufferToLog(); - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); env_->SleepForMicroseconds(1000000); mutex_.Lock(); } } + ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); + // If !s.ok(), this means that Flush failed. In that case, we want // to delete all obsolete files and we force FindObsoleteFiles() - FindObsoleteFiles(deletion_state, !s.ok()); + FindObsoleteFiles(&job_context, !s.ok()); // delete unnecessary files if any, this is done outside the mutex - if (deletion_state.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { + if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { mutex_.Unlock(); // Have to flush the info logs before bg_flush_scheduled_-- // because if bg_flush_scheduled_ becomes 0 and the lock is @@ -2079,20 +1957,16 @@ void DBImpl::BackgroundCallFlush() { // states of DB so info_log might not be available after that point. // It also applies to access other states that DB owns. log_buffer.FlushBufferToLog(); - if (deletion_state.HaveSomethingToDelete()) { - PurgeObsoleteFiles(deletion_state); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); } + job_context.Clean(); mutex_.Lock(); } bg_flush_scheduled_--; - // Any time the mutex is released After finding the work to do, another - // thread might execute MaybeScheduleFlushOrCompaction(). It is possible - // that there is a pending job but it is not scheduled because of the - // max thread limit. - if (madeProgress || bg_schedule_needed_) { - MaybeScheduleFlushOrCompaction(); - } + // See if there's more work to be done + MaybeScheduleFlushOrCompaction(); RecordFlushIOStats(); bg_cv_.SignalAll(); // IMPORTANT: there should be no code after calling SignalAll. 
This call may @@ -2104,16 +1978,20 @@ void DBImpl::BackgroundCallFlush() { void DBImpl::BackgroundCallCompaction() { bool madeProgress = false; - DeletionState deletion_state(true); + JobContext job_context(true); MaybeDumpStats(); - LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, options_.info_log.get()); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get()); { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); + + auto pending_outputs_inserted_elem = + CaptureCurrentFileNumberInPendingOutputs(); + assert(bg_compaction_scheduled_); Status s; - if (!shutting_down_.Acquire_Load()) { - s = BackgroundCompaction(&madeProgress, deletion_state, &log_buffer); + if (!shutting_down_.load(std::memory_order_acquire)) { + s = BackgroundCompaction(&madeProgress, &job_context, &log_buffer); if (!s.ok()) { // Wait a little bit before retrying background compaction in // case this is an environmental problem and we do not want to @@ -2124,24 +2002,26 @@ void DBImpl::BackgroundCallCompaction() { bg_cv_.SignalAll(); // In case a waiter can proceed despite the error mutex_.Unlock(); log_buffer.FlushBufferToLog(); - Log(options_.info_log, + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, "Waiting after background compaction error: %s, " "Accumulated background error counts: %" PRIu64, s.ToString().c_str(), error_cnt); - LogFlush(options_.info_log); + LogFlush(db_options_.info_log); env_->SleepForMicroseconds(1000000); mutex_.Lock(); } } + ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); + // If !s.ok(), this means that Compaction failed. In that case, we want // to delete all obsolete files we might have created and we force - // FindObsoleteFiles(). This is because deletion_state does not catch - // all created files if compaction failed. - FindObsoleteFiles(deletion_state, !s.ok()); + // FindObsoleteFiles(). This is because job_context does not + // catch all created files if compaction failed. 
+ FindObsoleteFiles(&job_context, !s.ok()); // delete unnecessary files if any, this is done outside the mutex - if (deletion_state.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { + if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) { mutex_.Unlock(); // Have to flush the info logs before bg_compaction_scheduled_-- // because if bg_flush_scheduled_ becomes 0 and the lock is @@ -2149,9 +2029,10 @@ void DBImpl::BackgroundCallCompaction() { // states of DB so info_log might not be available after that point. // It also applies to access other states that DB owns. log_buffer.FlushBufferToLog(); - if (deletion_state.HaveSomethingToDelete()) { - PurgeObsoleteFiles(deletion_state); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); } + job_context.Clean(); mutex_.Lock(); } @@ -2159,20 +2040,11 @@ void DBImpl::BackgroundCallCompaction() { versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); - // Previous compaction may have produced too many files in a level, - // So reschedule another compaction if we made progress in the - // last compaction. - // - // Also, any time the mutex is released After finding the work to do, - // another thread might execute MaybeScheduleFlushOrCompaction(). It is - // possible that there is a pending job but it is not scheduled because of - // the max thread limit. 
- if (madeProgress || bg_schedule_needed_) { - MaybeScheduleFlushOrCompaction(); - } + // See if there's more work to be done + MaybeScheduleFlushOrCompaction(); if (madeProgress || bg_compaction_scheduled_ == 0 || bg_manual_only_ > 0) { // signal if - // * madeProgress -- need to wakeup MakeRoomForWrite + // * madeProgress -- need to wakeup DelayWrite // * bg_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl // * bg_manual_only_ > 0 -- need to wakeup RunManualCompaction // If none of this is true, there is no need to signal since nobody is @@ -2186,8 +2058,7 @@ void DBImpl::BackgroundCallCompaction() { } } -Status DBImpl::BackgroundCompaction(bool* madeProgress, - DeletionState& deletion_state, +Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context, LogBuffer* log_buffer) { *madeProgress = false; mutex_.AssertHeld(); @@ -2195,6 +2066,16 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, bool is_manual = (manual_compaction_ != nullptr) && (manual_compaction_->in_progress == false); + if (!bg_error_.ok()) { + if (is_manual) { + manual_compaction_->status = bg_error_; + manual_compaction_->done = true; + manual_compaction_->in_progress = false; + manual_compaction_ = nullptr; + } + return bg_error_; + } + if (is_manual) { // another thread cannot pick up the same work manual_compaction_->in_progress = true; @@ -2204,27 +2085,28 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, return Status::OK(); } - // FLUSH preempts compaction - Status flush_stat; - for (auto cfd : *versions_->GetColumnFamilySet()) { - while (cfd->imm()->IsFlushPending()) { + // If there are no flush threads, then compaction thread needs to execute the + // flushes + if (db_options_.max_background_flushes == 0) { + // BackgroundFlush() will only execute a single flush. 
We keep calling it as + // long as there's more flushes to be done + while (!flush_queue_.empty()) { LogToBuffer( log_buffer, - "BackgroundCompaction doing FlushMemTableToOutputFile, " - "compaction slots available %d", - options_.max_background_compactions - bg_compaction_scheduled_); - cfd->Ref(); - flush_stat = FlushMemTableToOutputFile(cfd, madeProgress, deletion_state, - log_buffer); - cfd->Unref(); - if (!flush_stat.ok()) { + "BackgroundCompaction calling BackgroundFlush. flush slots available " + "%d, compaction slots available %d", + db_options_.max_background_flushes - bg_flush_scheduled_, + db_options_.max_background_compactions - bg_compaction_scheduled_); + auto flush_status = + BackgroundFlush(madeProgress, job_context, log_buffer); + if (!flush_status.ok()) { if (is_manual) { - manual_compaction_->status = flush_stat; + manual_compaction_->status = flush_status; manual_compaction_->done = true; manual_compaction_->in_progress = false; manual_compaction_ = nullptr; } - return flush_stat; + return flush_status; } } } @@ -2235,9 +2117,9 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, if (is_manual) { ManualCompaction* m = manual_compaction_; assert(m->in_progress); - c.reset(m->cfd->CompactRange(m->input_level, m->output_level, - m->output_path_id, m->begin, m->end, - &manual_end)); + c.reset(m->cfd->CompactRange( + *m->cfd->GetLatestMutableCFOptions(), m->input_level, m->output_level, + m->output_path_id, m->begin, m->end, &manual_end)); if (!c) { m->done = true; } @@ -2250,16 +2132,53 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, ((m->done || manual_end == nullptr) ? 
"(end)" : manual_end->DebugString().c_str())); - } else { - // no need to refcount in iteration since it's always under a mutex - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (!cfd->options()->disable_auto_compactions) { - c.reset(cfd->PickCompaction(log_buffer)); - if (c != nullptr) { - // update statistics - MeasureTime(stats_, NUM_FILES_IN_SINGLE_COMPACTION, - c->inputs(0)->size()); - break; + } else if (!compaction_queue_.empty()) { + // cfd is referenced here + auto cfd = PopFirstFromCompactionQueue(); + // We unreference here because the following code will take a Ref() on + // this cfd if it is going to use it (Compaction class holds a + // reference). + // This will all happen under a mutex so we don't have to be afraid of + // somebody else deleting it. + if (cfd->Unref()) { + delete cfd; + // This was the last reference of the column family, so no need to + // compact. + return Status::OK(); + } + + // Pick up latest mutable CF Options and use it throughout the + // compaction job + // Compaction makes a copy of the latest MutableCFOptions. It should be used + // throughout the compaction procedure to make sure consistency. It will + // eventually be installed into SuperVersion + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + if (!mutable_cf_options->disable_auto_compactions && !cfd->IsDropped()) { + // NOTE: try to avoid unnecessary copy of MutableCFOptions if + // compaction is not necessary. Need to make sure mutex is held + // until we make a copy in the following code + c.reset(cfd->PickCompaction(*mutable_cf_options, log_buffer)); + if (c != nullptr) { + // update statistics + MeasureTime(stats_, NUM_FILES_IN_SINGLE_COMPACTION, + c->inputs(0)->size()); + // There are three things that can change compaction score: + // 1) When flush or compaction finish. This case is covered by + // InstallSuperVersion() + // 2) When MutableCFOptions changes. 
This case is also covered by + // InstallSuperVersion(), because this is when the new options take + // effect. + // 3) When we Pick a new compaction, we "remove" those files being + // compacted from the calculation, which then influences compaction + // score. Here we check if we need the new compaction even without the + // files that are currently being compacted. If we need another + // compaction, we might be able to execute it in parallel, so we add it + // to the queue and schedule a new thread. + if (cfd->NeedsCompaction()) { + // Yes, we need more compactions! + AddToCompactionQueue(cfd); + ++unscheduled_compactions_; + MaybeScheduleFlushOrCompaction(); } } } @@ -2274,20 +2193,29 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, // file if there is alive snapshot pointing to it assert(c->num_input_files(1) == 0); assert(c->level() == 0); - assert(c->column_family_data()->options()->compaction_style == + assert(c->column_family_data()->ioptions()->compaction_style == kCompactionStyleFIFO); for (const auto& f : *c->inputs(0)) { c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); } - status = versions_->LogAndApply(c->column_family_data(), c->edit(), &mutex_, - db_directory_.get()); - InstallSuperVersion(c->column_family_data(), deletion_state); + status = versions_->LogAndApply(c->column_family_data(), + *c->mutable_cf_options(), c->edit(), + &mutex_, directories_.GetDbDir()); + InstallSuperVersionBackground(c->column_family_data(), job_context, + *c->mutable_cf_options()); LogToBuffer(log_buffer, "[%s] Deleted %d files\n", c->column_family_data()->GetName().c_str(), c->num_input_files(0)); - c->ReleaseCompactionFiles(status); *madeProgress = true; } else if (!is_manual && c->IsTrivialMove()) { + // Instrument for event update + // TODO(yhchiang): add op details for showing trivial-move. 
+ ThreadStatusUtil::SetColumnFamily(c->column_family_data()); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); +#ifndef NDEBUG + ThreadStatusUtil::TEST_OperationDelay(ThreadStatus::OP_COMPACTION); +#endif + // Move file to next level assert(c->num_input_files(0) == 1); FileMetaData* f = c->input(0, 0); @@ -2295,38 +2223,65 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, c->edit()->AddFile(c->level() + 1, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, f->smallest_seqno, f->largest_seqno); - status = versions_->LogAndApply(c->column_family_data(), c->edit(), &mutex_, - db_directory_.get()); - InstallSuperVersion(c->column_family_data(), deletion_state); - - Version::LevelSummaryStorage tmp; + status = versions_->LogAndApply(c->column_family_data(), + *c->mutable_cf_options(), c->edit(), + &mutex_, directories_.GetDbDir()); + // Use latest MutableCFOptions + InstallSuperVersionBackground(c->column_family_data(), job_context, + *c->mutable_cf_options()); + + VersionStorageInfo::LevelSummaryStorage tmp; + c->column_family_data()->internal_stats()->IncBytesMoved( + c->level() + 1, f->fd.GetFileSize()); LogToBuffer( - log_buffer, "[%s] Moved #%lld to level-%d %lld bytes %s: %s\n", - c->column_family_data()->GetName().c_str(), - static_cast(f->fd.GetNumber()), c->level() + 1, - static_cast(f->fd.GetFileSize()), - status.ToString().c_str(), c->input_version()->LevelSummary(&tmp)); - c->ReleaseCompactionFiles(status); + log_buffer, + "[%s] Moved #%" PRIu64 " to level-%d %" PRIu64 " bytes %s: %s\n", + c->column_family_data()->GetName().c_str(), f->fd.GetNumber(), + c->level() + 1, f->fd.GetFileSize(), status.ToString().c_str(), + c->column_family_data()->current()->storage_info()->LevelSummary(&tmp)); *madeProgress = true; + + // Clear Instrument + ThreadStatusUtil::ResetThreadStatus(); } else { - MaybeScheduleFlushOrCompaction(); // do more compaction work in parallel. 
- CompactionState* compact = new CompactionState(c.get()); - status = DoCompactionWork(compact, deletion_state, log_buffer); - CleanupCompaction(compact, status); + auto yield_callback = [&]() { + return CallFlushDuringCompaction(c->column_family_data(), + *c->mutable_cf_options(), job_context, + log_buffer); + }; + CompactionJob compaction_job( + c.get(), db_options_, *c->mutable_cf_options(), env_options_, + versions_.get(), &shutting_down_, log_buffer, directories_.GetDbDir(), + directories_.GetDataDir(c->GetOutputPathId()), stats_, &snapshots_, + is_snapshot_supported_, table_cache_, std::move(yield_callback)); + compaction_job.Prepare(); + mutex_.Unlock(); + status = compaction_job.Run(); + mutex_.Lock(); + compaction_job.Install(&status, &mutex_); + if (status.ok()) { + InstallSuperVersionBackground(c->column_family_data(), job_context, + *c->mutable_cf_options()); + } + *madeProgress = true; + } + // FIXME(orib): should I check if column family data is null? + if (c != nullptr) { + NotifyOnCompactionCompleted(c->column_family_data(), c.get(), status); c->ReleaseCompactionFiles(status); - c->ReleaseInputs(); *madeProgress = true; } + // this will unref its input_version and column_family_data c.reset(); if (status.ok()) { // Done - } else if (shutting_down_.Acquire_Load()) { + } else if (status.IsShutdownInProgress()) { // Ignore compaction errors found during shutting down } else { - Log(InfoLogLevel::WARN_LEVEL, options_.info_log, "Compaction error: %s", + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "Compaction error: %s", status.ToString().c_str()); - if (options_.paranoid_checks && bg_error_.ok()) { + if (db_options_.paranoid_checks && bg_error_.ok()) { bg_error_ = status; } } @@ -2357,8 +2312,8 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, // We only compacted part of the requested range. Update *m // to the range that is left to be compacted. 
// Universal and FIFO compactions should always compact the whole range - assert(m->cfd->options()->compaction_style != kCompactionStyleUniversal); - assert(m->cfd->options()->compaction_style != kCompactionStyleFIFO); + assert(m->cfd->ioptions()->compaction_style != kCompactionStyleUniversal); + assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO); m->tmp_storage = *manual_end; m->begin = &m->tmp_storage; } @@ -2368,944 +2323,37 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, return status; } -void DBImpl::CleanupCompaction(CompactionState* compact, Status status) { - mutex_.AssertHeld(); - if (compact->builder != nullptr) { - // May happen if we get a shutdown call in the middle of compaction - compact->builder->Abandon(); - compact->builder.reset(); - } else { - assert(compact->outfile == nullptr); - } - for (size_t i = 0; i < compact->outputs.size(); i++) { - const CompactionState::Output& out = compact->outputs[i]; - pending_outputs_.erase(out.number); - - // If this file was inserted into the table cache then remove - // them here because this compaction was not committed. - if (!status.ok()) { - TableCache::Evict(table_cache_.get(), out.number); - } - } - delete compact; -} - -// Allocate the file numbers for the output file. We allocate as -// many output file numbers as there are files in level+1 (at least one) -// Insert them into pending_outputs so that they do not get deleted. -void DBImpl::AllocateCompactionOutputFileNumbers(CompactionState* compact) { - mutex_.AssertHeld(); - assert(compact != nullptr); - assert(compact->builder == nullptr); - int filesNeeded = compact->compaction->num_input_files(1); - for (int i = 0; i < std::max(filesNeeded, 1); i++) { - uint64_t file_number = versions_->NewFileNumber(); - pending_outputs_[file_number] = compact->compaction->GetOutputPathId(); - compact->allocated_file_numbers.push_back(file_number); - } -} - -// Frees up unused file number. 
-void DBImpl::ReleaseCompactionUnusedFileNumbers(CompactionState* compact) { - mutex_.AssertHeld(); - for (const auto file_number : compact->allocated_file_numbers) { - pending_outputs_.erase(file_number); - } -} - -Status DBImpl::OpenCompactionOutputFile(CompactionState* compact) { - assert(compact != nullptr); - assert(compact->builder == nullptr); - uint64_t file_number; - // If we have not yet exhausted the pre-allocated file numbers, - // then use the one from the front. Otherwise, we have to acquire - // the heavyweight lock and allocate a new file number. - if (!compact->allocated_file_numbers.empty()) { - file_number = compact->allocated_file_numbers.front(); - compact->allocated_file_numbers.pop_front(); - } else { - mutex_.Lock(); - file_number = versions_->NewFileNumber(); - pending_outputs_[file_number] = compact->compaction->GetOutputPathId(); - mutex_.Unlock(); - } - CompactionState::Output out; - out.number = file_number; - out.path_id = compact->compaction->GetOutputPathId(); - out.smallest.Clear(); - out.largest.Clear(); - out.smallest_seqno = out.largest_seqno = 0; - compact->outputs.push_back(out); - - // Make the output file - std::string fname = TableFileName(options_.db_paths, file_number, - compact->compaction->GetOutputPathId()); - Status s = env_->NewWritableFile(fname, &compact->outfile, storage_options_); - - if (s.ok()) { - compact->outfile->SetIOPriority(Env::IO_LOW); - compact->outfile->SetPreallocationBlockSize( - compact->compaction->OutputFilePreallocationSize()); - - ColumnFamilyData* cfd = compact->compaction->column_family_data(); - compact->builder.reset(NewTableBuilder( - *cfd->options(), cfd->internal_comparator(), compact->outfile.get(), - compact->compaction->OutputCompressionType())); - } - LogFlush(options_.info_log); - return s; -} - -Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, - Iterator* input) { - assert(compact != nullptr); - assert(compact->outfile); - assert(compact->builder != nullptr); - - 
const uint64_t output_number = compact->current_output()->number; - const uint32_t output_path_id = compact->current_output()->path_id; - assert(output_number != 0); - - // Check for iterator errors - Status s = input->status(); - const uint64_t current_entries = compact->builder->NumEntries(); - if (s.ok()) { - s = compact->builder->Finish(); - } else { - compact->builder->Abandon(); - } - const uint64_t current_bytes = compact->builder->FileSize(); - compact->current_output()->file_size = current_bytes; - compact->total_bytes += current_bytes; - compact->builder.reset(); - - // Finish and check for file errors - if (s.ok() && !options_.disableDataSync) { - if (options_.use_fsync) { - StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); - s = compact->outfile->Fsync(); - } else { - StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); - s = compact->outfile->Sync(); - } - } - if (s.ok()) { - s = compact->outfile->Close(); - } - compact->outfile.reset(); - - if (s.ok() && current_entries > 0) { - // Verify that the table is usable - ColumnFamilyData* cfd = compact->compaction->column_family_data(); - FileDescriptor fd(output_number, output_path_id, current_bytes); - Iterator* iter = cfd->table_cache()->NewIterator( - ReadOptions(), storage_options_, cfd->internal_comparator(), fd); - s = iter->status(); - delete iter; - if (s.ok()) { - Log(options_.info_log, "[%s] Generated table #%" PRIu64 ": %" PRIu64 - " keys, %" PRIu64 " bytes", - cfd->GetName().c_str(), output_number, current_entries, - current_bytes); - } - } - return s; -} - - -Status DBImpl::InstallCompactionResults(CompactionState* compact, - LogBuffer* log_buffer) { - mutex_.AssertHeld(); - - // paranoia: verify that the files that we started with - // still exist in the current version and in the same original level. - // This ensures that a concurrent compaction did not erroneously - // pick the same files to compact. 
- if (!versions_->VerifyCompactionFileConsistency(compact->compaction)) { - Log(options_.info_log, "[%s] Compaction %d@%d + %d@%d files aborted", - compact->compaction->column_family_data()->GetName().c_str(), - compact->compaction->num_input_files(0), compact->compaction->level(), - compact->compaction->num_input_files(1), - compact->compaction->output_level()); - return Status::Corruption("Compaction input files inconsistent"); - } - - LogToBuffer(log_buffer, "[%s] Compacted %d@%d + %d@%d files => %lld bytes", - compact->compaction->column_family_data()->GetName().c_str(), - compact->compaction->num_input_files(0), - compact->compaction->level(), - compact->compaction->num_input_files(1), - compact->compaction->output_level(), - static_cast(compact->total_bytes)); - - // Add compaction outputs - compact->compaction->AddInputDeletions(compact->compaction->edit()); - for (size_t i = 0; i < compact->outputs.size(); i++) { - const CompactionState::Output& out = compact->outputs[i]; - compact->compaction->edit()->AddFile(compact->compaction->output_level(), - out.number, out.path_id, out.file_size, - out.smallest, out.largest, - out.smallest_seqno, out.largest_seqno); - } - return versions_->LogAndApply(compact->compaction->column_family_data(), - compact->compaction->edit(), &mutex_, - db_directory_.get()); -} - -// Given a sequence number, return the sequence number of the -// earliest snapshot that this sequence number is visible in. -// The snapshots themselves are arranged in ascending order of -// sequence numbers. -// Employ a sequential search because the total number of -// snapshots are typically small. 
-inline SequenceNumber DBImpl::findEarliestVisibleSnapshot( - SequenceNumber in, std::vector& snapshots, - SequenceNumber* prev_snapshot) { - SequenceNumber prev __attribute__((unused)) = 0; - for (const auto cur : snapshots) { - assert(prev <= cur); - if (cur >= in) { - *prev_snapshot = prev; - return cur; - } - prev = cur; // assignment - assert(prev); +uint64_t DBImpl::CallFlushDuringCompaction( + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + JobContext* job_context, LogBuffer* log_buffer) { + if (db_options_.max_background_flushes > 0) { + // flush thread will take care of this + return 0; } - Log(options_.info_log, - "Looking for seqid %" PRIu64 " but maxseqid is %" PRIu64 "", in, - snapshots[snapshots.size() - 1]); - assert(0); - return 0; -} - -uint64_t DBImpl::CallFlushDuringCompaction(ColumnFamilyData* cfd, - DeletionState& deletion_state, - LogBuffer* log_buffer) { - if (cfd->imm()->imm_flush_needed.NoBarrier_Load() != nullptr) { + if (cfd->imm()->imm_flush_needed.load(std::memory_order_relaxed)) { const uint64_t imm_start = env_->NowMicros(); mutex_.Lock(); if (cfd->imm()->IsFlushPending()) { cfd->Ref(); - FlushMemTableToOutputFile(cfd, nullptr, deletion_state, log_buffer); + FlushMemTableToOutputFile(cfd, mutable_cf_options, nullptr, job_context, + log_buffer); cfd->Unref(); - bg_cv_.SignalAll(); // Wakeup MakeRoomForWrite() if necessary + bg_cv_.SignalAll(); // Wakeup DelayWrite() if necessary } mutex_.Unlock(); log_buffer->FlushBufferToLog(); return env_->NowMicros() - imm_start; } - return 0; -} - -Status DBImpl::ProcessKeyValueCompaction( - bool is_snapshot_supported, - SequenceNumber visible_at_tip, - SequenceNumber earliest_snapshot, - SequenceNumber latest_snapshot, - DeletionState& deletion_state, - bool bottommost_level, - int64_t& imm_micros, - Iterator* input, - CompactionState* compact, - bool is_compaction_v2, - LogBuffer* log_buffer) { - size_t combined_idx = 0; - Status status; - std::string compaction_filter_value; 
- ParsedInternalKey ikey; - IterKey current_user_key; - bool has_current_user_key = false; - IterKey delete_key; - SequenceNumber last_sequence_for_key __attribute__((unused)) = - kMaxSequenceNumber; - SequenceNumber visible_in_snapshot = kMaxSequenceNumber; - ColumnFamilyData* cfd = compact->compaction->column_family_data(); - MergeHelper merge( - cfd->user_comparator(), cfd->options()->merge_operator.get(), - options_.info_log.get(), cfd->options()->min_partial_merge_operands, - false /* internal key corruption is expected */); - auto compaction_filter = cfd->options()->compaction_filter; - std::unique_ptr compaction_filter_from_factory = nullptr; - if (!compaction_filter) { - auto context = compact->GetFilterContextV1(); - compaction_filter_from_factory = - cfd->options()->compaction_filter_factory->CreateCompactionFilter( - context); - compaction_filter = compaction_filter_from_factory.get(); - } - - int64_t key_drop_user = 0; - int64_t key_drop_newer_entry = 0; - int64_t key_drop_obsolete = 0; - int64_t loop_cnt = 0; - while (input->Valid() && !shutting_down_.Acquire_Load() && - !cfd->IsDropped()) { - if (++loop_cnt > 1000) { - if (key_drop_user > 0) { - RecordTick(stats_, COMPACTION_KEY_DROP_USER, key_drop_user); - key_drop_user = 0; - } - if (key_drop_newer_entry > 0) { - RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY, - key_drop_newer_entry); - key_drop_newer_entry = 0; - } - if (key_drop_obsolete > 0) { - RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, key_drop_obsolete); - key_drop_obsolete = 0; - } - RecordCompactionIOStats(); - loop_cnt = 0; - } - // FLUSH preempts compaction - // TODO(icanadi) this currently only checks if flush is necessary on - // compacting column family. we should also check if flush is necessary on - // other column families, too - imm_micros += CallFlushDuringCompaction(cfd, deletion_state, log_buffer); - - Slice key; - Slice value; - // If is_compaction_v2 is on, kv-pairs are reset to the prefix batch. 
- // This prefix batch should contain results after calling - // compaction_filter_v2. - // - // If is_compaction_v2 is off, this function will go through all the - // kv-pairs in input. - if (!is_compaction_v2) { - key = input->key(); - value = input->value(); - } else { - if (combined_idx >= compact->combined_key_buf_.size()) { - break; - } - assert(combined_idx < compact->combined_key_buf_.size()); - key = compact->combined_key_buf_[combined_idx]; - value = compact->combined_value_buf_[combined_idx]; - - ++combined_idx; - } - - if (compact->compaction->ShouldStopBefore(key) && - compact->builder != nullptr) { - status = FinishCompactionOutputFile(compact, input); - if (!status.ok()) { - break; - } - } - - // Handle key/value, add to state, etc. - bool drop = false; - bool current_entry_is_merging = false; - if (!ParseInternalKey(key, &ikey)) { - // Do not hide error keys - // TODO: error key stays in db forever? Figure out the intention/rationale - // v10 error v8 : we cannot hide v8 even though it's pretty obvious. - current_user_key.Clear(); - has_current_user_key = false; - last_sequence_for_key = kMaxSequenceNumber; - visible_in_snapshot = kMaxSequenceNumber; - } else { - if (!has_current_user_key || - cfd->user_comparator()->Compare(ikey.user_key, - current_user_key.GetKey()) != 0) { - // First occurrence of this user key - current_user_key.SetKey(ikey.user_key); - has_current_user_key = true; - last_sequence_for_key = kMaxSequenceNumber; - visible_in_snapshot = kMaxSequenceNumber; - // apply the compaction filter to the first occurrence of the user key - if (compaction_filter && !is_compaction_v2 && - ikey.type == kTypeValue && - (visible_at_tip || ikey.sequence > latest_snapshot)) { - // If the user has specified a compaction filter and the sequence - // number is greater than any external snapshot, then invoke the - // filter. - // If the return value of the compaction filter is true, replace - // the entry with a delete marker. 
- bool value_changed = false; - compaction_filter_value.clear(); - bool to_delete = compaction_filter->Filter( - compact->compaction->level(), ikey.user_key, value, - &compaction_filter_value, &value_changed); - if (to_delete) { - // make a copy of the original key and convert it to a delete - delete_key.SetInternalKey(ExtractUserKey(key), ikey.sequence, - kTypeDeletion); - // anchor the key again - key = delete_key.GetKey(); - // needed because ikey is backed by key - ParseInternalKey(key, &ikey); - // no value associated with delete - value.clear(); - ++key_drop_user; - } else if (value_changed) { - value = compaction_filter_value; - } - } - } - - // If there are no snapshots, then this kv affect visibility at tip. - // Otherwise, search though all existing snapshots to find - // the earlist snapshot that is affected by this kv. - SequenceNumber prev_snapshot = 0; // 0 means no previous snapshot - SequenceNumber visible = visible_at_tip ? visible_at_tip : - is_snapshot_supported ? findEarliestVisibleSnapshot(ikey.sequence, - compact->existing_snapshots, &prev_snapshot) - : 0; - - if (visible_in_snapshot == visible) { - // If the earliest snapshot is which this key is visible in - // is the same as the visibily of a previous instance of the - // same key, then this kv is not visible in any snapshot. - // Hidden by an newer entry for same user key - // TODO: why not > ? - assert(last_sequence_for_key >= ikey.sequence); - drop = true; // (A) - ++key_drop_newer_entry; - } else if (ikey.type == kTypeDeletion && - ikey.sequence <= earliest_snapshot && - compact->compaction->KeyNotExistsBeyondOutputLevel(ikey.user_key)) { - // For this user key: - // (1) there is no data in higher levels - // (2) data in lower levels will have larger sequence numbers - // (3) data in layers that are being compacted here and have - // smaller sequence numbers will be dropped in the next - // few iterations of this loop (by rule (A) above). 
- // Therefore this deletion marker is obsolete and can be dropped. - drop = true; - ++key_drop_obsolete; - } else if (ikey.type == kTypeMerge) { - if (!merge.HasOperator()) { - LogToBuffer(log_buffer, "Options::merge_operator is null."); - status = Status::InvalidArgument( - "merge_operator is not properly initialized."); - break; - } - // We know the merge type entry is not hidden, otherwise we would - // have hit (A) - // We encapsulate the merge related state machine in a different - // object to minimize change to the existing flow. Turn out this - // logic could also be nicely re-used for memtable flush purge - // optimization in BuildTable. - int steps = 0; - merge.MergeUntil(input, prev_snapshot, bottommost_level, - options_.statistics.get(), &steps); - // Skip the Merge ops - combined_idx = combined_idx - 1 + steps; - - current_entry_is_merging = true; - if (merge.IsSuccess()) { - // Successfully found Put/Delete/(end-of-key-range) while merging - // Get the merge result - key = merge.key(); - ParseInternalKey(key, &ikey); - value = merge.value(); - } else { - // Did not find a Put/Delete/(end-of-key-range) while merging - // We now have some stack of merge operands to write out. - // NOTE: key,value, and ikey are now referring to old entries. - // These will be correctly set below. - assert(!merge.keys().empty()); - assert(merge.keys().size() == merge.values().size()); - - // Hack to make sure last_sequence_for_key is correct - ParseInternalKey(merge.keys().front(), &ikey); - } - } - - last_sequence_for_key = ikey.sequence; - visible_in_snapshot = visible; - } - - if (!drop) { - // We may write a single key (e.g.: for Put/Delete or successful merge). - // Or we may instead have to write a sequence/list of keys. 
- // We have to write a sequence iff we have an unsuccessful merge - bool has_merge_list = current_entry_is_merging && !merge.IsSuccess(); - const std::deque* keys = nullptr; - const std::deque* values = nullptr; - std::deque::const_reverse_iterator key_iter; - std::deque::const_reverse_iterator value_iter; - if (has_merge_list) { - keys = &merge.keys(); - values = &merge.values(); - key_iter = keys->rbegin(); // The back (*rbegin()) is the first key - value_iter = values->rbegin(); - - key = Slice(*key_iter); - value = Slice(*value_iter); - } - - // If we have a list of keys to write, traverse the list. - // If we have a single key to write, simply write that key. - while (true) { - // Invariant: key,value,ikey will always be the next entry to write - char* kptr = (char*)key.data(); - std::string kstr; - - // Zeroing out the sequence number leads to better compression. - // If this is the bottommost level (no files in lower levels) - // and the earliest snapshot is larger than this seqno - // then we can squash the seqno to zero. 
- if (bottommost_level && ikey.sequence < earliest_snapshot && - ikey.type != kTypeMerge) { - assert(ikey.type != kTypeDeletion); - // make a copy because updating in place would cause problems - // with the priority queue that is managing the input key iterator - kstr.assign(key.data(), key.size()); - kptr = (char *)kstr.c_str(); - UpdateInternalKey(kptr, key.size(), (uint64_t)0, ikey.type); - } - - Slice newkey(kptr, key.size()); - assert((key.clear(), 1)); // we do not need 'key' anymore - - // Open output file if necessary - if (compact->builder == nullptr) { - status = OpenCompactionOutputFile(compact); - if (!status.ok()) { - break; - } - } - - SequenceNumber seqno = GetInternalKeySeqno(newkey); - if (compact->builder->NumEntries() == 0) { - compact->current_output()->smallest.DecodeFrom(newkey); - compact->current_output()->smallest_seqno = seqno; - } else { - compact->current_output()->smallest_seqno = - std::min(compact->current_output()->smallest_seqno, seqno); - } - compact->current_output()->largest.DecodeFrom(newkey); - compact->builder->Add(newkey, value); - compact->current_output()->largest_seqno = - std::max(compact->current_output()->largest_seqno, seqno); - - // Close output file if it is big enough - if (compact->builder->FileSize() >= - compact->compaction->MaxOutputFileSize()) { - status = FinishCompactionOutputFile(compact, input); - if (!status.ok()) { - break; - } - } - - // If we have a list of entries, move to next element - // If we only had one entry, then break the loop. - if (has_merge_list) { - ++key_iter; - ++value_iter; - - // If at end of list - if (key_iter == keys->rend() || value_iter == values->rend()) { - // Sanity Check: if one ends, then both end - assert(key_iter == keys->rend() && value_iter == values->rend()); - break; - } - - // Otherwise not at end of list. Update key, value, and ikey. 
- key = Slice(*key_iter); - value = Slice(*value_iter); - ParseInternalKey(key, &ikey); - - } else{ - // Only had one item to begin with (Put/Delete) - break; - } - } - } - - // MergeUntil has moved input to the next entry - if (!current_entry_is_merging) { - input->Next(); - } - } - if (key_drop_user > 0) { - RecordTick(stats_, COMPACTION_KEY_DROP_USER, key_drop_user); - } - if (key_drop_newer_entry > 0) { - RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY, key_drop_newer_entry); - } - if (key_drop_obsolete > 0) { - RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE, key_drop_obsolete); - } - RecordCompactionIOStats(); - - return status; -} - -void DBImpl::CallCompactionFilterV2(CompactionState* compact, - CompactionFilterV2* compaction_filter_v2) { - if (compact == nullptr || compaction_filter_v2 == nullptr) { - return; - } - - // Assemble slice vectors for user keys and existing values. - // We also keep track of our parsed internal key structs because - // we may need to access the sequence number in the event that - // keys are garbage collected during the filter process. - std::vector ikey_buf; - std::vector user_key_buf; - std::vector existing_value_buf; - - for (const auto& key : compact->key_str_buf_) { - ParsedInternalKey ikey; - ParseInternalKey(Slice(key), &ikey); - ikey_buf.emplace_back(ikey); - user_key_buf.emplace_back(ikey.user_key); - } - for (const auto& value : compact->existing_value_str_buf_) { - existing_value_buf.emplace_back(Slice(value)); - } - - // If the user has specified a compaction filter and the sequence - // number is greater than any external snapshot, then invoke the - // filter. - // If the return value of the compaction filter is true, replace - // the entry with a delete marker. - compact->to_delete_buf_ = compaction_filter_v2->Filter( - compact->compaction->level(), - user_key_buf, existing_value_buf, - &compact->new_value_buf_, - &compact->value_changed_buf_); - - // new_value_buf_.size() <= to_delete__buf_.size(). 
"=" iff all - // kv-pairs in this compaction run needs to be deleted. - assert(compact->to_delete_buf_.size() == - compact->key_str_buf_.size()); - assert(compact->to_delete_buf_.size() == - compact->existing_value_str_buf_.size()); - assert(compact->to_delete_buf_.size() == - compact->value_changed_buf_.size()); - - int new_value_idx = 0; - for (unsigned int i = 0; i < compact->to_delete_buf_.size(); ++i) { - if (compact->to_delete_buf_[i]) { - // update the string buffer directly - // the Slice buffer points to the updated buffer - UpdateInternalKey(&compact->key_str_buf_[i][0], - compact->key_str_buf_[i].size(), - ikey_buf[i].sequence, - kTypeDeletion); - - // no value associated with delete - compact->existing_value_str_buf_[i].clear(); - RecordTick(stats_, COMPACTION_KEY_DROP_USER); - } else if (compact->value_changed_buf_[i]) { - compact->existing_value_str_buf_[i] = - compact->new_value_buf_[new_value_idx++]; - } - } // for -} - -Status DBImpl::DoCompactionWork(CompactionState* compact, - DeletionState& deletion_state, - LogBuffer* log_buffer) { - assert(compact); - compact->CleanupBatchBuffer(); - compact->CleanupMergedBuffer(); - bool prefix_initialized = false; - - // Generate file_levels_ for compaction berfore making Iterator - compact->compaction->GenerateFileLevels(); - int64_t imm_micros = 0; // Micros spent doing imm_ compactions - ColumnFamilyData* cfd = compact->compaction->column_family_data(); - LogToBuffer( - log_buffer, - "[%s] Compacting %d@%d + %d@%d files, score %.2f slots available %d", - cfd->GetName().c_str(), compact->compaction->num_input_files(0), - compact->compaction->level(), compact->compaction->num_input_files(1), - compact->compaction->output_level(), compact->compaction->score(), - options_.max_background_compactions - bg_compaction_scheduled_); - char scratch[2345]; - compact->compaction->Summary(scratch, sizeof(scratch)); - LogToBuffer(log_buffer, "[%s] Compaction start summary: %s\n", - cfd->GetName().c_str(), scratch); - - 
assert(cfd->current()->NumLevelFiles(compact->compaction->level()) > 0); - assert(compact->builder == nullptr); - assert(!compact->outfile); - - SequenceNumber visible_at_tip = 0; - SequenceNumber earliest_snapshot; - SequenceNumber latest_snapshot = 0; - snapshots_.getAll(compact->existing_snapshots); - if (compact->existing_snapshots.size() == 0) { - // optimize for fast path if there are no snapshots - visible_at_tip = versions_->LastSequence(); - earliest_snapshot = visible_at_tip; - } else { - latest_snapshot = compact->existing_snapshots.back(); - // Add the current seqno as the 'latest' virtual - // snapshot to the end of this list. - compact->existing_snapshots.push_back(versions_->LastSequence()); - earliest_snapshot = compact->existing_snapshots[0]; - } - - // Is this compaction producing files at the bottommost level? - bool bottommost_level = compact->compaction->BottomMostLevel(); - - // Allocate the output file numbers before we release the lock - AllocateCompactionOutputFileNumbers(compact); - - bool is_snapshot_supported = IsSnapshotSupported(); - // Release mutex while we're actually doing the compaction work - mutex_.Unlock(); - log_buffer->FlushBufferToLog(); - - const uint64_t start_micros = env_->NowMicros(); - unique_ptr input(versions_->MakeInputIterator(compact->compaction)); - input->SeekToFirst(); - shared_ptr backup_input( - versions_->MakeInputIterator(compact->compaction)); - backup_input->SeekToFirst(); - - Status status; - ParsedInternalKey ikey; - std::unique_ptr compaction_filter_from_factory_v2 - = nullptr; - auto context = compact->GetFilterContext(); - compaction_filter_from_factory_v2 = - cfd->options()->compaction_filter_factory_v2->CreateCompactionFilterV2( - context); - auto compaction_filter_v2 = - compaction_filter_from_factory_v2.get(); - - // temp_backup_input always point to the start of the current buffer - // temp_backup_input = backup_input; - // iterate through input, - // 1) buffer ineligible keys and value keys 
into 2 separate buffers; - // 2) send value_buffer to compaction filter and alternate the values; - // 3) merge value_buffer with ineligible_value_buffer; - // 4) run the modified "compaction" using the old for loop. - if (compaction_filter_v2) { - while (backup_input->Valid() && !shutting_down_.Acquire_Load() && - !cfd->IsDropped()) { - // FLUSH preempts compaction - // TODO(icanadi) this currently only checks if flush is necessary on - // compacting column family. we should also check if flush is necessary on - // other column families, too - imm_micros += CallFlushDuringCompaction(cfd, deletion_state, log_buffer); - - Slice key = backup_input->key(); - Slice value = backup_input->value(); - - if (!ParseInternalKey(key, &ikey)) { - // log error - Log(options_.info_log, "[%s] Failed to parse key: %s", - cfd->GetName().c_str(), key.ToString().c_str()); - continue; - } else { - const SliceTransform* transformer = - cfd->options()->compaction_filter_factory_v2->GetPrefixExtractor(); - const auto key_prefix = transformer->Transform(ikey.user_key); - if (!prefix_initialized) { - compact->cur_prefix_ = key_prefix.ToString(); - prefix_initialized = true; - } - // If the prefix remains the same, keep buffering - if (key_prefix.compare(Slice(compact->cur_prefix_)) == 0) { - // Apply the compaction filter V2 to all the kv pairs sharing - // the same prefix - if (ikey.type == kTypeValue && - (visible_at_tip || ikey.sequence > latest_snapshot)) { - // Buffer all keys sharing the same prefix for CompactionFilterV2 - // Iterate through keys to check prefix - compact->BufferKeyValueSlices(key, value); - } else { - // buffer ineligible keys - compact->BufferOtherKeyValueSlices(key, value); - } - backup_input->Next(); - continue; - // finish changing values for eligible keys - } else { - // Now prefix changes, this batch is done. 
- // Call compaction filter on the buffered values to change the value - if (compact->key_str_buf_.size() > 0) { - CallCompactionFilterV2(compact, compaction_filter_v2); - } - compact->cur_prefix_ = key_prefix.ToString(); - } - } - - // Merge this batch of data (values + ineligible keys) - compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); - - // Done buffering for the current prefix. Spit it out to disk - // Now just iterate through all the kv-pairs - status = ProcessKeyValueCompaction( - is_snapshot_supported, - visible_at_tip, - earliest_snapshot, - latest_snapshot, - deletion_state, - bottommost_level, - imm_micros, - input.get(), - compact, - true, - log_buffer); - - if (!status.ok()) { - break; - } - - // After writing the kv-pairs, we can safely remove the reference - // to the string buffer and clean them up - compact->CleanupBatchBuffer(); - compact->CleanupMergedBuffer(); - // Buffer the key that triggers the mismatch in prefix - if (ikey.type == kTypeValue && - (visible_at_tip || ikey.sequence > latest_snapshot)) { - compact->BufferKeyValueSlices(key, value); - } else { - compact->BufferOtherKeyValueSlices(key, value); - } - backup_input->Next(); - if (!backup_input->Valid()) { - // If this is the single last value, we need to merge it. 
- if (compact->key_str_buf_.size() > 0) { - CallCompactionFilterV2(compact, compaction_filter_v2); - } - compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); - - status = ProcessKeyValueCompaction( - is_snapshot_supported, - visible_at_tip, - earliest_snapshot, - latest_snapshot, - deletion_state, - bottommost_level, - imm_micros, - input.get(), - compact, - true, - log_buffer); - - compact->CleanupBatchBuffer(); - compact->CleanupMergedBuffer(); - } - } // done processing all prefix batches - // finish the last batch - if (compact->key_str_buf_.size() > 0) { - CallCompactionFilterV2(compact, compaction_filter_v2); - } - compact->MergeKeyValueSliceBuffer(&cfd->internal_comparator()); - status = ProcessKeyValueCompaction( - is_snapshot_supported, - visible_at_tip, - earliest_snapshot, - latest_snapshot, - deletion_state, - bottommost_level, - imm_micros, - input.get(), - compact, - true, - log_buffer); - } // checking for compaction filter v2 - - if (!compaction_filter_v2) { - status = ProcessKeyValueCompaction( - is_snapshot_supported, - visible_at_tip, - earliest_snapshot, - latest_snapshot, - deletion_state, - bottommost_level, - imm_micros, - input.get(), - compact, - false, - log_buffer); - } - - if (status.ok() && (shutting_down_.Acquire_Load() || cfd->IsDropped())) { - status = Status::ShutdownInProgress( - "Database shutdown or Column family drop during compaction"); - } - if (status.ok() && compact->builder != nullptr) { - status = FinishCompactionOutputFile(compact, input.get()); - } - if (status.ok()) { - status = input->status(); - } - input.reset(); - - if (!options_.disableDataSync) { - db_directory_->Fsync(); - } - - InternalStats::CompactionStats stats(1); - stats.micros = env_->NowMicros() - start_micros - imm_micros; - stats.files_in_leveln = compact->compaction->num_input_files(0); - stats.files_in_levelnp1 = compact->compaction->num_input_files(1); - MeasureTime(stats_, COMPACTION_TIME, stats.micros); - - int num_output_files = 
compact->outputs.size(); - if (compact->builder != nullptr) { - // An error occurred so ignore the last output. - assert(num_output_files > 0); - --num_output_files; - } - stats.files_out_levelnp1 = num_output_files; - - for (int i = 0; i < compact->compaction->num_input_files(0); i++) { - stats.bytes_readn += compact->compaction->input(0, i)->fd.GetFileSize(); - } - - for (int i = 0; i < compact->compaction->num_input_files(1); i++) { - stats.bytes_readnp1 += compact->compaction->input(1, i)->fd.GetFileSize(); - } - - for (int i = 0; i < num_output_files; i++) { - stats.bytes_written += compact->outputs[i].file_size; - } - - RecordCompactionIOStats(); - - LogFlush(options_.info_log); - mutex_.Lock(); - cfd->internal_stats()->AddCompactionStats( - compact->compaction->output_level(), stats); - - // if there were any unused file number (mostly in case of - // compaction error), free up the entry from pending_putputs - ReleaseCompactionUnusedFileNumbers(compact); - - if (status.ok()) { - status = InstallCompactionResults(compact, log_buffer); - InstallSuperVersion(cfd, deletion_state); - } - Version::LevelSummaryStorage tmp; - LogToBuffer( - log_buffer, - "[%s] compacted to: %s, %.1f MB/sec, level %d, files in(%d, %d) out(%d) " - "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " - "write-amplify(%.1f) %s\n", - cfd->GetName().c_str(), cfd->current()->LevelSummary(&tmp), - (stats.bytes_readn + stats.bytes_readnp1 + stats.bytes_written) / - (double)stats.micros, - compact->compaction->output_level(), stats.files_in_leveln, - stats.files_in_levelnp1, stats.files_out_levelnp1, - stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0, - stats.bytes_written / 1048576.0, - (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) / - (double)stats.bytes_readn, - stats.bytes_written / (double)stats.bytes_readn, - status.ToString().c_str()); - - return status; + return 0; } namespace { struct IterState { - IterState(DBImpl* db, port::Mutex* mu, 
SuperVersion* super_version) - : db(db), mu(mu), super_version(super_version) {} + IterState(DBImpl* _db, InstrumentedMutex* _mu, SuperVersion* _super_version) + : db(_db), mu(_mu), super_version(_super_version) {} DBImpl* db; - port::Mutex* mu; + InstrumentedMutex* mu; SuperVersion* super_version; }; @@ -3313,53 +2361,41 @@ static void CleanupIteratorState(void* arg1, void* arg2) { IterState* state = reinterpret_cast(arg1); if (state->super_version->Unref()) { - DBImpl::DeletionState deletion_state; + JobContext job_context; state->mu->Lock(); state->super_version->Cleanup(); - state->db->FindObsoleteFiles(deletion_state, false, true); + state->db->FindObsoleteFiles(&job_context, false, true); state->mu->Unlock(); delete state->super_version; - if (deletion_state.HaveSomethingToDelete()) { - state->db->PurgeObsoleteFiles(deletion_state); + if (job_context.HaveSomethingToDelete()) { + state->db->PurgeObsoleteFiles(job_context); } + job_context.Clean(); } delete state; } } // namespace -Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, +Iterator* DBImpl::NewInternalIterator(const ReadOptions& read_options, ColumnFamilyData* cfd, SuperVersion* super_version, Arena* arena) { Iterator* internal_iter; - if (arena != nullptr) { - // Need to create internal iterator from the arena. - MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), arena); - // Collect iterator for mutable mem - merge_iter_builder.AddIterator( - super_version->mem->NewIterator(options, arena)); - // Collect all needed child iterators for immutable memtables - super_version->imm->AddIterators(options, &merge_iter_builder); - // Collect iterators for files in L0 - Ln - super_version->current->AddIterators(options, storage_options_, - &merge_iter_builder); - internal_iter = merge_iter_builder.Finish(); - } else { - // Need to create internal iterator using malloc. 
- std::vector iterator_list; - // Collect iterator for mutable mem - iterator_list.push_back(super_version->mem->NewIterator(options)); - // Collect all needed child iterators for immutable memtables - super_version->imm->AddIterators(options, &iterator_list); - // Collect iterators for files in L0 - Ln - super_version->current->AddIterators(options, storage_options_, - &iterator_list); - internal_iter = NewMergingIterator(&cfd->internal_comparator(), - &iterator_list[0], iterator_list.size()); - } + assert(arena != nullptr); + // Need to create internal iterator from the arena. + MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), arena); + // Collect iterator for mutable mem + merge_iter_builder.AddIterator( + super_version->mem->NewIterator(read_options, arena)); + // Collect all needed child iterators for immutable memtables + super_version->imm->AddIterators(read_options, &merge_iter_builder); + // Collect iterators for files in L0 - Ln + super_version->current->AddIterators(read_options, env_options_, + &merge_iter_builder); + internal_iter = merge_iter_builder.Finish(); IterState* cleanup = new IterState(this, &mutex_, super_version); internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); @@ -3370,47 +2406,82 @@ ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const { return default_cf_handle_; } -Status DBImpl::Get(const ReadOptions& options, +Status DBImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value) { - return GetImpl(options, column_family, key, value); + return GetImpl(read_options, column_family, key, value); } -// DeletionState gets created and destructed outside of the lock -- we +// JobContext gets created and destructed outside of the lock -- +// we // use this convinently to: // * malloc one SuperVersion() outside of the lock -- new_superversion // * delete SuperVersion()s outside of the lock -- superversions_to_free // -// However, if 
InstallSuperVersion() gets called twice with the same, -// deletion_state, we can't reuse the SuperVersion() that got malloced because +// However, if InstallSuperVersion() gets called twice with the same +// job_context, we can't reuse the SuperVersion() that got +// malloced +// because // first call already used it. In that rare case, we take a hit and create a // new SuperVersion() inside of the mutex. We do similar thing // for superversion_to_free -void DBImpl::InstallSuperVersion(ColumnFamilyData* cfd, - DeletionState& deletion_state) { +void DBImpl::InstallSuperVersionBackground( + ColumnFamilyData* cfd, JobContext* job_context, + const MutableCFOptions& mutable_cf_options) { + mutex_.AssertHeld(); + SuperVersion* old_superversion = InstallSuperVersion( + cfd, job_context->new_superversion, mutable_cf_options); + job_context->new_superversion = nullptr; + job_context->superversions_to_free.push_back(old_superversion); +} + +SuperVersion* DBImpl::InstallSuperVersion( + ColumnFamilyData* cfd, SuperVersion* new_sv, + const MutableCFOptions& mutable_cf_options, bool dont_schedule_bg_work) { mutex_.AssertHeld(); - // if new_superversion == nullptr, it means somebody already used it - SuperVersion* new_superversion = - (deletion_state.new_superversion != nullptr) ? - deletion_state.new_superversion : new SuperVersion(); - SuperVersion* old_superversion = - cfd->InstallSuperVersion(new_superversion, &mutex_); - deletion_state.new_superversion = nullptr; - deletion_state.superversions_to_free.push_back(old_superversion); + + // Update max_total_in_memory_state_ + size_t old_memtable_size = 0; + auto* old_sv = cfd->GetSuperVersion(); + if (old_sv) { + old_memtable_size = old_sv->mutable_cf_options.write_buffer_size * + old_sv->mutable_cf_options.max_write_buffer_number; + } + + auto* old = cfd->InstallSuperVersion( + new_sv ? 
new_sv : new SuperVersion(), &mutex_, mutable_cf_options); + + // Whenever we install new SuperVersion, we might need to issue new flushes or + // compactions. dont_schedule_bg_work is true when scheduling from write + // thread and we don't want to add additional overhead. Callers promise to + // call SchedulePendingFlush() and MaybeScheduleFlushOrCompaction() eventually + if (!dont_schedule_bg_work) { + SchedulePendingFlush(cfd); + SchedulePendingCompaction(cfd); + MaybeScheduleFlushOrCompaction(); + } + + // Update max_total_in_memory_state_ + max_total_in_memory_state_ = + max_total_in_memory_state_ - old_memtable_size + + mutable_cf_options.write_buffer_size * + mutable_cf_options.max_write_buffer_number; + return old; } -Status DBImpl::GetImpl(const ReadOptions& options, +Status DBImpl::GetImpl(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value, bool* value_found) { StopWatch sw(env_, stats_, DB_GET); - PERF_TIMER_AUTO(get_snapshot_time); + PERF_TIMER_GUARD(get_snapshot_time); auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); SequenceNumber snapshot; - if (options.snapshot != nullptr) { - snapshot = reinterpret_cast(options.snapshot)->number_; + if (read_options.snapshot != nullptr) { + snapshot = reinterpret_cast( + read_options.snapshot)->number_; } else { snapshot = versions_->LastSequence(); } @@ -3427,37 +2498,38 @@ Status DBImpl::GetImpl(const ReadOptions& options, // merge_operands will contain the sequence of merges in the latter case. 
LookupKey lkey(key, snapshot); PERF_TIMER_STOP(get_snapshot_time); - if (sv->mem->Get(lkey, value, &s, merge_context, *cfd->options())) { + + if (sv->mem->Get(lkey, value, &s, &merge_context)) { // Done RecordTick(stats_, MEMTABLE_HIT); - } else if (sv->imm->Get(lkey, value, &s, merge_context, *cfd->options())) { + } else if (sv->imm->Get(lkey, value, &s, &merge_context)) { // Done RecordTick(stats_, MEMTABLE_HIT); } else { - PERF_TIMER_START(get_from_output_files_time); - - sv->current->Get(options, lkey, value, &s, &merge_context, value_found); - PERF_TIMER_STOP(get_from_output_files_time); + PERF_TIMER_GUARD(get_from_output_files_time); + sv->current->Get(read_options, lkey, value, &s, &merge_context, + value_found); RecordTick(stats_, MEMTABLE_MISS); } - PERF_TIMER_START(get_post_process_time); + { + PERF_TIMER_GUARD(get_post_process_time); - ReturnAndCleanupSuperVersion(cfd, sv); + ReturnAndCleanupSuperVersion(cfd, sv); - RecordTick(stats_, NUMBER_KEYS_READ); - RecordTick(stats_, BYTES_READ, value->size()); - PERF_TIMER_STOP(get_post_process_time); + RecordTick(stats_, NUMBER_KEYS_READ); + RecordTick(stats_, BYTES_READ, value->size()); + } return s; } std::vector DBImpl::MultiGet( - const ReadOptions& options, + const ReadOptions& read_options, const std::vector& column_family, const std::vector& keys, std::vector* values) { StopWatch sw(env_, stats_, DB_MULTIGET); - PERF_TIMER_AUTO(get_snapshot_time); + PERF_TIMER_GUARD(get_snapshot_time); SequenceNumber snapshot; @@ -3478,8 +2550,9 @@ std::vector DBImpl::MultiGet( } mutex_.Lock(); - if (options.snapshot != nullptr) { - snapshot = reinterpret_cast(options.snapshot)->number_; + if (read_options.snapshot != nullptr) { + snapshot = reinterpret_cast( + read_options.snapshot)->number_; } else { snapshot = versions_->LastSequence(); } @@ -3516,15 +2589,14 @@ std::vector DBImpl::MultiGet( assert(mgd_iter != multiget_cf_data.end()); auto mgd = mgd_iter->second; auto super_version = mgd->super_version; - auto cfd = 
mgd->cfd; - if (super_version->mem->Get(lkey, value, &s, merge_context, - *cfd->options())) { + if (super_version->mem->Get(lkey, value, &s, &merge_context)) { // Done - } else if (super_version->imm->Get(lkey, value, &s, merge_context, - *cfd->options())) { + } else if (super_version->imm->Get(lkey, value, &s, &merge_context)) { // Done } else { - super_version->current->Get(options, lkey, value, &s, &merge_context); + PERF_TIMER_GUARD(get_from_output_files_time); + super_version->current->Get(read_options, lkey, value, &s, + &merge_context); } if (s.ok()) { @@ -3533,7 +2605,7 @@ std::vector DBImpl::MultiGet( } // Post processing (decrement reference counts and record statistics) - PERF_TIMER_START(get_post_process_time); + PERF_TIMER_GUARD(get_post_process_time); autovector superversions_to_delete; // TODO(icanadi) do we need lock here or just around Cleanup()? @@ -3562,41 +2634,66 @@ std::vector DBImpl::MultiGet( return stat_list; } -Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options, +Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options, const std::string& column_family_name, ColumnFamilyHandle** handle) { + Status s; *handle = nullptr; - MutexLock l(&mutex_); + { + InstrumentedMutexLock l(&mutex_); - if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) != - nullptr) { - return Status::InvalidArgument("Column family already exists"); - } - VersionEdit edit; - edit.AddColumnFamily(column_family_name); - uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID(); - edit.SetColumnFamily(new_id); - edit.SetLogNumber(logfile_number_); - edit.SetComparatorName(options.comparator->Name()); - - // LogAndApply will both write the creation in MANIFEST and create - // ColumnFamilyData object - Status s = versions_->LogAndApply(nullptr, &edit, &mutex_, - db_directory_.get(), false, &options); + if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) != + nullptr) { + return 
Status::InvalidArgument("Column family already exists"); + } + VersionEdit edit; + edit.AddColumnFamily(column_family_name); + uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID(); + edit.SetColumnFamily(new_id); + edit.SetLogNumber(logfile_number_); + edit.SetComparatorName(cf_options.comparator->Name()); + + // LogAndApply will both write the creation in MANIFEST and create + // ColumnFamilyData object + Options opt(db_options_, cf_options); + { // write thread + WriteThread::Writer w(&mutex_); + s = write_thread_.EnterWriteThread(&w, 0); + assert(s.ok() && !w.done); // No timeout and nobody should do our job + // LogAndApply will both write the creation in MANIFEST and create + // ColumnFamilyData object + s = versions_->LogAndApply( + nullptr, MutableCFOptions(opt, ImmutableCFOptions(opt)), &edit, + &mutex_, directories_.GetDbDir(), false, &cf_options); + write_thread_.ExitWriteThread(&w, &w, s); + } + if (s.ok()) { + single_column_family_mode_ = false; + auto* cfd = + versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); + assert(cfd != nullptr); + delete InstallSuperVersion( + cfd, nullptr, *cfd->GetLatestMutableCFOptions()); + + if (!cfd->mem()->IsSnapshotSupported()) { + is_snapshot_supported_ = false; + } + + *handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_); + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Created column family [%s] (ID %u)", + column_family_name.c_str(), (unsigned)cfd->GetID()); + } else { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Creating column family [%s] FAILED -- %s", + column_family_name.c_str(), s.ToString().c_str()); + } + } // InstrumentedMutexLock l(&mutex_) + + // this is outside the mutex if (s.ok()) { - single_column_family_mode_ = false; - auto cfd = - versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name); - assert(cfd != nullptr); - delete cfd->InstallSuperVersion(new SuperVersion(), &mutex_); - *handle = new ColumnFamilyHandleImpl(cfd, 
this, &mutex_); - Log(options_.info_log, "Created column family [%s] (ID %u)", - column_family_name.c_str(), (unsigned)cfd->GetID()); - max_total_in_memory_state_ += cfd->options()->write_buffer_size * - cfd->options()->max_write_buffer_number; - } else { - Log(options_.info_log, "Creating column family [%s] FAILED -- %s", - column_family_name.c_str(), s.ToString().c_str()); + NewThreadStatusCfInfo( + reinterpret_cast(*handle)->cfd()); } return s; } @@ -3608,42 +2705,71 @@ Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) { return Status::InvalidArgument("Can't drop default column family"); } + bool cf_support_snapshot = cfd->mem()->IsSnapshotSupported(); + VersionEdit edit; edit.DropColumnFamily(); edit.SetColumnFamily(cfd->GetID()); Status s; { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); if (cfd->IsDropped()) { s = Status::InvalidArgument("Column family already dropped!\n"); } if (s.ok()) { - s = versions_->LogAndApply(cfd, &edit, &mutex_); + // we drop column family from a single write thread + WriteThread::Writer w(&mutex_); + s = write_thread_.EnterWriteThread(&w, 0); + assert(s.ok() && !w.done); // No timeout and nobody should do our job + s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + &edit, &mutex_); + write_thread_.ExitWriteThread(&w, &w, s); + } + + if (!cf_support_snapshot) { + // Dropped Column Family doesn't support snapshot. Need to recalculate + // is_snapshot_supported_. + bool new_is_snapshot_supported = true; + for (auto c : *versions_->GetColumnFamilySet()) { + if (!c->mem()->IsSnapshotSupported()) { + new_is_snapshot_supported = false; + break; + } + } + is_snapshot_supported_ = new_is_snapshot_supported; } } if (s.ok()) { + // Note that here we erase the associated cf_info of the to-be-dropped + // cfd before its ref-count goes to zero to avoid having to erase cf_info + // later inside db_mutex. 
+ EraseThreadStatusCfInfo(cfd); assert(cfd->IsDropped()); - max_total_in_memory_state_ -= cfd->options()->write_buffer_size * - cfd->options()->max_write_buffer_number; - Log(options_.info_log, "Dropped column family with id %u\n", cfd->GetID()); + auto* mutable_cf_options = cfd->GetLatestMutableCFOptions(); + max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size * + mutable_cf_options->max_write_buffer_number; + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Dropped column family with id %u\n", + cfd->GetID()); } else { - Log(options_.info_log, "Dropping column family with id %u FAILED -- %s\n", + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Dropping column family with id %u FAILED -- %s\n", cfd->GetID(), s.ToString().c_str()); } return s; } -bool DBImpl::KeyMayExist(const ReadOptions& options, +bool DBImpl::KeyMayExist(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value, bool* value_found) { if (value_found != nullptr) { // falsify later if key-may-exist but can't fetch value *value_found = true; } - ReadOptions roptions = options; + ReadOptions roptions = read_options; roptions.read_tier = kBlockCacheTier; // read from block cache only auto s = GetImpl(roptions, column_family, key, value, value_found); @@ -3653,30 +2779,31 @@ bool DBImpl::KeyMayExist(const ReadOptions& options, return s.ok() || s.IsIncomplete(); } -Iterator* DBImpl::NewIterator(const ReadOptions& options, +Iterator* DBImpl::NewIterator(const ReadOptions& read_options, ColumnFamilyHandle* column_family) { auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); - if (options.tailing) { + if (read_options.tailing) { #ifdef ROCKSDB_LITE // not supported in lite version return nullptr; #else - // TODO(ljin): remove tailing iterator - auto iter = new ForwardIterator(this, options, cfd); - return NewDBIterator(env_, *cfd->options(), cfd->user_comparator(), iter, - kMaxSequenceNumber); -// return new 
TailingIterator(env_, this, options, cfd); + SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); + auto iter = new ForwardIterator(this, read_options, cfd, sv); + return NewDBIterator(env_, *cfd->ioptions(), cfd->user_comparator(), iter, + kMaxSequenceNumber, + sv->mutable_cf_options.max_sequential_skip_in_iterations, + read_options.iterate_upper_bound); #endif } else { SequenceNumber latest_snapshot = versions_->LastSequence(); - SuperVersion* sv = nullptr; - sv = cfd->GetReferencedSuperVersion(&mutex_); + SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); auto snapshot = - options.snapshot != nullptr - ? reinterpret_cast(options.snapshot)->number_ + read_options.snapshot != nullptr + ? reinterpret_cast( + read_options.snapshot)->number_ : latest_snapshot; // Try to generate a DB iterator tree in continuous memory area to be @@ -3722,86 +2849,81 @@ Iterator* DBImpl::NewIterator(const ReadOptions& options, // likely that any iterator pointer is close to the iterator it points to so // that they are likely to be in the same cache line and/or page. 
ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( - env_, *cfd->options(), cfd->user_comparator(), snapshot); + env_, *cfd->ioptions(), cfd->user_comparator(), + snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations, + read_options.iterate_upper_bound); + Iterator* internal_iter = - NewInternalIterator(options, cfd, sv, db_iter->GetArena()); + NewInternalIterator(read_options, cfd, sv, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); return db_iter; } + // To stop compiler from complaining + return nullptr; } Status DBImpl::NewIterators( - const ReadOptions& options, + const ReadOptions& read_options, const std::vector& column_families, std::vector* iterators) { iterators->clear(); iterators->reserve(column_families.size()); - SequenceNumber latest_snapshot = 0; - std::vector super_versions; - super_versions.reserve(column_families.size()); - if (!options.tailing) { - mutex_.Lock(); - latest_snapshot = versions_->LastSequence(); - for (auto cfh : column_families) { - auto cfd = reinterpret_cast(cfh)->cfd(); - super_versions.push_back(cfd->GetSuperVersion()->Ref()); - } - mutex_.Unlock(); - } - - if (options.tailing) { + if (read_options.tailing) { #ifdef ROCKSDB_LITE return Status::InvalidArgument( "Tailing interator not supported in RocksDB lite"); #else for (auto cfh : column_families) { auto cfd = reinterpret_cast(cfh)->cfd(); - auto iter = new ForwardIterator(this, options, cfd); + SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); + auto iter = new ForwardIterator(this, read_options, cfd, sv); iterators->push_back( - NewDBIterator(env_, *cfd->options(), cfd->user_comparator(), iter, - kMaxSequenceNumber)); + NewDBIterator(env_, *cfd->ioptions(), cfd->user_comparator(), iter, + kMaxSequenceNumber, + sv->mutable_cf_options.max_sequential_skip_in_iterations)); } #endif } else { + SequenceNumber latest_snapshot = versions_->LastSequence(); + for (size_t i = 0; i < column_families.size(); ++i) { - auto cfh = 
reinterpret_cast(column_families[i]); - auto cfd = cfh->cfd(); + auto* cfd = reinterpret_cast( + column_families[i])->cfd(); + SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_); auto snapshot = - options.snapshot != nullptr - ? reinterpret_cast(options.snapshot)->number_ + read_options.snapshot != nullptr + ? reinterpret_cast( + read_options.snapshot)->number_ : latest_snapshot; - auto iter = NewInternalIterator(options, cfd, super_versions[i]); - iter = NewDBIterator(env_, *cfd->options(), - cfd->user_comparator(), iter, snapshot); - iterators->push_back(iter); + ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( + env_, *cfd->ioptions(), cfd->user_comparator(), snapshot, + sv->mutable_cf_options.max_sequential_skip_in_iterations); + Iterator* internal_iter = NewInternalIterator( + read_options, cfd, sv, db_iter->GetArena()); + db_iter->SetIterUnderDBIter(internal_iter); + iterators->push_back(db_iter); } } return Status::OK(); } -bool DBImpl::IsSnapshotSupported() const { - for (auto cfd : *versions_->GetColumnFamilySet()) { - if (!cfd->mem()->IsSnapshotSupported()) { - return false; - } - } - return true; -} - const Snapshot* DBImpl::GetSnapshot() { - MutexLock l(&mutex_); + int64_t unix_time = 0; + env_->GetCurrentTime(&unix_time); // Ignore error + + InstrumentedMutexLock l(&mutex_); // returns null if the underlying memtable does not support snapshot. 
- if (!IsSnapshotSupported()) return nullptr; - return snapshots_.New(versions_->LastSequence()); + if (!is_snapshot_supported_) return nullptr; + return snapshots_.New(versions_->LastSequence(), unix_time); } void DBImpl::ReleaseSnapshot(const Snapshot* s) { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); snapshots_.Delete(reinterpret_cast(s)); } @@ -3814,122 +2936,52 @@ Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family, Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family, const Slice& key, const Slice& val) { auto cfh = reinterpret_cast(column_family); - if (!cfh->cfd()->options()->merge_operator) { + if (!cfh->cfd()->ioptions()->merge_operator) { return Status::NotSupported("Provide a merge_operator when opening DB"); } else { return DB::Merge(o, column_family, key, val); } } -Status DBImpl::Delete(const WriteOptions& options, +Status DBImpl::Delete(const WriteOptions& write_options, ColumnFamilyHandle* column_family, const Slice& key) { - return DB::Delete(options, column_family, key); -} - -// REQUIRES: mutex_ is held -Status DBImpl::BeginWrite(Writer* w, uint64_t expiration_time) { - // the following code block pushes the current writer "w" into the writer - // queue "writers_" and wait until one of the following conditions met: - // 1. the job of "w" has been done by some other writers. - // 2. "w" becomes the first writer in "writers_" - // 3. "w" timed-out. - mutex_.AssertHeld(); - writers_.push_back(w); - - bool timed_out = false; - while (!w->done && w != writers_.front()) { - if (expiration_time == 0) { - w->cv.Wait(); - } else if (w->cv.TimedWait(expiration_time)) { - if (w->in_batch_group) { - // then it means the front writer is currently doing the - // write on behalf of this "timed-out" writer. Then it - // should wait until the write completes. 
- expiration_time = 0; - } else { - timed_out = true; - break; - } - } - } - - if (timed_out) { -#ifndef NDEBUG - bool found = false; -#endif - for (auto iter = writers_.begin(); iter != writers_.end(); iter++) { - if (*iter == w) { - writers_.erase(iter); -#ifndef NDEBUG - found = true; -#endif - break; - } - } -#ifndef NDEBUG - assert(found); -#endif - // writers_.front() might still be in cond_wait without a time-out. - // As a result, we need to signal it to wake it up. Otherwise no - // one else will wake him up, and RocksDB will hang. - if (!writers_.empty()) { - writers_.front()->cv.Signal(); - } - return Status::TimedOut(); - } - return Status::OK(); -} - -// REQUIRES: mutex_ is held -void DBImpl::EndWrite(Writer* w, Writer* last_writer, Status status) { - // Pop out the current writer and all writers being pushed before the - // current writer from the writer queue. - mutex_.AssertHeld(); - while (!writers_.empty()) { - Writer* ready = writers_.front(); - writers_.pop_front(); - if (ready != w) { - ready->status = status; - ready->done = true; - ready->cv.Signal(); - } - if (ready == last_writer) break; - } - - // Notify new head of write queue - if (!writers_.empty()) { - writers_.front()->cv.Signal(); - } + return DB::Delete(write_options, column_family, key); } -Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { +Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) { if (my_batch == nullptr) { return Status::Corruption("Batch is nullptr!"); } - PERF_TIMER_AUTO(write_pre_and_post_process_time); - Writer w(&mutex_); + PERF_TIMER_GUARD(write_pre_and_post_process_time); + WriteThread::Writer w(&mutex_); w.batch = my_batch; - w.sync = options.sync; - w.disableWAL = options.disableWAL; + w.sync = write_options.sync; + w.disableWAL = write_options.disableWAL; w.in_batch_group = false; w.done = false; - w.timeout_hint_us = options.timeout_hint_us; + w.timeout_hint_us = write_options.timeout_hint_us; uint64_t 
expiration_time = 0; + bool has_timeout = false; if (w.timeout_hint_us == 0) { - w.timeout_hint_us = kNoTimeOut; + w.timeout_hint_us = WriteThread::kNoTimeOut; } else { expiration_time = env_->NowMicros() + w.timeout_hint_us; + has_timeout = true; } - if (!options.disableWAL) { + if (!write_options.disableWAL) { RecordTick(stats_, WRITE_WITH_WAL); - default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_WITH_WAL, 1); } WriteContext context; mutex_.Lock(); - Status status = BeginWrite(&w, expiration_time); + + if (!write_options.disableWAL) { + default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_WITH_WAL, 1); + } + + Status status = write_thread_.EnterWriteThread(&w, expiration_time); assert(status.ok() || status.IsTimedOut()); if (status.IsTimedOut()) { mutex_.Unlock(); @@ -3954,59 +3006,75 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { assert(!single_column_family_mode_ || versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1); - uint64_t flush_column_family_if_log_file = 0; - uint64_t max_total_wal_size = (options_.max_total_wal_size == 0) + uint64_t max_total_wal_size = (db_options_.max_total_wal_size == 0) ? 4 * max_total_in_memory_state_ - : options_.max_total_wal_size; + : db_options_.max_total_wal_size; if (UNLIKELY(!single_column_family_mode_) && alive_log_files_.begin()->getting_flushed == false && total_log_size_ > max_total_wal_size) { - flush_column_family_if_log_file = alive_log_files_.begin()->number; + uint64_t flush_column_family_if_log_file = alive_log_files_.begin()->number; alive_log_files_.begin()->getting_flushed = true; - Log(options_.info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "Flushing all column families with data in WAL number %" PRIu64 ". 
Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64, flush_column_family_if_log_file, total_log_size_, max_total_wal_size); - } - - if (LIKELY(single_column_family_mode_)) { - // fast path - status = MakeRoomForWrite(default_cf_handle_->cfd(), - &context, expiration_time); - } else { - // refcounting cfd in iteration - bool dead_cfd = false; + // no need to refcount because drop is happening in write thread, so can't + // happen while we're in the write thread for (auto cfd : *versions_->GetColumnFamilySet()) { - cfd->Ref(); - if (flush_column_family_if_log_file != 0 && - cfd->GetLogNumber() <= flush_column_family_if_log_file) { - // log size excedded limit and we need to do flush - // SetNewMemtableAndNewLogFie may temporarily unlock and wait + if (cfd->GetLogNumber() <= flush_column_family_if_log_file) { status = SetNewMemtableAndNewLogFile(cfd, &context); + if (!status.ok()) { + break; + } cfd->imm()->FlushRequested(); - MaybeScheduleFlushOrCompaction(); - } else { - // May temporarily unlock and wait. - status = MakeRoomForWrite(cfd, &context, expiration_time); - } - - if (cfd->Unref()) { - dead_cfd = true; - } - if (!status.ok()) { - break; + SchedulePendingFlush(cfd); + context.schedule_bg_work_ = true; } } - if (dead_cfd) { - versions_->GetColumnFamilySet()->FreeDeadColumnFamilies(); + } else if (UNLIKELY(write_buffer_.ShouldFlush())) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Flushing all column families. 
Write buffer is using %" PRIu64 + " bytes out of a total of %" PRIu64 ".", + write_buffer_.memory_usage(), write_buffer_.buffer_size()); + // no need to refcount because drop is happening in write thread, so can't + // happen while we're in the write thread + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->mem()->IsEmpty()) { + status = SetNewMemtableAndNewLogFile(cfd, &context); + if (!status.ok()) { + break; + } + cfd->imm()->FlushRequested(); + SchedulePendingFlush(cfd); + context.schedule_bg_work_ = true; + } } + MaybeScheduleFlushOrCompaction(); + } + + if (UNLIKELY(status.ok() && !bg_error_.ok())) { + status = bg_error_; + } + + if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) { + status = ScheduleFlushes(&context); + } + + if (UNLIKELY(status.ok()) && + (write_controller_.IsStopped() || write_controller_.GetDelay() > 0)) { + status = DelayWrite(expiration_time); + } + + if (UNLIKELY(status.ok() && has_timeout && + env_->NowMicros() > expiration_time)) { + status = Status::TimedOut(); } uint64_t last_sequence = versions_->LastSequence(); - Writer* last_writer = &w; + WriteThread::Writer* last_writer = &w; if (status.ok()) { autovector write_batch_group; - BuildBatchGroup(&last_writer, &write_batch_group); + write_thread_.BuildBatchGroup(&last_writer, &write_batch_group); // Add to log and apply to memtable. 
We can release the lock // during this phase since &w is currently responsible for logging @@ -4031,39 +3099,45 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { const uint64_t batch_size = WriteBatchInternal::ByteSize(updates); // Record statistics RecordTick(stats_, NUMBER_KEYS_WRITTEN, my_batch_count); - RecordTick(stats_, BYTES_WRITTEN, WriteBatchInternal::ByteSize(updates)); - if (options.disableWAL) { + RecordTick(stats_, BYTES_WRITTEN, batch_size); + if (write_options.disableWAL) { flush_on_destroy_ = true; } PERF_TIMER_STOP(write_pre_and_post_process_time); uint64_t log_size = 0; - if (!options.disableWAL) { - PERF_TIMER_START(write_wal_time); + if (!write_options.disableWAL) { + PERF_TIMER_GUARD(write_wal_time); Slice log_entry = WriteBatchInternal::Contents(updates); status = log_->AddRecord(log_entry); total_log_size_ += log_entry.size(); alive_log_files_.back().AddSize(log_entry.size()); log_empty_ = false; log_size = log_entry.size(); - RecordTick(stats_, WAL_FILE_SYNCED); RecordTick(stats_, WAL_FILE_BYTES, log_size); - if (status.ok() && options.sync) { - if (options_.use_fsync) { - StopWatch(env_, stats_, WAL_FILE_SYNC_MICROS); + if (status.ok() && write_options.sync) { + RecordTick(stats_, WAL_FILE_SYNCED); + StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS); + if (db_options_.use_fsync) { status = log_->file()->Fsync(); } else { - StopWatch(env_, stats_, WAL_FILE_SYNC_MICROS); status = log_->file()->Sync(); } + if (status.ok() && !log_dir_synced_) { + // We only sync WAL directory the first time WAL syncing is + // requested, so that in case users never turn on WAL sync, + // we can avoid the disk I/O in the write code path. 
+ status = directories_.GetWalDir()->Fsync(); + } + log_dir_synced_ = true; } - PERF_TIMER_STOP(write_wal_time); } if (status.ok()) { - PERF_TIMER_START(write_memtable_time); + PERF_TIMER_GUARD(write_memtable_time); status = WriteBatchInternal::InsertInto( - updates, column_family_memtables_.get(), false, 0, this, false); + updates, column_family_memtables_.get(), + write_options.ignore_missing_column_families, 0, this, false); // A non-OK status here indicates iteration failure (either in-memory // writebatch corruption (very bad), or the client specified invalid // column family). This will later on trigger bg_error_. @@ -4072,8 +3146,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // into the memtable would result in a state that some write ops might // have succeeded in memtable but Status reports error for all writes. - PERF_TIMER_STOP(write_memtable_time); - SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence); } PERF_TIMER_START(write_pre_and_post_process_time); @@ -4084,7 +3156,9 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // internal stats default_cf_internal_stats_->AddDBStats( InternalStats::BYTES_WRITTEN, batch_size); - if (!options.disableWAL) { + default_cf_internal_stats_->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN, + my_batch_count); + if (!write_options.disableWAL) { default_cf_internal_stats_->AddDBStats( InternalStats::WAL_FILE_SYNCED, 1); default_cf_internal_stats_->AddDBStats( @@ -4095,253 +3169,77 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { } } } - if (options_.paranoid_checks && !status.ok() && + if (db_options_.paranoid_checks && !status.ok() && !status.IsTimedOut() && bg_error_.ok()) { bg_error_ = status; // stop compaction & fail any further writes } - EndWrite(&w, last_writer, status); + write_thread_.ExitWriteThread(&w, last_writer, status); + + if (context.schedule_bg_work_) { + MaybeScheduleFlushOrCompaction(); + } mutex_.Unlock(); if 
(status.IsTimedOut()) { RecordTick(stats_, WRITE_TIMEDOUT); } - PERF_TIMER_STOP(write_pre_and_post_process_time); return status; } -// This function will be called only when the first writer succeeds. -// All writers in the to-be-built batch group will be processed. -// -// REQUIRES: Writer list must be non-empty -// REQUIRES: First writer must have a non-nullptr batch -void DBImpl::BuildBatchGroup(Writer** last_writer, - autovector* write_batch_group) { - assert(!writers_.empty()); - Writer* first = writers_.front(); - assert(first->batch != nullptr); - - size_t size = WriteBatchInternal::ByteSize(first->batch); - write_batch_group->push_back(first->batch); - - // Allow the group to grow up to a maximum size, but if the - // original write is small, limit the growth so we do not slow - // down the small write too much. - size_t max_size = 1 << 20; - if (size <= (128<<10)) { - max_size = size + (128<<10); - } - - *last_writer = first; - std::deque::iterator iter = writers_.begin(); - ++iter; // Advance past "first" - for (; iter != writers_.end(); ++iter) { - Writer* w = *iter; - if (w->sync && !first->sync) { - // Do not include a sync write into a batch handled by a non-sync write. - break; - } - - if (!w->disableWAL && first->disableWAL) { - // Do not include a write that needs WAL into a batch that has - // WAL disabled. - break; - } - - if (w->timeout_hint_us < first->timeout_hint_us) { - // Do not include those writes with shorter timeout. Otherwise, we might - // execute a write that should instead be aborted because of timeout. 
- break; +// REQUIRES: mutex_ is held +// REQUIRES: this thread is currently at the front of the writer queue +Status DBImpl::DelayWrite(uint64_t expiration_time) { + uint64_t time_delayed = 0; + bool delayed = false; + { + StopWatch sw(env_, stats_, WRITE_STALL, &time_delayed); + bool has_timeout = (expiration_time > 0); + auto delay = write_controller_.GetDelay(); + if (write_controller_.IsStopped() == false && delay > 0) { + mutex_.Unlock(); + delayed = true; + // hopefully we don't have to sleep more than 2 billion microseconds + env_->SleepForMicroseconds(static_cast(delay)); + mutex_.Lock(); } - if (w->batch != nullptr) { - size += WriteBatchInternal::ByteSize(w->batch); - if (size > max_size) { - // Do not make batch too big - break; + while (bg_error_.ok() && write_controller_.IsStopped()) { + delayed = true; + if (has_timeout) { + bg_cv_.TimedWait(expiration_time); + if (env_->NowMicros() > expiration_time) { + return Status::TimedOut(); + } + } else { + bg_cv_.Wait(); } - - write_batch_group->push_back(w->batch); } - w->in_batch_group = true; - *last_writer = w; } -} + if (delayed) { + default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_STALL_MICROS, + time_delayed); + RecordTick(stats_, STALL_MICROS, time_delayed); + } -// This function computes the amount of time in microseconds by which a write -// should be delayed based on the number of level-0 files according to the -// following formula: -// if n < bottom, return 0; -// if n >= top, return 1000; -// otherwise, let r = (n - bottom) / -// (top - bottom) -// and return r^2 * 1000. -// The goal of this formula is to gradually increase the rate at which writes -// are slowed. We also tried linear delay (r * 1000), but it seemed to do -// slightly worse. There is no other particular reason for choosing quadratic. 
-uint64_t DBImpl::SlowdownAmount(int n, double bottom, double top) { - uint64_t delay; - if (n >= top) { - delay = 1000; - } - else if (n < bottom) { - delay = 0; - } - else { - // If we are here, we know that: - // level0_start_slowdown <= n < level0_slowdown - // since the previous two conditions are false. - double how_much = - (double) (n - bottom) / - (top - bottom); - delay = std::max(how_much * how_much * 1000, 100.0); - } - assert(delay <= 1000); - return delay; + return bg_error_; } -// REQUIRES: mutex_ is held -// REQUIRES: this thread is currently at the front of the writer queue -Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, - WriteContext* context, - uint64_t expiration_time) { - mutex_.AssertHeld(); - assert(!writers_.empty()); - bool allow_delay = true; - bool allow_hard_rate_limit_delay = true; - bool allow_soft_rate_limit_delay = true; - uint64_t rate_limit_delay_millis = 0; - Status s; - double score; - // Once we schedule background work, we shouldn't schedule it again, since it - // might generate a tight feedback loop, constantly scheduling more background - // work, even if additional background work is not needed - bool schedule_background_work = true; - bool has_timeout = (expiration_time > 0); - - while (true) { - if (!bg_error_.ok()) { - // Yield previous error - s = bg_error_; - break; - } else if (has_timeout && env_->NowMicros() > expiration_time) { - s = Status::TimedOut(); - break; - } else if (allow_delay && cfd->NeedSlowdownForNumLevel0Files()) { - // We are getting close to hitting a hard limit on the number of - // L0 files. Rather than delaying a single write by several - // seconds when we hit the hard limit, start delaying each - // individual write by 0-1ms to reduce latency variance. Also, - // this delay hands over some CPU to the compaction thread in - // case it is sharing the same core as the writer. 
- uint64_t slowdown = - SlowdownAmount(cfd->current()->NumLevelFiles(0), - cfd->options()->level0_slowdown_writes_trigger, - cfd->options()->level0_stop_writes_trigger); - mutex_.Unlock(); - uint64_t delayed; - { - StopWatch sw(env_, stats_, STALL_L0_SLOWDOWN_COUNT, &delayed); - env_->SleepForMicroseconds(slowdown); - } - RecordTick(stats_, STALL_L0_SLOWDOWN_MICROS, delayed); - allow_delay = false; // Do not delay a single write more than once - mutex_.Lock(); - cfd->internal_stats()->AddCFStats( - InternalStats::LEVEL0_SLOWDOWN, delayed); - delayed_writes_++; - } else if (!cfd->mem()->ShouldFlush()) { - // There is room in current memtable - if (allow_delay) { - DelayLoggingAndReset(); - } - break; - } else if (cfd->NeedWaitForNumMemtables()) { - // We have filled up the current memtable, but the previous - // ones are still being flushed, so we wait. - DelayLoggingAndReset(); - Log(options_.info_log, "[%s] wait for memtable flush...\n", - cfd->GetName().c_str()); - if (schedule_background_work) { - MaybeScheduleFlushOrCompaction(); - schedule_background_work = false; - } - uint64_t stall; - { - StopWatch sw(env_, stats_, STALL_MEMTABLE_COMPACTION_COUNT, &stall); - if (!has_timeout) { - bg_cv_.Wait(); - } else { - bg_cv_.TimedWait(expiration_time); - } - } - RecordTick(stats_, STALL_MEMTABLE_COMPACTION_MICROS, stall); - cfd->internal_stats()->AddCFStats( - InternalStats::MEMTABLE_COMPACTION, stall); - } else if (cfd->NeedWaitForNumLevel0Files()) { - DelayLoggingAndReset(); - Log(options_.info_log, "[%s] wait for fewer level0 files...\n", - cfd->GetName().c_str()); - uint64_t stall; - { - StopWatch sw(env_, stats_, STALL_L0_NUM_FILES_COUNT, &stall); - if (!has_timeout) { - bg_cv_.Wait(); - } else { - bg_cv_.TimedWait(expiration_time); - } - } - RecordTick(stats_, STALL_L0_NUM_FILES_MICROS, stall); - cfd->internal_stats()->AddCFStats( - InternalStats::LEVEL0_NUM_FILES, stall); - } else if (allow_hard_rate_limit_delay && cfd->ExceedsHardRateLimit()) { - // Delay a 
write when the compaction score for any level is too large. - const int max_level = cfd->current()->MaxCompactionScoreLevel(); - score = cfd->current()->MaxCompactionScore(); - mutex_.Unlock(); - uint64_t delayed; - { - StopWatch sw(env_, stats_, HARD_RATE_LIMIT_DELAY_COUNT, &delayed); - env_->SleepForMicroseconds(1000); - } - // Make sure the following value doesn't round to zero. - uint64_t rate_limit = std::max((delayed / 1000), (uint64_t) 1); - rate_limit_delay_millis += rate_limit; - RecordTick(stats_, RATE_LIMIT_DELAY_MILLIS, rate_limit); - if (cfd->options()->rate_limit_delay_max_milliseconds > 0 && - rate_limit_delay_millis >= - (unsigned)cfd->options()->rate_limit_delay_max_milliseconds) { - allow_hard_rate_limit_delay = false; - } - mutex_.Lock(); - cfd->internal_stats()->RecordLevelNSlowdown(max_level, delayed, false); - } else if (allow_soft_rate_limit_delay && cfd->ExceedsSoftRateLimit()) { - const int max_level = cfd->current()->MaxCompactionScoreLevel(); - score = cfd->current()->MaxCompactionScore(); - // Delay a write when the compaction score for any level is too large. 
- // TODO: add statistics - uint64_t slowdown = SlowdownAmount(score, cfd->options()->soft_rate_limit, - cfd->options()->hard_rate_limit); - uint64_t elapsed = 0; - mutex_.Unlock(); - { - StopWatch sw(env_, stats_, SOFT_RATE_LIMIT_DELAY_COUNT, &elapsed); - env_->SleepForMicroseconds(slowdown); - rate_limit_delay_millis += slowdown; - } - allow_soft_rate_limit_delay = false; - mutex_.Lock(); - cfd->internal_stats()->RecordLevelNSlowdown(max_level, elapsed, true); - } else { - s = SetNewMemtableAndNewLogFile(cfd, context); - if (!s.ok()) { - break; - } - MaybeScheduleFlushOrCompaction(); +Status DBImpl::ScheduleFlushes(WriteContext* context) { + ColumnFamilyData* cfd; + while ((cfd = flush_scheduler_.GetNextColumnFamily()) != nullptr) { + auto status = SetNewMemtableAndNewLogFile(cfd, context); + SchedulePendingFlush(cfd); + context->schedule_bg_work_ = true; + if (cfd->Unref()) { + delete cfd; + } + if (!status.ok()) { + return status; } } - return s; + return Status::OK(); } // REQUIRES: mutex_ is held @@ -4355,39 +3253,41 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, // Attempt to switch to a new memtable and trigger flush of old. // Do this without holding the dbmutex lock. - assert(versions_->PrevLogNumber() == 0); + assert(versions_->prev_log_number() == 0); bool creating_new_log = !log_empty_; uint64_t new_log_number = creating_new_log ? 
versions_->NewFileNumber() : logfile_number_; SuperVersion* new_superversion = nullptr; + const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions(); mutex_.Unlock(); Status s; { - DelayLoggingAndReset(); if (creating_new_log) { - s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number), - &lfile, - env_->OptimizeForLogWrite(storage_options_)); + s = env_->NewWritableFile( + LogFileName(db_options_.wal_dir, new_log_number), &lfile, + env_->OptimizeForLogWrite(env_options_)); if (s.ok()) { // Our final size should be less than write_buffer_size // (compression, etc) but err on the side of caution. - lfile->SetPreallocationBlockSize(1.1 * - cfd->options()->write_buffer_size); + lfile->SetPreallocationBlockSize( + 1.1 * mutable_cf_options.write_buffer_size); new_log = new log::Writer(std::move(lfile)); + log_dir_synced_ = false; } } if (s.ok()) { - new_mem = new MemTable(cfd->internal_comparator(), *cfd->options()); + new_mem = cfd->ConstructNewMemtable(mutable_cf_options); new_superversion = new SuperVersion(); } } + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "[%s] New memtable created with log file: #%" PRIu64 "\n", + cfd->GetName().c_str(), new_log_number); mutex_.Lock(); if (!s.ok()) { // how do we fail if we're not creating new log? assert(creating_new_log); - // Avoid chewing through file number space in a tight loop. - versions_->ReuseLogFileNumber(new_log_number); assert(!new_mem); assert(!new_log); return s; @@ -4399,14 +3299,14 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, log_.reset(new_log); log_empty_ = true; alive_log_files_.push_back(LogFileNumberSize(logfile_number_)); - for (auto cfd : *versions_->GetColumnFamilySet()) { + for (auto loop_cfd : *versions_->GetColumnFamilySet()) { // all this is just optimization to delete logs that // are no longer needed -- if CF is empty, that means it // doesn't need that particular log to stay alive, so we just // advance the log number. 
no need to persist this in the manifest - if (cfd->mem()->GetFirstSequenceNumber() == 0 && - cfd->imm()->size() == 0) { - cfd->SetLogNumber(logfile_number_); + if (loop_cfd->mem()->GetFirstSequenceNumber() == 0 && + loop_cfd->imm()->size() == 0) { + loop_cfd->SetLogNumber(logfile_number_); } } } @@ -4414,11 +3314,8 @@ Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, cfd->imm()->Add(cfd->mem()); new_mem->Ref(); cfd->SetMemtable(new_mem); - Log(options_.info_log, - "[%s] New memtable created with log file: #%" PRIu64 "\n", - cfd->GetName().c_str(), logfile_number_); context->superversions_to_free_.push_back( - cfd->InstallSuperVersion(new_superversion, &mutex_)); + InstallSuperVersion(cfd, new_superversion, mutable_cf_options, true)); return s; } @@ -4471,13 +3368,13 @@ bool DBImpl::GetProperty(ColumnFamilyHandle* column_family, bool ret_value = GetIntPropertyInternal(column_family, property_type, need_out_of_mutex, &int_value); if (ret_value) { - *value = std::to_string(int_value); + *value = ToString(int_value); } return ret_value; } else { auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); return cfd->internal_stats()->GetStringProperty(property_type, property, value); } @@ -4503,7 +3400,7 @@ bool DBImpl::GetIntPropertyInternal(ColumnFamilyHandle* column_family, auto cfd = cfh->cfd(); if (!need_out_of_mutex) { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); return cfd->internal_stats()->GetIntProperty(property_type, value, this); } else { SuperVersion* sv = GetAndRefSuperVersion(cfd); @@ -4519,26 +3416,18 @@ bool DBImpl::GetIntPropertyInternal(ColumnFamilyHandle* column_family, SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) { // TODO(ljin): consider using GetReferencedSuperVersion() directly - if (LIKELY(options_.allow_thread_local)) { - return cfd->GetThreadLocalSuperVersion(&mutex_); - } else { - MutexLock l(&mutex_); - return 
cfd->GetSuperVersion()->Ref(); - } + return cfd->GetThreadLocalSuperVersion(&mutex_); } void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd, SuperVersion* sv) { - bool unref_sv = true; - if (LIKELY(options_.allow_thread_local)) { - unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv); - } + bool unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv); if (unref_sv) { // Release SuperVersion if (sv->Unref()) { { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); sv->Cleanup(); } delete sv; @@ -4555,7 +3444,7 @@ void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); v = cfd->current(); v->Ref(); } @@ -4570,16 +3459,25 @@ void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family, } { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); v->Unref(); } } -inline void DBImpl::DelayLoggingAndReset() { - if (delayed_writes_ > 0) { - Log(options_.info_log, "delayed %d write...\n", delayed_writes_ ); - delayed_writes_ = 0; - } +std::list::iterator +DBImpl::CaptureCurrentFileNumberInPendingOutputs() { + // We need to remember the iterator of our insert, because after the + // background job is done, we need to remove that element from + // pending_outputs_. + pending_outputs_.push_back(versions_->current_next_file_number()); + auto pending_outputs_inserted_elem = pending_outputs_.end(); + --pending_outputs_inserted_elem; + return pending_outputs_inserted_elem; +} + +void DBImpl::ReleaseFileNumberFromPendingOutputs( + std::list::iterator v) { + pending_outputs_.erase(v); } #ifndef ROCKSDB_LITE @@ -4591,23 +3489,7 @@ Status DBImpl::GetUpdatesSince( if (seq > versions_->LastSequence()) { return Status::NotFound("Requested sequence not yet written in the db"); } - // Get all sorted Wal Files. - // Do binary search and open files and find the seq number. 
- - std::unique_ptr wal_files(new VectorLogPtr); - Status s = GetSortedWalFiles(*wal_files); - if (!s.ok()) { - return s; - } - - s = RetainProbableWalFiles(*wal_files, seq); - if (!s.ok()) { - return s; - } - iter->reset(new TransactionLogIteratorImpl(options_.wal_dir, &options_, - read_options, storage_options_, - seq, std::move(wal_files), this)); - return (*iter)->status(); + return wal_manager_.GetUpdatesSince(seq, iter, read_options, versions_.get()); } Status DBImpl::DeleteFile(std::string name) { @@ -4616,7 +3498,8 @@ Status DBImpl::DeleteFile(std::string name) { WalFileType log_type; if (!ParseFileName(name, &number, &type, &log_type) || (type != kTableFile && type != kLogFile)) { - Log(options_.info_log, "DeleteFile %s failed.\n", name.c_str()); + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "DeleteFile %s failed.\n", name.c_str()); return Status::InvalidArgument("Invalid file name"); } @@ -4624,13 +3507,15 @@ Status DBImpl::DeleteFile(std::string name) { if (type == kLogFile) { // Only allow deleting archived log files if (log_type != kArchivedLogFile) { - Log(options_.info_log, "DeleteFile %s failed - not archived log.\n", + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "DeleteFile %s failed - not archived log.\n", name.c_str()); return Status::NotSupported("Delete only supported for archived logs"); } - status = env_->DeleteFile(options_.wal_dir + "/" + name.c_str()); + status = env_->DeleteFile(db_options_.wal_dir + "/" + name.c_str()); if (!status.ok()) { - Log(options_.info_log, "DeleteFile %s failed -- %s.\n", + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "DeleteFile %s failed -- %s.\n", name.c_str(), status.ToString().c_str()); } return status; @@ -4640,59 +3525,81 @@ Status DBImpl::DeleteFile(std::string name) { FileMetaData* metadata; ColumnFamilyData* cfd; VersionEdit edit; - DeletionState deletion_state(true); + JobContext job_context(true); { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); status = 
versions_->GetMetadataForFile(number, &level, &metadata, &cfd); if (!status.ok()) { - Log(options_.info_log, "DeleteFile %s failed. File not found\n", - name.c_str()); + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "DeleteFile %s failed. File not found\n", name.c_str()); + job_context.Clean(); return Status::InvalidArgument("File not found"); } - assert((level > 0) && (level < cfd->NumberLevels())); + assert(level < cfd->NumberLevels()); // If the file is being compacted no need to delete. if (metadata->being_compacted) { - Log(options_.info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "DeleteFile %s Skipped. File about to be compacted\n", name.c_str()); + job_context.Clean(); return Status::OK(); } // Only the files in the last level can be deleted externally. // This is to make sure that any deletion tombstones are not // lost. Check that the level passed is the last level. + auto* vstoreage = cfd->current()->storage_info(); for (int i = level + 1; i < cfd->NumberLevels(); i++) { - if (cfd->current()->NumLevelFiles(i) != 0) { - Log(options_.info_log, + if (vstoreage->NumLevelFiles(i) != 0) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "DeleteFile %s FAILED. 
File not in last level\n", name.c_str()); + job_context.Clean(); return Status::InvalidArgument("File not in last level"); } } + // if level == 0, it has to be the oldest file + if (level == 0 && + vstoreage->LevelFiles(0).back()->fd.GetNumber() != number) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "DeleteFile %s failed ---" + " target file in level 0 must be the oldest.", name.c_str()); + job_context.Clean(); + return Status::InvalidArgument("File in level 0, but not oldest"); + } + edit.SetColumnFamily(cfd->GetID()); edit.DeleteFile(level, number); - status = versions_->LogAndApply(cfd, &edit, &mutex_, db_directory_.get()); + status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + &edit, &mutex_, directories_.GetDbDir()); if (status.ok()) { - InstallSuperVersion(cfd, deletion_state); + InstallSuperVersionBackground(cfd, &job_context, + *cfd->GetLatestMutableCFOptions()); } - FindObsoleteFiles(deletion_state, false); - } // lock released here - LogFlush(options_.info_log); + FindObsoleteFiles(&job_context, false); + } // lock released here + LogFlush(db_options_.info_log); // remove files outside the db-lock - if (deletion_state.HaveSomethingToDelete()) { - PurgeObsoleteFiles(deletion_state); - } - { - MutexLock l(&mutex_); - // schedule flush if file deletion means we freed the space for flushes to - // continue - MaybeScheduleFlushOrCompaction(); + if (job_context.HaveSomethingToDelete()) { + PurgeObsoleteFiles(job_context); } + job_context.Clean(); return status; } void DBImpl::GetLiveFilesMetaData(std::vector* metadata) { - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); versions_->GetLiveFilesMetaData(metadata); } + +void DBImpl::GetColumnFamilyMetaData( + ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* cf_meta) { + assert(column_family); + auto* cfd = reinterpret_cast(column_family)->cfd(); + auto* sv = GetAndRefSuperVersion(cfd); + sv->current->GetColumnFamilyMetaData(cf_meta); + 
ReturnAndCleanupSuperVersion(cfd, sv); +} + #endif // ROCKSDB_LITE Status DBImpl::CheckConsistency() { @@ -4702,7 +3609,8 @@ Status DBImpl::CheckConsistency() { std::string corruption_messages; for (const auto& md : metadata) { - std::string file_path = md.db_path + "/" + md.name; + // md.name has a leading "/". + std::string file_path = md.db_path + md.name; uint64_t fsize = 0; Status s = env_->GetFileSize(file_path, &fsize); @@ -4712,8 +3620,8 @@ Status DBImpl::CheckConsistency() { } else if (fsize != md.size) { corruption_messages += "Sst file size mismatch: " + file_path + ". Size recorded in manifest " + - std::to_string(md.size) + ", actual size " + - std::to_string(fsize) + "\n"; + ToString(md.size) + ", actual size " + + ToString(fsize) + "\n"; } } if (corruption_messages.size() == 0) { @@ -4738,7 +3646,7 @@ Status DBImpl::GetDbIdentity(std::string& identity) { } char buffer[file_size]; Slice id; - s = idfile->Read(file_size, &id, buffer); + s = idfile->Read(static_cast(file_size), &id, buffer); if (!s.ok()) { return s; } @@ -4777,7 +3685,7 @@ Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family, } // Default implementation -- returns not supported status -Status DB::CreateColumnFamily(const ColumnFamilyOptions& options, +Status DB::CreateColumnFamily(const ColumnFamilyOptions& cf_options, const std::string& column_family_name, ColumnFamilyHandle** handle) { return Status::NotSupported(""); @@ -4795,11 +3703,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { column_families.push_back( ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); std::vector handles; - Status s = SanitizeDBOptionsByCFOptions(&db_options, column_families); - if (!s.ok()) { - return s; - } - s = DB::Open(db_options, dbname, column_families, &handles, dbptr); + Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr); if (s.ok()) { assert(handles.size() == 1); // i can delete the handle since DBImpl is 
always holding a reference to @@ -4812,18 +3716,24 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { Status DB::Open(const DBOptions& db_options, const std::string& dbname, const std::vector& column_families, std::vector* handles, DB** dbptr) { + Status s = SanitizeOptionsByTable(db_options, column_families); + if (!s.ok()) { + return s; + } + if (db_options.db_paths.size() > 1) { for (auto& cfd : column_families) { - if (cfd.options.compaction_style != kCompactionStyleUniversal) { + if ((cfd.options.compaction_style != kCompactionStyleUniversal) && + (cfd.options.compaction_style != kCompactionStyleLevel)) { return Status::NotSupported( "More than one DB paths are only supported in " - "universal compaction style. "); + "universal and level compaction styles. "); } } if (db_options.db_paths.size() > 4) { return Status::NotSupported( - "More than four DB paths are not supported yet. "); + "More than four DB paths are not supported yet. "); } } @@ -4837,9 +3747,9 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, } DBImpl* impl = new DBImpl(db_options, dbname); - Status s = impl->env_->CreateDirIfMissing(impl->options_.wal_dir); + s = impl->env_->CreateDirIfMissing(impl->db_options_.wal_dir); if (s.ok()) { - for (auto db_path : impl->options_.db_paths) { + for (auto db_path : impl->db_options_.db_paths) { s = impl->env_->CreateDirIfMissing(db_path.path); if (!s.ok()) { break; @@ -4864,9 +3774,9 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, uint64_t new_log_number = impl->versions_->NewFileNumber(); unique_ptr lfile; EnvOptions soptions(db_options); - s = impl->options_.env->NewWritableFile( - LogFileName(impl->options_.wal_dir, new_log_number), &lfile, - impl->options_.env->OptimizeForLogWrite(soptions)); + s = impl->db_options_.env->NewWritableFile( + LogFileName(impl->db_options_.wal_dir, new_log_number), &lfile, + impl->db_options_.env->OptimizeForLogWrite(soptions)); if 
(s.ok()) { lfile->SetPreallocationBlockSize(1.1 * max_write_buffer_size); impl->logfile_number_ = new_log_number; @@ -4879,6 +3789,7 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, if (cfd != nullptr) { handles->push_back( new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_)); + impl->NewThreadStatusCfInfo(cfd); } else { if (db_options.create_missing_column_families) { // missing column family, create it @@ -4900,23 +3811,23 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, } if (s.ok()) { for (auto cfd : *impl->versions_->GetColumnFamilySet()) { - delete cfd->InstallSuperVersion(new SuperVersion(), &impl->mutex_); + delete impl->InstallSuperVersion( + cfd, nullptr, *cfd->GetLatestMutableCFOptions()); } impl->alive_log_files_.push_back( DBImpl::LogFileNumberSize(impl->logfile_number_)); impl->DeleteObsoleteFiles(); - impl->MaybeScheduleFlushOrCompaction(); - s = impl->db_directory_->Fsync(); + s = impl->directories_.GetDbDir()->Fsync(); } } if (s.ok()) { for (auto cfd : *impl->versions_->GetColumnFamilySet()) { - if (cfd->options()->compaction_style == kCompactionStyleUniversal || - cfd->options()->compaction_style == kCompactionStyleFIFO) { - Version* current = cfd->current(); - for (int i = 1; i < current->NumberLevels(); ++i) { - int num_files = current->NumLevelFiles(i); + if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) { + auto* vstorage = cfd->current()->storage_info(); + for (int i = 1; i < vstorage->num_levels(); ++i) { + int num_files = vstorage->NumLevelFiles(i); if (num_files > 0) { s = Status::InvalidArgument( "Not all files are at level 0. 
Cannot " @@ -4925,7 +3836,10 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, } } } - if (cfd->options()->merge_operator != nullptr && + if (!cfd->mem()->IsSnapshotSupported()) { + impl->is_snapshot_supported_ = false; + } + if (cfd->ioptions()->merge_operator != nullptr && !cfd->mem()->IsMergeOperatorSupported()) { s = Status::InvalidArgument( "The memtable of column family %s does not support merge operator " @@ -4941,9 +3855,11 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, if (s.ok()) { impl->opened_successfully_ = true; + Log(InfoLogLevel::INFO_LEVEL, impl->db_options_.info_log, "DB pointer %p", + impl); *dbptr = impl; } else { - for (auto h : *handles) { + for (auto* h : *handles) { delete h; } handles->clear(); @@ -4966,23 +3882,10 @@ Status DestroyDB(const std::string& dbname, const Options& options) { const Options& soptions(SanitizeOptions(dbname, &comparator, options)); Env* env = soptions.env; std::vector filenames; - std::vector archiveFiles; - std::string archivedir = ArchivalDirectory(dbname); // Ignore error in case directory does not exist env->GetChildren(dbname, &filenames); - if (dbname != soptions.wal_dir) { - std::vector logfilenames; - env->GetChildren(soptions.wal_dir, &logfilenames); - filenames.insert(filenames.end(), logfilenames.begin(), logfilenames.end()); - archivedir = ArchivalDirectory(soptions.wal_dir); - } - - if (filenames.empty()) { - return Status::OK(); - } - FileLock* lock; const std::string lockname = LockFileName(dbname); Status result = env->LockFile(lockname, &lock); @@ -4996,8 +3899,6 @@ Status DestroyDB(const std::string& dbname, const Options& options) { Status del; if (type == kMetaDatabase) { del = DestroyDB(dbname + "/" + filenames[i], options); - } else if (type == kLogFile) { - del = env->DeleteFile(soptions.wal_dir + "/" + filenames[i]); } else { del = env->DeleteFile(dbname + "/" + filenames[i]); } @@ -5009,8 +3910,6 @@ Status DestroyDB(const std::string& 
dbname, const Options& options) { for (auto& db_path : options.db_paths) { env->GetChildren(db_path.path, &filenames); - uint64_t number; - FileType type; for (size_t i = 0; i < filenames.size(); i++) { if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) { // Lock file will be deleted at end @@ -5022,6 +3921,24 @@ Status DestroyDB(const std::string& dbname, const Options& options) { } } + std::vector walDirFiles; + std::string archivedir = ArchivalDirectory(dbname); + if (dbname != soptions.wal_dir) { + env->GetChildren(soptions.wal_dir, &walDirFiles); + archivedir = ArchivalDirectory(soptions.wal_dir); + } + + // Delete log files in the WAL dir + for (const auto& file : walDirFiles) { + if (ParseFileName(file, &number, &type) && type == kLogFile) { + Status del = env->DeleteFile(soptions.wal_dir + "/" + file); + if (result.ok() && !del.ok()) { + result = del; + } + } + } + + std::vector archiveFiles; env->GetChildren(archivedir, &archiveFiles); // Delete archival files. 
for (size_t i = 0; i < archiveFiles.size(); ++i) { @@ -5044,13 +3961,52 @@ Status DestroyDB(const std::string& dbname, const Options& options) { return result; } +#if ROCKSDB_USING_THREAD_STATUS + +void DBImpl::NewThreadStatusCfInfo( + ColumnFamilyData* cfd) const { + if (db_options_.enable_thread_tracking) { + ThreadStatusUtil::NewColumnFamilyInfo(this, cfd); + } +} + +void DBImpl::EraseThreadStatusCfInfo( + ColumnFamilyData* cfd) const { + if (db_options_.enable_thread_tracking) { + ThreadStatusUtil::EraseColumnFamilyInfo(cfd); + } +} + +void DBImpl::EraseThreadStatusDbInfo() const { + if (db_options_.enable_thread_tracking) { + ThreadStatusUtil::EraseDatabaseInfo(this); + } +} + +#else +void DBImpl::NewThreadStatusCfInfo( + ColumnFamilyData* cfd) const { +} + +void DBImpl::EraseThreadStatusCfInfo( + ColumnFamilyData* cfd) const { +} + +void DBImpl::EraseThreadStatusDbInfo() const { +} +#endif // ROCKSDB_USING_THREAD_STATUS + // // A global method that can dump out the build version -void DumpLeveldbBuildVersion(Logger * log) { +void DumpRocksDBBuildVersion(Logger * log) { #if !defined(IOS_CROSS_COMPILE) - // if we compile with Xcode, we don't run build_detect_vesion, so we don't generate util/build_version.cc - Log(log, "Git sha %s", rocksdb_build_git_sha); - Log(log, "Compile time %s %s", + // if we compile with Xcode, we don't run build_detect_vesion, so we don't + // generate util/build_version.cc + Log(InfoLogLevel::INFO_LEVEL, log, + "RocksDB version: %d.%d.%d\n", ROCKSDB_MAJOR, ROCKSDB_MINOR, + ROCKSDB_PATCH); + Log(InfoLogLevel::INFO_LEVEL, log, "Git sha %s", rocksdb_build_git_sha); + Log(InfoLogLevel::INFO_LEVEL, log, "Compile time %s %s", rocksdb_build_compile_time, rocksdb_build_compile_date); #endif } diff --git a/db/db_impl.h b/db/db_impl.h index 086ac9fd4..86402e817 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -12,7 +12,9 @@ #include #include #include +#include #include +#include #include #include @@ -21,6 +23,8 @@ #include "db/snapshot.h" 
#include "db/column_family.h" #include "db/version_edit.h" +#include "db/wal_manager.h" +#include "db/writebuffer.h" #include "memtable_list.h" #include "port/port.h" #include "rocksdb/db.h" @@ -30,7 +34,13 @@ #include "util/autovector.h" #include "util/stop_watch.h" #include "util/thread_local.h" +#include "util/scoped_arena_iterator.h" +#include "util/hash.h" +#include "util/instrumented_mutex.h" #include "db/internal_stats.h" +#include "db/write_controller.h" +#include "db/flush_scheduler.h" +#include "db/write_thread.h" namespace rocksdb { @@ -41,6 +51,7 @@ class VersionEdit; class VersionSet; class CompactionFilterV2; class Arena; +struct JobContext; class DBImpl : public DB { public: @@ -108,6 +119,17 @@ class DBImpl : public DB { bool reduce_level = false, int target_level = -1, uint32_t target_path_id = 0); + using DB::CompactFiles; + virtual Status CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, + const int output_level, const int output_path_id = -1); + + using DB::SetOptions; + Status SetOptions(ColumnFamilyHandle* column_family, + const std::unordered_map& options_map); + using DB::NumberLevels; virtual int NumberLevels(ColumnFamilyHandle* column_family); using DB::MaxMemCompactionLevel; @@ -141,6 +163,15 @@ class DBImpl : public DB { virtual Status DeleteFile(std::string name); virtual void GetLiveFilesMetaData(std::vector* metadata); + + // Obtains the meta data of the specified column family of the DB. + // Status::NotFound() will be returned if the current DB does not have + // any column family match the specified name. + // TODO(yhchiang): output parameter is placed in the end in this codebase. 
+ virtual void GetColumnFamilyMetaData( + ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* metadata) override; + #endif // ROCKSDB_LITE // checks if all live files exist on file system and that their file sizes @@ -173,8 +204,8 @@ class DBImpl : public DB { // Return an internal iterator over the current state of the database. // The keys of this iterator are internal keys (see format.h). // The returned iterator should be deleted when no longer needed. - Iterator* TEST_NewInternalIterator(ColumnFamilyHandle* column_family = - nullptr); + Iterator* TEST_NewInternalIterator( + Arena* arena, ColumnFamilyHandle* column_family = nullptr); // Return the maximum overlapping data (in bytes) at next level for any // file at a level >= 1. @@ -184,132 +215,79 @@ class DBImpl : public DB { // Return the current manifest file no. uint64_t TEST_Current_Manifest_FileNo(); - // Trigger's a background call for testing. - void TEST_PurgeObsoleteteWAL(); - // get total level0 file size. Only for testing. uint64_t TEST_GetLevel0TotalSize(); - void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL) - { - default_interval_to_delete_obsolete_WAL_ = default_interval_to_delete_obsolete_WAL; - } - void TEST_GetFilesMetaData(ColumnFamilyHandle* column_family, std::vector>* metadata); - Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number, - SequenceNumber* sequence); + void TEST_LockMutex(); - Status TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence); -#endif // NDEBUG + void TEST_UnlockMutex(); - // Structure to store information for candidate files to delete. 
- struct CandidateFileInfo { - std::string file_name; - uint32_t path_id; - CandidateFileInfo(std::string name, uint32_t path) - : file_name(name), path_id(path) {} - bool operator==(const CandidateFileInfo& other) const { - return file_name == other.file_name && path_id == other.path_id; - } - }; + // REQUIRES: mutex locked + void* TEST_BeginWrite(); - // needed for CleanupIteratorState - struct DeletionState { - inline bool HaveSomethingToDelete() const { - return candidate_files.size() || - sst_delete_files.size() || - log_delete_files.size(); - } + // REQUIRES: mutex locked + // pass the pointer that you got from TEST_BeginWrite() + void TEST_EndWrite(void* w); - // a list of all files that we'll consider deleting - // (every once in a while this is filled up with all files - // in the DB directory) - std::vector candidate_files; - - // the list of all live sst files that cannot be deleted - std::vector sst_live; - - // a list of sst files that we need to delete - std::vector sst_delete_files; - - // a list of log files that we need to delete - std::vector log_delete_files; - - // a list of memtables to be free - autovector memtables_to_free; - - autovector superversions_to_free; - - SuperVersion* new_superversion; // if nullptr no new superversion - - // the current manifest_file_number, log_number and prev_log_number - // that corresponds to the set of files in 'live'. - uint64_t manifest_file_number, pending_manifest_file_number, log_number, - prev_log_number; - - explicit DeletionState(bool create_superversion = false) { - manifest_file_number = 0; - pending_manifest_file_number = 0; - log_number = 0; - prev_log_number = 0; - new_superversion = create_superversion ? 
new SuperVersion() : nullptr; - } + uint64_t TEST_max_total_in_memory_state() { + return max_total_in_memory_state_; + } - ~DeletionState() { - // free pending memtables - for (auto m : memtables_to_free) { - delete m; - } - // free superversions - for (auto s : superversions_to_free) { - delete s; - } - // if new_superversion was not used, it will be non-nullptr and needs - // to be freed here - delete new_superversion; - } - }; +#endif // ROCKSDB_LITE // Returns the list of live files in 'live' and the list // of all files in the filesystem in 'candidate_files'. // If force == false and the last call was less than - // options_.delete_obsolete_files_period_micros microseconds ago, - // it will not fill up the deletion_state - void FindObsoleteFiles(DeletionState& deletion_state, - bool force, + // db_options_.delete_obsolete_files_period_micros microseconds ago, + // it will not fill up the job_context + void FindObsoleteFiles(JobContext* job_context, bool force, bool no_full_scan = false); // Diffs the files listed in filenames and those that do not // belong to live files are posibly removed. Also, removes all the // files in sst_delete_files and log_delete_files. // It is not necessary to hold the mutex when invoking this method. 
- void PurgeObsoleteFiles(DeletionState& deletion_state); + void PurgeObsoleteFiles(const JobContext& background_contet); ColumnFamilyHandle* DefaultColumnFamily() const; + const SnapshotList& snapshots() const { return snapshots_; } + protected: Env* const env_; const std::string dbname_; unique_ptr versions_; - const DBOptions options_; + const DBOptions db_options_; Statistics* stats_; Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd, - SuperVersion* super_version, - Arena* arena = nullptr); + SuperVersion* super_version, Arena* arena); + + void NotifyOnFlushCompleted(ColumnFamilyData* cfd, uint64_t file_number, + const MutableCFOptions& mutable_cf_options); + + void NotifyOnCompactionCompleted(ColumnFamilyData* cfd, + Compaction *c, const Status &st); + + void NewThreadStatusCfInfo(ColumnFamilyData* cfd) const; + + void EraseThreadStatusCfInfo(ColumnFamilyData* cfd) const; + + void EraseThreadStatusDbInfo() const; private: friend class DB; friend class InternalStats; #ifndef ROCKSDB_LITE - friend class TailingIterator; friend class ForwardIterator; #endif friend struct SuperVersion; + friend class CompactedDBImpl; struct CompactionState; - struct Writer; + struct WriteContext; Status NewDB(); @@ -327,14 +305,34 @@ class DBImpl : public DB { // Delete any unneeded files and stale in-memory entries. void DeleteObsoleteFiles(); + // Background process needs to call + // auto x = CaptureCurrentFileNumberInPendingOutputs() + // + // ReleaseFileNumberFromPendingOutputs(x) + // This will protect any temporary files created while is + // executing from being deleted. + // ----------- + // This function will capture current file number and append it to + // pending_outputs_. This will prevent any background process to delete any + // file created after this point. + std::list::iterator CaptureCurrentFileNumberInPendingOutputs(); + // This function should be called with the result of + // CaptureCurrentFileNumberInPendingOutputs(). 
It then marks that any file + created between the calls CaptureCurrentFileNumberInPendingOutputs() and + ReleaseFileNumberFromPendingOutputs() can now be deleted (if it's not live + and blocked by any other pending_outputs_ calls) + void ReleaseFileNumberFromPendingOutputs(std::list::iterator v); + // Flush the in-memory write buffer to storage. Switches to a new // log-file/memtable and writes a new descriptor iff successful. - Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, bool* madeProgress, - DeletionState& deletion_state, + Status FlushMemTableToOutputFile(ColumnFamilyData* cfd, + const MutableCFOptions& mutable_cf_options, + bool* madeProgress, JobContext* job_context, LogBuffer* log_buffer); - Status RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, - bool read_only); + // REQUIRES: log_numbers are sorted in ascending order + Status RecoverLogFiles(const std::vector& log_numbers, + SequenceNumber* max_sequence, bool read_only); // The following two methods are used to flush a memtable to // storage. The first one is used at database RecoveryTime (when the
- // - // Writer* w: writer to be placed in the queue - // uint64_t expiration_time: maximum time to be in the queue - // See also: EndWrite - Status BeginWrite(Writer* w, uint64_t expiration_time); - - // After doing write job, we need to remove already used writers from - // writers_ queue and notify head of the queue about it. - // EndWrite is used for this. - // - // Writer* w: Writer, that was added by BeginWrite function - // Writer* last_writer: Since we can join a few Writers (as DBImpl::Write - // does) - // we should pass last_writer as a parameter to - // EndWrite - // (if you don't touch other writers, just pass w) - // Status status: Status of write operation - // See also: BeginWrite - void EndWrite(Writer* w, Writer* last_writer, Status status); - - Status MakeRoomForWrite(ColumnFamilyData* cfd, - WriteContext* context, - uint64_t expiration_time); + Status DelayWrite(uint64_t expiration_time); + + Status ScheduleFlushes(WriteContext* context); Status SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, WriteContext* context); - void BuildBatchGroup(Writer** last_writer, - autovector* write_batch_group); - // Force current memtable contents to be flushed. 
Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options); @@ -393,98 +357,55 @@ class DBImpl : public DB { void RecordFlushIOStats(); void RecordCompactionIOStats(); +#ifndef ROCKSDB_LITE + Status CompactFilesImpl( + const CompactionOptions& compact_options, ColumnFamilyData* cfd, + Version* version, const std::vector& input_file_names, + const int output_level, int output_path_id); +#endif // ROCKSDB_LITE + + ColumnFamilyData* GetColumnFamilyDataByName(const std::string& cf_name); + void MaybeScheduleFlushOrCompaction(); + void SchedulePendingFlush(ColumnFamilyData* cfd); + void SchedulePendingCompaction(ColumnFamilyData* cfd); static void BGWorkCompaction(void* db); static void BGWorkFlush(void* db); void BackgroundCallCompaction(); void BackgroundCallFlush(); - Status BackgroundCompaction(bool* madeProgress, DeletionState& deletion_state, + Status BackgroundCompaction(bool* madeProgress, JobContext* job_context, LogBuffer* log_buffer); - Status BackgroundFlush(bool* madeProgress, DeletionState& deletion_state, + Status BackgroundFlush(bool* madeProgress, JobContext* job_context, LogBuffer* log_buffer); - void CleanupCompaction(CompactionState* compact, Status status); - Status DoCompactionWork(CompactionState* compact, - DeletionState& deletion_state, - LogBuffer* log_buffer); // This function is called as part of compaction. It enables Flush process to // preempt compaction, since it's higher prioirty - // Returns: micros spent executing uint64_t CallFlushDuringCompaction(ColumnFamilyData* cfd, - DeletionState& deletion_state, + const MutableCFOptions& mutable_cf_options, + JobContext* job_context, LogBuffer* log_buffer); - // Call compaction filter if is_compaction_v2 is not true. 
Then iterate - // through input and compact the kv-pairs - Status ProcessKeyValueCompaction( - bool is_snapshot_supported, - SequenceNumber visible_at_tip, - SequenceNumber earliest_snapshot, - SequenceNumber latest_snapshot, - DeletionState& deletion_state, - bool bottommost_level, - int64_t& imm_micros, - Iterator* input, - CompactionState* compact, - bool is_compaction_v2, - LogBuffer* log_buffer); - - // Call compaction_filter_v2->Filter() on kv-pairs in compact - void CallCompactionFilterV2(CompactionState* compact, - CompactionFilterV2* compaction_filter_v2); - - Status OpenCompactionOutputFile(CompactionState* compact); - Status FinishCompactionOutputFile(CompactionState* compact, Iterator* input); - Status InstallCompactionResults(CompactionState* compact, - LogBuffer* log_buffer); - void AllocateCompactionOutputFileNumbers(CompactionState* compact); - void ReleaseCompactionUnusedFileNumbers(CompactionState* compact); - -#ifdef ROCKSDB_LITE - void PurgeObsoleteWALFiles() { - // this function is used for archiving WAL files. we don't need this in - // ROCKSDB_LITE - } -#else - void PurgeObsoleteWALFiles(); - - Status GetSortedWalsOfType(const std::string& path, - VectorLogPtr& log_files, - WalFileType type); - - // Requires: all_logs should be sorted with earliest log file first - // Retains all log files in all_logs which contain updates with seq no. - // Greater Than or Equal to the requested SequenceNumber. - Status RetainProbableWalFiles(VectorLogPtr& all_logs, - const SequenceNumber target); - - Status ReadFirstRecord(const WalFileType type, const uint64_t number, - SequenceNumber* sequence); - - Status ReadFirstLine(const std::string& fname, SequenceNumber* sequence); -#endif // ROCKSDB_LITE - void PrintStatistics(); // dump rocksdb.stats to LOG void MaybeDumpStats(); - // Return true if the current db supports snapshot. If the current - // DB does not support snapshot, then calling GetSnapshot() will always - // return nullptr. 
- // - // @see GetSnapshot() - virtual bool IsSnapshotSupported() const; - // Return the minimum empty level that could hold the total data in the // input level. Return the input level, if such level could not be found. - int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, int level); + int FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd, + const MutableCFOptions& mutable_cf_options, int level); // Move the files in the input level to the target level. // If target_level < 0, automatically calculate the minimum level that could // hold the data set. Status ReFitLevel(ColumnFamilyData* cfd, int level, int target_level = -1); + // helper functions for adding and removing from flush & compaction queues + void AddToCompactionQueue(ColumnFamilyData* cfd); + ColumnFamilyData* PopFirstFromCompactionQueue(); + void AddToFlushQueue(ColumnFamilyData* cfd); + ColumnFamilyData* PopFirstFromFlushQueue(); + // table_cache_ provides its own synchronization std::shared_ptr table_cache_; @@ -492,8 +413,8 @@ class DBImpl : public DB { FileLock* db_lock_; // State below is protected by mutex_ - port::Mutex mutex_; - port::AtomicPointer shutting_down_; + InstrumentedMutex mutex_; + std::atomic shutting_down_; // This condition variable is signaled on these conditions: // * whenever bg_compaction_scheduled_ goes down to 0 // * if bg_manual_only_ > 0, whenever a compaction finishes, even if it hasn't @@ -502,9 +423,10 @@ class DBImpl : public DB { // * whenever bg_flush_scheduled_ value decreases (i.e. 
whenever a flush is // done, even if it didn't make any progress) // * whenever there is an error in background flush or compaction - port::CondVar bg_cv_; + InstrumentedCondVar bg_cv_; uint64_t logfile_number_; unique_ptr log_; + bool log_dir_synced_; bool log_empty_; ColumnFamilyHandleImpl* default_cf_handle_; InternalStats* default_cf_internal_stats_; @@ -526,26 +448,85 @@ class DBImpl : public DB { // some code-paths bool single_column_family_mode_; - std::unique_ptr db_directory_; + bool is_snapshot_supported_; - // Queue of writers. - std::deque writers_; - WriteBatch tmp_batch_; + // Class to maintain directories for all database paths other than main one. + class Directories { + public: + Status SetDirectories(Env* env, const std::string& dbname, + const std::string& wal_dir, + const std::vector& data_paths); - SnapshotList snapshots_; + Directory* GetDataDir(size_t path_id); + + Directory* GetWalDir() { + if (wal_dir_) { + return wal_dir_.get(); + } + return db_dir_.get(); + } + + Directory* GetDbDir() { return db_dir_.get(); } + + private: + std::unique_ptr db_dir_; + std::vector> data_dirs_; + std::unique_ptr wal_dir_; - // cache for ReadFirstRecord() calls - std::unordered_map read_first_record_cache_; - port::Mutex read_first_record_cache_mutex_; + Status CreateAndNewDirectory(Env* env, const std::string& dirname, + std::unique_ptr* directory) const; + }; + + Directories directories_; + + WriteBuffer write_buffer_; - // Set of table files to protect from deletion because they are - // part of ongoing compactions. - // map from pending file number ID to their path IDs. - FileNumToPathIdMap pending_outputs_; + WriteThread write_thread_; + + WriteBatch tmp_batch_; - // At least one compaction or flush job is pending but not yet scheduled - // because of the max background thread limit. 
- bool bg_schedule_needed_; + WriteController write_controller_; + FlushScheduler flush_scheduler_; + + SnapshotList snapshots_; + + // For each background job, pending_outputs_ keeps the current file number at + // the time that background job started. + // FindObsoleteFiles()/PurgeObsoleteFiles() never deletes any file that has + // number bigger than any of the file number in pending_outputs_. Since file + // numbers grow monotonically, this also means that pending_outputs_ is always + // sorted. After a background job is done executing, its file number is + // deleted from pending_outputs_, which allows PurgeObsoleteFiles() to clean + // it up. + // State is protected with db mutex. + std::list pending_outputs_; + + // flush_queue_ and compaction_queue_ hold column families that we need to + // flush and compact, respectively. + // A column family is inserted into flush_queue_ when it satisfies condition + // cfd->imm()->IsFlushPending() + // A column family is inserted into compaction_queue_ when it satisfied + // condition cfd->NeedsCompaction() + // Column families in this list are all Ref()-erenced + // TODO(icanadi) Provide some kind of ReferencedColumnFamily class that will + // do RAII on ColumnFamilyData + // Column families are in this queue when they need to be flushed or + // compacted. Consumers of these queues are flush and compaction threads. When + // column family is put on this queue, we increase unscheduled_flushes_ and + // unscheduled_compactions_. When these variables are bigger than zero, that + // means we need to schedule background threads for compaction and thread. + // Once the background threads are scheduled, we decrease unscheduled_flushes_ + // and unscheduled_compactions_. That way we keep track of number of + // compaction and flush threads we need to schedule. 
This scheduling is done + // in MaybeScheduleFlushOrCompaction() + // invariant(column family present in flush_queue_ <==> + // ColumnFamilyData::pending_flush_ == true) + std::deque flush_queue_; + // invariant(column family present in compaction_queue_ <==> + // ColumnFamilyData::pending_compaction_ == true) + std::deque compaction_queue_; + int unscheduled_flushes_; + int unscheduled_compactions_; // count how many background compactions are running or have been scheduled int bg_compaction_scheduled_; @@ -584,30 +565,23 @@ class DBImpl : public DB { // without any synchronization int disable_delete_obsolete_files_; - // last time when DeleteObsoleteFiles was invoked - uint64_t delete_obsolete_files_last_run_; - - // last time when PurgeObsoleteWALFiles ran. - uint64_t purge_wal_files_last_run_; + // next time when we should run DeleteObsoleteFiles with full scan + uint64_t delete_obsolete_files_next_run_; // last time stats were dumped to LOG std::atomic last_stats_dump_time_microsec_; - // obsolete files will be deleted every this seconds if ttl deletion is - // enabled and archive size_limit is disabled. - uint64_t default_interval_to_delete_obsolete_WAL_; - bool flush_on_destroy_; // Used when disableWAL is true. static const int KEEP_LOG_FILE_NUM = 1000; - static const uint64_t kNoTimeOut = std::numeric_limits::max(); std::string db_absolute_path_; - // count of the number of contiguous delaying writes - int delayed_writes_; - // The options to access storage files - const EnvOptions storage_options_; + const EnvOptions env_options_; + +#ifndef ROCKSDB_LITE + WalManager wal_manager_; +#endif // ROCKSDB_LITE // A value of true temporarily disables scheduling of background work bool bg_work_gate_closed_; @@ -618,13 +592,16 @@ class DBImpl : public DB { // Indicate DB was opened successfully bool opened_successfully_; + // The list of registered event listeners. + std::list listeners_; + + // count how many events are currently being notified. 
+ int notifying_events_; + // No copying allowed DBImpl(const DBImpl&); void operator=(const DBImpl&); - // dump the delayed_writes_ to the log file and reset counter. - void DelayLoggingAndReset(); - // Return the earliest snapshot where seqno is visible. // Store the snapshot right before that, if any, in prev_snapshot inline SequenceNumber findEarliestVisibleSnapshot( @@ -633,10 +610,24 @@ class DBImpl : public DB { SequenceNumber* prev_snapshot); // Background threads call this function, which is just a wrapper around - // the cfd->InstallSuperVersion() function. Background threads carry - // deletion_state which can have new_superversion already allocated. - void InstallSuperVersion(ColumnFamilyData* cfd, - DeletionState& deletion_state); + // the InstallSuperVersion() function. Background threads carry + // job_context which can have new_superversion already + // allocated. + void InstallSuperVersionBackground( + ColumnFamilyData* cfd, JobContext* job_context, + const MutableCFOptions& mutable_cf_options); + + // All ColumnFamily state changes go through this function. Here we analyze + // the new state and we schedule background work if we detect that the new + // state needs flush or compaction. + // If dont_schedule_bg_work == true, then caller asks us to not schedule flush + // or compaction here, but it also promises to schedule needed background + // work. We use this to scheduling background compactions when we are in the + // write thread, which is very performance critical. Caller schedules + // background work as soon as it exits the write thread + SuperVersion* InstallSuperVersion(ColumnFamilyData* cfd, SuperVersion* new_sv, + const MutableCFOptions& mutable_cf_options, + bool dont_schedule_bg_work = false); // Find Super version and reference it. Based on options, it might return // the thread local cached one. 
@@ -680,8 +671,4 @@ static void ClipToRange(T* ptr, V minvalue, V maxvalue) { if (static_cast(*ptr) < minvalue) *ptr = minvalue; } -// Dump db file summary, implemented in util/ -extern void DumpDBFileSummary(const DBOptions& options, - const std::string& dbname); - } // namespace rocksdb diff --git a/db/db_impl_debug.cc b/db/db_impl_debug.cc index 8df66f6c6..efa209a2b 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl_debug.cc @@ -10,17 +10,17 @@ #ifndef ROCKSDB_LITE #include "db/db_impl.h" +#include "util/thread_status_updater.h" namespace rocksdb { -void DBImpl::TEST_PurgeObsoleteteWAL() { PurgeObsoleteWALFiles(); } - uint64_t DBImpl::TEST_GetLevel0TotalSize() { - MutexLock l(&mutex_); - return default_cf_handle_->cfd()->current()->NumLevelBytes(0); + InstrumentedMutexLock l(&mutex_); + return default_cf_handle_->cfd()->current()->storage_info()->NumLevelBytes(0); } -Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) { +Iterator* DBImpl::TEST_NewInternalIterator(Arena* arena, + ColumnFamilyHandle* column_family) { ColumnFamilyData* cfd; if (column_family == nullptr) { cfd = default_cf_handle_->cfd(); @@ -33,7 +33,7 @@ Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) { SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); mutex_.Unlock(); ReadOptions roptions; - return NewInternalIterator(roptions, cfd, super_version); + return NewInternalIterator(roptions, cfd, super_version, arena); } int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes( @@ -45,8 +45,8 @@ int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes( auto cfh = reinterpret_cast(column_family); cfd = cfh->cfd(); } - MutexLock l(&mutex_); - return cfd->current()->MaxNextLevelOverlappingBytes(); + InstrumentedMutexLock l(&mutex_); + return cfd->current()->storage_info()->MaxNextLevelOverlappingBytes(); } void DBImpl::TEST_GetFilesMetaData( @@ -54,10 +54,11 @@ void DBImpl::TEST_GetFilesMetaData( std::vector>* metadata) { auto cfh = 
reinterpret_cast(column_family); auto cfd = cfh->cfd(); - MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); metadata->resize(NumberLevels()); for (int level = 0; level < NumberLevels(); level++) { - const std::vector& files = cfd->current()->files_[level]; + const std::vector& files = + cfd->current()->storage_info()->LevelFiles(level); (*metadata)[level].clear(); for (const auto& f : files) { @@ -67,7 +68,7 @@ void DBImpl::TEST_GetFilesMetaData( } uint64_t DBImpl::TEST_Current_Manifest_FileNo() { - return versions_->ManifestFileNumber(); + return versions_->manifest_file_number(); } Status DBImpl::TEST_CompactRange(int level, const Slice* begin, @@ -81,8 +82,8 @@ Status DBImpl::TEST_CompactRange(int level, const Slice* begin, cfd = cfh->cfd(); } int output_level = - (cfd->options()->compaction_style == kCompactionStyleUniversal || - cfd->options()->compaction_style == kCompactionStyleFIFO) + (cfd->ioptions()->compaction_style == kCompactionStyleUniversal || + cfd->ioptions()->compaction_style == kCompactionStyleFIFO) ? level : level + 1; return RunManualCompaction(cfd, level, output_level, 0, begin, end); @@ -112,22 +113,33 @@ Status DBImpl::TEST_WaitForCompact() { // wait for compact. It actually waits for scheduled compaction // OR flush to finish. 
- MutexLock l(&mutex_); + InstrumentedMutexLock l(&mutex_); while ((bg_compaction_scheduled_ || bg_flush_scheduled_) && bg_error_.ok()) { bg_cv_.Wait(); } return bg_error_; } -Status DBImpl::TEST_ReadFirstRecord(const WalFileType type, - const uint64_t number, - SequenceNumber* sequence) { - return ReadFirstRecord(type, number, sequence); +void DBImpl::TEST_LockMutex() { + mutex_.Lock(); +} + +void DBImpl::TEST_UnlockMutex() { + mutex_.Unlock(); +} + +void* DBImpl::TEST_BeginWrite() { + auto w = new WriteThread::Writer(&mutex_); + Status s = write_thread_.EnterWriteThread(w, 0); + assert(s.ok() && !w->done); // No timeout and nobody should do our job + return reinterpret_cast(w); } -Status DBImpl::TEST_ReadFirstLine(const std::string& fname, - SequenceNumber* sequence) { - return ReadFirstLine(fname, sequence); +void DBImpl::TEST_EndWrite(void* w) { + auto writer = reinterpret_cast(w); + write_thread_.ExitWriteThread(writer, writer, Status::OK()); + delete writer; } + } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index 6c864aefd..c1d61e377 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -2,56 +2,31 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2012 Facebook. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. 
+ #include "db/db_impl_readonly.h" +#include "utilities/compacted_db/compacted_db_impl.h" #include "db/db_impl.h" - -#include -#include -#include -#include -#include -#include -#include "db/db_iter.h" -#include "db/dbformat.h" -#include "db/filename.h" -#include "db/log_reader.h" -#include "db/log_writer.h" -#include "db/memtable.h" #include "db/merge_context.h" -#include "db/table_cache.h" -#include "db/version_set.h" -#include "db/write_batch_internal.h" -#include "rocksdb/db.h" -#include "rocksdb/env.h" -#include "rocksdb/status.h" -#include "rocksdb/table.h" -#include "rocksdb/merge_operator.h" -#include "port/port.h" -#include "table/block.h" -#include "table/merger.h" -#include "table/two_level_iterator.h" -#include "util/coding.h" -#include "util/logging.h" -#include "util/build_version.h" +#include "db/db_iter.h" +#include "util/perf_context_imp.h" namespace rocksdb { -DBImplReadOnly::DBImplReadOnly(const DBOptions& options, +#ifndef ROCKSDB_LITE + +DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options, const std::string& dbname) - : DBImpl(options, dbname) { - Log(options_.info_log, "Opening the db in read only mode"); + : DBImpl(db_options, dbname) { + Log(INFO_LEVEL, db_options_.info_log, "Opening the db in read only mode"); + LogFlush(db_options_.info_log); } DBImplReadOnly::~DBImplReadOnly() { } // Implementations of the DB interface -Status DBImplReadOnly::Get(const ReadOptions& options, +Status DBImplReadOnly::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value) { Status s; @@ -61,33 +36,35 @@ Status DBImplReadOnly::Get(const ReadOptions& options, SuperVersion* super_version = cfd->GetSuperVersion(); MergeContext merge_context; LookupKey lkey(key, snapshot); - if (super_version->mem->Get(lkey, value, &s, merge_context, - *cfd->options())) { + if (super_version->mem->Get(lkey, value, &s, &merge_context)) { } else { - super_version->current->Get(options, lkey, value, &s, &merge_context); + 
PERF_TIMER_GUARD(get_from_output_files_time); + super_version->current->Get(read_options, lkey, value, &s, &merge_context); } return s; } -Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options, +Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options, ColumnFamilyHandle* column_family) { auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); SequenceNumber latest_snapshot = versions_->LastSequence(); auto db_iter = NewArenaWrappedDbIterator( - env_, *cfd->options(), cfd->user_comparator(), - (options.snapshot != nullptr - ? reinterpret_cast(options.snapshot)->number_ - : latest_snapshot)); - auto internal_iter = - NewInternalIterator(options, cfd, super_version, db_iter->GetArena()); + env_, *cfd->ioptions(), cfd->user_comparator(), + (read_options.snapshot != nullptr + ? reinterpret_cast( + read_options.snapshot)->number_ + : latest_snapshot), + super_version->mutable_cf_options.max_sequential_skip_in_iterations); + auto internal_iter = NewInternalIterator( + read_options, cfd, super_version, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); return db_iter; } Status DBImplReadOnly::NewIterators( - const ReadOptions& options, + const ReadOptions& read_options, const std::vector& column_families, std::vector* iterators) { if (iterators == nullptr) { @@ -98,14 +75,17 @@ Status DBImplReadOnly::NewIterators( SequenceNumber latest_snapshot = versions_->LastSequence(); for (auto cfh : column_families) { - auto cfd = reinterpret_cast(cfh)->cfd(); - auto db_iter = NewArenaWrappedDbIterator( - env_, *cfd->options(), cfd->user_comparator(), - options.snapshot != nullptr - ? 
reinterpret_cast(options.snapshot)->number_ - : latest_snapshot); - auto internal_iter = NewInternalIterator( - options, cfd, cfd->GetSuperVersion()->Ref(), db_iter->GetArena()); + auto* cfd = reinterpret_cast(cfh)->cfd(); + auto* sv = cfd->GetSuperVersion()->Ref(); + auto* db_iter = NewArenaWrappedDbIterator( + env_, *cfd->ioptions(), cfd->user_comparator(), + (read_options.snapshot != nullptr + ? reinterpret_cast( + read_options.snapshot)->number_ + : latest_snapshot), + sv->mutable_cf_options.max_sequential_skip_in_iterations); + auto* internal_iter = NewInternalIterator( + read_options, cfd, sv, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); iterators->push_back(db_iter); } @@ -117,6 +97,13 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, DB** dbptr, bool error_if_log_file_exist) { *dbptr = nullptr; + // Try to first open DB as fully compacted DB + Status s; + s = CompactedDBImpl::Open(options, dbname, dbptr); + if (s.ok()) { + return s; + } + DBOptions db_options(options); ColumnFamilyOptions cf_options(options); std::vector column_families; @@ -124,8 +111,7 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); std::vector handles; - Status s = - DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr); + s = DB::OpenForReadOnly(db_options, dbname, column_families, &handles, dbptr); if (s.ok()) { assert(handles.size() == 1); // i can delete the handle since DBImpl is always holding a @@ -167,6 +153,10 @@ Status DB::OpenForReadOnly( impl->mutex_.Unlock(); if (s.ok()) { *dbptr = impl; + for (auto* h : *handles) { + impl->NewThreadStatusCfInfo( + reinterpret_cast(h)->cfd()); + } } else { for (auto h : *handles) { delete h; @@ -177,5 +167,20 @@ Status DB::OpenForReadOnly( return s; } +#else // !ROCKSDB_LITE + +Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, + DB** dbptr, bool 
error_if_log_file_exist) { + return Status::NotSupported("Not supported in ROCKSDB_LITE."); +} + +Status DB::OpenForReadOnly( + const DBOptions& db_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, DB** dbptr, + bool error_if_log_file_exist) { + return Status::NotSupported("Not supported in ROCKSDB_LITE."); +} +#endif // !ROCKSDB_LITE } // namespace rocksdb diff --git a/db/db_impl_readonly.h b/db/db_impl_readonly.h index 1dfdf422e..25fcb4350 100644 --- a/db/db_impl_readonly.h +++ b/db/db_impl_readonly.h @@ -2,24 +2,14 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2012 Facebook. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. #pragma once -#include "db/db_impl.h" -#include -#include +#ifndef ROCKSDB_LITE + +#include "db/db_impl.h" #include #include -#include "db/dbformat.h" -#include "db/log_writer.h" -#include "db/snapshot.h" -#include "rocksdb/db.h" -#include "rocksdb/env.h" -#include "port/port.h" namespace rocksdb { @@ -75,10 +65,20 @@ class DBImplReadOnly : public DBImpl { return Status::NotSupported("Not supported operation in read only mode."); } + using DBImpl::CompactFiles; + virtual Status CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, + const int output_level, const int output_path_id = -1) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + #ifndef ROCKSDB_LITE virtual Status DisableFileDeletions() override { return Status::NotSupported("Not supported operation in read only mode."); } + virtual Status EnableFileDeletions(bool force) override { return Status::NotSupported("Not supported operation in 
read only mode."); } @@ -103,3 +103,5 @@ class DBImplReadOnly : public DBImpl { void operator=(const DBImplReadOnly&); }; } + +#endif // !ROCKSDB_LITE diff --git a/db/db_iter.cc b/db/db_iter.cc index 370ffd8cb..1b5bf860e 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -58,22 +58,25 @@ class DBIter: public Iterator { kReverse }; - DBIter(Env* env, const Options& options, const Comparator* cmp, - Iterator* iter, SequenceNumber s, bool arena_mode) + DBIter(Env* env, const ImmutableCFOptions& ioptions, + const Comparator* cmp, Iterator* iter, SequenceNumber s, + bool arena_mode, uint64_t max_sequential_skip_in_iterations, + const Slice* iterate_upper_bound = nullptr) : arena_mode_(arena_mode), env_(env), - logger_(options.info_log.get()), + logger_(ioptions.info_log), user_comparator_(cmp), - user_merge_operator_(options.merge_operator.get()), + user_merge_operator_(ioptions.merge_operator), iter_(iter), sequence_(s), direction_(kForward), valid_(false), current_entry_is_merged_(false), - statistics_(options.statistics.get()) { + statistics_(ioptions.statistics), + iterate_upper_bound_(iterate_upper_bound) { RecordTick(statistics_, NO_ITERATORS); - has_prefix_extractor_ = (options.prefix_extractor.get() != nullptr); - max_skip_ = options.max_sequential_skip_in_iterations; + prefix_extractor_ = ioptions.prefix_extractor; + max_skip_ = max_sequential_skip_in_iterations; } virtual ~DBIter() { RecordTick(statistics_, NO_ITERATORS, -1); @@ -132,7 +135,7 @@ class DBIter: public Iterator { } } - bool has_prefix_extractor_; + const SliceTransform* prefix_extractor_; bool arena_mode_; Env* const env_; Logger* logger_; @@ -149,6 +152,7 @@ class DBIter: public Iterator { bool current_entry_is_merged_; Statistics* statistics_; uint64_t max_skip_; + const Slice* iterate_upper_bound_; // No copying allowed DBIter(const DBIter&); @@ -158,7 +162,8 @@ class DBIter: public Iterator { inline bool DBIter::ParseKey(ParsedInternalKey* ikey) { if (!ParseInternalKey(iter_->key(), ikey)) { 
status_ = Status::Corruption("corrupted internal key in DBIter"); - Log(logger_, "corrupted internal key in DBIter: %s", + Log(InfoLogLevel::ERROR_LEVEL, + logger_, "corrupted internal key in DBIter: %s", iter_->key().ToString(true).c_str()); return false; } else { @@ -194,9 +199,8 @@ void DBIter::Next() { // NOTE: In between, saved_key_ can point to a user key that has // a delete marker inline void DBIter::FindNextUserEntry(bool skipping) { - PERF_TIMER_AUTO(find_next_user_entry_time); + PERF_TIMER_GUARD(find_next_user_entry_time); FindNextUserEntryInternal(skipping); - PERF_TIMER_STOP(find_next_user_entry_time); } // Actual implementation of DBIter::FindNextUserEntry() @@ -208,36 +212,44 @@ void DBIter::FindNextUserEntryInternal(bool skipping) { uint64_t num_skipped = 0; do { ParsedInternalKey ikey; - if (ParseKey(&ikey) && ikey.sequence <= sequence_) { - if (skipping && - user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) { - num_skipped++; // skip this entry - PERF_COUNTER_ADD(internal_key_skipped_count, 1); - } else { - skipping = false; - switch (ikey.type) { - case kTypeDeletion: - // Arrange to skip all upcoming entries for this key since - // they are hidden by this deletion. 
- saved_key_.SetKey(ikey.user_key); - skipping = true; - num_skipped = 0; - PERF_COUNTER_ADD(internal_delete_skipped_count, 1); - break; - case kTypeValue: - valid_ = true; - saved_key_.SetKey(ikey.user_key); - return; - case kTypeMerge: - // By now, we are sure the current ikey is going to yield a value - saved_key_.SetKey(ikey.user_key); - current_entry_is_merged_ = true; - valid_ = true; - MergeValuesNewToOld(); // Go to a different state machine - return; - default: - assert(false); - break; + + if (ParseKey(&ikey)) { + if (iterate_upper_bound_ != nullptr && + ikey.user_key.compare(*iterate_upper_bound_) >= 0) { + break; + } + + if (ikey.sequence <= sequence_) { + if (skipping && + user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) { + num_skipped++; // skip this entry + PERF_COUNTER_ADD(internal_key_skipped_count, 1); + } else { + skipping = false; + switch (ikey.type) { + case kTypeDeletion: + // Arrange to skip all upcoming entries for this key since + // they are hidden by this deletion. 
+ saved_key_.SetKey(ikey.user_key); + skipping = true; + num_skipped = 0; + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); + break; + case kTypeValue: + valid_ = true; + saved_key_.SetKey(ikey.user_key); + return; + case kTypeMerge: + // By now, we are sure the current ikey is going to yield a value + saved_key_.SetKey(ikey.user_key); + current_entry_is_merged_ = true; + valid_ = true; + MergeValuesNewToOld(); // Go to a different state machine + return; + default: + assert(false); + break; + } } } } @@ -267,16 +279,17 @@ void DBIter::FindNextUserEntryInternal(bool skipping) { // iter_ points to the next entry (or invalid) void DBIter::MergeValuesNewToOld() { if (!user_merge_operator_) { - Log(logger_, "Options::merge_operator is null."); - throw std::logic_error("DBIter::MergeValuesNewToOld() with" - " Options::merge_operator null"); + Log(InfoLogLevel::ERROR_LEVEL, + logger_, "Options::merge_operator is null."); + status_ = Status::InvalidArgument("user_merge_operator_ must be set."); + valid_ = false; + return; } // Start the merge process by pushing the first operand std::deque operands; operands.push_front(iter_->value().ToString()); - std::string merge_result; // Temporary string to hold merge result later ParsedInternalKey ikey; for (iter_->Next(); iter_->Valid(); iter_->Next()) { if (!ParseKey(&ikey)) { @@ -300,8 +313,8 @@ void DBIter::MergeValuesNewToOld() { // hit a put, merge the put value with operands and store the // final result in saved_value_. We are done! // ignore corruption if there is any. - const Slice value = iter_->value(); - user_merge_operator_->FullMerge(ikey.user_key, &value, operands, + const Slice val = iter_->value(); + user_merge_operator_->FullMerge(ikey.user_key, &val, operands, &saved_value_, logger_); // iter_ is positioned after put iter_->Next(); @@ -311,8 +324,8 @@ void DBIter::MergeValuesNewToOld() { if (kTypeMerge == ikey.type) { // hit a merge, add the value as an operand and run associative merge. 
// when complete, add result to operands and continue. - const Slice& value = iter_->value(); - operands.push_front(value.ToString()); + const Slice& val = iter_->value(); + operands.push_front(val.ToString()); } } @@ -399,6 +412,7 @@ bool DBIter::FindValueForCurrentKey() { case kTypeDeletion: operands.clear(); last_not_merge_type = kTypeDeletion; + PERF_COUNTER_ADD(internal_delete_skipped_count, 1); break; case kTypeMerge: assert(user_merge_operator_ != nullptr); @@ -408,6 +422,7 @@ bool DBIter::FindValueForCurrentKey() { assert(false); } + PERF_COUNTER_ADD(internal_key_skipped_count, 1); assert(user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0); iter_->Prev(); ++num_skipped; @@ -491,8 +506,8 @@ bool DBIter::FindValueForCurrentKeyUsingSeek() { return true; } - const Slice& value = iter_->value(); - user_merge_operator_->FullMerge(saved_key_.GetKey(), &value, operands, + const Slice& val = iter_->value(); + user_merge_operator_->FullMerge(saved_key_.GetKey(), &val, operands, &saved_value_, logger_); valid_ = true; return true; @@ -554,12 +569,29 @@ void DBIter::FindParseableKey(ParsedInternalKey* ikey, Direction direction) { void DBIter::Seek(const Slice& target) { StopWatch sw(env_, statistics_, DB_SEEK); + // total ordering is not guaranteed if prefix_extractor is set + // hence prefix based seeks will not give correct results + if (iterate_upper_bound_ != nullptr && prefix_extractor_ != nullptr) { + if (!prefix_extractor_->InDomain(*iterate_upper_bound_) || + !prefix_extractor_->InDomain(target) || + prefix_extractor_->Transform(*iterate_upper_bound_).compare( + prefix_extractor_->Transform(target)) != 0) { + status_ = Status::InvalidArgument("read_options.iterate_*_bound " + " and seek target need to have the same prefix."); + valid_ = false; + return; + } + } + saved_key_.Clear(); // now savved_key is used to store internal key. 
saved_key_.SetInternalKey(target, sequence_); - PERF_TIMER_AUTO(seek_internal_seek_time); - iter_->Seek(saved_key_.GetKey()); - PERF_TIMER_STOP(seek_internal_seek_time); + + { + PERF_TIMER_GUARD(seek_internal_seek_time); + iter_->Seek(saved_key_.GetKey()); + } + if (iter_->Valid()) { direction_ = kForward; ClearSavedValue(); @@ -572,14 +604,17 @@ void DBIter::Seek(const Slice& target) { void DBIter::SeekToFirst() { // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek wiil be used. - if (has_prefix_extractor_) { + if (prefix_extractor_ != nullptr) { max_skip_ = std::numeric_limits::max(); } direction_ = kForward; ClearSavedValue(); - PERF_TIMER_AUTO(seek_internal_seek_time); - iter_->SeekToFirst(); - PERF_TIMER_STOP(seek_internal_seek_time); + + { + PERF_TIMER_GUARD(seek_internal_seek_time); + iter_->SeekToFirst(); + } + if (iter_->Valid()) { FindNextUserEntry(false /* not skipping */); } else { @@ -590,24 +625,29 @@ void DBIter::SeekToFirst() { void DBIter::SeekToLast() { // Don't use iter_::Seek() if we set a prefix extractor // because prefix seek wiil be used. 
- if (has_prefix_extractor_) { + if (prefix_extractor_ != nullptr) { max_skip_ = std::numeric_limits::max(); } direction_ = kReverse; ClearSavedValue(); - PERF_TIMER_AUTO(seek_internal_seek_time); - iter_->SeekToLast(); - PERF_TIMER_STOP(seek_internal_seek_time); + + { + PERF_TIMER_GUARD(seek_internal_seek_time); + iter_->SeekToLast(); + } PrevInternal(); } -Iterator* NewDBIterator(Env* env, const Options& options, +Iterator* NewDBIterator(Env* env, const ImmutableCFOptions& ioptions, const Comparator* user_key_comparator, Iterator* internal_iter, - const SequenceNumber& sequence) { - return new DBIter(env, options, user_key_comparator, internal_iter, sequence, - false); + const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, + const Slice* iterate_upper_bound) { + return new DBIter(env, ioptions, user_key_comparator, internal_iter, sequence, + false, max_sequential_skip_in_iterations, + iterate_upper_bound); } ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); } @@ -635,14 +675,20 @@ void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1, } ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const Options& options, const Comparator* user_key_comparator, - const SequenceNumber& sequence) { + Env* env, const ImmutableCFOptions& ioptions, + const Comparator* user_key_comparator, + const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, + const Slice* iterate_upper_bound) { ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); Arena* arena = iter->GetArena(); auto mem = arena->AllocateAligned(sizeof(DBIter)); - DBIter* db_iter = new (mem) - DBIter(env, options, user_key_comparator, nullptr, sequence, true); + DBIter* db_iter = new (mem) DBIter(env, ioptions, user_key_comparator, + nullptr, sequence, true, max_sequential_skip_in_iterations, + iterate_upper_bound); + iter->SetDBIter(db_iter); + return iter; } diff --git a/db/db_iter.h b/db/db_iter.h index cb9840324..c676d6cda 100644 
--- a/db/db_iter.h +++ b/db/db_iter.h @@ -24,10 +24,12 @@ class DBIter; // into appropriate user keys. extern Iterator* NewDBIterator( Env* env, - const Options& options, + const ImmutableCFOptions& options, const Comparator *user_key_comparator, Iterator* internal_iter, - const SequenceNumber& sequence); + const SequenceNumber& sequence, + uint64_t max_sequential_skip_in_iterations, + const Slice* iterate_upper_bound = nullptr); // A wrapper iterator which wraps DB Iterator and the arena, with which the DB // iterator is supposed be allocated. This class is used as an entry point of @@ -67,7 +69,9 @@ class ArenaWrappedDBIter : public Iterator { // Generate the arena wrapped iterator class. extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( - Env* env, const Options& options, const Comparator* user_key_comparator, - const SequenceNumber& sequence); + Env* env, const ImmutableCFOptions& options, + const Comparator* user_key_comparator, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, + const Slice* iterate_upper_bound = nullptr); } // namespace rocksdb diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index 4ce79da1b..f045d7798 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -19,7 +19,7 @@ namespace rocksdb { -static uint32_t TestGetTickerCount(const Options& options, +static uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) { return options.statistics->getTickerCount(ticker_type); } @@ -33,20 +33,23 @@ class TestIterator : public Iterator { iter_(0), cmp(comparator) {} - void AddMerge(std::string key, std::string value) { - Add(key, kTypeMerge, value); + void AddMerge(std::string argkey, std::string argvalue) { + Add(argkey, kTypeMerge, argvalue); } - void AddDeletion(std::string key) { Add(key, kTypeDeletion, std::string()); } + void AddDeletion(std::string argkey) { + Add(argkey, kTypeDeletion, std::string()); + } - void AddPut(std::string key, std::string value) { - Add(key, kTypeValue, 
value); + void AddPut(std::string argkey, std::string argvalue) { + Add(argkey, kTypeValue, argvalue); } - void Add(std::string key, ValueType type, std::string value) { + void Add(std::string argkey, ValueType type, std::string argvalue) { valid_ = true; - ParsedInternalKey internal_key(key, sequence_number_++, type); - data_.push_back(std::pair(std::string(), value)); + ParsedInternalKey internal_key(argkey, sequence_number_++, type); + data_.push_back( + std::pair(std::string(), argvalue)); AppendInternalKey(&data_.back().first, internal_key); } @@ -158,7 +161,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 10, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -191,7 +196,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 10, + options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -213,7 +220,6 @@ TEST(DBIteratorTest, DBIteratorPrevNext) { } { - Options options; TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->AddPut("a", "val_a"); internal_iter->AddPut("b", "val_b"); @@ -232,7 +238,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -248,7 +256,6 @@ 
TEST(DBIteratorTest, DBIteratorPrevNext) { } { - Options options; TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->AddPut("a", "val_a"); internal_iter->AddPut("a", "val_a"); @@ -262,7 +269,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 10, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -288,7 +297,9 @@ TEST(DBIteratorTest, DBIteratorEmpty) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 0, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); } @@ -298,7 +309,9 @@ TEST(DBIteratorTest, DBIteratorEmpty) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 0, + options.max_sequential_skip_in_iterations)); db_iter->SeekToFirst(); ASSERT_TRUE(!db_iter->Valid()); } @@ -318,7 +331,9 @@ TEST(DBIteratorTest, DBIteratorUseSkipCountSkips) { internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -350,19 +365,21 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->AddMerge("b", 
"merge_1"); internal_iter->AddMerge("a", "merge_2"); - for (size_t i = 0; i < 200; ++i) { - internal_iter->AddPut("c", std::to_string(i)); + for (size_t k = 0; k < 200; ++k) { + internal_iter->AddPut("c", ToString(k)); } internal_iter->Finish(); options.statistics = rocksdb::CreateDBStatistics(); std::unique_ptr db_iter(NewDBIterator( - env_, options, BytewiseComparator(), internal_iter, i + 2)); + env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, i + 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); - ASSERT_EQ(db_iter->value().ToString(), std::to_string(i)); + ASSERT_EQ(db_iter->value().ToString(), ToString(i)); db_iter->Prev(); ASSERT_TRUE(db_iter->Valid()); @@ -384,14 +401,16 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->AddMerge("b", "merge_1"); internal_iter->AddMerge("a", "merge_2"); - for (size_t i = 0; i < 200; ++i) { + for (size_t k = 0; k < 200; ++k) { internal_iter->AddDeletion("c"); } internal_iter->AddPut("c", "200"); internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, options, BytewiseComparator(), internal_iter, i + 2)); + env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, i + 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -418,7 +437,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, options, BytewiseComparator(), internal_iter, 202)); + env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 202, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -443,13 +464,15 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { { for (size_t i = 0; i < 200; ++i) { TestIterator* internal_iter = new 
TestIterator(BytewiseComparator()); - for (size_t i = 0; i < 200; ++i) { + for (size_t k = 0; k < 200; ++k) { internal_iter->AddDeletion("c"); } internal_iter->AddPut("c", "200"); internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, i)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, i, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); @@ -464,7 +487,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { internal_iter->AddPut("c", "200"); internal_iter->Finish(); std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 200)); + NewDBIterator(env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, 200, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -487,22 +512,24 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->AddMerge("b", "merge_1"); internal_iter->AddMerge("a", "merge_2"); - for (size_t i = 0; i < 200; ++i) { - internal_iter->AddPut("d", std::to_string(i)); + for (size_t k = 0; k < 200; ++k) { + internal_iter->AddPut("d", ToString(k)); } - for (size_t i = 0; i < 200; ++i) { - internal_iter->AddPut("c", std::to_string(i)); + for (size_t k = 0; k < 200; ++k) { + internal_iter->AddPut("c", ToString(k)); } internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, options, BytewiseComparator(), internal_iter, i + 2)); + env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, i + 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "d"); - ASSERT_EQ(db_iter->value().ToString(), std::to_string(i)); + ASSERT_EQ(db_iter->value().ToString(), ToString(i)); db_iter->Prev(); 
ASSERT_TRUE(db_iter->Valid()); @@ -524,20 +551,22 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->AddMerge("b", "b"); internal_iter->AddMerge("a", "a"); - for (size_t i = 0; i < 200; ++i) { - internal_iter->AddMerge("c", std::to_string(i)); + for (size_t k = 0; k < 200; ++k) { + internal_iter->AddMerge("c", ToString(k)); } internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, options, BytewiseComparator(), internal_iter, i + 2)); + env_, ImmutableCFOptions(options), + BytewiseComparator(), internal_iter, i + 2, + options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); std::string merge_result = "0"; for (size_t j = 1; j <= i; ++j) { - merge_result += "," + std::to_string(j); + merge_result += "," + ToString(j); } ASSERT_EQ(db_iter->value().ToString(), merge_result); @@ -557,777 +586,819 @@ TEST(DBIteratorTest, DBIteratorUseSkip) { } } -TEST(DBIteratorTest, DBIterator) { +TEST(DBIteratorTest, DBIterator1) { + Options options; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddPut("a", "0"); + internal_iter->AddPut("b", "0"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("a", "1"); + internal_iter->AddMerge("b", "2"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 1, + options.max_sequential_skip_in_iterations)); + db_iter->SeekToFirst(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "0"); + db_iter->Next(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); +} + +TEST(DBIteratorTest, DBIterator2) { + Options options; + options.merge_operator = 
MergeOperators::CreateFromStringId("stringappend"); + + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddPut("a", "0"); + internal_iter->AddPut("b", "0"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("a", "1"); + internal_iter->AddMerge("b", "2"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 0, + options.max_sequential_skip_in_iterations)); + db_iter->SeekToFirst(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "0"); + db_iter->Next(); + ASSERT_TRUE(!db_iter->Valid()); +} + +TEST(DBIteratorTest, DBIterator3) { + Options options; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddPut("a", "0"); + internal_iter->AddPut("b", "0"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("a", "1"); + internal_iter->AddMerge("b", "2"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 2, + options.max_sequential_skip_in_iterations)); + db_iter->SeekToFirst(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "0"); + db_iter->Next(); + ASSERT_TRUE(!db_iter->Valid()); +} +TEST(DBIteratorTest, DBIterator4) { + Options options; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddPut("a", "0"); + internal_iter->AddPut("b", "0"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("a", "1"); + internal_iter->AddMerge("b", "2"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), 
BytewiseComparator(), internal_iter, 4, + options.max_sequential_skip_in_iterations)); + db_iter->SeekToFirst(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "0,1"); + db_iter->Next(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "2"); + db_iter->Next(); + ASSERT_TRUE(!db_iter->Valid()); +} + +TEST(DBIteratorTest, DBIterator5) { Options options; options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddPut("a", "0"); - internal_iter->AddPut("b", "0"); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("a", "1"); - internal_iter->AddMerge("b", "2"); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddPut("a", "put_1"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 1)); - db_iter->SeekToFirst(); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 0, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "0"); - db_iter->Next(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); } { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddPut("a", "0"); - internal_iter->AddPut("b", "0"); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("a", 
"1"); - internal_iter->AddMerge("b", "2"); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddPut("a", "put_1"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); - db_iter->SeekToFirst(); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 1, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "0"); - db_iter->Next(); + ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2"); + db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); } { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddPut("a", "0"); - internal_iter->AddPut("b", "0"); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("a", "1"); - internal_iter->AddMerge("b", "2"); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddPut("a", "put_1"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); - db_iter->SeekToFirst(); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 2, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "0"); - db_iter->Next(); + 
ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3"); + db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); } { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddPut("a", "0"); - internal_iter->AddPut("b", "0"); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("a", "1"); - internal_iter->AddMerge("b", "2"); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddPut("a", "put_1"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4)); - db_iter->SeekToFirst(); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 3, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "0,1"); - db_iter->Next(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "b"); - ASSERT_EQ(db_iter->value().ToString(), "2"); - db_iter->Next(); + ASSERT_EQ(db_iter->value().ToString(), "put_1"); + db_iter->Prev(); ASSERT_TRUE(!db_iter->Valid()); } { - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddPut("a", "put_1"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", 
"merge_3"); + internal_iter->AddPut("a", "put_1"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 4, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddPut("a", "put_1"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddPut("a", "put_1"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 1)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2"); - db_iter->Prev(); - 
ASSERT_TRUE(!db_iter->Valid()); - } + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 5, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddPut("a", "put_1"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddPut("a", "put_1"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 6, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5,merge_6"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } +} - { - TestIterator* internal_iter = new 
TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddPut("a", "put_1"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); +TEST(DBIteratorTest, DBIterator6) { + Options options; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddDeletion("a"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 3)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "put_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 0, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddPut("a", "put_1"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); + { + TestIterator* 
internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddDeletion("a"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 1, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddPut("a", "put_1"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddDeletion("a"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 5)); - 
db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 2, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddPut("a", "put_1"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddDeletion("a"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 6)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "put_1,merge_4,merge_5,merge_6"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 3, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(!db_iter->Valid()); } { - { - TestIterator* 
internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddDeletion("a"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddDeletion("a"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 4, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddDeletion("a"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + 
internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddDeletion("a"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 1)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } - - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddDeletion("a"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); - - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1,merge_2,merge_3"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } - - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddDeletion("a"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); - - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 3)); - db_iter->SeekToLast(); - ASSERT_TRUE(!db_iter->Valid()); - } - - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", 
"merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddDeletion("a"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); - - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_4"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } - - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddDeletion("a"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); - - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 5)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } - - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddMerge("a", "merge_2"); - internal_iter->AddMerge("a", "merge_3"); - internal_iter->AddDeletion("a"); - internal_iter->AddMerge("a", "merge_4"); - internal_iter->AddMerge("a", "merge_5"); - internal_iter->AddMerge("a", "merge_6"); - internal_iter->Finish(); - - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 6)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5,merge_6"); - 
db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 5, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); } { - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddPut("b", "val"); - internal_iter->AddMerge("b", "merge_2"); - - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_3"); - - internal_iter->AddMerge("c", "merge_4"); - internal_iter->AddMerge("c", "merge_5"); - - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_6"); - internal_iter->AddMerge("b", "merge_7"); - internal_iter->AddMerge("b", "merge_8"); - internal_iter->AddMerge("b", "merge_9"); - internal_iter->AddMerge("b", "merge_10"); - internal_iter->AddMerge("b", "merge_11"); - - internal_iter->AddDeletion("c"); - internal_iter->Finish(); - - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } - - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddPut("b", "val"); - internal_iter->AddMerge("b", "merge_2"); - - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_3"); - - internal_iter->AddMerge("c", "merge_4"); - internal_iter->AddMerge("c", "merge_5"); - - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_6"); - internal_iter->AddMerge("b", "merge_7"); - 
internal_iter->AddMerge("b", "merge_8"); - internal_iter->AddMerge("b", "merge_9"); - internal_iter->AddMerge("b", "merge_10"); - internal_iter->AddMerge("b", "merge_11"); - - internal_iter->AddDeletion("c"); - internal_iter->Finish(); - - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); - - ASSERT_EQ(db_iter->key().ToString(), "b"); - ASSERT_EQ(db_iter->value().ToString(), "val,merge_2"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); - - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } - - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddPut("b", "val"); - internal_iter->AddMerge("b", "merge_2"); + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddMerge("a", "merge_2"); + internal_iter->AddMerge("a", "merge_3"); + internal_iter->AddDeletion("a"); + internal_iter->AddMerge("a", "merge_4"); + internal_iter->AddMerge("a", "merge_5"); + internal_iter->AddMerge("a", "merge_6"); + internal_iter->Finish(); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_3"); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 6, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5,merge_6"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } +} - internal_iter->AddMerge("c", "merge_4"); - internal_iter->AddMerge("c", "merge_5"); +TEST(DBIteratorTest, DBIterator7) { + Options options; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + 
{ + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_6"); - internal_iter->AddMerge("b", "merge_7"); - internal_iter->AddMerge("b", "merge_8"); - internal_iter->AddMerge("b", "merge_9"); - internal_iter->AddMerge("b", "merge_10"); - internal_iter->AddMerge("b", "merge_11"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); - internal_iter->AddDeletion("c"); - internal_iter->Finish(); + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); - ASSERT_EQ(db_iter->key().ToString(), "b"); - ASSERT_EQ(db_iter->value().ToString(), "merge_3"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 0, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + 
internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddPut("b", "val"); - internal_iter->AddMerge("b", "merge_2"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_3"); + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); - internal_iter->AddMerge("c", "merge_4"); - internal_iter->AddMerge("c", "merge_5"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_6"); - internal_iter->AddMerge("b", "merge_7"); - internal_iter->AddMerge("b", "merge_8"); - internal_iter->AddMerge("b", "merge_9"); - internal_iter->AddMerge("b", "merge_10"); - internal_iter->AddMerge("b", "merge_11"); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 2, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); - internal_iter->AddDeletion("c"); - internal_iter->Finish(); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "val,merge_2"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 5)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + 
db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - ASSERT_EQ(db_iter->key().ToString(), "c"); - ASSERT_EQ(db_iter->value().ToString(), "merge_4"); - db_iter->Prev(); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "b"); - ASSERT_EQ(db_iter->value().ToString(), "merge_3"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddPut("b", "val"); - internal_iter->AddMerge("b", "merge_2"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_3"); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 4, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); - internal_iter->AddMerge("c", "merge_4"); - internal_iter->AddMerge("c", "merge_5"); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "merge_3"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); - 
internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_6"); - internal_iter->AddMerge("b", "merge_7"); - internal_iter->AddMerge("b", "merge_8"); - internal_iter->AddMerge("b", "merge_9"); - internal_iter->AddMerge("b", "merge_10"); - internal_iter->AddMerge("b", "merge_11"); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - internal_iter->AddDeletion("c"); - internal_iter->Finish(); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 6)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); - ASSERT_EQ(db_iter->key().ToString(), "c"); - ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "b"); - ASSERT_EQ(db_iter->value().ToString(), "merge_3"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), 
internal_iter, + 5, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddPut("b", "val"); - internal_iter->AddMerge("b", "merge_2"); + ASSERT_EQ(db_iter->key().ToString(), "c"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4"); + db_iter->Prev(); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_3"); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "merge_3"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); - internal_iter->AddMerge("c", "merge_4"); - internal_iter->AddMerge("c", "merge_5"); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_6"); - internal_iter->AddMerge("b", "merge_7"); - internal_iter->AddMerge("b", "merge_8"); - internal_iter->AddMerge("b", "merge_9"); - internal_iter->AddMerge("b", "merge_10"); - internal_iter->AddMerge("b", "merge_11"); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); - internal_iter->AddDeletion("c"); - internal_iter->Finish(); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 7)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); - ASSERT_EQ(db_iter->key().ToString(), "c"); - ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); + 
internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 6, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddPut("b", "val"); - internal_iter->AddMerge("b", "merge_2"); + ASSERT_EQ(db_iter->key().ToString(), "c"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_3"); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "merge_3"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); - internal_iter->AddMerge("c", "merge_4"); - internal_iter->AddMerge("c", "merge_5"); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_6"); - internal_iter->AddMerge("b", "merge_7"); - internal_iter->AddMerge("b", "merge_8"); - internal_iter->AddMerge("b", "merge_9"); - internal_iter->AddMerge("b", "merge_10"); - internal_iter->AddMerge("b", "merge_11"); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", 
"merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); - internal_iter->AddDeletion("c"); - internal_iter->Finish(); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 9)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); - ASSERT_EQ(db_iter->key().ToString(), "c"); - ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "b"); - ASSERT_EQ(db_iter->value().ToString(), "merge_6,merge_7"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 7, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + ASSERT_EQ(db_iter->key().ToString(), "c"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddPut("b", "val"); - internal_iter->AddMerge("b", "merge_2"); + ASSERT_EQ(db_iter->key().ToString(), "a"); + 
ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_3"); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); - internal_iter->AddMerge("c", "merge_4"); - internal_iter->AddMerge("c", "merge_5"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_6"); - internal_iter->AddMerge("b", "merge_7"); - internal_iter->AddMerge("b", "merge_8"); - internal_iter->AddMerge("b", "merge_9"); - internal_iter->AddMerge("b", "merge_10"); - internal_iter->AddMerge("b", "merge_11"); + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); - internal_iter->AddDeletion("c"); - internal_iter->Finish(); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( - env_, options, BytewiseComparator(), internal_iter, 13)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 9, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "c"); - ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "c"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); + 
db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); - ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "b"); - ASSERT_EQ(db_iter->value().ToString(), - "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "merge_6,merge_7"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); + } - { - TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddMerge("a", "merge_1"); - internal_iter->AddPut("b", "val"); - internal_iter->AddMerge("b", "merge_2"); + { + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_3"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); - internal_iter->AddMerge("c", "merge_4"); - internal_iter->AddMerge("c", "merge_5"); + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); - internal_iter->AddDeletion("b"); - internal_iter->AddMerge("b", "merge_6"); - internal_iter->AddMerge("b", "merge_7"); - internal_iter->AddMerge("b", "merge_8"); - internal_iter->AddMerge("b", "merge_9"); - internal_iter->AddMerge("b", "merge_10"); - internal_iter->AddMerge("b", "merge_11"); + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", 
"merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); + internal_iter->Finish(); - internal_iter->AddDeletion("c"); - internal_iter->Finish(); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 13, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); - std::unique_ptr db_iter(NewDBIterator( - env_, options, BytewiseComparator(), internal_iter, 14)); - db_iter->SeekToLast(); - ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "c"); + ASSERT_EQ(db_iter->value().ToString(), "merge_4,merge_5"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "b"); - ASSERT_EQ(db_iter->value().ToString(), - "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11"); - db_iter->Prev(); - ASSERT_TRUE(db_iter->Valid()); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), + "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11"); + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "merge_1"); - db_iter->Prev(); - ASSERT_TRUE(!db_iter->Valid()); - } + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); } { - Options options; TestIterator* internal_iter = new TestIterator(BytewiseComparator()); - internal_iter->AddDeletion("a"); - internal_iter->AddPut("a", "0"); - internal_iter->AddPut("b", "0"); + internal_iter->AddMerge("a", "merge_1"); + internal_iter->AddPut("b", "val"); + internal_iter->AddMerge("b", "merge_2"); + + internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_3"); + + internal_iter->AddMerge("c", "merge_4"); + internal_iter->AddMerge("c", "merge_5"); + + 
internal_iter->AddDeletion("b"); + internal_iter->AddMerge("b", "merge_6"); + internal_iter->AddMerge("b", "merge_7"); + internal_iter->AddMerge("b", "merge_8"); + internal_iter->AddMerge("b", "merge_9"); + internal_iter->AddMerge("b", "merge_10"); + internal_iter->AddMerge("b", "merge_11"); + + internal_iter->AddDeletion("c"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10)); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 14, options.max_sequential_skip_in_iterations)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); - ASSERT_EQ(db_iter->key().ToString(), "b"); - ASSERT_EQ(db_iter->value().ToString(), "0"); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), + "merge_6,merge_7,merge_8,merge_9,merge_10,merge_11"); db_iter->Prev(); ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); - ASSERT_EQ(db_iter->value().ToString(), "0"); + ASSERT_EQ(db_iter->value().ToString(), "merge_1"); + db_iter->Prev(); + ASSERT_TRUE(!db_iter->Valid()); } } +TEST(DBIteratorTest, DBIterator8) { + Options options; + options.merge_operator = MergeOperators::CreateFromStringId("stringappend"); + + TestIterator* internal_iter = new TestIterator(BytewiseComparator()); + internal_iter->AddDeletion("a"); + internal_iter->AddPut("a", "0"); + internal_iter->AddPut("b", "0"); + internal_iter->Finish(); + + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 10, options.max_sequential_skip_in_iterations)); + db_iter->SeekToLast(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "b"); + ASSERT_EQ(db_iter->value().ToString(), "0"); + + db_iter->Prev(); + ASSERT_TRUE(db_iter->Valid()); + ASSERT_EQ(db_iter->key().ToString(), "a"); + ASSERT_EQ(db_iter->value().ToString(), "0"); +} } // namespace rocksdb diff 
--git a/db/db_test.cc b/db/db_test.cc index 6295f5921..b4e0a46d0 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -11,14 +11,18 @@ #include #include #include +#include #include #include +#include "db/filename.h" #include "db/dbformat.h" #include "db/db_impl.h" #include "db/filename.h" +#include "db/job_context.h" #include "db/version_set.h" #include "db/write_batch_internal.h" +#include "port/stack_trace.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/db.h" @@ -30,53 +34,61 @@ #include "rocksdb/table.h" #include "rocksdb/options.h" #include "rocksdb/table_properties.h" +#include "rocksdb/thread_status.h" #include "rocksdb/utilities/write_batch_with_index.h" +#include "rocksdb/utilities/checkpoint.h" #include "table/block_based_table_factory.h" #include "table/plain_table_factory.h" #include "util/hash.h" #include "util/hash_linklist_rep.h" #include "utilities/merge_operators.h" #include "util/logging.h" +#include "util/compression.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" #include "util/statistics.h" #include "util/testharness.h" +#include "util/scoped_arena_iterator.h" #include "util/sync_point.h" #include "util/testutil.h" +#include "util/mock_env.h" +#include "util/string_util.h" +#include "util/thread_status_util.h" +#include "util/xfunc.h" namespace rocksdb { static bool SnappyCompressionSupported(const CompressionOptions& options) { std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::Snappy_Compress(options, in.data(), in.size(), &out); + return Snappy_Compress(options, in.data(), in.size(), &out); } static bool ZlibCompressionSupported(const CompressionOptions& options) { std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::Zlib_Compress(options, in.data(), in.size(), &out); + return Zlib_Compress(options, 2, in.data(), in.size(), &out); } static bool BZip2CompressionSupported(const CompressionOptions& options) { std::string out; Slice in = 
"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::BZip2_Compress(options, in.data(), in.size(), &out); + return BZip2_Compress(options, 2, in.data(), in.size(), &out); } static bool LZ4CompressionSupported(const CompressionOptions &options) { std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::LZ4_Compress(options, in.data(), in.size(), &out); + return LZ4_Compress(options, 2, in.data(), in.size(), &out); } static bool LZ4HCCompressionSupported(const CompressionOptions &options) { std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::LZ4HC_Compress(options, in.data(), in.size(), &out); + return LZ4HC_Compress(options, 2, in.data(), in.size(), &out); } -static std::string RandomString(Random *rnd, int len) { +static std::string RandomString(Random* rnd, int len) { std::string r; test::RandomString(rnd, len, &r); return r; @@ -105,6 +117,9 @@ class AtomicCounter { struct OptionsOverride { std::shared_ptr filter_policy = nullptr; + + // Used as a bit mask of individual enums in which to skip an XF test point + int skip_policy = 0; }; } // namespace anon @@ -118,23 +133,32 @@ static std::string Key(int i) { // Special Env used to delay background operations class SpecialEnv : public EnvWrapper { public: + Random rnd_; + port::Mutex rnd_mutex_; // Lock to pretect rnd_ + // sstable Sync() calls are blocked while this pointer is non-nullptr. - port::AtomicPointer delay_sstable_sync_; + std::atomic delay_sstable_sync_; + + // Drop writes on the floor while this pointer is non-nullptr. + std::atomic drop_writes_; // Simulate no-space errors while this pointer is non-nullptr. 
- port::AtomicPointer no_space_; + std::atomic no_space_; // Simulate non-writable file system while this pointer is non-nullptr - port::AtomicPointer non_writable_; + std::atomic non_writable_; // Force sync of manifest files to fail while this pointer is non-nullptr - port::AtomicPointer manifest_sync_error_; + std::atomic manifest_sync_error_; // Force write to manifest files to fail while this pointer is non-nullptr - port::AtomicPointer manifest_write_error_; + std::atomic manifest_write_error_; // Force write to log files to fail while this pointer is non-nullptr - port::AtomicPointer log_write_error_; + std::atomic log_write_error_; + + // Slow down every log write, in micro-seconds. + std::atomic log_write_slowdown_; bool count_random_reads_; anon::AtomicCounter random_read_counter_; @@ -146,16 +170,35 @@ class SpecialEnv : public EnvWrapper { std::atomic bytes_written_; - explicit SpecialEnv(Env* base) : EnvWrapper(base) { - delay_sstable_sync_.Release_Store(nullptr); - no_space_.Release_Store(nullptr); - non_writable_.Release_Store(nullptr); + std::atomic sync_counter_; + + std::atomic non_writeable_rate_; + + std::atomic new_writable_count_; + + std::atomic non_writable_count_; + + std::function* table_write_callback_; + + int64_t addon_time_; + + explicit SpecialEnv(Env* base) : EnvWrapper(base), rnd_(301), addon_time_(0) { + delay_sstable_sync_.store(false, std::memory_order_release); + drop_writes_.store(false, std::memory_order_release); + no_space_.store(false, std::memory_order_release); + non_writable_.store(false, std::memory_order_release); count_random_reads_ = false; count_sequential_reads_ = false; - manifest_sync_error_.Release_Store(nullptr); - manifest_write_error_.Release_Store(nullptr); - log_write_error_.Release_Store(nullptr); + manifest_sync_error_.store(false, std::memory_order_release); + manifest_write_error_.store(false, std::memory_order_release); + log_write_error_.store(false, std::memory_order_release); + log_write_slowdown_ = 
0; bytes_written_ = 0; + sync_counter_ = 0; + non_writeable_rate_ = 0; + new_writable_count_ = 0; + non_writable_count_ = 0; + table_write_callback_ = nullptr; } Status NewWritableFile(const std::string& f, unique_ptr* r, @@ -171,9 +214,14 @@ class SpecialEnv : public EnvWrapper { base_(std::move(base)) { } Status Append(const Slice& data) { - if (env_->no_space_.Acquire_Load() != nullptr) { + if (env_->table_write_callback_) { + (*env_->table_write_callback_)(); + } + if (env_->drop_writes_.load(std::memory_order_acquire)) { // Drop writes on the floor return Status::OK(); + } else if (env_->no_space_.load(std::memory_order_acquire)) { + return Status::IOError("No space left on device"); } else { env_->bytes_written_ += data.size(); return base_->Append(data); @@ -182,7 +230,8 @@ class SpecialEnv : public EnvWrapper { Status Close() { return base_->Close(); } Status Flush() { return base_->Flush(); } Status Sync() { - while (env_->delay_sstable_sync_.Acquire_Load() != nullptr) { + ++env_->sync_counter_; + while (env_->delay_sstable_sync_.load(std::memory_order_acquire)) { env_->SleepForMicroseconds(100000); } return base_->Sync(); @@ -199,7 +248,7 @@ class SpecialEnv : public EnvWrapper { ManifestFile(SpecialEnv* env, unique_ptr&& b) : env_(env), base_(std::move(b)) { } Status Append(const Slice& data) { - if (env_->manifest_write_error_.Acquire_Load() != nullptr) { + if (env_->manifest_write_error_.load(std::memory_order_acquire)) { return Status::IOError("simulated writer error"); } else { return base_->Append(data); @@ -208,33 +257,59 @@ class SpecialEnv : public EnvWrapper { Status Close() { return base_->Close(); } Status Flush() { return base_->Flush(); } Status Sync() { - if (env_->manifest_sync_error_.Acquire_Load() != nullptr) { + ++env_->sync_counter_; + if (env_->manifest_sync_error_.load(std::memory_order_acquire)) { return Status::IOError("simulated sync error"); } else { return base_->Sync(); } } + uint64_t GetFileSize() { + return 
base_->GetFileSize(); + } }; - class LogFile : public WritableFile { + class WalFile : public WritableFile { private: SpecialEnv* env_; unique_ptr base_; public: - LogFile(SpecialEnv* env, unique_ptr&& b) - : env_(env), base_(std::move(b)) { } + WalFile(SpecialEnv* env, unique_ptr&& b) + : env_(env), base_(std::move(b)) {} Status Append(const Slice& data) { - if (env_->log_write_error_.Acquire_Load() != nullptr) { + if (env_->log_write_error_.load(std::memory_order_acquire)) { return Status::IOError("simulated writer error"); } else { + int slowdown = + env_->log_write_slowdown_.load(std::memory_order_acquire); + if (slowdown > 0) { + env_->SleepForMicroseconds(slowdown); + } return base_->Append(data); } } Status Close() { return base_->Close(); } Status Flush() { return base_->Flush(); } - Status Sync() { return base_->Sync(); } + Status Sync() { + ++env_->sync_counter_; + return base_->Sync(); + } }; - if (non_writable_.Acquire_Load() != nullptr) { + if (non_writeable_rate_.load(std::memory_order_acquire) > 0) { + uint32_t random_number; + { + MutexLock l(&rnd_mutex_); + random_number = rnd_.Uniform(100); + } + if (random_number < non_writeable_rate_.load()) { + return Status::IOError("simulated random write error"); + } + } + + new_writable_count_++; + + if (non_writable_count_.load() > 0) { + non_writable_count_--; return Status::IOError("simulated write error"); } @@ -245,7 +320,7 @@ class SpecialEnv : public EnvWrapper { } else if (strstr(f.c_str(), "MANIFEST") != nullptr) { r->reset(new ManifestFile(this, std::move(*r))); } else if (strstr(f.c_str(), "log") != nullptr) { - r->reset(new LogFile(this, std::move(*r))); + r->reset(new WalFile(this, std::move(*r))); } } return s; @@ -306,6 +381,14 @@ class SpecialEnv : public EnvWrapper { sleep_counter_.Increment(); target()->SleepForMicroseconds(micros); } + + virtual Status GetCurrentTime(int64_t* unix_time) override { + Status s = target()->GetCurrentTime(unix_time); + if (s.ok()) { + *unix_time += 
addon_time_; + } + return s; + } }; class DBTest { @@ -316,32 +399,35 @@ class DBTest { kBlockBasedTableWithPrefixHashIndex = 1, kBlockBasedTableWithWholeKeyHashIndex = 2, kPlainTableFirstBytePrefix = 3, - kPlainTableAllBytesPrefix = 4, - kVectorRep = 5, - kHashLinkList = 6, - kHashCuckoo = 7, - kMergePut = 8, - kFilter = 9, - kUncompressed = 10, - kNumLevel_3 = 11, - kDBLogDir = 12, - kWalDir = 13, - kManifestFileSize = 14, - kCompactOnFlush = 15, - kPerfOptions = 16, - kDeletesFilterFirst = 17, - kHashSkipList = 18, - kUniversalCompaction = 19, - kCompressedBlockCache = 20, - kInfiniteMaxOpenFiles = 21, - kxxHashChecksum = 22, - kFIFOCompaction = 23, - kEnd = 24 + kPlainTableCappedPrefix = 4, + kPlainTableAllBytesPrefix = 5, + kVectorRep = 6, + kHashLinkList = 7, + kHashCuckoo = 8, + kMergePut = 9, + kFilter = 10, + kFullFilter = 11, + kUncompressed = 12, + kNumLevel_3 = 13, + kDBLogDir = 14, + kWalDirAndMmapReads = 15, + kManifestFileSize = 16, + kCompactOnFlush = 17, + kPerfOptions = 18, + kDeletesFilterFirst = 19, + kHashSkipList = 20, + kUniversalCompaction = 21, + kCompressedBlockCache = 22, + kInfiniteMaxOpenFiles = 23, + kxxHashChecksum = 24, + kFIFOCompaction = 25, + kEnd = 26 }; int option_config_; public: std::string dbname_; + MockEnv* mem_env_; SpecialEnv* env_; DB* db_; std::vector handles_; @@ -360,15 +446,21 @@ class DBTest { kSkipNoSeekToLast = 32, kSkipHashCuckoo = 64, kSkipFIFOCompaction = 128, + kSkipMmapReads = 256, }; DBTest() : option_config_(kDefault), - env_(new SpecialEnv(Env::Default())) { - dbname_ = test::TmpDir() + "/db_test"; - ASSERT_OK(DestroyDB(dbname_, Options())); + mem_env_(!getenv("MEM_ENV") ? nullptr : + new MockEnv(Env::Default())), + env_(new SpecialEnv(mem_env_ ? 
mem_env_ : Env::Default())) { + env_->SetBackgroundThreads(1, Env::LOW); + env_->SetBackgroundThreads(1, Env::HIGH); + dbname_ = test::TmpDir(env_) + "/db_test"; + auto options = CurrentOptions(); + ASSERT_OK(DestroyDB(dbname_, options)); db_ = nullptr; - Reopen(); + Reopen(options); } ~DBTest() { @@ -402,9 +494,10 @@ class DBTest { option_config_ == kHashSkipList)) {; continue; } - if ((skip_mask & kSkipPlainTable) - && (option_config_ == kPlainTableAllBytesPrefix - || option_config_ == kPlainTableFirstBytePrefix)) { + if ((skip_mask & kSkipPlainTable) && + (option_config_ == kPlainTableAllBytesPrefix || + option_config_ == kPlainTableFirstBytePrefix || + option_config_ == kPlainTableCappedPrefix)) { continue; } if ((skip_mask & kSkipHashIndex) && @@ -419,33 +512,56 @@ class DBTest { option_config_ == kFIFOCompaction) { continue; } + if ((skip_mask & kSkipMmapReads) && + option_config_ == kWalDirAndMmapReads) { + continue; + } break; } if (option_config_ >= kEnd) { - Destroy(&last_options_); + Destroy(last_options_); return false; } else { - DestroyAndReopen(); + auto options = CurrentOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); return true; } } // Switch between different compaction styles (we have only 2 now). 
- bool ChangeCompactOptions(Options* prev_options = nullptr) { + bool ChangeCompactOptions() { if (option_config_ == kDefault) { option_config_ = kUniversalCompaction; - if (prev_options == nullptr) { - prev_options = &last_options_; - } - Destroy(prev_options); - TryReopen(); + Destroy(last_options_); + auto options = CurrentOptions(); + options.create_if_missing = true; + TryReopen(options); return true; } else { return false; } } + // Switch between different filter policy + // Jump from kDefault to kFilter to kFullFilter + bool ChangeFilterOptions() { + if (option_config_ == kDefault) { + option_config_ = kFilter; + } else if (option_config_ == kFilter) { + option_config_ = kFullFilter; + } else { + return false; + } + Destroy(last_options_); + + auto options = CurrentOptions(); + options.create_if_missing = true; + TryReopen(options); + return true; + } + // Return the current option configuration. Options CurrentOptions( const anon::OptionsOverride& options_override = anon::OptionsOverride()) { @@ -458,6 +574,9 @@ class DBTest { const anon::OptionsOverride& options_override = anon::OptionsOverride()) { // this redudant copy is to minimize code change w/o having lint error. 
Options options = defaultOptions; + XFUNC_TEST("", "dbtest_options", inplace_options1, GetXFTestOptions, + reinterpret_cast(&options), + options_override.skip_policy); BlockBasedTableOptions table_options; bool set_block_based_table_factory = true; switch (option_config_) { @@ -473,6 +592,13 @@ class DBTest { options.max_sequential_skip_in_iterations = 999999; set_block_based_table_factory = false; break; + case kPlainTableCappedPrefix: + options.table_factory.reset(new PlainTableFactory()); + options.prefix_extractor.reset(NewCappedPrefixTransform(8)); + options.allow_mmap_reads = true; + options.max_sequential_skip_in_iterations = 999999; + set_block_based_table_factory = false; + break; case kPlainTableAllBytesPrefix: options.table_factory.reset(new PlainTableFactory()); options.prefix_extractor.reset(NewNoopTransform()); @@ -484,7 +610,10 @@ class DBTest { options.merge_operator = MergeOperators::CreatePutOperator(); break; case kFilter: - table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + break; + case kFullFilter: + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); break; case kUncompressed: options.compression = kNoCompression; @@ -493,10 +622,13 @@ class DBTest { options.num_levels = 3; break; case kDBLogDir: - options.db_log_dir = test::TmpDir(); + options.db_log_dir = test::TmpDir(env_); break; - case kWalDir: - options.wal_dir = test::TmpDir() + "/wal"; + case kWalDirAndMmapReads: + options.wal_dir = dbname_ + "/wal"; + // mmap reads should be orthogonal to WalDir setting, so we piggyback to + // this option config to test mmap reads as well + options.allow_mmap_reads = true; break; case kManifestFileSize: options.max_manifest_file_size = 50; // 50 bytes @@ -562,6 +694,8 @@ class DBTest { if (set_block_based_table_factory) { options.table_factory.reset(NewBlockBasedTableFactory(table_options)); } + options.env = env_; + options.create_if_missing = true; 
return options; } @@ -570,14 +704,9 @@ class DBTest { } void CreateColumnFamilies(const std::vector& cfs, - const ColumnFamilyOptions* options = nullptr) { - ColumnFamilyOptions cf_opts; - if (options != nullptr) { - cf_opts = ColumnFamilyOptions(*options); - } else { - cf_opts = ColumnFamilyOptions(CurrentOptions()); - } - int cfi = handles_.size(); + const Options& options) { + ColumnFamilyOptions cf_opts(options); + size_t cfi = handles_.size(); handles_.resize(cfi + cfs.size()); for (auto cf : cfs) { ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++])); @@ -585,7 +714,7 @@ class DBTest { } void CreateAndReopenWithCF(const std::vector& cfs, - const Options* options = nullptr) { + const Options& options) { CreateColumnFamilies(cfs, options); std::vector cfs_plus_default = cfs; cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName); @@ -593,37 +722,36 @@ class DBTest { } void ReopenWithColumnFamilies(const std::vector& cfs, - const std::vector& options) { + const std::vector& options) { ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); } void ReopenWithColumnFamilies(const std::vector& cfs, - const Options* options = nullptr) { + const Options& options) { ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); } Status TryReopenWithColumnFamilies( const std::vector& cfs, - const std::vector& options) { + const std::vector& options) { Close(); ASSERT_EQ(cfs.size(), options.size()); std::vector column_families; for (size_t i = 0; i < cfs.size(); ++i) { - column_families.push_back(ColumnFamilyDescriptor(cfs[i], *options[i])); + column_families.push_back(ColumnFamilyDescriptor(cfs[i], options[i])); } - DBOptions db_opts = DBOptions(*options[0]); + DBOptions db_opts = DBOptions(options[0]); return DB::Open(db_opts, dbname_, column_families, &handles_, &db_); } Status TryReopenWithColumnFamilies(const std::vector& cfs, - const Options* options = nullptr) { + const Options& options) { Close(); - Options opts = (options == nullptr) ? 
CurrentOptions() : *options; - std::vector v_opts(cfs.size(), &opts); + std::vector v_opts(cfs.size(), options); return TryReopenWithColumnFamilies(cfs, v_opts); } - void Reopen(Options* options = nullptr) { + void Reopen(const Options& options) { ASSERT_OK(TryReopen(options)); } @@ -636,32 +764,25 @@ class DBTest { db_ = nullptr; } - void DestroyAndReopen(Options* options = nullptr) { + void DestroyAndReopen(const Options& options) { //Destroy using last options - Destroy(&last_options_); + Destroy(last_options_); ASSERT_OK(TryReopen(options)); } - void Destroy(Options* options) { + void Destroy(const Options& options) { Close(); - ASSERT_OK(DestroyDB(dbname_, *options)); + ASSERT_OK(DestroyDB(dbname_, options)); } - Status ReadOnlyReopen(Options* options) { - return DB::OpenForReadOnly(*options, dbname_, &db_); + Status ReadOnlyReopen(const Options& options) { + return DB::OpenForReadOnly(options, dbname_, &db_); } - Status TryReopen(Options* options = nullptr) { + Status TryReopen(const Options& options) { Close(); - Options opts; - if (options != nullptr) { - opts = *options; - } else { - opts = CurrentOptions(); - opts.create_if_missing = true; - } - last_options_ = opts; - return DB::Open(opts, dbname_, &db_); + last_options_ = options; + return DB::Open(options, dbname_, &db_); } Status Flush(int cf = 0) { @@ -726,6 +847,19 @@ class DBTest { return result; } + uint64_t GetNumSnapshots() { + uint64_t int_num; + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.num-snapshots", &int_num)); + return int_num; + } + + uint64_t GetTimeOldestSnapshots() { + uint64_t int_num; + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.oldest-snapshot-time", &int_num)); + return int_num; + } + // Return a string that contains all key,value pairs in order, // formatted like "(k1->v1)(k2->v2)". 
std::string Contents(int cf = 0) { @@ -755,11 +889,12 @@ class DBTest { } std::string AllEntriesFor(const Slice& user_key, int cf = 0) { - Iterator* iter; + Arena arena; + ScopedArenaIterator iter; if (cf == 0) { - iter = dbfull()->TEST_NewInternalIterator(); + iter.set(dbfull()->TEST_NewInternalIterator(&arena)); } else { - iter = dbfull()->TEST_NewInternalIterator(handles_[cf]); + iter.set(dbfull()->TEST_NewInternalIterator(&arena, handles_[cf])); } InternalKey target(user_key, kMaxSequenceNumber, kTypeValue); iter->Seek(target.Encode()); @@ -804,7 +939,6 @@ class DBTest { } result += "]"; } - delete iter; return result; } @@ -822,6 +956,18 @@ class DBTest { return atoi(property.c_str()); } + uint64_t SizeAtLevel(int level) { + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + uint64_t sum = 0; + for (const auto& m : metadata) { + if (m.level == level) { + sum += m.size; + } + } + return sum; + } + int TotalTableFiles(int cf = 0, int levels = -1) { if (levels == -1) { levels = CurrentOptions().num_levels; @@ -838,7 +984,7 @@ class DBTest { int num_levels = (cf == 0) ? 
db_->NumberLevels() : db_->NumberLevels(handles_[1]); std::string result; - int last_non_zero_offset = 0; + size_t last_non_zero_offset = 0; for (int level = 0; level < num_levels; level++) { int f = NumTableFilesAtLevel(level, cf); char buf[100]; @@ -852,7 +998,7 @@ class DBTest { return result; } - int CountFiles() { + size_t CountFiles() { std::vector files; env_->GetChildren(dbname_, &files); @@ -861,10 +1007,10 @@ class DBTest { env_->GetChildren(last_options_.wal_dir, &logfiles); } - return static_cast(files.size() + logfiles.size()); + return files.size() + logfiles.size(); } - int CountLiveFiles() { + size_t CountLiveFiles() { std::vector metadata; db_->GetLiveFilesMetaData(&metadata); return metadata.size(); @@ -881,6 +1027,12 @@ class DBTest { return size; } + void Compact(int cf, const Slice& start, const Slice& limit, + uint32_t target_path_id) { + ASSERT_OK(db_->CompactRange(handles_[cf], &start, &limit, false, -1, + target_path_id)); + } + void Compact(int cf, const Slice& start, const Slice& limit) { ASSERT_OK(db_->CompactRange(handles_[cf], &start, &limit)); } @@ -1042,11 +1194,12 @@ class DBTest { // Utility method to test InplaceUpdate void validateNumberOfEntries(int numValues, int cf = 0) { - Iterator* iter; + ScopedArenaIterator iter; + Arena arena; if (cf != 0) { - iter = dbfull()->TEST_NewInternalIterator(handles_[cf]); + iter.set(dbfull()->TEST_NewInternalIterator(&arena, handles_[cf])); } else { - iter = dbfull()->TEST_NewInternalIterator(); + iter.set(dbfull()->TEST_NewInternalIterator(&arena)); } iter->SeekToFirst(); ASSERT_EQ(iter->status().ok(), true); @@ -1060,7 +1213,6 @@ class DBTest { ASSERT_EQ(ikey.sequence, (unsigned)seq--); iter->Next(); } - delete iter; ASSERT_EQ(0, seq); } @@ -1115,6 +1267,17 @@ void VerifyTableProperties(DB* db, uint64_t expected_entries_size) { ASSERT_EQ(props.size(), unique_entries.size()); ASSERT_EQ(expected_entries_size, sum); } + +uint64_t GetNumberOfSstFilesForColumnFamily(DB* db, + std::string 
column_family_name) { + std::vector metadata; + db->GetLiveFilesMetaData(&metadata); + uint64_t result = 0; + for (auto& fileMetadata : metadata) { + result += (fileMetadata.column_family_name == column_family_name); + } + return result; +} } // namespace TEST(DBTest, Empty) { @@ -1123,7 +1286,7 @@ TEST(DBTest, Empty) { options.env = env_; options.write_buffer_size = 100000; // Small write buffer options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); std::string num; ASSERT_TRUE(dbfull()->GetProperty( @@ -1136,7 +1299,8 @@ TEST(DBTest, Empty) { handles_[1], "rocksdb.num-entries-active-mem-table", &num)); ASSERT_EQ("1", num); - env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls + // Block sync calls + env_->delay_sstable_sync_.store(true, std::memory_order_release); Put(1, "k1", std::string(100000, 'x')); // Fill memtable ASSERT_TRUE(dbfull()->GetProperty( handles_[1], "rocksdb.num-entries-active-mem-table", &num)); @@ -1148,7 +1312,8 @@ TEST(DBTest, Empty) { ASSERT_EQ("1", num); ASSERT_EQ("v1", Get(1, "foo")); - env_->delay_sstable_sync_.Release_Store(nullptr); // Release sync calls + // Release sync calls + env_->delay_sstable_sync_.store(false, std::memory_order_release); ASSERT_OK(db_->DisableFileDeletions()); ASSERT_TRUE( @@ -1177,14 +1342,36 @@ TEST(DBTest, Empty) { } while (ChangeOptions()); } +TEST(DBTest, WriteEmptyBatch) { + Options options; + options.env = env_; + options.write_buffer_size = 100000; + options = CurrentOptions(options); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "foo", "bar")); + env_->sync_counter_.store(0); + WriteOptions wo; + wo.sync = true; + wo.disableWAL = false; + WriteBatch empty_batch; + ASSERT_OK(dbfull()->Write(wo, &empty_batch)); + ASSERT_GE(env_->sync_counter_.load(), 1); + + // make sure we can re-open it. 
+ ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); + ASSERT_EQ("bar", Get(1, "foo")); +} + TEST(DBTest, ReadOnlyDB) { ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Put("bar", "v2")); ASSERT_OK(Put("foo", "v3")); Close(); - Options options; - ASSERT_OK(ReadOnlyReopen(&options)); + auto options = CurrentOptions(); + assert(options.env = env_); + ASSERT_OK(ReadOnlyReopen(options)); ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v2", Get("bar")); Iterator* iter = db_->NewIterator(ReadOptions()); @@ -1198,15 +1385,108 @@ TEST(DBTest, ReadOnlyDB) { Close(); // Reopen and flush memtable. - Reopen(); + Reopen(options); Flush(); Close(); // Now check keys in read only mode. - ASSERT_OK(ReadOnlyReopen(&options)); + ASSERT_OK(ReadOnlyReopen(options)); ASSERT_EQ("v3", Get("foo")); ASSERT_EQ("v2", Get("bar")); } +TEST(DBTest, CompactedDB) { + const uint64_t kFileSize = 1 << 20; + Options options; + options.disable_auto_compactions = true; + options.max_mem_compaction_level = 0; + options.write_buffer_size = kFileSize; + options.target_file_size_base = kFileSize; + options.max_bytes_for_level_base = 1 << 30; + options.compression = kNoCompression; + options = CurrentOptions(options); + Reopen(options); + // 1 L0 file, use CompactedDB if max_open_files = -1 + ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, '1'))); + Flush(); + Close(); + ASSERT_OK(ReadOnlyReopen(options)); + Status s = Put("new", "value"); + ASSERT_EQ(s.ToString(), + "Not implemented: Not supported operation in read only mode."); + ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa")); + Close(); + options.max_open_files = -1; + ASSERT_OK(ReadOnlyReopen(options)); + s = Put("new", "value"); + ASSERT_EQ(s.ToString(), + "Not implemented: Not supported in compacted db mode."); + ASSERT_EQ(DummyString(kFileSize / 2, '1'), Get("aaa")); + Close(); + Reopen(options); + // Add more L0 files + ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, '2'))); + Flush(); + ASSERT_OK(Put("aaa", DummyString(kFileSize / 2, 
'a'))); + Flush(); + ASSERT_OK(Put("bbb", DummyString(kFileSize / 2, 'b'))); + Flush(); + Close(); + + ASSERT_OK(ReadOnlyReopen(options)); + // Fallback to read-only DB + s = Put("new", "value"); + ASSERT_EQ(s.ToString(), + "Not implemented: Not supported operation in read only mode."); + Close(); + + // Full compaction + Reopen(options); + // Add more keys + ASSERT_OK(Put("eee", DummyString(kFileSize / 2, 'e'))); + ASSERT_OK(Put("fff", DummyString(kFileSize / 2, 'f'))); + ASSERT_OK(Put("hhh", DummyString(kFileSize / 2, 'h'))); + ASSERT_OK(Put("iii", DummyString(kFileSize / 2, 'i'))); + ASSERT_OK(Put("jjj", DummyString(kFileSize / 2, 'j'))); + db_->CompactRange(nullptr, nullptr); + ASSERT_EQ(3, NumTableFilesAtLevel(1)); + Close(); + + // CompactedDB + ASSERT_OK(ReadOnlyReopen(options)); + s = Put("new", "value"); + ASSERT_EQ(s.ToString(), + "Not implemented: Not supported in compacted db mode."); + ASSERT_EQ("NOT_FOUND", Get("abc")); + ASSERT_EQ(DummyString(kFileSize / 2, 'a'), Get("aaa")); + ASSERT_EQ(DummyString(kFileSize / 2, 'b'), Get("bbb")); + ASSERT_EQ("NOT_FOUND", Get("ccc")); + ASSERT_EQ(DummyString(kFileSize / 2, 'e'), Get("eee")); + ASSERT_EQ(DummyString(kFileSize / 2, 'f'), Get("fff")); + ASSERT_EQ("NOT_FOUND", Get("ggg")); + ASSERT_EQ(DummyString(kFileSize / 2, 'h'), Get("hhh")); + ASSERT_EQ(DummyString(kFileSize / 2, 'i'), Get("iii")); + ASSERT_EQ(DummyString(kFileSize / 2, 'j'), Get("jjj")); + ASSERT_EQ("NOT_FOUND", Get("kkk")); + + // MultiGet + std::vector values; + std::vector status_list = dbfull()->MultiGet(ReadOptions(), + std::vector({Slice("aaa"), Slice("ccc"), Slice("eee"), + Slice("ggg"), Slice("iii"), Slice("kkk")}), + &values); + ASSERT_EQ(status_list.size(), static_cast(6)); + ASSERT_EQ(values.size(), static_cast(6)); + ASSERT_OK(status_list[0]); + ASSERT_EQ(DummyString(kFileSize / 2, 'a'), values[0]); + ASSERT_TRUE(status_list[1].IsNotFound()); + ASSERT_OK(status_list[2]); + ASSERT_EQ(DummyString(kFileSize / 2, 'e'), values[2]); + 
ASSERT_TRUE(status_list[3].IsNotFound()); + ASSERT_OK(status_list[4]); + ASSERT_EQ(DummyString(kFileSize / 2, 'i'), values[4]); + ASSERT_TRUE(status_list[5].IsNotFound()); +} + // Make sure that when options.block_cache is set, after a new table is // created its index/filter blocks are added to block cache. TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { @@ -1217,7 +1497,7 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { table_options.cache_index_and_filter_blocks = true; table_options.filter_policy.reset(NewBloomFilterPolicy(20)); options.table_factory.reset(new BlockBasedTableFactory(table_options)); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "key", "val")); // Create a new table. @@ -1262,42 +1542,43 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { TEST(DBTest, GetPropertiesOfAllTablesTest) { Options options = CurrentOptions(); - Reopen(&options); + options.max_background_flushes = 0; + Reopen(options); // Create 4 tables for (int table = 0; table < 4; ++table) { for (int i = 0; i < 10 + table; ++i) { - db_->Put(WriteOptions(), std::to_string(table * 100 + i), "val"); + db_->Put(WriteOptions(), ToString(table * 100 + i), "val"); } db_->Flush(FlushOptions()); } // 1. Read table properties directly from file - Reopen(&options); + Reopen(options); VerifyTableProperties(db_, 10 + 11 + 12 + 13); // 2. Put two tables to table cache and - Reopen(&options); + Reopen(options); // fetch key from 1st and 2nd table, which will internally place that table to // the table cache. for (int i = 0; i < 2; ++i) { - Get(std::to_string(i * 100 + 0)); + Get(ToString(i * 100 + 0)); } VerifyTableProperties(db_, 10 + 11 + 12 + 13); // 3. Put all tables to table cache - Reopen(&options); + Reopen(options); // fetch key from 1st and 2nd table, which will internally place that table to // the table cache. 
for (int i = 0; i < 4; ++i) { - Get(std::to_string(i * 100 + 0)); + Get(ToString(i * 100 + 0)); } VerifyTableProperties(db_, 10 + 11 + 12 + 13); } TEST(DBTest, LevelLimitReopen) { Options options = CurrentOptions(); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); const std::string value(1024 * 1024, ' '); int i = 0; @@ -1307,49 +1588,19 @@ TEST(DBTest, LevelLimitReopen) { options.num_levels = 1; options.max_bytes_for_level_multiplier_additional.resize(1, 1); - Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, &options); + Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_EQ(s.IsInvalidArgument(), true); ASSERT_EQ(s.ToString(), "Invalid argument: db has more levels than options.num_levels"); options.num_levels = 10; options.max_bytes_for_level_multiplier_additional.resize(10, 1); - ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, &options)); -} - -TEST(DBTest, Preallocation) { - const std::string src = dbname_ + "/alloc_test"; - unique_ptr srcfile; - const EnvOptions soptions; - ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions)); - srcfile->SetPreallocationBlockSize(1024 * 1024); - - // No writes should mean no preallocation - size_t block_size, last_allocated_block; - srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); - ASSERT_EQ(last_allocated_block, 0UL); - - // Small write should preallocate one block - srcfile->Append("test"); - srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); - ASSERT_EQ(last_allocated_block, 1UL); - - // Write an entire preallocation block, make sure we increased by two. - std::string buf(block_size, ' '); - srcfile->Append(buf); - srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); - ASSERT_EQ(last_allocated_block, 2UL); - - // Write five more blocks at once, ensure we're where we need to be. 
- buf = std::string(block_size * 5, ' '); - srcfile->Append(buf); - srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); - ASSERT_EQ(last_allocated_block, 7UL); + ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); } TEST(DBTest, PutDeleteGet) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "foo", "v1")); ASSERT_EQ("v1", Get(1, "foo")); ASSERT_OK(Put(1, "foo", "v2")); @@ -1366,23 +1617,25 @@ TEST(DBTest, GetFromImmutableLayer) { options.env = env_; options.write_buffer_size = 100000; // Small write buffer options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "foo", "v1")); ASSERT_EQ("v1", Get(1, "foo")); - env_->delay_sstable_sync_.Release_Store(env_); // Block sync calls + // Block sync calls + env_->delay_sstable_sync_.store(true, std::memory_order_release); Put(1, "k1", std::string(100000, 'x')); // Fill memtable Put(1, "k2", std::string(100000, 'y')); // Trigger flush ASSERT_EQ("v1", Get(1, "foo")); ASSERT_EQ("NOT_FOUND", Get(0, "foo")); - env_->delay_sstable_sync_.Release_Store(nullptr); // Release sync calls + // Release sync calls + env_->delay_sstable_sync_.store(false, std::memory_order_release); } while (ChangeOptions()); } TEST(DBTest, GetFromVersions) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "foo", "v1")); ASSERT_OK(Flush(1)); ASSERT_EQ("v1", Get(1, "foo")); @@ -1391,13 +1644,20 @@ TEST(DBTest, GetFromVersions) { } TEST(DBTest, GetSnapshot) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); // Try with both a short key and a long key for (int i = 0; i < 2; i++) { std::string key = (i == 0) ? 
std::string("foo") : std::string(200, 'x'); ASSERT_OK(Put(1, key, "v1")); const Snapshot* s1 = db_->GetSnapshot(); + if (option_config_ == kHashCuckoo) { + // NOt supported case. + ASSERT_TRUE(s1 == nullptr); + break; + } ASSERT_OK(Put(1, key, "v2")); ASSERT_EQ("v2", Get(1, key)); ASSERT_EQ("v1", Get(1, key, s1)); @@ -1406,13 +1666,69 @@ TEST(DBTest, GetSnapshot) { ASSERT_EQ("v1", Get(1, key, s1)); db_->ReleaseSnapshot(s1); } - // skip as HashCuckooRep does not support snapshot - } while (ChangeOptions(kSkipHashCuckoo)); + } while (ChangeOptions()); +} + +TEST(DBTest, GetSnapshotLink) { + do { + Options options; + const std::string snapshot_name = test::TmpDir(env_) + "/snapshot"; + DB* snapshotDB; + ReadOptions roptions; + std::string result; + Checkpoint* checkpoint; + + options = CurrentOptions(options); + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, options)); + ASSERT_OK(DestroyDB(snapshot_name, options)); + env_->DeleteDir(snapshot_name); + + // Create a database + Status s; + options.create_if_missing = true; + ASSERT_OK(DB::Open(options, dbname_, &db_)); + std::string key = std::string("foo"); + ASSERT_OK(Put(key, "v1")); + // Take a snapshot + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(snapshot_name)); + ASSERT_OK(Put(key, "v2")); + ASSERT_EQ("v2", Get(key)); + ASSERT_OK(Flush()); + ASSERT_EQ("v2", Get(key)); + // Open snapshot and verify contents while DB is running + options.create_if_missing = false; + ASSERT_OK(DB::Open(options, snapshot_name, &snapshotDB)); + ASSERT_OK(snapshotDB->Get(roptions, key, &result)); + ASSERT_EQ("v1", result); + delete snapshotDB; + snapshotDB = nullptr; + delete db_; + db_ = nullptr; + + // Destroy original DB + ASSERT_OK(DestroyDB(dbname_, options)); + + // Open snapshot and verify contents + options.create_if_missing = false; + dbname_ = snapshot_name; + ASSERT_OK(DB::Open(options, dbname_, &db_)); + ASSERT_EQ("v1", Get(key)); + delete db_; + db_ = nullptr; + 
ASSERT_OK(DestroyDB(dbname_, options)); + delete checkpoint; + + // Restore DB name + dbname_ = test::TmpDir(env_) + "/db_test"; + } while (ChangeOptions()); } TEST(DBTest, GetLevel0Ordering) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); // Check that we process level-0 files in correct order. The code // below generates two level-0 files where the earlier one comes // before the later one in the level-0 file list since the earlier @@ -1428,7 +1744,7 @@ TEST(DBTest, GetLevel0Ordering) { TEST(DBTest, GetOrderedByLevels) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "foo", "v1")); Compact(1, "a", "z"); ASSERT_EQ("v1", Get(1, "foo")); @@ -1441,7 +1757,7 @@ TEST(DBTest, GetOrderedByLevels) { TEST(DBTest, GetPicksCorrectFile) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); // Arrange to have multiple files in a non-level-0 level. 
ASSERT_OK(Put(1, "a", "va")); Compact(1, "a", "b"); @@ -1457,7 +1773,10 @@ TEST(DBTest, GetPicksCorrectFile) { TEST(DBTest, GetEncountersEmptyLevel) { do { - CreateAndReopenWithCF({"pikachu"}); + Options options = CurrentOptions(); + options.max_background_flushes = 0; + options.disableDataSync = true; + CreateAndReopenWithCF({"pikachu"}, options); // Arrange for the following to happen: // * sstable A in level 0 // * nothing in level 1 @@ -1505,7 +1824,7 @@ TEST(DBTest, KeyMayExist) { options_override.filter_policy.reset(NewBloomFilterPolicy(20)); Options options = CurrentOptions(options_override); options.statistics = rocksdb::CreateDBStatistics(); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); ASSERT_TRUE(!db_->KeyMayExist(ropts, handles_[1], "a", &value)); @@ -1566,7 +1885,7 @@ TEST(DBTest, NonBlockingIteration) { Options options = CurrentOptions(); options.statistics = rocksdb::CreateDBStatistics(); non_blocking_opts.read_tier = kBlockCacheTier; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // write one kv to the database. ASSERT_OK(Put(1, "a", "b")); @@ -1620,8 +1939,8 @@ TEST(DBTest, NonBlockingIteration) { // This test verifies block cache behaviors, which is not used by plain // table format. 
// Exclude kHashCuckoo as it does not support iteration currently - } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | - kSkipHashCuckoo)); + } while (ChangeOptions(kSkipPlainTable | kSkipNoSeekToLast | kSkipHashCuckoo | + kSkipMmapReads)); } // A delete is skipped for key if KeyMayExist(key) returns False @@ -1632,7 +1951,7 @@ TEST(DBTest, FilterDeletes) { options_override.filter_policy.reset(NewBloomFilterPolicy(20)); Options options = CurrentOptions(options_override); options.filter_deletes = true; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); WriteBatch batch; batch.Delete(handles_[1], "a"); @@ -1663,6 +1982,41 @@ TEST(DBTest, FilterDeletes) { } while (ChangeCompactOptions()); } +TEST(DBTest, GetFilterByPrefixBloom) { + Options options = last_options_; + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyAndReopen(options); + + WriteOptions wo; + ReadOptions ro; + FlushOptions fo; + fo.wait = true; + std::string value; + + ASSERT_OK(dbfull()->Put(wo, "barbarbar", "foo")); + ASSERT_OK(dbfull()->Put(wo, "barbarbar2", "foo2")); + ASSERT_OK(dbfull()->Put(wo, "foofoofoo", "bar")); + + dbfull()->Flush(fo); + + ASSERT_EQ("foo", Get("barbarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("foo2", Get("barbarbar2")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ("NOT_FOUND", Get("barbarbar3")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + + ASSERT_EQ("NOT_FOUND", Get("barfoofoo")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 1); + + ASSERT_EQ("NOT_FOUND", Get("foobarbar")); + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 2); +} 
TEST(DBTest, IterSeekBeforePrev) { ASSERT_OK(Put("a", "b")); @@ -1791,7 +2145,7 @@ TEST(DBTest, IterPrevWithNewerSeq2) { TEST(DBTest, IterEmpty) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); iter->SeekToFirst(); @@ -1809,7 +2163,7 @@ TEST(DBTest, IterEmpty) { TEST(DBTest, IterSingle) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "a", "va")); Iterator* iter = db_->NewIterator(ReadOptions(), handles_[1]); @@ -1850,7 +2204,7 @@ TEST(DBTest, IterSingle) { TEST(DBTest, IterMulti) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "a", "va")); ASSERT_OK(Put(1, "b", "vb")); ASSERT_OK(Put(1, "c", "vc")); @@ -1938,12 +2292,14 @@ TEST(DBTest, IterMulti) { // Check that we can skip over a run of user keys // by using reseek rather than sequential scan TEST(DBTest, IterReseek) { - Options options = CurrentOptions(); + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; + Options options = CurrentOptions(options_override); options.max_sequential_skip_in_iterations = 3; options.create_if_missing = true; options.statistics = rocksdb::CreateDBStatistics(); - DestroyAndReopen(&options); - CreateAndReopenWithCF({"pikachu"}, &options); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); // insert two keys with same userkey and verify that // reseek is not invoked. 
For each of these test cases, @@ -2022,7 +2378,7 @@ TEST(DBTest, IterReseek) { TEST(DBTest, IterSmallAndLargeMix) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "a", "va")); ASSERT_OK(Put(1, "b", std::string(100000, 'b'))); ASSERT_OK(Put(1, "c", "vc")); @@ -2063,7 +2419,7 @@ TEST(DBTest, IterSmallAndLargeMix) { TEST(DBTest, IterMultiWithDelete) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "ka", "va")); ASSERT_OK(Put(1, "kb", "vb")); ASSERT_OK(Put(1, "kc", "vc")); @@ -2088,7 +2444,7 @@ TEST(DBTest, IterMultiWithDelete) { TEST(DBTest, IterPrevMaxSkip) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); for (int i = 0; i < 2; i++) { ASSERT_OK(Put(1, "key1", "v1")); ASSERT_OK(Put(1, "key2", "v2")); @@ -2117,8 +2473,10 @@ TEST(DBTest, IterPrevMaxSkip) { } TEST(DBTest, IterWithSnapshot) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); ASSERT_OK(Put(1, "key1", "val1")); ASSERT_OK(Put(1, "key2", "val2")); ASSERT_OK(Put(1, "key3", "val3")); @@ -2162,11 +2520,11 @@ TEST(DBTest, IterWithSnapshot) { TEST(DBTest, Recover) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "foo", "v1")); ASSERT_OK(Put(1, "baz", "v5")); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("v1", Get(1, "foo")); ASSERT_EQ("v1", Get(1, "foo")); @@ -2174,7 +2532,7 @@ TEST(DBTest, Recover) { ASSERT_OK(Put(1, "bar", "v2")); ASSERT_OK(Put(1, "foo", "v3")); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("v3", Get(1, "foo")); 
ASSERT_OK(Put(1, "foo", "v4")); ASSERT_EQ("v4", Get(1, "foo")); @@ -2190,8 +2548,8 @@ TEST(DBTest, RecoverWithTableHandle) { options.write_buffer_size = 100; options.disable_auto_compactions = true; options = CurrentOptions(options); - DestroyAndReopen(&options); - CreateAndReopenWithCF({"pikachu"}, &options); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "foo", "v1")); ASSERT_OK(Put(1, "bar", "v2")); @@ -2200,7 +2558,7 @@ TEST(DBTest, RecoverWithTableHandle) { ASSERT_OK(Put(1, "bar", "v4")); ASSERT_OK(Flush(1)); ASSERT_OK(Put(1, "big", std::string(100, 'a'))); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); std::vector> files; dbfull()->TEST_GetFilesMetaData(handles_[1], &files); @@ -2238,8 +2596,9 @@ TEST(DBTest, IgnoreRecoveredLog) { Options options = CurrentOptions(); options.create_if_missing = true; options.merge_operator = MergeOperators::CreateUInt64AddOperator(); - options.wal_dir = dbname_ + "/logs"; - DestroyAndReopen(&options); + options.wal_dir = dbname_ + "/wal"; + Destroy(options); + Reopen(options); // fill up the DB std::string one, two; @@ -2259,7 +2618,7 @@ TEST(DBTest, IgnoreRecoveredLog) { } // recover the DB - Reopen(&options); + Reopen(options); ASSERT_EQ(two, Get("foo")); ASSERT_EQ(one, Get("bar")); Close(); @@ -2273,12 +2632,12 @@ TEST(DBTest, IgnoreRecoveredLog) { // this should ignore the log files, recovery should not happen again // if the recovery happens, the same merge operator would be called twice, // leading to incorrect results - Reopen(&options); + Reopen(options); ASSERT_EQ(two, Get("foo")); ASSERT_EQ(one, Get("bar")); Close(); - Destroy(&options); - Reopen(&options); + Destroy(options); + Reopen(options); Close(); // copy the logs from backup back to wal dir @@ -2290,12 +2649,12 @@ TEST(DBTest, IgnoreRecoveredLog) { } // assert that we successfully recovered only from logs, even though we // destroyed the DB 
- Reopen(&options); + Reopen(options); ASSERT_EQ(two, Get("foo")); ASSERT_EQ(one, Get("bar")); // Recovery will fail if DB directory doesn't exist. - Destroy(&options); + Destroy(options); // copy the logs from backup back to wal dir env_->CreateDirIfMissing(options.wal_dir); for (auto& log : logs) { @@ -2305,37 +2664,37 @@ TEST(DBTest, IgnoreRecoveredLog) { env_->DeleteFile(backup_logs + "/" + log); } } - Status s = TryReopen(&options); + Status s = TryReopen(options); ASSERT_TRUE(!s.ok()); } while (ChangeOptions(kSkipHashCuckoo)); } TEST(DBTest, RollLog) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "foo", "v1")); ASSERT_OK(Put(1, "baz", "v5")); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); for (int i = 0; i < 10; i++) { - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); } ASSERT_OK(Put(1, "foo", "v4")); for (int i = 0; i < 10; i++) { - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); } } while (ChangeOptions()); } TEST(DBTest, WAL) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); WriteOptions writeOpt = WriteOptions(); writeOpt.disableWAL = true; ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("v1", Get(1, "foo")); ASSERT_EQ("v1", Get(1, "bar")); @@ -2344,7 +2703,7 @@ TEST(DBTest, WAL) { writeOpt.disableWAL = true; ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2")); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); // Both value's should be 
present. ASSERT_EQ("v2", Get(1, "bar")); ASSERT_EQ("v2", Get(1, "foo")); @@ -2354,7 +2713,7 @@ TEST(DBTest, WAL) { writeOpt.disableWAL = false; ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3")); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); // again both values should be present. ASSERT_EQ("v3", Get(1, "foo")); ASSERT_EQ("v3", Get(1, "bar")); @@ -2365,7 +2724,7 @@ TEST(DBTest, CheckLock) { do { DB* localdb; Options options = CurrentOptions(); - ASSERT_OK(TryReopen(&options)); + ASSERT_OK(TryReopen(options)); // second open should fail ASSERT_TRUE(!(DB::Open(options, dbname_, &localdb)).ok()); @@ -2379,7 +2738,7 @@ TEST(DBTest, FlushMultipleMemtable) { writeOpt.disableWAL = true; options.max_write_buffer_number = 4; options.min_write_buffer_number_to_merge = 3; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); ASSERT_OK(Flush(1)); ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); @@ -2398,7 +2757,7 @@ TEST(DBTest, NumImmutableMemTable) { options.max_write_buffer_number = 4; options.min_write_buffer_number_to_merge = 3; options.write_buffer_size = 1000000; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); std::string big_value(1000000 * 2, 'x'); std::string num; @@ -2505,6 +2864,49 @@ class SleepingBackgroundTask { bool done_with_sleep_; }; +TEST(DBTest, FlushEmptyColumnFamily) { + // Block flush thread and disable compaction thread + env_->SetBackgroundThreads(1, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + SleepingBackgroundTask sleeping_task_high; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_high, + Env::Priority::HIGH); + + Options options = 
CurrentOptions(); + // disable compaction + options.disable_auto_compactions = true; + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.max_write_buffer_number = 2; + options.min_write_buffer_number_to_merge = 1; + CreateAndReopenWithCF({"pikachu"}, options); + + // Compaction can still go through even if no thread can flush the + // mem table. + ASSERT_OK(Flush(0)); + ASSERT_OK(Flush(1)); + + // Insert can go through + ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); + + ASSERT_EQ("v1", Get(0, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); + + sleeping_task_high.WakeUp(); + sleeping_task_high.WaitUntilDone(); + + // Flush can still go through. + ASSERT_OK(Flush(0)); + ASSERT_OK(Flush(1)); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); +} + TEST(DBTest, GetProperty) { // Set sizes to both background thread pool to be 1 and block them. env_->SetBackgroundThreads(1, Env::HIGH); @@ -2527,7 +2929,7 @@ TEST(DBTest, GetProperty) { options.max_write_buffer_number = 10; options.min_write_buffer_number_to_merge = 1; options.write_buffer_size = 1000000; - Reopen(&options); + Reopen(options); std::string big_value(1000000 * 2, 'x'); std::string num; @@ -2601,7 +3003,7 @@ TEST(DBTest, GetProperty) { dbfull()->TEST_WaitForFlushMemTable(); options.max_open_files = 10; - Reopen(&options); + Reopen(options); // After reopening, no table reader is loaded, so no memory for table readers ASSERT_TRUE( dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); @@ -2618,7 +3020,7 @@ TEST(DBTest, GetProperty) { TEST(DBTest, FLUSH) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); WriteOptions writeOpt = WriteOptions(); writeOpt.disableWAL = true; SetPerfLevel(kEnableTime);; @@ -2631,7 +3033,7 @@ TEST(DBTest, FLUSH) { Get(1, "foo"); ASSERT_TRUE((int) perf_context.get_from_output_files_time > 0); - 
ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("v1", Get(1, "foo")); ASSERT_EQ("v1", Get(1, "bar")); @@ -2640,7 +3042,7 @@ TEST(DBTest, FLUSH) { ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v2")); ASSERT_OK(Flush(1)); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("v2", Get(1, "bar")); perf_context.Reset(); ASSERT_EQ("v2", Get(1, "foo")); @@ -2651,7 +3053,7 @@ TEST(DBTest, FLUSH) { ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v3")); ASSERT_OK(Flush(1)); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); // 'foo' should be there because its put // has WAL enabled. ASSERT_EQ("v3", Get(1, "foo")); @@ -2663,13 +3065,13 @@ TEST(DBTest, FLUSH) { TEST(DBTest, RecoveryWithEmptyLog) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "foo", "v1")); ASSERT_OK(Put(1, "foo", "v2")); - ReopenWithColumnFamilies({"default", "pikachu"}); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "foo", "v3")); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("v3", Get(1, "foo")); } while (ChangeOptions()); } @@ -2682,7 +3084,7 @@ TEST(DBTest, RecoverDuringMemtableCompaction) { options.env = env_; options.write_buffer_size = 1000000; options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Trigger a long memtable compaction and reopen the database during it ASSERT_OK(Put(1, "foo", "v1")); // Goes to 1st log file @@ -2690,7 +3092,7 @@ TEST(DBTest, 
RecoverDuringMemtableCompaction) { ASSERT_OK(Put(1, "big2", std::string(1000, 'y'))); // Triggers compaction ASSERT_OK(Put(1, "bar", "v2")); // Goes to new log file - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_EQ("v1", Get(1, "foo")); ASSERT_EQ("v2", Get(1, "bar")); ASSERT_EQ(std::string(10000000, 'x'), Get(1, "big1")); @@ -2698,12 +3100,54 @@ TEST(DBTest, RecoverDuringMemtableCompaction) { } while (ChangeOptions()); } +// false positive TSAN report on shared_ptr -- +// https://groups.google.com/forum/#!topic/thread-sanitizer/vz_s-t226Vg +#ifndef ROCKSDB_TSAN_RUN +TEST(DBTest, FlushSchedule) { + Options options = CurrentOptions(); + options.disable_auto_compactions = true; + options.level0_stop_writes_trigger = 1 << 10; + options.level0_slowdown_writes_trigger = 1 << 10; + options.min_write_buffer_number_to_merge = 1; + options.max_write_buffer_number = 2; + options.write_buffer_size = 100 * 1000; + CreateAndReopenWithCF({"pikachu"}, options); + std::vector threads; + + std::atomic thread_num(0); + // each column family will have 5 thread, each thread generating 2 memtables. 
+ // each column family should end up with 10 table files + for (int i = 0; i < 10; ++i) { + threads.emplace_back([&]() { + int a = thread_num.fetch_add(1); + Random rnd(a); + WriteOptions wo; + // this should fill up 2 memtables + for (int k = 0; k < 5000; ++k) { + ASSERT_OK(db_->Put(wo, handles_[a & 1], RandomString(&rnd, 13), "")); + } + }); + } + + for (auto& t : threads) { + t.join(); + } + + auto default_tables = GetNumberOfSstFilesForColumnFamily(db_, "default"); + auto pikachu_tables = GetNumberOfSstFilesForColumnFamily(db_, "pikachu"); + ASSERT_LE(default_tables, static_cast(10)); + ASSERT_GT(default_tables, static_cast(0)); + ASSERT_LE(pikachu_tables, static_cast(10)); + ASSERT_GT(pikachu_tables, static_cast(0)); +} +#endif // enabled only if not TSAN run + TEST(DBTest, MinorCompactionsHappen) { do { Options options; options.write_buffer_size = 10000; options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); const int N = 500; @@ -2718,7 +3162,7 @@ TEST(DBTest, MinorCompactionsHappen) { ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i))); } - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); for (int i = 0; i < N; i++) { ASSERT_EQ(Key(i) + std::string(1000, 'v'), Get(1, Key(i))); @@ -2731,7 +3175,7 @@ TEST(DBTest, ManifestRollOver) { Options options; options.max_manifest_file_size = 10 ; // 10 bytes options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); { ASSERT_OK(Put(1, "manifest_key1", std::string(1000, '1'))); ASSERT_OK(Put(1, "manifest_key2", std::string(1000, '2'))); @@ -2740,7 +3184,7 @@ TEST(DBTest, ManifestRollOver) { ASSERT_OK(Flush(1)); // This should trigger LogAndApply. 
uint64_t manifest_after_flush = dbfull()->TEST_Current_Manifest_FileNo(); ASSERT_GT(manifest_after_flush, manifest_before_flush); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_GT(dbfull()->TEST_Current_Manifest_FileNo(), manifest_after_flush); // check if a new manifest file got inserted or not. ASSERT_EQ(std::string(1000, '1'), Get(1, "manifest_key1")); @@ -2756,7 +3200,7 @@ TEST(DBTest, IdentityAcrossRestarts) { ASSERT_OK(db_->GetDbIdentity(id1)); Options options = CurrentOptions(); - Reopen(&options); + Reopen(options); std::string id2; ASSERT_OK(db_->GetDbIdentity(id2)); // id1 should match id2 because identity was not regenerated @@ -2764,7 +3208,7 @@ TEST(DBTest, IdentityAcrossRestarts) { std::string idfilename = IdentityFileName(dbname_); ASSERT_OK(env_->DeleteFile(idfilename)); - Reopen(&options); + Reopen(options); std::string id3; ASSERT_OK(db_->GetDbIdentity(id3)); // id1 should NOT match id3 because identity was regenerated @@ -2776,7 +3220,7 @@ TEST(DBTest, RecoverWithLargeLog) { do { { Options options = CurrentOptions(); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "big1", std::string(200000, '1'))); ASSERT_OK(Put(1, "big2", std::string(200000, '2'))); ASSERT_OK(Put(1, "small3", std::string(10, '3'))); @@ -2789,7 +3233,7 @@ TEST(DBTest, RecoverWithLargeLog) { Options options; options.write_buffer_size = 100000; options = CurrentOptions(options); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3); ASSERT_EQ(std::string(200000, '1'), Get(1, "big1")); ASSERT_EQ(std::string(200000, '2'), Get(1, "big2")); @@ -2803,7 +3247,7 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) { Options options; options.write_buffer_size = 100000000; // Large write buffer options = CurrentOptions(options); - 
CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); @@ -2816,7 +3260,7 @@ TEST(DBTest, CompactionsGenerateMultipleFiles) { } // Reopening moves updates to level-0 - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); @@ -2833,7 +3277,7 @@ TEST(DBTest, CompactionTrigger) { options.max_mem_compaction_level = 0; options.level0_file_num_compaction_trigger = 3; options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); @@ -2884,13 +3328,17 @@ Options DeletionTriggerOptions() { } // anonymous namespace TEST(DBTest, CompactionDeletionTrigger) { - Options options = DeletionTriggerOptions(); - options.create_if_missing = true; - for (int tid = 0; tid < 2; ++tid) { uint64_t db_size[2]; + Options options = CurrentOptions(DeletionTriggerOptions()); + + if (tid == 1) { + // second pass with universal compaction + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 1; + } - DestroyAndReopen(&options); + DestroyAndReopen(options); Random rnd(301); const int kTestSize = kCDTKeysPerBuffer * 512; @@ -2912,20 +3360,21 @@ TEST(DBTest, CompactionDeletionTrigger) { // must have much smaller db size. 
ASSERT_GT(db_size[0] / 3, db_size[1]); - - // repeat the test with universal compaction - options.compaction_style = kCompactionStyleUniversal; - options.num_levels = 1; } } TEST(DBTest, CompactionDeletionTriggerReopen) { for (int tid = 0; tid < 2; ++tid) { uint64_t db_size[3]; - Options options = DeletionTriggerOptions(); - options.create_if_missing = true; + Options options = CurrentOptions(DeletionTriggerOptions()); + + if (tid == 1) { + // second pass with universal compaction + options.compaction_style = kCompactionStyleUniversal; + options.num_levels = 1; + } - DestroyAndReopen(&options); + DestroyAndReopen(options); Random rnd(301); // round 1 --- insert key/value pairs. @@ -2943,7 +3392,7 @@ TEST(DBTest, CompactionDeletionTriggerReopen) { // round 2 --- disable auto-compactions and issue deletions. options.create_if_missing = false; options.disable_auto_compactions = true; - Reopen(&options); + Reopen(options); for (int k = 0; k < kTestSize; ++k) { ASSERT_OK(Delete(Key(k))); @@ -2957,7 +3406,7 @@ TEST(DBTest, CompactionDeletionTriggerReopen) { // round 3 --- reopen db with auto_compaction on and see if // deletion compensation still work. options.disable_auto_compactions = false; - Reopen(&options); + Reopen(options); // insert relatively small amount of data to trigger auto compaction. for (int k = 0; k < kTestSize / 10; ++k) { ASSERT_OK(Put(Key(k), values[k])); @@ -2967,10 +3416,6 @@ TEST(DBTest, CompactionDeletionTriggerReopen) { db_size[2] = Size(Key(0), Key(kTestSize - 1)); // this time we're expecting significant drop in size. 
ASSERT_GT(db_size[0] / 3, db_size[2]); - - // repeat the test with universal compaction - options.compaction_style = kCompactionStyleUniversal; - options.num_levels = 1; } } @@ -3003,6 +3448,22 @@ class DeleteFilter : public CompactionFilter { virtual const char* Name() const override { return "DeleteFilter"; } }; +class ConditionalFilter : public CompactionFilter { + public: + explicit ConditionalFilter(const std::string* filtered_value) + : filtered_value_(filtered_value) {} + virtual bool Filter(int level, const Slice& key, const Slice& value, + std::string* new_value, + bool* value_changed) const override { + return value.ToString() == *filtered_value_; + } + + virtual const char* Name() const override { return "ConditionalFilter"; } + + private: + const std::string* filtered_value_; +}; + class ChangeFilter : public CompactionFilter { public: explicit ChangeFilter() {} @@ -3053,6 +3514,25 @@ class DeleteFilterFactory : public CompactionFilterFactory { virtual const char* Name() const override { return "DeleteFilterFactory"; } }; +class ConditionalFilterFactory : public CompactionFilterFactory { + public: + explicit ConditionalFilterFactory(const Slice& filtered_value) + : filtered_value_(filtered_value.ToString()) {} + + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override { + return std::unique_ptr( + new ConditionalFilter(&filtered_value_)); + } + + virtual const char* Name() const override { + return "ConditionalFilterFactory"; + } + + private: + std::string filtered_value_; +}; + class ChangeFilterFactory : public CompactionFilterFactory { public: explicit ChangeFilterFactory() {} @@ -3067,7 +3547,7 @@ class ChangeFilterFactory : public CompactionFilterFactory { // TODO(kailiu) The tests on UniversalCompaction has some issues: // 1. A lot of magic numbers ("11" or "12"). -// 2. Made assumption on the memtable flush conidtions, which may change from +// 2. 
Made assumption on the memtable flush conditions, which may change from // time to time. TEST(DBTest, UniversalCompactionTrigger) { Options options; @@ -3080,7 +3560,7 @@ TEST(DBTest, UniversalCompactionTrigger) { options.compaction_filter_factory.reset(filter); options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); int key_idx = 0; @@ -3143,7 +3623,7 @@ TEST(DBTest, UniversalCompactionTrigger) { } dbfull()->TEST_WaitForCompact(); // Before compaction, we have 4 files at level 0, with size 4, 0.4, 1, 1. - // After comapction, we should have 2 files, with size 4, 2.4. + // After compaction, we should have 2 files, with size 4, 2.4. ASSERT_EQ(NumTableFilesAtLevel(0, 1), 2); for (int i = 1; i < options.num_levels ; i++) { ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); @@ -3171,7 +3651,7 @@ TEST(DBTest, UniversalCompactionTrigger) { } dbfull()->TEST_WaitForCompact(); // Before compaction, we have 4 files at level 0, with size 4, 2.4, 1, 1. - // After comapction, we should have 3 files, with size 4, 2.4, 2. + // After compaction, we should have 3 files, with size 4, 2.4, 2. 
ASSERT_EQ(NumTableFilesAtLevel(0, 1), 3); for (int i = 1; i < options.num_levels ; i++) { ASSERT_EQ(NumTableFilesAtLevel(i, 1), 0); @@ -3212,12 +3692,13 @@ TEST(DBTest, UniversalCompactionSizeAmplification) { options.compaction_style = kCompactionStyleUniversal; options.write_buffer_size = 100<<10; //100KB options.level0_file_num_compaction_trigger = 3; - CreateAndReopenWithCF({"pikachu"}, &options); + options = CurrentOptions(options); + CreateAndReopenWithCF({"pikachu"}, options); // Trigger compaction if size amplification exceeds 110% options.compaction_options_universal.max_size_amplification_percent = 110; options = CurrentOptions(options); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); Random rnd(301); int key_idx = 0; @@ -3254,7 +3735,7 @@ TEST(DBTest, UniversalCompactionOptions) { options.num_levels = 1; options.compaction_options_universal.compression_size_percent = -1; options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); int key_idx = 0; @@ -3288,7 +3769,7 @@ TEST(DBTest, UniversalCompactionStopStyleSimilarSize) { options.compaction_options_universal.size_ratio = 10; options.compaction_options_universal.stop_style = kCompactionStopStyleSimilarSize; options.num_levels=1; - Reopen(&options); + Reopen(options); Random rnd(301); int key_idx = 0; @@ -3376,6 +3857,7 @@ TEST(DBTest, CompressedCache) { Options options; options.write_buffer_size = 64*1024; // small write buffer options.statistics = rocksdb::CreateDBStatistics(); + options = CurrentOptions(options); BlockBasedTableOptions table_options; switch (iter) { @@ -3409,16 +3891,17 @@ TEST(DBTest, CompressedCache) { default: ASSERT_TRUE(false); } - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // default column family doesn't have block cache Options no_block_cache_opts; 
no_block_cache_opts.statistics = options.statistics; + no_block_cache_opts = CurrentOptions(no_block_cache_opts); BlockBasedTableOptions table_options_no_bc; table_options_no_bc.no_block_cache = true; no_block_cache_opts.table_factory.reset( NewBlockBasedTableFactory(table_options_no_bc)); ReopenWithColumnFamilies({"default", "pikachu"}, - {&no_block_cache_opts, &options}); + std::vector({no_block_cache_opts, options})); Random rnd(301); @@ -3472,7 +3955,7 @@ TEST(DBTest, CompressedCache) { } options.create_if_missing = true; - DestroyAndReopen(&options); + DestroyAndReopen(options); } } @@ -3490,7 +3973,7 @@ TEST(DBTest, UniversalCompactionCompressRatio1) { options.num_levels = 1; options.compaction_options_universal.compression_size_percent = 70; options = CurrentOptions(options); - Reopen(&options); + Reopen(options); Random rnd(301); int key_idx = 0; @@ -3555,7 +4038,7 @@ TEST(DBTest, UniversalCompactionCompressRatio2) { options.num_levels = 1; options.compaction_options_universal.compression_size_percent = 95; options = CurrentOptions(options); - Reopen(&options); + Reopen(options); Random rnd(301); int key_idx = 0; @@ -3576,13 +4059,13 @@ TEST(DBTest, UniversalCompactionCompressRatio2) { } TEST(DBTest, FailMoreDbPaths) { - Options options; + Options options = CurrentOptions(); options.db_paths.emplace_back(dbname_, 10000000); options.db_paths.emplace_back(dbname_ + "_2", 1000000); options.db_paths.emplace_back(dbname_ + "_3", 1000000); options.db_paths.emplace_back(dbname_ + "_4", 1000000); options.db_paths.emplace_back(dbname_ + "_5", 1000000); - ASSERT_TRUE(TryReopen(&options).IsNotSupported()); + ASSERT_TRUE(TryReopen(options).IsNotSupported()); } TEST(DBTest, UniversalCompactionSecondPathRatio) { @@ -3602,7 +4085,7 @@ TEST(DBTest, UniversalCompactionSecondPathRatio) { env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); } env_->DeleteDir(options.db_paths[1].path); - Reopen(&options); + Reopen(options); Random rnd(301); int key_idx = 0; @@ 
-3668,7 +4151,7 @@ TEST(DBTest, UniversalCompactionSecondPathRatio) { ASSERT_TRUE(v.size() == 1 || v.size() == 10000); } - Reopen(&options); + Reopen(options); for (int i = 0; i < key_idx; i++) { auto v = Get(Key(i)); @@ -3676,20 +4159,20 @@ TEST(DBTest, UniversalCompactionSecondPathRatio) { ASSERT_TRUE(v.size() == 1 || v.size() == 10000); } - Destroy(&options); + Destroy(options); } -TEST(DBTest, UniversalCompactionFourPaths) { - Options options; - options.db_paths.emplace_back(dbname_, 300 * 1024); - options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024); - options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024); - options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024); - options.compaction_style = kCompactionStyleUniversal; +TEST(DBTest, LevelCompactionThirdPath) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_, 500 * 1024); + options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); + options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); + options.compaction_style = kCompactionStyleLevel; options.write_buffer_size = 100 << 10; // 100KB options.level0_file_num_compaction_trigger = 2; - options.num_levels = 1; - options = CurrentOptions(options); + options.num_levels = 4; + options.max_bytes_for_level_base = 400 * 1024; + // options = CurrentOptions(options); std::vector filenames; env_->GetChildren(options.db_paths[1].path, &filenames); @@ -3698,7 +4181,7 @@ TEST(DBTest, UniversalCompactionFourPaths) { env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); } env_->DeleteDir(options.db_paths[1].path); - Reopen(&options); + Reopen(options); Random rnd(301); int key_idx = 0; @@ -3709,13 +4192,240 @@ TEST(DBTest, UniversalCompactionFourPaths) { GenerateNewFile(&rnd, &key_idx); } - // Another 110KB triggers a compaction to 400K file to second path + // Another 110KB triggers a compaction to 400K file to fill up first path GenerateNewFile(&rnd, &key_idx); - ASSERT_EQ(1, 
GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(3, GetSstFileCount(options.db_paths[1].path)); // (1, 4) GenerateNewFile(&rnd, &key_idx); - ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ("1,4", FilesPerLevel(0)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 1) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,1", FilesPerLevel(0)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 2) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,2", FilesPerLevel(0)); + ASSERT_EQ(2, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 3) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,3", FilesPerLevel(0)); + ASSERT_EQ(3, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,4", FilesPerLevel(0)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 5) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,5", FilesPerLevel(0)); + ASSERT_EQ(5, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 6) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,6", FilesPerLevel(0)); + ASSERT_EQ(6, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 7) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,7", FilesPerLevel(0)); + ASSERT_EQ(7, 
GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + // (1, 4, 8) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,4,8", FilesPerLevel(0)); + ASSERT_EQ(8, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(4, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 10000); + } + + Reopen(options); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 10000); + } + + Destroy(options); +} + +TEST(DBTest, LevelCompactionPathUse) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_, 500 * 1024); + options.db_paths.emplace_back(dbname_ + "_2", 4 * 1024 * 1024); + options.db_paths.emplace_back(dbname_ + "_3", 1024 * 1024 * 1024); + options.compaction_style = kCompactionStyleLevel; + options.write_buffer_size = 100 << 10; // 100KB + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 4; + options.max_bytes_for_level_base = 400 * 1024; + // options = CurrentOptions(options); + + std::vector filenames; + env_->GetChildren(options.db_paths[1].path, &filenames); + // Delete archival files. 
+ for (size_t i = 0; i < filenames.size(); ++i) { + env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); + } + env_->DeleteDir(options.db_paths[1].path); + Reopen(options); + + Random rnd(301); + int key_idx = 0; + + // Always gets compacted into 1 Level1 file, + // 0/1 Level 0 file + for (int num = 0; num < 3; num++) { + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + } + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + 
ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("0,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + key_idx = 0; + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ("1,1", FilesPerLevel(0)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[2].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(dbname_)); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 10000); + } + + Reopen(options); + + for (int i = 0; i < key_idx; i++) { + auto v = Get(Key(i)); + ASSERT_NE(v, "NOT_FOUND"); + ASSERT_TRUE(v.size() == 1 || v.size() == 10000); + } + + Destroy(options); +} + +TEST(DBTest, UniversalCompactionFourPaths) { + Options options; + options.db_paths.emplace_back(dbname_, 300 * 1024); + options.db_paths.emplace_back(dbname_ + "_2", 300 * 1024); + options.db_paths.emplace_back(dbname_ + "_3", 500 * 1024); + options.db_paths.emplace_back(dbname_ + "_4", 1024 * 1024 * 1024); + options.compaction_style = kCompactionStyleUniversal; + options.write_buffer_size = 100 << 10; // 100KB + options.level0_file_num_compaction_trigger = 2; + options.num_levels = 1; + options = CurrentOptions(options); + + std::vector filenames; + env_->GetChildren(options.db_paths[1].path, &filenames); + // Delete archival files. + for (size_t i = 0; i < filenames.size(); ++i) { + env_->DeleteFile(options.db_paths[1].path + "/" + filenames[i]); + } + env_->DeleteDir(options.db_paths[1].path); + Reopen(options); + + Random rnd(301); + int key_idx = 0; + + // First three 110KB files are not going to second path. 
+ // After that, (100K, 200K) + for (int num = 0; num < 3; num++) { + GenerateNewFile(&rnd, &key_idx); + } + + // Another 110KB triggers a compaction to 400K file to second path + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path)); + + // (1, 4) + GenerateNewFile(&rnd, &key_idx); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[2].path)); ASSERT_EQ(1, GetSstFileCount(dbname_)); // (1,1,4) -> (2, 4) @@ -3767,7 +4477,7 @@ TEST(DBTest, UniversalCompactionFourPaths) { ASSERT_TRUE(v.size() == 1 || v.size() == 10000); } - Reopen(&options); + Reopen(options); for (int i = 0; i < key_idx; i++) { auto v = Get(Key(i)); @@ -3775,10 +4485,45 @@ TEST(DBTest, UniversalCompactionFourPaths) { ASSERT_TRUE(v.size() == 1 || v.size() == 10000); } - Destroy(&options); + Destroy(options); } + #endif +void CheckColumnFamilyMeta(const ColumnFamilyMetaData& cf_meta) { + uint64_t cf_size = 0; + uint64_t cf_csize = 0; + size_t file_count = 0; + for (auto level_meta : cf_meta.levels) { + uint64_t level_size = 0; + uint64_t level_csize = 0; + file_count += level_meta.files.size(); + for (auto file_meta : level_meta.files) { + level_size += file_meta.size; + } + ASSERT_EQ(level_meta.size, level_size); + cf_size += level_size; + cf_csize += level_csize; + } + ASSERT_EQ(cf_meta.file_count, file_count); + ASSERT_EQ(cf_meta.size, cf_size); +} + +TEST(DBTest, ColumnFamilyMetaDataTest) { + Options options = CurrentOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); + + Random rnd(301); + int key_index = 0; + ColumnFamilyMetaData cf_meta; + for (int i = 0; i < 100; ++i) { + GenerateNewFile(&rnd, &key_index); + db_->GetColumnFamilyMetaData(&cf_meta); + CheckColumnFamilyMeta(cf_meta); + } +} + TEST(DBTest, ConvertCompactionStyle) { Random rnd(301); int max_key_level_insert = 200; @@ -3794,7 +4539,7 @@ TEST(DBTest, ConvertCompactionStyle) { options.target_file_size_base = 200<<10; // 200KB options.target_file_size_multiplier = 1; options = 
CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); for (int i = 0; i <= max_key_level_insert; i++) { // each value is 10K @@ -3814,7 +4559,7 @@ TEST(DBTest, ConvertCompactionStyle) { options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; options = CurrentOptions(options); - Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, &options); + Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_TRUE(s.IsInvalidArgument()); // Stage 3: compact into a single file and move the file to level 0 @@ -3825,7 +4570,7 @@ TEST(DBTest, ConvertCompactionStyle) { options.max_bytes_for_level_base = INT_MAX; options.max_bytes_for_level_multiplier = 1; options = CurrentOptions(options); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); dbfull()->CompactRange(handles_[1], nullptr, nullptr, true /* reduce level */, 0 /* reduce to level 0 */); @@ -3845,7 +4590,7 @@ TEST(DBTest, ConvertCompactionStyle) { options.write_buffer_size = 100<<10; //100KB options.level0_file_num_compaction_trigger = 3; options = CurrentOptions(options); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); for (int i = max_key_level_insert / 2; i <= max_key_universal_insert; i++) { ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); @@ -3955,11 +4700,11 @@ bool MinLevelToCompress(CompressionType& type, Options& options, int wbits, TEST(DBTest, MinLevelToCompress1) { Options options = CurrentOptions(); - CompressionType type; + CompressionType type = kSnappyCompression; if (!MinLevelToCompress(type, options, -14, -1, 0)) { return; } - Reopen(&options); + Reopen(options); MinLevelHelper(this, options); // do not compress L0 and L1 @@ -3969,17 +4714,17 @@ TEST(DBTest, MinLevelToCompress1) { for (int i = 2; i < options.num_levels; i++) { 
options.compression_per_level[i] = type; } - DestroyAndReopen(&options); + DestroyAndReopen(options); MinLevelHelper(this, options); } TEST(DBTest, MinLevelToCompress2) { Options options = CurrentOptions(); - CompressionType type; + CompressionType type = kSnappyCompression; if (!MinLevelToCompress(type, options, 15, -1, 0)) { return; } - Reopen(&options); + Reopen(options); MinLevelHelper(this, options); // do not compress L0 and L1 @@ -3989,7 +4734,7 @@ TEST(DBTest, MinLevelToCompress2) { for (int i = 2; i < options.num_levels; i++) { options.compression_per_level[i] = type; } - DestroyAndReopen(&options); + DestroyAndReopen(options); MinLevelHelper(this, options); } @@ -3999,7 +4744,7 @@ TEST(DBTest, RepeatedWritesToSameKey) { options.env = env_; options.write_buffer_size = 100000; // Small write buffer options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // We must have at most one file per level except for level-0, // which may have up to kL0_StopWritesTrigger files. 
@@ -4007,7 +4752,8 @@ TEST(DBTest, RepeatedWritesToSameKey) { options.num_levels + options.level0_stop_writes_trigger; Random rnd(301); - std::string value = RandomString(&rnd, 2 * options.write_buffer_size); + std::string value = + RandomString(&rnd, static_cast(2 * options.write_buffer_size)); for (int i = 0; i < 5 * kMaxFiles; i++) { ASSERT_OK(Put(1, "key", value)); ASSERT_LE(TotalTableFiles(1), kMaxFiles); @@ -4023,7 +4769,7 @@ TEST(DBTest, InPlaceUpdate) { options.env = env_; options.write_buffer_size = 100000; options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Update key with values of smaller size int numValues = 10; @@ -4047,7 +4793,7 @@ TEST(DBTest, InPlaceUpdateLargeNewValue) { options.env = env_; options.write_buffer_size = 100000; options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Update key with values of larger size int numValues = 10; @@ -4075,7 +4821,7 @@ TEST(DBTest, InPlaceUpdateCallbackSmallerSize) { options.inplace_callback = rocksdb::DBTest::updateInPlaceSmallerSize; options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Update key with values of smaller size int numValues = 10; @@ -4104,7 +4850,7 @@ TEST(DBTest, InPlaceUpdateCallbackSmallerVarintSize) { options.inplace_callback = rocksdb::DBTest::updateInPlaceSmallerVarintSize; options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Update key with values of smaller varint size int numValues = 265; @@ -4133,7 +4879,7 @@ TEST(DBTest, InPlaceUpdateCallbackLargeNewValue) { options.inplace_callback = rocksdb::DBTest::updateInPlaceLargerSize; options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Update key 
with values of larger size int numValues = 10; @@ -4160,7 +4906,7 @@ TEST(DBTest, InPlaceUpdateCallbackNoAction) { options.inplace_callback = rocksdb::DBTest::updateInPlaceNoAction; options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Callback function requests no actions from db ASSERT_OK(Put(1, "key", DummyString(1, 'a'))); @@ -4176,7 +4922,7 @@ TEST(DBTest, CompactionFilter) { options.max_mem_compaction_level = 0; options.compaction_filter_factory = std::make_shared(); options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Write 100K keys, these are written to a few files in L0. const std::string value(10, 'x'); @@ -4210,22 +4956,25 @@ TEST(DBTest, CompactionFilter) { // TODO: figure out sequence number squashtoo int count = 0; int total = 0; - Iterator* iter = dbfull()->TEST_NewInternalIterator(handles_[1]); - iter->SeekToFirst(); - ASSERT_OK(iter->status()); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - total++; - if (ikey.sequence != 0) { - count++; + Arena arena; + { + ScopedArenaIterator iter( + dbfull()->TEST_NewInternalIterator(&arena, handles_[1])); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + total++; + if (ikey.sequence != 0) { + count++; + } + iter->Next(); } - iter->Next(); } ASSERT_EQ(total, 100000); ASSERT_EQ(count, 1); - delete iter; // overwrite all the 100K keys once again. 
for (int i = 0; i < 100000; i++) { @@ -4252,8 +5001,8 @@ TEST(DBTest, CompactionFilter) { // filter in such a way that it deletes all keys options.compaction_filter_factory = std::make_shared(); options.create_if_missing = true; - DestroyAndReopen(&options); - CreateAndReopenWithCF({"pikachu"}, &options); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); // write all the keys once again. for (int i = 0; i < 100000; i++) { @@ -4279,16 +5028,18 @@ TEST(DBTest, CompactionFilter) { ASSERT_EQ(NumTableFilesAtLevel(0, 1), 0); ASSERT_EQ(NumTableFilesAtLevel(1, 1), 0); - // Scan the entire database to ensure that nothing is left - iter = db_->NewIterator(ReadOptions(), handles_[1]); - iter->SeekToFirst(); - count = 0; - while (iter->Valid()) { - count++; - iter->Next(); + { + // Scan the entire database to ensure that nothing is left + std::unique_ptr iter( + db_->NewIterator(ReadOptions(), handles_[1])); + iter->SeekToFirst(); + count = 0; + while (iter->Valid()) { + count++; + iter->Next(); + } + ASSERT_EQ(count, 0); } - ASSERT_EQ(count, 0); - delete iter; // The sequence number of the remaining record // is not zeroed out even though it is at the @@ -4296,18 +5047,20 @@ TEST(DBTest, CompactionFilter) { // TODO: remove the following or design a different // test count = 0; - iter = dbfull()->TEST_NewInternalIterator(handles_[1]); - iter->SeekToFirst(); - ASSERT_OK(iter->status()); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - ASSERT_NE(ikey.sequence, (unsigned)0); - count++; - iter->Next(); + { + ScopedArenaIterator iter( + dbfull()->TEST_NewInternalIterator(&arena, handles_[1])); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + ASSERT_NE(ikey.sequence, (unsigned)0); + count++; + iter->Next(); + } + ASSERT_EQ(count, 0); } - 
ASSERT_EQ(count, 0); - delete iter; } // Tests the edge case where compaction does not produce any output -- all @@ -4318,21 +5071,22 @@ TEST(DBTest, CompactionFilterDeletesAll) { options.compaction_filter_factory = std::make_shared(); options.disable_auto_compactions = true; options.create_if_missing = true; - DestroyAndReopen(&options); + options = CurrentOptions(options); + DestroyAndReopen(options); // put some data for (int table = 0; table < 4; ++table) { for (int i = 0; i < 10 + table; ++i) { - Put(std::to_string(table * 100 + i), "val"); + Put(ToString(table * 100 + i), "val"); } Flush(); } // this will produce empty file (delete compaction filter) ASSERT_OK(db_->CompactRange(nullptr, nullptr)); - ASSERT_EQ(0, CountLiveFiles()); + ASSERT_EQ(0U, CountLiveFiles()); - Reopen(&options); + Reopen(options); Iterator* itr = db_->NewIterator(ReadOptions()); itr->SeekToFirst(); @@ -4350,7 +5104,7 @@ TEST(DBTest, CompactionFilterWithValueChange) { options.compaction_filter_factory = std::make_shared(); options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Write 100K+1 keys, these are written to a few files // in L0. We do this so that the current snapshot points @@ -4393,6 +5147,75 @@ TEST(DBTest, CompactionFilterWithValueChange) { } while (ChangeCompactOptions()); } +TEST(DBTest, CompactionFilterWithMergeOperator) { + std::string one, two, three, four; + PutFixed64(&one, 1); + PutFixed64(&two, 2); + PutFixed64(&three, 3); + PutFixed64(&four, 4); + + Options options; + options = CurrentOptions(options); + options.create_if_missing = true; + options.merge_operator = MergeOperators::CreateUInt64AddOperator(); + options.num_levels = 3; + options.max_mem_compaction_level = 0; + // Filter out keys with value is 2. 
+ options.compaction_filter_factory = + std::make_shared(two); + DestroyAndReopen(options); + + // In the same compaction, a value type needs to be deleted based on + // compaction filter, and there is a merge type for the key. compaction + // filter result is ignored. + ASSERT_OK(db_->Put(WriteOptions(), "foo", two)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->Merge(WriteOptions(), "foo", one)); + ASSERT_OK(Flush()); + std::string newvalue = Get("foo"); + ASSERT_EQ(newvalue, three); + dbfull()->CompactRange(nullptr, nullptr); + newvalue = Get("foo"); + ASSERT_EQ(newvalue, three); + + // value key can be deleted based on compaction filter, leaving only + // merge keys. + ASSERT_OK(db_->Put(WriteOptions(), "bar", two)); + ASSERT_OK(Flush()); + dbfull()->CompactRange(nullptr, nullptr); + newvalue = Get("bar"); + ASSERT_EQ("NOT_FOUND", newvalue); + ASSERT_OK(db_->Merge(WriteOptions(), "bar", two)); + ASSERT_OK(Flush()); + dbfull()->CompactRange(nullptr, nullptr); + newvalue = Get("bar"); + ASSERT_EQ(two, two); + + // Compaction filter never applies to merge keys. + ASSERT_OK(db_->Put(WriteOptions(), "foobar", one)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->Merge(WriteOptions(), "foobar", two)); + ASSERT_OK(Flush()); + newvalue = Get("foobar"); + ASSERT_EQ(newvalue, three); + dbfull()->CompactRange(nullptr, nullptr); + newvalue = Get("foobar"); + ASSERT_EQ(newvalue, three); + + // In the same compaction, both of value type and merge type keys need to be + // deleted based on compaction filter, and there is a merge type for the key. + // For both keys, compaction filter results are ignored. 
+ ASSERT_OK(db_->Put(WriteOptions(), "barfoo", two)); + ASSERT_OK(Flush()); + ASSERT_OK(db_->Merge(WriteOptions(), "barfoo", two)); + ASSERT_OK(Flush()); + newvalue = Get("barfoo"); + ASSERT_EQ(newvalue, four); + dbfull()->CompactRange(nullptr, nullptr); + newvalue = Get("barfoo"); + ASSERT_EQ(newvalue, four); +} + TEST(DBTest, CompactionFilterContextManual) { KeepFilterFactory* filter = new KeepFilterFactory(); @@ -4401,7 +5224,7 @@ TEST(DBTest, CompactionFilterContextManual) { options.compaction_filter_factory.reset(filter); options.compression = kNoCompression; options.level0_file_num_compaction_trigger = 8; - Reopen(&options); + Reopen(options); int num_keys_per_file = 400; for (int j = 0; j < 3; j++) { // Write several keys. @@ -4427,24 +5250,26 @@ TEST(DBTest, CompactionFilterContextManual) { ASSERT_EQ(NumTableFilesAtLevel(0), 1); // Verify total number of keys is correct after manual compaction. - int count = 0; - int total = 0; - Iterator* iter = dbfull()->TEST_NewInternalIterator(); - iter->SeekToFirst(); - ASSERT_OK(iter->status()); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - total++; - if (ikey.sequence != 0) { - count++; + { + int count = 0; + int total = 0; + Arena arena; + ScopedArenaIterator iter(dbfull()->TEST_NewInternalIterator(&arena)); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + total++; + if (ikey.sequence != 0) { + count++; + } + iter->Next(); } - iter->Next(); + ASSERT_EQ(total, 700); + ASSERT_EQ(count, 1); } - ASSERT_EQ(total, 700); - ASSERT_EQ(count, 1); - delete iter; } class KeepFilterV2 : public CompactionFilterV2 { @@ -4581,7 +5406,7 @@ TEST(DBTest, CompactionFilterV2) { // compaction filter buffer using universal compaction option_config_ = kUniversalCompaction; 
options.compaction_style = (rocksdb::CompactionStyle)1; - Reopen(&options); + Reopen(options); // Write 100K keys, these are written to a few files in L0. const std::string value(10, 'x'); @@ -4601,32 +5426,34 @@ TEST(DBTest, CompactionFilterV2) { // All the files are in the lowest level. int count = 0; int total = 0; - Iterator* iter = dbfull()->TEST_NewInternalIterator(); - iter->SeekToFirst(); - ASSERT_OK(iter->status()); - while (iter->Valid()) { - ParsedInternalKey ikey(Slice(), 0, kTypeValue); - ikey.sequence = -1; - ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); - total++; - if (ikey.sequence != 0) { - count++; + { + Arena arena; + ScopedArenaIterator iter(dbfull()->TEST_NewInternalIterator(&arena)); + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + while (iter->Valid()) { + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + ikey.sequence = -1; + ASSERT_EQ(ParseInternalKey(iter->key(), &ikey), true); + total++; + if (ikey.sequence != 0) { + count++; + } + iter->Next(); } - iter->Next(); } ASSERT_EQ(total, 100000); // 1 snapshot only. Since we are using universal compacton, // the sequence no is cleared for better compression ASSERT_EQ(count, 1); - delete iter; // create a new database with the compaction // filter in such a way that it deletes all keys options.compaction_filter_factory_v2 = std::make_shared(prefix_extractor.get()); options.create_if_missing = true; - DestroyAndReopen(&options); + DestroyAndReopen(options); // write all the keys once again. 
for (int i = 0; i < 100000; i++) { @@ -4643,7 +5470,7 @@ TEST(DBTest, CompactionFilterV2) { ASSERT_EQ(NumTableFilesAtLevel(1), 0); // Scan the entire database to ensure that nothing is left - iter = db_->NewIterator(ReadOptions()); + Iterator* iter = db_->NewIterator(ReadOptions()); iter->SeekToFirst(); count = 0; while (iter->Valid()) { @@ -4668,7 +5495,7 @@ TEST(DBTest, CompactionFilterV2WithValueChange) { option_config_ = kUniversalCompaction; options.compaction_style = (rocksdb::CompactionStyle)1; options = CurrentOptions(options); - Reopen(&options); + Reopen(options); // Write 100K+1 keys, these are written to a few files // in L0. We do this so that the current snapshot points @@ -4709,7 +5536,7 @@ TEST(DBTest, CompactionFilterV2NULLPrefix) { // compaction filter buffer using universal compaction option_config_ = kUniversalCompaction; options.compaction_style = (rocksdb::CompactionStyle)1; - Reopen(&options); + Reopen(options); // Write 100K+1 keys, these are written to a few files // in L0. 
We do this so that the current snapshot points @@ -4743,7 +5570,7 @@ TEST(DBTest, CompactionFilterV2NULLPrefix) { for (int i = 1; i < 100000; i++) { char key[100]; snprintf(key, sizeof(key), "%08d%010d", i, i); - std::string newvalue = Get(key); + newvalue = Get(key); ASSERT_EQ(newvalue.compare(NEW_VALUE), 0); } } @@ -4752,7 +5579,7 @@ TEST(DBTest, SparseMerge) { do { Options options = CurrentOptions(); options.compression = kNoCompression; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); FillLevels("A", "Z", 1); @@ -4809,12 +5636,13 @@ TEST(DBTest, ApproximateSizes) { Options options; options.write_buffer_size = 100000000; // Large write buffer options.compression = kNoCompression; + options.create_if_missing = true; options = CurrentOptions(options); - DestroyAndReopen(); - CreateAndReopenWithCF({"pikachu"}, &options); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_TRUE(Between(Size("", "xyz", 1), 0, 0)); // Write 8MB (80 values, each 100K) @@ -4832,7 +5660,7 @@ TEST(DBTest, ApproximateSizes) { // Check sizes across recovery by reopening a few times for (int run = 0; run < 3; run++) { - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); for (int compact_start = 0; compact_start < N; compact_start += 10) { for (int i = 0; i < N; i += 10) { @@ -4864,7 +5692,7 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { do { Options options = CurrentOptions(); options.compression = kNoCompression; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); std::string big1 = RandomString(&rnd, 100000); @@ -4879,7 +5707,7 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { // Check sizes across 
recovery by reopening a few times for (int run = 0; run < 3; run++) { - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_TRUE(Between(Size("", Key(0), 1), 0, 0)); ASSERT_TRUE(Between(Size("", Key(1), 1), 10000, 11000)); @@ -4901,7 +5729,7 @@ TEST(DBTest, ApproximateSizes_MixOfSmallAndLarge) { TEST(DBTest, IteratorPinsRef) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); Put(1, "foo", "hello"); // Get iterator that will yield the current contents of the DB. @@ -4926,17 +5754,31 @@ TEST(DBTest, IteratorPinsRef) { } TEST(DBTest, Snapshot) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions(options_override)); Put(0, "foo", "0v1"); Put(1, "foo", "1v1"); + const Snapshot* s1 = db_->GetSnapshot(); + ASSERT_EQ(1U, GetNumSnapshots()); + uint64_t time_snap1 = GetTimeOldestSnapshots(); + ASSERT_GT(time_snap1, 0U); Put(0, "foo", "0v2"); Put(1, "foo", "1v2"); + + env_->addon_time_++; + const Snapshot* s2 = db_->GetSnapshot(); + ASSERT_EQ(2U, GetNumSnapshots()); + ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); Put(0, "foo", "0v3"); Put(1, "foo", "1v3"); + const Snapshot* s3 = db_->GetSnapshot(); + ASSERT_EQ(3U, GetNumSnapshots()); + ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); Put(0, "foo", "0v4"); Put(1, "foo", "1v4"); @@ -4950,6 +5792,8 @@ TEST(DBTest, Snapshot) { ASSERT_EQ("1v4", Get(1, "foo")); db_->ReleaseSnapshot(s3); + ASSERT_EQ(2U, GetNumSnapshots()); + ASSERT_EQ(time_snap1, GetTimeOldestSnapshots()); ASSERT_EQ("0v1", Get(0, "foo", s1)); ASSERT_EQ("1v1", Get(1, "foo", s1)); ASSERT_EQ("0v2", Get(0, "foo", s2)); @@ -4962,16 +5806,23 @@ TEST(DBTest, Snapshot) { ASSERT_EQ("1v2", Get(1, "foo", s2)); ASSERT_EQ("0v4", Get(0, "foo")); ASSERT_EQ("1v4", Get(1, "foo")); + ASSERT_EQ(1U, GetNumSnapshots()); + 
ASSERT_LT(time_snap1, GetTimeOldestSnapshots()); db_->ReleaseSnapshot(s2); + ASSERT_EQ(0U, GetNumSnapshots()); ASSERT_EQ("0v4", Get(0, "foo")); ASSERT_EQ("1v4", Get(1, "foo")); } while (ChangeOptions(kSkipHashCuckoo)); } TEST(DBTest, HiddenValuesAreRemoved) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; do { - CreateAndReopenWithCF({"pikachu"}); + Options options = CurrentOptions(options_override); + options.max_background_flushes = 0; + CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); FillLevels("a", "z", 1); @@ -5006,10 +5857,12 @@ TEST(DBTest, HiddenValuesAreRemoved) { } TEST(DBTest, CompactBetweenSnapshots) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; do { - Options options = CurrentOptions(); + Options options = CurrentOptions(options_override); options.disable_auto_compactions = true; - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); FillLevels("a", "z", 1); @@ -5062,7 +5915,9 @@ TEST(DBTest, CompactBetweenSnapshots) { } TEST(DBTest, DeletionMarkers1) { - CreateAndReopenWithCF({"pikachu"}); + Options options = CurrentOptions(); + options.max_background_flushes = 0; + CreateAndReopenWithCF({"pikachu"}, options); Put(1, "foo", "v1"); ASSERT_OK(Flush(1)); const int last = CurrentOptions().max_mem_compaction_level; @@ -5097,7 +5952,9 @@ TEST(DBTest, DeletionMarkers1) { } TEST(DBTest, DeletionMarkers2) { - CreateAndReopenWithCF({"pikachu"}); + Options options = CurrentOptions(); + options.max_background_flushes = 0; + CreateAndReopenWithCF({"pikachu"}, options); Put(1, "foo", "v1"); ASSERT_OK(Flush(1)); const int last = CurrentOptions().max_mem_compaction_level; @@ -5126,7 +5983,9 @@ TEST(DBTest, DeletionMarkers2) { TEST(DBTest, OverlapInLevel0) { do { - CreateAndReopenWithCF({"pikachu"}); + Options options = CurrentOptions(); + options.max_background_flushes = 0; + CreateAndReopenWithCF({"pikachu"}, 
options); int tmp = CurrentOptions().max_mem_compaction_level; ASSERT_EQ(tmp, 2) << "Fix test to match config"; @@ -5169,17 +6028,17 @@ TEST(DBTest, OverlapInLevel0) { TEST(DBTest, L0_CompactionBug_Issue44_a) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "b", "v")); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_OK(Delete(1, "b")); ASSERT_OK(Delete(1, "a")); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_OK(Delete(1, "a")); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "a", "v")); - ReopenWithColumnFamilies({"default", "pikachu"}); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("(a->v)", Contents(1)); env_->SleepForMicroseconds(1000000); // Wait for compaction to finish ASSERT_EQ("(a->v)", Contents(1)); @@ -5188,26 +6047,26 @@ TEST(DBTest, L0_CompactionBug_Issue44_a) { TEST(DBTest, L0_CompactionBug_Issue44_b) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); Put(1, "", ""); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); Delete(1, "e"); Put(1, "", ""); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); Put(1, "c", "cv"); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); Put(1, "", ""); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); Put(1, "", ""); 
env_->SleepForMicroseconds(1000000); // Wait for compaction to finish - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); Put(1, "d", "dv"); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); Put(1, "", ""); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); Delete(1, "d"); Delete(1, "b"); - ReopenWithColumnFamilies({"default", "pikachu"}); + ReopenWithColumnFamilies({"default", "pikachu"}, CurrentOptions()); ASSERT_EQ("(->)(c->cv)", Contents(1)); env_->SleepForMicroseconds(1000000); // Wait for compaction to finish ASSERT_EQ("(->)(c->cv)", Contents(1)); @@ -5231,17 +6090,17 @@ TEST(DBTest, ComparatorCheck) { Options new_options, options; NewComparator cmp; do { - CreateAndReopenWithCF({"pikachu"}); options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); new_options = CurrentOptions(); new_options.comparator = &cmp; // only the non-default column family has non-matching comparator Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, - {&options, &new_options}); + std::vector({options, new_options})); ASSERT_TRUE(!s.ok()); ASSERT_TRUE(s.ToString().find("comparator") != std::string::npos) << s.ToString(); - } while (ChangeCompactOptions(&new_options)); + } while (ChangeCompactOptions()); } TEST(DBTest, CustomComparator) { @@ -5278,8 +6137,8 @@ TEST(DBTest, CustomComparator) { new_options.comparator = &cmp; new_options.write_buffer_size = 1000; // Compact more often new_options = CurrentOptions(new_options); - DestroyAndReopen(&new_options); - CreateAndReopenWithCF({"pikachu"}, &new_options); + DestroyAndReopen(new_options); + CreateAndReopenWithCF({"pikachu"}, new_options); ASSERT_OK(Put(1, "[10]", "ten")); ASSERT_OK(Put(1, "[0x14]", "twenty")); for (int i = 0; i < 2; i++) { @@ -5300,11 +6159,13 @@ TEST(DBTest, CustomComparator) { 
} Compact(1, "[0]", "[1000000]"); } - } while (ChangeCompactOptions(&new_options)); + } while (ChangeCompactOptions()); } TEST(DBTest, ManualCompaction) { - CreateAndReopenWithCF({"pikachu"}); + Options options = CurrentOptions(); + options.max_background_flushes = 0; + CreateAndReopenWithCF({"pikachu"}, options); ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2) << "Need to update this test to match kMaxMemCompactLevel"; @@ -5341,11 +6202,12 @@ TEST(DBTest, ManualCompaction) { ASSERT_EQ("0,0,1", FilesPerLevel(1)); if (iter == 0) { - Options options = CurrentOptions(); + options = CurrentOptions(); + options.max_background_flushes = 0; options.num_levels = 3; options.create_if_missing = true; - DestroyAndReopen(&options); - CreateAndReopenWithCF({"pikachu"}, &options); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); } } @@ -5358,9 +6220,9 @@ TEST(DBTest, ManualCompactionOutputPathId) { options.db_paths.emplace_back(dbname_ + "_2", 1000000000); options.compaction_style = kCompactionStyleUniversal; options.level0_file_num_compaction_trigger = 10; - Destroy(&options); - DestroyAndReopen(&options); - CreateAndReopenWithCF({"pikachu"}, &options); + Destroy(options); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); MakeTables(3, "p", "q", 1); dbfull()->TEST_WaitForCompact(); ASSERT_EQ("3", FilesPerLevel(1)); @@ -5373,7 +6235,7 @@ TEST(DBTest, ManualCompactionOutputPathId) { ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); - ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, &options); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); ASSERT_EQ("1", FilesPerLevel(1)); ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); @@ -5383,7 +6245,7 @@ TEST(DBTest, ManualCompactionOutputPathId) { ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path)); ASSERT_EQ(1, 
GetSstFileCount(options.db_paths[1].path)); - ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, &options); + ReopenWithColumnFamilies({kDefaultColumnFamilyName, "pikachu"}, options); ASSERT_EQ("2", FilesPerLevel(1)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path)); ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); @@ -5399,21 +6261,90 @@ TEST(DBTest, ManualCompactionOutputPathId) { .IsInvalidArgument()); } +TEST(DBTest, ManualLevelCompactionOutputPathId) { + Options options = CurrentOptions(); + options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760); + options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760); + options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760); + options.max_background_flushes = 1; + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2) + << "Need to update this test to match kMaxMemCompactLevel"; + + // iter - 0 with 7 levels + // iter - 1 with 3 levels + for (int iter = 0; iter < 2; ++iter) { + MakeTables(3, "p", "q", 1); + ASSERT_EQ("3", FilesPerLevel(1)); + ASSERT_EQ(3, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // Compaction range falls before files + Compact(1, "", "c"); + ASSERT_EQ("3", FilesPerLevel(1)); + + // Compaction range falls after files + Compact(1, "r", "z"); + ASSERT_EQ("3", FilesPerLevel(1)); + + // Compaction range overlaps files + Compact(1, "p1", "p9", 1); + ASSERT_EQ("0,1", FilesPerLevel(1)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + // Populate a different range + MakeTables(3, "c", "e", 1); + ASSERT_EQ("3,1", FilesPerLevel(1)); + + // Compact just the new range + Compact(1, "b", "f", 1); + ASSERT_EQ("0,2", FilesPerLevel(1)); + ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(0, 
GetSstFileCount(dbname_)); + + // Compact all + MakeTables(1, "a", "z", 1); + ASSERT_EQ("1,2", FilesPerLevel(1)); + ASSERT_EQ(2, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[0].path)); + db_->CompactRange(handles_[1], nullptr, nullptr, false, 1, 1); + ASSERT_EQ("0,1", FilesPerLevel(1)); + ASSERT_EQ(1, GetSstFileCount(options.db_paths[1].path)); + ASSERT_EQ(0, GetSstFileCount(options.db_paths[0].path)); + ASSERT_EQ(0, GetSstFileCount(dbname_)); + + if (iter == 0) { + DestroyAndReopen(options); + options = CurrentOptions(); + options.db_paths.emplace_back(dbname_ + "_2", 2 * 10485760); + options.db_paths.emplace_back(dbname_ + "_3", 100 * 10485760); + options.db_paths.emplace_back(dbname_ + "_4", 120 * 10485760); + options.max_background_flushes = 1; + options.num_levels = 3; + options.create_if_missing = true; + CreateAndReopenWithCF({"pikachu"}, options); + } + } +} + TEST(DBTest, DBOpen_Options) { - std::string dbname = test::TmpDir() + "/db_options_test"; - ASSERT_OK(DestroyDB(dbname, Options())); + Options options = CurrentOptions(); + std::string dbname = test::TmpDir(env_) + "/db_options_test"; + ASSERT_OK(DestroyDB(dbname, options)); // Does not exist, and create_if_missing == false: error DB* db = nullptr; - Options opts; - opts.create_if_missing = false; - Status s = DB::Open(opts, dbname, &db); + options.create_if_missing = false; + Status s = DB::Open(options, dbname, &db); ASSERT_TRUE(strstr(s.ToString().c_str(), "does not exist") != nullptr); ASSERT_TRUE(db == nullptr); // Does not exist, and create_if_missing == true: OK - opts.create_if_missing = true; - s = DB::Open(opts, dbname, &db); + options.create_if_missing = true; + s = DB::Open(options, dbname, &db); ASSERT_OK(s); ASSERT_TRUE(db != nullptr); @@ -5421,16 +6352,16 @@ TEST(DBTest, DBOpen_Options) { db = nullptr; // Does exist, and error_if_exists == true: error - opts.create_if_missing = false; - opts.error_if_exists = true; - s = DB::Open(opts, 
dbname, &db); + options.create_if_missing = false; + options.error_if_exists = true; + s = DB::Open(options, dbname, &db); ASSERT_TRUE(strstr(s.ToString().c_str(), "exists") != nullptr); ASSERT_TRUE(db == nullptr); // Does exist, and error_if_exists == false: OK - opts.create_if_missing = true; - opts.error_if_exists = false; - s = DB::Open(opts, dbname, &db); + options.create_if_missing = true; + options.error_if_exists = false; + s = DB::Open(options, dbname, &db); ASSERT_OK(s); ASSERT_TRUE(db != nullptr); @@ -5439,74 +6370,75 @@ TEST(DBTest, DBOpen_Options) { } TEST(DBTest, DBOpen_Change_NumLevels) { - Options opts; - opts.create_if_missing = true; - DestroyAndReopen(&opts); + Options options = CurrentOptions(); + options.create_if_missing = true; + options.max_background_flushes = 0; + DestroyAndReopen(options); ASSERT_TRUE(db_ != nullptr); - CreateAndReopenWithCF({"pikachu"}, &opts); + CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "a", "123")); ASSERT_OK(Put(1, "b", "234")); db_->CompactRange(handles_[1], nullptr, nullptr); Close(); - opts.create_if_missing = false; - opts.num_levels = 2; - Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, &opts); + options.create_if_missing = false; + options.num_levels = 2; + Status s = TryReopenWithColumnFamilies({"default", "pikachu"}, options); ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr); ASSERT_TRUE(db_ == nullptr); } TEST(DBTest, DestroyDBMetaDatabase) { - std::string dbname = test::TmpDir() + "/db_meta"; + std::string dbname = test::TmpDir(env_) + "/db_meta"; std::string metadbname = MetaDatabaseName(dbname, 0); std::string metametadbname = MetaDatabaseName(metadbname, 0); // Destroy previous versions if they exist. Using the long way. 
- ASSERT_OK(DestroyDB(metametadbname, Options())); - ASSERT_OK(DestroyDB(metadbname, Options())); - ASSERT_OK(DestroyDB(dbname, Options())); + Options options = CurrentOptions(); + ASSERT_OK(DestroyDB(metametadbname, options)); + ASSERT_OK(DestroyDB(metadbname, options)); + ASSERT_OK(DestroyDB(dbname, options)); // Setup databases - Options opts; - opts.create_if_missing = true; DB* db = nullptr; - ASSERT_OK(DB::Open(opts, dbname, &db)); + ASSERT_OK(DB::Open(options, dbname, &db)); delete db; db = nullptr; - ASSERT_OK(DB::Open(opts, metadbname, &db)); + ASSERT_OK(DB::Open(options, metadbname, &db)); delete db; db = nullptr; - ASSERT_OK(DB::Open(opts, metametadbname, &db)); + ASSERT_OK(DB::Open(options, metametadbname, &db)); delete db; db = nullptr; // Delete databases - ASSERT_OK(DestroyDB(dbname, Options())); + ASSERT_OK(DestroyDB(dbname, options)); // Check if deletion worked. - opts.create_if_missing = false; - ASSERT_TRUE(!(DB::Open(opts, dbname, &db)).ok()); - ASSERT_TRUE(!(DB::Open(opts, metadbname, &db)).ok()); - ASSERT_TRUE(!(DB::Open(opts, metametadbname, &db)).ok()); + options.create_if_missing = false; + ASSERT_TRUE(!(DB::Open(options, dbname, &db)).ok()); + ASSERT_TRUE(!(DB::Open(options, metadbname, &db)).ok()); + ASSERT_TRUE(!(DB::Open(options, metametadbname, &db)).ok()); } -// Check that number of files does not grow when we are out of space -TEST(DBTest, NoSpace) { +// Check that number of files does not grow when writes are dropped +TEST(DBTest, DropWrites) { do { Options options = CurrentOptions(); options.env = env_; options.paranoid_checks = false; - Reopen(&options); + Reopen(options); ASSERT_OK(Put("foo", "v1")); ASSERT_EQ("v1", Get("foo")); Compact("a", "z"); - const int num_files = CountFiles(); - env_->no_space_.Release_Store(env_); // Force out-of-space errors + const size_t num_files = CountFiles(); + // Force out-of-space errors + env_->drop_writes_.store(true, std::memory_order_release); env_->sleep_counter_.Reset(); for (int i = 0; i 
< 5; i++) { - for (int level = 0; level < dbfull()->NumberLevels()-1; level++) { + for (int level = 0; level < dbfull()->NumberLevels() - 1; level++) { dbfull()->TEST_CompactRange(level, nullptr, nullptr); } } @@ -5515,7 +6447,7 @@ TEST(DBTest, NoSpace) { ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); ASSERT_EQ("5", property_value); - env_->no_space_.Release_Store(nullptr); + env_->drop_writes_.store(false, std::memory_order_release); ASSERT_LT(CountFiles(), num_files + 3); // Check that compaction attempts slept after errors @@ -5524,39 +6456,53 @@ TEST(DBTest, NoSpace) { } // Check background error counter bumped on flush failures. -TEST(DBTest, NoSpaceFlush) { +TEST(DBTest, DropWritesFlush) { do { Options options = CurrentOptions(); options.env = env_; options.max_background_flushes = 1; - Reopen(&options); + Reopen(options); ASSERT_OK(Put("foo", "v1")); - env_->no_space_.Release_Store(env_); // Force out-of-space errors + // Force out-of-space errors + env_->drop_writes_.store(true, std::memory_order_release); std::string property_value; // Background error count is 0 now. ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); ASSERT_EQ("0", property_value); - dbfull()->TEST_FlushMemTable(false); + dbfull()->TEST_FlushMemTable(true); - // Wait 300 milliseconds or background-errors turned 1 from 0. - int time_to_sleep_limit = 300000; - while (time_to_sleep_limit > 0) { - int to_sleep = (time_to_sleep_limit > 1000) ? 
1000 : time_to_sleep_limit; - time_to_sleep_limit -= to_sleep; - env_->SleepForMicroseconds(to_sleep); + ASSERT_TRUE(db_->GetProperty("rocksdb.background-errors", &property_value)); + ASSERT_EQ("1", property_value); - ASSERT_TRUE( - db_->GetProperty("rocksdb.background-errors", &property_value)); - if (property_value == "1") { - break; - } + env_->drop_writes_.store(false, std::memory_order_release); + } while (ChangeCompactOptions()); +} + +// Check that CompactRange() returns failure if there is not enough space left +// on device +TEST(DBTest, NoSpaceCompactRange) { + do { + Options options = CurrentOptions(); + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + + // generate 5 tables + for (int i = 0; i < 5; ++i) { + ASSERT_OK(Put(Key(i), Key(i) + "v")); + ASSERT_OK(Flush()); } - ASSERT_EQ("1", property_value); - env_->no_space_.Release_Store(nullptr); + // Force out-of-space errors + env_->no_space_.store(true, std::memory_order_release); + + Status s = db_->CompactRange(nullptr, nullptr); + ASSERT_TRUE(s.IsIOError()); + + env_->no_space_.store(false, std::memory_order_release); } while (ChangeCompactOptions()); } @@ -5565,9 +6511,9 @@ TEST(DBTest, NonWritableFileSystem) { Options options = CurrentOptions(); options.write_buffer_size = 1000; options.env = env_; - Reopen(&options); + Reopen(options); ASSERT_OK(Put("foo", "v1")); - env_->non_writable_.Release_Store(env_); // Force errors for new files + env_->non_writeable_rate_.store(100); std::string big(100000, 'x'); int errors = 0; for (int i = 0; i < 20; i++) { @@ -5577,7 +6523,7 @@ TEST(DBTest, NonWritableFileSystem) { } } ASSERT_GT(errors, 0); - env_->non_writable_.Release_Store(nullptr); + env_->non_writeable_rate_.store(0); } while (ChangeCompactOptions()); } @@ -5591,7 +6537,7 @@ TEST(DBTest, ManifestWriteError) { // We iterate twice. In the second iteration, everything is the // same except the log record never makes it to the MANIFEST file. 
for (int iter = 0; iter < 2; iter++) { - port::AtomicPointer* error_type = (iter == 0) + std::atomic* error_type = (iter == 0) ? &env_->manifest_sync_error_ : &env_->manifest_write_error_; @@ -5600,7 +6546,8 @@ TEST(DBTest, ManifestWriteError) { options.env = env_; options.create_if_missing = true; options.error_if_exists = false; - DestroyAndReopen(&options); + options.max_background_flushes = 0; + DestroyAndReopen(options); ASSERT_OK(Put("foo", "bar")); ASSERT_EQ("bar", Get("foo")); @@ -5611,13 +6558,13 @@ TEST(DBTest, ManifestWriteError) { ASSERT_EQ(NumTableFilesAtLevel(last), 1); // foo=>bar is now in last level // Merging compaction (will fail) - error_type->Release_Store(env_); + error_type->store(true, std::memory_order_release); dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail ASSERT_EQ("bar", Get("foo")); // Recovery: should not lose data - error_type->Release_Store(nullptr); - Reopen(&options); + error_type->store(false, std::memory_order_release); + Reopen(options); ASSERT_EQ("bar", Get("foo")); } } @@ -5633,17 +6580,17 @@ TEST(DBTest, PutFailsParanoid) { options.create_if_missing = true; options.error_if_exists = false; options.paranoid_checks = true; - DestroyAndReopen(&options); - CreateAndReopenWithCF({"pikachu"}, &options); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); Status s; ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo1", "bar1")); // simulate error - env_->log_write_error_.Release_Store(env_); + env_->log_write_error_.store(true, std::memory_order_release); s = Put(1, "foo2", "bar2"); ASSERT_TRUE(!s.ok()); - env_->log_write_error_.Release_Store(nullptr); + env_->log_write_error_.store(false, std::memory_order_release); s = Put(1, "foo3", "bar3"); // the next put should fail, too ASSERT_TRUE(!s.ok()); @@ -5652,16 +6599,16 @@ TEST(DBTest, PutFailsParanoid) { // do the same thing with paranoid checks off options.paranoid_checks = false; - DestroyAndReopen(&options); - 
CreateAndReopenWithCF({"pikachu"}, &options); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "foo", "bar")); ASSERT_OK(Put(1, "foo1", "bar1")); // simulate error - env_->log_write_error_.Release_Store(env_); + env_->log_write_error_.store(true, std::memory_order_release); s = Put(1, "foo2", "bar2"); ASSERT_TRUE(!s.ok()); - env_->log_write_error_.Release_Store(nullptr); + env_->log_write_error_.store(false, std::memory_order_release); s = Put(1, "foo3", "bar3"); // the next put should NOT fail ASSERT_TRUE(s.ok()); @@ -5669,10 +6616,10 @@ TEST(DBTest, PutFailsParanoid) { TEST(DBTest, FilesDeletedAfterCompaction) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "foo", "v2")); Compact(1, "a", "z"); - const int num_files = CountLiveFiles(); + const size_t num_files = CountLiveFiles(); for (int i = 0; i < 10; i++) { ASSERT_OK(Put(1, "foo", "v2")); Compact(1, "a", "z"); @@ -5693,7 +6640,7 @@ TEST(DBTest, BloomFilter) { table_options.filter_policy.reset(NewBloomFilterPolicy(10)); options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); // Populate multiple layers const int N = 10000; @@ -5707,7 +6654,7 @@ TEST(DBTest, BloomFilter) { Flush(1); // Prevent auto compactions triggered by seeks - env_->delay_sstable_sync_.Release_Store(env_); + env_->delay_sstable_sync_.store(true, std::memory_order_release); // Lookup present keys. Should rarely read from small sstable. 
env_->random_read_counter_.Reset(); @@ -5728,16 +6675,176 @@ TEST(DBTest, BloomFilter) { fprintf(stderr, "%d missing => %d reads\n", N, reads); ASSERT_LE(reads, 3*N/100); - env_->delay_sstable_sync_.Release_Store(nullptr); + env_->delay_sstable_sync_.store(false, std::memory_order_release); Close(); } while (ChangeCompactOptions()); } +TEST(DBTest, BloomFilterRate) { + while (ChangeFilterOptions()) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + CreateAndReopenWithCF({"pikachu"}, options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + // Add a large key to make the file contain wide range + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + Flush(1); + + // Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + + // Check if filter is useful + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i+33333))); + } + ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey*0.98); + } +} + +TEST(DBTest, BloomFilterCompatibility) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // Create with block based filter + CreateAndReopenWithCF({"pikachu"}, options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + Flush(1); + + // Check db with full filter + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + // 
Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); +} + +TEST(DBTest, BloomFilterReverseCompatibility) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + // Create with full filter + CreateAndReopenWithCF({"pikachu"}, options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + Flush(1); + + // Check db with block_based filter + table_options.filter_policy.reset(NewBloomFilterPolicy(10, true)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + ReopenWithColumnFamilies({"default", "pikachu"}, options); + + // Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); +} + +namespace { +// A wrapped bloom over default FilterPolicy +class WrappedBloom : public FilterPolicy { + public: + explicit WrappedBloom(int bits_per_key) : + filter_(NewBloomFilterPolicy(bits_per_key)), + counter_(0) {} + + ~WrappedBloom() { delete filter_; } + + const char* Name() const override { return "WrappedRocksDbFilterPolicy"; } + + void CreateFilter(const rocksdb::Slice* keys, int n, std::string* dst) + const override { + std::unique_ptr user_keys(new rocksdb::Slice[n]); + for (int i = 0; i < n; ++i) { + user_keys[i] = convertKey(keys[i]); + } + return filter_->CreateFilter(user_keys.get(), n, dst); + } + + bool KeyMayMatch(const rocksdb::Slice& key, const rocksdb::Slice& filter) + const override { + counter_++; + return filter_->KeyMayMatch(convertKey(key), filter); + } + + uint32_t 
GetCounter() { return counter_; } + + private: + const FilterPolicy* filter_; + mutable uint32_t counter_; + + rocksdb::Slice convertKey(const rocksdb::Slice& key) const { + return key; + } +}; +} // namespace + +TEST(DBTest, BloomFilterWrapper) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + + BlockBasedTableOptions table_options; + WrappedBloom* policy = new WrappedBloom(10); + table_options.filter_policy.reset(policy); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + CreateAndReopenWithCF({"pikachu"}, options); + + const int maxKey = 10000; + for (int i = 0; i < maxKey; i++) { + ASSERT_OK(Put(1, Key(i), Key(i))); + } + // Add a large key to make the file contain wide range + ASSERT_OK(Put(1, Key(maxKey + 55555), Key(maxKey + 55555))); + ASSERT_EQ(0U, policy->GetCounter()); + Flush(1); + + // Check if they can be found + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ(Key(i), Get(1, Key(i))); + } + ASSERT_EQ(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + ASSERT_EQ(1U * maxKey, policy->GetCounter()); + + // Check if filter is useful + for (int i = 0; i < maxKey; i++) { + ASSERT_EQ("NOT_FOUND", Get(1, Key(i+33333))); + } + ASSERT_GE(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), maxKey*0.98); + ASSERT_EQ(2U * maxKey, policy->GetCounter()); +} + TEST(DBTest, SnapshotFiles) { do { Options options = CurrentOptions(); options.write_buffer_size = 100000000; // Large write buffer - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); Random rnd(301); @@ -5767,8 +6874,7 @@ TEST(DBTest, SnapshotFiles) { // copy these files to a new snapshot directory std::string snapdir = dbname_ + ".snapdir/"; - std::string mkdir = "mkdir -p " + snapdir; - ASSERT_EQ(system(mkdir.c_str()), 0); + ASSERT_OK(env_->CreateDirIfMissing(snapdir)); for (unsigned int i = 0; i < files.size(); i++) { // our clients require that GetLiveFiles returns @@ -5796,7 +6902,6 @@ 
TEST(DBTest, SnapshotFiles) { // release file snapshot dbfull()->DisableFileDeletions(); - // overwrite one key, this key should not appear in the snapshot std::vector extras; for (unsigned int i = 0; i < 1; i++) { @@ -5811,6 +6916,7 @@ TEST(DBTest, SnapshotFiles) { std::vector cf_handles; DB* snapdb; DBOptions opts; + opts.env = env_; opts.create_if_missing = false; Status stat = DB::Open(opts, snapdir, column_families, &cf_handles, &snapdb); @@ -5863,11 +6969,13 @@ TEST(DBTest, SnapshotFiles) { } TEST(DBTest, CompactOnFlush) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; do { - Options options = CurrentOptions(); + Options options = CurrentOptions(options_override); options.purge_redundant_kvs_while_flush = true; options.disable_auto_compactions = true; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); Put(1, "foo", "v1"); ASSERT_OK(Flush(1)); @@ -5952,22 +7060,18 @@ namespace { std::vector ListSpecificFiles( Env* env, const std::string& path, const FileType expected_file_type) { std::vector files; - std::vector log_files; + std::vector file_numbers; env->GetChildren(path, &files); uint64_t number; FileType type; for (size_t i = 0; i < files.size(); ++i) { if (ParseFileName(files[i], &number, &type)) { if (type == expected_file_type) { - log_files.push_back(number); + file_numbers.push_back(number); } } } - return std::move(log_files); -} - -std::vector ListLogFiles(Env* env, const std::string& path) { - return ListSpecificFiles(env, path, kLogFile); + return std::move(file_numbers); } std::vector ListTableFiles(Env* env, const std::string& path) { @@ -5976,10 +7080,10 @@ std::vector ListTableFiles(Env* env, const std::string& path) { } // namespace TEST(DBTest, FlushOneColumnFamily) { - Options options; + Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu", "ilya", "muromec", "dobrynia", "nikitich", "alyosha", "popovich"}, - &options); + options); 
ASSERT_OK(Put(0, "Default", "Default")); ASSERT_OK(Put(1, "pikachu", "pikachu")); @@ -5990,119 +7094,215 @@ TEST(DBTest, FlushOneColumnFamily) { ASSERT_OK(Put(6, "alyosha", "alyosha")); ASSERT_OK(Put(7, "popovich", "popovich")); - for (size_t i = 0; i < 8; ++i) { + for (int i = 0; i < 8; ++i) { Flush(i); auto tables = ListTableFiles(env_, dbname_); ASSERT_EQ(tables.size(), i + 1U); } } -TEST(DBTest, WALArchivalTtl) { - do { - Options options = CurrentOptions(); - options.create_if_missing = true; - options.WAL_ttl_seconds = 1000; - DestroyAndReopen(&options); - - // TEST : Create DB with a ttl and no size limit. - // Put some keys. Count the log files present in the DB just after insert. - // Re-open db. Causes deletion/archival to take place. - // Assert that the files moved under "/archive". - // Reopen db with small ttl. - // Assert that archive was removed. - - std::string archiveDir = ArchivalDirectory(dbname_); - - for (int i = 0; i < 10; ++i) { - for (int j = 0; j < 10; ++j) { - ASSERT_OK(Put(Key(10 * i + j), DummyString(1024))); - } - - std::vector log_files = ListLogFiles(env_, dbname_); - - options.create_if_missing = false; - Reopen(&options); - - std::vector logs = ListLogFiles(env_, archiveDir); - std::set archivedFiles(logs.begin(), logs.end()); - - for (auto& log : log_files) { - ASSERT_TRUE(archivedFiles.find(log) != archivedFiles.end()); - } - } - - std::vector log_files = ListLogFiles(env_, archiveDir); - ASSERT_TRUE(log_files.size() > 0); +// In https://reviews.facebook.net/D20661 we change +// recovery behavior: previously for each log file each column family +// memtable was flushed, even it was empty. 
Now it's changed: +// we try to create the smallest number of table files by merging +// updates from multiple logs +TEST(DBTest, RecoverCheckFileAmountWithSmallWriteBuffer) { + Options options = CurrentOptions(); + options.write_buffer_size = 5000000; + CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); + + // Since we will reopen DB with smaller write_buffer_size, + // each key will go to new SST file + ASSERT_OK(Put(1, Key(10), DummyString(1000000))); + ASSERT_OK(Put(1, Key(10), DummyString(1000000))); + ASSERT_OK(Put(1, Key(10), DummyString(1000000))); + ASSERT_OK(Put(1, Key(10), DummyString(1000000))); + + ASSERT_OK(Put(3, Key(10), DummyString(1))); + // Make 'dobrynia' to be flushed and new WAL file to be created + ASSERT_OK(Put(2, Key(10), DummyString(7500000))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + { + auto tables = ListTableFiles(env_, dbname_); + ASSERT_EQ(tables.size(), static_cast(1)); + // Make sure 'dobrynia' was flushed: check sst files amount + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(1)); + } + // New WAL file + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(3, Key(10), DummyString(1))); + ASSERT_OK(Put(3, Key(10), DummyString(1))); + ASSERT_OK(Put(3, Key(10), DummyString(1))); + + options.write_buffer_size = 10; + ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, + options); + { + // No inserts => default is empty + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(0)); + // First 4 keys goes to separate SSTs + 1 more SST for 2 smaller keys + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(5)); + // 1 SST for big key + 1 SST for small one + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(2)); + // 1 SST for all keys + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, 
"nikitich"), + static_cast(1)); + } +} + +// In https://reviews.facebook.net/D20661 we change +// recovery behavior: previously for each log file each column family +// memtable was flushed, even it wasn't empty. Now it's changed: +// we try to create the smallest number of table files by merging +// updates from multiple logs +TEST(DBTest, RecoverCheckFileAmount) { + Options options = CurrentOptions(); + options.write_buffer_size = 100000; + CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); - options.WAL_ttl_seconds = 1; - env_->SleepForMicroseconds(2 * 1000 * 1000); - Reopen(&options); + ASSERT_OK(Put(0, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); - log_files = ListLogFiles(env_, archiveDir); - ASSERT_TRUE(log_files.empty()); - } while (ChangeCompactOptions()); -} + // Make 'nikitich' memtable to be flushed + ASSERT_OK(Put(3, Key(10), DummyString(1002400))); + ASSERT_OK(Put(3, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + // 4 memtable are not flushed, 1 sst file + { + auto tables = ListTableFiles(env_, dbname_); + ASSERT_EQ(tables.size(), static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(1)); + } + // Memtable for 'nikitich' has flushed, new WAL file has opened + // 4 memtable still not flushed + + // Write to new WAL file + ASSERT_OK(Put(0, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + + // Fill up 'nikitich' one more time + ASSERT_OK(Put(3, Key(10), DummyString(1002400))); + // make it flush + ASSERT_OK(Put(3, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + // There are still 4 memtable not flushed, and 2 sst tables + ASSERT_OK(Put(0, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); -namespace { -uint64_t 
GetLogDirSize(std::string dir_path, SpecialEnv* env) { - uint64_t dir_size = 0; - std::vector files; - env->GetChildren(dir_path, &files); - for (auto& f : files) { - uint64_t number; - FileType type; - if (ParseFileName(f, &number, &type) && type == kLogFile) { - std::string const file_path = dir_path + "/" + f; - uint64_t file_size; - env->GetFileSize(file_path, &file_size); - dir_size += file_size; - } + { + auto tables = ListTableFiles(env_, dbname_); + ASSERT_EQ(tables.size(), static_cast(2)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(2)); } - return dir_size; -} -} // namespace - -TEST(DBTest, WALArchivalSizeLimit) { - do { - Options options = CurrentOptions(); - options.create_if_missing = true; - options.WAL_ttl_seconds = 0; - options.WAL_size_limit_MB = 1000; - - // TEST : Create DB with huge size limit and no ttl. - // Put some keys. Count the archived log files present in the DB - // just after insert. Assert that there are many enough. - // Change size limit. Re-open db. - // Assert that archive is not greater than WAL_size_limit_MB. - // Set ttl and time_to_check_ to small values. Re-open db. - // Assert that there are no archived logs left. 
- - DestroyAndReopen(&options); - for (int i = 0; i < 128 * 128; ++i) { - ASSERT_OK(Put(Key(i), DummyString(1024))); - } - Reopen(&options); - - std::string archive_dir = ArchivalDirectory(dbname_); - std::vector log_files = ListLogFiles(env_, archive_dir); - ASSERT_TRUE(log_files.size() > 2); - - options.WAL_size_limit_MB = 8; - Reopen(&options); - dbfull()->TEST_PurgeObsoleteteWAL(); - - uint64_t archive_size = GetLogDirSize(archive_dir, env_); - ASSERT_TRUE(archive_size <= options.WAL_size_limit_MB * 1024 * 1024); - options.WAL_ttl_seconds = 1; - dbfull()->TEST_SetDefaultTimeToCheck(1); - env_->SleepForMicroseconds(2 * 1000 * 1000); - Reopen(&options); - dbfull()->TEST_PurgeObsoleteteWAL(); - - log_files = ListLogFiles(env_, archive_dir); - ASSERT_TRUE(log_files.empty()); - } while (ChangeCompactOptions()); + ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, + options); + { + std::vector table_files = ListTableFiles(env_, dbname_); + // Check, that records for 'default', 'dobrynia' and 'pikachu' from + // first, second and third WALs went to the same SST. 
+ // So, there is 6 SSTs: three for 'nikitich', one for 'default', one for + // 'dobrynia', one for 'pikachu' + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(3)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(1)); + } +} + +TEST(DBTest, SharedWriteBuffer) { + Options options = CurrentOptions(); + options.db_write_buffer_size = 100000; // this is the real limit + options.write_buffer_size = 500000; // this is never hit + CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, options); + + // Trigger a flush on every CF + ASSERT_OK(Put(0, Key(1), DummyString(1))); + ASSERT_OK(Put(1, Key(1), DummyString(1))); + ASSERT_OK(Put(3, Key(1), DummyString(90000))); + ASSERT_OK(Put(2, Key(2), DummyString(20000))); + ASSERT_OK(Put(2, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[0]); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(1)); + } + + // Flush 'dobrynia' and 'nikitich' + ASSERT_OK(Put(2, Key(2), DummyString(50000))); + ASSERT_OK(Put(3, Key(2), DummyString(40000))); + ASSERT_OK(Put(2, Key(3), DummyString(20000))); + ASSERT_OK(Put(3, Key(2), DummyString(40000))); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), 
+ static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(2)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(2)); + } + + // Make 'dobrynia' and 'nikitich' both take up 40% of space + // When 'pikachu' puts us over 100%, all 3 flush. + ASSERT_OK(Put(2, Key(2), DummyString(40000))); + ASSERT_OK(Put(1, Key(2), DummyString(20000))); + ASSERT_OK(Put(0, Key(1), DummyString(1))); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + dbfull()->TEST_WaitForFlushMemTable(handles_[3]); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(1)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(2)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(3)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(3)); + } + + // Some remaining writes so 'default' and 'nikitich' flush on closure. + ASSERT_OK(Put(3, Key(1), DummyString(1))); + ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"}, + options); + { + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), + static_cast(2)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), + static_cast(2)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), + static_cast(3)); + ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), + static_cast(4)); + } } TEST(DBTest, PurgeInfoLogs) { @@ -6117,7 +7317,7 @@ TEST(DBTest, PurgeInfoLogs) { options.db_log_dir = ""; } for (int i = 0; i < 8; i++) { - Reopen(&options); + Reopen(options); } std::vector files; @@ -6131,8 +7331,8 @@ TEST(DBTest, PurgeInfoLogs) { } ASSERT_EQ(5, info_log_count); - Destroy(&options); - // For mode (1), test DestoryDB() to delete all the logs under DB dir. + Destroy(options); + // For mode (1), test DestroyDB() to delete all the logs under DB dir. 
// For mode (2), no info log file should have been put under DB dir. std::vector db_files; env_->GetChildren(dbname_, &db_files); @@ -6181,8 +7381,8 @@ void ExpectRecords( TEST(DBTest, TransactionLogIterator) { do { Options options = OptionsForLogIterTest(); - DestroyAndReopen(&options); - CreateAndReopenWithCF({"pikachu"}, &options); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); Put(0, "key1", DummyString(1024)); Put(1, "key2", DummyString(1024)); Put(1, "key2", DummyString(1024)); @@ -6191,7 +7391,7 @@ TEST(DBTest, TransactionLogIterator) { auto iter = OpenTransactionLogIter(0); ExpectRecords(3, iter); } - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); env_->SleepForMicroseconds(2 * 1000 * 1000); { Put(0, "key4", DummyString(1024)); @@ -6208,11 +7408,13 @@ TEST(DBTest, TransactionLogIterator) { #ifndef NDEBUG // sync point is not included with DNDEBUG build TEST(DBTest, TransactionLogIteratorRace) { static const int LOG_ITERATOR_RACE_TEST_COUNT = 2; - static const char* sync_points[LOG_ITERATOR_RACE_TEST_COUNT][4] = - { { "DBImpl::GetSortedWalFiles:1", "DBImpl::PurgeObsoleteFiles:1", - "DBImpl::PurgeObsoleteFiles:2", "DBImpl::GetSortedWalFiles:2" }, - { "DBImpl::GetSortedWalsOfType:1", "DBImpl::PurgeObsoleteFiles:1", - "DBImpl::PurgeObsoleteFiles:2", "DBImpl::GetSortedWalsOfType:2" }}; + static const char* sync_points[LOG_ITERATOR_RACE_TEST_COUNT][4] = { + {"WalManager::GetSortedWalFiles:1", "WalManager::PurgeObsoleteFiles:1", + "WalManager::PurgeObsoleteFiles:2", "WalManager::GetSortedWalFiles:2"}, + {"WalManager::GetSortedWalsOfType:1", + "WalManager::PurgeObsoleteFiles:1", + "WalManager::PurgeObsoleteFiles:2", + "WalManager::GetSortedWalsOfType:2"}}; for (int test = 0; test < LOG_ITERATOR_RACE_TEST_COUNT; ++test) { // Setup sync point dependency to reproduce the race condition of // a log file moved to archived dir, in the middle of GetSortedWalFiles @@ 
-6225,7 +7427,7 @@ TEST(DBTest, TransactionLogIteratorRace) { rocksdb::SyncPoint::GetInstance()->ClearTrace(); rocksdb::SyncPoint::GetInstance()->DisableProcessing(); Options options = OptionsForLogIterTest(); - DestroyAndReopen(&options); + DestroyAndReopen(options); Put("key1", DummyString(1024)); dbfull()->Flush(FlushOptions()); Put("key2", DummyString(1024)); @@ -6260,28 +7462,10 @@ TEST(DBTest, TransactionLogIteratorRace) { } #endif -TEST(DBTest, TransactionLogIteratorMoveOverZeroFiles) { - do { - Options options = OptionsForLogIterTest(); - DestroyAndReopen(&options); - CreateAndReopenWithCF({"pikachu"}, &options); - // Do a plain Reopen. - Put(1, "key1", DummyString(1024)); - // Two reopens should create a zero record WAL file. - ReopenWithColumnFamilies({"default", "pikachu"}, &options); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); - - Put(1, "key2", DummyString(1024)); - - auto iter = OpenTransactionLogIter(0); - ExpectRecords(2, iter); - } while (ChangeCompactOptions()); -} - TEST(DBTest, TransactionLogIteratorStallAtLastRecord) { do { Options options = OptionsForLogIterTest(); - DestroyAndReopen(&options); + DestroyAndReopen(options); Put("key1", DummyString(1024)); auto iter = OpenTransactionLogIter(0); ASSERT_OK(iter->status()); @@ -6296,25 +7480,14 @@ TEST(DBTest, TransactionLogIteratorStallAtLastRecord) { } while (ChangeCompactOptions()); } -TEST(DBTest, TransactionLogIteratorJustEmptyFile) { - do { - Options options = OptionsForLogIterTest(); - DestroyAndReopen(&options); - unique_ptr iter; - Status status = dbfull()->GetUpdatesSince(0, &iter); - // Check that an empty iterator is returned - ASSERT_TRUE(!iter->Valid()); - } while (ChangeCompactOptions()); -} - TEST(DBTest, TransactionLogIteratorCheckAfterRestart) { do { Options options = OptionsForLogIterTest(); - DestroyAndReopen(&options); + DestroyAndReopen(options); Put("key1", DummyString(1024)); Put("key2", DummyString(1023)); dbfull()->Flush(FlushOptions()); - 
Reopen(&options); + Reopen(options); auto iter = OpenTransactionLogIter(0); ExpectRecords(2, iter); } while (ChangeCompactOptions()); @@ -6323,26 +7496,30 @@ TEST(DBTest, TransactionLogIteratorCheckAfterRestart) { TEST(DBTest, TransactionLogIteratorCorruptedLog) { do { Options options = OptionsForLogIterTest(); - DestroyAndReopen(&options); + DestroyAndReopen(options); for (int i = 0; i < 1024; i++) { - Put("key"+std::to_string(i), DummyString(10)); + Put("key"+ToString(i), DummyString(10)); } dbfull()->Flush(FlushOptions()); // Corrupt this log to create a gap rocksdb::VectorLogPtr wal_files; ASSERT_OK(dbfull()->GetSortedWalFiles(wal_files)); - const auto logfilePath = dbname_ + "/" + wal_files.front()->PathName(); - ASSERT_EQ( - 0, - truncate(logfilePath.c_str(), wal_files.front()->SizeFileBytes() / 2)); + const auto logfile_path = dbname_ + "/" + wal_files.front()->PathName(); + if (mem_env_) { + mem_env_->Truncate(logfile_path, wal_files.front()->SizeFileBytes() / 2); + } else { + ASSERT_EQ(0, truncate(logfile_path.c_str(), + wal_files.front()->SizeFileBytes() / 2)); + } + // Insert a new entry to a new log file Put("key1025", DummyString(10)); // Try to read from the beginning. 
Should stop before the gap and read less // than 1025 entries auto iter = OpenTransactionLogIter(0); int count; - int last_sequence_read = ReadRecords(iter, count); - ASSERT_LT(last_sequence_read, 1025); + SequenceNumber last_sequence_read = ReadRecords(iter, count); + ASSERT_LT(last_sequence_read, 1025U); // Try to read past the gap, should be able to seek to key1025 auto iter2 = OpenTransactionLogIter(last_sequence_read + 1); ExpectRecords(1, iter2); @@ -6352,8 +7529,8 @@ TEST(DBTest, TransactionLogIteratorCorruptedLog) { TEST(DBTest, TransactionLogIteratorBatchOperations) { do { Options options = OptionsForLogIterTest(); - DestroyAndReopen(&options); - CreateAndReopenWithCF({"pikachu"}, &options); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); WriteBatch batch; batch.Put(handles_[1], "key1", DummyString(1024)); batch.Put(handles_[0], "key2", DummyString(1024)); @@ -6362,7 +7539,7 @@ TEST(DBTest, TransactionLogIteratorBatchOperations) { dbfull()->Write(WriteOptions(), &batch); Flush(1); Flush(0); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); Put(1, "key4", DummyString(1024)); auto iter = OpenTransactionLogIter(3); ExpectRecords(2, iter); @@ -6371,8 +7548,8 @@ TEST(DBTest, TransactionLogIteratorBatchOperations) { TEST(DBTest, TransactionLogIteratorBlobs) { Options options = OptionsForLogIterTest(); - DestroyAndReopen(&options); - CreateAndReopenWithCF({"pikachu"}, &options); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); { WriteBatch batch; batch.Put(handles_[1], "key1", DummyString(1024)); @@ -6382,27 +7559,27 @@ TEST(DBTest, TransactionLogIteratorBlobs) { batch.PutLogData(Slice("blob2")); batch.Delete(handles_[0], "key2"); dbfull()->Write(WriteOptions(), &batch); - ReopenWithColumnFamilies({"default", "pikachu"}, &options); + ReopenWithColumnFamilies({"default", "pikachu"}, options); } auto res = 
OpenTransactionLogIter(0)->GetBatch(); struct Handler : public WriteBatch::Handler { std::string seen; virtual Status PutCF(uint32_t cf, const Slice& key, const Slice& value) { - seen += "Put(" + std::to_string(cf) + ", " + key.ToString() + ", " + - std::to_string(value.size()) + ")"; + seen += "Put(" + ToString(cf) + ", " + key.ToString() + ", " + + ToString(value.size()) + ")"; return Status::OK(); } virtual Status MergeCF(uint32_t cf, const Slice& key, const Slice& value) { - seen += "Merge(" + std::to_string(cf) + ", " + key.ToString() + ", " + - std::to_string(value.size()) + ")"; + seen += "Merge(" + ToString(cf) + ", " + key.ToString() + ", " + + ToString(value.size()) + ")"; return Status::OK(); } virtual void LogData(const Slice& blob) { seen += "LogData(" + blob.ToString() + ")"; } virtual Status DeleteCF(uint32_t cf, const Slice& key) { - seen += "Delete(" + std::to_string(cf) + ", " + key.ToString() + ")"; + seen += "Delete(" + ToString(cf) + ", " + key.ToString() + ")"; return Status::OK(); } } handler; @@ -6417,44 +7594,6 @@ TEST(DBTest, TransactionLogIteratorBlobs) { handler.seen); } -TEST(DBTest, ReadFirstRecordCache) { - Options options = CurrentOptions(); - options.env = env_; - options.create_if_missing = true; - DestroyAndReopen(&options); - - std::string path = dbname_ + "/000001.log"; - unique_ptr file; - ASSERT_OK(env_->NewWritableFile(path, &file, EnvOptions())); - - SequenceNumber s; - ASSERT_OK(dbfull()->TEST_ReadFirstLine(path, &s)); - ASSERT_EQ(s, 0U); - - ASSERT_OK(dbfull()->TEST_ReadFirstRecord(kAliveLogFile, 1, &s)); - ASSERT_EQ(s, 0U); - - log::Writer writer(std::move(file)); - WriteBatch batch; - batch.Put("foo", "bar"); - WriteBatchInternal::SetSequence(&batch, 10); - writer.AddRecord(WriteBatchInternal::Contents(&batch)); - - env_->count_sequential_reads_ = true; - // sequential_read_counter_ sanity test - ASSERT_EQ(env_->sequential_read_counter_.Read(), 0); - - ASSERT_OK(dbfull()->TEST_ReadFirstRecord(kAliveLogFile, 1, &s)); - 
ASSERT_EQ(s, 10U); - // did a read - ASSERT_EQ(env_->sequential_read_counter_.Read(), 1); - - ASSERT_OK(dbfull()->TEST_ReadFirstRecord(kAliveLogFile, 1, &s)); - ASSERT_EQ(s, 10U); - // no new reads since the value is cached - ASSERT_EQ(env_->sequential_read_counter_.Read(), 1); -} - // Multi-threaded test: namespace { @@ -6465,9 +7604,9 @@ static const int kNumKeys = 1000; struct MTState { DBTest* test; - port::AtomicPointer stop; - port::AtomicPointer counter[kNumThreads]; - port::AtomicPointer thread_done[kNumThreads]; + std::atomic stop; + std::atomic counter[kNumThreads]; + std::atomic thread_done[kNumThreads]; }; struct MTThread { @@ -6479,12 +7618,12 @@ static void MTThreadBody(void* arg) { MTThread* t = reinterpret_cast(arg); int id = t->id; DB* db = t->state->test->db_; - uintptr_t counter = 0; + int counter = 0; fprintf(stderr, "... starting thread %d\n", id); Random rnd(1000 + id); char valbuf[1500]; - while (t->state->stop.Acquire_Load() == nullptr) { - t->state->counter[id].Release_Store(reinterpret_cast(counter)); + while (t->state->stop.load(std::memory_order_acquire) == false) { + t->state->counter[id].store(counter, std::memory_order_release); int key = rnd.Uniform(kNumKeys); char keybuf[20]; @@ -6544,8 +7683,7 @@ static void MTThreadBody(void* arg) { ASSERT_EQ(k, key); ASSERT_GE(w, 0); ASSERT_LT(w, kNumThreads); - ASSERT_LE((unsigned int)c, reinterpret_cast( - t->state->counter[w].Acquire_Load())); + ASSERT_LE(c, t->state->counter[w].load(std::memory_order_acquire)); ASSERT_EQ(cf, i); if (i == 0) { unique_id = u; @@ -6559,26 +7697,28 @@ static void MTThreadBody(void* arg) { } counter++; } - t->state->thread_done[id].Release_Store(t); + t->state->thread_done[id].store(true, std::memory_order_release); fprintf(stderr, "... 
stopping thread %d after %d ops\n", id, int(counter)); } } // namespace TEST(DBTest, MultiThreaded) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; do { std::vector cfs; for (int i = 1; i < kColumnFamilies; ++i) { - cfs.push_back(std::to_string(i)); + cfs.push_back(ToString(i)); } - CreateAndReopenWithCF(cfs); + CreateAndReopenWithCF(cfs, CurrentOptions(options_override)); // Initialize state MTState mt; mt.test = this; - mt.stop.Release_Store(0); + mt.stop.store(false, std::memory_order_release); for (int id = 0; id < kNumThreads; id++) { - mt.counter[id].Release_Store(0); - mt.thread_done[id].Release_Store(0); + mt.counter[id].store(0, std::memory_order_release); + mt.thread_done[id].store(false, std::memory_order_release); } // Start threads @@ -6593,9 +7733,9 @@ TEST(DBTest, MultiThreaded) { env_->SleepForMicroseconds(kTestSeconds * 1000000); // Stop the threads and wait for them to finish - mt.stop.Release_Store(&mt); + mt.stop.store(true, std::memory_order_release); for (int id = 0; id < kNumThreads; id++) { - while (mt.thread_done[id].Acquire_Load() == nullptr) { + while (mt.thread_done[id].load(std::memory_order_acquire) == false) { env_->SleepForMicroseconds(100000); } } @@ -6622,7 +7762,7 @@ static void GCThreadBody(void* arg) { WriteOptions wo; for (int i = 0; i < kGCNumKeys; ++i) { - std::string kv(std::to_string(i + id * kGCNumKeys)); + std::string kv(ToString(i + id * kGCNumKeys)); ASSERT_OK(db->Put(wo, kv, kv)); } t->done = true; @@ -6633,8 +7773,10 @@ static void GCThreadBody(void* arg) { TEST(DBTest, GroupCommitTest) { do { Options options = CurrentOptions(); + options.env = env_; + env_->log_write_slowdown_.store(100); options.statistics = rocksdb::CreateDBStatistics(); - Reopen(&options); + Reopen(options); // Start threads GCThread thread[kGCNumThreads]; @@ -6650,11 +7792,13 @@ TEST(DBTest, GroupCommitTest) { env_->SleepForMicroseconds(100000); } } + env_->log_write_slowdown_.store(0); + 
ASSERT_GT(TestGetTickerCount(options, WRITE_DONE_BY_OTHER), 0); std::vector expected_db; for (int i = 0; i < kGCNumThreads * kGCNumKeys; ++i) { - expected_db.push_back(std::to_string(i)); + expected_db.push_back(ToString(i)); } sort(expected_db.begin(), expected_db.end()); @@ -6681,6 +7825,12 @@ class ModelDB: public DB { class ModelSnapshot : public Snapshot { public: KVMap map_; + + virtual SequenceNumber GetSequenceNumber() const { + // no need to call this + assert(false); + return 0; + } }; explicit ModelDB(const Options& options) : options_(options) {} @@ -6810,8 +7960,17 @@ class ModelDB: public DB { return Status::NotSupported("Not supported operation."); } - using DB::NumberLevels; - virtual int NumberLevels(ColumnFamilyHandle* column_family) { return 1; } + using DB::CompactFiles; + virtual Status CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, + const int output_level, const int output_path_id = -1) override { + return Status::NotSupported("Not supported operation."); + } + + using DB::NumberLevels; + virtual int NumberLevels(ColumnFamilyHandle* column_family) { return 1; } using DB::MaxMemCompactionLevel; virtual int MaxMemCompactionLevel(ColumnFamilyHandle* column_family) { @@ -6878,6 +8037,10 @@ class ModelDB: public DB { virtual ColumnFamilyHandle* DefaultColumnFamily() const { return nullptr; } + virtual void GetColumnFamilyMetaData( + ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* metadata) {} + private: class ModelIter: public Iterator { public: @@ -6979,9 +8142,11 @@ static bool CompareIterators(int step, } TEST(DBTest, Randomized) { + anon::OptionsOverride options_override; + options_override.skip_policy = kSkipNoSnapshot; Random rnd(test::RandomSeed()); do { - ModelDB model(CurrentOptions()); + ModelDB model(CurrentOptions(options_override)); const int N = 10000; const Snapshot* model_snap = nullptr; const Snapshot* db_snap = nullptr; @@ -7050,7 
+8215,9 @@ TEST(DBTest, Randomized) { if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); - Reopen(); + + auto options = CurrentOptions(options_override); + Reopen(options); ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); model_snap = model.GetSnapshot(); @@ -7058,7 +8225,7 @@ TEST(DBTest, Randomized) { } if ((step % 2000) == 0) { - fprintf(stdout, + fprintf(stderr, "DBTest.Randomized, option ID: %d, step: %d out of %d\n", option_config_, step, N); } @@ -7072,7 +8239,7 @@ TEST(DBTest, Randomized) { TEST(DBTest, MultiGetSimple) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ASSERT_OK(Put(1, "k1", "v1")); ASSERT_OK(Put(1, "k2", "v2")); ASSERT_OK(Put(1, "k3", "v3")); @@ -7104,7 +8271,7 @@ TEST(DBTest, MultiGetSimple) { TEST(DBTest, MultiGetEmpty) { do { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); // Empty Key Set std::vector keys; std::vector values; @@ -7113,8 +8280,10 @@ TEST(DBTest, MultiGetEmpty) { ASSERT_EQ(s.size(), 0U); // Empty Database, Empty Key Set - DestroyAndReopen(); - CreateAndReopenWithCF({"pikachu"}); + Options options = CurrentOptions(); + options.create_if_missing = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); s = db_->MultiGet(ReadOptions(), cfs, keys, &values); ASSERT_EQ(s.size(), 0U); @@ -7169,7 +8338,6 @@ void PrefixScanInit(DBTest *dbtest) { // GROUP 2 for (int i = 1; i <= big_range_sstfiles; i++) { - std::string keystr; snprintf(buf, sizeof(buf), "%02d______:start", 0); keystr = std::string(buf); ASSERT_OK(dbtest->Put(keystr, keystr)); @@ -7183,47 +8351,49 @@ void PrefixScanInit(DBTest *dbtest) { } // namespace TEST(DBTest, PrefixScan) { - int count; - Slice prefix; - Slice key; - char buf[100]; - Iterator* iter; - snprintf(buf, sizeof(buf), "03______:"); - prefix = Slice(buf, 8); - key = Slice(buf, 9); - // db configs - 
env_->count_random_reads_ = true; - Options options = CurrentOptions(); - options.env = env_; - options.prefix_extractor.reset(NewFixedPrefixTransform(8)); - options.disable_auto_compactions = true; - options.max_background_compactions = 2; - options.create_if_missing = true; - options.memtable_factory.reset(NewHashSkipListRepFactory(16)); + while (ChangeFilterOptions()) { + int count; + Slice prefix; + Slice key; + char buf[100]; + Iterator* iter; + snprintf(buf, sizeof(buf), "03______:"); + prefix = Slice(buf, 8); + key = Slice(buf, 9); + // db configs + env_->count_random_reads_ = true; + Options options = CurrentOptions(); + options.env = env_; + options.prefix_extractor.reset(NewFixedPrefixTransform(8)); + options.disable_auto_compactions = true; + options.max_background_compactions = 2; + options.create_if_missing = true; + options.memtable_factory.reset(NewHashSkipListRepFactory(16)); - BlockBasedTableOptions table_options; - table_options.no_block_cache = true; - table_options.filter_policy.reset(NewBloomFilterPolicy(10)); - table_options.whole_key_filtering = false; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + table_options.whole_key_filtering = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - // 11 RAND I/Os - DestroyAndReopen(&options); - PrefixScanInit(this); - count = 0; - env_->random_read_counter_.Reset(); - iter = db_->NewIterator(ReadOptions()); - for (iter->Seek(prefix); iter->Valid(); iter->Next()) { - if (! iter->key().starts_with(prefix)) { - break; + // 11 RAND I/Os + DestroyAndReopen(options); + PrefixScanInit(this); + count = 0; + env_->random_read_counter_.Reset(); + iter = db_->NewIterator(ReadOptions()); + for (iter->Seek(prefix); iter->Valid(); iter->Next()) { + if (! 
iter->key().starts_with(prefix)) { + break; + } + count++; } - count++; - } - ASSERT_OK(iter->status()); - delete iter; - ASSERT_EQ(count, 2); - ASSERT_EQ(env_->random_read_counter_.Read(), 2); - Close(); + ASSERT_OK(iter->status()); + delete iter; + ASSERT_EQ(count, 2); + ASSERT_EQ(env_->random_read_counter_.Read(), 2); + Close(); + } // end of while } TEST(DBTest, TailingIteratorSingle) { @@ -7245,7 +8415,7 @@ TEST(DBTest, TailingIteratorSingle) { } TEST(DBTest, TailingIteratorKeepAdding) { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ReadOptions read_options; read_options.tailing = true; @@ -7267,7 +8437,7 @@ TEST(DBTest, TailingIteratorKeepAdding) { } TEST(DBTest, TailingIteratorSeekToNext) { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ReadOptions read_options; read_options.tailing = true; @@ -7314,7 +8484,7 @@ TEST(DBTest, TailingIteratorSeekToNext) { } TEST(DBTest, TailingIteratorDeletes) { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ReadOptions read_options; read_options.tailing = true; @@ -7362,8 +8532,8 @@ TEST(DBTest, TailingIteratorPrefixSeek) { options.disable_auto_compactions = true; options.prefix_extractor.reset(NewFixedPrefixTransform(2)); options.memtable_factory.reset(NewHashSkipListRepFactory(16)); - DestroyAndReopen(&options); - CreateAndReopenWithCF({"pikachu"}, &options); + DestroyAndReopen(options); + CreateAndReopenWithCF({"pikachu"}, options); std::unique_ptr iter(db_->NewIterator(read_options, handles_[1])); ASSERT_OK(Put(1, "0101", "test")); @@ -7385,7 +8555,7 @@ TEST(DBTest, TailingIteratorPrefixSeek) { } TEST(DBTest, TailingIteratorIncomplete) { - CreateAndReopenWithCF({"pikachu"}); + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); ReadOptions read_options; read_options.tailing = true; read_options.read_tier = kBlockCacheTier; @@ -7410,7 +8580,7 @@ TEST(DBTest, 
TailingIteratorSeekToSame) { Options options = CurrentOptions(); options.compaction_style = kCompactionStyleUniversal; options.write_buffer_size = 1000; - CreateAndReopenWithCF({"pikachu"}, &options); + CreateAndReopenWithCF({"pikachu"}, options); ReadOptions read_options; read_options.tailing = true; @@ -7450,7 +8620,7 @@ TEST(DBTest, BlockBasedTablePrefixIndexTest) { options.prefix_extractor.reset(NewFixedPrefixTransform(1)); - Reopen(&options); + Reopen(options); ASSERT_OK(Put("k1", "v1")); Flush(); ASSERT_OK(Put("k2", "v2")); @@ -7461,7 +8631,7 @@ TEST(DBTest, BlockBasedTablePrefixIndexTest) { options.table_factory.reset(NewBlockBasedTableFactory(table_options)); options.prefix_extractor.reset(); - Reopen(&options); + Reopen(options); ASSERT_EQ("v1", Get("k1")); ASSERT_EQ("v2", Get("k2")); } @@ -7472,21 +8642,21 @@ TEST(DBTest, ChecksumTest) { table_options.checksum = kCRC32c; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(&options); + Reopen(options); ASSERT_OK(Put("a", "b")); ASSERT_OK(Put("c", "d")); ASSERT_OK(Flush()); // table with crc checksum table_options.checksum = kxxHash; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(&options); + Reopen(options); ASSERT_OK(Put("e", "f")); ASSERT_OK(Put("g", "h")); ASSERT_OK(Flush()); // table with xxhash checksum table_options.checksum = kCRC32c; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(&options); + Reopen(options); ASSERT_EQ("b", Get("a")); ASSERT_EQ("d", Get("c")); ASSERT_EQ("f", Get("e")); @@ -7494,7 +8664,7 @@ TEST(DBTest, ChecksumTest) { table_options.checksum = kCRC32c; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - Reopen(&options); + Reopen(options); ASSERT_EQ("b", Get("a")); ASSERT_EQ("d", Get("c")); ASSERT_EQ("f", Get("e")); @@ -7514,12 +8684,13 @@ TEST(DBTest, FIFOCompactionTest) { if (iter == 1) { options.disable_auto_compactions = true; } - DestroyAndReopen(&options); + 
options = CurrentOptions(options); + DestroyAndReopen(options); Random rnd(301); for (int i = 0; i < 6; ++i) { for (int j = 0; j < 100; ++j) { - ASSERT_OK(Put(std::to_string(i * 100 + j), RandomString(&rnd, 1024))); + ASSERT_OK(Put(ToString(i * 100 + j), RandomString(&rnd, 1024))); } // flush should happen here } @@ -7532,24 +8703,31 @@ TEST(DBTest, FIFOCompactionTest) { ASSERT_EQ(NumTableFilesAtLevel(0), 5); for (int i = 0; i < 50; ++i) { // these keys should be deleted in previous compaction - ASSERT_EQ("NOT_FOUND", Get(std::to_string(i))); + ASSERT_EQ("NOT_FOUND", Get(ToString(i))); } } } TEST(DBTest, SimpleWriteTimeoutTest) { + // Block compaction thread, which will also block the flushes because + // max_background_flushes == 0, so flushes are getting executed by the + // compaction thread + env_->SetBackgroundThreads(1, Env::LOW); + SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + Options options; options.env = env_; options.create_if_missing = true; options.write_buffer_size = 100000; options.max_background_flushes = 0; options.max_write_buffer_number = 2; - options.min_write_buffer_number_to_merge = 3; options.max_total_wal_size = std::numeric_limits::max(); - WriteOptions write_opt = WriteOptions(); + WriteOptions write_opt; write_opt.timeout_hint_us = 0; - DestroyAndReopen(&options); - // fill the two write buffer + DestroyAndReopen(options); + // fill the two write buffers ASSERT_OK(Put(Key(1), Key(1) + std::string(100000, 'v'), write_opt)); ASSERT_OK(Put(Key(2), Key(2) + std::string(100000, 'v'), write_opt)); // As the only two write buffers are full in this moment, the third @@ -7557,6 +8735,9 @@ TEST(DBTest, SimpleWriteTimeoutTest) { write_opt.timeout_hint_us = 50; ASSERT_TRUE( Put(Key(3), Key(3) + std::string(100000, 'v'), write_opt).IsTimedOut()); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); } // Multi-threaded Timeout Test @@ -7579,7 
+8760,7 @@ static void RandomTimeoutWriter(void* arg) { DB* db = state->db; Random rnd(1000 + thread_id); - WriteOptions write_opt = WriteOptions(); + WriteOptions write_opt; write_opt.timeout_hint_us = 500; int timeout_count = 0; int num_keys = kNumKeys * 5; @@ -7611,7 +8792,6 @@ static void RandomTimeoutWriter(void* arg) { if (write_opt.timeout_hint_us == 0 || put_duration + kTimerBias < write_opt.timeout_hint_us) { ASSERT_OK(s); - std::string result; } if (s.IsTimedOut()) { timeout_count++; @@ -7631,7 +8811,7 @@ TEST(DBTest, MTRandomTimeoutTest) { options.level0_slowdown_writes_trigger = 10; options.level0_stop_writes_trigger = 20; options.write_buffer_size = kWriteBufferSize; - DestroyAndReopen(&options); + DestroyAndReopen(options); TimeoutWriterState thread_states[kNumThreads]; for (int tid = 0; tid < kNumThreads; ++tid) { @@ -7657,6 +8837,26 @@ TEST(DBTest, MTRandomTimeoutTest) { } } +TEST(DBTest, Level0StopWritesTest) { + Options options = CurrentOptions(); + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.disable_auto_compactions = true; + options.max_mem_compaction_level = 0; + Reopen(options); + + // create 4 level0 tables + for (int i = 0; i < 4; ++i) { + Put("a", "b"); + Flush(); + } + + WriteOptions woptions; + woptions.timeout_hint_us = 30 * 1000; // 30 ms + Status s = Put("a", "b", woptions); + ASSERT_TRUE(s.IsTimedOut()); +} + } // anonymous namespace /* @@ -7673,7 +8873,7 @@ TEST(DBTest, RateLimitingTest) { options.create_if_missing = true; options.env = env_; options.IncreaseParallelism(4); - DestroyAndReopen(&options); + DestroyAndReopen(options); WriteOptions wo; wo.disableWAL = true; @@ -7694,7 +8894,7 @@ TEST(DBTest, RateLimitingTest) { options.rate_limiter.reset( NewGenericRateLimiter(static_cast(0.7 * raw_rate))); env_->bytes_written_ = 0; - DestroyAndReopen(&options); + DestroyAndReopen(options); start = env_->NowMicros(); // Write ~96M data @@ -7714,7 +8914,7 @@ TEST(DBTest, 
RateLimitingTest) { options.rate_limiter.reset( NewGenericRateLimiter(static_cast(raw_rate / 2))); env_->bytes_written_ = 0; - DestroyAndReopen(&options); + DestroyAndReopen(options); start = env_->NowMicros(); // Write ~96M data @@ -7731,16 +8931,1588 @@ TEST(DBTest, RateLimitingTest) { ASSERT_TRUE(ratio < 0.6); } +namespace { + bool HaveOverlappingKeyRanges( + const Comparator* c, + const SstFileMetaData& a, const SstFileMetaData& b) { + if (c->Compare(a.smallestkey, b.smallestkey) >= 0) { + if (c->Compare(a.smallestkey, b.largestkey) <= 0) { + // b.smallestkey <= a.smallestkey <= b.largestkey + return true; + } + } else if (c->Compare(a.largestkey, b.smallestkey) >= 0) { + // a.smallestkey < b.smallestkey <= a.largestkey + return true; + } + if (c->Compare(a.largestkey, b.largestkey) <= 0) { + if (c->Compare(a.largestkey, b.smallestkey) >= 0) { + // b.smallestkey <= a.largestkey <= b.largestkey + return true; + } + } else if (c->Compare(a.smallestkey, b.largestkey) <= 0) { + // a.smallestkey <= b.largestkey < a.largestkey + return true; + } + return false; + } + + // Identifies all files between level "min_level" and "max_level" + // which has overlapping key range with "input_file_meta". 
+ void GetOverlappingFileNumbersForLevelCompaction( + const ColumnFamilyMetaData& cf_meta, + const Comparator* comparator, + int min_level, int max_level, + const SstFileMetaData* input_file_meta, + std::set* overlapping_file_names) { + std::set overlapping_files; + overlapping_files.insert(input_file_meta); + for (int m = min_level; m <= max_level; ++m) { + for (auto& file : cf_meta.levels[m].files) { + for (auto* included_file : overlapping_files) { + if (HaveOverlappingKeyRanges( + comparator, *included_file, file)) { + overlapping_files.insert(&file); + overlapping_file_names->insert(file.name); + break; + } + } + } + } + } + + void VerifyCompactionResult( + const ColumnFamilyMetaData& cf_meta, + const std::set& overlapping_file_numbers) { +#ifndef NDEBUG + for (auto& level : cf_meta.levels) { + for (auto& file : level.files) { + assert(overlapping_file_numbers.find(file.name) == + overlapping_file_numbers.end()); + } + } +#endif + } + + const SstFileMetaData* PickFileRandomly( + const ColumnFamilyMetaData& cf_meta, + Random* rand, + int* level = nullptr) { + auto file_id = rand->Uniform(static_cast( + cf_meta.file_count)) + 1; + for (auto& level_meta : cf_meta.levels) { + if (file_id <= level_meta.files.size()) { + if (level != nullptr) { + *level = level_meta.level; + } + auto result = rand->Uniform(file_id); + return &(level_meta.files[result]); + } + file_id -= level_meta.files.size(); + } + assert(false); + return nullptr; + } +} // namespace + +TEST(DBTest, CompactFilesOnLevelCompaction) { + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; + const int kEntriesPerBuffer = 100; + Options options; + options.create_if_missing = true; + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.max_bytes_for_level_base = options.target_file_size_base * 2; + 
options.level0_stop_writes_trigger = 2; + options.max_bytes_for_level_multiplier = 2; + options.compression = kNoCompression; + options = CurrentOptions(options); + CreateAndReopenWithCF({"pikachu"}, options); + + Random rnd(301); + for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) { + ASSERT_OK(Put(1, ToString(key), RandomString(&rnd, kTestValueSize))); + } + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForCompact(); + + ColumnFamilyMetaData cf_meta; + dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); + int output_level = static_cast(cf_meta.levels.size()) - 1; + for (int file_picked = 5; file_picked > 0; --file_picked) { + std::set overlapping_file_names; + std::vector compaction_input_file_names; + for (int f = 0; f < file_picked; ++f) { + int level; + auto file_meta = PickFileRandomly(cf_meta, &rnd, &level); + compaction_input_file_names.push_back(file_meta->name); + GetOverlappingFileNumbersForLevelCompaction( + cf_meta, options.comparator, level, output_level, + file_meta, &overlapping_file_names); + } + + ASSERT_OK(dbfull()->CompactFiles( + CompactionOptions(), handles_[1], + compaction_input_file_names, + output_level)); + + // Make sure all overlapping files do not exist after compaction + dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); + VerifyCompactionResult(cf_meta, overlapping_file_names); + } + + // make sure all key-values are still there. 
+ for (int key = 64 * kEntriesPerBuffer; key >= 0; --key) { + ASSERT_NE(Get(1, ToString(key)), "NOT_FOUND"); + } +} + +TEST(DBTest, CompactFilesOnUniversalCompaction) { + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; + const int kEntriesPerBuffer = 10; + + ChangeCompactOptions(); + Options options; + options.create_if_missing = true; + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.compression = kNoCompression; + options = CurrentOptions(options); + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal); + Random rnd(301); + for (int key = 1024 * kEntriesPerBuffer; key >= 0; --key) { + ASSERT_OK(Put(1, ToString(key), RandomString(&rnd, kTestValueSize))); + } + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); + dbfull()->TEST_WaitForCompact(); + ColumnFamilyMetaData cf_meta; + dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); + std::vector compaction_input_file_names; + for (auto file : cf_meta.levels[0].files) { + if (rnd.OneIn(2)) { + compaction_input_file_names.push_back(file.name); + } + } + + if (compaction_input_file_names.size() == 0) { + compaction_input_file_names.push_back( + cf_meta.levels[0].files[0].name); + } + + // expect fail since universal compaction only allow L0 output + ASSERT_TRUE(!dbfull()->CompactFiles( + CompactionOptions(), handles_[1], + compaction_input_file_names, 1).ok()); + + // expect ok and verify the compacted files no longer exist. 
+ ASSERT_OK(dbfull()->CompactFiles( + CompactionOptions(), handles_[1], + compaction_input_file_names, 0)); + + dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); + VerifyCompactionResult( + cf_meta, + std::set(compaction_input_file_names.begin(), + compaction_input_file_names.end())); + + compaction_input_file_names.clear(); + + // Pick the first and the last file, expect everything is + // compacted into one single file. + compaction_input_file_names.push_back( + cf_meta.levels[0].files[0].name); + compaction_input_file_names.push_back( + cf_meta.levels[0].files[ + cf_meta.levels[0].files.size() - 1].name); + ASSERT_OK(dbfull()->CompactFiles( + CompactionOptions(), handles_[1], + compaction_input_file_names, 0)); + + dbfull()->GetColumnFamilyMetaData(handles_[1], &cf_meta); + ASSERT_EQ(cf_meta.levels[0].files.size(), 1U); +} + TEST(DBTest, TableOptionsSanitizeTest) { Options options = CurrentOptions(); options.create_if_missing = true; - DestroyAndReopen(&options); + DestroyAndReopen(options); ASSERT_EQ(db_->GetOptions().allow_mmap_reads, false); options.table_factory.reset(new PlainTableFactory()); options.prefix_extractor.reset(NewNoopTransform()); - Destroy(&options); - ASSERT_TRUE(TryReopen(&options).IsNotSupported()); + Destroy(options); + ASSERT_TRUE(TryReopen(options).IsNotSupported()); + + // Test for check of prefix_extractor when hash index is used for + // block-based table + BlockBasedTableOptions to; + to.index_type = BlockBasedTableOptions::kHashSearch; + options = CurrentOptions(); + options.create_if_missing = true; + options.table_factory.reset(NewBlockBasedTableFactory(to)); + ASSERT_TRUE(TryReopen(options).IsInvalidArgument()); + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + ASSERT_OK(TryReopen(options)); +} + +TEST(DBTest, SanitizeNumThreads) { + for (int attempt = 0; attempt < 2; attempt++) { + const size_t kTotalTasks = 8; + SleepingBackgroundTask sleeping_tasks[kTotalTasks]; + + Options options = CurrentOptions(); + 
if (attempt == 0) { + options.max_background_compactions = 3; + options.max_background_flushes = 2; + } + options.create_if_missing = true; + DestroyAndReopen(options); + + for (size_t i = 0; i < kTotalTasks; i++) { + // Insert 5 tasks to low priority queue and 5 tasks to high priority queue + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_tasks[i], + (i < 4) ? Env::Priority::LOW : Env::Priority::HIGH); + } + + // Wait 100 milliseconds for they are scheduled. + env_->SleepForMicroseconds(100000); + + // pool size 3, total task 4. Queue size should be 1. + ASSERT_EQ(1U, options.env->GetThreadPoolQueueLen(Env::Priority::LOW)); + // pool size 2, total task 4. Queue size should be 2. + ASSERT_EQ(2U, options.env->GetThreadPoolQueueLen(Env::Priority::HIGH)); + + for (size_t i = 0; i < kTotalTasks; i++) { + sleeping_tasks[i].WakeUp(); + sleeping_tasks[i].WaitUntilDone(); + } + + ASSERT_OK(Put("abc", "def")); + ASSERT_EQ("def", Get("abc")); + Flush(); + ASSERT_EQ("def", Get("abc")); + } +} + +TEST(DBTest, DBIteratorBoundTest) { + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + + options.prefix_extractor = nullptr; + DestroyAndReopen(options); + ASSERT_OK(Put("a", "0")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("g1", "0")); + + // testing basic case with no iterate_upper_bound and no prefix_extractor + { + ReadOptions ro; + ro.iterate_upper_bound = nullptr; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("foo"); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo1")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("g1")), 0); + } + + // testing iterate_upper_bound and forward iterator + // to make sure it stops at bound + { + ReadOptions ro; + // iterate_upper_bound points beyond the last 
expected entry + Slice prefix("foo2"); + ro.iterate_upper_bound = &prefix; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("foo"); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("foo")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(("foo1")), 0); + + iter->Next(); + // should stop here... + ASSERT_TRUE(!iter->Valid()); + } + + // prefix is the first letter of the key + options.prefix_extractor.reset(NewFixedPrefixTransform(1)); + + DestroyAndReopen(options); + ASSERT_OK(Put("a", "0")); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("foo1", "bar1")); + ASSERT_OK(Put("g1", "0")); + + // testing with iterate_upper_bound and prefix_extractor + // Seek target and iterate_upper_bound are not is same prefix + // This should be an error + { + ReadOptions ro; + Slice prefix("g1"); + ro.iterate_upper_bound = &prefix; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("foo"); + + ASSERT_TRUE(!iter->Valid()); + ASSERT_TRUE(iter->status().IsInvalidArgument()); + } + + // testing that iterate_upper_bound prevents iterating over deleted items + // if the bound has already reached + { + options.prefix_extractor = nullptr; + DestroyAndReopen(options); + ASSERT_OK(Put("a", "0")); + ASSERT_OK(Put("b", "0")); + ASSERT_OK(Put("b1", "0")); + ASSERT_OK(Put("c", "0")); + ASSERT_OK(Put("d", "0")); + ASSERT_OK(Put("e", "0")); + ASSERT_OK(Delete("c")); + ASSERT_OK(Delete("d")); + + // base case with no bound + ReadOptions ro; + ro.iterate_upper_bound = nullptr; + + std::unique_ptr iter(db_->NewIterator(ro)); + + iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("b")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(("b1")), 0); + + perf_context.Reset(); + iter->Next(); + + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(static_cast(perf_context.internal_delete_skipped_count), 2); + + // now testing with iterate_bound + Slice prefix("c"); + 
ro.iterate_upper_bound = &prefix; + + iter.reset(db_->NewIterator(ro)); + + perf_context.Reset(); + + iter->Seek("b"); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Slice("b")), 0); + + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(("b1")), 0); + + iter->Next(); + // the iteration should stop as soon as the the bound key is reached + // even though the key is deleted + // hence internal_delete_skipped_count should be 0 + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(static_cast(perf_context.internal_delete_skipped_count), 0); + } +} + +TEST(DBTest, WriteSingleThreadEntry) { + std::vector threads; + dbfull()->TEST_LockMutex(); + auto w = dbfull()->TEST_BeginWrite(); + threads.emplace_back([&] { Put("a", "b"); }); + env_->SleepForMicroseconds(10000); + threads.emplace_back([&] { Flush(); }); + env_->SleepForMicroseconds(10000); + dbfull()->TEST_UnlockMutex(); + dbfull()->TEST_LockMutex(); + dbfull()->TEST_EndWrite(w); + dbfull()->TEST_UnlockMutex(); + + for (auto& t : threads) { + t.join(); + } +} + +TEST(DBTest, DisableDataSyncTest) { + env_->sync_counter_.store(0); + // iter 0 -- no sync + // iter 1 -- sync + for (int iter = 0; iter < 2; ++iter) { + Options options = CurrentOptions(); + options.disableDataSync = iter == 0; + options.create_if_missing = true; + options.env = env_; + Reopen(options); + CreateAndReopenWithCF({"pikachu"}, options); + + MakeTables(10, "a", "z"); + Compact("a", "z"); + + if (iter == 0) { + ASSERT_EQ(env_->sync_counter_.load(), 0); + } else { + ASSERT_GT(env_->sync_counter_.load(), 0); + } + Destroy(options); + } +} + +TEST(DBTest, DynamicMemtableOptions) { + const uint64_t k64KB = 1 << 16; + const uint64_t k128KB = 1 << 17; + const uint64_t k5KB = 5 * 1024; + Options options; + options.env = env_; + options.create_if_missing = true; + options.compression = kNoCompression; + options.max_background_compactions = 1; + options.max_mem_compaction_level = 0; + options.write_buffer_size = k64KB; + 
options.max_write_buffer_number = 2; + // Don't trigger compact/slowdown/stop + options.level0_file_num_compaction_trigger = 1024; + options.level0_slowdown_writes_trigger = 1024; + options.level0_stop_writes_trigger = 1024; + DestroyAndReopen(options); + + auto gen_l0_kb = [this](int size) { + Random rnd(301); + for (int i = 0; i < size; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + } + dbfull()->TEST_WaitForFlushMemTable(); + }; + + // Test write_buffer_size + gen_l0_kb(64); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + ASSERT_LT(SizeAtLevel(0), k64KB + k5KB); + ASSERT_GT(SizeAtLevel(0), k64KB - k5KB); + + // Clean up L0 + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + + // Increase buffer size + ASSERT_OK(dbfull()->SetOptions({ + {"write_buffer_size", "131072"}, + })); + + // The existing memtable is still 64KB in size, after it becomes immutable, + // the next memtable will be 128KB in size. Write 256KB total, we should + // have a 64KB L0 file, a 128KB L0 file, and a memtable with 64KB data + gen_l0_kb(256); + ASSERT_EQ(NumTableFilesAtLevel(0), 2); + ASSERT_LT(SizeAtLevel(0), k128KB + k64KB + 2 * k5KB); + ASSERT_GT(SizeAtLevel(0), k128KB + k64KB - 2 * k5KB); + + // Test max_write_buffer_number + // Block compaction thread, which will also block the flushes because + // max_background_flushes == 0, so flushes are getting executed by the + // compaction thread + env_->SetBackgroundThreads(1, Env::LOW); + SleepingBackgroundTask sleeping_task_low1; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low1, + Env::Priority::LOW); + // Start from scratch and disable compaction/flush. Flush can only happen + // during compaction but trigger is pretty high + options.max_background_flushes = 0; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + // Put until timeout, bounded by 256 puts. 
We should see timeout at ~128KB + int count = 0; + Random rnd(301); + WriteOptions wo; + wo.timeout_hint_us = 1000; + + while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 256) { + count++; + } + ASSERT_GT(static_cast(count), 128 * 0.8); + ASSERT_LT(static_cast(count), 128 * 1.2); + + sleeping_task_low1.WakeUp(); + sleeping_task_low1.WaitUntilDone(); + + // Increase + ASSERT_OK(dbfull()->SetOptions({ + {"max_write_buffer_number", "8"}, + })); + // Clean up memtable and L0 + dbfull()->CompactRange(nullptr, nullptr); + + SleepingBackgroundTask sleeping_task_low2; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low2, + Env::Priority::LOW); + count = 0; + while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 1024) { + count++; + } + ASSERT_GT(static_cast(count), 512 * 0.8); + ASSERT_LT(static_cast(count), 512 * 1.2); + sleeping_task_low2.WakeUp(); + sleeping_task_low2.WaitUntilDone(); + + // Decrease + ASSERT_OK(dbfull()->SetOptions({ + {"max_write_buffer_number", "4"}, + })); + // Clean up memtable and L0 + dbfull()->CompactRange(nullptr, nullptr); + + SleepingBackgroundTask sleeping_task_low3; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low3, + Env::Priority::LOW); + count = 0; + while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 1024) { + count++; + } + ASSERT_GT(static_cast(count), 256 * 0.8); + ASSERT_LT(static_cast(count), 266 * 1.2); + sleeping_task_low3.WakeUp(); + sleeping_task_low3.WaitUntilDone(); +} + +#if ROCKSDB_USING_THREAD_STATUS +TEST(DBTest, GetThreadStatus) { + Options options; + options.env = env_; + options.enable_thread_tracking = true; + TryReopen(options); + + std::vector thread_list; + Status s = env_->GetThreadList(&thread_list); + + for (int i = 0; i < 2; ++i) { + // repeat the test with differet number of high / low priority threads + const int kTestCount = 3; + const unsigned int kHighPriCounts[kTestCount] = {3, 2, 5}; + const unsigned int 
kLowPriCounts[kTestCount] = {10, 15, 3}; + for (int test = 0; test < kTestCount; ++test) { + // Change the number of threads in high / low priority pool. + env_->SetBackgroundThreads(kHighPriCounts[test], Env::HIGH); + env_->SetBackgroundThreads(kLowPriCounts[test], Env::LOW); + // Wait to ensure the all threads has been registered + env_->SleepForMicroseconds(100000); + s = env_->GetThreadList(&thread_list); + ASSERT_OK(s); + unsigned int thread_type_counts[ThreadStatus::NUM_THREAD_TYPES]; + memset(thread_type_counts, 0, sizeof(thread_type_counts)); + for (auto thread : thread_list) { + ASSERT_LT(thread.thread_type, ThreadStatus::NUM_THREAD_TYPES); + thread_type_counts[thread.thread_type]++; + } + // Verify the total number of threades + ASSERT_EQ( + thread_type_counts[ThreadStatus::HIGH_PRIORITY] + + thread_type_counts[ThreadStatus::LOW_PRIORITY], + kHighPriCounts[test] + kLowPriCounts[test]); + // Verify the number of high-priority threads + ASSERT_EQ( + thread_type_counts[ThreadStatus::HIGH_PRIORITY], + kHighPriCounts[test]); + // Verify the number of low-priority threads + ASSERT_EQ( + thread_type_counts[ThreadStatus::LOW_PRIORITY], + kLowPriCounts[test]); + } + if (i == 0) { + // repeat the test with multiple column families + CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options); + env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap( + handles_, true); + } + } + db_->DropColumnFamily(handles_[2]); + delete handles_[2]; + handles_.erase(handles_.begin() + 2); + env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap( + handles_, true); + Close(); + env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap( + handles_, true); +} + +TEST(DBTest, DisableThreadStatus) { + Options options; + options.env = env_; + options.enable_thread_tracking = false; + TryReopen(options); + CreateAndReopenWithCF({"pikachu", "about-to-remove"}, options); + // Verify non of the column family info exists + 
env_->GetThreadStatusUpdater()->TEST_VerifyColumnFamilyInfoMap( + handles_, false); +} + +TEST(DBTest, ThreadStatusSingleCompaction) { + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; + const int kEntriesPerBuffer = 100; + Options options; + options.create_if_missing = true; + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.max_bytes_for_level_base = options.target_file_size_base * 2; + options.max_bytes_for_level_multiplier = 2; + options.compression = kNoCompression; + options = CurrentOptions(options); + options.env = env_; + options.enable_thread_tracking = true; + const int kNumL0Files = 4; + options.level0_file_num_compaction_trigger = kNumL0Files; + for (int tests = 0; tests < 2; ++tests) { + TryReopen(options); + // Each compaction will run at least 2 seconds, which allows + // the test to capture the status of compaction with fewer + // false alarm. + const int kCompactionDelayMicro = 2000000; + ThreadStatusUtil::TEST_SetOperationDelay( + ThreadStatus::OP_COMPACTION, kCompactionDelayMicro); + + Random rnd(301); + for (int key = kEntriesPerBuffer * kNumL0Files; key >= 0; --key) { + ASSERT_OK(Put(ToString(key), RandomString(&rnd, kTestValueSize))); + } + + // wait for compaction to be scheduled + env_->SleepForMicroseconds(500000); + + // check how many threads are doing compaction using GetThreadList + std::vector thread_list; + Status s = env_->GetThreadList(&thread_list); + ASSERT_OK(s); + int compaction_count = 0; + for (auto thread : thread_list) { + if (thread.operation_type == ThreadStatus::OP_COMPACTION) { + compaction_count++; + } + } + + if (options.enable_thread_tracking) { + // expecting one single L0 to L1 compaction + ASSERT_EQ(compaction_count, 1); + } else { + // If thread tracking is not enabled, compaction count should be 0. 
+ ASSERT_EQ(compaction_count, 0); + } + + ThreadStatusUtil::TEST_SetOperationDelay( + ThreadStatus::OP_COMPACTION, 0); + + // repeat the test with disabling thread tracking. + options.enable_thread_tracking = false; + } +} + +TEST(DBTest, ThreadStatusMultipleCompaction) { + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; + const int kEntriesPerBuffer = 10; + const int kNumL0Files = 4; + + const int kHighPriCount = 3; + const int kLowPriCount = 5; + env_->SetBackgroundThreads(kHighPriCount, Env::HIGH); + env_->SetBackgroundThreads(kLowPriCount, Env::LOW); + + Options options; + options.create_if_missing = true; + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.max_bytes_for_level_base = + options.target_file_size_base * kNumL0Files; + options.compression = kNoCompression; + options = CurrentOptions(options); + options.env = env_; + options.enable_thread_tracking = true; + options.level0_file_num_compaction_trigger = kNumL0Files; + options.max_bytes_for_level_multiplier = 2; + options.max_background_compactions = kLowPriCount; + + for (int tests = 0; tests < 2; ++tests) { + TryReopen(options); + Random rnd(301); + + int max_compaction_count = 0; + std::vector thread_list; + const int kCompactionDelayMicro = 20000; + ThreadStatusUtil::TEST_SetOperationDelay( + ThreadStatus::OP_COMPACTION, kCompactionDelayMicro); + + // Make rocksdb busy + int key = 0; + for (int file = 0; file < 64 * kNumL0Files; ++file) { + for (int k = 0; k < kEntriesPerBuffer; ++k) { + ASSERT_OK(Put(ToString(key++), RandomString(&rnd, kTestValueSize))); + } + + // check how many threads are doing compaction using GetThreadList + int compaction_count = 0; + Status s = env_->GetThreadList(&thread_list); + for (auto thread : thread_list) { + if (thread.operation_type == ThreadStatus::OP_COMPACTION) 
{ + compaction_count++; + } + } + + // Record the max number of compactions at a time. + if (max_compaction_count < compaction_count) { + max_compaction_count = compaction_count; + } + } + + if (options.enable_thread_tracking) { + // Expect rocksdb to at least utilize 60% of the compaction threads. + ASSERT_GE(1.0 * max_compaction_count, + 0.6 * options.max_background_compactions); + } else { + // If thread tracking is not enabled, compaction count should be 0. + ASSERT_EQ(max_compaction_count, 0); + } + + // repeat the test with disabling thread tracking. + options.enable_thread_tracking = false; + } + + ThreadStatusUtil::TEST_SetOperationDelay( + ThreadStatus::OP_COMPACTION, 0); +} + +#endif // ROCKSDB_USING_THREAD_STATUS + +TEST(DBTest, DynamicCompactionOptions) { + // minimum write buffer size is enforced at 64KB + const uint64_t k32KB = 1 << 15; + const uint64_t k64KB = 1 << 16; + const uint64_t k128KB = 1 << 17; + const uint64_t k1MB = 1 << 20; + const uint64_t k4KB = 1 << 12; + Options options; + options.env = env_; + options.create_if_missing = true; + options.compression = kNoCompression; + options.hard_rate_limit = 1.1; + options.write_buffer_size = k64KB; + options.max_write_buffer_number = 2; + // Compaction related options + options.level0_file_num_compaction_trigger = 3; + options.level0_slowdown_writes_trigger = 4; + options.level0_stop_writes_trigger = 8; + options.max_grandparent_overlap_factor = 10; + options.expanded_compaction_factor = 25; + options.source_compaction_factor = 1; + options.target_file_size_base = k64KB; + options.target_file_size_multiplier = 1; + options.max_bytes_for_level_base = k128KB; + options.max_bytes_for_level_multiplier = 4; + + // Block flush thread and disable compaction thread + env_->SetBackgroundThreads(1, Env::LOW); + env_->SetBackgroundThreads(1, Env::HIGH); + DestroyAndReopen(options); + + auto gen_l0_kb = [this](int start, int size, int stride) { + Random rnd(301); + for (int i = 0; i < size; i++) { + 
ASSERT_OK(Put(Key(start + stride * i), RandomString(&rnd, 1024))); + } + dbfull()->TEST_WaitForFlushMemTable(); + }; + + // Write 3 files that have the same key range. + // Since level0_file_num_compaction_trigger is 3, compaction should be + // triggered. The compaction should result in one L1 file + gen_l0_kb(0, 64, 1); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + gen_l0_kb(0, 64, 1); + ASSERT_EQ(NumTableFilesAtLevel(0), 2); + gen_l0_kb(0, 64, 1); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,1", FilesPerLevel()); + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(1U, metadata.size()); + ASSERT_LE(metadata[0].size, k64KB + k4KB); + ASSERT_GE(metadata[0].size, k64KB - k4KB); + + // Test compaction trigger and target_file_size_base + // Reduce compaction trigger to 2, and reduce L1 file size to 32KB. + // Writing to 64KB L0 files should trigger a compaction. Since these + // 2 L0 files have the same key range, compaction merge them and should + // result in 2 32KB L1 files. + ASSERT_OK(dbfull()->SetOptions({ + {"level0_file_num_compaction_trigger", "2"}, + {"target_file_size_base", ToString(k32KB) } + })); + + gen_l0_kb(0, 64, 1); + ASSERT_EQ("1,1", FilesPerLevel()); + gen_l0_kb(0, 64, 1); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,2", FilesPerLevel()); + metadata.clear(); + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(2U, metadata.size()); + ASSERT_LE(metadata[0].size, k32KB + k4KB); + ASSERT_GE(metadata[0].size, k32KB - k4KB); + ASSERT_LE(metadata[1].size, k32KB + k4KB); + ASSERT_GE(metadata[1].size, k32KB - k4KB); + + // Test max_bytes_for_level_base + // Increase level base size to 256KB and write enough data that will + // fill L1 and L2. L1 size should be around 256KB while L2 size should be + // around 256KB x 4. 
+ ASSERT_OK(dbfull()->SetOptions({ + {"max_bytes_for_level_base", ToString(k1MB) } + })); + + // writing 96 x 64KB => 6 * 1024KB + // (L1 + L2) = (1 + 4) * 1024KB + for (int i = 0; i < 96; ++i) { + gen_l0_kb(i, 64, 96); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_GT(SizeAtLevel(1), k1MB / 2); + ASSERT_LT(SizeAtLevel(1), k1MB + k1MB / 2); + + // Within (0.5, 1.5) of 4MB. + ASSERT_GT(SizeAtLevel(2), 2 * k1MB); + ASSERT_LT(SizeAtLevel(2), 6 * k1MB); + + // Test max_bytes_for_level_multiplier and + // max_bytes_for_level_base. Now, reduce both mulitplier and level base, + // After filling enough data that can fit in L1 - L3, we should see L1 size + // reduces to 128KB from 256KB which was asserted previously. Same for L2. + ASSERT_OK(dbfull()->SetOptions({ + {"max_bytes_for_level_multiplier", "2"}, + {"max_bytes_for_level_base", ToString(k128KB) } + })); + + // writing 20 x 64KB = 10 x 128KB + // (L1 + L2 + L3) = (1 + 2 + 4) * 128KB + for (int i = 0; i < 20; ++i) { + gen_l0_kb(i, 64, 32); + } + dbfull()->TEST_WaitForCompact(); + uint64_t total_size = + SizeAtLevel(1) + SizeAtLevel(2) + SizeAtLevel(3); + ASSERT_TRUE(total_size < k128KB * 7 * 1.5); + + // Test level0_stop_writes_trigger. + // Clean up memtable and L0. Block compaction threads. If continue to write + // and flush memtables. 
We should see put timeout after 8 memtable flushes + // since level0_stop_writes_trigger = 8 + dbfull()->CompactRange(nullptr, nullptr); + // Block compaction + SleepingBackgroundTask sleeping_task_low1; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low1, + Env::Priority::LOW); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + int count = 0; + Random rnd(301); + WriteOptions wo; + wo.timeout_hint_us = 10000; + while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 64) { + dbfull()->TEST_FlushMemTable(true); + count++; + } + // Stop trigger = 8 + ASSERT_EQ(count, 8); + // Unblock + sleeping_task_low1.WakeUp(); + sleeping_task_low1.WaitUntilDone(); + + // Now reduce level0_stop_writes_trigger to 6. Clear up memtables and L0. + // Block compaction thread again. Perform the put and memtable flushes + // until we see timeout after 6 memtable flushes. + ASSERT_OK(dbfull()->SetOptions({ + {"level0_stop_writes_trigger", "6"} + })); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + + // Block compaction + SleepingBackgroundTask sleeping_task_low2; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low2, + Env::Priority::LOW); + count = 0; + while (Put(Key(count), RandomString(&rnd, 1024), wo).ok() && count < 64) { + dbfull()->TEST_FlushMemTable(true); + count++; + } + ASSERT_EQ(count, 6); + // Unblock + sleeping_task_low2.WakeUp(); + sleeping_task_low2.WaitUntilDone(); + + // Test disable_auto_compactions + // Compaction thread is unblocked but auto compaction is disabled. Write + // 4 L0 files and compaction should be triggered. If auto compaction is + // disabled, then TEST_WaitForCompact will be waiting for nothing. Number of + // L0 files do not change after the call. 
+ ASSERT_OK(dbfull()->SetOptions({ + {"disable_auto_compactions", "true"} + })); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + + for (int i = 0; i < 4; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + // Wait for compaction so that put won't timeout + dbfull()->TEST_FlushMemTable(true); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(NumTableFilesAtLevel(0), 4); + + // Enable auto compaction and perform the same test, # of L0 files should be + // reduced after compaction. + ASSERT_OK(dbfull()->SetOptions({ + {"disable_auto_compactions", "false"} + })); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + + for (int i = 0; i < 4; ++i) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 1024))); + // Wait for compaction so that put won't timeout + dbfull()->TEST_FlushMemTable(true); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_LT(NumTableFilesAtLevel(0), 4); + + // Test for hard_rate_limit. + // First change max_bytes_for_level_base to a big value and populate + // L1 - L3. Then thrink max_bytes_for_level_base and disable auto compaction + // at the same time, we should see some level with score greater than 2. 
+ ASSERT_OK(dbfull()->SetOptions({ + {"max_bytes_for_level_base", ToString(k1MB) } + })); + // writing 40 x 64KB = 10 x 256KB + // (L1 + L2 + L3) = (1 + 2 + 4) * 256KB + for (int i = 0; i < 40; ++i) { + gen_l0_kb(i, 64, 32); + } + dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE((SizeAtLevel(1) > k1MB * 0.8 && + SizeAtLevel(1) < k1MB * 1.2) || + (SizeAtLevel(2) > 2 * k1MB * 0.8 && + SizeAtLevel(2) < 2 * k1MB * 1.2) || + (SizeAtLevel(3) > 4 * k1MB * 0.8 && + SizeAtLevel(3) < 4 * k1MB * 1.2)); + // Reduce max_bytes_for_level_base and disable compaction at the same time + // This should cause score to increase + ASSERT_OK(dbfull()->SetOptions({ + {"disable_auto_compactions", "true"}, + {"max_bytes_for_level_base", "65536"}, + })); + ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024))); + dbfull()->TEST_FlushMemTable(true); + + // Check score is above 2 + ASSERT_TRUE(SizeAtLevel(1) / k64KB > 2 || + SizeAtLevel(2) / k64KB > 4 || + SizeAtLevel(3) / k64KB > 8); + + // Enfoce hard rate limit. Now set hard_rate_limit to 2, + // we should start to see put delay (1000 us) and timeout as a result + // (L0 score is not regulated by this limit). + ASSERT_OK(dbfull()->SetOptions({ + {"hard_rate_limit", "2"} + })); + ASSERT_OK(Put(Key(count), RandomString(&rnd, 1024))); + dbfull()->TEST_FlushMemTable(true); + + // Hard rate limit slow down for 1000 us, so default 10ms should be ok + ASSERT_TRUE(Put(Key(count), RandomString(&rnd, 1024), wo).ok()); + wo.timeout_hint_us = 500; + ASSERT_TRUE(Put(Key(count), RandomString(&rnd, 1024), wo).IsTimedOut()); + + // Lift the limit and no timeout + ASSERT_OK(dbfull()->SetOptions({ + {"hard_rate_limit", "100"} + })); + dbfull()->TEST_FlushMemTable(true); + ASSERT_TRUE(Put(Key(count), RandomString(&rnd, 1024), wo).ok()); + + // Test max_mem_compaction_level. 
+ // Destory DB and start from scratch + options.max_background_compactions = 1; + options.max_background_flushes = 0; + options.max_mem_compaction_level = 2; + DestroyAndReopen(options); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 0); + + ASSERT_TRUE(Put("max_mem_compaction_level_key", + RandomString(&rnd, 8)).ok()); + dbfull()->TEST_FlushMemTable(true); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + ASSERT_EQ(NumTableFilesAtLevel(2), 1); + + ASSERT_TRUE(Put("max_mem_compaction_level_key", + RandomString(&rnd, 8)).ok()); + // Set new value and it becomes effective in this flush + ASSERT_OK(dbfull()->SetOptions({ + {"max_mem_compaction_level", "1"} + })); + dbfull()->TEST_FlushMemTable(true); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); + ASSERT_EQ(NumTableFilesAtLevel(2), 1); + + ASSERT_TRUE(Put("max_mem_compaction_level_key", + RandomString(&rnd, 8)).ok()); + // Set new value and it becomes effective in this flush + ASSERT_OK(dbfull()->SetOptions({ + {"max_mem_compaction_level", "0"} + })); + dbfull()->TEST_FlushMemTable(true); + ASSERT_EQ(NumTableFilesAtLevel(0), 1); + ASSERT_EQ(NumTableFilesAtLevel(1), 1); + ASSERT_EQ(NumTableFilesAtLevel(2), 1); +} + +TEST(DBTest, FileCreationRandomFailure) { + Options options; + options.env = env_; + options.create_if_missing = true; + options.write_buffer_size = 100000; // Small write buffer + options.target_file_size_base = 200000; + options.max_bytes_for_level_base = 1000000; + options.max_bytes_for_level_multiplier = 2; + + DestroyAndReopen(options); + Random rnd(301); + + const int kTestSize = kCDTKeysPerBuffer * 4096; + const int kTotalIteration = 100; + // the second half of the test involves in random failure + // of file creation. 
+ const int kRandomFailureTest = kTotalIteration / 2; + std::vector values; + for (int i = 0; i < kTestSize; ++i) { + values.push_back("NOT_FOUND"); + } + for (int j = 0; j < kTotalIteration; ++j) { + if (j == kRandomFailureTest) { + env_->non_writeable_rate_.store(90); + } + for (int k = 0; k < kTestSize; ++k) { + // here we expect some of the Put fails. + std::string value = RandomString(&rnd, 100); + Status s = Put(Key(k), Slice(value)); + if (s.ok()) { + // update the latest successful put + values[k] = value; + } + // But everything before we simulate the failure-test should succeed. + if (j < kRandomFailureTest) { + ASSERT_OK(s); + } + } + } + + // If rocksdb does not do the correct job, internal assert will fail here. + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + + // verify we have the latest successful update + for (int k = 0; k < kTestSize; ++k) { + auto v = Get(Key(k)); + ASSERT_EQ(v, values[k]); + } + + // reopen and reverify we have the latest successful update + env_->non_writeable_rate_.store(0); + Reopen(options); + for (int k = 0; k < kTestSize; ++k) { + auto v = Get(Key(k)); + ASSERT_EQ(v, values[k]); + } +} + +TEST(DBTest, PartialCompactionFailure) { + Options options; + const int kKeySize = 16; + const int kKvSize = 1000; + const int kKeysPerBuffer = 100; + const int kNumL1Files = 5; + options.create_if_missing = true; + options.write_buffer_size = kKeysPerBuffer * kKvSize; + options.max_write_buffer_number = 2; + options.target_file_size_base = + options.write_buffer_size * + (options.max_write_buffer_number - 1); + options.level0_file_num_compaction_trigger = kNumL1Files; + options.max_bytes_for_level_base = + options.level0_file_num_compaction_trigger * + options.target_file_size_base; + options.max_bytes_for_level_multiplier = 2; + options.compression = kNoCompression; + + env_->SetBackgroundThreads(1, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + // stop the compaction thread until we simulate the 
file creation failure. + SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + options.env = env_; + + DestroyAndReopen(options); + + const int kNumInsertedKeys = + options.level0_file_num_compaction_trigger * + (options.max_write_buffer_number - 1) * + kKeysPerBuffer; + + Random rnd(301); + std::vector keys; + std::vector values; + for (int k = 0; k < kNumInsertedKeys; ++k) { + keys.emplace_back(RandomString(&rnd, kKeySize)); + values.emplace_back(RandomString(&rnd, kKvSize - kKeySize)); + ASSERT_OK(Put(Slice(keys[k]), Slice(values[k]))); + } + + dbfull()->TEST_FlushMemTable(true); + // Make sure the number of L0 files can trigger compaction. + ASSERT_GE(NumTableFilesAtLevel(0), + options.level0_file_num_compaction_trigger); + + auto previous_num_level0_files = NumTableFilesAtLevel(0); + + // Fail the first file creation. + env_->non_writable_count_ = 1; + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + + // Expect compaction to fail here as one file will fail its + // creation. + ASSERT_TRUE(!dbfull()->TEST_WaitForCompact().ok()); + + // Verify L0 -> L1 compaction does fail. + ASSERT_EQ(NumTableFilesAtLevel(1), 0); + + // Verify all L0 files are still there. + ASSERT_EQ(NumTableFilesAtLevel(0), previous_num_level0_files); + + // All key-values must exist after compaction fails. + for (int k = 0; k < kNumInsertedKeys; ++k) { + ASSERT_EQ(values[k], Get(keys[k])); + } + + env_->non_writable_count_ = 0; + + // Make sure RocksDB will not get into corrupted state. + Reopen(options); + + // Verify again after reopen. 
+ for (int k = 0; k < kNumInsertedKeys; ++k) { + ASSERT_EQ(values[k], Get(keys[k])); + } +} + +TEST(DBTest, DynamicMiscOptions) { + // Test max_sequential_skip_in_iterations + Options options; + options.env = env_; + options.create_if_missing = true; + options.max_sequential_skip_in_iterations = 16; + options.compression = kNoCompression; + options.statistics = rocksdb::CreateDBStatistics(); + DestroyAndReopen(options); + + auto assert_reseek_count = [this, &options](int key_start, int num_reseek) { + int key0 = key_start; + int key1 = key_start + 1; + int key2 = key_start + 2; + Random rnd(301); + ASSERT_OK(Put(Key(key0), RandomString(&rnd, 8))); + for (int i = 0; i < 10; ++i) { + ASSERT_OK(Put(Key(key1), RandomString(&rnd, 8))); + } + ASSERT_OK(Put(Key(key2), RandomString(&rnd, 8))); + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + iter->Seek(Key(key1)); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Key(key1)), 0); + iter->Next(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->key().compare(Key(key2)), 0); + ASSERT_EQ(num_reseek, + TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION)); + }; + // No reseek + assert_reseek_count(100, 0); + + ASSERT_OK(dbfull()->SetOptions({ + {"max_sequential_skip_in_iterations", "4"} + })); + // Clear memtable and make new option effective + dbfull()->TEST_FlushMemTable(true); + // Trigger reseek + assert_reseek_count(200, 1); + + ASSERT_OK(dbfull()->SetOptions({ + {"max_sequential_skip_in_iterations", "16"} + })); + // Clear memtable and make new option effective + dbfull()->TEST_FlushMemTable(true); + // No reseek + assert_reseek_count(300, 1); +} + +TEST(DBTest, DontDeletePendingOutputs) { + Options options; + options.env = env_; + options.create_if_missing = true; + DestroyAndReopen(options); + + // Every time we write to a table file, call FOF/POF with full DB scan. 
This + // will make sure our pending_outputs_ protection work correctly + std::function purge_obsolete_files_function = [&]() { + JobContext job_context; + dbfull()->TEST_LockMutex(); + dbfull()->FindObsoleteFiles(&job_context, true /*force*/); + dbfull()->TEST_UnlockMutex(); + dbfull()->PurgeObsoleteFiles(job_context); + }; + + env_->table_write_callback_ = &purge_obsolete_files_function; + + for (int i = 0; i < 2; ++i) { + ASSERT_OK(Put("a", "begin")); + ASSERT_OK(Put("z", "end")); + ASSERT_OK(Flush()); + } + + // If pending output guard does not work correctly, PurgeObsoleteFiles() will + // delete the file that Compaction is trying to create, causing this: error + // db/db_test.cc:975: IO error: + // /tmp/rocksdbtest-1552237650/db_test/000009.sst: No such file or directory + Compact("a", "b"); +} + +TEST(DBTest, DontDeleteMovedFile) { + // This test triggers move compaction and verifies that the file is not + // deleted when it's part of move compaction + Options options = CurrentOptions(); + options.env = env_; + options.create_if_missing = true; + options.max_bytes_for_level_base = 1024 * 1024; // 1 MB + options.level0_file_num_compaction_trigger = + 2; // trigger compaction when we have 2 files + DestroyAndReopen(options); + + Random rnd(301); + // Create two 1MB sst files + for (int i = 0; i < 2; ++i) { + // Create 1MB sst file + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024))); + } + ASSERT_OK(Flush()); + } + // this should execute both L0->L1 and L1->(move)->L2 compactions + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + // If the moved file is actually deleted (the move-safeguard in + // ~Version::Version() is not there), we get this failure: + // Corruption: Can't access /000009.sst + Reopen(options); +} + +TEST(DBTest, DeleteMovedFileAfterCompaction) { + // iter 1 -- delete_obsolete_files_period_micros == 0 + for (int iter = 0; iter < 2; ++iter) { + // This test triggers move 
compaction and verifies that the file is not + // deleted when it's part of move compaction + Options options = CurrentOptions(); + options.env = env_; + if (iter == 1) { + options.delete_obsolete_files_period_micros = 0; + } + options.create_if_missing = true; + options.level0_file_num_compaction_trigger = + 2; // trigger compaction when we have 2 files + DestroyAndReopen(options); + + Random rnd(301); + // Create two 1MB sst files + for (int i = 0; i < 2; ++i) { + // Create 1MB sst file + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024))); + } + ASSERT_OK(Flush()); + } + // this should execute L0->L1 + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,1", FilesPerLevel(0)); + + // block compactions + SleepingBackgroundTask sleeping_task; + env_->Schedule(&SleepingBackgroundTask::DoSleepTask, &sleeping_task, + Env::Priority::LOW); + + options.max_bytes_for_level_base = 1024 * 1024; // 1 MB + Reopen(options); + std::unique_ptr iterator(db_->NewIterator(ReadOptions())); + ASSERT_EQ("0,1", FilesPerLevel(0)); + // let compactions go + sleeping_task.WakeUp(); + sleeping_task.WaitUntilDone(); + + // this should execute L1->L2 (move) + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(metadata.size(), 1U); + auto moved_file_name = metadata[0].name; + + // Create two more 1MB sst files + for (int i = 0; i < 2; ++i) { + // Create 1MB sst file + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(Key(i * 50 + j + 100), RandomString(&rnd, 10 * 1024))); + } + ASSERT_OK(Flush()); + } + // this should execute both L0->L1 and L1->L2 (merge with previous file) + dbfull()->TEST_WaitForCompact(); + + ASSERT_EQ("0,0,2", FilesPerLevel(0)); + + // iterator is holding the file + ASSERT_TRUE(env_->FileExists(dbname_ + "/" + moved_file_name)); + + iterator.reset(); + + // this file should have been compacted away + 
ASSERT_TRUE(!env_->FileExists(dbname_ + "/" + moved_file_name)); + } +} + +TEST(DBTest, L0L1L2AndUpHitCounter) { + Options options = CurrentOptions(); + options.write_buffer_size = 32 * 1024; + options.target_file_size_base = 32 * 1024; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.max_bytes_for_level_base = 64 * 1024; + options.max_write_buffer_number = 2; + options.max_background_compactions = 8; + options.max_background_flushes = 8; + options.statistics = rocksdb::CreateDBStatistics(); + CreateAndReopenWithCF({"mypikachu"}, options); + + int numkeys = 20000; + for (int i = 0; i < numkeys; i++) { + ASSERT_OK(Put(1, Key(i), "val")); + } + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L0)); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L1)); + ASSERT_EQ(0, TestGetTickerCount(options, GET_HIT_L2_AND_UP)); + + ASSERT_OK(Flush(1)); + dbfull()->TEST_WaitForCompact(); + + for (int i = 0; i < numkeys; i++) { + ASSERT_EQ(Get(1, Key(i)), "val"); + } + + ASSERT_GT(TestGetTickerCount(options, GET_HIT_L0), 100); + ASSERT_GT(TestGetTickerCount(options, GET_HIT_L1), 100); + ASSERT_GT(TestGetTickerCount(options, GET_HIT_L2_AND_UP), 100); + + ASSERT_EQ(numkeys, TestGetTickerCount(options, GET_HIT_L0) + + TestGetTickerCount(options, GET_HIT_L1) + + TestGetTickerCount(options, GET_HIT_L2_AND_UP)); +} + +TEST(DBTest, EncodeDecompressedBlockSizeTest) { + // iter 0 -- zlib + // iter 1 -- bzip2 + // iter 2 -- lz4 + // iter 3 -- lz4HC + CompressionType compressions[] = {kZlibCompression, kBZip2Compression, + kLZ4Compression, kLZ4HCCompression}; + for (int iter = 0; iter < 4; ++iter) { + // first_table_version 1 -- generate with table_version == 1, read with + // table_version == 2 + // first_table_version 2 -- generate with table_version == 2, read with + // table_version == 1 + for (int first_table_version = 1; first_table_version <= 2; + ++first_table_version) { + 
BlockBasedTableOptions table_options; + table_options.format_version = first_table_version; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + Options options = CurrentOptions(); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + options.create_if_missing = true; + options.compression = compressions[iter]; + DestroyAndReopen(options); + + int kNumKeysWritten = 100000; + + Random rnd(301); + for (int i = 0; i < kNumKeysWritten; ++i) { + // compressible string + ASSERT_OK(Put(Key(i), RandomString(&rnd, 128) + std::string(128, 'a'))); + } + + table_options.format_version = first_table_version == 1 ? 2 : 1; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + Reopen(options); + for (int i = 0; i < kNumKeysWritten; ++i) { + auto r = Get(Key(i)); + ASSERT_EQ(r.substr(128), std::string(128, 'a')); + } + } + } +} + +TEST(DBTest, MutexWaitStats) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + CreateAndReopenWithCF({"pikachu"}, options); + const int64_t kMutexWaitDelay = 100; + ThreadStatusUtil::TEST_SetStateDelay( + ThreadStatus::STATE_MUTEX_WAIT, kMutexWaitDelay); + ASSERT_OK(Put("hello", "rocksdb")); + ASSERT_GE(TestGetTickerCount( + options, DB_MUTEX_WAIT_MICROS), kMutexWaitDelay); + ThreadStatusUtil::TEST_SetStateDelay( + ThreadStatus::STATE_MUTEX_WAIT, 0); +} + +// This reproduces a bug where we don't delete a file because when it was +// supposed to be deleted, it was blocked by pending_outputs +// Consider: +// 1. current file_number is 13 +// 2. compaction (1) starts, blocks deletion of all files starting with 13 +// (pending outputs) +// 3. file 13 is created by compaction (2) +// 4. file 13 is consumed by compaction (3) and file 15 was created. Since file +// 13 has no references, it is put into VersionSet::obsolete_files_ +// 5. FindObsoleteFiles() gets file 13 from VersionSet::obsolete_files_. 
File 13 +// is deleted from obsolete_files_ set. +// 6. PurgeObsoleteFiles() tries to delete file 13, but this file is blocked by +// pending outputs since compaction (1) is still running. It is not deleted and +// it is not present in obsolete_files_ anymore. Therefore, we never delete it. +TEST(DBTest, DeleteObsoleteFilesPendingOutputs) { + Options options = CurrentOptions(); + options.env = env_; + options.write_buffer_size = 2 * 1024 * 1024; // 2 MB + options.max_bytes_for_level_base = 1024 * 1024; // 1 MB + options.level0_file_num_compaction_trigger = + 2; // trigger compaction when we have 2 files + options.max_background_flushes = 2; + options.max_background_compactions = 2; + Reopen(options); + + Random rnd(301); + // Create two 1MB sst files + for (int i = 0; i < 2; ++i) { + // Create 1MB sst file + for (int j = 0; j < 100; ++j) { + ASSERT_OK(Put(Key(i * 50 + j), RandomString(&rnd, 10 * 1024))); + } + ASSERT_OK(Flush()); + } + // this should execute both L0->L1 and L1->(move)->L2 compactions + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ("0,0,1", FilesPerLevel(0)); + + SleepingBackgroundTask blocking_thread; + port::Mutex mutex_; + bool already_blocked(false); + + // block the flush + std::function block_first_time = [&]() { + bool blocking = false; + { + MutexLock l(&mutex_); + if (!already_blocked) { + blocking = true; + already_blocked = true; + } + } + if (blocking) { + blocking_thread.DoSleep(); + } + }; + env_->table_write_callback_ = &block_first_time; + // Create 1MB sst file + for (int j = 0; j < 256; ++j) { + ASSERT_OK(Put(Key(j), RandomString(&rnd, 10 * 1024))); + } + // this should trigger a flush, which is blocked with block_first_time + // pending_file is protecting all the files created after + + ASSERT_OK(dbfull()->TEST_CompactRange(2, nullptr, nullptr)); + + ASSERT_EQ("0,0,0,1", FilesPerLevel(0)); + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(metadata.size(), 1U); + auto file_on_L2 = metadata[0].name; + + 
ASSERT_OK(dbfull()->TEST_CompactRange(3, nullptr, nullptr)); + ASSERT_EQ("0,0,0,0,1", FilesPerLevel(0)); + + // finish the flush! + blocking_thread.WakeUp(); + blocking_thread.WaitUntilDone(); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ("1,0,0,0,1", FilesPerLevel(0)); + + metadata.clear(); + db_->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(metadata.size(), 2U); + + // This file should have been deleted + ASSERT_TRUE(!env_->FileExists(dbname_ + "/" + file_on_L2)); } } // namespace rocksdb diff --git a/db/dbformat.cc b/db/dbformat.cc index baeb86802..f0bd9d01e 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -127,8 +127,8 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const { } } -LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) { - size_t usize = user_key.size(); +LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s) { + size_t usize = _user_key.size(); size_t needed = usize + 13; // A conservative estimate char* dst; if (needed <= sizeof(space_)) { @@ -137,9 +137,10 @@ LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) { dst = new char[needed]; } start_ = dst; - dst = EncodeVarint32(dst, usize + 8); + // NOTE: We don't support users keys of more than 2GB :) + dst = EncodeVarint32(dst, static_cast(usize + 8)); kstart_ = dst; - memcpy(dst, user_key.data(), usize); + memcpy(dst, _user_key.data(), usize); dst += usize; EncodeFixed64(dst, PackSequenceAndType(s, kValueTypeForSeek)); dst += 8; diff --git a/db/dbformat.h b/db/dbformat.h index eb5d8ed53..9c7c8dcf1 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -132,8 +132,8 @@ class InternalKey { std::string rep_; public: InternalKey() { } // Leave rep_ as empty to indicate it is invalid - InternalKey(const Slice& user_key, SequenceNumber s, ValueType t) { - AppendInternalKey(&rep_, ParsedInternalKey(user_key, s, t)); + InternalKey(const Slice& _user_key, SequenceNumber s, ValueType t) { + AppendInternalKey(&rep_, ParsedInternalKey(_user_key, s, t)); } 
bool Valid() const { @@ -201,18 +201,24 @@ class LookupKey { public: // Initialize *this for looking up user_key at a snapshot with // the specified sequence number. - LookupKey(const Slice& user_key, SequenceNumber sequence); + LookupKey(const Slice& _user_key, SequenceNumber sequence); ~LookupKey(); // Return a key suitable for lookup in a MemTable. - Slice memtable_key() const { return Slice(start_, end_ - start_); } + Slice memtable_key() const { + return Slice(start_, static_cast(end_ - start_)); + } // Return an internal key (suitable for passing to an internal iterator) - Slice internal_key() const { return Slice(kstart_, end_ - kstart_); } + Slice internal_key() const { + return Slice(kstart_, static_cast(end_ - kstart_)); + } // Return the user key - Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); } + Slice user_key() const { + return Slice(kstart_, static_cast(end_ - kstart_ - 8)); + } private: // We construct a char array of the form: @@ -244,7 +250,7 @@ class IterKey { Slice GetKey() const { return Slice(key_, key_size_); } - const size_t Size() { return key_size_; } + size_t Size() { return key_size_; } void Clear() { key_size_ = 0; } @@ -319,8 +325,8 @@ class IterKey { void EncodeLengthPrefixedKey(const Slice& key) { auto size = key.size(); - EnlargeBufferIfNeeded(size + VarintLength(size)); - char* ptr = EncodeVarint32(key_, size); + EnlargeBufferIfNeeded(size + static_cast(VarintLength(size))); + char* ptr = EncodeVarint32(key_, static_cast(size)); memcpy(ptr, key.data(), size); } diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 14f0324c1..ac8c0e7b0 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -34,6 +34,8 @@ class DeleteFileTest { DeleteFileTest() { db_ = nullptr; env_ = Env::Default(); + options_.enable_thread_tracking = true; + options_.max_background_flushes = 0; options_.write_buffer_size = 1024*1024*1000; options_.target_file_size_base = 1024*1024*1000; options_.max_bytes_for_level_base = 
1024*1024*1000; @@ -77,7 +79,7 @@ class DeleteFileTest { options.sync = false; ReadOptions roptions; for (int i = startkey; i < (numkeys + startkey) ; i++) { - std::string temp = std::to_string(i); + std::string temp = ToString(i); Slice key(temp); Slice value(temp); ASSERT_OK(db_->Put(options, key, value)); @@ -147,7 +149,6 @@ class DeleteFileTest { TEST(DeleteFileTest, AddKeysAndQueryLevels) { CreateTwoLevels(); std::vector metadata; - std::vector keysinlevel; db_->GetLiveFilesMetaData(&metadata); std::string level1file = ""; @@ -287,6 +288,75 @@ TEST(DeleteFileTest, DeleteLogFiles) { CloseDB(); } +TEST(DeleteFileTest, DeleteNonDefaultColumnFamily) { + CloseDB(); + DBOptions db_options; + db_options.create_if_missing = true; + db_options.create_missing_column_families = true; + std::vector column_families; + column_families.emplace_back(); + column_families.emplace_back("new_cf", ColumnFamilyOptions()); + + std::vector handles; + rocksdb::DB* db; + ASSERT_OK(DB::Open(db_options, dbname_, column_families, &handles, &db)); + + Random rnd(5); + for (int i = 0; i < 1000; ++i) { + ASSERT_OK(db->Put(WriteOptions(), handles[1], test::RandomKey(&rnd, 10), + test::RandomKey(&rnd, 10))); + } + ASSERT_OK(db->Flush(FlushOptions(), handles[1])); + for (int i = 0; i < 1000; ++i) { + ASSERT_OK(db->Put(WriteOptions(), handles[1], test::RandomKey(&rnd, 10), + test::RandomKey(&rnd, 10))); + } + ASSERT_OK(db->Flush(FlushOptions(), handles[1])); + + std::vector metadata; + db->GetLiveFilesMetaData(&metadata); + ASSERT_EQ(2U, metadata.size()); + ASSERT_EQ("new_cf", metadata[0].column_family_name); + ASSERT_EQ("new_cf", metadata[1].column_family_name); + auto old_file = metadata[0].smallest_seqno < metadata[1].smallest_seqno + ? metadata[0].name + : metadata[1].name; + auto new_file = metadata[0].smallest_seqno > metadata[1].smallest_seqno + ? 
metadata[0].name + : metadata[1].name; + ASSERT_TRUE(db->DeleteFile(new_file).IsInvalidArgument()); + ASSERT_OK(db->DeleteFile(old_file)); + + { + std::unique_ptr itr(db->NewIterator(ReadOptions(), handles[1])); + int count = 0; + for (itr->SeekToFirst(); itr->Valid(); itr->Next()) { + ASSERT_OK(itr->status()); + ++count; + } + ASSERT_EQ(count, 1000); + } + + delete handles[0]; + delete handles[1]; + delete db; + + ASSERT_OK(DB::Open(db_options, dbname_, column_families, &handles, &db)); + { + std::unique_ptr itr(db->NewIterator(ReadOptions(), handles[1])); + int count = 0; + for (itr->SeekToFirst(); itr->Valid(); itr->Next()) { + ASSERT_OK(itr->status()); + ++count; + } + ASSERT_EQ(count, 1000); + } + + delete handles[0]; + delete handles[1]; + delete db; +} + } //namespace rocksdb int main(int argc, char** argv) { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc new file mode 100644 index 000000000..8291f7287 --- /dev/null +++ b/db/fault_injection_test.cc @@ -0,0 +1,742 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright 2014 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +// This test uses a custom Env to keep track of the state of a filesystem as of +// the last "sync". It then checks for data loss errors by purposely dropping +// file data (or entire files) not protected by a "sync". 
+ +#include +#include +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/log_format.h" +#include "db/version_set.h" +#include "rocksdb/cache.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/table.h" +#include "rocksdb/write_batch.h" +#include "util/logging.h" +#include "util/mock_env.h" +#include "util/mutexlock.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +static const int kValueSize = 1000; +static const int kMaxNumValues = 2000; +static const size_t kNumIterations = 3; + +class TestWritableFile; +class FaultInjectionTestEnv; + +namespace { + +// Assume a filename, and not a directory name like "/foo/bar/" +static std::string GetDirName(const std::string filename) { + size_t found = filename.find_last_of("/\\"); + if (found == std::string::npos) { + return ""; + } else { + return filename.substr(0, found); + } +} + +// Trim the tailing "/" in the end of `str` +static std::string TrimDirname(const std::string& str) { + size_t found = str.find_last_not_of("/"); + if (found == std::string::npos) { + return str; + } + return str.substr(0, found + 1); +} + +// Return pair of a full path. +static std::pair GetDirAndName( + const std::string& name) { + std::string dirname = GetDirName(name); + std::string fname = name.substr(dirname.size() + 1); + return std::make_pair(dirname, fname); +} + +// A basic file truncation function suitable for this test. 
+Status Truncate(Env* env, const std::string& filename, uint64_t length) { + unique_ptr orig_file; + const EnvOptions options; + Status s = env->NewSequentialFile(filename, &orig_file, options); + if (!s.ok()) { + fprintf(stderr, "Cannot truncate file %s: %s\n", filename.c_str(), + s.ToString().c_str()); + return s; + } + + char* scratch = new char[length]; + rocksdb::Slice result; + s = orig_file->Read(length, &result, scratch); + if (s.ok()) { + std::string tmp_name = GetDirName(filename) + "/truncate.tmp"; + unique_ptr tmp_file; + s = env->NewWritableFile(tmp_name, &tmp_file, options); + if (s.ok()) { + s = tmp_file->Append(result); + if (s.ok()) { + s = env->RenameFile(tmp_name, filename); + } else { + fprintf(stderr, "Cannot rename file %s to %s: %s\n", tmp_name.c_str(), + filename.c_str(), s.ToString().c_str()); + env->DeleteFile(tmp_name); + } + } + } + if (!s.ok()) { + fprintf(stderr, "Cannot truncate file %s: %s\n", filename.c_str(), + s.ToString().c_str()); + } + + delete[] scratch; + + return s; +} + +struct FileState { + std::string filename_; + ssize_t pos_; + ssize_t pos_at_last_sync_; + ssize_t pos_at_last_flush_; + + explicit FileState(const std::string& filename) + : filename_(filename), + pos_(-1), + pos_at_last_sync_(-1), + pos_at_last_flush_(-1) { } + + FileState() : pos_(-1), pos_at_last_sync_(-1), pos_at_last_flush_(-1) {} + + bool IsFullySynced() const { return pos_ <= 0 || pos_ == pos_at_last_sync_; } + + Status DropUnsyncedData(Env* env) const; + + Status DropRandomUnsyncedData(Env* env, Random* rand) const; +}; + +} // anonymous namespace + +// A wrapper around WritableFile which informs another Env whenever this file +// is written to or sync'ed. 
+class TestWritableFile : public WritableFile { + public: + explicit TestWritableFile(const std::string& fname, + unique_ptr&& f, + FaultInjectionTestEnv* env); + virtual ~TestWritableFile(); + virtual Status Append(const Slice& data); + virtual Status Close(); + virtual Status Flush(); + virtual Status Sync(); + + private: + FileState state_; + unique_ptr target_; + bool writable_file_opened_; + FaultInjectionTestEnv* env_; +}; + +class TestDirectory : public Directory { + public: + explicit TestDirectory(FaultInjectionTestEnv* env, std::string dirname, + Directory* dir) + : env_(env), dirname_(dirname), dir_(dir) {} + ~TestDirectory() {} + + virtual Status Fsync() override; + + private: + FaultInjectionTestEnv* env_; + std::string dirname_; + unique_ptr dir_; +}; + +class FaultInjectionTestEnv : public EnvWrapper { + public: + explicit FaultInjectionTestEnv(Env* base) + : EnvWrapper(base), + filesystem_active_(true) {} + virtual ~FaultInjectionTestEnv() { } + + Status NewDirectory(const std::string& name, + unique_ptr* result) override { + unique_ptr r; + Status s = target()->NewDirectory(name, &r); + ASSERT_OK(s); + if (!s.ok()) { + return s; + } + result->reset(new TestDirectory(this, TrimDirname(name), r.release())); + return Status::OK(); + } + + Status NewWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& soptions) { + Status s = target()->NewWritableFile(fname, result, soptions); + if (s.ok()) { + result->reset(new TestWritableFile(fname, std::move(*result), this)); + // WritableFile doesn't append to files, so if the same file is opened + // again then it will be truncated - so forget our saved state. 
+ UntrackFile(fname); + MutexLock l(&mutex_); + open_files_.insert(fname); + auto dir_and_name = GetDirAndName(fname); + auto& list = dir_to_new_files_since_last_sync_[dir_and_name.first]; + list.insert(dir_and_name.second); + } + return s; + } + + virtual Status DeleteFile(const std::string& f) { + Status s = EnvWrapper::DeleteFile(f); + if (!s.ok()) { + fprintf(stderr, "Cannot delete file %s: %s\n", f.c_str(), + s.ToString().c_str()); + } + ASSERT_OK(s); + if (s.ok()) { + UntrackFile(f); + } + return s; + } + + virtual Status RenameFile(const std::string& s, const std::string& t) { + Status ret = EnvWrapper::RenameFile(s, t); + + if (ret.ok()) { + MutexLock l(&mutex_); + if (db_file_state_.find(s) != db_file_state_.end()) { + db_file_state_[t] = db_file_state_[s]; + db_file_state_.erase(s); + } + + auto sdn = GetDirAndName(s); + auto tdn = GetDirAndName(t); + if (dir_to_new_files_since_last_sync_[sdn.first].erase(sdn.second) != 0) { + auto& tlist = dir_to_new_files_since_last_sync_[tdn.first]; + assert(tlist.find(tdn.second) == tlist.end()); + tlist.insert(tdn.second); + } + } + + return ret; + } + + void WritableFileClosed(const FileState& state) { + MutexLock l(&mutex_); + if (open_files_.find(state.filename_) != open_files_.end()) { + db_file_state_[state.filename_] = state; + open_files_.erase(state.filename_); + } + } + + // For every file that is not fully synced, make a call to `func` with + // FileState of the file as the parameter. 
+ Status DropFileData(std::function func) { + Status s; + MutexLock l(&mutex_); + for (std::map::const_iterator it = + db_file_state_.begin(); + s.ok() && it != db_file_state_.end(); ++it) { + const FileState& state = it->second; + if (!state.IsFullySynced()) { + s = func(target(), state); + } + } + return s; + } + + Status DropUnsyncedFileData() { + return DropFileData([&](Env* env, const FileState& state) { + return state.DropUnsyncedData(env); + }); + } + + Status DropRandomUnsyncedFileData(Random* rnd) { + return DropFileData([&](Env* env, const FileState& state) { + return state.DropRandomUnsyncedData(env, rnd); + }); + } + + Status DeleteFilesCreatedAfterLastDirSync() { + // Because DeleteFile access this container make a copy to avoid deadlock + std::map> map_copy; + { + MutexLock l(&mutex_); + map_copy.insert(dir_to_new_files_since_last_sync_.begin(), + dir_to_new_files_since_last_sync_.end()); + } + + for (auto& pair : map_copy) { + for (std::string name : pair.second) { + Status s = DeleteFile(pair.first + "/" + name); + if (!s.ok()) { + return s; + } + } + } + return Status::OK(); + } + void ResetState() { + MutexLock l(&mutex_); + db_file_state_.clear(); + dir_to_new_files_since_last_sync_.clear(); + SetFilesystemActiveNoLock(true); + } + + void UntrackFile(const std::string& f) { + MutexLock l(&mutex_); + auto dir_and_name = GetDirAndName(f); + dir_to_new_files_since_last_sync_[dir_and_name.first].erase( + dir_and_name.second); + db_file_state_.erase(f); + open_files_.erase(f); + } + + void SyncDir(const std::string& dirname) { + MutexLock l(&mutex_); + dir_to_new_files_since_last_sync_.erase(dirname); + } + + // Setting the filesystem to inactive is the test equivalent to simulating a + // system reset. Setting to inactive will freeze our saved filesystem state so + // that it will stop being recorded. It can then be reset back to the state at + // the time of the reset. 
+ bool IsFilesystemActive() { + MutexLock l(&mutex_); + return filesystem_active_; + } + void SetFilesystemActiveNoLock(bool active) { filesystem_active_ = active; } + void SetFilesystemActive(bool active) { + MutexLock l(&mutex_); + SetFilesystemActiveNoLock(active); + } + void AssertNoOpenFile() { ASSERT_TRUE(open_files_.empty()); } + + private: + port::Mutex mutex_; + std::map db_file_state_; + std::set open_files_; + std::unordered_map> + dir_to_new_files_since_last_sync_; + bool filesystem_active_; // Record flushes, syncs, writes +}; + +Status FileState::DropUnsyncedData(Env* env) const { + ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_; + return Truncate(env, filename_, sync_pos); +} + +Status FileState::DropRandomUnsyncedData(Env* env, Random* rand) const { + ssize_t sync_pos = pos_at_last_sync_ == -1 ? 0 : pos_at_last_sync_; + assert(pos_ >= sync_pos); + int range = static_cast(pos_ - sync_pos); + uint64_t truncated_size = + static_cast(sync_pos) + rand->Uniform(range); + return Truncate(env, filename_, truncated_size); +} + +Status TestDirectory::Fsync() { + env_->SyncDir(dirname_); + return dir_->Fsync(); +} + +TestWritableFile::TestWritableFile(const std::string& fname, + unique_ptr&& f, + FaultInjectionTestEnv* env) + : state_(fname), + target_(std::move(f)), + writable_file_opened_(true), + env_(env) { + assert(target_ != nullptr); + state_.pos_ = 0; +} + +TestWritableFile::~TestWritableFile() { + if (writable_file_opened_) { + Close(); + } +} + +Status TestWritableFile::Append(const Slice& data) { + Status s = target_->Append(data); + if (s.ok() && env_->IsFilesystemActive()) { + state_.pos_ += data.size(); + } + return s; +} + +Status TestWritableFile::Close() { + writable_file_opened_ = false; + Status s = target_->Close(); + if (s.ok()) { + env_->WritableFileClosed(state_); + } + return s; +} + +Status TestWritableFile::Flush() { + Status s = target_->Flush(); + if (s.ok() && env_->IsFilesystemActive()) { + 
state_.pos_at_last_flush_ = state_.pos_; + } + return s; +} + +Status TestWritableFile::Sync() { + if (!env_->IsFilesystemActive()) { + return Status::OK(); + } + // No need to actual sync. + state_.pos_at_last_sync_ = state_.pos_; + return Status::OK(); +} + +class FaultInjectionTest { + protected: + enum OptionConfig { + kDefault, + kDifferentDataDir, + kWalDir, + kSyncWal, + kWalDirSyncWal, + kMultiLevels, + kEnd, + }; + int option_config_; + // When need to make sure data is persistent, sync WAL + bool sync_use_wal_; + // When need to make sure data is persistent, call DB::CompactRange() + bool sync_use_compact_; + + protected: + public: + enum ExpectedVerifResult { kValExpectFound, kValExpectNoError }; + enum ResetMethod { + kResetDropUnsyncedData, + kResetDropRandomUnsyncedData, + kResetDeleteUnsyncedFiles, + kResetDropAndDeleteUnsynced + }; + + std::unique_ptr base_env_; + FaultInjectionTestEnv* env_; + std::string dbname_; + shared_ptr tiny_cache_; + Options options_; + DB* db_; + + FaultInjectionTest() + : option_config_(kDefault), + sync_use_wal_(false), + sync_use_compact_(true), + base_env_(nullptr), + env_(NULL), + db_(NULL) { + NewDB(); + } + + ~FaultInjectionTest() { ASSERT_OK(TearDown()); } + + bool ChangeOptions() { + option_config_++; + if (option_config_ >= kEnd) { + return false; + } else { + if (option_config_ == kMultiLevels) { + base_env_.reset(new MockEnv(Env::Default())); + } + return true; + } + } + + // Return the current option configuration. 
+ Options CurrentOptions() { + sync_use_wal_ = false; + sync_use_compact_ = true; + Options options; + switch (option_config_) { + case kWalDir: + options.wal_dir = test::TmpDir(env_) + "/fault_test_wal"; + break; + case kDifferentDataDir: + options.db_paths.emplace_back(test::TmpDir(env_) + "/fault_test_data", + 1000000U); + break; + case kSyncWal: + sync_use_wal_ = true; + sync_use_compact_ = false; + break; + case kWalDirSyncWal: + options.wal_dir = test::TmpDir(env_) + "/fault_test_wal"; + sync_use_wal_ = true; + sync_use_compact_ = false; + break; + case kMultiLevels: + options.write_buffer_size = 64 * 1024; + options.target_file_size_base = 64 * 1024; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.max_bytes_for_level_base = 128 * 1024; + options.max_write_buffer_number = 2; + options.max_background_compactions = 8; + options.max_background_flushes = 8; + sync_use_wal_ = true; + sync_use_compact_ = false; + break; + default: + break; + } + return options; + } + + Status NewDB() { + assert(db_ == NULL); + assert(tiny_cache_ == nullptr); + assert(env_ == NULL); + + env_ = + new FaultInjectionTestEnv(base_env_ ? 
base_env_.get() : Env::Default()); + + options_ = CurrentOptions(); + options_.env = env_; + options_.paranoid_checks = true; + + BlockBasedTableOptions table_options; + tiny_cache_ = NewLRUCache(100); + table_options.block_cache = tiny_cache_; + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + dbname_ = test::TmpDir() + "/fault_test"; + + ASSERT_OK(DestroyDB(dbname_, options_)); + + options_.create_if_missing = true; + Status s = OpenDB(); + options_.create_if_missing = false; + return s; + } + + Status SetUp() { + Status s = TearDown(); + if (s.ok()) { + s = NewDB(); + } + return s; + } + + Status TearDown() { + CloseDB(); + + Status s = DestroyDB(dbname_, options_); + + delete env_; + env_ = NULL; + + tiny_cache_.reset(); + + return s; + } + + void Build(const WriteOptions& write_options, int start_idx, int num_vals) { + std::string key_space, value_space; + WriteBatch batch; + for (int i = start_idx; i < start_idx + num_vals; i++) { + Slice key = Key(i, &key_space); + batch.Clear(); + batch.Put(key, Value(i, &value_space)); + ASSERT_OK(db_->Write(write_options, &batch)); + } + } + + Status ReadValue(int i, std::string* val) const { + std::string key_space, value_space; + Slice key = Key(i, &key_space); + Value(i, &value_space); + ReadOptions options; + return db_->Get(options, key, val); + } + + Status Verify(int start_idx, int num_vals, + ExpectedVerifResult expected) const { + std::string val; + std::string value_space; + Status s; + for (int i = start_idx; i < start_idx + num_vals && s.ok(); i++) { + Value(i, &value_space); + s = ReadValue(i, &val); + if (s.ok()) { + ASSERT_EQ(value_space, val); + } + if (expected == kValExpectFound) { + if (!s.ok()) { + fprintf(stderr, "Error when read %dth record (expect found): %s\n", i, + s.ToString().c_str()); + return s; + } + } else if (!s.ok() && !s.IsNotFound()) { + fprintf(stderr, "Error when read %dth record: %s\n", i, + s.ToString().c_str()); + return s; + } + } + return Status::OK(); + 
} + + // Return the ith key + Slice Key(int i, std::string* storage) const { + char buf[100]; + snprintf(buf, sizeof(buf), "%016d", i); + storage->assign(buf, strlen(buf)); + return Slice(*storage); + } + + // Return the value to associate with the specified key + Slice Value(int k, std::string* storage) const { + Random r(k); + return test::RandomString(&r, kValueSize, storage); + } + + Status OpenDB() { + delete db_; + db_ = NULL; + env_->ResetState(); + return DB::Open(options_, dbname_, &db_); + } + + void CloseDB() { + delete db_; + db_ = NULL; + } + + void DeleteAllData() { + Iterator* iter = db_->NewIterator(ReadOptions()); + WriteOptions options; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + ASSERT_OK(db_->Delete(WriteOptions(), iter->key())); + } + + delete iter; + + FlushOptions flush_options; + flush_options.wait = true; + db_->Flush(flush_options); + } + + // rnd cannot be null for kResetDropRandomUnsyncedData + void ResetDBState(ResetMethod reset_method, Random* rnd = nullptr) { + env_->AssertNoOpenFile(); + switch (reset_method) { + case kResetDropUnsyncedData: + ASSERT_OK(env_->DropUnsyncedFileData()); + break; + case kResetDropRandomUnsyncedData: + ASSERT_OK(env_->DropRandomUnsyncedFileData(rnd)); + break; + case kResetDeleteUnsyncedFiles: + ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync()); + break; + case kResetDropAndDeleteUnsynced: + ASSERT_OK(env_->DropUnsyncedFileData()); + ASSERT_OK(env_->DeleteFilesCreatedAfterLastDirSync()); + break; + default: + assert(false); + } + } + + void PartialCompactTestPreFault(int num_pre_sync, int num_post_sync) { + DeleteAllData(); + + WriteOptions write_options; + write_options.sync = sync_use_wal_; + + Build(write_options, 0, num_pre_sync); + if (sync_use_compact_) { + db_->CompactRange(nullptr, nullptr); + } + write_options.sync = false; + Build(write_options, num_pre_sync, num_post_sync); + } + + void PartialCompactTestReopenWithFault(ResetMethod reset_method, + int num_pre_sync, int 
num_post_sync, + Random* rnd = nullptr) { + env_->SetFilesystemActive(false); + CloseDB(); + ResetDBState(reset_method, rnd); + ASSERT_OK(OpenDB()); + ASSERT_OK(Verify(0, num_pre_sync, FaultInjectionTest::kValExpectFound)); + ASSERT_OK(Verify(num_pre_sync, num_post_sync, + FaultInjectionTest::kValExpectNoError)); + } + + void NoWriteTestPreFault() { + } + + void NoWriteTestReopenWithFault(ResetMethod reset_method) { + CloseDB(); + ResetDBState(reset_method); + ASSERT_OK(OpenDB()); + } +}; + +TEST(FaultInjectionTest, FaultTest) { + do { + Random rnd(301); + ASSERT_OK(SetUp()); + + for (size_t idx = 0; idx < kNumIterations; idx++) { + int num_pre_sync = rnd.Uniform(kMaxNumValues); + int num_post_sync = rnd.Uniform(kMaxNumValues); + + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + PartialCompactTestReopenWithFault(kResetDropUnsyncedData, num_pre_sync, + num_post_sync); + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(kResetDropUnsyncedData); + + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + PartialCompactTestReopenWithFault(kResetDropRandomUnsyncedData, + num_pre_sync, num_post_sync, &rnd); + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(kResetDropUnsyncedData); + + // Setting a separate data path won't pass the test as we don't sync + // it after creating new files, + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + PartialCompactTestReopenWithFault(kResetDropAndDeleteUnsynced, + num_pre_sync, num_post_sync); + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(kResetDropAndDeleteUnsynced); + + PartialCompactTestPreFault(num_pre_sync, num_post_sync); + // No new files created so we expect all values since no files will be + // dropped. 
+ PartialCompactTestReopenWithFault(kResetDeleteUnsyncedFiles, num_pre_sync, + num_post_sync); + NoWriteTestPreFault(); + NoWriteTestReopenWithFault(kResetDeleteUnsyncedFiles); + } + } while (ChangeOptions()); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/db/file_indexer.cc b/db/file_indexer.cc index 56691bde5..c59036bd6 100644 --- a/db/file_indexer.cc +++ b/db/file_indexer.cc @@ -17,17 +17,16 @@ namespace rocksdb { FileIndexer::FileIndexer(const Comparator* ucmp) : num_levels_(0), ucmp_(ucmp), level_rb_(nullptr) {} -uint32_t FileIndexer::NumLevelIndex() { - return next_level_index_.size(); -} +size_t FileIndexer::NumLevelIndex() const { return next_level_index_.size(); } -uint32_t FileIndexer::LevelIndexSize(uint32_t level) { +size_t FileIndexer::LevelIndexSize(size_t level) const { return next_level_index_[level].num_index; } -void FileIndexer::GetNextLevelIndex( - const uint32_t level, const uint32_t file_index, const int cmp_smallest, - const int cmp_largest, int32_t* left_bound, int32_t* right_bound) { +void FileIndexer::GetNextLevelIndex(const size_t level, const size_t file_index, + const int cmp_smallest, + const int cmp_largest, int32_t* left_bound, + int32_t* right_bound) const { assert(level > 0); // Last level, no hint @@ -69,7 +68,7 @@ void FileIndexer::GetNextLevelIndex( assert(*right_bound <= level_rb_[level + 1]); } -void FileIndexer::UpdateIndex(Arena* arena, const uint32_t num_levels, +void FileIndexer::UpdateIndex(Arena* arena, const size_t num_levels, std::vector* const files) { if (files == nullptr) { return; @@ -90,17 +89,17 @@ void FileIndexer::UpdateIndex(Arena* arena, const uint32_t num_levels, } // L1 - Ln-1 - for (uint32_t level = 1; level < num_levels_ - 1; ++level) { + for (size_t level = 1; level < num_levels_ - 1; ++level) { const auto& upper_files = files[level]; - const int32_t upper_size = upper_files.size(); + const int32_t upper_size = 
static_cast(upper_files.size()); const auto& lower_files = files[level + 1]; - level_rb_[level] = upper_files.size() - 1; + level_rb_[level] = static_cast(upper_files.size()) - 1; if (upper_size == 0) { continue; } IndexLevel& index_level = next_level_index_[level]; index_level.num_index = upper_size; - char* mem = arena->AllocateAligned(upper_size * sizeof(IndexUnit)); + mem = arena->AllocateAligned(upper_size * sizeof(IndexUnit)); index_level.index_units = new (mem) IndexUnit[upper_size]; CalculateLB( @@ -129,7 +128,8 @@ void FileIndexer::UpdateIndex(Arena* arena, const uint32_t num_levels, [](IndexUnit* index, int32_t f_idx) { index->largest_rb = f_idx; }); } - level_rb_[num_levels_ - 1] = files[num_levels_ - 1].size() - 1; + level_rb_[num_levels_ - 1] = + static_cast(files[num_levels_ - 1].size()) - 1; } void FileIndexer::CalculateLB( @@ -137,8 +137,8 @@ void FileIndexer::CalculateLB( const std::vector& lower_files, IndexLevel* index_level, std::function cmp_op, std::function set_index) { - const int32_t upper_size = upper_files.size(); - const int32_t lower_size = lower_files.size(); + const int32_t upper_size = static_cast(upper_files.size()); + const int32_t lower_size = static_cast(lower_files.size()); int32_t upper_idx = 0; int32_t lower_idx = 0; @@ -175,8 +175,8 @@ void FileIndexer::CalculateRB( const std::vector& lower_files, IndexLevel* index_level, std::function cmp_op, std::function set_index) { - const int32_t upper_size = upper_files.size(); - const int32_t lower_size = lower_files.size(); + const int32_t upper_size = static_cast(upper_files.size()); + const int32_t lower_size = static_cast(lower_files.size()); int32_t upper_idx = upper_size - 1; int32_t lower_idx = lower_size - 1; diff --git a/db/file_indexer.h b/db/file_indexer.h index 127b3ee46..e673499ac 100644 --- a/db/file_indexer.h +++ b/db/file_indexer.h @@ -42,19 +42,19 @@ class FileIndexer { public: explicit FileIndexer(const Comparator* ucmp); - uint32_t NumLevelIndex(); + size_t 
NumLevelIndex() const; - uint32_t LevelIndexSize(uint32_t level); + size_t LevelIndexSize(size_t level) const; // Return a file index range in the next level to search for a key based on // smallest and largest key comparision for the current file specified by // level and file_index. When *left_index < *right_index, both index should // be valid and fit in the vector size. - void GetNextLevelIndex( - const uint32_t level, const uint32_t file_index, const int cmp_smallest, - const int cmp_largest, int32_t* left_bound, int32_t* right_bound); + void GetNextLevelIndex(const size_t level, const size_t file_index, + const int cmp_smallest, const int cmp_largest, + int32_t* left_bound, int32_t* right_bound) const; - void UpdateIndex(Arena* arena, const uint32_t num_levels, + void UpdateIndex(Arena* arena, const size_t num_levels, std::vector* const files); enum { @@ -62,7 +62,7 @@ class FileIndexer { }; private: - uint32_t num_levels_; + size_t num_levels_; const Comparator* ucmp_; struct IndexUnit { diff --git a/db/file_indexer_test.cc b/db/file_indexer_test.cc index 673d85a5c..69aaa386f 100644 --- a/db/file_indexer_test.cc +++ b/db/file_indexer_test.cc @@ -22,8 +22,15 @@ class IntComparator : public Comparator { int Compare(const Slice& a, const Slice& b) const { assert(a.size() == 8); assert(b.size() == 8); - return *reinterpret_cast(a.data()) - - *reinterpret_cast(b.data()); + int64_t diff = *reinterpret_cast(a.data()) - + *reinterpret_cast(b.data()); + if (diff < 0) { + return -1; + } else if (diff == 0) { + return 0; + } else { + return 1; + } } const char* Name() const { @@ -94,7 +101,6 @@ TEST(FileIndexerTest, Empty) { // Case 1: no overlap, files are on the left of next level files TEST(FileIndexerTest, no_overlap_left) { Arena arena; - uint32_t kNumLevels = 4; indexer = new FileIndexer(&ucmp); // level 1 AddFile(1, 100, 200); @@ -135,7 +141,6 @@ TEST(FileIndexerTest, no_overlap_left) { // Case 2: no overlap, files are on the right of next level files 
TEST(FileIndexerTest, no_overlap_right) { Arena arena; - uint32_t kNumLevels = 4; indexer = new FileIndexer(&ucmp); // level 1 AddFile(1, 2100, 2200); @@ -178,7 +183,6 @@ TEST(FileIndexerTest, no_overlap_right) { // Case 3: empty L2 TEST(FileIndexerTest, empty_L2) { Arena arena; - uint32_t kNumLevels = 4; indexer = new FileIndexer(&ucmp); for (uint32_t i = 1; i < kNumLevels; ++i) { ASSERT_EQ(0U, indexer->LevelIndexSize(i)); diff --git a/db/filename.cc b/db/filename.cc index 42c7efb78..160005dda 100644 --- a/db/filename.cc +++ b/db/filename.cc @@ -6,7 +6,10 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include "db/filename.h" #include @@ -16,6 +19,7 @@ #include "db/dbformat.h" #include "rocksdb/env.h" #include "util/logging.h" +#include "util/stop_watch.h" namespace rocksdb { @@ -76,6 +80,17 @@ std::string MakeTableFileName(const std::string& path, uint64_t number) { return MakeFileName(path, number, "sst"); } +uint64_t TableFileNameToNumber(const std::string& name) { + uint64_t number = 0; + uint64_t base = 1; + int pos = static_cast(name.find_last_of('.')); + while (--pos >= 0 && name[pos] >= '0' && name[pos] <= '9') { + number += (name[pos] - '0') * base; + base *= 10; + } + return number; +} + std::string TableFileName(const std::vector& db_paths, uint64_t number, uint32_t path_id) { assert(number > 0); @@ -315,4 +330,16 @@ Status SetIdentityFile(Env* env, const std::string& dbname) { return s; } +Status SyncManifest(Env* env, const DBOptions* db_options, WritableFile* file) { + if (db_options->disableDataSync) { + return Status::OK(); + } else if (db_options->use_fsync) { + StopWatch sw(env, db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS); + return file->Fsync(); + } else { + StopWatch sw(env, 
db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS); + return file->Sync(); + } +} + } // namespace rocksdb diff --git a/db/filename.h b/db/filename.h index a80703074..33f5ace20 100644 --- a/db/filename.h +++ b/db/filename.h @@ -14,15 +14,18 @@ #include #include #include + +#include "port/port.h" +#include "rocksdb/options.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" #include "rocksdb/transaction_log.h" -#include "port/port.h" namespace rocksdb { class Env; class Directory; +class WritableFile; enum FileType { kLogFile, @@ -36,9 +39,6 @@ enum FileType { kIdentityFile }; -// map from file number to path ID. -typedef std::unordered_map FileNumToPathIdMap; - // Return the name of the log file with the specified number // in the db named by "dbname". The result will be prefixed with // "dbname". @@ -55,6 +55,10 @@ extern std::string ArchivedLogFileName(const std::string& dbname, extern std::string MakeTableFileName(const std::string& name, uint64_t number); +// the reverse function of MakeTableFileName +// TODO(yhchiang): could merge this function with ParseFileName() +extern uint64_t TableFileNameToNumber(const std::string& name); + // Return the name of the sstable with the specified number // in the db named by "dbname". The result will be prefixed with // "dbname". @@ -134,4 +138,8 @@ extern Status SetCurrentFile(Env* env, const std::string& dbname, // Make the IDENTITY file for the db extern Status SetIdentityFile(Env* env, const std::string& dbname); +// Sync manifest file `file`. +extern Status SyncManifest(Env* env, const DBOptions* db_options, + WritableFile* file); + } // namespace rocksdb diff --git a/db/flush_job.cc b/db/flush_job.cc new file mode 100644 index 000000000..ca1d113db --- /dev/null +++ b/db/flush_job.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/flush_job.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include +#include +#include + +#include "db/builder.h" +#include "db/db_iter.h" +#include "db/dbformat.h" +#include "db/filename.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/memtable.h" +#include "db/memtable_list.h" +#include "db/merge_context.h" +#include "db/version_set.h" +#include "port/port.h" +#include "port/likely.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/statistics.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "table/block.h" +#include "table/block_based_table_factory.h" +#include "table/merger.h" +#include "table/table_builder.h" +#include "table/two_level_iterator.h" +#include "util/coding.h" +#include "util/file_util.h" +#include "util/logging.h" +#include "util/log_buffer.h" +#include "util/mutexlock.h" +#include "util/perf_context_imp.h" +#include "util/iostats_context_imp.h" +#include "util/stop_watch.h" +#include "util/sync_point.h" + +namespace rocksdb { + +FlushJob::FlushJob(const std::string& dbname, ColumnFamilyData* cfd, + const DBOptions& db_options, + const MutableCFOptions& mutable_cf_options, + const EnvOptions& env_options, VersionSet* versions, + InstrumentedMutex* db_mutex, + std::atomic* shutting_down, + SequenceNumber newest_snapshot, JobContext* job_context, + LogBuffer* log_buffer, Directory* db_directory, + Directory* output_file_directory, + CompressionType output_compression, Statistics* stats) + : dbname_(dbname), + cfd_(cfd), + db_options_(db_options), + mutable_cf_options_(mutable_cf_options), + env_options_(env_options), + 
versions_(versions), + db_mutex_(db_mutex), + shutting_down_(shutting_down), + newest_snapshot_(newest_snapshot), + job_context_(job_context), + log_buffer_(log_buffer), + db_directory_(db_directory), + output_file_directory_(output_file_directory), + output_compression_(output_compression), + stats_(stats) {} + +Status FlushJob::Run(uint64_t* file_number) { + // Save the contents of the earliest memtable as a new Table + uint64_t fn; + autovector mems; + cfd_->imm()->PickMemtablesToFlush(&mems); + if (mems.empty()) { + LogToBuffer(log_buffer_, "[%s] Nothing in memtable to flush", + cfd_->GetName().c_str()); + return Status::OK(); + } + + // entries mems are (implicitly) sorted in ascending order by their created + // time. We will use the first memtable's `edit` to keep the meta info for + // this flush. + MemTable* m = mems[0]; + VersionEdit* edit = m->GetEdits(); + edit->SetPrevLogNumber(0); + // SetLogNumber(log_num) indicates logs with number smaller than log_num + // will no longer be picked up for recovery. + edit->SetLogNumber(mems.back()->GetNextLogNumber()); + edit->SetColumnFamily(cfd_->GetID()); + + // This will release and re-acquire the mutex. 
+ Status s = WriteLevel0Table(mems, edit, &fn); + + if (s.ok() && + (shutting_down_->load(std::memory_order_acquire) || cfd_->IsDropped())) { + s = Status::ShutdownInProgress( + "Database shutdown or Column family drop during flush"); + } + + if (!s.ok()) { + cfd_->imm()->RollbackMemtableFlush(mems, fn); + } else { + // Replace immutable memtable with the generated Table + s = cfd_->imm()->InstallMemtableFlushResults( + cfd_, mutable_cf_options_, mems, versions_, db_mutex_, fn, + &job_context_->memtables_to_free, db_directory_, log_buffer_); + } + + if (s.ok() && file_number != nullptr) { + *file_number = fn; + } + return s; +} + +Status FlushJob::WriteLevel0Table(const autovector& mems, + VersionEdit* edit, uint64_t* filenumber) { + db_mutex_->AssertHeld(); + const uint64_t start_micros = db_options_.env->NowMicros(); + FileMetaData meta; + // path 0 for level 0 file. + meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); + *filenumber = meta.fd.GetNumber(); + + const SequenceNumber earliest_seqno_in_memtable = + mems[0]->GetFirstSequenceNumber(); + Version* base = cfd_->current(); + base->Ref(); // it is likely that we do not need this reference + Status s; + { + db_mutex_->Unlock(); + if (log_buffer_) { + log_buffer_->FlushBufferToLog(); + } + std::vector memtables; + ReadOptions ro; + ro.total_order_seek = true; + Arena arena; + for (MemTable* m : mems) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "[%s] Flushing memtable with next log file: %" PRIu64 "\n", + cfd_->GetName().c_str(), m->GetNextLogNumber()); + memtables.push_back(m->NewIterator(ro, &arena)); + } + { + ScopedArenaIterator iter( + NewMergingIterator(&cfd_->internal_comparator(), &memtables[0], + static_cast(memtables.size()), &arena)); + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "[%s] Level-0 flush table #%" PRIu64 ": started", + cfd_->GetName().c_str(), meta.fd.GetNumber()); + + s = BuildTable(dbname_, db_options_.env, *cfd_->ioptions(), env_options_, + 
cfd_->table_cache(), iter.get(), &meta, + cfd_->internal_comparator(), newest_snapshot_, + earliest_seqno_in_memtable, output_compression_, + cfd_->ioptions()->compression_opts, Env::IO_HIGH); + LogFlush(db_options_.info_log); + } + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "[%s] Level-0 flush table #%" PRIu64 ": %" PRIu64 " bytes %s", + cfd_->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(), + s.ToString().c_str()); + if (!db_options_.disableDataSync && output_file_directory_ != nullptr) { + output_file_directory_->Fsync(); + } + db_mutex_->Lock(); + } + base->Unref(); + + // re-acquire the most current version + base = cfd_->current(); + + // Note that if file_size is zero, the file has been deleted and + // should not be added to the manifest. + int level = 0; + if (s.ok() && meta.fd.GetFileSize() > 0) { + const Slice min_user_key = meta.smallest.user_key(); + const Slice max_user_key = meta.largest.user_key(); + // if we have more than 1 background thread, then we cannot + // insert files directly into higher levels because some other + // threads could be concurrently producing compacted files for + // that key range. 
+ if (base != nullptr && db_options_.max_background_compactions <= 1 && + db_options_.max_background_flushes == 0 && + cfd_->ioptions()->compaction_style == kCompactionStyleLevel) { + level = base->storage_info()->PickLevelForMemTableOutput( + mutable_cf_options_, min_user_key, max_user_key); + // If level does not match path id, reset level back to 0 + uint32_t fdpath = LevelCompactionPicker::GetPathId( + *cfd_->ioptions(), mutable_cf_options_, level); + if (fdpath != 0) { + level = 0; + } + } + edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), + meta.fd.GetFileSize(), meta.smallest, meta.largest, + meta.smallest_seqno, meta.largest_seqno); + } + + InternalStats::CompactionStats stats(1); + stats.micros = db_options_.env->NowMicros() - start_micros; + stats.bytes_written = meta.fd.GetFileSize(); + cfd_->internal_stats()->AddCompactionStats(level, stats); + cfd_->internal_stats()->AddCFStats(InternalStats::BYTES_FLUSHED, + meta.fd.GetFileSize()); + RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize()); + return s; +} + +} // namespace rocksdb diff --git a/db/flush_job.h b/db/flush_job.h new file mode 100644 index 000000000..40cdc5045 --- /dev/null +++ b/db/flush_job.h @@ -0,0 +1,87 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/log_writer.h" +#include "db/snapshot.h" +#include "db/column_family.h" +#include "db/version_edit.h" +#include "db/memtable_list.h" +#include "port/port.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/transaction_log.h" +#include "util/autovector.h" +#include "util/instrumented_mutex.h" +#include "util/stop_watch.h" +#include "util/thread_local.h" +#include "util/scoped_arena_iterator.h" +#include "db/internal_stats.h" +#include "db/write_controller.h" +#include "db/flush_scheduler.h" +#include "db/write_thread.h" +#include "db/job_context.h" + +namespace rocksdb { + +class MemTable; +class TableCache; +class Version; +class VersionEdit; +class VersionSet; +class Arena; + +class FlushJob { + public: + // TODO(icanadi) make effort to reduce number of parameters here + // IMPORTANT: mutable_cf_options needs to be alive while FlushJob is alive + FlushJob(const std::string& dbname, ColumnFamilyData* cfd, + const DBOptions& db_options, + const MutableCFOptions& mutable_cf_options, + const EnvOptions& env_options, VersionSet* versions, + InstrumentedMutex* db_mutex, std::atomic* shutting_down, + SequenceNumber newest_snapshot, JobContext* job_context, + LogBuffer* log_buffer, Directory* db_directory, + Directory* output_file_directory, CompressionType output_compression, + Statistics* stats); + ~FlushJob() {} + + Status Run(uint64_t* file_number = nullptr); + + private: + Status WriteLevel0Table(const autovector& mems, VersionEdit* edit, + uint64_t* filenumber); + const std::string& dbname_; + ColumnFamilyData* cfd_; + const DBOptions& db_options_; + const MutableCFOptions& mutable_cf_options_; + const EnvOptions& env_options_; + VersionSet* versions_; + InstrumentedMutex* db_mutex_; + std::atomic* shutting_down_; + SequenceNumber newest_snapshot_; + JobContext* job_context_; + 
LogBuffer* log_buffer_; + Directory* db_directory_; + Directory* output_file_directory_; + CompressionType output_compression_; + Statistics* stats_; +}; + +} // namespace rocksdb diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc new file mode 100644 index 000000000..d3e824087 --- /dev/null +++ b/db/flush_job_test.cc @@ -0,0 +1,124 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include +#include + +#include "db/flush_job.h" +#include "db/column_family.h" +#include "db/version_set.h" +#include "db/writebuffer.h" +#include "rocksdb/cache.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "table/mock_table.h" + +namespace rocksdb { + +// TODO(icanadi) Mock out everything else: +// 1. VersionSet +// 2. Memtable +class FlushJobTest { + public: + FlushJobTest() + : env_(Env::Default()), + dbname_(test::TmpDir() + "/flush_job_test"), + table_cache_(NewLRUCache(50000, 16, 8)), + write_buffer_(db_options_.db_write_buffer_size), + versions_(new VersionSet(dbname_, &db_options_, env_options_, + table_cache_.get(), &write_buffer_, + &write_controller_)), + shutting_down_(false), + mock_table_factory_(new mock::MockTableFactory()) { + ASSERT_OK(env_->CreateDirIfMissing(dbname_)); + db_options_.db_paths.emplace_back(dbname_, + std::numeric_limits::max()); + // TODO(icanadi) Remove this once we mock out VersionSet + NewDB(); + std::vector column_families; + cf_options_.table_factory = mock_table_factory_; + column_families.emplace_back(kDefaultColumnFamilyName, cf_options_); + + ASSERT_OK(versions_->Recover(column_families, false)); + } + + void NewDB() { + VersionEdit new_db; + new_db.SetLogNumber(0); + new_db.SetNextFile(2); + new_db.SetLastSequence(0); + + const std::string manifest = 
DescriptorFileName(dbname_, 1); + unique_ptr file; + Status s = env_->NewWritableFile( + manifest, &file, env_->OptimizeForManifestWrite(env_options_)); + ASSERT_OK(s); + { + log::Writer log(std::move(file)); + std::string record; + new_db.EncodeTo(&record); + s = log.AddRecord(record); + } + ASSERT_OK(s); + // Make "CURRENT" file that points to the new manifest file. + s = SetCurrentFile(env_, dbname_, 1, nullptr); + } + + Env* env_; + std::string dbname_; + EnvOptions env_options_; + std::shared_ptr table_cache_; + WriteController write_controller_; + DBOptions db_options_; + WriteBuffer write_buffer_; + ColumnFamilyOptions cf_options_; + std::unique_ptr versions_; + InstrumentedMutex mutex_; + std::atomic shutting_down_; + std::shared_ptr mock_table_factory_; +}; + +TEST(FlushJobTest, Empty) { + JobContext job_context; + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), + db_options_, *cfd->GetLatestMutableCFOptions(), + env_options_, versions_.get(), &mutex_, &shutting_down_, + SequenceNumber(), &job_context, nullptr, nullptr, nullptr, + kNoCompression, nullptr); + ASSERT_OK(flush_job.Run()); + job_context.Clean(); +} + +TEST(FlushJobTest, NonEmpty) { + JobContext job_context; + auto cfd = versions_->GetColumnFamilySet()->GetDefault(); + auto new_mem = cfd->ConstructNewMemtable(*cfd->GetLatestMutableCFOptions()); + new_mem->Ref(); + std::map inserted_keys; + for (int i = 1; i < 10000; ++i) { + std::string key(ToString(i)); + std::string value("value" + ToString(i)); + new_mem->Add(SequenceNumber(i), kTypeValue, key, value); + InternalKey internal_key(key, SequenceNumber(i), kTypeValue); + inserted_keys.insert({internal_key.Encode().ToString(), value}); + } + cfd->imm()->Add(new_mem); + + FlushJob flush_job(dbname_, versions_->GetColumnFamilySet()->GetDefault(), + db_options_, *cfd->GetLatestMutableCFOptions(), + env_options_, versions_.get(), &mutex_, &shutting_down_, + 
SequenceNumber(), &job_context, nullptr, nullptr, nullptr, + kNoCompression, nullptr); + mutex_.Lock(); + ASSERT_OK(flush_job.Run()); + mutex_.Unlock(); + mock_table_factory_->AssertSingleFile(inserted_keys); + job_context.Clean(); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/db/flush_scheduler.cc b/db/flush_scheduler.cc new file mode 100644 index 000000000..56816159e --- /dev/null +++ b/db/flush_scheduler.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "db/flush_scheduler.h" + +#include + +#include "db/column_family.h" + +namespace rocksdb { + +void FlushScheduler::ScheduleFlush(ColumnFamilyData* cfd) { +#ifndef NDEBUG + assert(column_families_set_.find(cfd) == column_families_set_.end()); + column_families_set_.insert(cfd); +#endif // NDEBUG + cfd->Ref(); + column_families_.push_back(cfd); +} + +ColumnFamilyData* FlushScheduler::GetNextColumnFamily() { + ColumnFamilyData* cfd = nullptr; + while (column_families_.size() > 0) { + cfd = column_families_.front(); + column_families_.pop_front(); + if (cfd->IsDropped()) { + if (cfd->Unref()) { + delete cfd; + cfd = nullptr; + } + } else { + break; + } + } +#ifndef NDEBUG + if (cfd != nullptr) { + auto itr = column_families_set_.find(cfd); + assert(itr != column_families_set_.end()); + column_families_set_.erase(itr); + } +#endif // NDEBUG + return cfd; +} + +bool FlushScheduler::Empty() { return column_families_.empty(); } + +void FlushScheduler::Clear() { + for (auto cfd : column_families_) { +#ifndef NDEBUG + auto itr = column_families_set_.find(cfd); + assert(itr != column_families_set_.end()); + column_families_set_.erase(itr); +#endif // NDEBUG + if (cfd->Unref()) { + 
delete cfd; + } + } + column_families_.clear(); +} + +} // namespace rocksdb diff --git a/db/flush_scheduler.h b/db/flush_scheduler.h new file mode 100644 index 000000000..0c96709b9 --- /dev/null +++ b/db/flush_scheduler.h @@ -0,0 +1,40 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include +#include +#include + +namespace rocksdb { + +class ColumnFamilyData; + +// This class is thread-compatible. It's should only be accessed from single +// write thread (between BeginWrite() and EndWrite()) +class FlushScheduler { + public: + FlushScheduler() = default; + ~FlushScheduler() = default; + + void ScheduleFlush(ColumnFamilyData* cfd); + // Returns Ref()-ed column family. Client needs to Unref() + // REQUIRES: db mutex is held (exception is single-threaded recovery) + ColumnFamilyData* GetNextColumnFamily(); + + bool Empty(); + + void Clear(); + + private: + std::deque column_families_; +#ifndef NDEBUG + std::set column_families_set_; +#endif // NDEBUG +}; + +} // namespace rocksdb diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 74e6dd249..93af3c2d4 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -10,6 +10,7 @@ #include #include +#include "db/job_context.h" #include "db/db_impl.h" #include "db/db_iter.h" #include "db/column_family.h" @@ -114,27 +115,36 @@ class LevelIterator : public Iterator { }; ForwardIterator::ForwardIterator(DBImpl* db, const ReadOptions& read_options, - ColumnFamilyData* cfd) + ColumnFamilyData* cfd, SuperVersion* current_sv) : db_(db), read_options_(read_options), cfd_(cfd), - prefix_extractor_(cfd->options()->prefix_extractor.get()), + prefix_extractor_(cfd->ioptions()->prefix_extractor), user_comparator_(cfd->user_comparator()), 
immutable_min_heap_(MinIterComparator(&cfd_->internal_comparator())), - sv_(nullptr), + sv_(current_sv), mutable_iter_(nullptr), current_(nullptr), + status_(Status::OK()), + immutable_status_(Status::OK()), valid_(false), - is_prev_set_(false) {} + is_prev_set_(false), + is_prev_inclusive_(false) { + if (sv_) { + RebuildIterators(false); + } +} ForwardIterator::~ForwardIterator() { - Cleanup(); + Cleanup(true); } -void ForwardIterator::Cleanup() { - delete mutable_iter_; +void ForwardIterator::Cleanup(bool release_sv) { + if (mutable_iter_ != nullptr) { + mutable_iter_->~Iterator(); + } for (auto* m : imm_iters_) { - delete m; + m->~Iterator(); } imm_iters_.clear(); for (auto* f : l0_iters_) { @@ -146,15 +156,17 @@ void ForwardIterator::Cleanup() { } level_iters_.clear(); - if (sv_ != nullptr && sv_->Unref()) { - DBImpl::DeletionState deletion_state; - db_->mutex_.Lock(); - sv_->Cleanup(); - db_->FindObsoleteFiles(deletion_state, false, true); - db_->mutex_.Unlock(); - delete sv_; - if (deletion_state.HaveSomethingToDelete()) { - db_->PurgeObsoleteFiles(deletion_state); + if (release_sv) { + if (sv_ != nullptr && sv_->Unref()) { + JobContext job_context; + db_->mutex_.Lock(); + sv_->Cleanup(); + db_->FindObsoleteFiles(&job_context, false, true); + db_->mutex_.Unlock(); + delete sv_; + if (job_context.HaveSomethingToDelete()) { + db_->PurgeObsoleteFiles(job_context); + } } } } @@ -166,8 +178,8 @@ bool ForwardIterator::Valid() const { void ForwardIterator::SeekToFirst() { if (sv_ == nullptr || sv_ ->version_number != cfd_->GetSuperVersionNumber()) { - RebuildIterators(); - } else if (status_.IsIncomplete()) { + RebuildIterators(true); + } else if (immutable_status_.IsIncomplete()) { ResetIncompleteIterators(); } SeekInternal(Slice(), true); @@ -176,8 +188,8 @@ void ForwardIterator::SeekToFirst() { void ForwardIterator::Seek(const Slice& internal_key) { if (sv_ == nullptr || sv_ ->version_number != cfd_->GetSuperVersionNumber()) { - RebuildIterators(); - } else if 
(status_.IsIncomplete()) { + RebuildIterators(true); + } else if (immutable_status_.IsIncomplete()) { ResetIncompleteIterators(); } SeekInternal(internal_key, false); @@ -185,6 +197,7 @@ void ForwardIterator::Seek(const Slice& internal_key) { void ForwardIterator::SeekInternal(const Slice& internal_key, bool seek_to_first) { + assert(mutable_iter_); // mutable seek_to_first ? mutable_iter_->SeekToFirst() : mutable_iter_->Seek(internal_key); @@ -194,13 +207,16 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, // if it turns to need to seek immutable often. We probably want to have // an option to turn it off. if (seek_to_first || NeedToSeekImmutable(internal_key)) { + immutable_status_ = Status::OK(); { auto tmp = MinIterHeap(MinIterComparator(&cfd_->internal_comparator())); immutable_min_heap_.swap(tmp); } for (auto* m : imm_iters_) { seek_to_first ? m->SeekToFirst() : m->Seek(internal_key); - if (m->Valid()) { + if (!m->status().ok()) { + immutable_status_ = m->status(); + } else if (m->Valid()) { immutable_min_heap_.push(m); } } @@ -209,27 +225,23 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, if (!seek_to_first) { user_key = ExtractUserKey(internal_key); } - auto* files = sv_->current->files_; - for (uint32_t i = 0; i < files[0].size(); ++i) { + const VersionStorageInfo* vstorage = sv_->current->storage_info(); + const std::vector& l0 = vstorage->LevelFiles(0); + for (uint32_t i = 0; i < l0.size(); ++i) { if (seek_to_first) { l0_iters_[i]->SeekToFirst(); } else { // If the target key passes over the larget key, we are sure Next() // won't go over this file. 
if (user_comparator_->Compare(user_key, - files[0][i]->largest.user_key()) > 0) { + l0[i]->largest.user_key()) > 0) { continue; } l0_iters_[i]->Seek(internal_key); } - if (l0_iters_[i]->status().IsIncomplete()) { - // if any of the immutable iterators is incomplete (no-io option was - // used), we are unable to reliably find the smallest key - assert(read_options_.read_tier == kBlockCacheTier); - status_ = l0_iters_[i]->status(); - valid_ = false; - return; + if (!l0_iters_[i]->status().ok()) { + immutable_status_ = l0_iters_[i]->status(); } else if (l0_iters_[i]->Valid()) { immutable_min_heap_.push(l0_iters_[i]); } @@ -237,86 +249,83 @@ void ForwardIterator::SeekInternal(const Slice& internal_key, int32_t search_left_bound = 0; int32_t search_right_bound = FileIndexer::kLevelMaxIndex; - for (int32_t level = 1; level < sv_->current->NumberLevels(); ++level) { - if (files[level].empty()) { + for (int32_t level = 1; level < vstorage->num_levels(); ++level) { + const std::vector& level_files = + vstorage->LevelFiles(level); + if (level_files.empty()) { search_left_bound = 0; search_right_bound = FileIndexer::kLevelMaxIndex; continue; } assert(level_iters_[level - 1] != nullptr); uint32_t f_idx = 0; + const auto& indexer = vstorage->file_indexer(); if (!seek_to_first) { - // TODO(ljin): remove before committing - // f_idx = FindFileInRange( - // files[level], internal_key, 0, files[level].size()); - if (search_left_bound == search_right_bound) { f_idx = search_left_bound; } else if (search_left_bound < search_right_bound) { - f_idx = FindFileInRange( - files[level], internal_key, search_left_bound, - search_right_bound == FileIndexer::kLevelMaxIndex ? - files[level].size() : search_right_bound); + f_idx = + FindFileInRange(level_files, internal_key, search_left_bound, + search_right_bound == FileIndexer::kLevelMaxIndex + ? 
static_cast(level_files.size()) + : search_right_bound); } else { // search_left_bound > search_right_bound // There are only 2 cases this can happen: // (1) target key is smaller than left most file // (2) target key is larger than right most file - assert(search_left_bound == (int32_t)files[level].size() || + assert(search_left_bound == (int32_t)level_files.size() || search_right_bound == -1); if (search_right_bound == -1) { assert(search_left_bound == 0); f_idx = 0; } else { - sv_->current->file_indexer_.GetNextLevelIndex( - level, files[level].size() - 1, + indexer.GetNextLevelIndex( + level, level_files.size() - 1, 1, 1, &search_left_bound, &search_right_bound); continue; } } // Prepare hints for the next level - if (f_idx < files[level].size()) { + if (f_idx < level_files.size()) { int cmp_smallest = user_comparator_->Compare( - user_key, files[level][f_idx]->smallest.user_key()); + user_key, level_files[f_idx]->smallest.user_key()); int cmp_largest = -1; if (cmp_smallest >= 0) { cmp_smallest = user_comparator_->Compare( - user_key, files[level][f_idx]->smallest.user_key()); + user_key, level_files[f_idx]->smallest.user_key()); } - sv_->current->file_indexer_.GetNextLevelIndex(level, f_idx, + indexer.GetNextLevelIndex(level, f_idx, cmp_smallest, cmp_largest, &search_left_bound, &search_right_bound); } else { - sv_->current->file_indexer_.GetNextLevelIndex( - level, files[level].size() - 1, + indexer.GetNextLevelIndex( + level, level_files.size() - 1, 1, 1, &search_left_bound, &search_right_bound); } } // Seek - if (f_idx < files[level].size()) { + if (f_idx < level_files.size()) { level_iters_[level - 1]->SetFileIndex(f_idx); seek_to_first ? 
level_iters_[level - 1]->SeekToFirst() : level_iters_[level - 1]->Seek(internal_key); - if (level_iters_[level - 1]->status().IsIncomplete()) { - // see above - assert(read_options_.read_tier == kBlockCacheTier); - status_ = level_iters_[level - 1]->status(); - valid_ = false; - return; + if (!level_iters_[level - 1]->status().ok()) { + immutable_status_ = level_iters_[level - 1]->status(); } else if (level_iters_[level - 1]->Valid()) { immutable_min_heap_.push(level_iters_[level - 1]); } } } - if (seek_to_first || immutable_min_heap_.empty()) { + if (seek_to_first) { is_prev_set_ = false; } else { prev_key_.SetKey(internal_key); is_prev_set_ = true; + is_prev_inclusive_ = true; } } else if (current_ && current_ != mutable_iter_) { // current_ is one of immutable iterators, push it back to the heap @@ -334,24 +343,33 @@ void ForwardIterator::Next() { std::string current_key = key().ToString(); Slice old_key(current_key.data(), current_key.size()); - RebuildIterators(); + RebuildIterators(true); SeekInternal(old_key, false); if (!valid_ || key().compare(old_key) != 0) { return; } } else if (current_ != mutable_iter_) { // It is going to advance immutable iterator - prev_key_.SetKey(current_->key()); - is_prev_set_ = true; + + bool update_prev_key = true; + if (is_prev_set_ && prefix_extractor_) { + // advance prev_key_ to current_ only if they share the same prefix + update_prev_key = + prefix_extractor_->Transform(prev_key_.GetKey()).compare( + prefix_extractor_->Transform(current_->key())) == 0; + } + + if (update_prev_key) { + prev_key_.SetKey(current_->key()); + is_prev_set_ = true; + is_prev_inclusive_ = false; + } } current_->Next(); if (current_ != mutable_iter_) { - if (current_->status().IsIncomplete()) { - assert(read_options_.read_tier == kBlockCacheTier); - status_ = current_->status(); - valid_ = false; - return; + if (!current_->status().ok()) { + immutable_status_ = current_->status(); } else if (current_->Valid()) { 
immutable_min_heap_.push(current_); } @@ -377,45 +395,35 @@ Status ForwardIterator::status() const { return mutable_iter_->status(); } - for (auto *it : imm_iters_) { - if (it && !it->status().ok()) { - return it->status(); - } - } - for (auto *it : l0_iters_) { - if (it && !it->status().ok()) { - return it->status(); - } - } - for (auto *it : level_iters_) { - if (it && !it->status().ok()) { - return it->status(); - } - } - - return Status::OK(); + return immutable_status_; } -void ForwardIterator::RebuildIterators() { +void ForwardIterator::RebuildIterators(bool refresh_sv) { // Clean up - Cleanup(); - // New - sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_)); - mutable_iter_ = sv_->mem->NewIterator(read_options_); - sv_->imm->AddIterators(read_options_, &imm_iters_); - const auto& l0_files = sv_->current->files_[0]; + Cleanup(refresh_sv); + if (refresh_sv) { + // New + sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_)); + } + mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_); + sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_); + + const auto* vstorage = sv_->current->storage_info(); + const auto& l0_files = vstorage->LevelFiles(0); l0_iters_.reserve(l0_files.size()); for (const auto* l0 : l0_files) { l0_iters_.push_back(cfd_->table_cache()->NewIterator( read_options_, *cfd_->soptions(), cfd_->internal_comparator(), l0->fd)); } - level_iters_.reserve(sv_->current->NumberLevels() - 1); - for (int32_t level = 1; level < sv_->current->NumberLevels(); ++level) { - if (sv_->current->files_[level].empty()) { + level_iters_.reserve(vstorage->num_levels() - 1); + for (int32_t level = 1; level < vstorage->num_levels(); ++level) { + const auto& level_files = vstorage->LevelFiles(level); + + if (level_files.empty()) { level_iters_.push_back(nullptr); } else { - level_iters_.push_back(new LevelIterator(cfd_, read_options_, - sv_->current->files_[level])); + level_iters_.push_back( + new LevelIterator(cfd_, read_options_, level_files)); } } @@ 
-424,7 +432,7 @@ void ForwardIterator::RebuildIterators() { } void ForwardIterator::ResetIncompleteIterators() { - const auto& l0_files = sv_->current->files_[0]; + const auto& l0_files = sv_->current->storage_info()->LevelFiles(0); for (uint32_t i = 0; i < l0_iters_.size(); ++i) { assert(i < l0_files.size()); if (!l0_iters_[i]->status().IsIncomplete()) { @@ -474,7 +482,14 @@ void ForwardIterator::UpdateCurrent() { } bool ForwardIterator::NeedToSeekImmutable(const Slice& target) { - if (!valid_ || !is_prev_set_) { + // We maintain the interval (prev_key_, immutable_min_heap_.top()->key()) + // such that there are no records with keys within that range in + // immutable_min_heap_. Since immutable structures (SST files and immutable + // memtables) can't change in this version, we don't need to do a seek if + // 'target' belongs to that interval (immutable_min_heap_.top() is already + // at the correct position). + + if (!valid_ || !current_ || !is_prev_set_ || !immutable_status_.ok()) { return true; } Slice prev_key = prev_key_.GetKey(); @@ -483,13 +498,17 @@ bool ForwardIterator::NeedToSeekImmutable(const Slice& target) { return true; } if (cfd_->internal_comparator().InternalKeyComparator::Compare( - prev_key, target) >= 0) { + prev_key, target) >= (is_prev_inclusive_ ? 1 : 0)) { return true; } - if (immutable_min_heap_.empty() || - cfd_->internal_comparator().InternalKeyComparator::Compare( - target, current_ == mutable_iter_ ? immutable_min_heap_.top()->key() - : current_->key()) > 0) { + + if (immutable_min_heap_.empty() && current_ == mutable_iter_) { + // Nothing to seek on. + return false; + } + if (cfd_->internal_comparator().InternalKeyComparator::Compare( + target, current_ == mutable_iter_ ? 
immutable_min_heap_.top()->key() + : current_->key()) > 0) { return true; } return false; diff --git a/db/forward_iterator.h b/db/forward_iterator.h index bbf423a50..ccc23ebaa 100644 --- a/db/forward_iterator.h +++ b/db/forward_iterator.h @@ -14,6 +14,7 @@ #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "db/dbformat.h" +#include "util/arena.h" namespace rocksdb { @@ -50,7 +51,7 @@ typedef std::priority_queue #include #include "db/column_family.h" + #include "db/db_impl.h" +#include "util/string_util.h" namespace rocksdb { +#ifndef ROCKSDB_LITE namespace { const double kMB = 1048576.0; const double kGB = kMB * 1024; @@ -24,64 +31,55 @@ void PrintLevelStatsHeader(char* buf, size_t len, const std::string& cf_name) { buf, len, "\n** Compaction Stats [%s] **\n" "Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) " - "Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) " - "Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) " - "Stall(sec) Stall(cnt) Avg(ms)\n" + "Write(GB) Wnew(GB) Moved(GB) W-Amp Rd(MB/s) Wr(MB/s) " + "Comp(sec) Comp(cnt) Avg(sec) " + "Stall(sec) Stall(cnt) Avg(ms) RecordIn RecordDrop\n" "--------------------------------------------------------------------" "--------------------------------------------------------------------" - "--------------------------------------------------------------------\n", + "----------------------------------------------------------\n", cf_name.c_str()); } void PrintLevelStats(char* buf, size_t len, const std::string& name, int num_files, int being_compacted, double total_file_size, double score, - double rw_amp, double w_amp, double stall_us, uint64_t stalls, + double w_amp, double stall_us, uint64_t stalls, const InternalStats::CompactionStats& stats) { uint64_t bytes_read = stats.bytes_readn + stats.bytes_readnp1; uint64_t bytes_new = stats.bytes_written - stats.bytes_readnp1; double elapsed = (stats.micros + 1) / 1000000.0; snprintf(buf, len, - "%4s %5d/%-3d %8.0f %5.1f " /* Level, 
Files, Size(MB), Score */ - "%8.1f " /* Read(GB) */ - "%7.1f " /* Rn(GB) */ - "%8.1f " /* Rnp1(GB) */ - "%9.1f " /* Write(GB) */ - "%8.1f " /* Wnew(GB) */ - "%6.1f " /* RW-Amp */ - "%5.1f " /* W-Amp */ - "%8.1f " /* Rd(MB/s) */ - "%8.1f " /* Wr(MB/s) */ - "%8d " /* Rn(cnt) */ - "%9d " /* Rnp1(cnt) */ - "%9d " /* Wnp1(cnt) */ - "%9d " /* Wnew(cnt) */ - "%10.0f " /* Comp(sec) */ - "%9d " /* Comp(cnt) */ - "%8.3f " /* Avg(sec) */ - "%10.2f " /* Stall(sec) */ - "%10" PRIu64 " " /* Stall(cnt) */ - "%7.2f\n" /* Avg(ms) */, - name.c_str(), num_files, being_compacted, total_file_size / kMB, score, - bytes_read / kGB, - stats.bytes_readn / kGB, - stats.bytes_readnp1 / kGB, - stats.bytes_written / kGB, - bytes_new / kGB, - rw_amp, - w_amp, - bytes_read / kMB / elapsed, - stats.bytes_written / kMB / elapsed, - stats.files_in_leveln, - stats.files_in_levelnp1, - stats.files_out_levelnp1, - stats.files_out_levelnp1 - stats.files_in_levelnp1, - stats.micros / 1000000.0, - stats.count, - stats.count == 0 ? 0 : stats.micros / 1000000.0 / stats.count, - stall_us / 1000000.0, - stalls, - stalls == 0 ? 
0 : stall_us / 1000.0 / stalls); + "%4s %5d/%-3d %8.0f %5.1f " /* Level, Files, Size(MB), Score */ + "%8.1f " /* Read(GB) */ + "%7.1f " /* Rn(GB) */ + "%8.1f " /* Rnp1(GB) */ + "%9.1f " /* Write(GB) */ + "%8.1f " /* Wnew(GB) */ + "%9.1f " /* Moved(GB) */ + "%5.1f " /* W-Amp */ + "%8.1f " /* Rd(MB/s) */ + "%8.1f " /* Wr(MB/s) */ + "%9.0f " /* Comp(sec) */ + "%9d " /* Comp(cnt) */ + "%8.3f " /* Avg(sec) */ + "%10.2f " /* Stall(sec) */ + "%10" PRIu64 + " " /* Stall(cnt) */ + "%7.2f " /* Avg(ms) */ + "%12" PRIu64 + " " /* input entries */ + "%12" PRIu64 "\n" /* number of records reduced */, + name.c_str(), num_files, being_compacted, total_file_size / kMB, + score, bytes_read / kGB, stats.bytes_readn / kGB, + stats.bytes_readnp1 / kGB, stats.bytes_written / kGB, + bytes_new / kGB, stats.bytes_moved / kGB, + w_amp, bytes_read / kMB / elapsed, + stats.bytes_written / kMB / elapsed, + stats.micros / 1000000.0, stats.count, + stats.count == 0 ? 0 : stats.micros / 1000000.0 / stats.count, + stall_us / 1000000.0, stalls, + stalls == 0 ? 
0 : stall_us / 1000.0 / stalls, + stats.num_input_records, stats.num_dropped_records); } @@ -125,6 +123,8 @@ DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property, return kBackgroundErrors; } else if (in == "cur-size-active-mem-table") { return kCurSizeActiveMemTable; + } else if (in == "cur-size-all-mem-tables") { + return kCurSizeAllMemTables; } else if (in == "num-entries-active-mem-table") { return kNumEntriesInMutableMemtable; } else if (in == "num-entries-imm-mem-tables") { @@ -136,6 +136,10 @@ DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property, return kEstimatedUsageByTableReaders; } else if (in == "is-file-deletions-enabled") { return kIsFileDeletionEnabled; + } else if (in == "num-snapshots") { + return kNumSnapshots; + } else if (in == "oldest-snapshot-time") { + return kOldestSnapshotTime; } return kUnknown; } @@ -159,7 +163,8 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type, const Slice& property, std::string* value) { assert(value != nullptr); - Version* current = cfd_->current(); + auto* current = cfd_->current(); + const auto* vstorage = current->storage_info(); Slice in = property; switch (property_type) { @@ -172,7 +177,7 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type, } else { char buf[100]; snprintf(buf, sizeof(buf), "%d", - current->NumLevelFiles(static_cast(level))); + vstorage->NumLevelFiles(static_cast(level))); *value = buf; return true; } @@ -186,8 +191,8 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type, for (int level = 0; level < number_levels_; level++) { snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level, - current->NumLevelFiles(level), - current->NumLevelBytes(level) / kMB); + vstorage->NumLevelFiles(level), + vstorage->NumLevelBytes(level) / kMB); value->append(buf); } return true; @@ -219,7 +224,7 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type, bool InternalStats::GetIntProperty(DBPropertyType 
property_type, uint64_t* value, DBImpl* db) const { - Version* current = cfd_->current(); + const auto* vstorage = cfd_->current()->storage_info(); switch (property_type) { case kNumImmutableMemTable: @@ -232,7 +237,7 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type, case kCompactionPending: // 1 if the system already determines at least one compacdtion is needed. // 0 otherwise, - *value = (current->NeedsCompaction() ? 1 : 0); + *value = (cfd_->compaction_picker()->NeedsCompaction(vstorage) ? 1 : 0); return true; case kBackgroundErrors: // Accumulated number of errors in background flushes or compactions. @@ -242,12 +247,17 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type, // Current size of the active memtable *value = cfd_->mem()->ApproximateMemoryUsage(); return true; + case kCurSizeAllMemTables: + // Current size of the active memtable + immutable memtables + *value = cfd_->mem()->ApproximateMemoryUsage() + + cfd_->imm()->ApproximateMemoryUsage(); + return true; case kNumEntriesInMutableMemtable: - // Current size of the active memtable + // Current number of entires in the active memtable *value = cfd_->mem()->GetNumEntries(); return true; case kNumEntriesInImmutableMemtable: - // Current size of the active memtable + // Current number of entries in the immutable memtables *value = cfd_->imm()->current()->GetTotalNumEntries(); return true; case kEstimatedNumKeys: @@ -255,11 +265,19 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type, // Use estimated entries in tables + total entries in memtables. 
*value = cfd_->mem()->GetNumEntries() + cfd_->imm()->current()->GetTotalNumEntries() + - current->GetEstimatedActiveKeys(); + vstorage->GetEstimatedActiveKeys(); + return true; + case kNumSnapshots: + *value = db->snapshots().count(); return true; + case kOldestSnapshotTime: + *value = static_cast(db->snapshots().GetOldestSnapshotTime()); + return true; +#ifndef ROCKSDB_LITE case kIsFileDeletionEnabled: *value = db->IsFileDeletionsEnabled(); return true; +#endif default: return false; } @@ -276,18 +294,29 @@ void InternalStats::DumpDBStats(std::string* value) { value->append(buf); // Cumulative uint64_t user_bytes_written = db_stats_[InternalStats::BYTES_WRITTEN]; + uint64_t num_keys_written = db_stats_[InternalStats::NUMBER_KEYS_WRITTEN]; uint64_t write_other = db_stats_[InternalStats::WRITE_DONE_BY_OTHER]; uint64_t write_self = db_stats_[InternalStats::WRITE_DONE_BY_SELF]; uint64_t wal_bytes = db_stats_[InternalStats::WAL_FILE_BYTES]; uint64_t wal_synced = db_stats_[InternalStats::WAL_FILE_SYNCED]; uint64_t write_with_wal = db_stats_[InternalStats::WRITE_WITH_WAL]; + uint64_t write_stall_micros = db_stats_[InternalStats::WRITE_STALL_MICROS]; // Data + // writes: total number of write requests. + // keys: total number of key updates issued by all the write requests + // batches: number of group commits issued to the DB. Each group can contain + // one or more writes. + // so writes/keys is the average number of put in multi-put or put + // writes/batches is the average group commit size. + // + // The format is the same for interval stats. 
snprintf(buf, sizeof(buf), - "Cumulative writes: %" PRIu64 " writes, %" PRIu64 " batches, " - "%.1f writes per batch, %.2f GB user ingest\n", - write_other + write_self, write_self, + "Cumulative writes: %" PRIu64 " writes, %" PRIu64 " keys, %" PRIu64 + " batches, %.1f writes per batch, %.2f GB user ingest, " + "stall micros: %" PRIu64 "\n", + write_other + write_self, num_keys_written, write_self, (write_other + write_self) / static_cast(write_self + 1), - user_bytes_written / kGB); + user_bytes_written / kGB, write_stall_micros); value->append(buf); // WAL snprintf(buf, sizeof(buf), @@ -301,14 +330,18 @@ void InternalStats::DumpDBStats(std::string* value) { // Interval uint64_t interval_write_other = write_other - db_stats_snapshot_.write_other; uint64_t interval_write_self = write_self - db_stats_snapshot_.write_self; + uint64_t interval_num_keys_written = + num_keys_written - db_stats_snapshot_.num_keys_written; snprintf(buf, sizeof(buf), - "Interval writes: %" PRIu64 " writes, %" PRIu64 " batches, " - "%.1f writes per batch, %.1f MB user ingest\n", + "Interval writes: %" PRIu64 " writes, %" PRIu64 " keys, %" PRIu64 + " batches, %.1f writes per batch, %.1f MB user ingest, " + "stall micros: %" PRIu64 "\n", interval_write_other + interval_write_self, - interval_write_self, + interval_num_keys_written, interval_write_self, static_cast(interval_write_other + interval_write_self) / (interval_write_self + 1), - (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB); + (user_bytes_written - db_stats_snapshot_.ingest_bytes) / kMB, + write_stall_micros - db_stats_snapshot_.write_stall_micros); value->append(buf); uint64_t interval_write_with_wal = @@ -330,30 +363,33 @@ void InternalStats::DumpDBStats(std::string* value) { db_stats_snapshot_.ingest_bytes = user_bytes_written; db_stats_snapshot_.write_other = write_other; db_stats_snapshot_.write_self = write_self; + db_stats_snapshot_.num_keys_written = num_keys_written; db_stats_snapshot_.wal_bytes = wal_bytes; 
db_stats_snapshot_.wal_synced = wal_synced; db_stats_snapshot_.write_with_wal = write_with_wal; + db_stats_snapshot_.write_stall_micros = write_stall_micros; } void InternalStats::DumpCFStats(std::string* value) { - Version* current = cfd_->current(); + const VersionStorageInfo* vstorage = cfd_->current()->storage_info(); int num_levels_to_check = - (cfd_->options()->compaction_style != kCompactionStyleUniversal && - cfd_->options()->compaction_style != kCompactionStyleFIFO) - ? current->NumberLevels() - 1 + (cfd_->ioptions()->compaction_style != kCompactionStyleUniversal && + cfd_->ioptions()->compaction_style != kCompactionStyleFIFO) + ? vstorage->num_levels() - 1 : 1; + // Compaction scores are sorted base on its value. Restore them to the // level order std::vector compaction_score(number_levels_, 0); for (int i = 0; i < num_levels_to_check; ++i) { - compaction_score[current->compaction_level_[i]] = - current->compaction_score_[i]; + compaction_score[vstorage->CompactionScoreLevel(i)] = + vstorage->CompactionScore(i); } // Count # of files being compacted for each level std::vector files_being_compacted(number_levels_, 0); for (int level = 0; level < num_levels_to_check; ++level) { - for (auto* f : current->files_[level]) { + for (auto* f : vstorage->LevelFiles(level)) { if (f->being_compacted) { ++files_being_compacted[level]; } @@ -376,7 +412,7 @@ void InternalStats::DumpCFStats(std::string* value) { uint64_t total_stall_count = 0; double total_stall_us = 0; for (int level = 0; level < number_levels_; level++) { - int files = current->NumLevelFiles(level); + int files = vstorage->NumLevelFiles(level); total_files += files; total_files_being_compacted += files_being_compacted[level]; if (comp_stats_[level].micros > 0 || files > 0) { @@ -395,36 +431,29 @@ void InternalStats::DumpCFStats(std::string* value) { stall_leveln_slowdown_hard_[level]); stats_sum.Add(comp_stats_[level]); - total_file_size += current->NumLevelBytes(level); + total_file_size += 
vstorage->NumLevelBytes(level); total_stall_us += stall_us; total_stall_count += stalls; total_slowdown_soft += stall_leveln_slowdown_soft_[level]; total_slowdown_count_soft += stall_leveln_slowdown_count_soft_[level]; total_slowdown_hard += stall_leveln_slowdown_hard_[level]; total_slowdown_count_hard += stall_leveln_slowdown_count_hard_[level]; - int64_t bytes_read = comp_stats_[level].bytes_readn + - comp_stats_[level].bytes_readnp1; - double rw_amp = (comp_stats_[level].bytes_readn == 0) ? 0.0 - : (comp_stats_[level].bytes_written + bytes_read) / - static_cast(comp_stats_[level].bytes_readn); double w_amp = (comp_stats_[level].bytes_readn == 0) ? 0.0 : comp_stats_[level].bytes_written / static_cast(comp_stats_[level].bytes_readn); - PrintLevelStats(buf, sizeof(buf), "L" + std::to_string(level), - files, files_being_compacted[level], current->NumLevelBytes(level), - compaction_score[level], rw_amp, w_amp, stall_us, stalls, - comp_stats_[level]); + PrintLevelStats(buf, sizeof(buf), "L" + ToString(level), files, + files_being_compacted[level], + vstorage->NumLevelBytes(level), compaction_score[level], + w_amp, stall_us, stalls, comp_stats_[level]); value->append(buf); } } uint64_t curr_ingest = cf_stats_value_[BYTES_FLUSHED]; // Cumulative summary - double rw_amp = (stats_sum.bytes_written + stats_sum.bytes_readn + - stats_sum.bytes_readnp1) / static_cast(curr_ingest + 1); double w_amp = stats_sum.bytes_written / static_cast(curr_ingest + 1); // Stats summary across levels PrintLevelStats(buf, sizeof(buf), "Sum", total_files, - total_files_being_compacted, total_file_size, 0, rw_amp, w_amp, + total_files_being_compacted, total_file_size, 0, w_amp, total_stall_us, total_stall_count, stats_sum); value->append(buf); // Interval summary @@ -432,12 +461,9 @@ void InternalStats::DumpCFStats(std::string* value) { curr_ingest - cf_stats_snapshot_.ingest_bytes + 1; CompactionStats interval_stats(stats_sum); interval_stats.Subtract(cf_stats_snapshot_.comp_stats); - rw_amp = 
(interval_stats.bytes_written + - interval_stats.bytes_readn + interval_stats.bytes_readnp1) / - static_cast(interval_ingest); w_amp = interval_stats.bytes_written / static_cast(interval_ingest); PrintLevelStats(buf, sizeof(buf), "Int", 0, 0, 0, 0, - rw_amp, w_amp, total_stall_us - cf_stats_snapshot_.stall_us, + w_amp, total_stall_us - cf_stats_snapshot_.stall_us, total_stall_count - cf_stats_snapshot_.stall_count, interval_stats); value->append(buf); @@ -473,4 +499,14 @@ void InternalStats::DumpCFStats(std::string* value) { cf_stats_snapshot_.stall_count = total_stall_count; } + +#else + +DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property, + bool* need_out_of_mutex) { + return kUnknown; +} + +#endif // !ROCKSDB_LITE + } // namespace rocksdb diff --git a/db/internal_stats.h b/db/internal_stats.h index 2e04f24e7..c1d77b6b6 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -21,6 +21,8 @@ namespace rocksdb { class MemTableList; class DBImpl; +// IMPORTANT: If you add a new property here, also add it to the list in +// include/rocksdb/db.h enum DBPropertyType : uint32_t { kUnknown, kNumFilesAtLevel, // Number of files at a specific level @@ -36,6 +38,8 @@ enum DBPropertyType : uint32_t { kCompactionPending, // Return 1 if a compaction is pending. Otherwise 0. kBackgroundErrors, // Return accumulated background errors encountered. kCurSizeActiveMemTable, // Return current size of the active memtable + kCurSizeAllMemTables, // Return current size of all (active + immutable) + // memtables kNumEntriesInMutableMemtable, // Return number of entries in the mutable // memtable. kNumEntriesInImmutableMemtable, // Return sum of number of entries in all @@ -44,12 +48,16 @@ enum DBPropertyType : uint32_t { kEstimatedUsageByTableReaders, // Estimated memory by table readers. 
kIsFileDeletionEnabled, // Equals disable_delete_obsolete_files_, // 0 means file deletions enabled + kNumSnapshots, // Number of snapshots in the system + kOldestSnapshotTime, // Unix timestamp of the first snapshot }; extern DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property, bool* need_out_of_mutex); + +#ifndef ROCKSDB_LITE class InternalStats { public: enum InternalCFStatsType { @@ -65,9 +73,11 @@ class InternalStats { WAL_FILE_BYTES, WAL_FILE_SYNCED, BYTES_WRITTEN, + NUMBER_KEYS_WRITTEN, WRITE_DONE_BY_OTHER, WRITE_DONE_BY_SELF, WRITE_WITH_WAL, + WRITE_STALL_MICROS, INTERNAL_DB_STATS_ENUM_MAX, }; @@ -114,6 +124,9 @@ class InternalStats { // Total bytes written during compaction between levels N and N+1 uint64_t bytes_written; + // Total bytes moved to this level + uint64_t bytes_moved; + // Files read from level N during compaction between levels N and N+1 int files_in_leveln; @@ -123,27 +136,40 @@ class InternalStats { // Files written during compaction between levels N and N+1 int files_out_levelnp1; + // Total incoming entries during compaction between levels N and N+1 + uint64_t num_input_records; + + // Accumulated diff number of entries + // (num input entries - num output entires) for compaction levels N and N+1 + uint64_t num_dropped_records; + // Number of compactions done int count; - explicit CompactionStats(int count = 0) + explicit CompactionStats(int _count = 0) : micros(0), bytes_readn(0), bytes_readnp1(0), bytes_written(0), + bytes_moved(0), files_in_leveln(0), files_in_levelnp1(0), files_out_levelnp1(0), - count(count) {} + num_input_records(0), + num_dropped_records(0), + count(_count) {} explicit CompactionStats(const CompactionStats& c) : micros(c.micros), bytes_readn(c.bytes_readn), bytes_readnp1(c.bytes_readnp1), bytes_written(c.bytes_written), + bytes_moved(c.bytes_moved), files_in_leveln(c.files_in_leveln), files_in_levelnp1(c.files_in_levelnp1), files_out_levelnp1(c.files_out_levelnp1), + 
num_input_records(c.num_input_records), + num_dropped_records(c.num_dropped_records), count(c.count) {} void Add(const CompactionStats& c) { @@ -151,9 +177,12 @@ class InternalStats { this->bytes_readn += c.bytes_readn; this->bytes_readnp1 += c.bytes_readnp1; this->bytes_written += c.bytes_written; + this->bytes_moved += c.bytes_moved; this->files_in_leveln += c.files_in_leveln; this->files_in_levelnp1 += c.files_in_levelnp1; this->files_out_levelnp1 += c.files_out_levelnp1; + this->num_input_records += c.num_input_records; + this->num_dropped_records += c.num_dropped_records; this->count += c.count; } @@ -162,9 +191,12 @@ class InternalStats { this->bytes_readn -= c.bytes_readn; this->bytes_readnp1 -= c.bytes_readnp1; this->bytes_written -= c.bytes_written; + this->bytes_moved -= c.bytes_moved; this->files_in_leveln -= c.files_in_leveln; this->files_in_levelnp1 -= c.files_in_levelnp1; this->files_out_levelnp1 -= c.files_out_levelnp1; + this->num_input_records -= c.num_input_records; + this->num_dropped_records -= c.num_dropped_records; this->count -= c.count; } }; @@ -173,6 +205,10 @@ class InternalStats { comp_stats_[level].Add(stats); } + void IncBytesMoved(int level, uint64_t amount) { + comp_stats_[level].bytes_moved += amount; + } + void RecordLevelNSlowdown(int level, uint64_t micros, bool soft) { if (soft) { stall_leveln_slowdown_soft_[level] += micros; @@ -247,6 +283,13 @@ class InternalStats { // another thread. uint64_t write_other; uint64_t write_self; + // Total number of keys written. write_self and write_other measure number + // of write requests written, Each of the write request can contain updates + // to multiple keys. num_keys_written is total number of keys updated by all + // those writes. + uint64_t num_keys_written; + // Total time writes delayed by stalls. 
+ uint64_t write_stall_micros; double seconds_up; DBStatsSnapshot() @@ -256,6 +299,8 @@ class InternalStats { write_with_wal(0), write_other(0), write_self(0), + num_keys_written(0), + write_stall_micros(0), seconds_up(0) {} } db_stats_snapshot_; @@ -272,4 +317,78 @@ class InternalStats { const uint64_t started_at_; }; +#else + +class InternalStats { + public: + enum InternalCFStatsType { + LEVEL0_SLOWDOWN, + MEMTABLE_COMPACTION, + LEVEL0_NUM_FILES, + WRITE_STALLS_ENUM_MAX, + BYTES_FLUSHED, + INTERNAL_CF_STATS_ENUM_MAX, + }; + + enum InternalDBStatsType { + WAL_FILE_BYTES, + WAL_FILE_SYNCED, + BYTES_WRITTEN, + NUMBER_KEYS_WRITTEN, + WRITE_DONE_BY_OTHER, + WRITE_DONE_BY_SELF, + WRITE_WITH_WAL, + WRITE_STALL_MICROS, + INTERNAL_DB_STATS_ENUM_MAX, + }; + + InternalStats(int num_levels, Env* env, ColumnFamilyData* cfd) {} + + struct CompactionStats { + uint64_t micros; + uint64_t bytes_readn; + uint64_t bytes_readnp1; + uint64_t bytes_written; + uint64_t bytes_moved; + int files_in_leveln; + int files_in_levelnp1; + int files_out_levelnp1; + uint64_t num_input_records; + uint64_t num_dropped_records; + int count; + + explicit CompactionStats(int _count = 0) {} + + explicit CompactionStats(const CompactionStats& c) {} + + void Add(const CompactionStats& c) {} + + void Subtract(const CompactionStats& c) {} + }; + + void AddCompactionStats(int level, const CompactionStats& stats) {} + + void IncBytesMoved(int level, uint64_t amount) {} + + void RecordLevelNSlowdown(int level, uint64_t micros, bool soft) {} + + void AddCFStats(InternalCFStatsType type, uint64_t value) {} + + void AddDBStats(InternalDBStatsType type, uint64_t value) {} + + uint64_t GetBackgroundErrorCount() const { return 0; } + + uint64_t BumpAndGetBackgroundErrorCount() { return 0; } + + bool GetStringProperty(DBPropertyType property_type, const Slice& property, + std::string* value) { return false; } + + bool GetIntProperty(DBPropertyType property_type, uint64_t* value, + DBImpl* db) const { return false; 
} + + bool GetIntPropertyOutOfMutex(DBPropertyType property_type, Version* version, + uint64_t* value) const { return false; } +}; +#endif // !ROCKSDB_LITE + } // namespace rocksdb diff --git a/db/job_context.h b/db/job_context.h new file mode 100644 index 000000000..d3aa9b215 --- /dev/null +++ b/db/job_context.h @@ -0,0 +1,103 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include +#include + +#include "db/column_family.h" + +namespace rocksdb { + +class MemTable; + +struct JobContext { + inline bool HaveSomethingToDelete() const { + return full_scan_candidate_files.size() || sst_delete_files.size() || + log_delete_files.size() || new_superversion != nullptr || + superversions_to_free.size() > 0 || memtables_to_free.size() > 0; + } + + // Structure to store information for candidate files to delete. 
+ struct CandidateFileInfo { + std::string file_name; + uint32_t path_id; + CandidateFileInfo(std::string name, uint32_t path) + : file_name(std::move(name)), path_id(path) {} + bool operator==(const CandidateFileInfo& other) const { + return file_name == other.file_name && path_id == other.path_id; + } + }; + + // a list of all files that we'll consider deleting + // (every once in a while this is filled up with all files + // in the DB directory) + // (filled only if we're doing full scan) + std::vector full_scan_candidate_files; + + // the list of all live sst files that cannot be deleted + std::vector sst_live; + + // a list of sst files that we need to delete + std::vector sst_delete_files; + + // a list of log files that we need to delete + std::vector log_delete_files; + + // a list of memtables to be free + autovector memtables_to_free; + + autovector superversions_to_free; + + SuperVersion* new_superversion; // if nullptr no new superversion + + // the current manifest_file_number, log_number and prev_log_number + // that corresponds to the set of files in 'live'. + uint64_t manifest_file_number; + uint64_t pending_manifest_file_number; + uint64_t log_number; + uint64_t prev_log_number; + + uint64_t min_pending_output = 0; + + explicit JobContext(bool create_superversion = false) { + manifest_file_number = 0; + pending_manifest_file_number = 0; + log_number = 0; + prev_log_number = 0; + new_superversion = create_superversion ? 
new SuperVersion() : nullptr; + } + + void Clean() { + // free pending memtables + for (auto m : memtables_to_free) { + delete m; + } + // free superversions + for (auto s : superversions_to_free) { + delete s; + } + // if new_superversion was not used, it will be non-nullptr and needs + // to be freed here + delete new_superversion; + + memtables_to_free.clear(); + superversions_to_free.clear(); + new_superversion = nullptr; + } + + ~JobContext() { + assert(memtables_to_free.size() == 0); + assert(superversions_to_free.size() == 0); + assert(new_superversion == nullptr); + } +}; + +} // namespace rocksdb diff --git a/db/listener_test.cc b/db/listener_test.cc new file mode 100644 index 000000000..80d4d4cd1 --- /dev/null +++ b/db/listener_test.cc @@ -0,0 +1,401 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+#include "db/dbformat.h" +#include "db/db_impl.h" +#include "db/filename.h" +#include "db/version_set.h" +#include "db/write_batch_internal.h" +#include "rocksdb/cache.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "rocksdb/options.h" +#include "rocksdb/table_properties.h" +#include "table/block_based_table_factory.h" +#include "table/plain_table_factory.h" +#include "util/hash.h" +#include "util/hash_linklist_rep.h" +#include "utilities/merge_operators.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/rate_limiter.h" +#include "util/statistics.h" +#include "util/testharness.h" +#include "util/sync_point.h" +#include "util/testutil.h" + +#ifndef ROCKSDB_LITE + +namespace rocksdb { + +class EventListenerTest { + public: + EventListenerTest() { + dbname_ = test::TmpDir() + "/listener_test"; + ASSERT_OK(DestroyDB(dbname_, Options())); + db_ = nullptr; + Reopen(); + } + + ~EventListenerTest() { + Close(); + Options options; + options.db_paths.emplace_back(dbname_, 0); + options.db_paths.emplace_back(dbname_ + "_2", 0); + options.db_paths.emplace_back(dbname_ + "_3", 0); + options.db_paths.emplace_back(dbname_ + "_4", 0); + ASSERT_OK(DestroyDB(dbname_, options)); + } + + void CreateColumnFamilies(const std::vector& cfs, + const ColumnFamilyOptions* options = nullptr) { + ColumnFamilyOptions cf_opts; + cf_opts = ColumnFamilyOptions(Options()); + size_t cfi = handles_.size(); + handles_.resize(cfi + cfs.size()); + for (auto cf : cfs) { + ASSERT_OK(db_->CreateColumnFamily(cf_opts, cf, &handles_[cfi++])); + } + } + + void Close() { + for (auto h : handles_) { + delete h; + } + handles_.clear(); + delete db_; + db_ = nullptr; + } + + void ReopenWithColumnFamilies(const std::vector& cfs, + const Options* options = nullptr) { + 
ASSERT_OK(TryReopenWithColumnFamilies(cfs, options)); + } + + Status TryReopenWithColumnFamilies(const std::vector& cfs, + const Options* options = nullptr) { + Close(); + Options opts = (options == nullptr) ? Options() : *options; + std::vector v_opts(cfs.size(), &opts); + return TryReopenWithColumnFamilies(cfs, v_opts); + } + + Status TryReopenWithColumnFamilies( + const std::vector& cfs, + const std::vector& options) { + Close(); + ASSERT_EQ(cfs.size(), options.size()); + std::vector column_families; + for (size_t i = 0; i < cfs.size(); ++i) { + column_families.push_back(ColumnFamilyDescriptor(cfs[i], *options[i])); + } + DBOptions db_opts = DBOptions(*options[0]); + return DB::Open(db_opts, dbname_, column_families, &handles_, &db_); + } + + Status TryReopen(Options* options = nullptr) { + Close(); + Options opts; + if (options != nullptr) { + opts = *options; + } else { + opts.create_if_missing = true; + } + + return DB::Open(opts, dbname_, &db_); + } + + void Reopen(Options* options = nullptr) { + ASSERT_OK(TryReopen(options)); + } + + void CreateAndReopenWithCF(const std::vector& cfs, + const Options* options = nullptr) { + CreateColumnFamilies(cfs, options); + std::vector cfs_plus_default = cfs; + cfs_plus_default.insert(cfs_plus_default.begin(), kDefaultColumnFamilyName); + ReopenWithColumnFamilies(cfs_plus_default, options); + } + + DBImpl* dbfull() { + return reinterpret_cast(db_); + } + + Status Put(int cf, const Slice& k, const Slice& v, + WriteOptions wo = WriteOptions()) { + return db_->Put(wo, handles_[cf], k, v); + } + + Status Flush(int cf = 0) { + FlushOptions opt = FlushOptions(); + opt.wait = true; + if (cf == 0) { + return db_->Flush(opt); + } else { + return db_->Flush(opt, handles_[cf]); + } + } + + DB* db_; + std::string dbname_; + std::vector handles_; +}; + +class TestCompactionListener : public EventListener { + public: + void OnCompactionCompleted(DB *db, const CompactionJobInfo& ci) override { + compacted_dbs_.push_back(db); + } + + 
std::vector compacted_dbs_; +}; + +TEST(EventListenerTest, OnSingleDBCompactionTest) { + const int kTestKeySize = 16; + const int kTestValueSize = 984; + const int kEntrySize = kTestKeySize + kTestValueSize; + const int kEntriesPerBuffer = 100; + const int kNumL0Files = 4; + + Options options; + options.create_if_missing = true; + options.write_buffer_size = kEntrySize * kEntriesPerBuffer; + options.compaction_style = kCompactionStyleLevel; + options.target_file_size_base = options.write_buffer_size; + options.max_bytes_for_level_base = options.target_file_size_base * 2; + options.max_bytes_for_level_multiplier = 2; + options.compression = kNoCompression; + options.enable_thread_tracking = true; + options.level0_file_num_compaction_trigger = kNumL0Files; + + TestCompactionListener* listener = new TestCompactionListener(); + options.listeners.emplace_back(listener); + std::vector cf_names = { + "pikachu", "ilya", "muromec", "dobrynia", + "nikitich", "alyosha", "popovich"}; + CreateAndReopenWithCF(cf_names, &options); + ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p'))); + ASSERT_OK(Put(2, "ilya", std::string(90000, 'i'))); + ASSERT_OK(Put(3, "muromec", std::string(90000, 'm'))); + ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd'))); + ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n'))); + ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a'))); + ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); + for (size_t i = 1; i < 8; ++i) { + ASSERT_OK(Flush(static_cast(i))); + const Slice kStart = "a"; + const Slice kEnd = "z"; + ASSERT_OK(dbfull()->CompactRange(handles_[i], &kStart, &kEnd)); + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + } + + ASSERT_EQ(listener->compacted_dbs_.size(), cf_names.size()); + for (size_t i = 0; i < cf_names.size(); ++i) { + ASSERT_EQ(listener->compacted_dbs_[i], db_); + } +} + +class TestFlushListener : public EventListener { + public: + void OnFlushCompleted( + DB* db, const std::string& name, + const 
std::string& file_path, + bool triggered_writes_slowdown, + bool triggered_writes_stop) override { + flushed_dbs_.push_back(db); + flushed_column_family_names_.push_back(name); + if (triggered_writes_slowdown) { + slowdown_count++; + } + if (triggered_writes_stop) { + stop_count++; + } + } + + std::vector flushed_column_family_names_; + std::vector flushed_dbs_; + int slowdown_count; + int stop_count; +}; + +TEST(EventListenerTest, OnSingleDBFlushTest) { + Options options; + options.write_buffer_size = 100000; + TestFlushListener* listener = new TestFlushListener(); + options.listeners.emplace_back(listener); + std::vector cf_names = { + "pikachu", "ilya", "muromec", "dobrynia", + "nikitich", "alyosha", "popovich"}; + CreateAndReopenWithCF(cf_names, &options); + + ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p'))); + ASSERT_OK(Put(2, "ilya", std::string(90000, 'i'))); + ASSERT_OK(Put(3, "muromec", std::string(90000, 'm'))); + ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd'))); + ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n'))); + ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a'))); + ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); + for (size_t i = 1; i < 8; ++i) { + ASSERT_OK(Flush(static_cast(i))); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(listener->flushed_dbs_.size(), i); + ASSERT_EQ(listener->flushed_column_family_names_.size(), i); + } + + // make sure call-back functions are called in the right order + for (size_t i = 0; i < cf_names.size(); ++i) { + ASSERT_EQ(listener->flushed_dbs_[i], db_); + ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]); + } +} + +TEST(EventListenerTest, MultiCF) { + Options options; + options.write_buffer_size = 100000; + TestFlushListener* listener = new TestFlushListener(); + options.listeners.emplace_back(listener); + std::vector cf_names = { + "pikachu", "ilya", "muromec", "dobrynia", + "nikitich", "alyosha", "popovich"}; + CreateAndReopenWithCF(cf_names, &options); + + 
ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p'))); + ASSERT_OK(Put(2, "ilya", std::string(90000, 'i'))); + ASSERT_OK(Put(3, "muromec", std::string(90000, 'm'))); + ASSERT_OK(Put(4, "dobrynia", std::string(90000, 'd'))); + ASSERT_OK(Put(5, "nikitich", std::string(90000, 'n'))); + ASSERT_OK(Put(6, "alyosha", std::string(90000, 'a'))); + ASSERT_OK(Put(7, "popovich", std::string(90000, 'p'))); + for (size_t i = 1; i < 8; ++i) { + ASSERT_OK(Flush(static_cast(i))); + ASSERT_EQ(listener->flushed_dbs_.size(), i); + ASSERT_EQ(listener->flushed_column_family_names_.size(), i); + } + + // make sure call-back functions are called in the right order + for (size_t i = 0; i < cf_names.size(); i++) { + ASSERT_EQ(listener->flushed_dbs_[i], db_); + ASSERT_EQ(listener->flushed_column_family_names_[i], cf_names[i]); + } +} + +TEST(EventListenerTest, MultiDBMultiListeners) { + std::vector listeners; + const int kNumDBs = 5; + const int kNumListeners = 10; + for (int i = 0; i < kNumListeners; ++i) { + listeners.emplace_back(new TestFlushListener()); + } + + std::vector cf_names = { + "pikachu", "ilya", "muromec", "dobrynia", + "nikitich", "alyosha", "popovich"}; + + Options options; + options.create_if_missing = true; + for (int i = 0; i < kNumListeners; ++i) { + options.listeners.emplace_back(listeners[i]); + } + DBOptions db_opts(options); + ColumnFamilyOptions cf_opts(options); + + std::vector dbs; + std::vector> vec_handles; + + for (int d = 0; d < kNumDBs; ++d) { + ASSERT_OK(DestroyDB(dbname_ + ToString(d), options)); + DB* db; + std::vector handles; + ASSERT_OK(DB::Open(options, dbname_ + ToString(d), &db)); + for (size_t c = 0; c < cf_names.size(); ++c) { + ColumnFamilyHandle* handle; + db->CreateColumnFamily(cf_opts, cf_names[c], &handle); + handles.push_back(handle); + } + + vec_handles.push_back(std::move(handles)); + dbs.push_back(db); + } + + for (int d = 0; d < kNumDBs; ++d) { + for (size_t c = 0; c < cf_names.size(); ++c) { + ASSERT_OK(dbs[d]->Put(WriteOptions(), 
vec_handles[d][c], + cf_names[c], cf_names[c])); + } + } + + for (size_t c = 0; c < cf_names.size(); ++c) { + for (int d = 0; d < kNumDBs; ++d) { + ASSERT_OK(dbs[d]->Flush(FlushOptions(), vec_handles[d][c])); + reinterpret_cast(dbs[d])->TEST_WaitForFlushMemTable(); + } + } + + for (auto* listener : listeners) { + int pos = 0; + for (size_t c = 0; c < cf_names.size(); ++c) { + for (int d = 0; d < kNumDBs; ++d) { + ASSERT_EQ(listener->flushed_dbs_[pos], dbs[d]); + ASSERT_EQ(listener->flushed_column_family_names_[pos], cf_names[c]); + pos++; + } + } + } + + for (auto handles : vec_handles) { + for (auto h : handles) { + delete h; + } + handles.clear(); + } + vec_handles.clear(); + + for (auto db : dbs) { + delete db; + } +} + +TEST(EventListenerTest, DisableBGCompaction) { + Options options; + TestFlushListener* listener = new TestFlushListener(); + const int kSlowdownTrigger = 5; + const int kStopTrigger = 10; + options.level0_slowdown_writes_trigger = kSlowdownTrigger; + options.level0_stop_writes_trigger = kStopTrigger; + options.listeners.emplace_back(listener); + // BG compaction is disabled. Number of L0 files will simply keeps + // increasing in this test. + options.compaction_style = kCompactionStyleNone; + options.compression = kNoCompression; + options.write_buffer_size = 100000; // Small write buffer + + CreateAndReopenWithCF({"pikachu"}, &options); + WriteOptions wopts; + wopts.timeout_hint_us = 100000; + ColumnFamilyMetaData cf_meta; + db_->GetColumnFamilyMetaData(handles_[1], &cf_meta); + // keep writing until writes are forced to stop. 
+ for (int i = 0; static_cast(cf_meta.file_count) < kStopTrigger; ++i) { + Put(1, ToString(i), std::string(100000, 'x'), wopts); + db_->GetColumnFamilyMetaData(handles_[1], &cf_meta); + } + ASSERT_GE(listener->slowdown_count, kStopTrigger - kSlowdownTrigger); + ASSERT_GE(listener->stop_count, 1); +} + +} // namespace rocksdb + +#endif // ROCKSDB_LITE + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} + diff --git a/db/log_and_apply_bench.cc b/db/log_and_apply_bench.cc index a5aa95017..e5e271a1f 100644 --- a/db/log_and_apply_bench.cc +++ b/db/log_and_apply_bench.cc @@ -6,25 +6,33 @@ #include +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include #include "util/testharness.h" #include "util/benchharness.h" #include "db/version_set.h" +#include "db/write_controller.h" +#include "db/writebuffer.h" #include "util/mutexlock.h" namespace rocksdb { -std::string MakeKey(unsigned int num) { +std::string MakeKey(uint64_t num) { char buf[30]; - snprintf(buf, sizeof(buf), "%016u", num); + snprintf(buf, sizeof(buf), "%016" PRIu64, num); return std::string(buf); } void BM_LogAndApply(int iters, int num_base_files) { VersionSet* vset; + WriteController wc; ColumnFamilyData* default_cfd; uint64_t fnum = 1; - port::Mutex mu; - MutexLock l(&mu); + InstrumentedMutex mu; + InstrumentedMutexLock l(&mu); BENCHMARK_SUSPEND { std::string dbname = test::TmpDir() + "/rocksdb_test_benchmark"; @@ -45,9 +53,10 @@ void BM_LogAndApply(int iters, int num_base_files) { // Notice we are using the default options not through SanitizeOptions(). // We might want to initialize some options manually if needed. options.db_paths.emplace_back(dbname, 0); + WriteBuffer wb(options.db_write_buffer_size); // The parameter of table cache is passed in as null, so any file I/O // operation is likely to fail. 
- vset = new VersionSet(dbname, &options, sopt, nullptr); + vset = new VersionSet(dbname, &options, sopt, nullptr, &wb, &wc); std::vector dummy; dummy.push_back(ColumnFamilyDescriptor()); ASSERT_OK(vset->Recover(dummy)); @@ -58,7 +67,8 @@ void BM_LogAndApply(int iters, int num_base_files) { InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion); vbase.AddFile(2, ++fnum, 0, 1 /* file size */, start, limit, 1, 1); } - ASSERT_OK(vset->LogAndApply(default_cfd, &vbase, &mu)); + ASSERT_OK(vset->LogAndApply(default_cfd, + *default_cfd->GetLatestMutableCFOptions(), &vbase, &mu)); } for (int i = 0; i < iters; i++) { @@ -67,8 +77,10 @@ void BM_LogAndApply(int iters, int num_base_files) { InternalKey start(MakeKey(2 * fnum), 1, kTypeValue); InternalKey limit(MakeKey(2 * fnum + 1), 1, kTypeDeletion); vedit.AddFile(2, ++fnum, 0, 1 /* file size */, start, limit, 1, 1); - vset->LogAndApply(default_cfd, &vedit, &mu); + vset->LogAndApply(default_cfd, *default_cfd->GetLatestMutableCFOptions(), + &vedit, &mu); } + delete vset; } BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_1_file, 1000, 1) diff --git a/db/log_reader.cc b/db/log_reader.cc index be1fb8ceb..9ab97ca3e 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -20,9 +20,9 @@ namespace log { Reader::Reporter::~Reporter() { } -Reader::Reader(unique_ptr&& file, Reporter* reporter, +Reader::Reader(unique_ptr&& _file, Reporter* reporter, bool checksum, uint64_t initial_offset) - : file_(std::move(file)), + : file_(std::move(_file)), reporter_(reporter), checksum_(checksum), backing_store_(new char[kBlockSize]), @@ -32,8 +32,7 @@ Reader::Reader(unique_ptr&& file, Reporter* reporter, eof_offset_(0), last_record_offset_(0), end_of_buffer_offset_(0), - initial_offset_(initial_offset) { -} + initial_offset_(initial_offset) {} Reader::~Reader() { delete[] backing_store_; @@ -55,7 +54,7 @@ bool Reader::SkipToInitialBlock() { if (block_start_location > 0) { Status skip_status = file_->Skip(block_start_location); if 
(!skip_status.ok()) { - ReportDrop(block_start_location, skip_status); + ReportDrop(static_cast(block_start_location), skip_status); return false; } } diff --git a/db/log_test.cc b/db/log_test.cc index 6577a6a9c..8086e2775 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -558,9 +558,9 @@ TEST(LogTest, ErrorJoinsRecords) { ASSERT_EQ("correct", Read()); ASSERT_EQ("EOF", Read()); - const unsigned int dropped = DroppedBytes(); - ASSERT_LE(dropped, 2*kBlockSize + 100); - ASSERT_GE(dropped, 2*kBlockSize); + size_t dropped = DroppedBytes(); + ASSERT_LE(dropped, 2 * kBlockSize + 100); + ASSERT_GE(dropped, 2 * kBlockSize); } TEST(LogTest, ReadStart) { diff --git a/db/memtable.cc b/db/memtable.cc index 523998c30..6dcacc421 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -15,6 +15,7 @@ #include "db/dbformat.h" #include "db/merge_context.h" +#include "db/writebuffer.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" @@ -31,41 +32,63 @@ namespace rocksdb { -MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) +MemTableOptions::MemTableOptions( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options) + : write_buffer_size(mutable_cf_options.write_buffer_size), + arena_block_size(mutable_cf_options.arena_block_size), + memtable_prefix_bloom_bits(mutable_cf_options.memtable_prefix_bloom_bits), + memtable_prefix_bloom_probes( + mutable_cf_options.memtable_prefix_bloom_probes), + memtable_prefix_bloom_huge_page_tlb_size( + mutable_cf_options.memtable_prefix_bloom_huge_page_tlb_size), + inplace_update_support(ioptions.inplace_update_support), + inplace_update_num_locks(mutable_cf_options.inplace_update_num_locks), + inplace_callback(ioptions.inplace_callback), + max_successive_merges(mutable_cf_options.max_successive_merges), + filter_deletes(mutable_cf_options.filter_deletes), + statistics(ioptions.statistics), + merge_operator(ioptions.merge_operator), + info_log(ioptions.info_log) {} + 
+MemTable::MemTable(const InternalKeyComparator& cmp, + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, + WriteBuffer* write_buffer) : comparator_(cmp), + moptions_(ioptions, mutable_cf_options), refs_(0), - kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)), - kWriteBufferSize(options.write_buffer_size), - arena_(options.arena_block_size), - table_(options.memtable_factory->CreateMemTableRep( - comparator_, &arena_, options.prefix_extractor.get(), - options.info_log.get())), + kArenaBlockSize(OptimizeBlockSize(moptions_.arena_block_size)), + arena_(moptions_.arena_block_size), + allocator_(&arena_, write_buffer), + table_(ioptions.memtable_factory->CreateMemTableRep( + comparator_, &allocator_, ioptions.prefix_extractor, + ioptions.info_log)), num_entries_(0), flush_in_progress_(false), flush_completed_(false), file_number_(0), first_seqno_(0), mem_next_logfile_number_(0), - locks_(options.inplace_update_support ? options.inplace_update_num_locks - : 0), - prefix_extractor_(options.prefix_extractor.get()), - should_flush_(ShouldFlushNow()) { + locks_(moptions_.inplace_update_support ? + moptions_.inplace_update_num_locks : 0), + prefix_extractor_(ioptions.prefix_extractor), + should_flush_(ShouldFlushNow()), + flush_scheduled_(false) { // if should_flush_ == true without an entry inserted, something must have // gone wrong already. 
assert(!should_flush_); - if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) { + if (prefix_extractor_ && moptions_.memtable_prefix_bloom_bits > 0) { prefix_bloom_.reset(new DynamicBloom( - &arena_, - options.memtable_prefix_bloom_bits, options.bloom_locality, - options.memtable_prefix_bloom_probes, nullptr, - options.memtable_prefix_bloom_huge_page_tlb_size, - options.info_log.get())); + &allocator_, + moptions_.memtable_prefix_bloom_bits, ioptions.bloom_locality, + moptions_.memtable_prefix_bloom_probes, nullptr, + moptions_.memtable_prefix_bloom_huge_page_tlb_size, + ioptions.info_log)); } } -MemTable::~MemTable() { - assert(refs_ == 0); -} +MemTable::~MemTable() { assert(refs_ == 0); } size_t MemTable::ApproximateMemoryUsage() { size_t arena_usage = arena_.ApproximateMemoryUsage(); @@ -97,14 +120,16 @@ bool MemTable::ShouldFlushNow() const { // if we can still allocate one more block without exceeding the // over-allocation ratio, then we should not flush. if (allocated_memory + kArenaBlockSize < - kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) { + moptions_.write_buffer_size + + kArenaBlockSize * kAllowOverAllocationRatio) { return false; } - // if user keeps adding entries that exceeds kWriteBufferSize, we need to - // flush earlier even though we still have much available memory left. - if (allocated_memory > - kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) { + // if user keeps adding entries that exceeds moptions.write_buffer_size, + // we need to flush earlier even though we still have much available + // memory left. 
+ if (allocated_memory > moptions_.write_buffer_size + + kArenaBlockSize * kAllowOverAllocationRatio) { return true; } @@ -158,7 +183,7 @@ Slice MemTableRep::UserKey(const char* key) const { } KeyHandle MemTableRep::Allocate(const size_t len, char** buf) { - *buf = arena_->Allocate(len); + *buf = allocator_->Allocate(len); return static_cast(*buf); } @@ -167,7 +192,7 @@ KeyHandle MemTableRep::Allocate(const size_t len, char** buf) { // into this scratch space. const char* EncodeKey(std::string* scratch, const Slice& target) { scratch->clear(); - PutVarint32(scratch, target.size()); + PutVarint32(scratch, static_cast(target.size())); scratch->append(target.data(), target.size()); return scratch->data(); } @@ -175,12 +200,12 @@ const char* EncodeKey(std::string* scratch, const Slice& target) { class MemTableIterator: public Iterator { public: MemTableIterator( - const MemTable& mem, const ReadOptions& options, Arena* arena) + const MemTable& mem, const ReadOptions& read_options, Arena* arena) : bloom_(nullptr), prefix_extractor_(mem.prefix_extractor_), valid_(false), arena_mode_(arena != nullptr) { - if (prefix_extractor_ != nullptr && !options.total_order_seek) { + if (prefix_extractor_ != nullptr && !read_options.total_order_seek) { bloom_ = mem.prefix_bloom_.get(); iter_ = mem.table_->GetDynamicPrefixIterator(arena); } else { @@ -248,14 +273,10 @@ class MemTableIterator: public Iterator { void operator=(const MemTableIterator&); }; -Iterator* MemTable::NewIterator(const ReadOptions& options, Arena* arena) { - if (arena == nullptr) { - return new MemTableIterator(*this, options, nullptr); - } else { - auto mem = arena->AllocateAligned(sizeof(MemTableIterator)); - return new (mem) - MemTableIterator(*this, options, arena); - } +Iterator* MemTable::NewIterator(const ReadOptions& read_options, Arena* arena) { + assert(arena != nullptr); + auto mem = arena->AllocateAligned(sizeof(MemTableIterator)); + return new (mem) MemTableIterator(*this, read_options, arena); } 
port::RWMutex* MemTable::GetLock(const Slice& key) { @@ -271,12 +292,12 @@ void MemTable::Add(SequenceNumber s, ValueType type, // key bytes : char[internal_key.size()] // value_size : varint32 of value.size() // value bytes : char[value.size()] - size_t key_size = key.size(); - size_t val_size = value.size(); - size_t internal_key_size = key_size + 8; - const size_t encoded_len = - VarintLength(internal_key_size) + internal_key_size + - VarintLength(val_size) + val_size; + uint32_t key_size = static_cast(key.size()); + uint32_t val_size = static_cast(value.size()); + uint32_t internal_key_size = key_size + 8; + const uint32_t encoded_len = VarintLength(internal_key_size) + + internal_key_size + VarintLength(val_size) + + val_size; char* buf = nullptr; KeyHandle handle = table_->Allocate(encoded_len, &buf); assert(buf != nullptr); @@ -399,7 +420,6 @@ static bool SaveValue(void* arg, const char* entry) { *(s->found_final_value) = true; return false; } - std::string merge_result; // temporary area for merge results later Slice v = GetLengthPrefixedSlice(key_ptr + key_length); *(s->merge_in_progress) = true; merge_context->PushOperand(v); @@ -416,13 +436,13 @@ static bool SaveValue(void* arg, const char* entry) { } bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, - MergeContext& merge_context, const Options& options) { + MergeContext* merge_context) { // The sequence number is updated synchronously in version_set.h - if (first_seqno_ == 0) { + if (IsEmpty()) { // Avoiding recording stats for speed. 
return false; } - PERF_TIMER_AUTO(get_from_memtable_time); + PERF_TIMER_GUARD(get_from_memtable_time); Slice user_key = key.user_key(); bool found_final_value = false; @@ -440,11 +460,11 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, saver.value = value; saver.status = s; saver.mem = this; - saver.merge_context = &merge_context; - saver.merge_operator = options.merge_operator.get(); - saver.logger = options.info_log.get(); - saver.inplace_update_support = options.inplace_update_support; - saver.statistics = options.statistics.get(); + saver.merge_context = merge_context; + saver.merge_operator = moptions_.merge_operator; + saver.logger = moptions_.info_log; + saver.inplace_update_support = moptions_.inplace_update_support; + saver.statistics = moptions_.statistics; table_->Get(key, &saver, SaveValue); } @@ -452,7 +472,6 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, if (!found_final_value && merge_in_progress) { *s = Status::MergeInProgress(""); } - PERF_TIMER_STOP(get_from_memtable_time); PERF_COUNTER_ADD(get_from_memtable_count, 1); return found_final_value; } @@ -487,8 +506,8 @@ void MemTable::Update(SequenceNumber seq, switch (static_cast(tag & 0xff)) { case kTypeValue: { Slice prev_value = GetLengthPrefixedSlice(key_ptr + key_length); - uint32_t prev_size = prev_value.size(); - uint32_t new_size = value.size(); + uint32_t prev_size = static_cast(prev_value.size()); + uint32_t new_size = static_cast(value.size()); // Update value, if new value size <= previous value size if (new_size <= prev_size ) { @@ -517,8 +536,7 @@ void MemTable::Update(SequenceNumber seq, bool MemTable::UpdateCallback(SequenceNumber seq, const Slice& key, - const Slice& delta, - const Options& options) { + const Slice& delta) { LookupKey lkey(key, seq); Slice memkey = lkey.memtable_key(); @@ -546,15 +564,15 @@ bool MemTable::UpdateCallback(SequenceNumber seq, switch (static_cast(tag & 0xff)) { case kTypeValue: { Slice prev_value = 
GetLengthPrefixedSlice(key_ptr + key_length); - uint32_t prev_size = prev_value.size(); + uint32_t prev_size = static_cast(prev_value.size()); char* prev_buffer = const_cast(prev_value.data()); - uint32_t new_prev_size = prev_size; + uint32_t new_prev_size = prev_size; std::string str_value; WriteLock wl(GetLock(lkey.user_key())); - auto status = options.inplace_callback(prev_buffer, &new_prev_size, - delta, &str_value); + auto status = moptions_.inplace_callback(prev_buffer, &new_prev_size, + delta, &str_value); if (status == UpdateStatus::UPDATED_INPLACE) { // Value already updated by callback. assert(new_prev_size <= prev_size); @@ -567,12 +585,12 @@ bool MemTable::UpdateCallback(SequenceNumber seq, memcpy(p, prev_buffer, new_prev_size); } } - RecordTick(options.statistics.get(), NUMBER_KEYS_UPDATED); + RecordTick(moptions_.statistics, NUMBER_KEYS_UPDATED); should_flush_ = ShouldFlushNow(); return true; } else if (status == UpdateStatus::UPDATED) { Add(seq, kTypeValue, key, Slice(str_value)); - RecordTick(options.statistics.get(), NUMBER_KEYS_WRITTEN); + RecordTick(moptions_.statistics, NUMBER_KEYS_WRITTEN); should_flush_ = ShouldFlushNow(); return true; } else if (status == UpdateStatus::UPDATE_FAILED) { diff --git a/db/memtable.h b/db/memtable.h index 8bc281c6c..f3befce7d 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -10,21 +10,48 @@ #pragma once #include #include +#include #include +#include #include "db/dbformat.h" #include "db/skiplist.h" #include "db/version_edit.h" #include "rocksdb/db.h" #include "rocksdb/memtablerep.h" +#include "rocksdb/immutable_options.h" +#include "db/memtable_allocator.h" #include "util/arena.h" #include "util/dynamic_bloom.h" +#include "util/mutable_cf_options.h" namespace rocksdb { -class Arena; class Mutex; class MemTableIterator; class MergeContext; +class WriteBuffer; + +struct MemTableOptions { + explicit MemTableOptions( + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options); + size_t 
write_buffer_size; + size_t arena_block_size; + uint32_t memtable_prefix_bloom_bits; + uint32_t memtable_prefix_bloom_probes; + size_t memtable_prefix_bloom_huge_page_tlb_size; + bool inplace_update_support; + size_t inplace_update_num_locks; + UpdateStatus (*inplace_callback)(char* existing_value, + uint32_t* existing_value_size, + Slice delta_value, + std::string* merged_value); + size_t max_successive_merges; + bool filter_deletes; + Statistics* statistics; + MergeOperator* merge_operator; + Logger* info_log; +}; class MemTable { public: @@ -40,7 +67,9 @@ class MemTable { // MemTables are reference counted. The initial reference count // is zero and the caller must call Ref() at least once. explicit MemTable(const InternalKeyComparator& comparator, - const Options& options); + const ImmutableCFOptions& ioptions, + const MutableCFOptions& mutable_cf_options, + WriteBuffer* write_buffer); ~MemTable(); @@ -67,7 +96,11 @@ class MemTable { // This method heuristically determines if the memtable should continue to // host more data. - bool ShouldFlush() const { return should_flush_; } + bool ShouldScheduleFlush() const { + return flush_scheduled_ == false && should_flush_; + } + + void MarkFlushScheduled() { flush_scheduled_ = true; } // Return an iterator that yields the contents of the memtable. // @@ -81,8 +114,7 @@ class MemTable { // arena: If not null, the arena needs to be used to allocate the Iterator. // Calling ~Iterator of the iterator will destroy all the states but // those allocated in arena. - Iterator* NewIterator(const ReadOptions& options, - Arena* arena = nullptr); + Iterator* NewIterator(const ReadOptions& read_options, Arena* arena); // Add an entry into memtable that maps key to value at the // specified sequence number and with the specified type. @@ -100,7 +132,7 @@ class MemTable { // store MergeInProgress in s, and return false. // Else, return false. 
bool Get(const LookupKey& key, std::string* value, Status* s, - MergeContext& merge_context, const Options& options); + MergeContext* merge_context); // Attempts to update the new_value inplace, else does normal Add // Pseudocode @@ -124,8 +156,7 @@ class MemTable { // else return false bool UpdateCallback(SequenceNumber seq, const Slice& key, - const Slice& delta, - const Options& options); + const Slice& delta); // Returns the number of successive merge entries starting from the newest // entry for the key up to the last non-merge entry or last entry for the @@ -138,6 +169,9 @@ class MemTable { // Returns the edits area that is needed for flushing the memtable VersionEdit* GetEdits() { return &edit_; } + // Returns if there is no entry inserted to the mem table. + bool IsEmpty() const { return first_seqno_ == 0; } + // Returns the sequence number of the first element that was inserted // into the memtable SequenceNumber GetFirstSequenceNumber() { return first_seqno_; } @@ -151,7 +185,10 @@ class MemTable { void SetNextLogNumber(uint64_t num) { mem_next_logfile_number_ = num; } // Notify the underlying storage that no more items will be added - void MarkImmutable() { table_->MarkReadOnly(); } + void MarkImmutable() { + table_->MarkReadOnly(); + allocator_.DoneAllocating(); + } // return true if the current MemTableRep supports merge operator. bool IsMergeOperatorSupported() const { @@ -159,7 +196,10 @@ class MemTable { } // return true if the current MemTableRep supports snapshots. 
- bool IsSnapshotSupported() const { return table_->IsSnapshotSupported(); } + // inplace update prevents snapshots, + bool IsSnapshotSupported() const { + return table_->IsSnapshotSupported() && !moptions_.inplace_update_support; + } // Get the lock associated for the key port::RWMutex* GetLock(const Slice& key); @@ -168,10 +208,10 @@ class MemTable { return comparator_.comparator; } - const Arena& TEST_GetArena() const { return arena_; } + const MemTableOptions* GetMemTableOptions() const { return &moptions_; } private: - // Dynamically check if we can add more incoming entries. + // Dynamically check if we can add more incoming entries bool ShouldFlushNow() const; friend class MemTableIterator; @@ -179,10 +219,11 @@ class MemTable { friend class MemTableList; KeyComparator comparator_; + const MemTableOptions moptions_; int refs_; const size_t kArenaBlockSize; - const size_t kWriteBufferSize; Arena arena_; + MemTableAllocator allocator_; unique_ptr table_; uint64_t num_entries_; @@ -214,6 +255,9 @@ class MemTable { // a flag indicating if a memtable has met the criteria to flush bool should_flush_; + + // a flag indicating if flush has been scheduled + bool flush_scheduled_; }; extern const char* EncodeKey(std::string* scratch, const Slice& target); diff --git a/db/memtable_allocator.cc b/db/memtable_allocator.cc new file mode 100644 index 000000000..d3ecea2fd --- /dev/null +++ b/db/memtable_allocator.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include + +#include "db/memtable_allocator.h" +#include "db/writebuffer.h" +#include "util/arena.h" + +namespace rocksdb { + +MemTableAllocator::MemTableAllocator(Arena* arena, WriteBuffer* write_buffer) + : arena_(arena), write_buffer_(write_buffer), bytes_allocated_(0) { +} + +MemTableAllocator::~MemTableAllocator() { + DoneAllocating(); +} + +char* MemTableAllocator::Allocate(size_t bytes) { + assert(write_buffer_ != nullptr); + bytes_allocated_ += bytes; + write_buffer_->ReserveMem(bytes); + return arena_->Allocate(bytes); +} + +char* MemTableAllocator::AllocateAligned(size_t bytes, size_t huge_page_size, + Logger* logger) { + assert(write_buffer_ != nullptr); + bytes_allocated_ += bytes; + write_buffer_->ReserveMem(bytes); + return arena_->AllocateAligned(bytes, huge_page_size, logger); +} + +void MemTableAllocator::DoneAllocating() { + if (write_buffer_ != nullptr) { + write_buffer_->FreeMem(bytes_allocated_); + write_buffer_ = nullptr; + } +} + +size_t MemTableAllocator::BlockSize() const { + return arena_->BlockSize(); +} + +} // namespace rocksdb diff --git a/db/memtable_allocator.h b/db/memtable_allocator.h new file mode 100644 index 000000000..fa8ee1287 --- /dev/null +++ b/db/memtable_allocator.h @@ -0,0 +1,47 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// This is used by the MemTable to allocate write buffer memory. It connects +// to WriteBuffer so we can track and enforce overall write buffer limits. 
+ +#pragma once +#include "util/allocator.h" + +namespace rocksdb { + +class Arena; +class Logger; +class WriteBuffer; + +class MemTableAllocator : public Allocator { + public: + explicit MemTableAllocator(Arena* arena, WriteBuffer* write_buffer); + ~MemTableAllocator(); + + // Allocator interface + char* Allocate(size_t bytes) override; + char* AllocateAligned(size_t bytes, size_t huge_page_size = 0, + Logger* logger = nullptr) override; + size_t BlockSize() const override; + + // Call when we're finished allocating memory so we can free it from + // the write buffer's limit. + void DoneAllocating(); + + private: + Arena* arena_; + WriteBuffer* write_buffer_; + size_t bytes_allocated_; + + // No copying allowed + MemTableAllocator(const MemTableAllocator&); + void operator=(const MemTableAllocator&); +}; + +} // namespace rocksdb diff --git a/db/memtable_list.cc b/db/memtable_list.cc index d3fc1356b..44c069dd5 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -5,6 +5,11 @@ // #include "db/memtable_list.h" +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include #include #include "rocksdb/db.h" #include "db/memtable.h" @@ -62,10 +67,9 @@ int MemTableList::size() const { // Return the most recent value found, if any. // Operands stores the list of merge operations to apply, so far. 
bool MemTableListVersion::Get(const LookupKey& key, std::string* value, - Status* s, MergeContext& merge_context, - const Options& options) { + Status* s, MergeContext* merge_context) { for (auto& memtable : memlist_) { - if (memtable->Get(key, value, s, merge_context, options)) { + if (memtable->Get(key, value, s, merge_context)) { return true; } } @@ -73,9 +77,10 @@ bool MemTableListVersion::Get(const LookupKey& key, std::string* value, } void MemTableListVersion::AddIterators(const ReadOptions& options, - std::vector* iterator_list) { + std::vector* iterator_list, + Arena* arena) { for (auto& m : memlist_) { - iterator_list->push_back(m->NewIterator(options)); + iterator_list->push_back(m->NewIterator(options, arena)); } } @@ -114,7 +119,7 @@ void MemTableListVersion::Remove(MemTable* m) { bool MemTableList::IsFlushPending() const { if ((flush_requested_ && num_flush_not_started_ >= 1) || (num_flush_not_started_ >= min_write_buffer_number_to_merge_)) { - assert(imm_flush_needed.NoBarrier_Load() != nullptr); + assert(imm_flush_needed.load(std::memory_order_relaxed)); return true; } return false; @@ -129,7 +134,7 @@ void MemTableList::PickMemtablesToFlush(autovector* ret) { assert(!m->flush_completed_); num_flush_not_started_--; if (num_flush_not_started_ == 0) { - imm_flush_needed.Release_Store(nullptr); + imm_flush_needed.store(false, std::memory_order_release); } m->flush_in_progress_ = true; // flushing will start very soon ret->push_back(m); @@ -139,8 +144,7 @@ void MemTableList::PickMemtablesToFlush(autovector* ret) { } void MemTableList::RollbackMemtableFlush(const autovector& mems, - uint64_t file_number, - FileNumToPathIdMap* pending_outputs) { + uint64_t file_number) { assert(!mems.empty()); // If the flush was not successful, then just reset state. 
@@ -154,15 +158,14 @@ void MemTableList::RollbackMemtableFlush(const autovector& mems, m->edit_.Clear(); num_flush_not_started_++; } - pending_outputs->erase(file_number); - imm_flush_needed.Release_Store(reinterpret_cast(1)); + imm_flush_needed.store(true, std::memory_order_release); } // Record a successful flush in the manifest file Status MemTableList::InstallMemtableFlushResults( - ColumnFamilyData* cfd, const autovector& mems, VersionSet* vset, - port::Mutex* mu, Logger* info_log, uint64_t file_number, - FileNumToPathIdMap* pending_outputs, autovector* to_delete, + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + const autovector& mems, VersionSet* vset, InstrumentedMutex* mu, + uint64_t file_number, autovector* to_delete, Directory* db_directory, LogBuffer* log_buffer) { mu->AssertHeld(); @@ -193,11 +196,11 @@ Status MemTableList::InstallMemtableFlushResults( break; } - LogToBuffer(log_buffer, "[%s] Level-0 commit table #%lu started", - cfd->GetName().c_str(), (unsigned long)m->file_number_); + LogToBuffer(log_buffer, "[%s] Level-0 commit table #%" PRIu64 " started", + cfd->GetName().c_str(), m->file_number_); // this can release and reacquire the mutex. - s = vset->LogAndApply(cfd, &m->edit_, mu, db_directory); + s = vset->LogAndApply(cfd, mutable_cf_options, &m->edit_, mu, db_directory); // we will be changing the version in the next code path, // so we better create a new one, since versions are immutable @@ -208,34 +211,26 @@ Status MemTableList::InstallMemtableFlushResults( uint64_t mem_id = 1; // how many memtables has been flushed. 
do { if (s.ok()) { // commit new state - LogToBuffer(log_buffer, - "[%s] Level-0 commit table #%lu: memtable #%lu done", - cfd->GetName().c_str(), (unsigned long)m->file_number_, - (unsigned long)mem_id); + LogToBuffer(log_buffer, "[%s] Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " done", + cfd->GetName().c_str(), m->file_number_, mem_id); current_->Remove(m); assert(m->file_number_ > 0); - // pending_outputs can be cleared only after the newly created file - // has been written to a committed version so that other concurrently - // executing compaction threads do not mistakenly assume that this - // file is not live. - pending_outputs->erase(m->file_number_); if (m->Unref() != nullptr) { to_delete->push_back(m); } } else { //commit failed. setup state so that we can flush again. - Log(info_log, - "Level-0 commit table #%lu: memtable #%lu failed", - (unsigned long)m->file_number_, - (unsigned long)mem_id); + LogToBuffer(log_buffer, "Level-0 commit table #%" PRIu64 + ": memtable #%" PRIu64 " failed", + m->file_number_, mem_id); m->flush_completed_ = false; m->flush_in_progress_ = false; m->edit_.Clear(); num_flush_not_started_++; - pending_outputs->erase(m->file_number_); m->file_number_ = 0; - imm_flush_needed.Release_Store((void *)1); + imm_flush_needed.store(true, std::memory_order_release); } ++mem_id; } while (!current_->memlist_.empty() && (m = current_->memlist_.back()) && @@ -258,17 +253,17 @@ void MemTableList::Add(MemTable* m) { m->MarkImmutable(); num_flush_not_started_++; if (num_flush_not_started_ == 1) { - imm_flush_needed.Release_Store((void *)1); + imm_flush_needed.store(true, std::memory_order_release); } } // Returns an estimate of the number of bytes of data in use. 
size_t MemTableList::ApproximateMemoryUsage() { - size_t size = 0; + size_t total_size = 0; for (auto& memtable : current_->memlist_) { - size += memtable->ApproximateMemoryUsage(); + total_size += memtable->ApproximateMemoryUsage(); } - return size; + return total_size; } void MemTableList::InstallNewVersion() { diff --git a/db/memtable_list.h b/db/memtable_list.h index f4923e831..30382eac6 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -22,13 +22,14 @@ #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "util/autovector.h" +#include "util/instrumented_mutex.h" #include "util/log_buffer.h" namespace rocksdb { class ColumnFamilyData; class InternalKeyComparator; -class Mutex; +class InstrumentedMutex; class MergeIteratorBuilder; // keeps a list of immutable memtables in a vector. the list is immutable @@ -46,10 +47,10 @@ class MemTableListVersion { // Search all the memtables starting from the most recent one. // Return the most recent value found, if any. bool Get(const LookupKey& key, std::string* value, Status* s, - MergeContext& merge_context, const Options& options); + MergeContext* merge_context); void AddIterators(const ReadOptions& options, - std::vector* iterator_list); + std::vector* iterator_list, Arena* arena); void AddIterators(const ReadOptions& options, MergeIteratorBuilder* merge_iter_builder); @@ -78,12 +79,12 @@ class MemTableList { public: // A list of memtables. 
explicit MemTableList(int min_write_buffer_number_to_merge) - : min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge), + : imm_flush_needed(false), + min_write_buffer_number_to_merge_(min_write_buffer_number_to_merge), current_(new MemTableListVersion()), num_flush_not_started_(0), commit_in_progress_(false), flush_requested_(false) { - imm_flush_needed.Release_Store(nullptr); current_->Ref(); } ~MemTableList() {} @@ -92,7 +93,7 @@ class MemTableList { // so that background threads can detect non-nullptr pointer to // determine whether there is anything more to start flushing. - port::AtomicPointer imm_flush_needed; + std::atomic imm_flush_needed; // Returns the total number of memtables in the list int size() const; @@ -108,14 +109,13 @@ class MemTableList { // Reset status of the given memtable list back to pending state so that // they can get picked up again on the next round of flush. void RollbackMemtableFlush(const autovector& mems, - uint64_t file_number, - FileNumToPathIdMap* pending_outputs); + uint64_t file_number); // Commit a successful flush in the manifest file Status InstallMemtableFlushResults( - ColumnFamilyData* cfd, const autovector& m, VersionSet* vset, - port::Mutex* mu, Logger* info_log, uint64_t file_number, - FileNumToPathIdMap* pending_outputs, autovector* to_delete, + ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options, + const autovector& m, VersionSet* vset, InstrumentedMutex* mu, + uint64_t file_number, autovector* to_delete, Directory* db_directory, LogBuffer* log_buffer); // New memtables are inserted at the front of the list. diff --git a/db/memtablerep_bench.cc b/db/memtablerep_bench.cc new file mode 100644 index 000000000..5bdfa836d --- /dev/null +++ b/db/memtablerep_bench.cc @@ -0,0 +1,696 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#define __STDC_FORMAT_MACROS + +#ifndef GFLAGS +#include +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else + +#include + +#include +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/writebuffer.h" +#include "port/port.h" +#include "port/stack_trace.h" +#include "rocksdb/comparator.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/options.h" +#include "rocksdb/slice_transform.h" +#include "util/arena.h" +#include "util/mutexlock.h" +#include "util/stop_watch.h" +#include "util/testutil.h" + +using GFLAGS::ParseCommandLineFlags; +using GFLAGS::RegisterFlagValidator; +using GFLAGS::SetUsageMessage; + +DEFINE_string(benchmarks, "fillrandom", + "Comma-separated list of benchmarks to run. Options:\n" + "\tfillrandom -- write N random values\n" + "\tfillseq -- write N values in sequential order\n" + "\treadrandom -- read N values in random order\n" + "\treadseq -- scan the DB\n" + "\treadwrite -- 1 thread writes while N - 1 threads " + "do random\n" + "\t reads\n" + "\tseqreadwrite -- 1 thread writes while N - 1 threads " + "do scans\n"); + +DEFINE_string(memtablerep, "skiplist", + "Which implementation of memtablerep to use. See " + "include/memtablerep.h for\n" + " more details. 
Options:\n" + "\tskiplist -- backed by a skiplist\n" + "\tvector -- backed by an std::vector\n" + "\thashskiplist -- backed by a hash skip list\n" + "\thashlinklist -- backed by a hash linked list\n" + "\tcuckoo -- backed by a cuckoo hash table"); + +DEFINE_int64(bucket_count, 1000000, + "bucket_count parameter to pass into NewHashSkiplistRepFactory or " + "NewHashLinkListRepFactory"); + +DEFINE_int32( + hashskiplist_height, 4, + "skiplist_height parameter to pass into NewHashSkiplistRepFactory"); + +DEFINE_int32( + hashskiplist_branching_factor, 4, + "branching_factor parameter to pass into NewHashSkiplistRepFactory"); + +DEFINE_int32( + huge_page_tlb_size, 0, + "huge_page_tlb_size parameter to pass into NewHashLinkListRepFactory"); + +DEFINE_int32(bucket_entries_logging_threshold, 4096, + "bucket_entries_logging_threshold parameter to pass into " + "NewHashLinkListRepFactory"); + +DEFINE_bool(if_log_bucket_dist_when_flash, true, + "if_log_bucket_dist_when_flash parameter to pass into " + "NewHashLinkListRepFactory"); + +DEFINE_int32( + threshold_use_skiplist, 256, + "threshold_use_skiplist parameter to pass into NewHashLinkListRepFactory"); + +DEFINE_int64( + write_buffer_size, 256, + "write_buffer_size parameter to pass into NewHashCuckooRepFactory"); + +DEFINE_int64( + average_data_size, 64, + "average_data_size parameter to pass into NewHashCuckooRepFactory"); + +DEFINE_int64( + hash_function_count, 4, + "hash_function_count parameter to pass into NewHashCuckooRepFactory"); + +DEFINE_int32( + num_threads, 1, + "Number of concurrent threads to run. 
If the benchmark includes writes,\n" + "then at most one thread will be a writer"); + +DEFINE_int32(num_operations, 1000000, + "Number of operations to do for write and random read benchmarks"); + +DEFINE_int32(num_scans, 10, + "Number of times for each thread to scan the memtablerep for " + "sequential read " + "benchmarks"); + +DEFINE_int32(item_size, 100, "Number of bytes each item should be"); + +DEFINE_int32(prefix_length, 8, + "Prefix length to pass into NewFixedPrefixTransform"); + +/* VectorRep settings */ +DEFINE_int64(vectorrep_count, 0, + "Number of entries to reserve on VectorRep initialization"); + +DEFINE_int64(seed, 0, + "Seed base for random number generators. " + "When 0 it is deterministic."); + +static rocksdb::Env* FLAGS_env = rocksdb::Env::Default(); + +namespace rocksdb { + +namespace { +struct CallbackVerifyArgs { + bool found; + LookupKey* key; + MemTableRep* table; + InternalKeyComparator* comparator; +}; +} // namespace + +// Helper for quickly generating random data. +class RandomGenerator { + private: + std::string data_; + unsigned int pos_; + + public: + RandomGenerator() { + Random rnd(301); + auto size = (unsigned)std::max(1048576, FLAGS_item_size); + test::RandomString(&rnd, size, &data_); + pos_ = 0; + } + + Slice Generate(unsigned int len) { + assert(len <= data_.size()); + if (pos_ + len > data_.size()) { + pos_ = 0; + } + pos_ += len; + return Slice(data_.data() + pos_ - len, len); + } +}; + +enum WriteMode { SEQUENTIAL, RANDOM, UNIQUE_RANDOM }; + +class KeyGenerator { + public: + KeyGenerator(Random64* rand, WriteMode mode, uint64_t num) + : rand_(rand), mode_(mode), num_(num), next_(0) { + if (mode_ == UNIQUE_RANDOM) { + // NOTE: if memory consumption of this approach becomes a concern, + // we can either break it into pieces and only random shuffle a section + // each time. 
Alternatively, use a bit map implementation + // (https://reviews.facebook.net/differential/diff/54627/) + values_.resize(num_); + for (uint64_t i = 0; i < num_; ++i) { + values_[i] = i; + } + std::shuffle( + values_.begin(), values_.end(), + std::default_random_engine(static_cast(FLAGS_seed))); + } + } + + uint64_t Next() { + switch (mode_) { + case SEQUENTIAL: + return next_++; + case RANDOM: + return rand_->Next() % num_; + case UNIQUE_RANDOM: + return values_[next_++]; + } + assert(false); + return std::numeric_limits::max(); + } + + private: + Random64* rand_; + WriteMode mode_; + const uint64_t num_; + uint64_t next_; + std::vector values_; +}; + +class BenchmarkThread { + public: + explicit BenchmarkThread(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* bytes_written, uint64_t* bytes_read, + uint64_t* sequence, uint64_t num_ops, + uint64_t* read_hits) + : table_(table), + key_gen_(key_gen), + bytes_written_(bytes_written), + bytes_read_(bytes_read), + sequence_(sequence), + num_ops_(num_ops), + read_hits_(read_hits) {} + + virtual void operator()() = 0; + virtual ~BenchmarkThread() {} + + protected: + MemTableRep* table_; + KeyGenerator* key_gen_; + uint64_t* bytes_written_; + uint64_t* bytes_read_; + uint64_t* sequence_; + uint64_t num_ops_; + uint64_t* read_hits_; + RandomGenerator generator_; +}; + +class FillBenchmarkThread : public BenchmarkThread { + public: + FillBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* bytes_written, uint64_t* bytes_read, + uint64_t* sequence, uint64_t num_ops, uint64_t* read_hits) + : BenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence, + num_ops, read_hits) {} + + void FillOne() { + char* buf = nullptr; + auto internal_key_size = 16; + auto encoded_len = + FLAGS_item_size + VarintLength(internal_key_size) + internal_key_size; + KeyHandle handle = table_->Allocate(encoded_len, &buf); + assert(buf != nullptr); + char* p = EncodeVarint32(buf, internal_key_size); + auto key = 
key_gen_->Next(); + EncodeFixed64(p, key); + p += 8; + EncodeFixed64(p, ++(*sequence_)); + p += 8; + Slice bytes = generator_.Generate(FLAGS_item_size); + memcpy(p, bytes.data(), FLAGS_item_size); + p += FLAGS_item_size; + assert(p == buf + encoded_len); + table_->Insert(handle); + *bytes_written_ += encoded_len; + } + + void operator()() override { + for (unsigned int i = 0; i < num_ops_; ++i) { + FillOne(); + } + } +}; + +class ConcurrentFillBenchmarkThread : public FillBenchmarkThread { + public: + ConcurrentFillBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* bytes_written, uint64_t* bytes_read, + uint64_t* sequence, uint64_t num_ops, + uint64_t* read_hits, + std::atomic_int* threads_done) + : FillBenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence, + num_ops, read_hits) { + threads_done_ = threads_done; + } + + void operator()() override { + // # of read threads will be total threads - write threads (always 1). Loop + // while all reads complete. 
+ while ((*threads_done_).load() < (FLAGS_num_threads - 1)) { + FillOne(); + } + } + + private: + std::atomic_int* threads_done_; +}; + +class ReadBenchmarkThread : public BenchmarkThread { + public: + ReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* bytes_written, uint64_t* bytes_read, + uint64_t* sequence, uint64_t num_ops, uint64_t* read_hits) + : BenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence, + num_ops, read_hits) {} + + static bool callback(void* arg, const char* entry) { + CallbackVerifyArgs* callback_args = static_cast(arg); + assert(callback_args != nullptr); + uint32_t key_length; + const char* key_ptr = GetVarint32Ptr(entry, entry + 5, &key_length); + if ((callback_args->comparator)->user_comparator()->Compare( + Slice(key_ptr, key_length - 8), callback_args->key->user_key()) == + 0) { + callback_args->found = true; + } + return false; + } + + void ReadOne() { + std::string user_key; + auto key = key_gen_->Next(); + PutFixed64(&user_key, key); + LookupKey lookup_key(user_key, *sequence_); + InternalKeyComparator internal_key_comp(BytewiseComparator()); + CallbackVerifyArgs verify_args; + verify_args.found = false; + verify_args.key = &lookup_key; + verify_args.table = table_; + verify_args.comparator = &internal_key_comp; + table_->Get(lookup_key, &verify_args, callback); + if (verify_args.found) { + *bytes_read_ += VarintLength(16) + 16 + FLAGS_item_size; + ++*read_hits_; + } + } + void operator()() override { + for (unsigned int i = 0; i < num_ops_; ++i) { + ReadOne(); + } + } +}; + +class SeqReadBenchmarkThread : public BenchmarkThread { + public: + SeqReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* bytes_written, uint64_t* bytes_read, + uint64_t* sequence, uint64_t num_ops, + uint64_t* read_hits) + : BenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence, + num_ops, read_hits) {} + + void ReadOneSeq() { + std::unique_ptr iter(table_->GetIterator()); + for 
(iter->SeekToFirst(); iter->Valid(); iter->Next()) { + // pretend to read the value + *bytes_read_ += VarintLength(16) + 16 + FLAGS_item_size; + } + ++*read_hits_; + } + + void operator()() override { + for (unsigned int i = 0; i < num_ops_; ++i) { + { ReadOneSeq(); } + } + } +}; + +class ConcurrentReadBenchmarkThread : public ReadBenchmarkThread { + public: + ConcurrentReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* bytes_written, uint64_t* bytes_read, + uint64_t* sequence, uint64_t num_ops, + uint64_t* read_hits, + std::atomic_int* threads_done) + : ReadBenchmarkThread(table, key_gen, bytes_written, bytes_read, sequence, + num_ops, read_hits) { + threads_done_ = threads_done; + } + + void operator()() override { + for (unsigned int i = 0; i < num_ops_; ++i) { + ReadOne(); + } + ++*threads_done_; + } + + private: + std::atomic_int* threads_done_; +}; + +class SeqConcurrentReadBenchmarkThread : public SeqReadBenchmarkThread { + public: + SeqConcurrentReadBenchmarkThread(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* bytes_written, + uint64_t* bytes_read, uint64_t* sequence, + uint64_t num_ops, uint64_t* read_hits, + std::atomic_int* threads_done) + : SeqReadBenchmarkThread(table, key_gen, bytes_written, bytes_read, + sequence, num_ops, read_hits) { + threads_done_ = threads_done; + } + + void operator()() override { + for (unsigned int i = 0; i < num_ops_; ++i) { + ReadOneSeq(); + } + ++*threads_done_; + } + + private: + std::atomic_int* threads_done_; +}; + +class Benchmark { + public: + explicit Benchmark(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* sequence, uint32_t num_threads) + : table_(table), + key_gen_(key_gen), + sequence_(sequence), + num_threads_(num_threads) {} + + virtual ~Benchmark() {} + virtual void Run() { + std::cout << "Number of threads: " << num_threads_ << std::endl; + std::vector threads; + uint64_t bytes_written = 0; + uint64_t bytes_read = 0; + uint64_t read_hits = 0; + StopWatchNano 
timer(Env::Default(), true); + RunThreads(&threads, &bytes_written, &bytes_read, true, &read_hits); + auto elapsed_time = static_cast(timer.ElapsedNanos() / 1000); + std::cout << "Elapsed time: " << static_cast(elapsed_time) << " us" + << std::endl; + + if (bytes_written > 0) { + auto MiB_written = static_cast(bytes_written) / (1 << 20); + auto write_throughput = MiB_written / (elapsed_time / 1000000); + std::cout << "Total bytes written: " << MiB_written << " MiB" + << std::endl; + std::cout << "Write throughput: " << write_throughput << " MiB/s" + << std::endl; + auto us_per_op = elapsed_time / num_write_ops_per_thread_; + std::cout << "write us/op: " << us_per_op << std::endl; + } + if (bytes_read > 0) { + auto MiB_read = static_cast(bytes_read) / (1 << 20); + auto read_throughput = MiB_read / (elapsed_time / 1000000); + std::cout << "Total bytes read: " << MiB_read << " MiB" << std::endl; + std::cout << "Read throughput: " << read_throughput << " MiB/s" + << std::endl; + auto us_per_op = elapsed_time / num_read_ops_per_thread_; + std::cout << "read us/op: " << us_per_op << std::endl; + } + } + + virtual void RunThreads(std::vector* threads, + uint64_t* bytes_written, uint64_t* bytes_read, + bool write, uint64_t* read_hits) = 0; + + protected: + MemTableRep* table_; + KeyGenerator* key_gen_; + uint64_t* sequence_; + uint64_t num_write_ops_per_thread_; + uint64_t num_read_ops_per_thread_; + const uint32_t num_threads_; +}; + +class FillBenchmark : public Benchmark { + public: + explicit FillBenchmark(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* sequence) + : Benchmark(table, key_gen, sequence, 1) { + num_write_ops_per_thread_ = FLAGS_num_operations; + } + + void RunThreads(std::vector* threads, uint64_t* bytes_written, + uint64_t* bytes_read, bool write, + uint64_t* read_hits) override { + FillBenchmarkThread(table_, key_gen_, bytes_written, bytes_read, sequence_, + num_write_ops_per_thread_, read_hits)(); + } +}; + +class ReadBenchmark : public 
Benchmark { + public: + explicit ReadBenchmark(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* sequence) + : Benchmark(table, key_gen, sequence, FLAGS_num_threads) { + num_read_ops_per_thread_ = FLAGS_num_operations / FLAGS_num_threads; + } + + void RunThreads(std::vector* threads, uint64_t* bytes_written, + uint64_t* bytes_read, bool write, + uint64_t* read_hits) override { + for (int i = 0; i < FLAGS_num_threads; ++i) { + threads->emplace_back( + ReadBenchmarkThread(table_, key_gen_, bytes_written, bytes_read, + sequence_, num_read_ops_per_thread_, read_hits)); + } + for (auto& thread : *threads) { + thread.join(); + } + std::cout << "read hit%: " + << (static_cast(*read_hits) / FLAGS_num_operations) * 100 + << std::endl; + } +}; + +class SeqReadBenchmark : public Benchmark { + public: + explicit SeqReadBenchmark(MemTableRep* table, uint64_t* sequence) + : Benchmark(table, nullptr, sequence, FLAGS_num_threads) { + num_read_ops_per_thread_ = FLAGS_num_scans; + } + + void RunThreads(std::vector* threads, uint64_t* bytes_written, + uint64_t* bytes_read, bool write, + uint64_t* read_hits) override { + for (int i = 0; i < FLAGS_num_threads; ++i) { + threads->emplace_back(SeqReadBenchmarkThread( + table_, key_gen_, bytes_written, bytes_read, sequence_, + num_read_ops_per_thread_, read_hits)); + } + for (auto& thread : *threads) { + thread.join(); + } + } +}; + +template +class ReadWriteBenchmark : public Benchmark { + public: + explicit ReadWriteBenchmark(MemTableRep* table, KeyGenerator* key_gen, + uint64_t* sequence) + : Benchmark(table, key_gen, sequence, FLAGS_num_threads) { + num_read_ops_per_thread_ = + FLAGS_num_threads <= 1 + ? 
0 + : (FLAGS_num_operations / (FLAGS_num_threads - 1)); + num_write_ops_per_thread_ = FLAGS_num_operations; + } + + void RunThreads(std::vector* threads, uint64_t* bytes_written, + uint64_t* bytes_read, bool write, + uint64_t* read_hits) override { + std::atomic_int threads_done; + threads_done.store(0); + threads->emplace_back(ConcurrentFillBenchmarkThread( + table_, key_gen_, bytes_written, bytes_read, sequence_, + num_write_ops_per_thread_, read_hits, &threads_done)); + for (int i = 1; i < FLAGS_num_threads; ++i) { + threads->emplace_back( + ReadThreadType(table_, key_gen_, bytes_written, bytes_read, sequence_, + num_read_ops_per_thread_, read_hits, &threads_done)); + } + for (auto& thread : *threads) { + thread.join(); + } + } +}; + +} // namespace rocksdb + +void PrintWarnings() { +#if defined(__GNUC__) && !defined(__OPTIMIZE__) + fprintf(stdout, + "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n"); +#endif +#ifndef NDEBUG + fprintf(stdout, + "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); +#endif +} + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + + " [OPTIONS]..."); + ParseCommandLineFlags(&argc, &argv, true); + + PrintWarnings(); + + rocksdb::Options options; + + std::unique_ptr factory; + if (FLAGS_memtablerep == "skiplist") { + factory.reset(new rocksdb::SkipListFactory); + } else if (FLAGS_memtablerep == "vector") { + factory.reset(new rocksdb::VectorRepFactory); + } else if (FLAGS_memtablerep == "hashskiplist") { + factory.reset(rocksdb::NewHashSkipListRepFactory( + FLAGS_bucket_count, FLAGS_hashskiplist_height, + FLAGS_hashskiplist_branching_factor)); + options.prefix_extractor.reset( + rocksdb::NewFixedPrefixTransform(FLAGS_prefix_length)); + } else if (FLAGS_memtablerep == "hashlinklist") { + factory.reset(rocksdb::NewHashLinkListRepFactory( + FLAGS_bucket_count, FLAGS_huge_page_tlb_size, + 
FLAGS_bucket_entries_logging_threshold, + FLAGS_if_log_bucket_dist_when_flash, FLAGS_threshold_use_skiplist)); + options.prefix_extractor.reset( + rocksdb::NewFixedPrefixTransform(FLAGS_prefix_length)); + } else if (FLAGS_memtablerep == "cuckoo") { + factory.reset(rocksdb::NewHashCuckooRepFactory( + FLAGS_write_buffer_size, FLAGS_average_data_size, + static_cast(FLAGS_hash_function_count))); + options.prefix_extractor.reset( + rocksdb::NewFixedPrefixTransform(FLAGS_prefix_length)); + } else { + fprintf(stdout, "Unknown memtablerep: %s\n", FLAGS_memtablerep.c_str()); + exit(1); + } + + rocksdb::InternalKeyComparator internal_key_comp( + rocksdb::BytewiseComparator()); + rocksdb::MemTable::KeyComparator key_comp(internal_key_comp); + rocksdb::Arena arena; + rocksdb::WriteBuffer wb(FLAGS_write_buffer_size); + rocksdb::MemTableAllocator memtable_allocator(&arena, &wb); + uint64_t sequence; + auto createMemtableRep = [&] { + sequence = 0; + return factory->CreateMemTableRep(key_comp, &memtable_allocator, + options.prefix_extractor.get(), + options.info_log.get()); + }; + std::unique_ptr memtablerep; + rocksdb::Random64 rng(FLAGS_seed); + const char* benchmarks = FLAGS_benchmarks.c_str(); + while (benchmarks != nullptr) { + std::unique_ptr key_gen; + const char* sep = strchr(benchmarks, ','); + rocksdb::Slice name; + if (sep == nullptr) { + name = benchmarks; + benchmarks = nullptr; + } else { + name = rocksdb::Slice(benchmarks, sep - benchmarks); + benchmarks = sep + 1; + } + std::unique_ptr benchmark; + if (name == rocksdb::Slice("fillseq")) { + memtablerep.reset(createMemtableRep()); + key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::SEQUENTIAL, + FLAGS_num_operations)); + benchmark.reset(new rocksdb::FillBenchmark(memtablerep.get(), + key_gen.get(), &sequence)); + } else if (name == rocksdb::Slice("fillrandom")) { + memtablerep.reset(createMemtableRep()); + key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::UNIQUE_RANDOM, + FLAGS_num_operations)); + 
benchmark.reset(new rocksdb::FillBenchmark(memtablerep.get(), + key_gen.get(), &sequence)); + } else if (name == rocksdb::Slice("readrandom")) { + key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::RANDOM, + FLAGS_num_operations)); + benchmark.reset(new rocksdb::ReadBenchmark(memtablerep.get(), + key_gen.get(), &sequence)); + } else if (name == rocksdb::Slice("readseq")) { + key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::SEQUENTIAL, + FLAGS_num_operations)); + benchmark.reset( + new rocksdb::SeqReadBenchmark(memtablerep.get(), &sequence)); + } else if (name == rocksdb::Slice("readwrite")) { + memtablerep.reset(createMemtableRep()); + key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::RANDOM, + FLAGS_num_operations)); + benchmark.reset(new rocksdb::ReadWriteBenchmark< + rocksdb::ConcurrentReadBenchmarkThread>(memtablerep.get(), + key_gen.get(), &sequence)); + } else if (name == rocksdb::Slice("seqreadwrite")) { + memtablerep.reset(createMemtableRep()); + key_gen.reset(new rocksdb::KeyGenerator(&rng, rocksdb::RANDOM, + FLAGS_num_operations)); + benchmark.reset(new rocksdb::ReadWriteBenchmark< + rocksdb::SeqConcurrentReadBenchmarkThread>(memtablerep.get(), + key_gen.get(), &sequence)); + } else { + std::cout << "WARNING: skipping unknown benchmark '" << name.ToString() + << std::endl; + continue; + } + std::cout << "Running " << name.ToString() << std::endl; + benchmark->Run(); + } + + return 0; +} + +#endif // GFLAGS diff --git a/db/merge_helper.cc b/db/merge_helper.cc index 7bde824ab..11b5d8f47 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -85,9 +85,10 @@ void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before, // We store the result in keys_.back() and operands_.back() // if nothing went wrong (i.e.: no operand corruption on disk) if (success_) { - std::string& key = keys_.back(); // The original key encountered + std::string& original_key = + keys_.back(); // The original key encountered orig_ikey.type = kTypeValue; - 
UpdateInternalKey(&key[0], key.size(), + UpdateInternalKey(&original_key[0], original_key.size(), orig_ikey.sequence, orig_ikey.type); swap(operands_.back(), merge_result); } else { @@ -108,17 +109,17 @@ void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before, // => store result in operands_.back() (and update keys_.back()) // => change the entry type to kTypeValue for keys_.back() // We are done! Success! - const Slice value = iter->value(); - success_ = user_merge_operator_->FullMerge(ikey.user_key, &value, - operands_, &merge_result, - logger_); + const Slice val = iter->value(); + success_ = user_merge_operator_->FullMerge(ikey.user_key, &val, operands_, + &merge_result, logger_); // We store the result in keys_.back() and operands_.back() // if nothing went wrong (i.e.: no operand corruption on disk) if (success_) { - std::string& key = keys_.back(); // The original key encountered + std::string& original_key = + keys_.back(); // The original key encountered orig_ikey.type = kTypeValue; - UpdateInternalKey(&key[0], key.size(), + UpdateInternalKey(&original_key[0], original_key.size(), orig_ikey.sequence, orig_ikey.type); swap(operands_.back(), merge_result); } else { @@ -177,9 +178,9 @@ void MergeHelper::MergeUntil(Iterator* iter, SequenceNumber stop_before, logger_); if (success_) { - std::string& key = keys_.back(); // The original key encountered + std::string& original_key = keys_.back(); // The original key encountered orig_ikey.type = kTypeValue; - UpdateInternalKey(&key[0], key.size(), + UpdateInternalKey(&original_key[0], original_key.size(), orig_ikey.sequence, orig_ikey.type); // The final value() is always stored in operands_.back() diff --git a/db/merge_test.cc b/db/merge_test.cc index 7e71ccf86..1d7800883 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -23,15 +23,11 @@ using namespace std; using namespace rocksdb; namespace { - int numMergeOperatorCalls; - void resetNumMergeOperatorCalls() { - numMergeOperatorCalls = 0; - } 
+size_t num_merge_operator_calls; +void resetNumMergeOperatorCalls() { num_merge_operator_calls = 0; } - int num_partial_merge_calls; - void resetNumPartialMergeCalls() { - num_partial_merge_calls = 0; - } +size_t num_partial_merge_calls; +void resetNumPartialMergeCalls() { num_partial_merge_calls = 0; } } class CountMergeOperator : public AssociativeMergeOperator { @@ -45,7 +41,7 @@ class CountMergeOperator : public AssociativeMergeOperator { const Slice& value, std::string* new_value, Logger* logger) const override { - ++numMergeOperatorCalls; + ++num_merge_operator_calls; if (existing_value == nullptr) { new_value->assign(value.data(), value.size()); return true; @@ -307,31 +303,31 @@ void testCounters(Counters& counters, DB* db, bool test_compaction) { } } -void testSuccessiveMerge( - Counters& counters, int max_num_merges, int num_merges) { +void testSuccessiveMerge(Counters& counters, size_t max_num_merges, + size_t num_merges) { counters.assert_remove("z"); uint64_t sum = 0; - for (int i = 1; i <= num_merges; ++i) { + for (size_t i = 1; i <= num_merges; ++i) { resetNumMergeOperatorCalls(); counters.assert_add("z", i); sum += i; if (i % (max_num_merges + 1) == 0) { - assert(numMergeOperatorCalls == max_num_merges + 1); + assert(num_merge_operator_calls == max_num_merges + 1); } else { - assert(numMergeOperatorCalls == 0); + assert(num_merge_operator_calls == 0); } resetNumMergeOperatorCalls(); assert(counters.assert_get("z") == sum); - assert(numMergeOperatorCalls == i % (max_num_merges + 1)); + assert(num_merge_operator_calls == i % (max_num_merges + 1)); } } -void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge, - int count) { +void testPartialMerge(Counters* counters, DB* db, size_t max_merge, + size_t min_merge, size_t count) { FlushOptions o; o.wait = true; @@ -339,7 +335,7 @@ void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge, // operands exceeds the threshold. 
uint64_t tmp_sum = 0; resetNumPartialMergeCalls(); - for (int i = 1; i <= count; i++) { + for (size_t i = 1; i <= count; i++) { counters->assert_add("b", i); tmp_sum += i; } @@ -348,7 +344,7 @@ void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge, ASSERT_EQ(tmp_sum, counters->assert_get("b")); if (count > max_merge) { // in this case, FullMerge should be called instead. - ASSERT_EQ(num_partial_merge_calls, 0); + ASSERT_EQ(num_partial_merge_calls, 0U); } else { // if count >= min_merge, then partial merge should be called once. ASSERT_EQ((count >= min_merge), (num_partial_merge_calls == 1)); @@ -358,20 +354,18 @@ void testPartialMerge(Counters* counters, DB* db, int max_merge, int min_merge, resetNumPartialMergeCalls(); tmp_sum = 0; db->Put(rocksdb::WriteOptions(), "c", "10"); - for (int i = 1; i <= count; i++) { + for (size_t i = 1; i <= count; i++) { counters->assert_add("c", i); tmp_sum += i; } db->Flush(o); db->CompactRange(nullptr, nullptr); ASSERT_EQ(tmp_sum, counters->assert_get("c")); - ASSERT_EQ(num_partial_merge_calls, 0); + ASSERT_EQ(num_partial_merge_calls, 0U); } -void testSingleBatchSuccessiveMerge( - DB* db, - int max_num_merges, - int num_merges) { +void testSingleBatchSuccessiveMerge(DB* db, size_t max_num_merges, + size_t num_merges) { assert(num_merges > max_num_merges); Slice key("BatchSuccessiveMerge"); @@ -380,7 +374,7 @@ void testSingleBatchSuccessiveMerge( // Create the batch WriteBatch batch; - for (int i = 0; i < num_merges; ++i) { + for (size_t i = 0; i < num_merges; ++i) { batch.Merge(key, merge_value_slice); } @@ -390,8 +384,9 @@ void testSingleBatchSuccessiveMerge( Status s = db->Write(WriteOptions(), &batch); assert(s.ok()); } - assert(numMergeOperatorCalls == - num_merges - (num_merges % (max_num_merges + 1))); + ASSERT_EQ( + num_merge_operator_calls, + static_cast(num_merges - (num_merges % (max_num_merges + 1)))); // Get the value resetNumMergeOperatorCalls(); @@ -403,18 +398,11 @@ void 
testSingleBatchSuccessiveMerge( assert(get_value_str.size() == sizeof(uint64_t)); uint64_t get_value = DecodeFixed64(&get_value_str[0]); ASSERT_EQ(get_value, num_merges * merge_value); - ASSERT_EQ(numMergeOperatorCalls, (num_merges % (max_num_merges + 1))); + ASSERT_EQ(num_merge_operator_calls, + static_cast((num_merges % (max_num_merges + 1)))); } void runTest(int argc, const string& dbname, const bool use_ttl = false) { - auto db = OpenDb(dbname, use_ttl); - - { - cout << "Test read-modify-write counters... \n"; - Counters counters(db, 0); - testCounters(counters, db.get(), true); - } - bool compact = false; if (argc > 1) { compact = true; @@ -422,13 +410,22 @@ void runTest(int argc, const string& dbname, const bool use_ttl = false) { } { - cout << "Test merge-based counters... \n"; - MergeBasedCounters counters(db, 0); - testCounters(counters, db.get(), compact); + auto db = OpenDb(dbname, use_ttl); + + { + cout << "Test read-modify-write counters... \n"; + Counters counters(db, 0); + testCounters(counters, db.get(), true); + } + + { + cout << "Test merge-based counters... \n"; + MergeBasedCounters counters(db, 0); + testCounters(counters, db.get(), compact); + } } DestroyDB(dbname, Options()); - db.reset(); { cout << "Test merge in memtable... 
\n"; diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index a182fb521..be35fd6d9 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -6,7 +6,6 @@ #include #include #include -#include "/usr/include/valgrind/callgrind.h" #include "rocksdb/db.h" #include "rocksdb/perf_context.h" @@ -15,6 +14,8 @@ #include "util/histogram.h" #include "util/stop_watch.h" #include "util/testharness.h" +#include "util/thread_status_util.h" +#include "util/string_util.h" bool FLAGS_random_key = false; @@ -29,7 +30,7 @@ const std::string kDbName = rocksdb::test::TmpDir() + "/perf_context_test"; namespace rocksdb { -std::shared_ptr OpenDb() { +std::shared_ptr OpenDb(bool read_only = false) { DB* db; Options options; options.create_if_missing = true; @@ -39,12 +40,21 @@ std::shared_ptr OpenDb() { FLAGS_min_write_buffer_number_to_merge; if (FLAGS_use_set_based_memetable) { - auto prefix_extractor = rocksdb::NewFixedPrefixTransform(0); - options.memtable_factory.reset( - NewHashSkipListRepFactory(prefix_extractor)); +#ifndef ROCKSDB_LITE + options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(0)); + options.memtable_factory.reset(NewHashSkipListRepFactory()); +#else + fprintf(stderr, "Prefix hash is not supported in lite mode\n"); + exit(1); +#endif // ROCKSDB_LITE } - Status s = DB::Open(options, kDbName, &db); + Status s; + if (!read_only) { + s = DB::Open(options, kDbName, &db); + } else { + s = DB::OpenForReadOnly(options, kDbName, &db); + } ASSERT_OK(s); return std::shared_ptr(db); } @@ -58,25 +68,26 @@ TEST(PerfContextTest, SeekIntoDeletion) { ReadOptions read_options; for (int i = 0; i < FLAGS_total_keys; ++i) { - std::string key = "k" + std::to_string(i); - std::string value = "v" + std::to_string(i); + std::string key = "k" + ToString(i); + std::string value = "v" + ToString(i); db->Put(write_options, key, value); } for (int i = 0; i < FLAGS_total_keys -1 ; ++i) { - std::string key = "k" + std::to_string(i); + std::string key = "k" + 
ToString(i); db->Delete(write_options, key); } HistogramImpl hist_get; HistogramImpl hist_get_time; for (int i = 0; i < FLAGS_total_keys - 1; ++i) { - std::string key = "k" + std::to_string(i); + std::string key = "k" + ToString(i); std::string value; perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); + StopWatchNano timer(Env::Default()); + timer.Start(); auto status = db->Get(read_options, key, &value); auto elapsed_nanos = timer.ElapsedNanos(); ASSERT_TRUE(status.IsNotFound()); @@ -84,27 +95,32 @@ TEST(PerfContextTest, SeekIntoDeletion) { hist_get_time.Add(elapsed_nanos); } - std::cout << "Get uesr key comparison: \n" << hist_get.ToString() + std::cout << "Get user key comparison: \n" << hist_get.ToString() << "Get time: \n" << hist_get_time.ToString(); - HistogramImpl hist_seek_to_first; - std::unique_ptr iter(db->NewIterator(read_options)); + { + HistogramImpl hist_seek_to_first; + std::unique_ptr iter(db->NewIterator(read_options)); - perf_context.Reset(); - StopWatchNano timer(Env::Default(), true); - iter->SeekToFirst(); - hist_seek_to_first.Add(perf_context.user_key_comparison_count); - auto elapsed_nanos = timer.ElapsedNanos(); + perf_context.Reset(); + StopWatchNano timer(Env::Default(), true); + iter->SeekToFirst(); + hist_seek_to_first.Add(perf_context.user_key_comparison_count); + auto elapsed_nanos = timer.ElapsedNanos(); - std::cout << "SeekToFirst uesr key comparison: \n" << hist_seek_to_first.ToString() - << "ikey skipped: " << perf_context.internal_key_skipped_count << "\n" - << "idelete skipped: " << perf_context.internal_delete_skipped_count << "\n" - << "elapsed: " << elapsed_nanos << "\n"; + std::cout << "SeekToFirst uesr key comparison: \n" + << hist_seek_to_first.ToString() + << "ikey skipped: " << perf_context.internal_key_skipped_count + << "\n" + << "idelete skipped: " + << perf_context.internal_delete_skipped_count << "\n" + << "elapsed: " << elapsed_nanos << "\n"; + } HistogramImpl hist_seek; for (int i = 0; i < 
FLAGS_total_keys; ++i) { std::unique_ptr iter(db->NewIterator(read_options)); - std::string key = "k" + std::to_string(i); + std::string key = "k" + ToString(i); perf_context.Reset(); StopWatchNano timer(Env::Default(), true); @@ -149,11 +165,12 @@ TEST(PerfContextTest, StopWatchNanoOverhead) { TEST(PerfContextTest, StopWatchOverhead) { // profile the timer cost by itself! const int kTotalIterations = 1000000; + uint64_t elapsed = 0; std::vector timings(kTotalIterations); - StopWatch timer(Env::Default()); + StopWatch timer(Env::Default(), nullptr, 0, &elapsed); for (auto& timing : timings) { - timing = timer.ElapsedMicros(); + timing = elapsed; } HistogramImpl histogram; @@ -166,7 +183,7 @@ TEST(PerfContextTest, StopWatchOverhead) { std::cout << histogram.ToString(); } -void ProfileKeyComparison() { +void ProfileQueries(bool enabled_time = false) { DestroyDB(kDbName, Options()); // Start this test with a fresh DB auto db = OpenDb(); @@ -175,74 +192,248 @@ void ProfileKeyComparison() { ReadOptions read_options; HistogramImpl hist_put; + HistogramImpl hist_get; HistogramImpl hist_get_snapshot; HistogramImpl hist_get_memtable; + HistogramImpl hist_get_files; HistogramImpl hist_get_post_process; HistogramImpl hist_num_memtable_checked; + + HistogramImpl hist_mget; + HistogramImpl hist_mget_snapshot; + HistogramImpl hist_mget_memtable; + HistogramImpl hist_mget_files; + HistogramImpl hist_mget_post_process; + HistogramImpl hist_mget_num_memtable_checked; + HistogramImpl hist_write_pre_post; HistogramImpl hist_write_wal_time; HistogramImpl hist_write_memtable_time; + uint64_t total_db_mutex_nanos = 0; + std::cout << "Inserting " << FLAGS_total_keys << " key/value pairs\n...\n"; std::vector keys; + const int kFlushFlag = -1; for (int i = 0; i < FLAGS_total_keys; ++i) { keys.push_back(i); + if (i == FLAGS_total_keys / 2) { + // Issuing a flush in the middle. 
+ keys.push_back(kFlushFlag); + } } if (FLAGS_random_key) { std::random_shuffle(keys.begin(), keys.end()); } - +#ifndef NDEBUG + ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 1U); +#endif + int num_mutex_waited = 0; for (const int i : keys) { - std::string key = "k" + std::to_string(i); - std::string value = "v" + std::to_string(i); + if (i == kFlushFlag) { + FlushOptions fo; + db->Flush(fo); + continue; + } + + std::string key = "k" + ToString(i); + std::string value = "v" + ToString(i); + + std::vector values; perf_context.Reset(); db->Put(write_options, key, value); + if (++num_mutex_waited > 3) { +#ifndef NDEBUG + ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U); +#endif + } hist_write_pre_post.Add(perf_context.write_pre_and_post_process_time); hist_write_wal_time.Add(perf_context.write_wal_time); hist_write_memtable_time.Add(perf_context.write_memtable_time); hist_put.Add(perf_context.user_key_comparison_count); + total_db_mutex_nanos += perf_context.db_mutex_lock_nanos; + } +#ifndef NDEBUG + ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0U); +#endif + + for (const int i : keys) { + std::string key = "k" + ToString(i); + std::string value = "v" + ToString(i); + + std::vector multiget_keys = {Slice(key)}; + std::vector values; perf_context.Reset(); db->Get(read_options, key, &value); hist_get_snapshot.Add(perf_context.get_snapshot_time); hist_get_memtable.Add(perf_context.get_from_memtable_time); + hist_get_files.Add(perf_context.get_from_output_files_time); hist_num_memtable_checked.Add(perf_context.get_from_memtable_count); hist_get_post_process.Add(perf_context.get_post_process_time); hist_get.Add(perf_context.user_key_comparison_count); + + perf_context.Reset(); + db->MultiGet(read_options, multiget_keys, &values); + hist_mget_snapshot.Add(perf_context.get_snapshot_time); + hist_mget_memtable.Add(perf_context.get_from_memtable_time); + 
hist_mget_files.Add(perf_context.get_from_output_files_time); + hist_mget_num_memtable_checked.Add(perf_context.get_from_memtable_count); + hist_mget_post_process.Add(perf_context.get_post_process_time); + hist_mget.Add(perf_context.user_key_comparison_count); } std::cout << "Put uesr key comparison: \n" << hist_put.ToString() - << "Get uesr key comparison: \n" << hist_get.ToString(); + << "Get uesr key comparison: \n" << hist_get.ToString() + << "MultiGet uesr key comparison: \n" << hist_get.ToString(); std::cout << "Put(): Pre and Post Process Time: \n" << hist_write_pre_post.ToString() << " Writing WAL time: \n" << hist_write_wal_time.ToString() << "\n" << " Writing Mem Table time: \n" - << hist_write_memtable_time.ToString() << "\n"; + << hist_write_memtable_time.ToString() << "\n" + << " Total DB mutex nanos: \n" << total_db_mutex_nanos << "\n"; - std::cout << "Get(): Time to get snapshot: \n" + std::cout << "Get(): Time to get snapshot: \n" << hist_get_snapshot.ToString() + << " Time to get value from memtables: \n" + << hist_get_memtable.ToString() << "\n" + << " Time to get value from output files: \n" + << hist_get_files.ToString() << "\n" + << " Number of memtables checked: \n" + << hist_num_memtable_checked.ToString() << "\n" + << " Time to post process: \n" << hist_get_post_process.ToString() + << "\n"; + + std::cout << "MultiGet(): Time to get snapshot: \n" + << hist_mget_snapshot.ToString() + << " Time to get value from memtables: \n" + << hist_mget_memtable.ToString() << "\n" + << " Time to get value from output files: \n" + << hist_mget_files.ToString() << "\n" + << " Number of memtables checked: \n" + << hist_mget_num_memtable_checked.ToString() << "\n" + << " Time to post process: \n" << hist_mget_post_process.ToString() + << "\n"; + + if (enabled_time) { + ASSERT_GT(hist_get.Average(), 0); + ASSERT_GT(hist_get_snapshot.Average(), 0); + ASSERT_GT(hist_get_memtable.Average(), 0); + ASSERT_GT(hist_get_files.Average(), 0); + 
ASSERT_GT(hist_get_post_process.Average(), 0); + ASSERT_GT(hist_num_memtable_checked.Average(), 0); + + ASSERT_GT(hist_mget.Average(), 0); + ASSERT_GT(hist_mget_snapshot.Average(), 0); + ASSERT_GT(hist_mget_memtable.Average(), 0); + ASSERT_GT(hist_mget_files.Average(), 0); + ASSERT_GT(hist_mget_post_process.Average(), 0); + ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0); +#ifndef NDEBUG + ASSERT_GT(total_db_mutex_nanos, 2000U); +#endif + } + + db.reset(); + db = OpenDb(true); + + hist_get.Clear(); + hist_get_snapshot.Clear(); + hist_get_memtable.Clear(); + hist_get_files.Clear(); + hist_get_post_process.Clear(); + hist_num_memtable_checked.Clear(); + + hist_mget.Clear(); + hist_mget_snapshot.Clear(); + hist_mget_memtable.Clear(); + hist_mget_files.Clear(); + hist_mget_post_process.Clear(); + hist_mget_num_memtable_checked.Clear(); + + for (const int i : keys) { + std::string key = "k" + ToString(i); + std::string value = "v" + ToString(i); + + std::vector multiget_keys = {Slice(key)}; + std::vector values; + + perf_context.Reset(); + db->Get(read_options, key, &value); + hist_get_snapshot.Add(perf_context.get_snapshot_time); + hist_get_memtable.Add(perf_context.get_from_memtable_time); + hist_get_files.Add(perf_context.get_from_output_files_time); + hist_num_memtable_checked.Add(perf_context.get_from_memtable_count); + hist_get_post_process.Add(perf_context.get_post_process_time); + hist_get.Add(perf_context.user_key_comparison_count); + + perf_context.Reset(); + db->MultiGet(read_options, multiget_keys, &values); + hist_mget_snapshot.Add(perf_context.get_snapshot_time); + hist_mget_memtable.Add(perf_context.get_from_memtable_time); + hist_mget_files.Add(perf_context.get_from_output_files_time); + hist_mget_num_memtable_checked.Add(perf_context.get_from_memtable_count); + hist_mget_post_process.Add(perf_context.get_post_process_time); + hist_mget.Add(perf_context.user_key_comparison_count); + } + + std::cout << "ReadOnly Get uesr key comparison: \n" << 
hist_get.ToString() + << "ReadOnly MultiGet uesr key comparison: \n" + << hist_mget.ToString(); + + std::cout << "ReadOnly Get(): Time to get snapshot: \n" << hist_get_snapshot.ToString() << " Time to get value from memtables: \n" << hist_get_memtable.ToString() << "\n" + << " Time to get value from output files: \n" + << hist_get_files.ToString() << "\n" << " Number of memtables checked: \n" << hist_num_memtable_checked.ToString() << "\n" - << " Time to post process: \n" - << hist_get_post_process.ToString() << "\n"; + << " Time to post process: \n" << hist_get_post_process.ToString() + << "\n"; + + std::cout << "ReadOnly MultiGet(): Time to get snapshot: \n" + << hist_mget_snapshot.ToString() + << " Time to get value from memtables: \n" + << hist_mget_memtable.ToString() << "\n" + << " Time to get value from output files: \n" + << hist_mget_files.ToString() << "\n" + << " Number of memtables checked: \n" + << hist_mget_num_memtable_checked.ToString() << "\n" + << " Time to post process: \n" << hist_mget_post_process.ToString() + << "\n"; + + if (enabled_time) { + ASSERT_GT(hist_get.Average(), 0); + ASSERT_GT(hist_get_memtable.Average(), 0); + ASSERT_GT(hist_get_files.Average(), 0); + ASSERT_GT(hist_num_memtable_checked.Average(), 0); + // In read-only mode Get(), no super version operation is needed + ASSERT_EQ(hist_get_post_process.Average(), 0); + ASSERT_EQ(hist_get_snapshot.Average(), 0); + + ASSERT_GT(hist_mget.Average(), 0); + ASSERT_GT(hist_mget_snapshot.Average(), 0); + ASSERT_GT(hist_mget_memtable.Average(), 0); + ASSERT_GT(hist_mget_files.Average(), 0); + ASSERT_GT(hist_mget_post_process.Average(), 0); + ASSERT_GT(hist_mget_num_memtable_checked.Average(), 0); + } } TEST(PerfContextTest, KeyComparisonCount) { SetPerfLevel(kEnableCount); - ProfileKeyComparison(); + ProfileQueries(); SetPerfLevel(kDisable); - ProfileKeyComparison(); + ProfileQueries(); SetPerfLevel(kEnableTime); - ProfileKeyComparison(); + ProfileQueries(true); } // make perf_context_test 
@@ -281,8 +472,8 @@ TEST(PerfContextTest, SeekKeyComparison) { SetPerfLevel(kEnableTime); StopWatchNano timer(Env::Default()); for (const int i : keys) { - std::string key = "k" + std::to_string(i); - std::string value = "v" + std::to_string(i); + std::string key = "k" + ToString(i); + std::string value = "v" + ToString(i); perf_context.Reset(); timer.Start(); @@ -301,8 +492,8 @@ TEST(PerfContextTest, SeekKeyComparison) { HistogramImpl hist_next; for (int i = 0; i < FLAGS_total_keys; ++i) { - std::string key = "k" + std::to_string(i); - std::string value = "v" + std::to_string(i); + std::string key = "k" + ToString(i); + std::string value = "v" + ToString(i); std::unique_ptr iter(db->NewIterator(read_options)); perf_context.Reset(); diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index bb0f96f15..906ff8c8f 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -158,7 +158,7 @@ class PlainTableDBTest { // Return spread of files per level std::string FilesPerLevel() { std::string result; - int last_non_zero_offset = 0; + size_t last_non_zero_offset = 0; for (int level = 0; level < db_->NumberLevels(); level++) { int f = NumTableFilesAtLevel(level); char buf[100]; @@ -192,16 +192,17 @@ extern const uint64_t kPlainTableMagicNumber; class TestPlainTableReader : public PlainTableReader { public: - TestPlainTableReader(const EnvOptions& storage_options, + TestPlainTableReader(const EnvOptions& env_options, const InternalKeyComparator& icomparator, EncodingType encoding_type, uint64_t file_size, int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness, const TableProperties* table_properties, unique_ptr&& file, - const Options& options, bool* expect_bloom_not_match, + const ImmutableCFOptions& ioptions, + bool* expect_bloom_not_match, bool store_index_in_file) - : PlainTableReader(options, std::move(file), storage_options, icomparator, + : PlainTableReader(ioptions, std::move(file), env_options, icomparator, 
encoding_type, file_size, table_properties), expect_bloom_not_match_(expect_bloom_not_match) { Status s = MmapDataFile(); @@ -218,7 +219,7 @@ class TestPlainTableReader : public PlainTableReader { PlainTablePropertyNames::kBloomVersion); ASSERT_TRUE(bloom_version_ptr != props->user_collected_properties.end()); ASSERT_EQ(bloom_version_ptr->second, std::string("1")); - if (options.bloom_locality > 0) { + if (ioptions.bloom_locality > 0) { auto num_blocks_ptr = props->user_collected_properties.find( PlainTablePropertyNames::kNumBloomBlocks); ASSERT_TRUE(num_blocks_ptr != props->user_collected_properties.end()); @@ -253,25 +254,26 @@ class TestPlainTableFactory : public PlainTableFactory { store_index_in_file_(options.store_index_in_file), expect_bloom_not_match_(expect_bloom_not_match) {} - Status NewTableReader(const Options& options, const EnvOptions& soptions, + Status NewTableReader(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table) const override { TableProperties* props = nullptr; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - options.env, options.info_log.get(), &props); + ioptions.env, ioptions.info_log, &props); ASSERT_TRUE(s.ok()); if (store_index_in_file_) { BlockHandle bloom_block_handle; s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber, - options.env, BloomBlockBuilder::kBloomBlock, + ioptions.env, BloomBlockBuilder::kBloomBlock, &bloom_block_handle); ASSERT_TRUE(s.ok()); BlockHandle index_block_handle; s = FindMetaBlock( - file.get(), file_size, kPlainTableMagicNumber, options.env, + file.get(), file_size, kPlainTableMagicNumber, ioptions.env, PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_handle); ASSERT_TRUE(s.ok()); } @@ -284,9 +286,9 @@ class TestPlainTableFactory : public PlainTableFactory { DecodeFixed32(encoding_type_prop->second.c_str())); std::unique_ptr 
new_reader(new TestPlainTableReader( - soptions, internal_comparator, encoding_type, file_size, + env_options, internal_comparator, encoding_type, file_size, bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props, - std::move(file), options, expect_bloom_not_match_, + std::move(file), ioptions, expect_bloom_not_match_, store_index_in_file_)); *table = std::move(new_reader); @@ -626,7 +628,7 @@ TEST(PlainTableDBTest, IteratorLargeKeys) { }; for (size_t i = 0; i < 7; i++) { - ASSERT_OK(Put(key_list[i], std::to_string(i))); + ASSERT_OK(Put(key_list[i], ToString(i))); } dbfull()->TEST_FlushMemTable(); @@ -637,7 +639,7 @@ TEST(PlainTableDBTest, IteratorLargeKeys) { for (size_t i = 0; i < 7; i++) { ASSERT_TRUE(iter->Valid()); ASSERT_EQ(key_list[i], iter->key().ToString()); - ASSERT_EQ(std::to_string(i), iter->value().ToString()); + ASSERT_EQ(ToString(i), iter->value().ToString()); iter->Next(); } @@ -674,7 +676,7 @@ TEST(PlainTableDBTest, IteratorLargeKeysWithPrefix) { MakeLongKeyWithPrefix(26, '6')}; for (size_t i = 0; i < 7; i++) { - ASSERT_OK(Put(key_list[i], std::to_string(i))); + ASSERT_OK(Put(key_list[i], ToString(i))); } dbfull()->TEST_FlushMemTable(); @@ -685,7 +687,7 @@ TEST(PlainTableDBTest, IteratorLargeKeysWithPrefix) { for (size_t i = 0; i < 7; i++) { ASSERT_TRUE(iter->Valid()); ASSERT_EQ(key_list[i], iter->key().ToString()); - ASSERT_EQ(std::to_string(i), iter->value().ToString()); + ASSERT_EQ(ToString(i), iter->value().ToString()); iter->Next(); } @@ -694,40 +696,12 @@ TEST(PlainTableDBTest, IteratorLargeKeysWithPrefix) { delete iter; } -// A test comparator which compare two strings in this way: -// (1) first compare prefix of 8 bytes in alphabet order, -// (2) if two strings share the same prefix, sort the other part of the string -// in the reverse alphabet order. 
-class SimpleSuffixReverseComparator : public Comparator { - public: - SimpleSuffixReverseComparator() {} - - virtual const char* Name() const { return "SimpleSuffixReverseComparator"; } - - virtual int Compare(const Slice& a, const Slice& b) const { - Slice prefix_a = Slice(a.data(), 8); - Slice prefix_b = Slice(b.data(), 8); - int prefix_comp = prefix_a.compare(prefix_b); - if (prefix_comp != 0) { - return prefix_comp; - } else { - Slice suffix_a = Slice(a.data() + 8, a.size() - 8); - Slice suffix_b = Slice(b.data() + 8, b.size() - 8); - return -(suffix_a.compare(suffix_b)); - } - } - virtual void FindShortestSeparator(std::string* start, - const Slice& limit) const {} - - virtual void FindShortSuccessor(std::string* key) const {} -}; - TEST(PlainTableDBTest, IteratorReverseSuffixComparator) { Options options = CurrentOptions(); options.create_if_missing = true; // Set only one bucket to force bucket conflict. // Test index interval for the same prefix to be 1, 2 and 4 - SimpleSuffixReverseComparator comp; + test::SimpleSuffixReverseComparator comp; options.comparator = ∁ DestroyAndReopen(&options); @@ -890,7 +864,7 @@ TEST(PlainTableDBTest, HashBucketConflictReverseSuffixComparator) { for (unsigned char i = 1; i <= 3; i++) { Options options = CurrentOptions(); options.create_if_missing = true; - SimpleSuffixReverseComparator comp; + test::SimpleSuffixReverseComparator comp; options.comparator = ∁ // Set only one bucket to force bucket conflict. 
// Test index interval for the same prefix to be 1, 2 and 4 diff --git a/db/prefix_test.cc b/db/prefix_test.cc index a69dda2b4..fa2c128c4 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -29,14 +29,14 @@ using GFLAGS::ParseCommandLineFlags; DEFINE_bool(trigger_deadlock, false, "issue delete in range scan to trigger PrefixHashMap deadlock"); -DEFINE_uint64(bucket_count, 100000, "number of buckets"); +DEFINE_int32(bucket_count, 100000, "number of buckets"); DEFINE_uint64(num_locks, 10001, "number of locks"); DEFINE_bool(random_prefix, false, "randomize prefix"); DEFINE_uint64(total_prefixes, 100000, "total number of prefixes"); DEFINE_uint64(items_per_prefix, 1, "total number of values per prefix"); DEFINE_int64(write_buffer_size, 33554432, ""); -DEFINE_int64(max_write_buffer_number, 2, ""); -DEFINE_int64(min_write_buffer_number_to_merge, 1, ""); +DEFINE_int32(max_write_buffer_number, 2, ""); +DEFINE_int32(min_write_buffer_number_to_merge, 1, ""); DEFINE_int32(skiplist_height, 4, ""); DEFINE_int32(memtable_prefix_bloom_bits, 10000000, ""); DEFINE_int32(memtable_prefix_bloom_probes, 10, ""); @@ -52,7 +52,8 @@ struct TestKey { uint64_t prefix; uint64_t sorted; - TestKey(uint64_t prefix, uint64_t sorted) : prefix(prefix), sorted(sorted) {} + TestKey(uint64_t _prefix, uint64_t _sorted) + : prefix(_prefix), sorted(_sorted) {} }; // return a slice backed by test_key @@ -441,7 +442,7 @@ TEST(PrefixTest, DynamicPrefixIterator) { for (auto prefix : prefixes) { TestKey test_key(prefix, FLAGS_items_per_prefix / 2); Slice key = TestKeyToSlice(test_key); - std::string value = "v" + std::to_string(0); + std::string value = "v" + ToString(0); perf_context.Reset(); StopWatchNano timer(Env::Default(), true); diff --git a/db/repair.cc b/db/repair.cc index 820cc1924..3b5952dd0 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -31,7 +31,10 @@ #ifndef ROCKSDB_LITE +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include "db/builder.h" #include 
"db/db_impl.h" @@ -42,10 +45,14 @@ #include "db/memtable.h" #include "db/table_cache.h" #include "db/version_edit.h" +#include "db/writebuffer.h" #include "db/write_batch_internal.h" #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/immutable_options.h" +#include "util/scoped_arena_iterator.h" namespace rocksdb { @@ -58,6 +65,7 @@ class Repairer { env_(options.env), icmp_(options.comparator), options_(SanitizeOptions(dbname, &icmp_, options)), + ioptions_(options_), raw_table_cache_( // TableCache can be small since we expect each table to be opened // once. @@ -65,7 +73,7 @@ class Repairer { options_.table_cache_remove_scan_count_limit)), next_file_number_(1) { table_cache_ = - new TableCache(&options_, storage_options_, raw_table_cache_.get()); + new TableCache(ioptions_, env_options_, raw_table_cache_.get()); edit_ = new VersionEdit(); } @@ -87,7 +95,7 @@ class Repairer { for (size_t i = 0; i < tables_.size(); i++) { bytes += tables_[i].meta.fd.GetFileSize(); } - Log(options_.info_log, + Log(InfoLogLevel::WARN_LEVEL, options_.info_log, "**** Repaired rocksdb %s; " "recovered %zu files; %" PRIu64 "bytes. 
" @@ -107,8 +115,9 @@ class Repairer { std::string const dbname_; Env* const env_; - InternalKeyComparator const icmp_; - Options const options_; + const InternalKeyComparator icmp_; + const Options options_; + const ImmutableCFOptions ioptions_; std::shared_ptr raw_table_cache_; TableCache* table_cache_; VersionEdit* edit_; @@ -118,7 +127,7 @@ class Repairer { std::vector logs_; std::vector tables_; uint64_t next_file_number_; - const EnvOptions storage_options_; + const EnvOptions env_options_; Status FindFiles() { std::vector filenames; @@ -167,7 +176,7 @@ class Repairer { std::string logname = LogFileName(dbname_, logs_[i]); Status status = ConvertLogToTable(logs_[i]); if (!status.ok()) { - Log(options_.info_log, + Log(InfoLogLevel::WARN_LEVEL, options_.info_log, "Log #%" PRIu64 ": ignoring conversion error: %s", logs_[i], status.ToString().c_str()); } @@ -182,7 +191,8 @@ class Repairer { uint64_t lognum; virtual void Corruption(size_t bytes, const Status& s) { // We print error messages for corruption, but continue repairing. 
- Log(info_log, "Log #%" PRIu64 ": dropping %d bytes; %s", lognum, + Log(InfoLogLevel::ERROR_LEVEL, info_log, + "Log #%" PRIu64 ": dropping %d bytes; %s", lognum, static_cast(bytes), s.ToString().c_str()); } }; @@ -190,7 +200,7 @@ class Repairer { // Open the log file std::string logname = LogFileName(dbname_, log); unique_ptr lfile; - Status status = env_->NewSequentialFile(logname, &lfile, storage_options_); + Status status = env_->NewSequentialFile(logname, &lfile, env_options_); if (!status.ok()) { return status; } @@ -211,8 +221,10 @@ class Repairer { std::string scratch; Slice record; WriteBatch batch; - MemTable* mem = new MemTable(icmp_, options_); - auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_); + WriteBuffer wb(options_.db_write_buffer_size); + MemTable* mem = new MemTable(icmp_, ioptions_, + MutableCFOptions(options_, ioptions_), &wb); + auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem); mem->Ref(); int counter = 0; while (reader.ReadRecord(&record, &scratch)) { @@ -226,7 +238,8 @@ class Repairer { if (status.ok()) { counter += WriteBatchInternal::Count(&batch); } else { - Log(options_.info_log, "Log #%" PRIu64 ": ignoring %s", log, + Log(InfoLogLevel::WARN_LEVEL, + options_.info_log, "Log #%" PRIu64 ": ignoring %s", log, status.ToString().c_str()); status = Status::OK(); // Keep going with rest of file } @@ -236,12 +249,15 @@ class Repairer { // since ExtractMetaData() will also generate edits. 
FileMetaData meta; meta.fd = FileDescriptor(next_file_number_++, 0, 0); - ReadOptions ro; - ro.total_order_seek = true; - Iterator* iter = mem->NewIterator(ro); - status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_, - iter, &meta, icmp_, 0, 0, kNoCompression); - delete iter; + { + ReadOptions ro; + ro.total_order_seek = true; + Arena arena; + ScopedArenaIterator iter(mem->NewIterator(ro, &arena)); + status = BuildTable(dbname_, env_, ioptions_, env_options_, table_cache_, + iter.get(), &meta, icmp_, 0, 0, kNoCompression, + CompressionOptions()); + } delete mem->Unref(); delete cf_mems_default; mem = nullptr; @@ -250,9 +266,9 @@ class Repairer { table_fds_.push_back(meta.fd); } } - Log(options_.info_log, - "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s", log, counter, - meta.fd.GetNumber(), status.ToString().c_str()); + Log(InfoLogLevel::INFO_LEVEL, options_.info_log, + "Log #%" PRIu64 ": %d ops saved to Table #%" PRIu64 " %s", + log, counter, meta.fd.GetNumber(), status.ToString().c_str()); return status; } @@ -267,7 +283,8 @@ class Repairer { char file_num_buf[kFormatFileNumberBufSize]; FormatFileNumber(t.meta.fd.GetNumber(), t.meta.fd.GetPathId(), file_num_buf, sizeof(file_num_buf)); - Log(options_.info_log, "Table #%s: ignoring %s", file_num_buf, + Log(InfoLogLevel::WARN_LEVEL, options_.info_log, + "Table #%s: ignoring %s", file_num_buf, status.ToString().c_str()); ArchiveFile(fname); } else { @@ -286,7 +303,7 @@ class Repairer { file_size); if (status.ok()) { Iterator* iter = table_cache_->NewIterator( - ReadOptions(), storage_options_, icmp_, t->meta.fd); + ReadOptions(), env_options_, icmp_, t->meta.fd); bool empty = true; ParsedInternalKey parsed; t->min_sequence = 0; @@ -294,7 +311,8 @@ class Repairer { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { Slice key = iter->key(); if (!ParseInternalKey(key, &parsed)) { - Log(options_.info_log, "Table #%" PRIu64 ": unparsable key %s", + Log(InfoLogLevel::ERROR_LEVEL, + 
options_.info_log, "Table #%" PRIu64 ": unparsable key %s", t->meta.fd.GetNumber(), EscapeString(key).c_str()); continue; } @@ -317,7 +335,8 @@ class Repairer { } delete iter; } - Log(options_.info_log, "Table #%" PRIu64 ": %d entries %s", + Log(InfoLogLevel::INFO_LEVEL, + options_.info_log, "Table #%" PRIu64 ": %d entries %s", t->meta.fd.GetNumber(), counter, status.ToString().c_str()); return status; } @@ -326,7 +345,7 @@ class Repairer { std::string tmp = TempFileName(dbname_, 1); unique_ptr file; Status status = env_->NewWritableFile( - tmp, &file, env_->OptimizeForManifestWrite(storage_options_)); + tmp, &file, env_->OptimizeForManifestWrite(env_options_)); if (!status.ok()) { return status; } @@ -394,7 +413,8 @@ class Repairer { new_file.append("/"); new_file.append((slash == nullptr) ? fname.c_str() : slash + 1); Status s = env_->RenameFile(fname, new_file); - Log(options_.info_log, "Archiving %s: %s\n", + Log(InfoLogLevel::INFO_LEVEL, + options_.info_log, "Archiving %s: %s\n", fname.c_str(), s.ToString().c_str()); } }; diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc deleted file mode 100644 index 3a5809774..000000000 --- a/db/simple_table_db_test.cc +++ /dev/null @@ -1,810 +0,0 @@ -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include -#include - -#include "rocksdb/db.h" -#include "rocksdb/filter_policy.h" -#include "db/db_impl.h" -#include "db/filename.h" -#include "db/version_set.h" -#include "db/write_batch_internal.h" -#include "rocksdb/statistics.h" -#include "rocksdb/cache.h" -#include "rocksdb/compaction_filter.h" -#include "rocksdb/env.h" -#include "rocksdb/table.h" -#include "rocksdb/table_properties.h" -#include "table/table_builder.h" -#include "util/hash.h" -#include "util/logging.h" -#include "util/mutexlock.h" -#include "util/testharness.h" -#include "util/testutil.h" -#include "utilities/merge_operators.h" - -using std::unique_ptr; - -// IS THIS FILE STILL NEEDED? -namespace rocksdb { - -// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built -// as production quality. -// SimpleTable requires the input key size to be fixed 16 bytes, value cannot -// be longer than 150000 bytes and stored data on disk in this format: -// +--------------------------------------------+ <= key1 offset -// | key1 | value_size (4 bytes) | | -// +----------------------------------------+ | -// | value1 | -// | | -// +----------------------------------------+---+ <= key2 offset -// | key2 | value_size (4 bytes) | | -// +----------------------------------------+ | -// | value2 | -// | | -// | ...... | -// +-----------------+--------------------------+ <= index_block_offset -// | key1 | key1 offset (8 bytes) | -// +-----------------+--------------------------+ -// | key2 | key2 offset (8 bytes) | -// +-----------------+--------------------------+ -// | key3 | key3 offset (8 bytes) | -// +-----------------+--------------------------+ -// | ...... | -// +-----------------+------------+-------------+ -// | index_block_offset (8 bytes) | -// +------------------------------+ - -// SimpleTable is a simple table format for UNIT TEST ONLY. It is not built -// as production quality. 
-class SimpleTableReader: public TableReader { -public: - // Attempt to open the table that is stored in bytes [0..file_size) - // of "file", and read the metadata entries necessary to allow - // retrieving data from the table. - // - // If successful, returns ok and sets "*table" to the newly opened - // table. The client should delete "*table" when no longer needed. - // If there was an error while initializing the table, sets "*table" - // to nullptr and returns a non-ok status. Does not take ownership of - // "*source", but the client must ensure that "source" remains live - // for the duration of the returned table's lifetime. - // - // *file must remain live while this Table is in use. - static Status Open(const Options& options, const EnvOptions& soptions, - unique_ptr && file, uint64_t file_size, - unique_ptr* table_reader); - - Iterator* NewIterator(const ReadOptions&, Arena* arena) override; - - Status Get(const ReadOptions&, const Slice& key, void* arg, - bool (*handle_result)(void* arg, const ParsedInternalKey& k, - const Slice& v), - void (*mark_key_may_exist)(void*) = nullptr) override; - - uint64_t ApproximateOffsetOf(const Slice& key) override; - - virtual size_t ApproximateMemoryUsage() const override { return 0; } - - void SetupForCompaction() override; - - std::shared_ptr GetTableProperties() const override; - - ~SimpleTableReader(); - -private: - struct Rep; - Rep* rep_; - - explicit SimpleTableReader(Rep* rep) { - rep_ = rep; - } - friend class TableCache; - friend class SimpleTableIterator; - - Status GetOffset(const Slice& target, uint64_t* offset); - - // No copying allowed - explicit SimpleTableReader(const TableReader&) = delete; - void operator=(const TableReader&) = delete; -}; - -// Iterator to iterate SimpleTable -class SimpleTableIterator: public Iterator { -public: - explicit SimpleTableIterator(SimpleTableReader* table); - ~SimpleTableIterator(); - - bool Valid() const; - - void SeekToFirst(); - - void SeekToLast(); - - void 
Seek(const Slice& target); - - void Next(); - - void Prev(); - - Slice key() const; - - Slice value() const; - - Status status() const; - -private: - SimpleTableReader* table_; - uint64_t offset_; - uint64_t next_offset_; - Slice key_; - Slice value_; - char tmp_str_[4]; - char* key_str_; - char* value_str_; - int value_str_len_; - Status status_; - // No copying allowed - SimpleTableIterator(const SimpleTableIterator&) = delete; - void operator=(const Iterator&) = delete; -}; - -struct SimpleTableReader::Rep { - ~Rep() { - } - Rep(const EnvOptions& storage_options, uint64_t index_start_offset, - int num_entries) : - soptions(storage_options), index_start_offset(index_start_offset), - num_entries(num_entries) { - } - - Options options; - const EnvOptions& soptions; - Status status; - unique_ptr file; - uint64_t index_start_offset; - int num_entries; - std::shared_ptr table_properties; - - const static int user_key_size = 16; - const static int offset_length = 8; - const static int key_footer_len = 8; - - static int GetInternalKeyLength() { - return user_key_size + key_footer_len; - } -}; - -SimpleTableReader::~SimpleTableReader() { - delete rep_; -} - -Status SimpleTableReader::Open(const Options& options, - const EnvOptions& soptions, - unique_ptr && file, - uint64_t size, - unique_ptr* table_reader) { - char footer_space[Rep::offset_length]; - Slice footer_input; - Status s = file->Read(size - Rep::offset_length, Rep::offset_length, - &footer_input, footer_space); - if (s.ok()) { - uint64_t index_start_offset = DecodeFixed64(footer_space); - - int num_entries = (size - Rep::offset_length - index_start_offset) - / (Rep::GetInternalKeyLength() + Rep::offset_length); - SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(soptions, - index_start_offset, - num_entries); - - rep->file = std::move(file); - rep->options = options; - table_reader->reset(new SimpleTableReader(rep)); - } - return s; -} - -void SimpleTableReader::SetupForCompaction() { -} - 
-std::shared_ptr SimpleTableReader::GetTableProperties() - const { - return rep_->table_properties; -} - -Iterator* SimpleTableReader::NewIterator(const ReadOptions& options, - Arena* arena) { - if (arena == nullptr) { - return new SimpleTableIterator(this); - } else { - auto mem = arena->AllocateAligned(sizeof(SimpleTableIterator)); - return new (mem) SimpleTableIterator(this); - } -} - -Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) { - uint32_t left = 0; - uint32_t right = rep_->num_entries - 1; - char key_chars[Rep::GetInternalKeyLength()]; - Slice tmp_slice; - - uint32_t target_offset = 0; - while (left <= right) { - uint32_t mid = (left + right + 1) / 2; - - uint64_t offset_to_read = rep_->index_start_offset - + (Rep::GetInternalKeyLength() + Rep::offset_length) * mid; - Status s = rep_->file->Read(offset_to_read, Rep::GetInternalKeyLength(), - &tmp_slice, key_chars); - if (!s.ok()) { - return s; - } - - InternalKeyComparator ikc(rep_->options.comparator); - int compare_result = ikc.Compare(tmp_slice, target); - - if (compare_result < 0) { - if (left == right) { - target_offset = right + 1; - break; - } - left = mid; - } else { - if (left == right) { - target_offset = left; - break; - } - right = mid - 1; - } - } - - if (target_offset >= (uint32_t) rep_->num_entries) { - *offset = rep_->index_start_offset; - return Status::OK(); - } - - char value_offset_chars[Rep::offset_length]; - - int64_t offset_for_value_offset = rep_->index_start_offset - + (Rep::GetInternalKeyLength() + Rep::offset_length) * target_offset - + Rep::GetInternalKeyLength(); - Status s = rep_->file->Read(offset_for_value_offset, Rep::offset_length, - &tmp_slice, value_offset_chars); - if (s.ok()) { - *offset = DecodeFixed64(value_offset_chars); - } - return s; -} - -Status SimpleTableReader::Get(const ReadOptions& options, const Slice& k, - void* arg, - bool (*saver)(void*, const ParsedInternalKey&, - const Slice&), - void (*mark_key_may_exist)(void*)) { - 
Status s; - SimpleTableIterator* iter = new SimpleTableIterator(this); - for (iter->Seek(k); iter->Valid(); iter->Next()) { - ParsedInternalKey parsed_key; - if (!ParseInternalKey(iter->key(), &parsed_key)) { - return Status::Corruption(Slice()); - } - - if (!(*saver)(arg, parsed_key, iter->value())) { - break; - } - } - s = iter->status(); - delete iter; - return s; -} - -uint64_t SimpleTableReader::ApproximateOffsetOf(const Slice& key) { - return 0; -} - -SimpleTableIterator::SimpleTableIterator(SimpleTableReader* table) : - table_(table) { - key_str_ = new char[SimpleTableReader::Rep::GetInternalKeyLength()]; - value_str_len_ = -1; - SeekToFirst(); -} - -SimpleTableIterator::~SimpleTableIterator() { - delete[] key_str_; - if (value_str_len_ >= 0) { - delete[] value_str_; - } -} - -bool SimpleTableIterator::Valid() const { - return offset_ < table_->rep_->index_start_offset; -} - -void SimpleTableIterator::SeekToFirst() { - next_offset_ = 0; - Next(); -} - -void SimpleTableIterator::SeekToLast() { - assert(false); -} - -void SimpleTableIterator::Seek(const Slice& target) { - Status s = table_->GetOffset(target, &next_offset_); - if (!s.ok()) { - status_ = s; - } - Next(); -} - -void SimpleTableIterator::Next() { - offset_ = next_offset_; - if (offset_ >= table_->rep_->index_start_offset) { - return; - } - Slice result; - int internal_key_size = SimpleTableReader::Rep::GetInternalKeyLength(); - - Status s = table_->rep_->file->Read(next_offset_, internal_key_size, &result, - key_str_); - next_offset_ += internal_key_size; - key_ = result; - - Slice value_size_slice; - s = table_->rep_->file->Read(next_offset_, 4, &value_size_slice, tmp_str_); - next_offset_ += 4; - uint32_t value_size = DecodeFixed32(tmp_str_); - - Slice value_slice; - if ((int) value_size > value_str_len_) { - if (value_str_len_ >= 0) { - delete[] value_str_; - } - value_str_ = new char[value_size]; - value_str_len_ = value_size; - } - s = table_->rep_->file->Read(next_offset_, value_size, 
&value_slice, - value_str_); - next_offset_ += value_size; - value_ = value_slice; -} - -void SimpleTableIterator::Prev() { - assert(false); -} - -Slice SimpleTableIterator::key() const { - Log(table_->rep_->options.info_log, "key!!!!"); - return key_; -} - -Slice SimpleTableIterator::value() const { - return value_; -} - -Status SimpleTableIterator::status() const { - return status_; -} - -class SimpleTableBuilder: public TableBuilder { -public: - // Create a builder that will store the contents of the table it is - // building in *file. Does not close the file. It is up to the - // caller to close the file after calling Finish(). The output file - // will be part of level specified by 'level'. A value of -1 means - // that the caller does not know which level the output file will reside. - SimpleTableBuilder(const Options& options, WritableFile* file, - CompressionType compression_type); - - // REQUIRES: Either Finish() or Abandon() has been called. - ~SimpleTableBuilder(); - - // Add key,value to the table being constructed. - // REQUIRES: key is after any previously added key according to comparator. - // REQUIRES: Finish(), Abandon() have not been called - void Add(const Slice& key, const Slice& value) override; - - // Return non-ok iff some error has been detected. - Status status() const override; - - // Finish building the table. Stops using the file passed to the - // constructor after this function returns. - // REQUIRES: Finish(), Abandon() have not been called - Status Finish() override; - - // Indicate that the contents of this builder should be abandoned. Stops - // using the file passed to the constructor after this function returns. - // If the caller is not going to call Finish(), it must call Abandon() - // before destroying this builder. - // REQUIRES: Finish(), Abandon() have not been called - void Abandon() override; - - // Number of calls to Add() so far. - uint64_t NumEntries() const override; - - // Size of the file generated so far. 
If invoked after a successful - // Finish() call, returns the size of the final generated file. - uint64_t FileSize() const override; - -private: - struct Rep; - Rep* rep_; - - // No copying allowed - SimpleTableBuilder(const SimpleTableBuilder&) = delete; - void operator=(const SimpleTableBuilder&) = delete; -}; - -struct SimpleTableBuilder::Rep { - Options options; - WritableFile* file; - uint64_t offset = 0; - Status status; - - uint64_t num_entries = 0; - - bool closed = false; // Either Finish() or Abandon() has been called. - - const static int user_key_size = 16; - const static int offset_length = 8; - const static int key_footer_len = 8; - - static int GetInternalKeyLength() { - return user_key_size + key_footer_len; - } - - std::string index; - - Rep(const Options& opt, WritableFile* f) : - options(opt), file(f) { - } - ~Rep() { - } -}; - -SimpleTableBuilder::SimpleTableBuilder(const Options& options, - WritableFile* file, - CompressionType compression_type) : - rep_(new SimpleTableBuilder::Rep(options, file)) { -} - -SimpleTableBuilder::~SimpleTableBuilder() { - delete (rep_); -} - -void SimpleTableBuilder::Add(const Slice& key, const Slice& value) { - assert((int ) key.size() == Rep::GetInternalKeyLength()); - - // Update index - rep_->index.append(key.data(), key.size()); - PutFixed64(&(rep_->index), rep_->offset); - - // Write key-value pair - rep_->file->Append(key); - rep_->offset += Rep::GetInternalKeyLength(); - - std::string size; - int value_size = value.size(); - PutFixed32(&size, value_size); - Slice sizeSlice(size); - rep_->file->Append(sizeSlice); - rep_->file->Append(value); - rep_->offset += value_size + 4; - - rep_->num_entries++; -} - -Status SimpleTableBuilder::status() const { - return Status::OK(); -} - -Status SimpleTableBuilder::Finish() { - Rep* r = rep_; - assert(!r->closed); - r->closed = true; - - uint64_t index_offset = rep_->offset; - Slice index_slice(rep_->index); - rep_->file->Append(index_slice); - rep_->offset += 
index_slice.size(); - - std::string index_offset_str; - PutFixed64(&index_offset_str, index_offset); - Slice foot_slice(index_offset_str); - rep_->file->Append(foot_slice); - rep_->offset += foot_slice.size(); - - return Status::OK(); -} - -void SimpleTableBuilder::Abandon() { - rep_->closed = true; -} - -uint64_t SimpleTableBuilder::NumEntries() const { - return rep_->num_entries; -} - -uint64_t SimpleTableBuilder::FileSize() const { - return rep_->offset; -} - -class SimpleTableFactory: public TableFactory { -public: - ~SimpleTableFactory() { - } - SimpleTableFactory() { - } - const char* Name() const override { - return "SimpleTable"; - } - Status NewTableReader(const Options& options, const EnvOptions& soptions, - const InternalKeyComparator& internal_key, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader) const; - - TableBuilder* NewTableBuilder(const Options& options, - const InternalKeyComparator& internal_key, - WritableFile* file, - CompressionType compression_type) const; - - virtual Status SanitizeDBOptions(DBOptions* db_opts) const override { - return Status::OK(); - } - - virtual std::string GetPrintableTableOptions() const override { - return std::string(); - } -}; - -Status SimpleTableFactory::NewTableReader( - const Options& options, const EnvOptions& soptions, - const InternalKeyComparator& internal_key, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader) const { - - return SimpleTableReader::Open(options, soptions, std::move(file), file_size, - table_reader); -} - -TableBuilder* SimpleTableFactory::NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_key, - WritableFile* file, CompressionType compression_type) const { - return new SimpleTableBuilder(options, file, compression_type); -} - -class SimpleTableDBTest { -protected: -public: - std::string dbname_; - Env* env_; - DB* db_; - - Options last_options_; - - SimpleTableDBTest() : - env_(Env::Default()) { - dbname_ = test::TmpDir() + 
"/simple_table_db_test"; - ASSERT_OK(DestroyDB(dbname_, Options())); - db_ = nullptr; - Reopen(); - } - - ~SimpleTableDBTest() { - delete db_; - ASSERT_OK(DestroyDB(dbname_, Options())); - } - - // Return the current option configuration. - Options CurrentOptions() { - Options options; - options.table_factory.reset(new SimpleTableFactory()); - return options; - } - - DBImpl* dbfull() { - return reinterpret_cast(db_); - } - - void Reopen(Options* options = nullptr) { - ASSERT_OK(TryReopen(options)); - } - - void Close() { - delete db_; - db_ = nullptr; - } - - void DestroyAndReopen(Options* options = nullptr) { - //Destroy using last options - Destroy(&last_options_); - ASSERT_OK(TryReopen(options)); - } - - void Destroy(Options* options) { - delete db_; - db_ = nullptr; - ASSERT_OK(DestroyDB(dbname_, *options)); - } - - Status PureReopen(Options* options, DB** db) { - return DB::Open(*options, dbname_, db); - } - - Status TryReopen(Options* options = nullptr) { - delete db_; - db_ = nullptr; - Options opts; - if (options != nullptr) { - opts = *options; - } else { - opts = CurrentOptions(); - opts.create_if_missing = true; - } - last_options_ = opts; - - return DB::Open(opts, dbname_, &db_); - } - - Status Put(const Slice& k, const Slice& v) { - return db_->Put(WriteOptions(), k, v); - } - - Status Delete(const std::string& k) { - return db_->Delete(WriteOptions(), k); - } - - std::string Get(const std::string& k, const Snapshot* snapshot = nullptr) { - ReadOptions options; - options.snapshot = snapshot; - std::string result; - Status s = db_->Get(options, k, &result); - if (s.IsNotFound()) { - result = "NOT_FOUND"; - } else if (!s.ok()) { - result = s.ToString(); - } - return result; - } - - - int NumTableFilesAtLevel(int level) { - std::string property; - ASSERT_TRUE( - db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(level), - &property)); - return atoi(property.c_str()); - } - - // Return spread of files per level - std::string FilesPerLevel() { - 
std::string result; - int last_non_zero_offset = 0; - for (int level = 0; level < db_->NumberLevels(); level++) { - int f = NumTableFilesAtLevel(level); - char buf[100]; - snprintf(buf, sizeof(buf), "%s%d", (level ? "," : ""), f); - result += buf; - if (f > 0) { - last_non_zero_offset = result.size(); - } - } - result.resize(last_non_zero_offset); - return result; - } - - std::string IterStatus(Iterator* iter) { - std::string result; - if (iter->Valid()) { - result = iter->key().ToString() + "->" + iter->value().ToString(); - } else { - result = "(invalid)"; - } - return result; - } -}; - -TEST(SimpleTableDBTest, Empty) { - ASSERT_TRUE(db_ != nullptr); - ASSERT_EQ("NOT_FOUND", Get("0000000000000foo")); -} - -TEST(SimpleTableDBTest, ReadWrite) { - ASSERT_OK(Put("0000000000000foo", "v1")); - ASSERT_EQ("v1", Get("0000000000000foo")); - ASSERT_OK(Put("0000000000000bar", "v2")); - ASSERT_OK(Put("0000000000000foo", "v3")); - ASSERT_EQ("v3", Get("0000000000000foo")); - ASSERT_EQ("v2", Get("0000000000000bar")); -} - -TEST(SimpleTableDBTest, Flush) { - ASSERT_OK(Put("0000000000000foo", "v1")); - ASSERT_OK(Put("0000000000000bar", "v2")); - ASSERT_OK(Put("0000000000000foo", "v3")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v3", Get("0000000000000foo")); - ASSERT_EQ("v2", Get("0000000000000bar")); -} - -TEST(SimpleTableDBTest, Flush2) { - ASSERT_OK(Put("0000000000000bar", "b")); - ASSERT_OK(Put("0000000000000foo", "v1")); - dbfull()->TEST_FlushMemTable(); - - ASSERT_OK(Put("0000000000000foo", "v2")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v2", Get("0000000000000foo")); - - ASSERT_OK(Put("0000000000000eee", "v3")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v3", Get("0000000000000eee")); - - ASSERT_OK(Delete("0000000000000bar")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("NOT_FOUND", Get("0000000000000bar")); - - ASSERT_OK(Put("0000000000000eee", "v5")); - dbfull()->TEST_FlushMemTable(); - ASSERT_EQ("v5", Get("0000000000000eee")); -} - -static std::string 
Key(int i) { - char buf[100]; - snprintf(buf, sizeof(buf), "key_______%06d", i); - return std::string(buf); -} - -static std::string RandomString(Random* rnd, int len) { - std::string r; - test::RandomString(rnd, len, &r); - return r; -} - -TEST(SimpleTableDBTest, CompactionTrigger) { - Options options = CurrentOptions(); - options.write_buffer_size = 100 << 10; //100KB - options.num_levels = 3; - options.max_mem_compaction_level = 0; - options.level0_file_num_compaction_trigger = 3; - Reopen(&options); - - Random rnd(301); - - for (int num = 0; num < options.level0_file_num_compaction_trigger - 1; - num++) { - std::vector values; - // Write 120KB (12 values, each 10K) - for (int i = 0; i < 12; i++) { - values.push_back(RandomString(&rnd, 10000)); - ASSERT_OK(Put(Key(i), values[i])); - } - dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_EQ(NumTableFilesAtLevel(0), num + 1); - } - - //generate one more file in level-0, and should trigger level-0 compaction - std::vector values; - for (int i = 0; i < 12; i++) { - values.push_back(RandomString(&rnd, 10000)); - ASSERT_OK(Put(Key(i), values[i])); - } - dbfull()->TEST_WaitForCompact(); - - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - ASSERT_EQ(NumTableFilesAtLevel(1), 1); -} - -} // namespace rocksdb - -int main(int argc, char** argv) { - return rocksdb::test::RunAllTests(); -} diff --git a/db/skiplist.h b/db/skiplist.h index 751f7c3ec..c1e375007 100644 --- a/db/skiplist.h +++ b/db/skiplist.h @@ -32,10 +32,10 @@ #pragma once #include +#include #include -#include "util/arena.h" #include "port/port.h" -#include "util/arena.h" +#include "util/allocator.h" #include "util/random.h" namespace rocksdb { @@ -47,9 +47,9 @@ class SkipList { public: // Create a new SkipList object that will use "cmp" for comparing keys, - // and will allocate memory using "*arena". Objects allocated in the arena - // must remain allocated for the lifetime of the skiplist object. 
- explicit SkipList(Comparator cmp, Arena* arena, + // and will allocate memory using "*allocator". Objects allocated in the + // allocator must remain allocated for the lifetime of the skiplist object. + explicit SkipList(Comparator cmp, Allocator* allocator, int32_t max_height = 12, int32_t branching_factor = 4); // Insert key into the list. @@ -109,21 +109,20 @@ class SkipList { // Immutable after construction Comparator const compare_; - Arena* const arena_; // Arena used for allocations of nodes + Allocator* const allocator_; // Allocator used for allocations of nodes Node* const head_; // Modified only by Insert(). Read racily by readers, but stale // values are ok. - port::AtomicPointer max_height_; // Height of the entire list + std::atomic max_height_; // Height of the entire list // Used for optimizing sequential insert patterns Node** prev_; int32_t prev_height_; inline int GetMaxHeight() const { - return static_cast( - reinterpret_cast(max_height_.NoBarrier_Load())); + return max_height_.load(std::memory_order_relaxed); } // Read/written only by Insert(). @@ -169,35 +168,35 @@ struct SkipList::Node { assert(n >= 0); // Use an 'acquire load' so that we observe a fully initialized // version of the returned Node. - return reinterpret_cast(next_[n].Acquire_Load()); + return (next_[n].load(std::memory_order_acquire)); } void SetNext(int n, Node* x) { assert(n >= 0); // Use a 'release store' so that anybody who reads through this // pointer observes a fully initialized version of the inserted node. - next_[n].Release_Store(x); + next_[n].store(x, std::memory_order_release); } // No-barrier variants that can be safely used in a few locations. 
Node* NoBarrier_Next(int n) { assert(n >= 0); - return reinterpret_cast(next_[n].NoBarrier_Load()); + return next_[n].load(std::memory_order_relaxed); } void NoBarrier_SetNext(int n, Node* x) { assert(n >= 0); - next_[n].NoBarrier_Store(x); + next_[n].store(x, std::memory_order_relaxed); } private: // Array of length equal to the node height. next_[0] is lowest level link. - port::AtomicPointer next_[1]; + std::atomic next_[1]; }; template typename SkipList::Node* SkipList::NewNode(const Key& key, int height) { - char* mem = arena_->AllocateAligned( - sizeof(Node) + sizeof(port::AtomicPointer) * (height - 1)); + char* mem = allocator_->AllocateAligned( + sizeof(Node) + sizeof(std::atomic) * (height - 1)); return new (mem) Node(key); } @@ -356,23 +355,24 @@ typename SkipList::Node* SkipList::FindLast() } template -SkipList::SkipList(const Comparator cmp, Arena* arena, +SkipList::SkipList(const Comparator cmp, Allocator* allocator, int32_t max_height, int32_t branching_factor) : kMaxHeight_(max_height), kBranching_(branching_factor), compare_(cmp), - arena_(arena), + allocator_(allocator), head_(NewNode(0 /* any key will do */, max_height)), - max_height_(reinterpret_cast(1)), + max_height_(1), prev_height_(1), rnd_(0xdeadbeef) { assert(kMaxHeight_ > 0); assert(kBranching_ > 0); - // Allocate the prev_ Node* array, directly from the passed-in arena. + // Allocate the prev_ Node* array, directly from the passed-in allocator. // prev_ does not need to be freed, as its life cycle is tied up with - // the arena as a whole. - prev_ = (Node**) arena_->AllocateAligned(sizeof(Node*) * kMaxHeight_); + // the allocator as a whole. + prev_ = reinterpret_cast( + allocator_->AllocateAligned(sizeof(Node*) * kMaxHeight_)); for (int i = 0; i < kMaxHeight_; i++) { head_->SetNext(i, nullptr); prev_[i] = head_; @@ -402,7 +402,7 @@ void SkipList::Insert(const Key& key) { // the loop below. 
In the former case the reader will // immediately drop to the next level since nullptr sorts after all // keys. In the latter case the reader will use the new node. - max_height_.NoBarrier_Store(reinterpret_cast(height)); + max_height_.store(height, std::memory_order_relaxed); } x = NewNode(key, height); diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc index b87ddcbb0..d8e113c66 100644 --- a/db/skiplist_test.cc +++ b/db/skiplist_test.cc @@ -191,13 +191,11 @@ class ConcurrentTest { // Per-key generation struct State { - port::AtomicPointer generation[K]; - void Set(int k, intptr_t v) { - generation[k].Release_Store(reinterpret_cast(v)); - } - intptr_t Get(int k) { - return reinterpret_cast(generation[k].Acquire_Load()); + std::atomic generation[K]; + void Set(int k, int v) { + generation[k].store(v, std::memory_order_release); } + int Get(int k) { return generation[k].load(std::memory_order_acquire); } State() { for (unsigned int k = 0; k < K; k++) { @@ -221,9 +219,9 @@ class ConcurrentTest { // REQUIRES: External synchronization void WriteStep(Random* rnd) { const uint32_t k = rnd->Next() % K; - const intptr_t g = current_.Get(k) + 1; - const Key key = MakeKey(k, g); - list_.Insert(key); + const int g = current_.Get(k) + 1; + const Key new_key = MakeKey(k, g); + list_.Insert(new_key); current_.Set(k, g); } @@ -255,11 +253,10 @@ class ConcurrentTest { // Note that generation 0 is never inserted, so it is ok if // <*,0,*> is missing. 
ASSERT_TRUE((gen(pos) == 0U) || - (gen(pos) > (uint64_t)initial_state.Get(key(pos))) - ) << "key: " << key(pos) - << "; gen: " << gen(pos) - << "; initgen: " - << initial_state.Get(key(pos)); + (gen(pos) > static_cast(initial_state.Get( + static_cast(key(pos)))))) + << "key: " << key(pos) << "; gen: " << gen(pos) + << "; initgen: " << initial_state.Get(static_cast(key(pos))); // Advance to next key in the valid key space if (key(pos) < key(current)) { @@ -303,7 +300,7 @@ class TestState { public: ConcurrentTest t_; int seed_; - port::AtomicPointer quit_flag_; + std::atomic quit_flag_; enum ReaderState { STARTING, @@ -312,10 +309,7 @@ class TestState { }; explicit TestState(int s) - : seed_(s), - quit_flag_(nullptr), - state_(STARTING), - state_cv_(&mu_) {} + : seed_(s), quit_flag_(false), state_(STARTING), state_cv_(&mu_) {} void Wait(ReaderState s) { mu_.Lock(); @@ -343,7 +337,7 @@ static void ConcurrentReader(void* arg) { Random rnd(state->seed_); int64_t reads = 0; state->Change(TestState::RUNNING); - while (!state->quit_flag_.Acquire_Load()) { + while (!state->quit_flag_.load(std::memory_order_acquire)) { state->t_.ReadStep(&rnd); ++reads; } @@ -362,10 +356,10 @@ static void RunConcurrent(int run) { TestState state(seed + 1); Env::Default()->Schedule(ConcurrentReader, &state); state.Wait(TestState::RUNNING); - for (int i = 0; i < kSize; i++) { + for (int k = 0; k < kSize; k++) { state.t_.WriteStep(&rnd); } - state.quit_flag_.Release_Store(&state); // Any non-nullptr arg will do + state.quit_flag_.store(true, std::memory_order_release); state.Wait(TestState::DONE); } } diff --git a/db/snapshot.h b/db/snapshot.h index 2c2e3eac8..de9897f24 100644 --- a/db/snapshot.h +++ b/db/snapshot.h @@ -20,6 +20,8 @@ class SnapshotImpl : public Snapshot { public: SequenceNumber number_; // const after creation + virtual SequenceNumber GetSequenceNumber() const { return number_; } + private: friend class SnapshotList; @@ -28,6 +30,8 @@ class SnapshotImpl : public Snapshot { 
SnapshotImpl* next_; SnapshotList* list_; // just for sanity checks + + int64_t unix_time_; }; class SnapshotList { @@ -36,20 +40,23 @@ class SnapshotList { list_.prev_ = &list_; list_.next_ = &list_; list_.number_ = 0xFFFFFFFFL; // placeholder marker, for debugging + count_ = 0; } bool empty() const { return list_.next_ == &list_; } SnapshotImpl* oldest() const { assert(!empty()); return list_.next_; } SnapshotImpl* newest() const { assert(!empty()); return list_.prev_; } - const SnapshotImpl* New(SequenceNumber seq) { + const SnapshotImpl* New(SequenceNumber seq, uint64_t unix_time) { SnapshotImpl* s = new SnapshotImpl; s->number_ = seq; + s->unix_time_ = unix_time; s->list_ = this; s->next_ = &list_; s->prev_ = list_.prev_; s->prev_->next_ = s; s->next_->prev_ = s; + count_++; return s; } @@ -57,6 +64,7 @@ class SnapshotList { assert(s->list_ == this); s->prev_->next_ = s->next_; s->next_->prev_ = s->prev_; + count_--; delete s; } @@ -71,16 +79,27 @@ class SnapshotList { } // get the sequence number of the most recent snapshot - const SequenceNumber GetNewest() { + SequenceNumber GetNewest() { if (empty()) { return 0; } return newest()->number_; } + int64_t GetOldestSnapshotTime() const { + if (empty()) { + return 0; + } else { + return oldest()->unix_time_; + } + } + + uint64_t count() const { return count_; } + private: // Dummy head of doubly-linked list of snapshots SnapshotImpl list_; + uint64_t count_; }; } // namespace rocksdb diff --git a/db/table_cache.cc b/db/table_cache.cc index c362499a6..e1b0ca8b9 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -15,6 +15,7 @@ #include "rocksdb/statistics.h" #include "table/iterator_wrapper.h" #include "table/table_reader.h" +#include "table/get_context.h" #include "util/coding.h" #include "util/stop_watch.h" @@ -36,12 +37,10 @@ static Slice GetSliceForFileNumber(const uint64_t* file_number) { sizeof(*file_number)); } -TableCache::TableCache(const Options* options, - const EnvOptions& storage_options, 
Cache* const cache) - : env_(options->env), - db_paths_(options->db_paths), - options_(options), - storage_options_(storage_options), +TableCache::TableCache(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, Cache* const cache) + : ioptions_(ioptions), + env_options_(env_options), cache_(cache) {} TableCache::~TableCache() { @@ -55,7 +54,7 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) { cache_->Release(handle); } -Status TableCache::FindTable(const EnvOptions& toptions, +Status TableCache::FindTable(const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, Cache::Handle** handle, const bool no_io) { @@ -68,28 +67,27 @@ Status TableCache::FindTable(const EnvOptions& toptions, return Status::Incomplete("Table not found in table_cache, no_io is set"); } std::string fname = - TableFileName(db_paths_, fd.GetNumber(), fd.GetPathId()); + TableFileName(ioptions_.db_paths, fd.GetNumber(), fd.GetPathId()); unique_ptr file; unique_ptr table_reader; - s = env_->NewRandomAccessFile(fname, &file, toptions); - RecordTick(options_->statistics.get(), NO_FILE_OPENS); + s = ioptions_.env->NewRandomAccessFile(fname, &file, env_options); + RecordTick(ioptions_.statistics, NO_FILE_OPENS); if (s.ok()) { - if (options_->advise_random_on_open) { + if (ioptions_.advise_random_on_open) { file->Hint(RandomAccessFile::RANDOM); } - StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS); - s = options_->table_factory->NewTableReader( - *options_, toptions, internal_comparator, std::move(file), + StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS); + s = ioptions_.table_factory->NewTableReader( + ioptions_, env_options, internal_comparator, std::move(file), fd.GetFileSize(), &table_reader); } if (!s.ok()) { assert(table_reader == nullptr); - RecordTick(options_->statistics.get(), NO_FILE_ERRORS); + RecordTick(ioptions_.statistics, NO_FILE_ERRORS); // We do not cache error results so 
that if the error is transient, // or somebody repairs the file, we recover automatically. } else { - assert(file.get() == nullptr); *handle = cache_->Insert(key, table_reader.release(), 1, &DeleteEntry); } } @@ -97,7 +95,7 @@ Status TableCache::FindTable(const EnvOptions& toptions, } Iterator* TableCache::NewIterator(const ReadOptions& options, - const EnvOptions& toptions, + const EnvOptions& env_options, const InternalKeyComparator& icomparator, const FileDescriptor& fd, TableReader** table_reader_ptr, @@ -109,7 +107,7 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, Cache::Handle* handle = nullptr; Status s; if (table_reader == nullptr) { - s = FindTable(toptions, icomparator, fd, &handle, + s = FindTable(env_options, icomparator, fd, &handle, options.read_tier == kBlockCacheTier); if (!s.ok()) { return NewErrorIterator(s, arena); @@ -134,34 +132,33 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, Status TableCache::Get(const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileDescriptor& fd, const Slice& k, void* arg, - bool (*saver)(void*, const ParsedInternalKey&, - const Slice&), - void (*mark_key_may_exist)(void*)) { + const FileDescriptor& fd, const Slice& k, + GetContext* get_context) { TableReader* t = fd.table_reader; Status s; Cache::Handle* handle = nullptr; if (!t) { - s = FindTable(storage_options_, internal_comparator, fd, &handle, + s = FindTable(env_options_, internal_comparator, fd, &handle, options.read_tier == kBlockCacheTier); if (s.ok()) { t = GetTableReaderFromHandle(handle); } } if (s.ok()) { - s = t->Get(options, k, arg, saver, mark_key_may_exist); + s = t->Get(options, k, get_context); if (handle != nullptr) { ReleaseHandle(handle); } } else if (options.read_tier && s.IsIncomplete()) { // Couldnt find Table in cache but treat as kFound if no_io set - (*mark_key_may_exist)(arg); + get_context->MarkKeyMayExist(); return Status::OK(); } return s; } + Status 
TableCache::GetTableProperties( - const EnvOptions& toptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, std::shared_ptr* properties, bool no_io) { Status s; @@ -174,7 +171,7 @@ Status TableCache::GetTableProperties( } Cache::Handle* table_handle = nullptr; - s = FindTable(toptions, internal_comparator, fd, &table_handle, no_io); + s = FindTable(env_options, internal_comparator, fd, &table_handle, no_io); if (!s.ok()) { return s; } @@ -186,7 +183,7 @@ Status TableCache::GetTableProperties( } size_t TableCache::GetMemoryUsageByTableReader( - const EnvOptions& toptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd) { Status s; @@ -197,7 +194,7 @@ size_t TableCache::GetMemoryUsageByTableReader( } Cache::Handle* table_handle = nullptr; - s = FindTable(toptions, internal_comparator, fd, &table_handle, true); + s = FindTable(env_options, internal_comparator, fd, &table_handle, true); if (!s.ok()) { return 0; } diff --git a/db/table_cache.h b/db/table_cache.h index 79090e064..76bb1c0a2 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -19,6 +19,7 @@ #include "rocksdb/cache.h" #include "rocksdb/env.h" #include "rocksdb/table.h" +#include "rocksdb/options.h" #include "table/table_reader.h" namespace rocksdb { @@ -26,11 +27,12 @@ namespace rocksdb { class Env; class Arena; struct FileDescriptor; +class GetContext; class TableCache { public: - TableCache(const Options* options, const EnvOptions& storage_options, - Cache* cache); + TableCache(const ImmutableCFOptions& ioptions, + const EnvOptions& storage_options, Cache* cache); ~TableCache(); // Return an iterator for the specified file number (the corresponding @@ -51,10 +53,8 @@ class TableCache { // it returns false. 
Status Get(const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileDescriptor& file_fd, const Slice& k, void* arg, - bool (*handle_result)(void*, const ParsedInternalKey&, - const Slice&), - void (*mark_key_may_exist)(void*) = nullptr); + const FileDescriptor& file_fd, const Slice& k, + GetContext* get_context); // Evict any entry for the specified file number static void Evict(Cache* cache, uint64_t file_number); @@ -91,10 +91,8 @@ class TableCache { void ReleaseHandle(Cache::Handle* handle); private: - Env* const env_; - const std::vector db_paths_; - const Options* options_; - const EnvOptions& storage_options_; + const ImmutableCFOptions& ioptions_; + const EnvOptions& env_options_; Cache* const cache_; }; diff --git a/db/table_properties_collector.cc b/db/table_properties_collector.cc index 25bd70036..36ed0f97f 100644 --- a/db/table_properties_collector.cc +++ b/db/table_properties_collector.cc @@ -7,6 +7,7 @@ #include "db/dbformat.h" #include "util/coding.h" +#include "util/string_util.h" namespace rocksdb { @@ -40,7 +41,7 @@ Status InternalKeyPropertiesCollector::Finish( UserCollectedProperties InternalKeyPropertiesCollector::GetReadableProperties() const { return { - { "kDeletedKeys", std::to_string(deleted_keys_) } + { "kDeletedKeys", ToString(deleted_keys_) } }; } diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 638b259f2..364b23b44 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -11,6 +11,7 @@ #include "db/dbformat.h" #include "db/table_properties_collector.h" #include "rocksdb/table.h" +#include "rocksdb/immutable_options.h" #include "table/block_based_table_factory.h" #include "table/meta_blocks.h" #include "table/plain_table_factory.h" @@ -78,6 +79,7 @@ class FakeRandomeAccessFile : public RandomAccessFile { class DumbLogger : public Logger { public: + using Logger::Logv; virtual void Logv(const char* format, va_list ap) 
{ } virtual size_t GetLogFileSize() const { return 0; } }; @@ -85,12 +87,14 @@ class DumbLogger : public Logger { // Utilities test functions namespace { void MakeBuilder(const Options& options, + const ImmutableCFOptions& ioptions, const InternalKeyComparator& internal_comparator, std::unique_ptr* writable, std::unique_ptr* builder) { writable->reset(new FakeWritableFile); - builder->reset(options.table_factory->NewTableBuilder( - options, internal_comparator, writable->get(), options.compression)); + builder->reset(ioptions.table_factory->NewTableBuilder( + ioptions, internal_comparator, writable->get(), + options.compression, options.compression_opts)); } } // namespace @@ -153,7 +157,8 @@ void TestCustomizedTablePropertiesCollector( // -- Step 1: build table std::unique_ptr builder; std::unique_ptr writable; - MakeBuilder(options, internal_comparator, &writable, &builder); + const ImmutableCFOptions ioptions(options); + MakeBuilder(options, ioptions, internal_comparator, &writable, &builder); for (const auto& kv : kvs) { if (encode_as_internal) { @@ -264,9 +269,10 @@ void TestInternalKeyPropertiesCollector( options.table_properties_collector_factories = { std::make_shared()}; } + const ImmutableCFOptions ioptions(options); for (int iter = 0; iter < 2; ++iter) { - MakeBuilder(options, pikc, &writable, &builder); + MakeBuilder(options, ioptions, pikc, &writable, &builder); for (const auto& k : keys) { builder->Add(k.Encode(), "val"); } diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc index bfcf7b328..b0bf6e4e9 100644 --- a/db/transaction_log_impl.cc +++ b/db/transaction_log_impl.cc @@ -4,6 +4,11 @@ // of patent rights can be found in the PATENTS file in the same directory. 
#ifndef ROCKSDB_LITE +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include #include "db/transaction_log_impl.h" #include "db/write_batch_internal.h" @@ -13,7 +18,7 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl( const std::string& dir, const DBOptions* options, const TransactionLogIterator::ReadOptions& read_options, const EnvOptions& soptions, const SequenceNumber seq, - std::unique_ptr files, DBImpl const* const dbimpl) + std::unique_ptr files, VersionSet const* const versions) : dir_(dir), options_(options), read_options_(read_options), @@ -25,9 +30,9 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl( currentFileIndex_(0), currentBatchSeq_(0), currentLastSeq_(0), - dbimpl_(dbimpl) { + versions_(versions) { assert(files_ != nullptr); - assert(dbimpl_ != nullptr); + assert(versions_ != nullptr); reporter_.env = options_->env; reporter_.info_log = options_->info_log.get(); @@ -43,14 +48,14 @@ Status TransactionLogIteratorImpl::OpenLogFile( return env->NewSequentialFile(fname, file, soptions_); } else { std::string fname = LogFileName(dir_, logFile->LogNumber()); - Status status = env->NewSequentialFile(fname, file, soptions_); - if (!status.ok()) { + Status s = env->NewSequentialFile(fname, file, soptions_); + if (!s.ok()) { // If cannot open file in DB directory. // Try the archive dir, as it could have moved in the meanwhile. 
fname = ArchivedLogFileName(dir_, logFile->LogNumber()); - status = env->NewSequentialFile(fname, file, soptions_); + s = env->NewSequentialFile(fname, file, soptions_); } - return status; + return s; } } @@ -74,7 +79,7 @@ bool TransactionLogIteratorImpl::RestrictedRead( Slice* record, std::string* scratch) { // Don't read if no more complete entries to read from logs - if (currentLastSeq_ >= dbimpl_->GetLatestSequenceNumber()) { + if (currentLastSeq_ >= versions_->LastSequence()) { return false; } return currentLogReader_->ReadRecord(record, scratch); @@ -177,15 +182,15 @@ void TransactionLogIteratorImpl::NextImpl(bool internal) { // Open the next file if (currentFileIndex_ < files_->size() - 1) { ++currentFileIndex_; - Status status =OpenLogReader(files_->at(currentFileIndex_).get()); - if (!status.ok()) { + Status s = OpenLogReader(files_->at(currentFileIndex_).get()); + if (!s.ok()) { isValid_ = false; - currentStatus_ = status; + currentStatus_ = s; return; } } else { isValid_ = false; - if (currentLastSeq_ == dbimpl_->GetLatestSequenceNumber()) { + if (currentLastSeq_ == versions_->LastSequence()) { currentStatus_ = Status::OK(); } else { currentStatus_ = Status::Corruption("NO MORE DATA LEFT"); @@ -203,12 +208,10 @@ bool TransactionLogIteratorImpl::IsBatchExpected( if (batchSeq != expectedSeq) { char buf[200]; snprintf(buf, sizeof(buf), - "Discontinuity in log records. Got seq=%lu, Expected seq=%lu, " - "Last flushed seq=%lu.Log iterator will reseek the correct " - "batch.", - (unsigned long)batchSeq, - (unsigned long)expectedSeq, - (unsigned long)dbimpl_->GetLatestSequenceNumber()); + "Discontinuity in log records. 
Got seq=%" PRIu64 + ", Expected seq=%" PRIu64 ", Last flushed seq=%" PRIu64 + ".Log iterator will reseek the correct batch.", + batchSeq, expectedSeq, versions_->LastSequence()); reporter_.Info(buf); return false; } @@ -240,7 +243,7 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) { currentLastSeq_ = currentBatchSeq_ + WriteBatchInternal::Count(batch.get()) - 1; // currentBatchSeq_ can only change here - assert(currentLastSeq_ <= dbimpl_->GetLatestSequenceNumber()); + assert(currentLastSeq_ <= versions_->LastSequence()); currentBatch_ = move(batch); isValid_ = true; @@ -249,9 +252,9 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) { Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) { unique_ptr file; - Status status = OpenLogFile(logFile, &file); - if (!status.ok()) { - return status; + Status s = OpenLogFile(logFile, &file); + if (!s.ok()) { + return s; } assert(file); currentLogReader_.reset(new log::Reader(std::move(file), &reporter_, diff --git a/db/transaction_log_impl.h b/db/transaction_log_impl.h index 319b01cb1..f0e572a5b 100644 --- a/db/transaction_log_impl.h +++ b/db/transaction_log_impl.h @@ -11,23 +11,12 @@ #include "rocksdb/options.h" #include "rocksdb/types.h" #include "rocksdb/transaction_log.h" -#include "db/db_impl.h" +#include "db/version_set.h" #include "db/log_reader.h" #include "db/filename.h" namespace rocksdb { -struct LogReporter : public log::Reader::Reporter { - Env* env; - Logger* info_log; - virtual void Corruption(size_t bytes, const Status& s) { - Log(info_log, "dropping %zu bytes; %s", bytes, s.ToString().c_str()); - } - virtual void Info(const char* s) { - Log(info_log, "%s", s); - } -}; - class LogFileImpl : public LogFile { public: LogFileImpl(uint64_t logNum, WalFileType logType, SequenceNumber startSeq, @@ -71,7 +60,7 @@ class TransactionLogIteratorImpl : public TransactionLogIterator { const std::string& dir, const DBOptions* options, const 
TransactionLogIterator::ReadOptions& read_options, const EnvOptions& soptions, const SequenceNumber seqNum, - std::unique_ptr files, DBImpl const* const dbimpl); + std::unique_ptr files, VersionSet const* const versions); virtual bool Valid(); @@ -95,10 +84,24 @@ class TransactionLogIteratorImpl : public TransactionLogIterator { std::unique_ptr currentBatch_; unique_ptr currentLogReader_; Status OpenLogFile(const LogFile* logFile, unique_ptr* file); - LogReporter reporter_; + + struct LogReporter : public log::Reader::Reporter { + Env* env; + Logger* info_log; + virtual void Corruption(size_t bytes, const Status& s) { + Log(InfoLogLevel::ERROR_LEVEL, info_log, "dropping %zu bytes; %s", bytes, + s.ToString().c_str()); + } + virtual void Info(const char* s) { + Log(InfoLogLevel::INFO_LEVEL, info_log, "%s", s); + } + } reporter_; + SequenceNumber currentBatchSeq_; // sequence number at start of current batch SequenceNumber currentLastSeq_; // last sequence in the current batch - DBImpl const * const dbimpl_; // The db on whose log files this iterates + // Used only to get latest seq. num + // TODO(icanadi) can this be just a callback? + VersionSet const* const versions_; // Reads from transaction log only if the writebatch record has been written bool RestrictedRead(Slice* record, std::string* scratch); diff --git a/db/version_builder.cc b/db/version_builder.cc new file mode 100644 index 000000000..c010ee429 --- /dev/null +++ b/db/version_builder.cc @@ -0,0 +1,330 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
See the AUTHORS file for names of contributors. + +#include "db/version_builder.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include +#include +#include +#include +#include +#include + +#include "db/dbformat.h" +#include "db/table_cache.h" +#include "db/version_set.h" +#include "table/table_reader.h" + +namespace rocksdb { + +bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) { + if (a->smallest_seqno != b->smallest_seqno) { + return a->smallest_seqno > b->smallest_seqno; + } + if (a->largest_seqno != b->largest_seqno) { + return a->largest_seqno > b->largest_seqno; + } + // Break ties by file number + return a->fd.GetNumber() > b->fd.GetNumber(); +} + +namespace { +bool BySmallestKey(FileMetaData* a, FileMetaData* b, + const InternalKeyComparator* cmp) { + int r = cmp->Compare(a->smallest, b->smallest); + if (r != 0) { + return (r < 0); + } + // Break ties by file number + return (a->fd.GetNumber() < b->fd.GetNumber()); +} +} // namespace + +class VersionBuilder::Rep { + private: + // Helper to sort files_ in v + // kLevel0 -- NewestFirstBySeqNo + // kLevelNon0 -- BySmallestKey + struct FileComparator { + enum SortMethod { kLevel0 = 0, kLevelNon0 = 1, } sort_method; + const InternalKeyComparator* internal_comparator; + + bool operator()(FileMetaData* f1, FileMetaData* f2) const { + switch (sort_method) { + case kLevel0: + return NewestFirstBySeqNo(f1, f2); + case kLevelNon0: + return BySmallestKey(f1, f2, internal_comparator); + } + assert(false); + return false; + } + }; + + struct LevelState { + std::unordered_set deleted_files; + // Map from file number to file meta data. 
+ std::unordered_map added_files; + }; + + const EnvOptions& env_options_; + TableCache* table_cache_; + VersionStorageInfo* base_vstorage_; + LevelState* levels_; + FileComparator level_zero_cmp_; + FileComparator level_nonzero_cmp_; + + public: + Rep(const EnvOptions& env_options, TableCache* table_cache, + VersionStorageInfo* base_vstorage) + : env_options_(env_options), + table_cache_(table_cache), + base_vstorage_(base_vstorage) { + levels_ = new LevelState[base_vstorage_->num_levels()]; + level_zero_cmp_.sort_method = FileComparator::kLevel0; + level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0; + level_nonzero_cmp_.internal_comparator = + base_vstorage_->InternalComparator(); + } + + ~Rep() { + for (int level = 0; level < base_vstorage_->num_levels(); level++) { + const auto& added = levels_[level].added_files; + for (auto& pair : added) { + UnrefFile(pair.second); + } + } + + delete[] levels_; + } + + void UnrefFile(FileMetaData* f) { + f->refs--; + if (f->refs <= 0) { + if (f->table_reader_handle) { + assert(table_cache_ != nullptr); + table_cache_->ReleaseHandle(f->table_reader_handle); + f->table_reader_handle = nullptr; + } + delete f; + } + } + + void CheckConsistency(VersionStorageInfo* vstorage) { +#ifndef NDEBUG + // make sure the files are sorted correctly + for (int level = 0; level < vstorage->num_levels(); level++) { + auto& level_files = vstorage->LevelFiles(level); + for (size_t i = 1; i < level_files.size(); i++) { + auto f1 = level_files[i - 1]; + auto f2 = level_files[i]; + if (level == 0) { + assert(level_zero_cmp_(f1, f2)); + assert(f1->largest_seqno > f2->largest_seqno); + } else { + assert(level_nonzero_cmp_(f1, f2)); + + // Make sure there is no overlap in levels > 0 + if (vstorage->InternalComparator()->Compare(f1->largest, + f2->smallest) >= 0) { + fprintf(stderr, "overlapping ranges in same level %s vs. 
%s\n", + (f1->largest).DebugString().c_str(), + (f2->smallest).DebugString().c_str()); + abort(); + } + } + } + } +#endif + } + + void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number, + int level) { +#ifndef NDEBUG + // a file to be deleted better exist in the previous version + bool found = false; + for (int l = 0; !found && l < base_vstorage_->num_levels(); l++) { + const std::vector& base_files = + base_vstorage_->LevelFiles(l); + for (unsigned int i = 0; i < base_files.size(); i++) { + FileMetaData* f = base_files[i]; + if (f->fd.GetNumber() == number) { + found = true; + break; + } + } + } + // if the file did not exist in the previous version, then it + // is possibly moved from lower level to higher level in current + // version + for (int l = level + 1; !found && l < base_vstorage_->num_levels(); l++) { + auto& level_added = levels_[l].added_files; + auto got = level_added.find(number); + if (got != level_added.end()) { + found = true; + break; + } + } + + // maybe this file was added in a previous edit that was Applied + if (!found) { + auto& level_added = levels_[level].added_files; + auto got = level_added.find(number); + if (got != level_added.end()) { + found = true; + } + } + if (!found) { + fprintf(stderr, "not found %" PRIu64 "\n", number); + } + assert(found); +#endif + } + + // Apply all of the edits in *edit to the current state. 
+ void Apply(VersionEdit* edit) { + CheckConsistency(base_vstorage_); + + // Delete files + const VersionEdit::DeletedFileSet& del = edit->GetDeletedFiles(); + for (const auto& del_file : del) { + const auto level = del_file.first; + const auto number = del_file.second; + levels_[level].deleted_files.insert(number); + CheckConsistencyForDeletes(edit, number, level); + + auto exising = levels_[level].added_files.find(number); + if (exising != levels_[level].added_files.end()) { + UnrefFile(exising->second); + levels_[level].added_files.erase(number); + } + } + + // Add new files + for (const auto& new_file : edit->GetNewFiles()) { + const int level = new_file.first; + FileMetaData* f = new FileMetaData(new_file.second); + f->refs = 1; + + assert(levels_[level].added_files.find(f->fd.GetNumber()) == + levels_[level].added_files.end()); + levels_[level].deleted_files.erase(f->fd.GetNumber()); + levels_[level].added_files[f->fd.GetNumber()] = f; + } + } + + // Save the current state in *v. + void SaveTo(VersionStorageInfo* vstorage) { + CheckConsistency(base_vstorage_); + CheckConsistency(vstorage); + + for (int level = 0; level < base_vstorage_->num_levels(); level++) { + const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_; + // Merge the set of added files with the set of pre-existing files. + // Drop any deleted files. Store the result in *v. + const auto& base_files = base_vstorage_->LevelFiles(level); + auto base_iter = base_files.begin(); + auto base_end = base_files.end(); + const auto& unordered_added_files = levels_[level].added_files; + vstorage->Reserve(level, + base_files.size() + unordered_added_files.size()); + + // Sort added files for the level. 
+ std::vector added_files; + added_files.reserve(unordered_added_files.size()); + for (const auto& pair : unordered_added_files) { + added_files.push_back(pair.second); + } + std::sort(added_files.begin(), added_files.end(), cmp); + +#ifndef NDEBUG + FileMetaData* prev_file = nullptr; +#endif + + for (const auto& added : added_files) { +#ifndef NDEBUG + if (level > 0 && prev_file != nullptr) { + assert(base_vstorage_->InternalComparator()->Compare( + prev_file->smallest, added->smallest) <= 0); + } + prev_file = added; +#endif + + // Add all smaller files listed in base_ + for (auto bpos = std::upper_bound(base_iter, base_end, added, cmp); + base_iter != bpos; ++base_iter) { + MaybeAddFile(vstorage, level, *base_iter); + } + + MaybeAddFile(vstorage, level, added); + } + + // Add remaining base files + for (; base_iter != base_end; ++base_iter) { + MaybeAddFile(vstorage, level, *base_iter); + } + } + + CheckConsistency(vstorage); + } + + void LoadTableHandlers() { + assert(table_cache_ != nullptr); + for (int level = 0; level < base_vstorage_->num_levels(); level++) { + for (auto& file_meta_pair : levels_[level].added_files) { + auto* file_meta = file_meta_pair.second; + assert(!file_meta->table_reader_handle); + table_cache_->FindTable( + env_options_, *(base_vstorage_->InternalComparator()), + file_meta->fd, &file_meta->table_reader_handle, false); + if (file_meta->table_reader_handle != nullptr) { + // Load table_reader + file_meta->fd.table_reader = table_cache_->GetTableReaderFromHandle( + file_meta->table_reader_handle); + } + } + } + } + + void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f) { + if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) { + // File is deleted: do nothing + } else { + vstorage->AddFile(level, f); + } + } +}; + +VersionBuilder::VersionBuilder(const EnvOptions& env_options, + TableCache* table_cache, + VersionStorageInfo* base_vstorage) + : rep_(new Rep(env_options, table_cache, base_vstorage)) {} 
+VersionBuilder::~VersionBuilder() { delete rep_; } +void VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) { + rep_->CheckConsistency(vstorage); +} +void VersionBuilder::CheckConsistencyForDeletes(VersionEdit* edit, + uint64_t number, int level) { + rep_->CheckConsistencyForDeletes(edit, number, level); +} +void VersionBuilder::Apply(VersionEdit* edit) { rep_->Apply(edit); } +void VersionBuilder::SaveTo(VersionStorageInfo* vstorage) { + rep_->SaveTo(vstorage); +} +void VersionBuilder::LoadTableHandlers() { rep_->LoadTableHandlers(); } +void VersionBuilder::MaybeAddFile(VersionStorageInfo* vstorage, int level, + FileMetaData* f) { + rep_->MaybeAddFile(vstorage, level, f); +} + +} // namespace rocksdb diff --git a/db/version_builder.h b/db/version_builder.h new file mode 100644 index 000000000..452604f17 --- /dev/null +++ b/db/version_builder.h @@ -0,0 +1,42 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +#pragma once +#include "rocksdb/env.h" + +namespace rocksdb { + +class TableCache; +class VersionStorageInfo; +class VersionEdit; +struct FileMetaData; + +// A helper class so we can efficiently apply a whole sequence +// of edits to a particular state without creating intermediate +// Versions that contain full copies of the intermediate state. 
+class VersionBuilder { + public: + VersionBuilder(const EnvOptions& env_options, TableCache* table_cache, + VersionStorageInfo* base_vstorage); + ~VersionBuilder(); + void CheckConsistency(VersionStorageInfo* vstorage); + void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number, + int level); + void Apply(VersionEdit* edit); + void SaveTo(VersionStorageInfo* vstorage); + void LoadTableHandlers(); + void MaybeAddFile(VersionStorageInfo* vstorage, int level, FileMetaData* f); + + private: + class Rep; + Rep* rep_; +}; + +extern bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b); +} // namespace rocksdb diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc new file mode 100644 index 000000000..a48b4e3a2 --- /dev/null +++ b/db/version_builder_test.cc @@ -0,0 +1,229 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include +#include "db/version_edit.h" +#include "db/version_set.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class VersionBuilderTest { + public: + const Comparator* ucmp_; + InternalKeyComparator icmp_; + Options options_; + ImmutableCFOptions ioptions_; + MutableCFOptions mutable_cf_options_; + VersionStorageInfo vstorage_; + uint32_t file_num_; + CompactionOptionsFIFO fifo_options_; + std::vector size_being_compacted_; + + VersionBuilderTest() + : ucmp_(BytewiseComparator()), + icmp_(ucmp_), + ioptions_(options_), + mutable_cf_options_(options_, ioptions_), + vstorage_(&icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, + nullptr), + file_num_(1) { + mutable_cf_options_.RefreshDerivedOptions(ioptions_); + size_being_compacted_.resize(options_.num_levels); + } + + ~VersionBuilderTest() { + for (int i = 0; i < vstorage_.num_levels(); i++) { + for (auto* f : vstorage_.LevelFiles(i)) { + if (--f->refs == 0) { + delete f; + } + } + } + } + + InternalKey GetInternalKey(const char* ukey, + SequenceNumber smallest_seq = 100) { + return InternalKey(ukey, smallest_seq, kTypeValue); + } + + void Add(int level, uint32_t file_number, const char* smallest, + const char* largest, uint64_t file_size = 0, uint32_t path_id = 0, + SequenceNumber smallest_seq = 100, + SequenceNumber largest_seq = 100, + uint64_t num_entries = 0, uint64_t num_deletions = 0, + bool sampled = false) { + assert(level < vstorage_.num_levels()); + FileMetaData* f = new FileMetaData; + f->fd = FileDescriptor(file_number, path_id, file_size); + f->smallest = GetInternalKey(smallest, smallest_seq); + f->largest = GetInternalKey(largest, largest_seq); + f->compensated_file_size = file_size; + f->refs = 0; + f->num_entries = num_entries; + f->num_deletions = num_deletions; + vstorage_.AddFile(level, f); + if (sampled) { + f->init_stats_from_file = true; + vstorage_.UpdateAccumulatedStats(f); + } + } + + void 
UpdateVersionStorageInfo() { + vstorage_.UpdateFilesBySize(); + vstorage_.UpdateNumNonEmptyLevels(); + vstorage_.GenerateFileIndexer(); + vstorage_.GenerateLevelFilesBrief(); + vstorage_.SetFinalized(); + } +}; + +TEST(VersionBuilderTest, ApplyAndSaveTo) { + Add(0, 1U, "150", "200", 100U); + // Level 1 score 1.2 + Add(1, 66U, "150", "200", 100U); + Add(1, 88U, "201", "300", 100U); + // Level 2 score 1.8. File 7 is the largest. Should be picked + Add(2, 6U, "150", "179", 100U); + Add(2, 7U, "180", "220", 100U); + Add(2, 8U, "221", "300", 100U); + // Level 3 score slightly larger than 1 + Add(3, 26U, "150", "170", 100U); + Add(3, 27U, "171", "179", 100U); + Add(3, 28U, "191", "220", 100U); + Add(3, 29U, "221", "300", 100U); + UpdateVersionStorageInfo(); + + VersionEdit version_edit; + version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), + GetInternalKey("350"), 200, 200); + version_edit.DeleteFile(3, 27U); + + EnvOptions env_options; + + VersionBuilder version_builder(env_options, nullptr, &vstorage_); + + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, nullptr); + version_builder.Apply(&version_edit); + version_builder.SaveTo(&new_vstorage); + + ASSERT_EQ(400U, new_vstorage.NumLevelBytes(2)); + ASSERT_EQ(300U, new_vstorage.NumLevelBytes(3)); + + for (int i = 0; i < new_vstorage.num_levels(); i++) { + for (auto* f : new_vstorage.LevelFiles(i)) { + if (--f->refs == 0) { + delete f; + } + } + } +} + +TEST(VersionBuilderTest, ApplyMultipleAndSaveTo) { + UpdateVersionStorageInfo(); + + VersionEdit version_edit; + version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), + GetInternalKey("350"), 200, 200); + version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"), + GetInternalKey("450"), 200, 200); + version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), + GetInternalKey("650"), 200, 200); + version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), + GetInternalKey("550"), 200, 200); + 
version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), + GetInternalKey("750"), 200, 200); + + EnvOptions env_options; + + VersionBuilder version_builder(env_options, nullptr, &vstorage_); + + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, nullptr); + version_builder.Apply(&version_edit); + version_builder.SaveTo(&new_vstorage); + + ASSERT_EQ(500U, new_vstorage.NumLevelBytes(2)); + + for (int i = 0; i < new_vstorage.num_levels(); i++) { + for (auto* f : new_vstorage.LevelFiles(i)) { + if (--f->refs == 0) { + delete f; + } + } + } +} + +TEST(VersionBuilderTest, ApplyDeleteAndSaveTo) { + UpdateVersionStorageInfo(); + + EnvOptions env_options; + VersionBuilder version_builder(env_options, nullptr, &vstorage_); + VersionStorageInfo new_vstorage(&icmp_, ucmp_, options_.num_levels, + kCompactionStyleLevel, nullptr); + + VersionEdit version_edit; + version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), + GetInternalKey("350"), 200, 200); + version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"), + GetInternalKey("450"), 200, 200); + version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), + GetInternalKey("650"), 200, 200); + version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), + GetInternalKey("550"), 200, 200); + version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), + GetInternalKey("750"), 200, 200); + version_builder.Apply(&version_edit); + + VersionEdit version_edit2; + version_edit.AddFile(2, 808, 0, 100U, GetInternalKey("901"), + GetInternalKey("950"), 200, 200); + version_edit2.DeleteFile(2, 616); + version_edit2.DeleteFile(2, 636); + version_edit.AddFile(2, 806, 0, 100U, GetInternalKey("801"), + GetInternalKey("850"), 200, 200); + version_builder.Apply(&version_edit2); + + version_builder.SaveTo(&new_vstorage); + + ASSERT_EQ(300U, new_vstorage.NumLevelBytes(2)); + + for (int i = 0; i < new_vstorage.num_levels(); i++) { + for (auto* f : new_vstorage.LevelFiles(i)) { + if (--f->refs == 
0) { + delete f; + } + } + } +} + +TEST(VersionBuilderTest, EstimatedActiveKeys) { + const uint32_t kTotalSamples = 20; + const uint32_t kNumLevels = 5; + const uint32_t kFilesPerLevel = 8; + const uint32_t kNumFiles = kNumLevels * kFilesPerLevel; + const uint32_t kEntriesPerFile = 1000; + const uint32_t kDeletionsPerFile = 100; + for (uint32_t i = 0; i < kNumFiles; ++i) { + Add(static_cast(i / kFilesPerLevel), i + 1, + ToString((i + 100) * 1000).c_str(), + ToString((i + 100) * 1000 + 999).c_str(), + 100U, 0, 100, 100, + kEntriesPerFile, kDeletionsPerFile, + (i < kTotalSamples)); + } + // minus 2X for the number of deletion entries because: + // 1x for deletion entry does not count as a data entry. + // 1x for each deletion entry will actually remove one data entry. + ASSERT_EQ(vstorage_.GetEstimatedActiveKeys(), + (kEntriesPerFile - 2 * kDeletionsPerFile) * kNumFiles); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/db/version_edit.cc b/db/version_edit.cc index 4e2cf8f5b..f7b288870 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -64,7 +64,7 @@ void VersionEdit::Clear() { column_family_name_.clear(); } -void VersionEdit::EncodeTo(std::string* dst) const { +bool VersionEdit::EncodeTo(std::string* dst) const { if (has_comparator_) { PutVarint32(dst, kComparator); PutLengthPrefixedSlice(dst, comparator_); @@ -98,6 +98,9 @@ void VersionEdit::EncodeTo(std::string* dst) const { for (size_t i = 0; i < new_files_.size(); i++) { const FileMetaData& f = new_files_[i].second; + if (!f.smallest.Valid() || !f.largest.Valid()) { + return false; + } if (f.fd.GetPathId() == 0) { // Use older format to make sure user can roll back the build if they // don't config multiple DB paths. 
@@ -131,6 +134,7 @@ void VersionEdit::EncodeTo(std::string* dst) const { if (is_column_family_drop_) { PutVarint32(dst, kColumnFamilyDrop); } + return true; } static bool GetInternalKey(Slice* input, InternalKey* dst) { @@ -164,7 +168,6 @@ Status VersionEdit::DecodeFrom(const Slice& src) { // Temporary storage for parsing int level; - uint64_t number; FileMetaData f; Slice str; InternalKey key; @@ -233,9 +236,9 @@ Status VersionEdit::DecodeFrom(const Slice& src) { } break; - case kDeletedFile: - if (GetLevel(&input, &level, &msg) && - GetVarint64(&input, &number)) { + case kDeletedFile: { + uint64_t number; + if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number)) { deleted_files_.insert(std::make_pair(level, number)); } else { if (!msg) { @@ -243,6 +246,7 @@ Status VersionEdit::DecodeFrom(const Slice& src) { } } break; + } case kNewFile: { uint64_t number; @@ -293,7 +297,7 @@ Status VersionEdit::DecodeFrom(const Slice& src) { new_files_.push_back(std::make_pair(level, f)); } else { if (!msg) { - msg = "new-file2 entry"; + msg = "new-file3 entry"; } } break; diff --git a/db/version_edit.h b/db/version_edit.h index 58edfed45..004855ff9 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -8,6 +8,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once +#include #include #include #include @@ -38,10 +39,10 @@ struct FileDescriptor { FileDescriptor() : FileDescriptor(0, 0, 0) {} - FileDescriptor(uint64_t number, uint32_t path_id, uint64_t file_size) + FileDescriptor(uint64_t number, uint32_t path_id, uint64_t _file_size) : table_reader(nullptr), packed_number_and_path_id(PackFileNumberAndPathId(number, path_id)), - file_size(file_size) {} + file_size(_file_size) {} FileDescriptor& operator=(const FileDescriptor& fd) { table_reader = fd.table_reader; @@ -74,9 +75,11 @@ struct FileMetaData { // Stats for compensating deletion entries during compaction // File size compensated by deletion entry. 
- // This is updated in Version::UpdateTemporaryStats() first time when the - // file is created or loaded. After it is updated, it is immutable. + // This is updated in Version::UpdateAccumulatedStats() first time when the + // file is created or loaded. After it is updated (!= 0), it is immutable. uint64_t compensated_file_size; + // These values can mutate, but they can only be read or written from + // single-threaded LogAndApply thread uint64_t num_entries; // the number of entries. uint64_t num_deletions; // the number of deletion entries. uint64_t raw_key_size; // total uncompressed key size. @@ -109,20 +112,16 @@ struct FdWithKeyRange { largest_key() { } - FdWithKeyRange(FileDescriptor fd, - Slice smallest_key, Slice largest_key) - : fd(fd), - smallest_key(smallest_key), - largest_key(largest_key) { - } + FdWithKeyRange(FileDescriptor _fd, Slice _smallest_key, Slice _largest_key) + : fd(_fd), smallest_key(_smallest_key), largest_key(_largest_key) {} }; // Data structure to store an array of FdWithKeyRange in one level // Actual data is guaranteed to be stored closely -struct FileLevel { +struct LevelFilesBrief { size_t num_files; FdWithKeyRange* files; - FileLevel() { + LevelFilesBrief() { num_files = 0; files = nullptr; } @@ -163,13 +162,13 @@ class VersionEdit { // Add the specified file at the specified number. 
// REQUIRES: This version has not been saved (see VersionSet::SaveTo) // REQUIRES: "smallest" and "largest" are smallest and largest keys in file - void AddFile(int level, uint64_t file, uint64_t file_size, - uint64_t file_path_id, const InternalKey& smallest, + void AddFile(int level, uint64_t file, uint32_t file_path_id, + uint64_t file_size, const InternalKey& smallest, const InternalKey& largest, const SequenceNumber& smallest_seqno, const SequenceNumber& largest_seqno) { assert(smallest_seqno <= largest_seqno); FileMetaData f; - f.fd = FileDescriptor(file, file_size, file_path_id); + f.fd = FileDescriptor(file, file_path_id, file_size); f.smallest = smallest; f.largest = largest; f.smallest_seqno = smallest_seqno; @@ -183,9 +182,7 @@ class VersionEdit { } // Number of edits - int NumEntries() { - return new_files_.size() + deleted_files_.size(); - } + size_t NumEntries() { return new_files_.size() + deleted_files_.size(); } bool IsColumnFamilyManipulation() { return is_column_family_add_ || is_column_family_drop_; @@ -212,17 +209,23 @@ class VersionEdit { is_column_family_drop_ = true; } - void EncodeTo(std::string* dst) const; + // return true on success. 
+ bool EncodeTo(std::string* dst) const; Status DecodeFrom(const Slice& src); + typedef std::set> DeletedFileSet; + + const DeletedFileSet& GetDeletedFiles() { return deleted_files_; } + const std::vector>& GetNewFiles() { + return new_files_; + } + std::string DebugString(bool hex_key = false) const; private: friend class VersionSet; friend class Version; - typedef std::set< std::pair> DeletedFileSet; - bool GetLevel(Slice* input, int* level, const char** msg); int max_level_; diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 850f242c1..ec123d2c1 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -26,11 +26,12 @@ class VersionEditTest { }; TEST(VersionEditTest, EncodeDecode) { static const uint64_t kBig = 1ull << 50; + static const uint32_t kBig32Bit = 1ull << 30; VersionEdit edit; for (int i = 0; i < 4; i++) { TestEncodeDecode(edit); - edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, 0, + edit.AddFile(3, kBig + 300 + i, kBig32Bit + 400 + i, 0, InternalKey("foo", kBig + 500 + i, kTypeValue), InternalKey("zoo", kBig + 600 + i, kTypeDeletion), kBig + 500 + i, kBig + 600 + i); @@ -44,6 +45,16 @@ TEST(VersionEditTest, EncodeDecode) { TestEncodeDecode(edit); } +TEST(VersionEditTest, EncodeEmptyFile) { + VersionEdit edit; + edit.AddFile(0, 0, 0, 0, + InternalKey(), + InternalKey(), + 0, 0); + std::string buffer; + ASSERT_TRUE(!edit.EncodeTo(&buffer)); +} + TEST(VersionEditTest, ColumnFamilyTest) { VersionEdit edit; edit.SetColumnFamily(2); diff --git a/db/version_set.cc b/db/version_set.cc index 3a1545853..09f45b7dc 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -9,15 +9,19 @@ #include "db/version_set.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include +#include #include #include #include #include #include #include -#include +#include #include "db/filename.h" #include "db/log_reader.h" @@ -26,6 +30,8 @@ #include "db/merge_context.h" #include "db/table_cache.h" #include "db/compaction.h" 
+#include "db/version_builder.h" +#include "db/writebuffer.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "table/table_reader.h" @@ -34,6 +40,7 @@ #include "table/format.h" #include "table/plain_table_factory.h" #include "table/meta_blocks.h" +#include "table/get_context.h" #include "util/coding.h" #include "util/logging.h" #include "util/stop_watch.h" @@ -42,10 +49,10 @@ namespace rocksdb { namespace { -// Find File in FileLevel data structure +// Find File in LevelFilesBrief data structure // Within an index range defined by left and right int FindFileInRange(const InternalKeyComparator& icmp, - const FileLevel& file_level, + const LevelFilesBrief& file_level, const Slice& key, uint32_t left, uint32_t right) { @@ -65,27 +72,6 @@ int FindFileInRange(const InternalKeyComparator& icmp, return right; } -bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) { - if (a->smallest_seqno != b->smallest_seqno) { - return a->smallest_seqno > b->smallest_seqno; - } - if (a->largest_seqno != b->largest_seqno) { - return a->largest_seqno > b->largest_seqno; - } - // Break ties by file number - return a->fd.GetNumber() > b->fd.GetNumber(); -} - -bool BySmallestKey(FileMetaData* a, FileMetaData* b, - const InternalKeyComparator* cmp) { - int r = cmp->Compare(a->smallest, b->smallest); - if (r != 0) { - return (r < 0); - } - // Break ties by file number - return (a->fd.GetNumber() < b->fd.GetNumber()); -} - // Class to help choose the next file to search for the particular key. // Searches and returns files level by level. 
// We can search level-by-level since entries never hop across @@ -98,19 +84,20 @@ class FilePicker { std::vector* files, const Slice& user_key, const Slice& ikey, - autovector* file_levels, + autovector* file_levels, unsigned int num_levels, FileIndexer* file_indexer, const Comparator* user_comparator, const InternalKeyComparator* internal_comparator) : num_levels_(num_levels), curr_level_(-1), + hit_file_level_(-1), search_left_bound_(0), search_right_bound_(FileIndexer::kLevelMaxIndex), #ifndef NDEBUG files_(files), #endif - file_levels_(file_levels), + level_files_brief_(file_levels), user_key_(user_key), ikey_(ikey), file_indexer_(file_indexer), @@ -120,8 +107,8 @@ class FilePicker { search_ended_ = !PrepareNextLevel(); if (!search_ended_) { // Prefetch Level 0 table data to avoid cache miss if possible. - for (unsigned int i = 0; i < (*file_levels_)[0].num_files; ++i) { - auto* r = (*file_levels_)[0].files[i].fd.table_reader; + for (unsigned int i = 0; i < (*level_files_brief_)[0].num_files; ++i) { + auto* r = (*level_files_brief_)[0].files[i].fd.table_reader; if (r) { r->Prepare(ikey); } @@ -134,6 +121,7 @@ class FilePicker { while (curr_index_in_curr_level_ < curr_file_level_->num_files) { // Loops over all files in current level. 
FdWithKeyRange* f = &curr_file_level_->files[curr_index_in_curr_level_]; + hit_file_level_ = curr_level_; int cmp_largest = -1; // Do key range filtering of files or/and fractional cascading if: @@ -213,17 +201,22 @@ class FilePicker { return nullptr; } + // getter for current file level + // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts + unsigned int GetHitFileLevel() { return hit_file_level_; } + private: unsigned int num_levels_; unsigned int curr_level_; - int search_left_bound_; - int search_right_bound_; + unsigned int hit_file_level_; + int32_t search_left_bound_; + int32_t search_right_bound_; #ifndef NDEBUG std::vector* files_; #endif - autovector* file_levels_; + autovector* level_files_brief_; bool search_ended_; - FileLevel* curr_file_level_; + LevelFilesBrief* curr_file_level_; unsigned int curr_index_in_curr_level_; unsigned int start_index_in_curr_level_; Slice user_key_; @@ -240,7 +233,7 @@ class FilePicker { bool PrepareNextLevel() { curr_level_++; while (curr_level_ < num_levels_) { - curr_file_level_ = &(*file_levels_)[curr_level_]; + curr_file_level_ = &(*level_files_brief_)[curr_level_]; if (curr_file_level_->num_files == 0) { // When current level is empty, the search bound generated from upper // level must be [0, -1] or [0, FileIndexer::kLevelMaxIndex] if it is @@ -273,11 +266,13 @@ class FilePicker { start_index = search_left_bound_; } else if (search_left_bound_ < search_right_bound_) { if (search_right_bound_ == FileIndexer::kLevelMaxIndex) { - search_right_bound_ = curr_file_level_->num_files - 1; + search_right_bound_ = + static_cast(curr_file_level_->num_files) - 1; } - start_index = FindFileInRange(*internal_comparator_, - *curr_file_level_, ikey_, - search_left_bound_, search_right_bound_); + start_index = + FindFileInRange(*internal_comparator_, *curr_file_level_, ikey_, + static_cast(search_left_bound_), + static_cast(search_right_bound_)); } else { // search_left_bound > search_right_bound, key does not exist in // this 
level. Since no comparision is done in this level, it will @@ -301,6 +296,8 @@ class FilePicker { }; } // anonymous namespace +VersionStorageInfo::~VersionStorageInfo() { delete[] files_; } + Version::~Version() { assert(refs_ == 0); @@ -309,9 +306,9 @@ Version::~Version() { next_->prev_ = prev_; // Drop references to files - for (int level = 0; level < num_levels_; level++) { - for (size_t i = 0; i < files_[level].size(); i++) { - FileMetaData* f = files_[level][i]; + for (int level = 0; level < storage_info_.num_levels_; level++) { + for (size_t i = 0; i < storage_info_.files_[level].size(); i++) { + FileMetaData* f = storage_info_.files_[level][i]; assert(f->refs > 0); f->refs--; if (f->refs <= 0) { @@ -323,16 +320,16 @@ Version::~Version() { } } } - delete[] files_; } int FindFile(const InternalKeyComparator& icmp, - const FileLevel& file_level, + const LevelFilesBrief& file_level, const Slice& key) { - return FindFileInRange(icmp, file_level, key, 0, file_level.num_files); + return FindFileInRange(icmp, file_level, key, 0, + static_cast(file_level.num_files)); } -void DoGenerateFileLevel(FileLevel* file_level, +void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, const std::vector& files, Arena* arena) { assert(file_level); @@ -379,7 +376,7 @@ static bool BeforeFile(const Comparator* ucmp, bool SomeFileOverlapsRange( const InternalKeyComparator& icmp, bool disjoint_sorted_files, - const FileLevel& file_level, + const LevelFilesBrief& file_level, const Slice* smallest_user_key, const Slice* largest_user_key) { const Comparator* ucmp = icmp.user_comparator(); @@ -413,18 +410,20 @@ bool SomeFileOverlapsRange( return !BeforeFile(ucmp, largest_user_key, &file_level.files[index]); } +namespace { + // An internal iterator. For a given version/level pair, yields // information about the files in the level. 
For a given entry, key() // is the largest key that occurs in the file, and value() is an // 16-byte value containing the file number and file size, both // encoded using EncodeFixed64. -class Version::LevelFileNumIterator : public Iterator { +class LevelFileNumIterator : public Iterator { public: LevelFileNumIterator(const InternalKeyComparator& icmp, - const FileLevel* flevel) + const LevelFilesBrief* flevel) : icmp_(icmp), flevel_(flevel), - index_(flevel->num_files), + index_(static_cast(flevel->num_files)), current_value_(0, 0, 0) { // Marks as invalid } virtual bool Valid() const { @@ -435,7 +434,9 @@ class Version::LevelFileNumIterator : public Iterator { } virtual void SeekToFirst() { index_ = 0; } virtual void SeekToLast() { - index_ = (flevel_->num_files == 0) ? 0 : flevel_->num_files - 1; + index_ = (flevel_->num_files == 0) + ? 0 + : static_cast(flevel_->num_files) - 1; } virtual void Next() { assert(Valid()); @@ -444,7 +445,7 @@ class Version::LevelFileNumIterator : public Iterator { virtual void Prev() { assert(Valid()); if (index_ == 0) { - index_ = flevel_->num_files; // Marks as invalid + index_ = static_cast(flevel_->num_files); // Marks as invalid } else { index_--; } @@ -464,12 +465,12 @@ class Version::LevelFileNumIterator : public Iterator { virtual Status status() const { return Status::OK(); } private: const InternalKeyComparator icmp_; - const FileLevel* flevel_; + const LevelFilesBrief* flevel_; uint32_t index_; mutable FileDescriptor current_value_; }; -class Version::LevelFileIteratorState : public TwoLevelIteratorState { +class LevelFileIteratorState : public TwoLevelIteratorState { public: LevelFileIteratorState(TableCache* table_cache, const ReadOptions& read_options, const EnvOptions& env_options, @@ -505,13 +506,37 @@ class Version::LevelFileIteratorState : public TwoLevelIteratorState { bool for_compaction_; }; +// A wrapper of version builder which references the current version in +// constructor and unref it in the destructor. 
+// Both of the constructor and destructor need to be called inside DB Mutex. +class BaseReferencedVersionBuilder { + public: + explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd) + : version_builder_(new VersionBuilder( + cfd->current()->version_set()->env_options(), cfd->table_cache(), + cfd->current()->storage_info())), + version_(cfd->current()) { + version_->Ref(); + } + ~BaseReferencedVersionBuilder() { + delete version_builder_; + version_->Unref(); + } + VersionBuilder* version_builder() { return version_builder_; } + + private: + VersionBuilder* version_builder_; + Version* version_; +}; +} // anonymous namespace + Status Version::GetTableProperties(std::shared_ptr* tp, const FileMetaData* file_meta, const std::string* fname) { auto table_cache = cfd_->table_cache(); - auto options = cfd_->options(); + auto ioptions = cfd_->ioptions(); Status s = table_cache->GetTableProperties( - vset_->storage_options_, cfd_->internal_comparator(), file_meta->fd, + vset_->env_options_, cfd_->internal_comparator(), file_meta->fd, tp, true /* no io */); if (s.ok()) { return s; @@ -527,13 +552,13 @@ Status Version::GetTableProperties(std::shared_ptr* tp, // directly from the properties block in the file. 
std::unique_ptr file; if (fname != nullptr) { - s = options->env->NewRandomAccessFile( - *fname, &file, vset_->storage_options_); + s = ioptions->env->NewRandomAccessFile( + *fname, &file, vset_->env_options_); } else { - s = options->env->NewRandomAccessFile( - TableFileName(vset_->options_->db_paths, file_meta->fd.GetNumber(), + s = ioptions->env->NewRandomAccessFile( + TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(), file_meta->fd.GetPathId()), - &file, vset_->storage_options_); + &file, vset_->env_options_); } if (!s.ok()) { return s; @@ -545,21 +570,21 @@ Status Version::GetTableProperties(std::shared_ptr* tp, s = ReadTableProperties( file.get(), file_meta->fd.GetFileSize(), Footer::kInvalidTableMagicNumber /* table's magic number */, - vset_->env_, options->info_log.get(), &raw_table_properties); + vset_->env_, ioptions->info_log, &raw_table_properties); if (!s.ok()) { return s; } - RecordTick(options->statistics.get(), NUMBER_DIRECT_LOAD_TABLE_PROPERTIES); + RecordTick(ioptions->statistics, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES); *tp = std::shared_ptr(raw_table_properties); return s; } Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { - for (int level = 0; level < num_levels_; level++) { - for (const auto& file_meta : files_[level]) { + for (int level = 0; level < storage_info_.num_levels_; level++) { + for (const auto& file_meta : storage_info_.files_[level]) { auto fname = - TableFileName(vset_->options_->db_paths, file_meta->fd.GetNumber(), + TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(), file_meta->fd.GetPathId()); // 1. If the table is already present in table cache, load table // properties from there. 
@@ -578,55 +603,97 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { size_t Version::GetMemoryUsageByTableReaders() { size_t total_usage = 0; - for (auto& file_level : file_levels_) { + for (auto& file_level : storage_info_.level_files_brief_) { for (size_t i = 0; i < file_level.num_files; i++) { total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader( - vset_->storage_options_, cfd_->internal_comparator(), + vset_->env_options_, cfd_->internal_comparator(), file_level.files[i].fd); } } return total_usage; } -uint64_t Version::GetEstimatedActiveKeys() { +void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) { + assert(cf_meta); + assert(cfd_); + + cf_meta->name = cfd_->GetName(); + cf_meta->size = 0; + cf_meta->file_count = 0; + cf_meta->levels.clear(); + + auto* ioptions = cfd_->ioptions(); + auto* vstorage = storage_info(); + + for (int level = 0; level < cfd_->NumberLevels(); level++) { + uint64_t level_size = 0; + cf_meta->file_count += vstorage->LevelFiles(level).size(); + std::vector files; + for (const auto& file : vstorage->LevelFiles(level)) { + uint32_t path_id = file->fd.GetPathId(); + std::string file_path; + if (path_id < ioptions->db_paths.size()) { + file_path = ioptions->db_paths[path_id].path; + } else { + assert(!ioptions->db_paths.empty()); + file_path = ioptions->db_paths.back().path; + } + files.emplace_back( + MakeTableFileName("", file->fd.GetNumber()), + file_path, + file->fd.GetFileSize(), + file->smallest_seqno, + file->largest_seqno, + file->smallest.user_key().ToString(), + file->largest.user_key().ToString(), + file->being_compacted); + level_size += file->fd.GetFileSize(); + } + cf_meta->levels.emplace_back( + level, level_size, std::move(files)); + cf_meta->size += level_size; + } +} + + +uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const { // Estimation will be not accurate when: // (1) there is merge keys // (2) keys are directly overwritten // (3) deletion on 
non-existing keys - return num_non_deletions_ - num_deletions_; -} + // (4) low number of samples + if (num_samples_ == 0) { + return 0; + } -void Version::AddIterators(const ReadOptions& read_options, - const EnvOptions& soptions, - std::vector* iters) { - // Merge all level zero files together since they may overlap - for (size_t i = 0; i < file_levels_[0].num_files; i++) { - const auto& file = file_levels_[0].files[i]; - iters->push_back(cfd_->table_cache()->NewIterator( - read_options, soptions, cfd_->internal_comparator(), file.fd)); + uint64_t file_count = 0; + for (int level = 0; level < num_levels_; ++level) { + file_count += files_[level].size(); } - // For levels > 0, we can use a concatenating iterator that sequentially - // walks through the non-overlapping files in the level, opening them - // lazily. - for (int level = 1; level < num_levels_; level++) { - if (file_levels_[level].num_files != 0) { - iters->push_back(NewTwoLevelIterator(new LevelFileIteratorState( - cfd_->table_cache(), read_options, soptions, - cfd_->internal_comparator(), false /* for_compaction */, - cfd_->options()->prefix_extractor != nullptr), - new LevelFileNumIterator(cfd_->internal_comparator(), - &file_levels_[level]))); - } + if (num_samples_ < file_count) { + // casting to avoid overflowing + return static_cast(static_cast( + accumulated_num_non_deletions_ - accumulated_num_deletions_) * + static_cast(file_count) / num_samples_); + } else { + return accumulated_num_non_deletions_ - accumulated_num_deletions_; } } void Version::AddIterators(const ReadOptions& read_options, const EnvOptions& soptions, MergeIteratorBuilder* merge_iter_builder) { + assert(storage_info_.finalized_); + + if (storage_info_.num_non_empty_levels() == 0) { + // No file in the Version. 
+ return; + } + // Merge all level zero files together since they may overlap - for (size_t i = 0; i < file_levels_[0].num_files; i++) { - const auto& file = file_levels_[0].files[i]; + for (size_t i = 0; i < storage_info_.LevelFilesBrief(0).num_files; i++) { + const auto& file = storage_info_.LevelFilesBrief(0).files[i]; merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator( read_options, soptions, cfd_->internal_comparator(), file.fd, nullptr, false, merge_iter_builder->GetArena())); @@ -635,163 +702,78 @@ void Version::AddIterators(const ReadOptions& read_options, // For levels > 0, we can use a concatenating iterator that sequentially // walks through the non-overlapping files in the level, opening them // lazily. - for (int level = 1; level < num_levels_; level++) { - if (file_levels_[level].num_files != 0) { + for (int level = 1; level < storage_info_.num_non_empty_levels(); level++) { + if (storage_info_.LevelFilesBrief(level).num_files != 0) { merge_iter_builder->AddIterator(NewTwoLevelIterator( new LevelFileIteratorState( cfd_->table_cache(), read_options, soptions, cfd_->internal_comparator(), false /* for_compaction */, - cfd_->options()->prefix_extractor != nullptr), + cfd_->ioptions()->prefix_extractor != nullptr), new LevelFileNumIterator(cfd_->internal_comparator(), - &file_levels_[level]), merge_iter_builder->GetArena())); + &storage_info_.LevelFilesBrief(level)), + merge_iter_builder->GetArena())); } } } -// Callback from TableCache::Get() -enum SaverState { - kNotFound, - kFound, - kDeleted, - kCorrupt, - kMerge // saver contains the current merge result (the operands) -}; - -namespace version_set { -struct Saver { - SaverState state; - const Comparator* ucmp; - Slice user_key; - bool* value_found; // Is value set correctly? 
Used by KeyMayExist - std::string* value; - const MergeOperator* merge_operator; - // the merge operations encountered; - MergeContext* merge_context; - Logger* logger; - Statistics* statistics; -}; -} // namespace version_set - -// Called from TableCache::Get and Table::Get when file/block in which -// key may exist are not there in TableCache/BlockCache respectively. In this -// case we can't guarantee that key does not exist and are not permitted to do -// IO to be certain.Set the status=kFound and value_found=false to let the -// caller know that key may exist but is not there in memory -static void MarkKeyMayExist(void* arg) { - version_set::Saver* s = reinterpret_cast(arg); - s->state = kFound; - if (s->value_found != nullptr) { - *(s->value_found) = false; - } -} - -static bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, - const Slice& v) { - version_set::Saver* s = reinterpret_cast(arg); - MergeContext* merge_contex = s->merge_context; - std::string merge_result; // temporary area for merge results later - - assert(s != nullptr && merge_contex != nullptr); - - // TODO: Merge? - if (s->ucmp->Compare(parsed_key.user_key, s->user_key) == 0) { - // Key matches. 
Process it - switch (parsed_key.type) { - case kTypeValue: - if (kNotFound == s->state) { - s->state = kFound; - s->value->assign(v.data(), v.size()); - } else if (kMerge == s->state) { - assert(s->merge_operator != nullptr); - s->state = kFound; - if (!s->merge_operator->FullMerge(s->user_key, &v, - merge_contex->GetOperands(), - s->value, s->logger)) { - RecordTick(s->statistics, NUMBER_MERGE_FAILURES); - s->state = kCorrupt; - } - } else { - assert(false); - } - return false; - - case kTypeDeletion: - if (kNotFound == s->state) { - s->state = kDeleted; - } else if (kMerge == s->state) { - s->state = kFound; - if (!s->merge_operator->FullMerge(s->user_key, nullptr, - merge_contex->GetOperands(), - s->value, s->logger)) { - RecordTick(s->statistics, NUMBER_MERGE_FAILURES); - s->state = kCorrupt; - } - } else { - assert(false); - } - return false; - - case kTypeMerge: - assert(s->state == kNotFound || s->state == kMerge); - s->state = kMerge; - merge_contex->PushOperand(v); - return true; - - default: - assert(false); - break; - } +VersionStorageInfo::VersionStorageInfo( + const InternalKeyComparator* internal_comparator, + const Comparator* user_comparator, int levels, + CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage) + : internal_comparator_(internal_comparator), + user_comparator_(user_comparator), + // cfd is nullptr if Version is dummy + num_levels_(levels), + num_non_empty_levels_(0), + file_indexer_(user_comparator), + compaction_style_(compaction_style), + files_(new std::vector[num_levels_]), + files_by_size_(num_levels_), + next_file_to_compact_by_size_(num_levels_), + compaction_score_(num_levels_), + compaction_level_(num_levels_), + accumulated_file_size_(0), + accumulated_raw_key_size_(0), + accumulated_raw_value_size_(0), + accumulated_num_non_deletions_(0), + accumulated_num_deletions_(0), + num_samples_(0), + finalized_(false) { + if (ref_vstorage != nullptr) { + accumulated_file_size_ = ref_vstorage->accumulated_file_size_; + 
accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_; + accumulated_raw_value_size_ = ref_vstorage->accumulated_raw_value_size_; + accumulated_num_non_deletions_ = + ref_vstorage->accumulated_num_non_deletions_; + accumulated_num_deletions_ = ref_vstorage->accumulated_num_deletions_; + num_samples_ = ref_vstorage->num_samples_; } - - // s->state could be Corrupt, merge or notfound - - return false; } -Version::Version(ColumnFamilyData* cfd, VersionSet* vset, +Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset, uint64_t version_number) - : cfd_(cfd), - internal_comparator_((cfd == nullptr) ? nullptr - : &cfd->internal_comparator()), - user_comparator_( - (cfd == nullptr) ? nullptr : internal_comparator_->user_comparator()), - table_cache_((cfd == nullptr) ? nullptr : cfd->table_cache()), - merge_operator_((cfd == nullptr) ? nullptr - : cfd->options()->merge_operator.get()), - info_log_((cfd == nullptr) ? nullptr : cfd->options()->info_log.get()), - db_statistics_((cfd == nullptr) ? nullptr - : cfd->options()->statistics.get()), - // cfd is nullptr if Version is dummy - num_levels_(cfd == nullptr ? 0 : cfd->NumberLevels()), - num_non_empty_levels_(num_levels_), - file_indexer_(cfd == nullptr + : cfd_(column_family_data), + info_log_((cfd_ == nullptr) ? nullptr : cfd_->ioptions()->info_log), + db_statistics_((cfd_ == nullptr) ? nullptr + : cfd_->ioptions()->statistics), + table_cache_((cfd_ == nullptr) ? nullptr : cfd_->table_cache()), + merge_operator_((cfd_ == nullptr) ? nullptr + : cfd_->ioptions()->merge_operator), + storage_info_((cfd_ == nullptr) ? nullptr : &cfd_->internal_comparator(), + (cfd_ == nullptr) ? nullptr : cfd_->user_comparator(), + cfd_ == nullptr ? 0 : cfd_->NumberLevels(), + cfd_ == nullptr ? kCompactionStyleLevel + : cfd_->ioptions()->compaction_style, + (cfd_ == nullptr || cfd_->current() == nullptr) ? 
nullptr - : cfd->internal_comparator().user_comparator()), + : cfd_->current()->storage_info()), vset_(vset), next_(this), prev_(this), refs_(0), - files_(new std::vector[num_levels_]), - files_by_size_(num_levels_), - next_file_to_compact_by_size_(num_levels_), - compaction_score_(num_levels_), - compaction_level_(num_levels_), - version_number_(version_number), - total_file_size_(0), - total_raw_key_size_(0), - total_raw_value_size_(0), - num_non_deletions_(0), - num_deletions_(0) { - if (cfd != nullptr && cfd->current() != nullptr) { - total_file_size_ = cfd->current()->total_file_size_; - total_raw_key_size_ = cfd->current()->total_raw_key_size_; - total_raw_value_size_ = cfd->current()->total_raw_value_size_; - num_non_deletions_ = cfd->current()->num_non_deletions_; - num_deletions_ = cfd->current()->num_deletions_; - } -} + version_number_(version_number) {} -void Version::Get(const ReadOptions& options, +void Version::Get(const ReadOptions& read_options, const LookupKey& k, std::string* value, Status* status, @@ -801,46 +783,52 @@ void Version::Get(const ReadOptions& options, Slice user_key = k.user_key(); assert(status->ok() || status->IsMergeInProgress()); - version_set::Saver saver; - saver.state = status->ok()? kNotFound : kMerge; - saver.ucmp = user_comparator_; - saver.user_key = user_key; - saver.value_found = value_found; - saver.value = value; - saver.merge_operator = merge_operator_; - saver.merge_context = merge_context; - saver.logger = info_log_; - saver.statistics = db_statistics_; - - FilePicker fp(files_, user_key, ikey, &file_levels_, num_non_empty_levels_, - &file_indexer_, user_comparator_, internal_comparator_); + + GetContext get_context( + user_comparator(), merge_operator_, info_log_, db_statistics_, + status->ok() ? 
GetContext::kNotFound : GetContext::kMerge, user_key, + value, value_found, merge_context); + + FilePicker fp( + storage_info_.files_, user_key, ikey, &storage_info_.level_files_brief_, + storage_info_.num_non_empty_levels_, &storage_info_.file_indexer_, + user_comparator(), internal_comparator()); FdWithKeyRange* f = fp.GetNextFile(); while (f != nullptr) { - *status = table_cache_->Get(options, *internal_comparator_, f->fd, ikey, - &saver, SaveValue, MarkKeyMayExist); + *status = table_cache_->Get(read_options, *internal_comparator(), f->fd, + ikey, &get_context); // TODO: examine the behavior for corrupted key if (!status->ok()) { return; } - switch (saver.state) { - case kNotFound: - break; // Keep searching in other files - case kFound: + switch (get_context.State()) { + case GetContext::kNotFound: + // Keep searching in other files + break; + case GetContext::kFound: + if (fp.GetHitFileLevel() == 0) { + RecordTick(db_statistics_, GET_HIT_L0); + } else if (fp.GetHitFileLevel() == 1) { + RecordTick(db_statistics_, GET_HIT_L1); + } else if (fp.GetHitFileLevel() >= 2) { + RecordTick(db_statistics_, GET_HIT_L2_AND_UP); + } return; - case kDeleted: - *status = Status::NotFound(); // Use empty error message for speed + case GetContext::kDeleted: + // Use empty error message for speed + *status = Status::NotFound(); return; - case kCorrupt: + case GetContext::kCorrupt: *status = Status::Corruption("corrupted key for ", user_key); return; - case kMerge: + case GetContext::kMerge: break; } f = fp.GetNextFile(); } - if (kMerge == saver.state) { + if (GetContext::kMerge == get_context.State()) { if (!merge_operator_) { *status = Status::InvalidArgument( "merge_operator is not properly initialized."); @@ -849,7 +837,7 @@ void Version::Get(const ReadOptions& options, // merge_operands are in saver and we hit the beginning of the key history // do a final merge of nullptr and operands; if (merge_operator_->FullMerge(user_key, nullptr, - saver.merge_context->GetOperands(), 
value, + merge_context->GetOperands(), value, info_log_)) { *status = Status::OK(); } else { @@ -862,31 +850,32 @@ void Version::Get(const ReadOptions& options, } } -void Version::GenerateFileLevels() { - file_levels_.resize(num_non_empty_levels_); +void VersionStorageInfo::GenerateLevelFilesBrief() { + level_files_brief_.resize(num_non_empty_levels_); for (int level = 0; level < num_non_empty_levels_; level++) { - DoGenerateFileLevel(&file_levels_[level], files_[level], &arena_); + DoGenerateLevelFilesBrief( + &level_files_brief_[level], files_[level], &arena_); } } -void Version::PrepareApply(std::vector& size_being_compacted) { - UpdateTemporaryStats(); - ComputeCompactionScore(size_being_compacted); - UpdateFilesBySize(); - UpdateNumNonEmptyLevels(); - file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_); - GenerateFileLevels(); +void Version::PrepareApply() { + UpdateAccumulatedStats(); + storage_info_.UpdateFilesBySize(); + storage_info_.UpdateNumNonEmptyLevels(); + storage_info_.GenerateFileIndexer(); + storage_info_.GenerateLevelFilesBrief(); } bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) { - if (file_meta->init_stats_from_file) { + if (file_meta->init_stats_from_file || + file_meta->compensated_file_size > 0) { return false; } std::shared_ptr tp; Status s = GetTableProperties(&tp, file_meta); file_meta->init_stats_from_file = true; if (!s.ok()) { - Log(vset_->options_->info_log, + Log(InfoLogLevel::ERROR_LEVEL, vset_->db_options_->info_log, "Unable to load table properties for file %" PRIu64 " --- %s\n", file_meta->fd.GetNumber(), s.ToString().c_str()); return false; @@ -900,36 +889,71 @@ bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) { return true; } -void Version::UpdateTemporaryStats() { - static const int kDeletionWeightOnCompaction = 2; +void VersionStorageInfo::UpdateAccumulatedStats(FileMetaData* file_meta) { + assert(file_meta->init_stats_from_file); + accumulated_file_size_ += 
file_meta->fd.GetFileSize(); + accumulated_raw_key_size_ += file_meta->raw_key_size; + accumulated_raw_value_size_ += file_meta->raw_value_size; + accumulated_num_non_deletions_ += + file_meta->num_entries - file_meta->num_deletions; + accumulated_num_deletions_ += file_meta->num_deletions; + num_samples_++; +} - // incrementally update the average value size by - // including newly added files into the global stats +void Version::UpdateAccumulatedStats() { + // maximum number of table properties loaded from files. + const int kMaxInitCount = 20; int init_count = 0; - int total_count = 0; - for (int level = 0; level < num_levels_; level++) { - for (auto* file_meta : files_[level]) { + // here only the first kMaxInitCount files which haven't been + // initialized from file will be updated with num_deletions. + // The motivation here is to cap the maximum I/O per Version creation. + // The reason for choosing files from lower-level instead of higher-level + // is that such design is able to propagate the initialization from + // lower-level to higher-level: When the num_deletions of lower-level + // files are updated, it will make the lower-level files have accurate + // compensated_file_size, making lower-level to higher-level compaction + // will be triggered, which creates higher-level files whose num_deletions + // will be updated here. + for (int level = 0; + level < storage_info_.num_levels_ && init_count < kMaxInitCount; + ++level) { + for (auto* file_meta : storage_info_.files_[level]) { if (MaybeInitializeFileMetaData(file_meta)) { // each FileMeta will be initialized only once. 
- total_file_size_ += file_meta->fd.GetFileSize(); - total_raw_key_size_ += file_meta->raw_key_size; - total_raw_value_size_ += file_meta->raw_value_size; - num_non_deletions_ += - file_meta->num_entries - file_meta->num_deletions; - num_deletions_ += file_meta->num_deletions; - init_count++; + storage_info_.UpdateAccumulatedStats(file_meta); + if (++init_count >= kMaxInitCount) { + break; + } } - total_count++; } } + // In case all sampled-files contain only deletion entries, then we + // load the table-property of a file in higher-level to initialize + // that value. + for (int level = storage_info_.num_levels_ - 1; + storage_info_.accumulated_raw_value_size_ == 0 && level >= 0; --level) { + for (int i = static_cast(storage_info_.files_[level].size()) - 1; + storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) { + if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) { + storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]); + } + } + } + + storage_info_.ComputeCompensatedSizes(); +} +void VersionStorageInfo::ComputeCompensatedSizes() { + static const int kDeletionWeightOnCompaction = 2; uint64_t average_value_size = GetAverageValueSize(); // compute the compensated size for (int level = 0; level < num_levels_; level++) { for (auto* file_meta : files_[level]) { // Here we only compute compensated_file_size for those file_meta - // which compensated_file_size is uninitialized (== 0). + // which compensated_file_size is uninitialized (== 0). This is true only + // for files that have been created right now and no other thread has + // access to them. That's why we can safely mutate compensated_file_size. 
if (file_meta->compensated_file_size == 0) { file_meta->compensated_file_size = file_meta->fd.GetFileSize() + file_meta->num_deletions * average_value_size * @@ -939,15 +963,20 @@ void Version::UpdateTemporaryStats() { } } -void Version::ComputeCompactionScore( - std::vector& size_being_compacted) { +int VersionStorageInfo::MaxInputLevel() const { + if (compaction_style_ == kCompactionStyleLevel) { + return num_levels() - 2; + } + return 0; +} + +void VersionStorageInfo::ComputeCompactionScore( + const MutableCFOptions& mutable_cf_options, + const CompactionOptionsFIFO& compaction_options_fifo) { double max_score = 0; int max_score_level = 0; - int max_input_level = - cfd_->compaction_picker()->MaxInputLevel(NumberLevels()); - - for (int level = 0; level <= max_input_level; level++) { + for (int level = 0; level <= MaxInputLevel(); level++) { double score; if (level == 0) { // We treat level-0 specially by bounding the number of files @@ -969,24 +998,29 @@ void Version::ComputeCompactionScore( numfiles++; } } - if (cfd_->options()->compaction_style == kCompactionStyleFIFO) { + if (compaction_style_ == kCompactionStyleFIFO) { score = static_cast(total_size) / - cfd_->options()->compaction_options_fifo.max_table_files_size; - } else if (numfiles >= cfd_->options()->level0_stop_writes_trigger) { + compaction_options_fifo.max_table_files_size; + } else if (numfiles >= mutable_cf_options.level0_stop_writes_trigger) { // If we are slowing down writes, then we better compact that first score = 1000000; - } else if (numfiles >= cfd_->options()->level0_slowdown_writes_trigger) { + } else if (numfiles >= + mutable_cf_options.level0_slowdown_writes_trigger) { score = 10000; } else { score = static_cast(numfiles) / - cfd_->options()->level0_file_num_compaction_trigger; + mutable_cf_options.level0_file_num_compaction_trigger; } } else { // Compute the ratio of current size to size limit. 
- const uint64_t level_bytes = - TotalCompensatedFileSize(files_[level]) - size_being_compacted[level]; - score = static_cast(level_bytes) / - cfd_->compaction_picker()->MaxBytesForLevel(level); + uint64_t level_bytes_no_compacting = 0; + for (auto f : files_[level]) { + if (f && f->being_compacted == false) { + level_bytes_no_compacting += f->compensated_file_size; + } + } + score = static_cast(level_bytes_no_compacting) / + mutable_cf_options.MaxBytesForLevel(level); if (max_score < score) { max_score = score; max_score_level = level; @@ -1002,8 +1036,8 @@ void Version::ComputeCompactionScore( // sort all the levels based on their score. Higher scores get listed // first. Use bubble sort because the number of entries are small. - for (int i = 0; i < NumberLevels() - 2; i++) { - for (int j = i + 1; j < NumberLevels() - 1; j++) { + for (int i = 0; i < num_levels() - 2; i++) { + for (int j = i + 1; j < num_levels() - 1; j++) { if (compaction_score_[i] < compaction_score_[j]) { double score = compaction_score_[i]; int level = compaction_level_[i]; @@ -1017,16 +1051,35 @@ void Version::ComputeCompactionScore( } namespace { + +// used to sort files by size +struct Fsize { + int index; + FileMetaData* file; +}; + // Compator that is used to sort files based on their size // In normal mode: descending size -bool CompareCompensatedSizeDescending(const Version::Fsize& first, - const Version::Fsize& second) { +bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) { return (first.file->compensated_file_size > second.file->compensated_file_size); } + } // anonymous namespace -void Version::UpdateNumNonEmptyLevels() { +void VersionStorageInfo::AddFile(int level, FileMetaData* f) { + assert(level < num_levels()); + auto* level_files = &files_[level]; + // Must not overlap + assert(level <= 0 || level_files->empty() || + internal_comparator_->Compare( + (*level_files)[level_files->size() - 1]->largest, f->smallest) < + 0); + f->refs++; + 
level_files->push_back(f); +} + +void VersionStorageInfo::UpdateNumNonEmptyLevels() { num_non_empty_levels_ = num_levels_; for (int i = num_levels_ - 1; i >= 0; i--) { if (files_[i].size() != 0) { @@ -1037,14 +1090,14 @@ void Version::UpdateNumNonEmptyLevels() { } } -void Version::UpdateFilesBySize() { - if (cfd_->options()->compaction_style == kCompactionStyleFIFO || - cfd_->options()->compaction_style == kCompactionStyleUniversal) { +void VersionStorageInfo::UpdateFilesBySize() { + if (compaction_style_ == kCompactionStyleFIFO || + compaction_style_ == kCompactionStyleUniversal) { // don't need this return; } // No need to sort the highest level because it is never compacted. - for (int level = 0; level < NumberLevels() - 1; level++) { + for (int level = 0; level < num_levels() - 1; level++) { const std::vector& files = files_[level]; auto& files_by_size = files_by_size_[level]; assert(files_by_size.size() == 0); @@ -1057,7 +1110,7 @@ void Version::UpdateFilesBySize() { } // sort the top number_of_files_to_sort_ based on file size - size_t num = Version::number_of_files_to_sort_; + size_t num = VersionStorageInfo::kNumberFilesToSort; if (num > temp.size()) { num = temp.size(); } @@ -1088,35 +1141,16 @@ bool Version::Unref() { return false; } -bool Version::NeedsCompaction() const { - // In universal compaction case, this check doesn't really - // check the compaction condition, but checks num of files threshold - // only. We are not going to miss any compaction opportunity - // but it's likely that more compactions are scheduled but - // ending up with nothing to do. We can improve it later. - // TODO(sdong): improve this function to be accurate for universal - // compactions. 
- int max_input_level = - cfd_->compaction_picker()->MaxInputLevel(NumberLevels()); - - for (int i = 0; i <= max_input_level; i++) { - if (compaction_score_[i] >= 1) { - return true; - } - } - return false; -} - -bool Version::OverlapInLevel(int level, - const Slice* smallest_user_key, - const Slice* largest_user_key) { - return SomeFileOverlapsRange(cfd_->internal_comparator(), (level > 0), - file_levels_[level], smallest_user_key, +bool VersionStorageInfo::OverlapInLevel(int level, + const Slice* smallest_user_key, + const Slice* largest_user_key) { + return SomeFileOverlapsRange(*internal_comparator_, (level > 0), + level_files_brief_[level], smallest_user_key, largest_user_key); } -int Version::PickLevelForMemTableOutput( - const Slice& smallest_user_key, +int VersionStorageInfo::PickLevelForMemTableOutput( + const MutableCFOptions& mutable_cf_options, const Slice& smallest_user_key, const Slice& largest_user_key) { int level = 0; if (!OverlapInLevel(0, &smallest_user_key, &largest_user_key)) { @@ -1125,8 +1159,8 @@ int Version::PickLevelForMemTableOutput( InternalKey start(smallest_user_key, kMaxSequenceNumber, kValueTypeForSeek); InternalKey limit(largest_user_key, 0, static_cast(0)); std::vector overlaps; - int max_mem_compact_level = cfd_->options()->max_mem_compaction_level; - while (max_mem_compact_level > 0 && level < max_mem_compact_level) { + while (mutable_cf_options.max_mem_compaction_level > 0 && + level < mutable_cf_options.max_mem_compaction_level) { if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) { break; } @@ -1136,7 +1170,7 @@ int Version::PickLevelForMemTableOutput( } GetOverlappingInputs(level + 2, &start, &limit, &overlaps); const uint64_t sum = TotalFileSize(overlaps); - if (sum > cfd_->compaction_picker()->MaxGrandParentOverlapBytes(level)) { + if (sum > mutable_cf_options.MaxGrandParentOverlapBytes(level)) { break; } level++; @@ -1150,12 +1184,9 @@ int Version::PickLevelForMemTableOutput( // If hint_index is 
specified, then it points to a file in the // overlapping range. // The file_index returns a pointer to any file in an overlapping range. -void Version::GetOverlappingInputs(int level, - const InternalKey* begin, - const InternalKey* end, - std::vector* inputs, - int hint_index, - int* file_index) { +void VersionStorageInfo::GetOverlappingInputs( + int level, const InternalKey* begin, const InternalKey* end, + std::vector* inputs, int hint_index, int* file_index) { inputs->clear(); Slice user_begin, user_end; if (begin != nullptr) { @@ -1167,14 +1198,14 @@ void Version::GetOverlappingInputs(int level, if (file_index) { *file_index = -1; } - const Comparator* user_cmp = cfd_->internal_comparator().user_comparator(); + const Comparator* user_cmp = user_comparator_; if (begin != nullptr && end != nullptr && level > 0) { GetOverlappingInputsBinarySearch(level, user_begin, user_end, inputs, hint_index, file_index); return; } - for (size_t i = 0; i < file_levels_[level].num_files; ) { - FdWithKeyRange* f = &(file_levels_[level].files[i++]); + for (size_t i = 0; i < level_files_brief_[level].num_files; ) { + FdWithKeyRange* f = &(level_files_brief_[level].files[i++]); const Slice file_start = ExtractUserKey(f->smallest_key); const Slice file_limit = ExtractUserKey(f->largest_key); if (begin != nullptr && user_cmp->Compare(file_limit, user_begin) < 0) { @@ -1197,7 +1228,7 @@ void Version::GetOverlappingInputs(int level, i = 0; } } else if (file_index) { - *file_index = i-1; + *file_index = static_cast(i) - 1; } } } @@ -1207,19 +1238,15 @@ void Version::GetOverlappingInputs(int level, // Employ binary search to find at least one file that overlaps the // specified range. From that file, iterate backwards and // forwards to find all overlapping files. 
-void Version::GetOverlappingInputsBinarySearch( - int level, - const Slice& user_begin, - const Slice& user_end, - std::vector* inputs, - int hint_index, - int* file_index) { +void VersionStorageInfo::GetOverlappingInputsBinarySearch( + int level, const Slice& user_begin, const Slice& user_end, + std::vector* inputs, int hint_index, int* file_index) { assert(level > 0); int min = 0; int mid = 0; - int max = files_[level].size() -1; + int max = static_cast(files_[level].size()) - 1; bool foundOverlap = false; - const Comparator* user_cmp = cfd_->internal_comparator().user_comparator(); + const Comparator* user_cmp = user_comparator_; // if the caller already knows the index of a file that has overlap, // then we can skip the binary search. @@ -1230,7 +1257,7 @@ void Version::GetOverlappingInputsBinarySearch( while (!foundOverlap && min <= max) { mid = (min + max)/2; - FdWithKeyRange* f = &(file_levels_[level].files[mid]); + FdWithKeyRange* f = &(level_files_brief_[level].files[mid]); const Slice file_start = ExtractUserKey(f->smallest_key); const Slice file_limit = ExtractUserKey(f->largest_key); if (user_cmp->Compare(file_limit, user_begin) < 0) { @@ -1259,19 +1286,16 @@ void Version::GetOverlappingInputsBinarySearch( // overlaps the specified range. From that file, iterate backward // and forward to find all overlapping files. 
// Use FileLevel in searching, make it faster -void Version::ExtendOverlappingInputs( - int level, - const Slice& user_begin, - const Slice& user_end, - std::vector* inputs, - unsigned int midIndex) { - - const Comparator* user_cmp = cfd_->internal_comparator().user_comparator(); - const FdWithKeyRange* files = file_levels_[level].files; +void VersionStorageInfo::ExtendOverlappingInputs( + int level, const Slice& user_begin, const Slice& user_end, + std::vector* inputs, unsigned int midIndex) { + + const Comparator* user_cmp = user_comparator_; + const FdWithKeyRange* files = level_files_brief_[level].files; #ifndef NDEBUG { // assert that the file at midIndex overlaps with the range - assert(midIndex < file_levels_[level].num_files); + assert(midIndex < level_files_brief_[level].num_files); const FdWithKeyRange* f = &files[midIndex]; const Slice fstart = ExtractUserKey(f->smallest_key); const Slice flimit = ExtractUserKey(f->largest_key); @@ -1298,7 +1322,8 @@ void Version::ExtendOverlappingInputs( } } // check forward from 'mid+1' to higher indices - for (unsigned int i = midIndex+1; i < file_levels_[level].num_files; i++) { + for (unsigned int i = midIndex+1; + i < level_files_brief_[level].num_files; i++) { const FdWithKeyRange* f = &files[i]; const Slice file_start = ExtractUserKey(f->smallest_key); if (user_cmp->Compare(file_start, user_end) <= 0) { @@ -1321,9 +1346,8 @@ void Version::ExtendOverlappingInputs( // an overlapping user key to the file "just outside" of it (i.e. // just after the last file, or just before the first file) // REQUIRES: "*inputs" is a sorted list of non-overlapping files -bool Version::HasOverlappingUserKey( - const std::vector* inputs, - int level) { +bool VersionStorageInfo::HasOverlappingUserKey( + const std::vector* inputs, int level) { // If inputs empty, there is no overlap. // If level == 0, it is assumed that all needed files were already included. 
@@ -1331,15 +1355,15 @@ bool Version::HasOverlappingUserKey( return false; } - const Comparator* user_cmp = cfd_->internal_comparator().user_comparator(); - const FileLevel& file_level = file_levels_[level]; - const FdWithKeyRange* files = file_levels_[level].files; + const Comparator* user_cmp = user_comparator_; + const rocksdb::LevelFilesBrief& file_level = level_files_brief_[level]; + const FdWithKeyRange* files = level_files_brief_[level].files; const size_t kNumFiles = file_level.num_files; // Check the last file in inputs against the file after it - size_t last_file = FindFile(cfd_->internal_comparator(), file_level, + size_t last_file = FindFile(*internal_comparator_, file_level, inputs->back()->largest.Encode()); - assert(0 <= last_file && last_file < kNumFiles); // File should exist! + assert(last_file < kNumFiles); // File should exist! if (last_file < kNumFiles-1) { // If not the last file const Slice last_key_in_input = ExtractUserKey( files[last_file].largest_key); @@ -1352,9 +1376,9 @@ bool Version::HasOverlappingUserKey( } // Check the first file in inputs against the file just before it - size_t first_file = FindFile(cfd_->internal_comparator(), file_level, + size_t first_file = FindFile(*internal_comparator_, file_level, inputs->front()->smallest.Encode()); - assert(0 <= first_file && first_file <= last_file); // File should exist! + assert(first_file <= last_file); // File should exist! 
if (first_file > 0) { // If not first file const Slice& first_key_in_input = ExtractUserKey( files[first_file].smallest_key); @@ -1369,15 +1393,16 @@ bool Version::HasOverlappingUserKey( return false; } -int64_t Version::NumLevelBytes(int level) const { +uint64_t VersionStorageInfo::NumLevelBytes(int level) const { assert(level >= 0); - assert(level < NumberLevels()); + assert(level < num_levels()); return TotalFileSize(files_[level]); } -const char* Version::LevelSummary(LevelSummaryStorage* scratch) const { +const char* VersionStorageInfo::LevelSummary( + LevelSummaryStorage* scratch) const { int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files["); - for (int i = 0; i < NumberLevels(); i++) { + for (int i = 0; i < num_levels(); i++) { int sz = sizeof(scratch->buffer) - len; int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size())); if (ret < 0 || ret >= sz) break; @@ -1391,8 +1416,8 @@ const char* Version::LevelSummary(LevelSummaryStorage* scratch) const { return scratch->buffer; } -const char* Version::LevelFileSummary(FileSummaryStorage* scratch, - int level) const { +const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch, + int level) const { int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); for (const auto& f : files_[level]) { int sz = sizeof(scratch->buffer) - len; @@ -1414,10 +1439,10 @@ const char* Version::LevelFileSummary(FileSummaryStorage* scratch, return scratch->buffer; } -int64_t Version::MaxNextLevelOverlappingBytes() { +int64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() { uint64_t result = 0; std::vector overlaps; - for (int level = 1; level < NumberLevels() - 1; level++) { + for (int level = 1; level < num_levels() - 1; level++) { for (const auto& f : files_[level]) { GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps); const uint64_t sum = TotalFileSize(overlaps); @@ -1430,8 +1455,8 @@ int64_t Version::MaxNextLevelOverlappingBytes() { } 
void Version::AddLiveFiles(std::vector* live) { - for (int level = 0; level < NumberLevels(); level++) { - const std::vector& files = files_[level]; + for (int level = 0; level < storage_info_.num_levels(); level++) { + const std::vector& files = storage_info_.files_[level]; for (const auto& file : files) { live->push_back(file->fd); } @@ -1440,7 +1465,7 @@ void Version::AddLiveFiles(std::vector* live) { std::string Version::DebugString(bool hex) const { std::string r; - for (int level = 0; level < num_levels_; level++) { + for (int level = 0; level < storage_info_.num_levels_; level++) { // E.g., // --- level 1 --- // 17:123['a' .. 'd'] @@ -1450,7 +1475,7 @@ std::string Version::DebugString(bool hex) const { r.append(" --- version# "); AppendNumberTo(&r, version_number_); r.append(" ---\n"); - const std::vector& files = files_[level]; + const std::vector& files = storage_info_.files_[level]; for (size_t i = 0; i < files.size(); i++) { r.push_back(' '); AppendNumberTo(&r, files[i]->fd.GetNumber()); @@ -1470,270 +1495,25 @@ std::string Version::DebugString(bool hex) const { struct VersionSet::ManifestWriter { Status status; bool done; - port::CondVar cv; + InstrumentedCondVar cv; ColumnFamilyData* cfd; VersionEdit* edit; - explicit ManifestWriter(port::Mutex* mu, ColumnFamilyData* cfd, + explicit ManifestWriter(InstrumentedMutex* mu, ColumnFamilyData* _cfd, VersionEdit* e) - : done(false), cv(mu), cfd(cfd), edit(e) {} + : done(false), cv(mu), cfd(_cfd), edit(e) {} }; -// A helper class so we can efficiently apply a whole sequence -// of edits to a particular state without creating intermediate -// Versions that contain full copies of the intermediate state. 
-class VersionSet::Builder { - private: - // Helper to sort v->files_ - // kLevel0 -- NewestFirstBySeqNo - // kLevelNon0 -- BySmallestKey - struct FileComparator { - enum SortMethod { - kLevel0 = 0, - kLevelNon0 = 1, - } sort_method; - const InternalKeyComparator* internal_comparator; - - bool operator()(FileMetaData* f1, FileMetaData* f2) const { - switch (sort_method) { - case kLevel0: - return NewestFirstBySeqNo(f1, f2); - case kLevelNon0: - return BySmallestKey(f1, f2, internal_comparator); - } - assert(false); - return false; - } - }; - - typedef std::set FileSet; - struct LevelState { - std::set deleted_files; - FileSet* added_files; - }; - - ColumnFamilyData* cfd_; - Version* base_; - LevelState* levels_; - FileComparator level_zero_cmp_; - FileComparator level_nonzero_cmp_; - - public: - Builder(ColumnFamilyData* cfd) : cfd_(cfd), base_(cfd->current()) { - base_->Ref(); - levels_ = new LevelState[base_->NumberLevels()]; - level_zero_cmp_.sort_method = FileComparator::kLevel0; - level_nonzero_cmp_.sort_method = FileComparator::kLevelNon0; - level_nonzero_cmp_.internal_comparator = &cfd->internal_comparator(); - - levels_[0].added_files = new FileSet(level_zero_cmp_); - for (int level = 1; level < base_->NumberLevels(); level++) { - levels_[level].added_files = new FileSet(level_nonzero_cmp_); - } - } - - ~Builder() { - for (int level = 0; level < base_->NumberLevels(); level++) { - const FileSet* added = levels_[level].added_files; - std::vector to_unref; - to_unref.reserve(added->size()); - for (FileSet::const_iterator it = added->begin(); - it != added->end(); ++it) { - to_unref.push_back(*it); - } - delete added; - for (uint32_t i = 0; i < to_unref.size(); i++) { - FileMetaData* f = to_unref[i]; - f->refs--; - if (f->refs <= 0) { - if (f->table_reader_handle) { - cfd_->table_cache()->ReleaseHandle(f->table_reader_handle); - f->table_reader_handle = nullptr; - } - delete f; - } - } - } - - delete[] levels_; - base_->Unref(); - } - - void 
CheckConsistency(Version* v) { -#ifndef NDEBUG - // make sure the files are sorted correctly - for (int level = 0; level < v->NumberLevels(); level++) { - for (size_t i = 1; i < v->files_[level].size(); i++) { - auto f1 = v->files_[level][i - 1]; - auto f2 = v->files_[level][i]; - if (level == 0) { - assert(level_zero_cmp_(f1, f2)); - assert(f1->largest_seqno > f2->largest_seqno); - } else { - assert(level_nonzero_cmp_(f1, f2)); - - // Make sure there is no overlap in levels > 0 - if (cfd_->internal_comparator().Compare(f1->largest, f2->smallest) >= - 0) { - fprintf(stderr, "overlapping ranges in same level %s vs. %s\n", - (f1->largest).DebugString().c_str(), - (f2->smallest).DebugString().c_str()); - abort(); - } - } - } - } -#endif - } - - void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number, - int level) { -#ifndef NDEBUG - // a file to be deleted better exist in the previous version - bool found = false; - for (int l = 0; !found && l < base_->NumberLevels(); l++) { - const std::vector& base_files = base_->files_[l]; - for (unsigned int i = 0; i < base_files.size(); i++) { - FileMetaData* f = base_files[i]; - if (f->fd.GetNumber() == number) { - found = true; - break; - } - } - } - // if the file did not exist in the previous version, then it - // is possibly moved from lower level to higher level in current - // version - for (int l = level+1; !found && l < base_->NumberLevels(); l++) { - const FileSet* added = levels_[l].added_files; - for (FileSet::const_iterator added_iter = added->begin(); - added_iter != added->end(); ++added_iter) { - FileMetaData* f = *added_iter; - if (f->fd.GetNumber() == number) { - found = true; - break; - } - } - } - - // maybe this file was added in a previous edit that was Applied - if (!found) { - const FileSet* added = levels_[level].added_files; - for (FileSet::const_iterator added_iter = added->begin(); - added_iter != added->end(); ++added_iter) { - FileMetaData* f = *added_iter; - if (f->fd.GetNumber() == 
number) { - found = true; - break; - } - } - } - if (!found) { - fprintf(stderr, "not found %" PRIu64 "\n", number); - } - assert(found); -#endif - } - - // Apply all of the edits in *edit to the current state. - void Apply(VersionEdit* edit) { - CheckConsistency(base_); - - // Delete files - const VersionEdit::DeletedFileSet& del = edit->deleted_files_; - for (const auto& del_file : del) { - const auto level = del_file.first; - const auto number = del_file.second; - levels_[level].deleted_files.insert(number); - CheckConsistencyForDeletes(edit, number, level); - } - - // Add new files - for (const auto& new_file : edit->new_files_) { - const int level = new_file.first; - FileMetaData* f = new FileMetaData(new_file.second); - f->refs = 1; - - levels_[level].deleted_files.erase(f->fd.GetNumber()); - levels_[level].added_files->insert(f); - } - } - - // Save the current state in *v. - void SaveTo(Version* v) { - CheckConsistency(base_); - CheckConsistency(v); - - for (int level = 0; level < base_->NumberLevels(); level++) { - const auto& cmp = (level == 0) ? level_zero_cmp_ : level_nonzero_cmp_; - // Merge the set of added files with the set of pre-existing files. - // Drop any deleted files. Store the result in *v. 
- const auto& base_files = base_->files_[level]; - auto base_iter = base_files.begin(); - auto base_end = base_files.end(); - const auto& added_files = *levels_[level].added_files; - v->files_[level].reserve(base_files.size() + added_files.size()); - - for (const auto& added : added_files) { - // Add all smaller files listed in base_ - for (auto bpos = std::upper_bound(base_iter, base_end, added, cmp); - base_iter != bpos; - ++base_iter) { - MaybeAddFile(v, level, *base_iter); - } - - MaybeAddFile(v, level, added); - } - - // Add remaining base files - for (; base_iter != base_end; ++base_iter) { - MaybeAddFile(v, level, *base_iter); - } - } - - CheckConsistency(v); - } - - void LoadTableHandlers() { - for (int level = 0; level < cfd_->NumberLevels(); level++) { - for (auto& file_meta : *(levels_[level].added_files)) { - assert (!file_meta->table_reader_handle); - cfd_->table_cache()->FindTable( - base_->vset_->storage_options_, cfd_->internal_comparator(), - file_meta->fd, &file_meta->table_reader_handle, false); - if (file_meta->table_reader_handle != nullptr) { - // Load table_reader - file_meta->fd.table_reader = - cfd_->table_cache()->GetTableReaderFromHandle( - file_meta->table_reader_handle); - } - } - } - } - - void MaybeAddFile(Version* v, int level, FileMetaData* f) { - if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) { - // File is deleted: do nothing - } else { - auto* files = &v->files_[level]; - if (level > 0 && !files->empty()) { - // Must not overlap - assert(cfd_->internal_comparator().Compare( - (*files)[files->size() - 1]->largest, f->smallest) < 0); - } - f->refs++; - files->push_back(f); - } - } -}; - -VersionSet::VersionSet(const std::string& dbname, const DBOptions* options, - const EnvOptions& storage_options, Cache* table_cache) - : column_family_set_(new ColumnFamilySet(dbname, options, storage_options, - table_cache)), - env_(options->env), +VersionSet::VersionSet(const std::string& dbname, const DBOptions* db_options, + 
const EnvOptions& storage_options, Cache* table_cache, + WriteBuffer* write_buffer, + WriteController* write_controller) + : column_family_set_(new ColumnFamilySet( + dbname, db_options, storage_options, table_cache, + write_buffer, write_controller)), + env_(db_options->env), dbname_(dbname), - options_(options), + db_options_(db_options), next_file_number_(2), manifest_file_number_(0), // Filled by Recover() pending_manifest_file_number_(0), @@ -1741,8 +1521,8 @@ VersionSet::VersionSet(const std::string& dbname, const DBOptions* options, prev_log_number_(0), current_version_number_(0), manifest_file_size_(0), - storage_options_(storage_options), - storage_options_compactions_(storage_options_) {} + env_options_(storage_options), + env_options_compactions_(env_options_) {} VersionSet::~VersionSet() { // we need to delete column_family_set_ because its destructor depends on @@ -1756,6 +1536,14 @@ VersionSet::~VersionSet() { void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, Version* v) { + // compute new compaction score + v->storage_info()->ComputeCompactionScore( + *column_family_data->GetLatestMutableCFOptions(), + column_family_data->ioptions()->compaction_options_fifo); + + // Mark v finalized + v->storage_info_.SetFinalized(); + // Make "v" current assert(v->refs_ == 0); Version* current = column_family_data->current(); @@ -1775,16 +1563,17 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, } Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, - VersionEdit* edit, port::Mutex* mu, + const MutableCFOptions& mutable_cf_options, + VersionEdit* edit, InstrumentedMutex* mu, Directory* db_directory, bool new_descriptor_log, - const ColumnFamilyOptions* options) { + const ColumnFamilyOptions* new_cf_options) { mu->AssertHeld(); // column_family_data can be nullptr only if this is column_family_add. 
// in that case, we also need to specify ColumnFamilyOptions if (column_family_data == nullptr) { assert(edit->is_column_family_add_); - assert(options != nullptr); + assert(new_cf_options != nullptr); } // queue our request @@ -1809,7 +1598,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, std::vector batch_edits; Version* v = nullptr; - std::unique_ptr builder(nullptr); + std::unique_ptr builder_guard(nullptr); // process all requests in the queue ManifestWriter* last_writer = &w; @@ -1821,7 +1610,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, batch_edits.push_back(edit); } else { v = new Version(column_family_data, this, current_version_number_++); - builder.reset(new Builder(column_family_data)); + builder_guard.reset(new BaseReferencedVersionBuilder(column_family_data)); + auto* builder = builder_guard->version_builder(); for (const auto& writer : manifest_writers_) { if (writer->edit->IsColumnFamilyManipulation() || writer->cfd->GetID() != column_family_data->GetID()) { @@ -1830,11 +1620,10 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, break; } last_writer = writer; - LogAndApplyHelper(column_family_data, builder.get(), v, last_writer->edit, - mu); + LogAndApplyHelper(column_family_data, builder, v, last_writer->edit, mu); batch_edits.push_back(last_writer->edit); } - builder->SaveTo(v); + builder->SaveTo(v->storage_info()); } // Initialize new descriptor log file if necessary by creating @@ -1844,9 +1633,9 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, assert(pending_manifest_file_number_ == 0); if (!descriptor_log_ || - manifest_file_size_ > options_->max_manifest_file_size) { + manifest_file_size_ > db_options_->max_manifest_file_size) { pending_manifest_file_number_ = NewFileNumber(); - batch_edits.back()->SetNextFile(next_file_number_); + batch_edits.back()->SetNextFile(next_file_number_.load()); new_descriptor_log = true; } else { 
pending_manifest_file_number_ = manifest_file_number_; @@ -1862,35 +1651,29 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, // Unlock during expensive operations. New writes cannot get here // because &w is ensuring that all new writes get queued. { - std::vector size_being_compacted; - if (!edit->IsColumnFamilyManipulation()) { - size_being_compacted.resize(v->NumberLevels() - 1); - // calculate the amount of data being compacted at every level - column_family_data->compaction_picker()->SizeBeingCompacted( - size_being_compacted); - } mu->Unlock(); - if (!edit->IsColumnFamilyManipulation() && options_->max_open_files == -1) { + if (!edit->IsColumnFamilyManipulation() && + db_options_->max_open_files == -1) { // unlimited table cache. Pre-load table handle now. // Need to do it out of the mutex. - builder->LoadTableHandlers(); + builder_guard->version_builder()->LoadTableHandlers(); } // This is fine because everything inside of this block is serialized -- // only one thread can be here at the same time if (new_descriptor_log) { // create manifest file - Log(options_->info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, "Creating manifest %" PRIu64 "\n", pending_manifest_file_number_); unique_ptr descriptor_file; s = env_->NewWritableFile( DescriptorFileName(dbname_, pending_manifest_file_number_), - &descriptor_file, env_->OptimizeForManifestWrite(storage_options_)); + &descriptor_file, env_->OptimizeForManifestWrite(env_options_)); if (s.ok()) { descriptor_file->SetPreallocationBlockSize( - options_->manifest_preallocation_size); + db_options_->manifest_preallocation_size); descriptor_log_.reset(new log::Writer(std::move(descriptor_file))); s = WriteSnapshot(descriptor_log_.get()); } @@ -1898,43 +1681,45 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, if (!edit->IsColumnFamilyManipulation()) { // This is cpu-heavy operations, which should be called outside mutex. 
- v->PrepareApply(size_being_compacted); + v->PrepareApply(); } // Write new record to MANIFEST log if (s.ok()) { for (auto& e : batch_edits) { std::string record; - e->EncodeTo(&record); + if (!e->EncodeTo(&record)) { + s = Status::Corruption( + "Unable to Encode VersionEdit:" + e->DebugString(true)); + break; + } s = descriptor_log_->AddRecord(record); if (!s.ok()) { break; } } if (s.ok()) { - if (options_->use_fsync) { - StopWatch sw(env_, options_->statistics.get(), - MANIFEST_FILE_SYNC_MICROS); - s = descriptor_log_->file()->Fsync(); - } else { - StopWatch sw(env_, options_->statistics.get(), - MANIFEST_FILE_SYNC_MICROS); - s = descriptor_log_->file()->Sync(); - } + s = SyncManifest(env_, db_options_, descriptor_log_->file()); } if (!s.ok()) { - Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str()); + Log(InfoLogLevel::ERROR_LEVEL, db_options_->info_log, + "MANIFEST write: %s\n", s.ToString().c_str()); bool all_records_in = true; for (auto& e : batch_edits) { std::string record; - e->EncodeTo(&record); + if (!e->EncodeTo(&record)) { + s = Status::Corruption( + "Unable to Encode VersionEdit:" + e->DebugString(true)); + all_records_in = false; + break; + } if (!ManifestContains(pending_manifest_file_number_, record)) { all_records_in = false; break; } } if (all_records_in) { - Log(options_->info_log, + Log(InfoLogLevel::WARN_LEVEL, db_options_->info_log, "MANIFEST contains log record despite error; advancing to new " "version to prevent mismatch between in-memory and logged state" " If paranoid is set, then the db is now in readonly mode."); @@ -1947,10 +1732,10 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, // new CURRENT file that points to it. if (s.ok() && new_descriptor_log) { s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_, - db_directory); + db_options_->disableDataSync ? 
nullptr : db_directory); if (s.ok() && pending_manifest_file_number_ > manifest_file_number_) { // delete old manifest file - Log(options_->info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n", manifest_file_number_, pending_manifest_file_number_); // we don't care about an error here, PurgeObsoleteFiles will take care @@ -1964,7 +1749,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, new_manifest_file_size = descriptor_log_->file()->GetFileSize(); } - LogFlush(options_->info_log); + LogFlush(db_options_->info_log); mu->Lock(); } @@ -1973,8 +1758,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, if (edit->is_column_family_add_) { // no group commit on column family add assert(batch_edits.size() == 1); - assert(options != nullptr); - CreateColumnFamily(*options, edit); + assert(new_cf_options != nullptr); + CreateColumnFamily(*new_cf_options, edit); } else if (edit->is_column_family_drop_) { assert(batch_edits.size() == 1); column_family_data->SetDropped(); @@ -2000,12 +1785,13 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, manifest_file_size_ = new_manifest_file_size; prev_log_number_ = edit->prev_log_number_; } else { - Log(options_->info_log, "Error in committing version %lu to [%s]", + Log(InfoLogLevel::ERROR_LEVEL, db_options_->info_log, + "Error in committing version %lu to [%s]", (unsigned long)v->GetVersionNumber(), column_family_data->GetName().c_str()); delete v; if (new_descriptor_log) { - Log(options_->info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n", manifest_file_number_, pending_manifest_file_number_); descriptor_log_.reset(); @@ -2035,7 +1821,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) { assert(edit->IsColumnFamilyManipulation()); - 
edit->SetNextFile(next_file_number_); + edit->SetNextFile(next_file_number_.load()); edit->SetLastSequence(last_sequence_); if (edit->is_column_family_drop_) { // if we drop column family, we have to make sure to save max column family, @@ -2044,21 +1830,21 @@ void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) { } } -void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, Builder* builder, - Version* v, VersionEdit* edit, - port::Mutex* mu) { +void VersionSet::LogAndApplyHelper(ColumnFamilyData* cfd, + VersionBuilder* builder, Version* v, + VersionEdit* edit, InstrumentedMutex* mu) { mu->AssertHeld(); assert(!edit->IsColumnFamilyManipulation()); if (edit->has_log_number_) { assert(edit->log_number_ >= cfd->GetLogNumber()); - assert(edit->log_number_ < next_file_number_); + assert(edit->log_number_ < next_file_number_.load()); } if (!edit->has_prev_log_number_) { edit->SetPrevLogNumber(prev_log_number_); } - edit->SetNextFile(next_file_number_); + edit->SetNextFile(next_file_number_.load()); edit->SetLastSequence(last_sequence_); builder->Apply(edit); @@ -2097,18 +1883,19 @@ Status VersionSet::Recover( return Status::Corruption("CURRENT file corrupted"); } - Log(options_->info_log, "Recovering from manifest file: %s\n", + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, + "Recovering from manifest file: %s\n", manifest_filename.c_str()); manifest_filename = dbname_ + "/" + manifest_filename; unique_ptr manifest_file; s = env_->NewSequentialFile(manifest_filename, &manifest_file, - storage_options_); + env_options_); if (!s.ok()) { return s; } - uint64_t manifest_file_size; - s = env_->GetFileSize(manifest_filename, &manifest_file_size); + uint64_t current_manifest_file_size; + s = env_->GetFileSize(manifest_filename, ¤t_manifest_file_size); if (!s.ok()) { return s; } @@ -2120,9 +1907,9 @@ Status VersionSet::Recover( uint64_t next_file = 0; uint64_t last_sequence = 0; uint64_t log_number = 0; - uint64_t prev_log_number = 0; + uint64_t previous_log_number = 
0; uint32_t max_column_family = 0; - std::unordered_map builders; + std::unordered_map builders; // add default column family auto default_cf_iter = cf_name_to_options.find(kDefaultColumnFamilyName); @@ -2134,7 +1921,7 @@ Status VersionSet::Recover( default_cf_edit.SetColumnFamily(0); ColumnFamilyData* default_cfd = CreateColumnFamily(default_cf_iter->second, &default_cf_edit); - builders.insert({0, new Builder(default_cfd)}); + builders.insert({0, new BaseReferencedVersionBuilder(default_cfd)}); { VersionSet::LogReporter reporter; @@ -2180,7 +1967,8 @@ Status VersionSet::Recover( {edit.column_family_, edit.column_family_name_}); } else { cfd = CreateColumnFamily(cf_options->second, &edit); - builders.insert({edit.column_family_, new Builder(cfd)}); + builders.insert( + {edit.column_family_, new BaseReferencedVersionBuilder(cfd)}); } } else if (edit.is_column_family_drop_) { if (cf_in_builders) { @@ -2213,7 +2001,7 @@ Status VersionSet::Recover( cfd = column_family_set_->GetColumnFamily(edit.column_family_); // this should never happen since cf_in_builders is true assert(cfd != nullptr); - if (edit.max_level_ >= cfd->current()->NumberLevels()) { + if (edit.max_level_ >= cfd->current()->storage_info()->num_levels()) { s = Status::InvalidArgument( "db has more levels than options.num_levels"); break; @@ -2224,13 +2012,13 @@ Status VersionSet::Recover( // to builder auto builder = builders.find(edit.column_family_); assert(builder != builders.end()); - builder->second->Apply(&edit); + builder->second->version_builder()->Apply(&edit); } if (cfd != nullptr) { if (edit.has_log_number_) { if (cfd->GetLogNumber() > edit.log_number_) { - Log(options_->info_log, + Log(InfoLogLevel::WARN_LEVEL, db_options_->info_log, "MANIFEST corruption detected, but ignored - Log numbers in " "records NOT monotonically increasing"); } else { @@ -2248,7 +2036,7 @@ Status VersionSet::Recover( } if (edit.has_prev_log_number_) { - prev_log_number = edit.prev_log_number_; + previous_log_number = 
edit.prev_log_number_; have_prev_log_number = true; } @@ -2278,18 +2066,18 @@ Status VersionSet::Recover( } if (!have_prev_log_number) { - prev_log_number = 0; + previous_log_number = 0; } column_family_set_->UpdateMaxColumnFamily(max_column_family); - MarkFileNumberUsed(prev_log_number); - MarkFileNumberUsed(log_number); + MarkFileNumberUsedDuringRecovery(previous_log_number); + MarkFileNumberUsedDuringRecovery(log_number); } // there were some column families in the MANIFEST that weren't specified // in the argument. This is OK in read_only mode - if (read_only == false && column_families_not_found.size() > 0) { + if (read_only == false && !column_families_not_found.empty()) { std::string list_of_not_found; for (const auto& cf : column_families_not_found) { list_of_not_found += ", " + cf.second; @@ -2304,42 +2092,40 @@ Status VersionSet::Recover( for (auto cfd : *column_family_set_) { auto builders_iter = builders.find(cfd->GetID()); assert(builders_iter != builders.end()); - auto builder = builders_iter->second; + auto* builder = builders_iter->second->version_builder(); - if (options_->max_open_files == -1) { + if (db_options_->max_open_files == -1) { // unlimited table cache. Pre-load table handle now. // Need to do it out of the mutex. 
builder->LoadTableHandlers(); } Version* v = new Version(cfd, this, current_version_number_++); - builder->SaveTo(v); + builder->SaveTo(v->storage_info()); // Install recovered version - std::vector size_being_compacted(v->NumberLevels() - 1); - cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted); - v->PrepareApply(size_being_compacted); + v->PrepareApply(); AppendVersion(cfd, v); } - manifest_file_size_ = manifest_file_size; - next_file_number_ = next_file + 1; + manifest_file_size_ = current_manifest_file_size; + next_file_number_.store(next_file + 1); last_sequence_ = last_sequence; - prev_log_number_ = prev_log_number; + prev_log_number_ = previous_log_number; - Log(options_->info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, "Recovered from manifest file:%s succeeded," "manifest_file_number is %lu, next_file_number is %lu, " "last_sequence is %lu, log_number is %lu," "prev_log_number is %lu," "max_column_family is %u\n", manifest_filename.c_str(), (unsigned long)manifest_file_number_, - (unsigned long)next_file_number_, (unsigned long)last_sequence_, + (unsigned long)next_file_number_.load(), (unsigned long)last_sequence_, (unsigned long)log_number, (unsigned long)prev_log_number_, column_family_set_->GetMaxColumnFamily()); for (auto cfd : *column_family_set_) { - Log(options_->info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, "Column family [%s] (ID %u), log number is %" PRIu64 "\n", cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber()); } @@ -2422,7 +2208,7 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, #ifndef ROCKSDB_LITE Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, const Options* options, - const EnvOptions& storage_options, + const EnvOptions& env_options, int new_levels) { if (new_levels <= 1) { return Status::InvalidArgument( @@ -2433,7 +2219,9 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, std::shared_ptr tc(NewLRUCache( 
options->max_open_files - 10, options->table_cache_numshardbits, options->table_cache_remove_scan_count_limit)); - VersionSet versions(dbname, options, storage_options, tc.get()); + WriteController wc; + WriteBuffer wb(options->db_write_buffer_size); + VersionSet versions(dbname, options, env_options, tc.get(), &wb, &wc); Status status; std::vector dummy; @@ -2447,7 +2235,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, Version* current_version = versions.GetColumnFamilySet()->GetDefault()->current(); - int current_levels = current_version->NumberLevels(); + auto* vstorage = current_version->storage_info(); + int current_levels = vstorage->num_levels(); if (current_levels <= new_levels) { return Status::OK(); @@ -2458,7 +2247,7 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, int first_nonempty_level = -1; int first_nonempty_level_filenum = 0; for (int i = new_levels - 1; i < current_levels; i++) { - int file_num = current_version->NumLevelFiles(i); + int file_num = vstorage->NumLevelFiles(i); if (file_num != 0) { if (first_nonempty_level < 0) { first_nonempty_level = i; @@ -2475,36 +2264,37 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, } } - std::vector* old_files_list = current_version->files_; // we need to allocate an array with the old number of levels size to // avoid SIGSEGV in WriteSnapshot() // however, all levels bigger or equal to new_levels will be empty std::vector* new_files_list = new std::vector[current_levels]; for (int i = 0; i < new_levels - 1; i++) { - new_files_list[i] = old_files_list[i]; + new_files_list[i] = vstorage->LevelFiles(i); } if (first_nonempty_level > 0) { - new_files_list[new_levels - 1] = old_files_list[first_nonempty_level]; + new_files_list[new_levels - 1] = vstorage->LevelFiles(first_nonempty_level); } - delete[] current_version->files_; - current_version->files_ = new_files_list; - current_version->num_levels_ = new_levels; + delete[] vstorage -> files_; + 
vstorage->files_ = new_files_list; + vstorage->num_levels_ = new_levels; + MutableCFOptions mutable_cf_options(*options, ImmutableCFOptions(*options)); VersionEdit ve; - port::Mutex dummy_mutex; - MutexLock l(&dummy_mutex); - return versions.LogAndApply(versions.GetColumnFamilySet()->GetDefault(), &ve, - &dummy_mutex, nullptr, true); + InstrumentedMutex dummy_mutex; + InstrumentedMutexLock l(&dummy_mutex); + return versions.LogAndApply( + versions.GetColumnFamilySet()->GetDefault(), + mutable_cf_options, &ve, &dummy_mutex, nullptr, true); } Status VersionSet::DumpManifest(Options& options, std::string& dscname, bool verbose, bool hex) { // Open the specified manifest file. unique_ptr file; - Status s = options.env->NewSequentialFile(dscname, &file, storage_options_); + Status s = options.env->NewSequentialFile(dscname, &file, env_options_); if (!s.ok()) { return s; } @@ -2514,10 +2304,10 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, bool have_last_sequence = false; uint64_t next_file = 0; uint64_t last_sequence = 0; - uint64_t prev_log_number = 0; + uint64_t previous_log_number = 0; int count = 0; std::unordered_map comparators; - std::unordered_map builders; + std::unordered_map builders; // add default column family VersionEdit default_cf_edit; @@ -2525,7 +2315,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, default_cf_edit.SetColumnFamily(0); ColumnFamilyData* default_cfd = CreateColumnFamily(ColumnFamilyOptions(options), &default_cf_edit); - builders.insert({0, new Builder(default_cfd)}); + builders.insert({0, new BaseReferencedVersionBuilder(default_cfd)}); { VersionSet::LogReporter reporter; @@ -2564,7 +2354,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, break; } cfd = CreateColumnFamily(ColumnFamilyOptions(options), &edit); - builders.insert({edit.column_family_, new Builder(cfd)}); + builders.insert( + {edit.column_family_, new BaseReferencedVersionBuilder(cfd)}); } else 
if (edit.is_column_family_drop_) { if (!cf_in_builders) { s = Status::Corruption( @@ -2596,7 +2387,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, // to builder auto builder = builders.find(edit.column_family_); assert(builder != builders.end()); - builder->second->Apply(&edit); + builder->second->version_builder()->Apply(&edit); } if (cfd != nullptr && edit.has_log_number_) { @@ -2604,7 +2395,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, } if (edit.has_prev_log_number_) { - prev_log_number = edit.prev_log_number_; + previous_log_number = edit.prev_log_number_; have_prev_log_number = true; } @@ -2635,7 +2426,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, } if (!have_prev_log_number) { - prev_log_number = 0; + previous_log_number = 0; } } @@ -2643,14 +2434,11 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, for (auto cfd : *column_family_set_) { auto builders_iter = builders.find(cfd->GetID()); assert(builders_iter != builders.end()); - auto builder = builders_iter->second; + auto builder = builders_iter->second->version_builder(); Version* v = new Version(cfd, this, current_version_number_++); - builder->SaveTo(v); - std::vector size_being_compacted(v->NumberLevels() - 1); - cfd->compaction_picker()->SizeBeingCompacted(size_being_compacted); - v->PrepareApply(size_being_compacted); - delete builder; + builder->SaveTo(v->storage_info()); + v->PrepareApply(); printf("--------------- Column family \"%s\" (ID %u) --------------\n", cfd->GetName().c_str(), (unsigned int)cfd->GetID()); @@ -2665,15 +2453,20 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, delete v; } - next_file_number_ = next_file + 1; + // Free builders + for (auto& builder : builders) { + delete builder.second; + } + + next_file_number_.store(next_file + 1); last_sequence_ = last_sequence; - prev_log_number_ = prev_log_number; + prev_log_number_ = 
previous_log_number; printf( "next_file_number %lu last_sequence " "%lu prev_log_number %lu max_column_family %u\n", - (unsigned long)next_file_number_, (unsigned long)last_sequence, - (unsigned long)prev_log_number, + (unsigned long)next_file_number_.load(), (unsigned long)last_sequence, + (unsigned long)previous_log_number, column_family_set_->GetMaxColumnFamily()); } @@ -2681,9 +2474,11 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, } #endif // ROCKSDB_LITE -void VersionSet::MarkFileNumberUsed(uint64_t number) { - if (next_file_number_ <= number) { - next_file_number_ = number + 1; +void VersionSet::MarkFileNumberUsedDuringRecovery(uint64_t number) { + // only called during recovery which is single threaded, so this works because + // there can't be concurrent calls + if (next_file_number_.load(std::memory_order_relaxed) <= number) { + next_file_number_.store(number + 1, std::memory_order_relaxed); } } @@ -2708,7 +2503,10 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { edit.SetComparatorName( cfd->internal_comparator().user_comparator()->Name()); std::string record; - edit.EncodeTo(&record); + if (!edit.EncodeTo(&record)) { + return Status::Corruption( + "Unable to Encode VersionEdit:" + edit.DebugString(true)); + } Status s = log->AddRecord(record); if (!s.ok()) { return s; @@ -2721,7 +2519,8 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { edit.SetColumnFamily(cfd->GetID()); for (int level = 0; level < cfd->NumberLevels(); level++) { - for (const auto& f : cfd->current()->files_[level]) { + for (const auto& f : + cfd->current()->storage_info()->LevelFiles(level)) { edit.AddFile(level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, f->smallest_seqno, f->largest_seqno); @@ -2729,7 +2528,10 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { } edit.SetLogNumber(cfd->GetLogNumber()); std::string record; - edit.EncodeTo(&record); + if (!edit.EncodeTo(&record)) { + return 
Status::Corruption( + "Unable to Encode VersionEdit:" + edit.DebugString(true)); + } Status s = log->AddRecord(record); if (!s.ok()) { return s; @@ -2742,16 +2544,17 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { // Opens the mainfest file and reads all records // till it finds the record we are looking for. -bool VersionSet::ManifestContains(uint64_t manifest_file_number, +bool VersionSet::ManifestContains(uint64_t manifest_file_num, const std::string& record) const { - std::string fname = - DescriptorFileName(dbname_, manifest_file_number); - Log(options_->info_log, "ManifestContains: checking %s\n", fname.c_str()); + std::string fname = DescriptorFileName(dbname_, manifest_file_num); + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, + "ManifestContains: checking %s\n", fname.c_str()); unique_ptr file; - Status s = env_->NewSequentialFile(fname, &file, storage_options_); + Status s = env_->NewSequentialFile(fname, &file, env_options_); if (!s.ok()) { - Log(options_->info_log, "ManifestContains: %s\n", s.ToString().c_str()); - Log(options_->info_log, + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, + "ManifestContains: %s\n", s.ToString().c_str()); + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, "ManifestContains: is unable to reopen the manifest file %s", fname.c_str()); return false; @@ -2766,15 +2569,17 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_number, break; } } - Log(options_->info_log, "ManifestContains: result = %d\n", result ? 1 : 0); + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, + "ManifestContains: result = %d\n", result ? 
1 : 0); return result; } uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { uint64_t result = 0; - for (int level = 0; level < v->NumberLevels(); level++) { - const std::vector& files = v->files_[level]; + const auto* vstorage = v->storage_info(); + for (int level = 0; level < vstorage->num_levels(); level++) { + const std::vector& files = vstorage->LevelFiles(level); for (size_t i = 0; i < files.size(); i++) { if (v->cfd_->internal_comparator().Compare(files[i]->largest, ikey) <= 0) { @@ -2794,7 +2599,7 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { // approximate offset of "ikey" within the table. TableReader* table_reader_ptr; Iterator* iter = v->cfd_->table_cache()->NewIterator( - ReadOptions(), storage_options_, v->cfd_->internal_comparator(), + ReadOptions(), env_options_, v->cfd_->internal_comparator(), files[i]->fd, &table_reader_ptr); if (table_reader_ptr != nullptr) { result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode()); @@ -2813,25 +2618,32 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { Version* dummy_versions = cfd->dummy_versions(); for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { - for (int level = 0; level < v->NumberLevels(); level++) { - total_files += v->files_[level].size(); + const auto* vstorage = v->storage_info(); + for (int level = 0; level < vstorage->num_levels(); level++) { + total_files += vstorage->LevelFiles(level).size(); } } } // just one time extension to the right size - live_list->reserve(live_list->size() + total_files); + live_list->reserve(live_list->size() + static_cast(total_files)); for (auto cfd : *column_family_set_) { + auto* current = cfd->current(); + bool found_current = false; Version* dummy_versions = cfd->dummy_versions(); for (Version* v = dummy_versions->next_; v != dummy_versions; v = v->next_) { - for (int level = 0; level < v->NumberLevels(); level++) { - for (const auto& f : v->files_[level]) { - 
live_list->push_back(f->fd); - } + v->AddLiveFiles(live_list); + if (v == current) { + found_current = true; } } + if (!found_current && current != nullptr) { + // Should never happen unless it is a bug. + assert(false); + current->AddLiveFiles(live_list); + } } } @@ -2839,41 +2651,42 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { auto cfd = c->column_family_data(); ReadOptions read_options; read_options.verify_checksums = - cfd->options()->verify_checksums_in_compaction; + c->mutable_cf_options()->verify_checksums_in_compaction; read_options.fill_cache = false; // Level-0 files have to be merged together. For other levels, // we will make a concatenating iterator per level. // TODO(opt): use concatenating iterator for level-0 if there is no overlap - const int space = (c->level() == 0 ? - c->input_levels(0)->num_files + c->num_input_levels() - 1: - c->num_input_levels()); - Iterator** list = new Iterator*[space]; - int num = 0; - for (int which = 0; which < c->num_input_levels(); which++) { + const size_t space = (c->level() == 0 ? 
c->input_levels(0)->num_files + + c->num_input_levels() - 1 + : c->num_input_levels()); + Iterator** list = new Iterator* [space]; + size_t num = 0; + for (size_t which = 0; which < c->num_input_levels(); which++) { if (c->input_levels(which)->num_files != 0) { if (c->level(which) == 0) { - const FileLevel* flevel = c->input_levels(which); + const LevelFilesBrief* flevel = c->input_levels(which); for (size_t i = 0; i < flevel->num_files; i++) { list[num++] = cfd->table_cache()->NewIterator( - read_options, storage_options_compactions_, + read_options, env_options_compactions_, cfd->internal_comparator(), flevel->files[i].fd, nullptr, true /* for compaction */); } } else { // Create concatenating iterator for the files from this level - list[num++] = NewTwoLevelIterator(new Version::LevelFileIteratorState( - cfd->table_cache(), read_options, storage_options_, + list[num++] = NewTwoLevelIterator(new LevelFileIteratorState( + cfd->table_cache(), read_options, env_options_, cfd->internal_comparator(), true /* for_compaction */, false /* prefix enabled */), - new Version::LevelFileNumIterator(cfd->internal_comparator(), - c->input_levels(which))); + new LevelFileNumIterator(cfd->internal_comparator(), + c->input_levels(which))); } } } assert(num <= space); - Iterator* result = NewMergingIterator( - &c->column_family_data()->internal_comparator(), list, num); + Iterator* result = + NewMergingIterator(&c->column_family_data()->internal_comparator(), list, + static_cast(num)); delete[] list; return result; } @@ -2883,47 +2696,30 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { #ifndef NDEBUG Version* version = c->column_family_data()->current(); + const VersionStorageInfo* vstorage = version->storage_info(); if (c->input_version() != version) { - Log(options_->info_log, - "[%s] VerifyCompactionFileConsistency version mismatch", + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, + "[%s] 
compaction output being applied to a different base version from" + " input version", c->column_family_data()->GetName().c_str()); } - // verify files in level - int level = c->level(); - for (int i = 0; i < c->num_input_files(0); i++) { - uint64_t number = c->input(0, i)->fd.GetNumber(); - - // look for this file in the current version - bool found = false; - for (unsigned int j = 0; j < version->files_[level].size(); j++) { - FileMetaData* f = version->files_[level][j]; - if (f->fd.GetNumber() == number) { - found = true; - break; + for (size_t input = 0; input < c->num_input_levels(); ++input) { + int level = c->level(input); + for (size_t i = 0; i < c->num_input_files(input); ++i) { + uint64_t number = c->input(input, i)->fd.GetNumber(); + bool found = false; + for (unsigned int j = 0; j < vstorage->files_[level].size(); j++) { + FileMetaData* f = vstorage->files_[level][j]; + if (f->fd.GetNumber() == number) { + found = true; + break; + } } - } - if (!found) { - return false; // input files non existant in current version - } - } - // verify level+1 files - level++; - for (int i = 0; i < c->num_input_files(1); i++) { - uint64_t number = c->input(1, i)->fd.GetNumber(); - - // look for this file in the current version - bool found = false; - for (unsigned int j = 0; j < version->files_[level].size(); j++) { - FileMetaData* f = version->files_[level][j]; - if (f->fd.GetNumber() == number) { - found = true; - break; + if (!found) { + return false; // input files non existent in current version } } - if (!found) { - return false; // input files non existant in current version - } } #endif return true; // everything good @@ -2934,8 +2730,9 @@ Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, ColumnFamilyData** cfd) { for (auto cfd_iter : *column_family_set_) { Version* version = cfd_iter->current(); - for (int level = 0; level < version->NumberLevels(); level++) { - for (const auto& file : version->files_[level]) { + const auto* vstorage = 
version->storage_info(); + for (int level = 0; level < vstorage->num_levels(); level++) { + for (const auto& file : vstorage->LevelFiles(level)) { if (file->fd.GetNumber() == number) { *meta = file; *filelevel = level; @@ -2951,15 +2748,16 @@ Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { for (auto cfd : *column_family_set_) { for (int level = 0; level < cfd->NumberLevels(); level++) { - for (const auto& file : cfd->current()->files_[level]) { + for (const auto& file : + cfd->current()->storage_info()->LevelFiles(level)) { LiveFileMetaData filemetadata; filemetadata.column_family_name = cfd->GetName(); uint32_t path_id = file->fd.GetPathId(); - if (path_id < options_->db_paths.size()) { - filemetadata.db_path = options_->db_paths[path_id].path; + if (path_id < db_options_->db_paths.size()) { + filemetadata.db_path = db_options_->db_paths[path_id].path; } else { - assert(!options_->db_paths.empty()); - filemetadata.db_path = options_->db_paths.back().path; + assert(!db_options_->db_paths.empty()); + filemetadata.db_path = db_options_->db_paths.back().path; } filemetadata.name = MakeTableFileName("", file->fd.GetNumber()); filemetadata.level = level; @@ -2974,23 +2772,37 @@ void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { } } -void VersionSet::GetObsoleteFiles(std::vector* files) { - files->insert(files->end(), obsolete_files_.begin(), obsolete_files_.end()); - obsolete_files_.clear(); +void VersionSet::GetObsoleteFiles(std::vector* files, + uint64_t min_pending_output) { + std::vector pending_files; + for (auto f : obsolete_files_) { + if (f->fd.GetNumber() < min_pending_output) { + files->push_back(f); + } else { + pending_files.push_back(f); + } + } + obsolete_files_.swap(pending_files); } ColumnFamilyData* VersionSet::CreateColumnFamily( - const ColumnFamilyOptions& options, VersionEdit* edit) { + const ColumnFamilyOptions& cf_options, VersionEdit* edit) { 
assert(edit->is_column_family_add_); Version* dummy_versions = new Version(nullptr, this); + // Ref() dummy version once so that later we can call Unref() to delete it + // by avoiding calling "delete" explicitly (~Version is private) + dummy_versions->Ref(); auto new_cfd = column_family_set_->CreateColumnFamily( - edit->column_family_name_, edit->column_family_, dummy_versions, options); + edit->column_family_name_, edit->column_family_, dummy_versions, + cf_options); Version* v = new Version(new_cfd, this, current_version_number_++); AppendVersion(new_cfd, v); - new_cfd->CreateNewMemtable(); + // GetLatestMutableCFOptions() is safe here without mutex since the + // cfd is not available to client + new_cfd->CreateNewMemtable(*new_cfd->GetLatestMutableCFOptions()); new_cfd->SetLogNumber(edit->log_number_); return new_cfd; } diff --git a/db/version_set.h b/db/version_set.h index 2f6d477a1..b00c9ce2b 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -26,6 +26,7 @@ #include #include #include "db/dbformat.h" +#include "db/version_builder.h" #include "db/version_edit.h" #include "port/port.h" #include "db/table_cache.h" @@ -34,19 +35,23 @@ #include "db/column_family.h" #include "db/log_reader.h" #include "db/file_indexer.h" +#include "db/write_controller.h" +#include "util/instrumented_mutex.h" namespace rocksdb { -namespace log { class Writer; } +namespace log { +class Writer; +} class Compaction; -class CompactionPicker; class Iterator; class LogBuffer; class LookupKey; class MemTable; class Version; class VersionSet; +class WriteBuffer; class MergeContext; class ColumnFamilyData; class ColumnFamilySet; @@ -58,7 +63,7 @@ class MergeIteratorBuilder; // REQUIRES: "file_level.files" contains a sorted list of // non-overlapping files. 
extern int FindFile(const InternalKeyComparator& icmp, - const FileLevel& file_level, + const LevelFilesBrief& file_level, const Slice& key); // Returns true iff some file in "files" overlaps the user key range @@ -70,94 +75,96 @@ extern int FindFile(const InternalKeyComparator& icmp, extern bool SomeFileOverlapsRange( const InternalKeyComparator& icmp, bool disjoint_sorted_files, - const FileLevel& file_level, + const LevelFilesBrief& file_level, const Slice* smallest_user_key, const Slice* largest_user_key); -// Generate FileLevel from vector +// Generate LevelFilesBrief from vector // Would copy smallest_key and largest_key data to sequential memory // arena: Arena used to allocate the memory -extern void DoGenerateFileLevel(FileLevel* file_level, +extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, const std::vector& files, Arena* arena); -class Version { +class VersionStorageInfo { public: - // Append to *iters a sequence of iterators that will - // yield the contents of this Version when merged together. - // REQUIRES: This version has been saved (see VersionSet::SaveTo) - void AddIterators(const ReadOptions&, const EnvOptions& soptions, - std::vector* iters); + VersionStorageInfo(const InternalKeyComparator* internal_comparator, + const Comparator* user_comparator, int num_levels, + CompactionStyle compaction_style, + VersionStorageInfo* src_vstorage); + ~VersionStorageInfo(); - void AddIterators(const ReadOptions&, const EnvOptions& soptions, - MergeIteratorBuilder* merger_iter_builder); + void Reserve(int level, size_t size) { files_[level].reserve(size); } - // Lookup the value for key. If found, store it in *val and - // return OK. Else return a non-OK status. 
- // Uses *operands to store merge_operator operations to apply later - // REQUIRES: lock is not held - void Get(const ReadOptions&, const LookupKey& key, std::string* val, - Status* status, MergeContext* merge_context, - bool* value_found = nullptr); + void AddFile(int level, FileMetaData* f); - // Updates internal structures that keep track of compaction scores - // We use compaction scores to figure out which compaction to do next - // REQUIRES: If Version is not yet saved to current_, it can be called without - // a lock. Once a version is saved to current_, call only with mutex held - void ComputeCompactionScore(std::vector& size_being_compacted); + void SetFinalized() { finalized_ = true; } - // Generate file_levels_ from files_ - void GenerateFileLevels(); + // Update num_non_empty_levels_. + void UpdateNumNonEmptyLevels(); - // Update scores, pre-calculated variables. It needs to be called before - // applying the version to the version set. - void PrepareApply(std::vector& size_being_compacted); + void GenerateFileIndexer() { + file_indexer_.UpdateIndex(&arena_, num_non_empty_levels_, files_); + } - // Reference count management (so Versions do not disappear out from - // under live iterators) - void Ref(); - // Decrease reference count. Delete the object if no reference left - // and return true. Otherwise, return false. - bool Unref(); + // Update the accumulated stats from a file-meta. + void UpdateAccumulatedStats(FileMetaData* file_meta); - // Returns true iff some level needs a compaction. - bool NeedsCompaction() const; + void ComputeCompensatedSizes(); + + // Updates internal structures that keep track of compaction scores + // We use compaction scores to figure out which compaction to do next + // REQUIRES: db_mutex held!! + // TODO find a better way to pass compaction_options_fifo. 
+ void ComputeCompactionScore( + const MutableCFOptions& mutable_cf_options, + const CompactionOptionsFIFO& compaction_options_fifo); + + // Generate level_files_brief_ from files_ + void GenerateLevelFilesBrief(); + // Sort all files for this version based on their file size and + // record results in files_by_size_. The largest files are listed first. + void UpdateFilesBySize(); + + int MaxInputLevel() const; // Returns the maxmimum compaction score for levels 1 to max - double MaxCompactionScore() const { return max_compaction_score_; } + double max_compaction_score() const { return max_compaction_score_; } // See field declaration - int MaxCompactionScoreLevel() const { return max_compaction_score_level_; } + int max_compaction_score_level() const { return max_compaction_score_level_; } + + // Return level number that has idx'th highest score + int CompactionScoreLevel(int idx) const { return compaction_level_[idx]; } + + // Return idx'th highest score + double CompactionScore(int idx) const { return compaction_score_[idx]; } void GetOverlappingInputs( - int level, - const InternalKey* begin, // nullptr means before all keys - const InternalKey* end, // nullptr means after all keys + int level, const InternalKey* begin, // nullptr means before all keys + const InternalKey* end, // nullptr means after all keys std::vector* inputs, - int hint_index = -1, // index of overlap file - int* file_index = nullptr); // return index of overlap file + int hint_index = -1, // index of overlap file + int* file_index = nullptr); // return index of overlap file void GetOverlappingInputsBinarySearch( - int level, - const Slice& begin, // nullptr means before all keys - const Slice& end, // nullptr means after all keys + int level, const Slice& begin, // nullptr means before all keys + const Slice& end, // nullptr means after all keys std::vector* inputs, - int hint_index, // index of overlap file - int* file_index); // return index of overlap file + int hint_index, // index of 
overlap file + int* file_index); // return index of overlap file void ExtendOverlappingInputs( - int level, - const Slice& begin, // nullptr means before all keys - const Slice& end, // nullptr means after all keys + int level, const Slice& begin, // nullptr means before all keys + const Slice& end, // nullptr means after all keys std::vector* inputs, - unsigned int index); // start extending from this index + unsigned int index); // start extending from this index // Returns true iff some file in the specified level overlaps // some part of [*smallest_user_key,*largest_user_key]. // smallest_user_key==NULL represents a key smaller than all keys in the DB. // largest_user_key==NULL represents a key largest than all keys in the DB. - bool OverlapInLevel(int level, - const Slice* smallest_user_key, + bool OverlapInLevel(int level, const Slice* smallest_user_key, const Slice* largest_user_key); // Returns true iff the first or last file in inputs contains @@ -167,27 +174,77 @@ class Version { bool HasOverlappingUserKey(const std::vector* inputs, int level); - // Return the level at which we should place a new memtable compaction // result that covers the range [smallest_user_key,largest_user_key]. 
- int PickLevelForMemTableOutput(const Slice& smallest_user_key, + int PickLevelForMemTableOutput(const MutableCFOptions& mutable_cf_options, + const Slice& smallest_user_key, const Slice& largest_user_key); - int NumberLevels() const { return num_levels_; } + int num_levels() const { return num_levels_; } - // REQUIRES: lock is held - int NumLevelFiles(int level) const { return files_[level].size(); } + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + int num_non_empty_levels() const { + assert(finalized_); + return num_non_empty_levels_; + } + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + int NumLevelFiles(int level) const { + assert(finalized_); + return static_cast(files_[level].size()); + } // Return the combined file size of all files at the specified level. - int64_t NumLevelBytes(int level) const; + uint64_t NumLevelBytes(int level) const; + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + const std::vector& LevelFiles(int level) const { + return files_[level]; + } + + const rocksdb::LevelFilesBrief& LevelFilesBrief(int level) const { + assert(level < static_cast(level_files_brief_.size())); + return level_files_brief_[level]; + } + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + const std::vector& FilesBySize(int level) const { + assert(finalized_); + return files_by_size_[level]; + } + + // REQUIRES: lock is held + // Set the index that is used to offset into files_by_size_ to find + // the next compaction candidate file. 
+ void SetNextCompactionIndex(int level, int index) { + next_file_to_compact_by_size_[level] = index; + } + + // REQUIRES: lock is held + int NextCompactionIndex(int level) const { + return next_file_to_compact_by_size_[level]; + } + + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + const FileIndexer& file_indexer() const { + assert(finalized_); + return file_indexer_; + } + + // Only the first few entries of files_by_size_ are sorted. + // There is no need to sort all the files because it is likely + // that on a running system, we need to look at only the first + // few largest files because a new version is created every few + // seconds/minutes (because of concurrent compactions). + static const size_t kNumberFilesToSort = 50; // Return a human-readable short (single-line) summary of the number // of files per level. Uses *scratch as backing store. struct LevelSummaryStorage { - char buffer[100]; + char buffer[1000]; }; struct FileSummaryStorage { - char buffer[1000]; + char buffer[3000]; }; const char* LevelSummary(LevelSummaryStorage* scratch) const; // Return a human-readable short (single-line) summary of files @@ -198,6 +255,126 @@ class Version { // file at a level >= 1. int64_t MaxNextLevelOverlappingBytes(); + // Return a human readable string that describes this version's contents. + std::string DebugString(bool hex = false) const; + + uint64_t GetAverageValueSize() const { + if (accumulated_num_non_deletions_ == 0) { + return 0; + } + assert(accumulated_raw_key_size_ + accumulated_raw_value_size_ > 0); + assert(accumulated_file_size_ > 0); + return accumulated_raw_value_size_ / accumulated_num_non_deletions_ * + accumulated_file_size_ / + (accumulated_raw_key_size_ + accumulated_raw_value_size_); + } + + uint64_t GetEstimatedActiveKeys() const; + + // re-initializes the index that is used to offset into files_by_size_ + // to find the next compaction candidate file. 
+ void ResetNextCompactionIndex(int level) { + next_file_to_compact_by_size_[level] = 0; + } + + const InternalKeyComparator* InternalComparator() { + return internal_comparator_; + } + + private: + const InternalKeyComparator* internal_comparator_; + const Comparator* user_comparator_; + int num_levels_; // Number of levels + int num_non_empty_levels_; // Number of levels. Any level larger than it + // is guaranteed to be empty. + // A short brief metadata of files per level + autovector level_files_brief_; + FileIndexer file_indexer_; + Arena arena_; // Used to allocate space for file_levels_ + + CompactionStyle compaction_style_; + + // List of files per level, files in each level are arranged + // in increasing order of keys + std::vector* files_; + + // A list for the same set of files that are stored in files_, + // but files in each level are now sorted based on file + // size. The file with the largest size is at the front. + // This vector stores the index of the file from files_. + std::vector> files_by_size_; + + // An index into files_by_size_ that specifies the first + // file that is not yet compacted + std::vector next_file_to_compact_by_size_; + + // Only the first few entries of files_by_size_ are sorted. + // There is no need to sort all the files because it is likely + // that on a running system, we need to look at only the first + // few largest files because a new version is created every few + // seconds/minutes (because of concurrent compactions). + static const size_t number_of_files_to_sort_ = 50; + + // Level that should be compacted next and its compaction score. + // Score < 1 means compaction is not strictly needed. These fields + // are initialized by Finalize(). 
+ // The most critical level to be compacted is listed first + // These are used to pick the best compaction level + std::vector compaction_score_; + std::vector compaction_level_; + double max_compaction_score_ = 0.0; // max score in l1 to ln-1 + int max_compaction_score_level_ = 0; // level on which max score occurs + + // the following are the sampled temporary stats. + // the current accumulated size of sampled files. + uint64_t accumulated_file_size_; + // the current accumulated size of all raw keys based on the sampled files. + uint64_t accumulated_raw_key_size_; + // the current accumulated size of all raw values based on the sampled files. + uint64_t accumulated_raw_value_size_; + // total number of non-deletion entries + uint64_t accumulated_num_non_deletions_; + // total number of deletion entries + uint64_t accumulated_num_deletions_; + // the number of samples + uint64_t num_samples_; + + bool finalized_; + + friend class Version; + friend class VersionSet; + // No copying allowed + VersionStorageInfo(const VersionStorageInfo&) = delete; + void operator=(const VersionStorageInfo&) = delete; +}; + +class Version { + public: + // Append to *iters a sequence of iterators that will + // yield the contents of this Version when merged together. + // REQUIRES: This version has been saved (see VersionSet::SaveTo) + void AddIterators(const ReadOptions&, const EnvOptions& soptions, + MergeIteratorBuilder* merger_iter_builder); + + // Lookup the value for key. If found, store it in *val and + // return OK. Else return a non-OK status. + // Uses *operands to store merge_operator operations to apply later + // REQUIRES: lock is not held + void Get(const ReadOptions&, const LookupKey& key, std::string* val, + Status* status, MergeContext* merge_context, + bool* value_found = nullptr); + + // Loads some stats information from files. Call without mutex held. It needs + // to be called before applying the version to the version set. 
+ void PrepareApply(); + + // Reference count management (so Versions do not disappear out from + // under live iterators) + void Ref(); + // Decrease reference count. Delete the object if no reference left + // and return true. Otherwise, return false. + bool Unref(); + // Add all files listed in the current version to *live. void AddLiveFiles(std::vector* live); @@ -207,16 +384,6 @@ class Version { // Returns the version nuber of this version uint64_t GetVersionNumber() const { return version_number_; } - uint64_t GetAverageValueSize() const { - if (num_non_deletions_ == 0) { - return 0; - } - assert(total_raw_key_size_ + total_raw_value_size_ > 0); - assert(total_file_size_ > 0); - return total_raw_value_size_ / num_non_deletions_ * total_file_size_ / - (total_raw_key_size_ + total_raw_value_size_); - } - // REQUIRES: lock is held // On success, "tp" will contains the table properties of the file // specified in "file_meta". If the file name of "file_meta" is @@ -232,125 +399,72 @@ class Version { // tables' propertis, represented as shared_ptr. Status GetPropertiesOfAllTables(TablePropertiesCollection* props); - uint64_t GetEstimatedActiveKeys(); + uint64_t GetEstimatedActiveKeys() { + return storage_info_.GetEstimatedActiveKeys(); + } size_t GetMemoryUsageByTableReaders(); - // used to sort files by size - struct Fsize { - int index; - FileMetaData* file; - }; + ColumnFamilyData* cfd() const { return cfd_; } + + + // Return the next Version in the linked list. 
Used for debug only + Version* TEST_Next() const { + return next_; + } + + VersionStorageInfo* storage_info() { return &storage_info_; } + + VersionSet* version_set() { return vset_; } + + void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta); private: - friend class Compaction; friend class VersionSet; - friend class DBImpl; - friend class ColumnFamilyData; - friend class CompactionPicker; - friend class LevelCompactionPicker; - friend class UniversalCompactionPicker; - friend class FIFOCompactionPicker; - friend class ForwardIterator; - friend class InternalStats; - - class LevelFileNumIterator; - class LevelFileIteratorState; - - bool PrefixMayMatch(const ReadOptions& options, Iterator* level_iter, - const Slice& internal_prefix) const; - // Update num_non_empty_levels_. - void UpdateNumNonEmptyLevels(); + const InternalKeyComparator* internal_comparator() const { + return storage_info_.internal_comparator_; + } + const Comparator* user_comparator() const { + return storage_info_.user_comparator_; + } - // The helper function of UpdateTemporaryStats, which may fill the missing + bool PrefixMayMatch(const ReadOptions& read_options, Iterator* level_iter, + const Slice& internal_prefix) const; + + // The helper function of UpdateAccumulatedStats, which may fill the missing // fields of file_mata from its associated TableProperties. // Returns true if it does initialize FileMetaData. bool MaybeInitializeFileMetaData(FileMetaData* file_meta); - // Update the temporary stats associated with the current version. - // This temporary stats will be used in compaction. - void UpdateTemporaryStats(); + // Update the accumulated stats associated with the current version. + // This accumulated stats will be used in compaction. + void UpdateAccumulatedStats(); // Sort all files for this version based on their file size and // record results in files_by_size_. The largest files are listed first. 
void UpdateFilesBySize(); ColumnFamilyData* cfd_; // ColumnFamilyData to which this Version belongs - const InternalKeyComparator* internal_comparator_; - const Comparator* user_comparator_; + Logger* info_log_; + Statistics* db_statistics_; TableCache* table_cache_; const MergeOperator* merge_operator_; - autovector file_levels_; // A copy of list of files per level - Logger* info_log_; - Statistics* db_statistics_; - int num_levels_; // Number of levels - int num_non_empty_levels_; // Number of levels. Any level larger than it - // is guaranteed to be empty. - FileIndexer file_indexer_; + VersionStorageInfo storage_info_; VersionSet* vset_; // VersionSet to which this Version belongs - Arena arena_; // Used to allocate space for file_levels_ Version* next_; // Next version in linked list Version* prev_; // Previous version in linked list int refs_; // Number of live refs to this version - // List of files per level, files in each level are arranged - // in increasing order of keys - std::vector* files_; - - // A list for the same set of files that are stored in files_, - // but files in each level are now sorted based on file - // size. The file with the largest size is at the front. - // This vector stores the index of the file from files_. - std::vector> files_by_size_; - - // An index into files_by_size_ that specifies the first - // file that is not yet compacted - std::vector next_file_to_compact_by_size_; - - // Only the first few entries of files_by_size_ are sorted. - // There is no need to sort all the files because it is likely - // that on a running system, we need to look at only the first - // few largest files because a new version is created every few - // seconds/minutes (because of concurrent compactions). - static const size_t number_of_files_to_sort_ = 50; - - // Level that should be compacted next and its compaction score. - // Score < 1 means compaction is not strictly needed. These fields - // are initialized by Finalize(). 
- // The most critical level to be compacted is listed first - // These are used to pick the best compaction level - std::vector compaction_score_; - std::vector compaction_level_; - double max_compaction_score_; // max score in l1 to ln-1 - int max_compaction_score_level_; // level on which max score occurs - // A version number that uniquely represents this version. This is // used for debugging and logging purposes only. uint64_t version_number_; Version(ColumnFamilyData* cfd, VersionSet* vset, uint64_t version_number = 0); - // total file size - uint64_t total_file_size_; - // the total size of all raw keys. - uint64_t total_raw_key_size_; - // the total size of all raw values. - uint64_t total_raw_value_size_; - // total number of non-deletion entries - uint64_t num_non_deletions_; - // total number of deletion entries - uint64_t num_deletions_; - ~Version(); - // re-initializes the index that is used to offset into files_by_size_ - // to find the next compaction candidate file. - void ResetNextCompactionIndex(int level) { - next_file_to_compact_by_size_[level] = 0; - } - // No copying allowed Version(const Version&); void operator=(const Version&); @@ -358,8 +472,9 @@ class Version { class VersionSet { public: - VersionSet(const std::string& dbname, const DBOptions* options, - const EnvOptions& storage_options, Cache* table_cache); + VersionSet(const std::string& dbname, const DBOptions* db_options, + const EnvOptions& env_options, Cache* table_cache, + WriteBuffer* write_buffer, WriteController* write_controller); ~VersionSet(); // Apply *edit to the current version to form a new descriptor that @@ -368,11 +483,12 @@ class VersionSet { // column_family_options has to be set if edit is column family add // REQUIRES: *mu is held on entry. 
// REQUIRES: no other thread concurrently calls LogAndApply() - Status LogAndApply(ColumnFamilyData* column_family_data, VersionEdit* edit, - port::Mutex* mu, Directory* db_directory = nullptr, - bool new_descriptor_log = false, - const ColumnFamilyOptions* column_family_options = - nullptr); + Status LogAndApply( + ColumnFamilyData* column_family_data, + const MutableCFOptions& mutable_cf_options, VersionEdit* edit, + InstrumentedMutex* mu, Directory* db_directory = nullptr, + bool new_descriptor_log = false, + const ColumnFamilyOptions* column_family_options = nullptr); // Recover the last saved descriptor from persistent storage. // If read_only == true, Recover() will not complain if some column families @@ -397,7 +513,7 @@ class VersionSet { // among [4-6] contains files. static Status ReduceNumberOfLevels(const std::string& dbname, const Options* options, - const EnvOptions& storage_options, + const EnvOptions& env_options, int new_levels); // printf contents (for debugging) @@ -407,23 +523,16 @@ class VersionSet { #endif // ROCKSDB_LITE // Return the current manifest file number - uint64_t ManifestFileNumber() const { return manifest_file_number_; } + uint64_t manifest_file_number() const { return manifest_file_number_; } - uint64_t PendingManifestFileNumber() const { + uint64_t pending_manifest_file_number() const { return pending_manifest_file_number_; } + uint64_t current_next_file_number() const { return next_file_number_.load(); } + // Allocate and return a new file number - uint64_t NewFileNumber() { return next_file_number_++; } - - // Arrange to reuse "file_number" unless a newer file number has - // already been allocated. - // REQUIRES: "file_number" was returned by a call to NewFileNumber(). - void ReuseLogFileNumber(uint64_t file_number) { - if (next_file_number_ == file_number + 1) { - next_file_number_ = file_number; - } - } + uint64_t NewFileNumber() { return next_file_number_.fetch_add(1); } // Return the last sequence number. 
uint64_t LastSequence() const { @@ -437,11 +546,12 @@ class VersionSet { } // Mark the specified file number as used. - void MarkFileNumberUsed(uint64_t number); + // REQUIRED: this is only called during single-threaded recovery + void MarkFileNumberUsedDuringRecovery(uint64_t number); // Return the log file number for the log file that is currently // being compacted, or zero if there is no such log file. - uint64_t PrevLogNumber() const { return prev_log_number_; } + uint64_t prev_log_number() const { return prev_log_number_; } // Returns the minimum log number such that all // log numbers less than or equal to it can be deleted @@ -467,7 +577,7 @@ class VersionSet { uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); // Return the size of the current manifest file - uint64_t ManifestFileSize() const { return manifest_file_size_; } + uint64_t manifest_file_size() const { return manifest_file_size_; } // verify that the files that we started with for a compaction // still exist in the current version and in the same original level. 
@@ -478,18 +588,19 @@ class VersionSet { Status GetMetadataForFile(uint64_t number, int* filelevel, FileMetaData** metadata, ColumnFamilyData** cfd); - void GetLiveFilesMetaData( - std::vector *metadata); + void GetLiveFilesMetaData(std::vector *metadata); - void GetObsoleteFiles(std::vector* files); + void GetObsoleteFiles(std::vector* files, + uint64_t min_pending_output); ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); } + const EnvOptions& env_options() { return env_options_; } private: - class Builder; struct ManifestWriter; friend class Version; + friend class DBImpl; struct LogReporter : public log::Reader::Reporter { Status* status; @@ -506,15 +617,15 @@ class VersionSet { bool ManifestContains(uint64_t manifest_file_number, const std::string& record) const; - ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& options, + ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, VersionEdit* edit); std::unique_ptr column_family_set_; Env* const env_; const std::string dbname_; - const DBOptions* const options_; - uint64_t next_file_number_; + const DBOptions* const db_options_; + std::atomic next_file_number_; uint64_t manifest_file_number_; uint64_t pending_manifest_file_number_; std::atomic last_sequence_; @@ -534,20 +645,20 @@ class VersionSet { std::vector obsolete_files_; - // storage options for all reads and writes except compactions - const EnvOptions& storage_options_; + // env options for all reads and writes except compactions + const EnvOptions& env_options_; - // storage options used for compactions. This is a copy of - // storage_options_ but with readaheads set to readahead_compactions_. - const EnvOptions storage_options_compactions_; + // env options used for compactions. This is a copy of + // env_options_ but with readaheads set to readahead_compactions_. 
+ const EnvOptions env_options_compactions_; // No copying allowed VersionSet(const VersionSet&); void operator=(const VersionSet&); void LogAndApplyCFHelper(VersionEdit* edit); - void LogAndApplyHelper(ColumnFamilyData* cfd, Builder* b, Version* v, - VersionEdit* edit, port::Mutex* mu); + void LogAndApplyHelper(ColumnFamilyData* cfd, VersionBuilder* b, Version* v, + VersionEdit* edit, InstrumentedMutex* mu); }; } // namespace rocksdb diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 402762efa..9920a9e05 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -14,15 +14,15 @@ namespace rocksdb { -class GenerateFileLevelTest { +class GenerateLevelFilesBriefTest { public: std::vector files_; - FileLevel file_level_; + LevelFilesBrief file_level_; Arena arena_; - GenerateFileLevelTest() { } + GenerateLevelFilesBriefTest() { } - ~GenerateFileLevelTest() { + ~GenerateLevelFilesBriefTest() { for (unsigned int i = 0; i < files_.size(); i++) { delete files_[i]; } @@ -49,33 +49,33 @@ class GenerateFileLevelTest { } }; -TEST(GenerateFileLevelTest, Empty) { - DoGenerateFileLevel(&file_level_, files_, &arena_); +TEST(GenerateLevelFilesBriefTest, Empty) { + DoGenerateLevelFilesBrief(&file_level_, files_, &arena_); ASSERT_EQ(0u, file_level_.num_files); ASSERT_EQ(0, Compare()); } -TEST(GenerateFileLevelTest, Single) { +TEST(GenerateLevelFilesBriefTest, Single) { Add("p", "q"); - DoGenerateFileLevel(&file_level_, files_, &arena_); + DoGenerateLevelFilesBrief(&file_level_, files_, &arena_); ASSERT_EQ(1u, file_level_.num_files); ASSERT_EQ(0, Compare()); } -TEST(GenerateFileLevelTest, Multiple) { +TEST(GenerateLevelFilesBriefTest, Multiple) { Add("150", "200"); Add("200", "250"); Add("300", "350"); Add("400", "450"); - DoGenerateFileLevel(&file_level_, files_, &arena_); + DoGenerateLevelFilesBrief(&file_level_, files_, &arena_); ASSERT_EQ(4u, file_level_.num_files); ASSERT_EQ(0, Compare()); } class FindLevelFileTest { public: - FileLevel file_level_; + 
LevelFilesBrief file_level_; bool disjoint_sorted_files_; Arena arena_; diff --git a/db/wal_manager.cc b/db/wal_manager.cc new file mode 100644 index 000000000..aa79b0280 --- /dev/null +++ b/db/wal_manager.cc @@ -0,0 +1,470 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/wal_manager.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include +#include +#include +#include + +#include "db/filename.h" +#include "db/transaction_log_impl.h" +#include "db/log_reader.h" +#include "db/log_writer.h" +#include "db/write_batch_internal.h" +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/write_batch.h" +#include "util/coding.h" +#include "util/logging.h" +#include "util/mutexlock.h" +#include "util/sync_point.h" +#include "util/string_util.h" + +namespace rocksdb { + +#ifndef ROCKSDB_LITE + +Status WalManager::GetSortedWalFiles(VectorLogPtr& files) { + // First get sorted files in db dir, then get sorted files from archived + // dir, to avoid a race condition where a log file is moved to archived + // dir in between. + Status s; + // list wal files in main db dir. 
+ VectorLogPtr logs; + s = GetSortedWalsOfType(db_options_.wal_dir, logs, kAliveLogFile); + if (!s.ok()) { + return s; + } + + // Reproduce the race condition where a log file is moved + // to archived dir, between these two sync points, used in + // (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("WalManager::GetSortedWalFiles:1"); + TEST_SYNC_POINT("WalManager::GetSortedWalFiles:2"); + + files.clear(); + // list wal files in archive dir. + std::string archivedir = ArchivalDirectory(db_options_.wal_dir); + if (env_->FileExists(archivedir)) { + s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile); + if (!s.ok()) { + return s; + } + } + + uint64_t latest_archived_log_number = 0; + if (!files.empty()) { + latest_archived_log_number = files.back()->LogNumber(); + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Latest Archived log: %" PRIu64, + latest_archived_log_number); + } + + files.reserve(files.size() + logs.size()); + for (auto& log : logs) { + if (log->LogNumber() > latest_archived_log_number) { + files.push_back(std::move(log)); + } else { + // When the race condition happens, we could see the + // same log in both db dir and archived dir. Simply + // ignore the one in db dir. Note that, if we read + // archived dir first, we would have missed the log file. + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "%s already moved to archive", log->PathName().c_str()); + } + } + + return s; +} + +Status WalManager::GetUpdatesSince( + SequenceNumber seq, std::unique_ptr* iter, + const TransactionLogIterator::ReadOptions& read_options, + VersionSet* version_set) { + + // Get all sorted Wal Files. + // Do binary search and open files and find the seq number. 
+ + std::unique_ptr wal_files(new VectorLogPtr); + Status s = GetSortedWalFiles(*wal_files); + if (!s.ok()) { + return s; + } + + s = RetainProbableWalFiles(*wal_files, seq); + if (!s.ok()) { + return s; + } + iter->reset(new TransactionLogIteratorImpl( + db_options_.wal_dir, &db_options_, read_options, env_options_, seq, + std::move(wal_files), version_set)); + return (*iter)->status(); +} + +// 1. Go through all archived files and +// a. if ttl is enabled, delete outdated files +// b. if archive size limit is enabled, delete empty files, +// compute file number and size. +// 2. If size limit is enabled: +// a. compute how many files should be deleted +// b. get sorted non-empty archived logs +// c. delete what should be deleted +void WalManager::PurgeObsoleteWALFiles() { + bool const ttl_enabled = db_options_.WAL_ttl_seconds > 0; + bool const size_limit_enabled = db_options_.WAL_size_limit_MB > 0; + if (!ttl_enabled && !size_limit_enabled) { + return; + } + + int64_t current_time; + Status s = env_->GetCurrentTime(¤t_time); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Can't get current time: %s", s.ToString().c_str()); + assert(false); + return; + } + uint64_t const now_seconds = static_cast(current_time); + uint64_t const time_to_check = (ttl_enabled && !size_limit_enabled) + ? 
db_options_.WAL_ttl_seconds / 2 + : kDefaultIntervalToDeleteObsoleteWAL; + + if (purge_wal_files_last_run_ + time_to_check > now_seconds) { + return; + } + + purge_wal_files_last_run_ = now_seconds; + + std::string archival_dir = ArchivalDirectory(db_options_.wal_dir); + std::vector files; + s = env_->GetChildren(archival_dir, &files); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Can't get archive files: %s", s.ToString().c_str()); + assert(false); + return; + } + + size_t log_files_num = 0; + uint64_t log_file_size = 0; + + for (auto& f : files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kLogFile) { + std::string const file_path = archival_dir + "/" + f; + if (ttl_enabled) { + uint64_t file_m_time; + s = env_->GetFileModificationTime(file_path, &file_m_time); + if (!s.ok()) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Can't get file mod time: %s: %s", + file_path.c_str(), s.ToString().c_str()); + continue; + } + if (now_seconds - file_m_time > db_options_.WAL_ttl_seconds) { + s = env_->DeleteFile(file_path); + if (!s.ok()) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Can't delete file: %s: %s", + file_path.c_str(), s.ToString().c_str()); + continue; + } else { + MutexLock l(&read_first_record_cache_mutex_); + read_first_record_cache_.erase(number); + } + continue; + } + } + + if (size_limit_enabled) { + uint64_t file_size; + s = env_->GetFileSize(file_path, &file_size); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Unable to get file size: %s: %s", + file_path.c_str(), s.ToString().c_str()); + return; + } else { + if (file_size > 0) { + log_file_size = std::max(log_file_size, file_size); + ++log_files_num; + } else { + s = env_->DeleteFile(file_path); + if (!s.ok()) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Unable to delete file: %s: %s", + file_path.c_str(), s.ToString().c_str()); + continue; + } else { + MutexLock 
l(&read_first_record_cache_mutex_); + read_first_record_cache_.erase(number); + } + } + } + } + } + } + + if (0 == log_files_num || !size_limit_enabled) { + return; + } + + size_t const files_keep_num = + db_options_.WAL_size_limit_MB * 1024 * 1024 / log_file_size; + if (log_files_num <= files_keep_num) { + return; + } + + size_t files_del_num = log_files_num - files_keep_num; + VectorLogPtr archived_logs; + GetSortedWalsOfType(archival_dir, archived_logs, kArchivedLogFile); + + if (files_del_num > archived_logs.size()) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Trying to delete more archived log files than " + "exist. Deleting all"); + files_del_num = archived_logs.size(); + } + + for (size_t i = 0; i < files_del_num; ++i) { + std::string const file_path = archived_logs[i]->PathName(); + s = env_->DeleteFile(db_options_.wal_dir + "/" + file_path); + if (!s.ok()) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Unable to delete file: %s: %s", file_path.c_str(), + s.ToString().c_str()); + continue; + } else { + MutexLock l(&read_first_record_cache_mutex_); + read_first_record_cache_.erase(archived_logs[i]->LogNumber()); + } + } +} + +void WalManager::ArchiveWALFile(const std::string& fname, uint64_t number) { + auto archived_log_name = ArchivedLogFileName(db_options_.wal_dir, number); + // The sync point below is used in (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:1"); + Status s = env_->RenameFile(fname, archived_log_name); + // The sync point below is used in (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("WalManager::PurgeObsoleteFiles:2"); + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Move log file %s to %s -- %s\n", fname.c_str(), + archived_log_name.c_str(), s.ToString().c_str()); +} + +namespace { +struct CompareLogByPointer { + bool operator()(const std::unique_ptr& a, + const std::unique_ptr& b) { + LogFileImpl* a_impl = dynamic_cast(a.get()); + LogFileImpl* b_impl = 
dynamic_cast(b.get()); + return *a_impl < *b_impl; + } +}; +} + +Status WalManager::GetSortedWalsOfType(const std::string& path, + VectorLogPtr& log_files, + WalFileType log_type) { + std::vector all_files; + const Status status = env_->GetChildren(path, &all_files); + if (!status.ok()) { + return status; + } + log_files.reserve(all_files.size()); + for (const auto& f : all_files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kLogFile) { + SequenceNumber sequence; + Status s = ReadFirstRecord(log_type, number, &sequence); + if (!s.ok()) { + return s; + } + if (sequence == 0) { + // empty file + continue; + } + + // Reproduce the race condition where a log file is moved + // to archived dir, between these two sync points, used in + // (DBTest,TransactionLogIteratorRace) + TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:1"); + TEST_SYNC_POINT("WalManager::GetSortedWalsOfType:2"); + + uint64_t size_bytes; + s = env_->GetFileSize(LogFileName(path, number), &size_bytes); + // re-try in case the alive log file has been moved to archive. + std::string archived_file = ArchivedLogFileName(path, number); + if (!s.ok() && log_type == kAliveLogFile && + env_->FileExists(archived_file)) { + s = env_->GetFileSize(archived_file, &size_bytes); + if (!s.ok() && !env_->FileExists(archived_file)) { + // oops, the file just got deleted from archived dir! move on + s = Status::OK(); + continue; + } + } + if (!s.ok()) { + return s; + } + + log_files.push_back(std::move(std::unique_ptr( + new LogFileImpl(number, log_type, sequence, size_bytes)))); + } + } + CompareLogByPointer compare_log_files; + std::sort(log_files.begin(), log_files.end(), compare_log_files); + return status; +} + +Status WalManager::RetainProbableWalFiles(VectorLogPtr& all_logs, + const SequenceNumber target) { + int64_t start = 0; // signed to avoid overflow when target is < first file. + int64_t end = static_cast(all_logs.size()) - 1; + // Binary Search. 
avoid opening all files. + while (end >= start) { + int64_t mid = start + (end - start) / 2; // Avoid overflow. + SequenceNumber current_seq_num = all_logs.at(mid)->StartSequence(); + if (current_seq_num == target) { + end = mid; + break; + } else if (current_seq_num < target) { + start = mid + 1; + } else { + end = mid - 1; + } + } + // end could be -ve. + size_t start_index = std::max(static_cast(0), end); + // The last wal file is always included + all_logs.erase(all_logs.begin(), all_logs.begin() + start_index); + return Status::OK(); +} + +Status WalManager::ReadFirstRecord(const WalFileType type, + const uint64_t number, + SequenceNumber* sequence) { + *sequence = 0; + if (type != kAliveLogFile && type != kArchivedLogFile) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "[WalManger] Unknown file type %s", ToString(type).c_str()); + return Status::NotSupported( + "File Type Not Known " + ToString(type)); + } + { + MutexLock l(&read_first_record_cache_mutex_); + auto itr = read_first_record_cache_.find(number); + if (itr != read_first_record_cache_.end()) { + *sequence = itr->second; + return Status::OK(); + } + } + Status s; + if (type == kAliveLogFile) { + std::string fname = LogFileName(db_options_.wal_dir, number); + s = ReadFirstLine(fname, sequence); + if (env_->FileExists(fname) && !s.ok()) { + // return any error that is not caused by non-existing file + return s; + } + } + + if (type == kArchivedLogFile || !s.ok()) { + // check if the file got moved to archive. + std::string archived_file = + ArchivedLogFileName(db_options_.wal_dir, number); + s = ReadFirstLine(archived_file, sequence); + // maybe the file was deleted from archive dir. If that's the case, return + // Status::OK(). 
The caller with identify this as empty file because + // *sequence == 0 + if (!s.ok() && !env_->FileExists(archived_file)) { + return Status::OK(); + } + } + + if (s.ok() && *sequence != 0) { + MutexLock l(&read_first_record_cache_mutex_); + read_first_record_cache_.insert({number, *sequence}); + } + return s; +} + +// the function returns status.ok() and sequence == 0 if the file exists, but is +// empty +Status WalManager::ReadFirstLine(const std::string& fname, + SequenceNumber* sequence) { + struct LogReporter : public log::Reader::Reporter { + Env* env; + Logger* info_log; + const char* fname; + + Status* status; + bool ignore_error; // true if db_options_.paranoid_checks==false + virtual void Corruption(size_t bytes, const Status& s) { + Log(InfoLogLevel::WARN_LEVEL, info_log, + "[WalManager] %s%s: dropping %d bytes; %s", + (this->ignore_error ? "(ignoring error) " : ""), fname, + static_cast(bytes), s.ToString().c_str()); + if (this->status->ok()) { + // only keep the first error + *this->status = s; + } + } + }; + + std::unique_ptr file; + Status status = env_->NewSequentialFile(fname, &file, env_options_); + + if (!status.ok()) { + return status; + } + + LogReporter reporter; + reporter.env = env_; + reporter.info_log = db_options_.info_log.get(); + reporter.fname = fname.c_str(); + reporter.status = &status; + reporter.ignore_error = !db_options_.paranoid_checks; + log::Reader reader(std::move(file), &reporter, true /*checksum*/, + 0 /*initial_offset*/); + std::string scratch; + Slice record; + + if (reader.ReadRecord(&record, &scratch) && + (status.ok() || !db_options_.paranoid_checks)) { + if (record.size() < 12) { + reporter.Corruption(record.size(), + Status::Corruption("log record too small")); + // TODO read record's till the first no corrupt entry? 
+ } else { + WriteBatch batch; + WriteBatchInternal::SetContents(&batch, record); + *sequence = WriteBatchInternal::Sequence(&batch); + return Status::OK(); + } + } + + // ReadRecord returns false on EOF, which means that the log file is empty. we + // return status.ok() in that case and set sequence number to 0 + *sequence = 0; + return status; +} + +#endif // ROCKSDB_LITE +} // namespace rocksdb diff --git a/db/wal_manager.h b/db/wal_manager.h new file mode 100644 index 000000000..fc04863b2 --- /dev/null +++ b/db/wal_manager.h @@ -0,0 +1,95 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "port/port.h" + +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/types.h" +#include "rocksdb/transaction_log.h" +#include "rocksdb/status.h" + +#include "db/version_set.h" + +namespace rocksdb { + +#ifndef ROCKSDB_LITE +class WalManager { + public: + WalManager(const DBOptions& db_options, const EnvOptions& env_options) + : db_options_(db_options), + env_options_(env_options), + env_(db_options.env), + purge_wal_files_last_run_(0) {} + + Status GetSortedWalFiles(VectorLogPtr& files); + + Status GetUpdatesSince( + SequenceNumber seq_number, std::unique_ptr* iter, + const TransactionLogIterator::ReadOptions& read_options, + VersionSet* version_set); + + void PurgeObsoleteWALFiles(); + + void ArchiveWALFile(const std::string& fname, uint64_t number); + + Status TEST_ReadFirstRecord(const WalFileType type, const uint64_t number, + SequenceNumber* sequence) { + return ReadFirstRecord(type, number, sequence); + } + + Status TEST_ReadFirstLine(const std::string& fname, + SequenceNumber* sequence) { + return ReadFirstLine(fname, sequence); + } + + private: + Status GetSortedWalsOfType(const std::string& path, VectorLogPtr& log_files, + WalFileType type); + // Requires: all_logs should be sorted with earliest log file first + // Retains all log files in all_logs which contain updates with seq no. + // Greater Than or Equal to the requested SequenceNumber. 
+ Status RetainProbableWalFiles(VectorLogPtr& all_logs, + const SequenceNumber target); + + Status ReadFirstRecord(const WalFileType type, const uint64_t number, + SequenceNumber* sequence); + + Status ReadFirstLine(const std::string& fname, SequenceNumber* sequence); + + // ------- state from DBImpl ------ + const DBOptions& db_options_; + const EnvOptions& env_options_; + Env* env_; + + // ------- WalManager state ------- + // cache for ReadFirstRecord() calls + std::unordered_map read_first_record_cache_; + port::Mutex read_first_record_cache_mutex_; + + // last time when PurgeObsoleteWALFiles ran. + uint64_t purge_wal_files_last_run_; + + // obsolete files will be deleted every this seconds if ttl deletion is + // enabled and archive size_limit is disabled. + static const uint64_t kDefaultIntervalToDeleteObsoleteWAL = 600; +}; + +#endif // ROCKSDB_LITE +} // namespace rocksdb diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc new file mode 100644 index 000000000..bc12012ba --- /dev/null +++ b/db/wal_manager_test.cc @@ -0,0 +1,283 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include +#include + +#include "rocksdb/cache.h" +#include "rocksdb/write_batch.h" + +#include "db/wal_manager.h" +#include "db/log_writer.h" +#include "db/column_family.h" +#include "db/version_set.h" +#include "db/writebuffer.h" +#include "util/testharness.h" +#include "util/testutil.h" +#include "table/mock_table.h" +#include "db/db_impl.h" + +namespace rocksdb { + +// TODO(icanadi) mock out VersionSet +// TODO(icanadi) move other WalManager-specific tests from db_test here +class WalManagerTest { + public: + WalManagerTest() + : env_(Env::Default()), + dbname_(test::TmpDir() + "/wal_manager_test"), + table_cache_(NewLRUCache(50000, 16, 8)), + write_buffer_(db_options_.db_write_buffer_size), + current_log_number_(0) { + DestroyDB(dbname_, Options()); + } + + void Init() { + ASSERT_OK(env_->CreateDirIfMissing(dbname_)); + ASSERT_OK(env_->CreateDirIfMissing(ArchivalDirectory(dbname_))); + db_options_.db_paths.emplace_back(dbname_, + std::numeric_limits::max()); + db_options_.wal_dir = dbname_; + + versions_.reset(new VersionSet(dbname_, &db_options_, env_options_, + table_cache_.get(), &write_buffer_, + &write_controller_)); + + wal_manager_.reset(new WalManager(db_options_, env_options_)); + } + + void Reopen() { + wal_manager_.reset(new WalManager(db_options_, env_options_)); + } + + // NOT thread safe + void Put(const std::string& key, const std::string& value) { + assert(current_log_writer_.get() != nullptr); + uint64_t seq = versions_->LastSequence() + 1; + WriteBatch batch; + batch.Put(key, value); + WriteBatchInternal::SetSequence(&batch, seq); + current_log_writer_->AddRecord(WriteBatchInternal::Contents(&batch)); + versions_->SetLastSequence(seq); + } + + // NOT thread safe + void RollTheLog(bool archived) { + current_log_number_++; + std::string fname = ArchivedLogFileName(dbname_, current_log_number_); + unique_ptr file; + ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_)); + current_log_writer_.reset(new log::Writer(std::move(file))); + } 
+ + void CreateArchiveLogs(int num_logs, int entries_per_log) { + for (int i = 1; i <= num_logs; ++i) { + RollTheLog(true); + for (int k = 0; k < entries_per_log; ++k) { + Put(ToString(k), std::string(1024, 'a')); + } + } + } + + std::unique_ptr OpenTransactionLogIter( + const SequenceNumber seq) { + unique_ptr iter; + Status status = wal_manager_->GetUpdatesSince( + seq, &iter, TransactionLogIterator::ReadOptions(), versions_.get()); + ASSERT_OK(status); + return std::move(iter); + } + + Env* env_; + std::string dbname_; + WriteController write_controller_; + EnvOptions env_options_; + std::shared_ptr table_cache_; + DBOptions db_options_; + WriteBuffer write_buffer_; + std::unique_ptr versions_; + std::unique_ptr wal_manager_; + + std::unique_ptr current_log_writer_; + uint64_t current_log_number_; +}; + +TEST(WalManagerTest, ReadFirstRecordCache) { + Init(); + std::string path = dbname_ + "/000001.log"; + unique_ptr file; + ASSERT_OK(env_->NewWritableFile(path, &file, EnvOptions())); + + SequenceNumber s; + ASSERT_OK(wal_manager_->TEST_ReadFirstLine(path, &s)); + ASSERT_EQ(s, 0U); + + ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s)); + ASSERT_EQ(s, 0U); + + log::Writer writer(std::move(file)); + WriteBatch batch; + batch.Put("foo", "bar"); + WriteBatchInternal::SetSequence(&batch, 10); + writer.AddRecord(WriteBatchInternal::Contents(&batch)); + + // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here. 
+ // Waiting for lei to finish with db_test + // env_->count_sequential_reads_ = true; + // sequential_read_counter_ sanity test + // ASSERT_EQ(env_->sequential_read_counter_.Read(), 0); + + ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s)); + ASSERT_EQ(s, 10U); + // did a read + // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here + // ASSERT_EQ(env_->sequential_read_counter_.Read(), 1); + + ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s)); + ASSERT_EQ(s, 10U); + // no new reads since the value is cached + // TODO(icanadi) move SpecialEnv outside of db_test, so we can reuse it here + // ASSERT_EQ(env_->sequential_read_counter_.Read(), 1); +} + +namespace { +uint64_t GetLogDirSize(std::string dir_path, Env* env) { + uint64_t dir_size = 0; + std::vector files; + env->GetChildren(dir_path, &files); + for (auto& f : files) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kLogFile) { + std::string const file_path = dir_path + "/" + f; + uint64_t file_size; + env->GetFileSize(file_path, &file_size); + dir_size += file_size; + } + } + return dir_size; +} +std::vector ListSpecificFiles( + Env* env, const std::string& path, const FileType expected_file_type) { + std::vector files; + std::vector file_numbers; + env->GetChildren(path, &files); + uint64_t number; + FileType type; + for (size_t i = 0; i < files.size(); ++i) { + if (ParseFileName(files[i], &number, &type)) { + if (type == expected_file_type) { + file_numbers.push_back(number); + } + } + } + return std::move(file_numbers); +} + +int CountRecords(TransactionLogIterator* iter) { + int count = 0; + SequenceNumber lastSequence = 0; + BatchResult res; + while (iter->Valid()) { + res = iter->GetBatch(); + ASSERT_TRUE(res.sequence > lastSequence); + ++count; + lastSequence = res.sequence; + ASSERT_OK(iter->status()); + iter->Next(); + } + return count; +} +} // namespace + +TEST(WalManagerTest, WALArchivalSizeLimit) { + 
db_options_.WAL_ttl_seconds = 0; + db_options_.WAL_size_limit_MB = 1000; + Init(); + + // TEST : Create WalManager with huge size limit and no ttl. + // Create some archived files and call PurgeObsoleteWALFiles(). + // Count the archived log files that survived. + // Assert that all of them did. + // Change size limit. Re-open WalManager. + // Assert that archive is not greater than WAL_size_limit_MB after + // PurgeObsoleteWALFiles() + // Set ttl and time_to_check_ to small values. Re-open db. + // Assert that there are no archived logs left. + + std::string archive_dir = ArchivalDirectory(dbname_); + CreateArchiveLogs(20, 5000); + + std::vector log_files = + ListSpecificFiles(env_, archive_dir, kLogFile); + ASSERT_EQ(log_files.size(), 20U); + + db_options_.WAL_size_limit_MB = 8; + Reopen(); + wal_manager_->PurgeObsoleteWALFiles(); + + uint64_t archive_size = GetLogDirSize(archive_dir, env_); + ASSERT_TRUE(archive_size <= db_options_.WAL_size_limit_MB * 1024 * 1024); + + db_options_.WAL_ttl_seconds = 1; + env_->SleepForMicroseconds(2 * 1000 * 1000); + Reopen(); + wal_manager_->PurgeObsoleteWALFiles(); + + log_files = ListSpecificFiles(env_, archive_dir, kLogFile); + ASSERT_TRUE(log_files.empty()); +} + +TEST(WalManagerTest, WALArchivalTtl) { + db_options_.WAL_ttl_seconds = 1000; + Init(); + + // TEST : Create WalManager with a ttl and no size limit. + // Create some archived log files and call PurgeObsoleteWALFiles(). + // Assert that files are not deleted + // Reopen db with small ttl. + // Assert that all archived logs was removed. 
+ + std::string archive_dir = ArchivalDirectory(dbname_); + CreateArchiveLogs(20, 5000); + + std::vector log_files = + ListSpecificFiles(env_, archive_dir, kLogFile); + ASSERT_GT(log_files.size(), 0U); + + db_options_.WAL_ttl_seconds = 1; + env_->SleepForMicroseconds(3 * 1000 * 1000); + Reopen(); + wal_manager_->PurgeObsoleteWALFiles(); + + log_files = ListSpecificFiles(env_, archive_dir, kLogFile); + ASSERT_TRUE(log_files.empty()); +} + +TEST(WalManagerTest, TransactionLogIteratorMoveOverZeroFiles) { + Init(); + RollTheLog(false); + Put("key1", std::string(1024, 'a')); + // Create a zero record WAL file. + RollTheLog(false); + RollTheLog(false); + + Put("key2", std::string(1024, 'a')); + + auto iter = OpenTransactionLogIter(0); + ASSERT_EQ(2, CountRecords(iter.get())); +} + +TEST(WalManagerTest, TransactionLogIteratorJustEmptyFile) { + Init(); + RollTheLog(false); + auto iter = OpenTransactionLogIter(0); + // Check that an empty iterator is returned + ASSERT_TRUE(!iter->Valid()); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/db/write_batch.cc b/db/write_batch.cc index fdc0e2c6e..285a1b37d 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -23,7 +23,6 @@ // data: uint8[len] #include "rocksdb/write_batch.h" -#include "rocksdb/options.h" #include "rocksdb/merge_operator.h" #include "db/dbformat.h" #include "db/db_impl.h" @@ -49,20 +48,6 @@ WriteBatch::~WriteBatch() { } WriteBatch::Handler::~Handler() { } -void WriteBatch::Handler::Put(const Slice& key, const Slice& value) { - // you need to either implement Put or PutCF - throw std::runtime_error("Handler::Put not implemented!"); -} - -void WriteBatch::Handler::Merge(const Slice& key, const Slice& value) { - throw std::runtime_error("Handler::Merge not implemented!"); -} - -void WriteBatch::Handler::Delete(const Slice& key) { - // you need to either implement Delete or DeleteCF - throw std::runtime_error("Handler::Delete not 
implemented!"); -} - void WriteBatch::Handler::LogData(const Slice& blob) { // If the user has not specified something to do with blobs, then we ignore // them. @@ -295,21 +280,23 @@ void WriteBatch::PutLogData(const Slice& blob) { } namespace { +// This class can *only* be used from a single-threaded write thread, because it +// calls ColumnFamilyMemTablesImpl::Seek() class MemTableInserter : public WriteBatch::Handler { public: SequenceNumber sequence_; ColumnFamilyMemTables* cf_mems_; - bool recovery_; + bool ignore_missing_column_families_; uint64_t log_number_; DBImpl* db_; const bool dont_filter_deletes_; MemTableInserter(SequenceNumber sequence, ColumnFamilyMemTables* cf_mems, - bool recovery, uint64_t log_number, DB* db, - const bool dont_filter_deletes) + bool ignore_missing_column_families, uint64_t log_number, + DB* db, const bool dont_filter_deletes) : sequence_(sequence), cf_mems_(cf_mems), - recovery_(recovery), + ignore_missing_column_families_(ignore_missing_column_families), log_number_(log_number), db_(reinterpret_cast(db)), dont_filter_deletes_(dont_filter_deletes) { @@ -320,13 +307,21 @@ class MemTableInserter : public WriteBatch::Handler { } bool SeekToColumnFamily(uint32_t column_family_id, Status* s) { + // We are only allowed to call this from a single-threaded write thread + // (or while holding DB mutex) bool found = cf_mems_->Seek(column_family_id); - if (recovery_ && (!found || log_number_ < cf_mems_->GetLogNumber())) { - // if in recovery envoronment: - // * If column family was not found, it might mean that the WAL write - // batch references to the column family that was dropped after the - // insert. We don't want to fail the whole write batch in that case -- we - // just ignore the update. 
+ if (!found) { + if (ignore_missing_column_families_) { + *s = Status::OK(); + } else { + *s = Status::InvalidArgument( + "Invalid column family specified in write batch"); + } + return false; + } + if (log_number_ != 0 && log_number_ < cf_mems_->GetLogNumber()) { + // This is true only in recovery environment (log_number_ is always 0 in + // non-recovery, regular write code-path) // * If log_number_ < cf_mems_->GetLogNumber(), this means that column // family already contains updates from this log. We can't apply updates // twice because of update-in-place or merge workloads -- ignore the @@ -334,18 +329,8 @@ class MemTableInserter : public WriteBatch::Handler { *s = Status::OK(); return false; } - if (!found) { - assert(!recovery_); - // If the column family was not found in non-recovery enviornment - // (client's write code-path), we have to fail the write and return - // the failure status to the client. - *s = Status::InvalidArgument( - "Invalid column family specified in write batch"); - return false; - } return true; } - virtual Status PutCF(uint32_t column_family_id, const Slice& key, const Slice& value) { Status seek_status; @@ -354,14 +339,14 @@ class MemTableInserter : public WriteBatch::Handler { return seek_status; } MemTable* mem = cf_mems_->GetMemTable(); - const Options* options = cf_mems_->GetOptions(); - if (!options->inplace_update_support) { + auto* moptions = mem->GetMemTableOptions(); + if (!moptions->inplace_update_support) { mem->Add(sequence_, kTypeValue, key, value); - } else if (options->inplace_callback == nullptr) { + } else if (moptions->inplace_callback == nullptr) { mem->Update(sequence_, key, value); - RecordTick(options->statistics.get(), NUMBER_KEYS_UPDATED); + RecordTick(moptions->statistics, NUMBER_KEYS_UPDATED); } else { - if (mem->UpdateCallback(sequence_, key, value, *options)) { + if (mem->UpdateCallback(sequence_, key, value)) { } else { // key not found in memtable. 
Do sst get, update, add SnapshotImpl read_from_snapshot; @@ -379,18 +364,18 @@ class MemTableInserter : public WriteBatch::Handler { Status s = db_->Get(ropts, cf_handle, key, &prev_value); char* prev_buffer = const_cast(prev_value.c_str()); - uint32_t prev_size = prev_value.size(); - auto status = options->inplace_callback(s.ok() ? prev_buffer : nullptr, - s.ok() ? &prev_size : nullptr, - value, &merged_value); + uint32_t prev_size = static_cast(prev_value.size()); + auto status = moptions->inplace_callback(s.ok() ? prev_buffer : nullptr, + s.ok() ? &prev_size : nullptr, + value, &merged_value); if (status == UpdateStatus::UPDATED_INPLACE) { // prev_value is updated in-place with final value. mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size)); - RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN); + RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); } else if (status == UpdateStatus::UPDATED) { // merged_value contains the final value. mem->Add(sequence_, kTypeValue, key, Slice(merged_value)); - RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN); + RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN); } } } @@ -398,6 +383,7 @@ class MemTableInserter : public WriteBatch::Handler { // sequence number. Even if the update eventually fails and does not result // in memtable add/update. 
sequence_++; + cf_mems_->CheckMemtableFull(); return Status::OK(); } @@ -409,17 +395,17 @@ class MemTableInserter : public WriteBatch::Handler { return seek_status; } MemTable* mem = cf_mems_->GetMemTable(); - const Options* options = cf_mems_->GetOptions(); + auto* moptions = mem->GetMemTableOptions(); bool perform_merge = false; - if (options->max_successive_merges > 0 && db_ != nullptr) { + if (moptions->max_successive_merges > 0 && db_ != nullptr) { LookupKey lkey(key, sequence_); // Count the number of successive merges at the head // of the key in the memtable size_t num_merges = mem->CountSuccessiveMergeEntries(lkey); - if (num_merges >= options->max_successive_merges) { + if (num_merges >= moptions->max_successive_merges) { perform_merge = true; } } @@ -443,16 +429,16 @@ class MemTableInserter : public WriteBatch::Handler { Slice get_value_slice = Slice(get_value); // 2) Apply this merge - auto merge_operator = options->merge_operator.get(); + auto merge_operator = moptions->merge_operator; assert(merge_operator); std::deque operands; operands.push_front(value.ToString()); std::string new_value; if (!merge_operator->FullMerge(key, &get_value_slice, operands, - &new_value, options->info_log.get())) { + &new_value, moptions->info_log)) { // Failed to merge! 
- RecordTick(options->statistics.get(), NUMBER_MERGE_FAILURES); + RecordTick(moptions->statistics, NUMBER_MERGE_FAILURES); // Store the delta in memtable perform_merge = false; @@ -468,6 +454,7 @@ class MemTableInserter : public WriteBatch::Handler { } sequence_++; + cf_mems_->CheckMemtableFull(); return Status::OK(); } @@ -478,8 +465,8 @@ class MemTableInserter : public WriteBatch::Handler { return seek_status; } MemTable* mem = cf_mems_->GetMemTable(); - const Options* options = cf_mems_->GetOptions(); - if (!dont_filter_deletes_ && options->filter_deletes) { + auto* moptions = mem->GetMemTableOptions(); + if (!dont_filter_deletes_ && moptions->filter_deletes) { SnapshotImpl read_from_snapshot; read_from_snapshot.number_ = sequence_; ReadOptions ropts; @@ -490,23 +477,31 @@ class MemTableInserter : public WriteBatch::Handler { cf_handle = db_->DefaultColumnFamily(); } if (!db_->KeyMayExist(ropts, cf_handle, key, &value)) { - RecordTick(options->statistics.get(), NUMBER_FILTERED_DELETES); + RecordTick(moptions->statistics, NUMBER_FILTERED_DELETES); return Status::OK(); } } mem->Add(sequence_, kTypeDeletion, key, Slice()); sequence_++; + cf_mems_->CheckMemtableFull(); return Status::OK(); } }; } // namespace +// This function can only be called in these conditions: +// 1) During Recovery() +// 2) during Write(), in a single-threaded write thread +// The reason is that it calles ColumnFamilyMemTablesImpl::Seek(), which needs +// to be called from a single-threaded write thread (or while holding DB mutex) Status WriteBatchInternal::InsertInto(const WriteBatch* b, ColumnFamilyMemTables* memtables, - bool recovery, uint64_t log_number, - DB* db, const bool dont_filter_deletes) { + bool ignore_missing_column_families, + uint64_t log_number, DB* db, + const bool dont_filter_deletes) { MemTableInserter inserter(WriteBatchInternal::Sequence(b), memtables, - recovery, log_number, db, dont_filter_deletes); + ignore_missing_column_families, log_number, db, + 
dont_filter_deletes); return b->Iterate(&inserter); } diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index 9a191f4cb..793c0d40f 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -26,14 +26,14 @@ class ColumnFamilyMemTables { // been processed) virtual uint64_t GetLogNumber() const = 0; virtual MemTable* GetMemTable() const = 0; - virtual const Options* GetOptions() const = 0; virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0; + virtual void CheckMemtableFull() = 0; }; class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables { public: - ColumnFamilyMemTablesDefault(MemTable* mem, const Options* options) - : ok_(false), mem_(mem), options_(options) {} + explicit ColumnFamilyMemTablesDefault(MemTable* mem) + : ok_(false), mem_(mem) {} bool Seek(uint32_t column_family_id) override { ok_ = (column_family_id == 0); @@ -47,17 +47,13 @@ class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables { return mem_; } - const Options* GetOptions() const override { - assert(ok_); - return options_; - } - ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; } + void CheckMemtableFull() override {} + private: bool ok_; MemTable* mem_; - const Options* const options_; }; // WriteBatchInternal provides static methods for manipulating a @@ -106,18 +102,18 @@ class WriteBatchInternal { // Inserts batch entries into memtable // If dont_filter_deletes is false AND options.filter_deletes is true, // then --> Drops deletes in batch if db->KeyMayExist returns false - // If recovery == true, this means InsertInto is executed on a recovery - // code-path. WriteBatch referencing a dropped column family can be - // found on a recovery code-path and should be ignored (recovery should not - // fail). Additionally, the memtable will be updated only if + // If ignore_missing_column_families == true. WriteBatch referencing + // non-existing column family should be ignored. 
+ // However, if ignore_missing_column_families == false, any WriteBatch + // referencing non-existing column family will return a InvalidArgument() + // failure. + // + // If log_number is non-zero, the memtable will be updated only if // memtables->GetLogNumber() >= log_number - // However, if recovery == false, any WriteBatch referencing - // non-existing column family will return a failure. Also, log_number is - // ignored in that case static Status InsertInto(const WriteBatch* batch, ColumnFamilyMemTables* memtables, - bool recovery = false, uint64_t log_number = 0, - DB* db = nullptr, + bool ignore_missing_column_families = false, + uint64_t log_number = 0, DB* db = nullptr, const bool dont_filter_deletes = true); static void Append(WriteBatch* dst, const WriteBatch* src); diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 1d30552b3..c51d1750f 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -13,11 +13,13 @@ #include "db/memtable.h" #include "db/column_family.h" #include "db/write_batch_internal.h" +#include "db/writebuffer.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "rocksdb/utilities/write_batch_with_index.h" #include "util/logging.h" #include "util/testharness.h" +#include "util/scoped_arena_iterator.h" namespace rocksdb { @@ -26,13 +28,17 @@ static std::string PrintContents(WriteBatch* b) { auto factory = std::make_shared(); Options options; options.memtable_factory = factory; - MemTable* mem = new MemTable(cmp, options); + ImmutableCFOptions ioptions(options); + WriteBuffer wb(options.db_write_buffer_size); + MemTable* mem = new MemTable(cmp, ioptions, + MutableCFOptions(options, ioptions), &wb); mem->Ref(); std::string state; - ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); + ColumnFamilyMemTablesDefault cf_mems_default(mem); Status s = WriteBatchInternal::InsertInto(b, &cf_mems_default); int count = 0; - Iterator* iter = mem->NewIterator(ReadOptions()); + Arena arena; + 
ScopedArenaIterator iter(mem->NewIterator(ReadOptions(), &arena)); for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { ParsedInternalKey ikey; memset((void *)&ikey, 0, sizeof(ikey)); @@ -67,7 +73,6 @@ static std::string PrintContents(WriteBatch* b) { state.append("@"); state.append(NumberToString(ikey.sequence)); } - delete iter; if (!s.ok()) { state.append(s.ToString()); } else if (count != WriteBatchInternal::Count(b)) { @@ -152,7 +157,7 @@ namespace { if (column_family_id == 0) { seen += "Put(" + key.ToString() + ", " + value.ToString() + ")"; } else { - seen += "PutCF(" + std::to_string(column_family_id) + ", " + + seen += "PutCF(" + ToString(column_family_id) + ", " + key.ToString() + ", " + value.ToString() + ")"; } return Status::OK(); @@ -162,7 +167,7 @@ namespace { if (column_family_id == 0) { seen += "Merge(" + key.ToString() + ", " + value.ToString() + ")"; } else { - seen += "MergeCF(" + std::to_string(column_family_id) + ", " + + seen += "MergeCF(" + ToString(column_family_id) + ", " + key.ToString() + ", " + value.ToString() + ")"; } return Status::OK(); @@ -174,7 +179,7 @@ namespace { if (column_family_id == 0) { seen += "Delete(" + key.ToString() + ")"; } else { - seen += "DeleteCF(" + std::to_string(column_family_id) + ", " + + seen += "DeleteCF(" + ToString(column_family_id) + ", " + key.ToString() + ")"; } return Status::OK(); @@ -182,6 +187,39 @@ namespace { }; } +TEST(WriteBatchTest, MergeNotImplemented) { + WriteBatch batch; + batch.Merge(Slice("foo"), Slice("bar")); + ASSERT_EQ(1, batch.Count()); + ASSERT_EQ("Merge(foo, bar)@0", + PrintContents(&batch)); + + WriteBatch::Handler handler; + ASSERT_OK(batch.Iterate(&handler)); +} + +TEST(WriteBatchTest, PutNotImplemented) { + WriteBatch batch; + batch.Put(Slice("k1"), Slice("v1")); + ASSERT_EQ(1, batch.Count()); + ASSERT_EQ("Put(k1, v1)@0", + PrintContents(&batch)); + + WriteBatch::Handler handler; + ASSERT_OK(batch.Iterate(&handler)); +} + +TEST(WriteBatchTest, DeleteNotImplemented) { + 
WriteBatch batch; + batch.Delete(Slice("k2")); + ASSERT_EQ(1, batch.Count()); + ASSERT_EQ("Delete(k2)@0", + PrintContents(&batch)); + + WriteBatch::Handler handler; + ASSERT_OK(batch.Iterate(&handler)); +} + TEST(WriteBatchTest, Blob) { WriteBatch batch; batch.Put(Slice("k1"), Slice("v1")); @@ -287,6 +325,9 @@ class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl { explicit ColumnFamilyHandleImplDummy(int id) : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {} uint32_t GetID() const override { return id_; } + const Comparator* user_comparator() const override { + return BytewiseComparator(); + } private: uint32_t id_; @@ -318,7 +359,7 @@ TEST(WriteBatchTest, ColumnFamiliesBatchTest) { } TEST(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) { - WriteBatchWithIndex batch(BytewiseComparator(), 20); + WriteBatchWithIndex batch; ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8); batch.Put(&zero, Slice("foo"), Slice("bar")); batch.Put(&two, Slice("twofoo"), Slice("bar2")); diff --git a/db/write_controller.cc b/db/write_controller.cc new file mode 100644 index 000000000..bb6f8ecf7 --- /dev/null +++ b/db/write_controller.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "db/write_controller.h" + +#include + +namespace rocksdb { + +std::unique_ptr WriteController::GetStopToken() { + ++total_stopped_; + return std::unique_ptr(new StopWriteToken(this)); +} + +std::unique_ptr WriteController::GetDelayToken( + uint64_t delay_us) { + total_delay_us_ += delay_us; + return std::unique_ptr( + new DelayWriteToken(this, delay_us)); +} + +bool WriteController::IsStopped() const { return total_stopped_ > 0; } +uint64_t WriteController::GetDelay() const { return total_delay_us_; } + +StopWriteToken::~StopWriteToken() { + assert(controller_->total_stopped_ >= 1); + --controller_->total_stopped_; +} + +DelayWriteToken::~DelayWriteToken() { + assert(controller_->total_delay_us_ >= delay_us_); + controller_->total_delay_us_ -= delay_us_; +} + +} // namespace rocksdb diff --git a/db/write_controller.h b/db/write_controller.h new file mode 100644 index 000000000..32e1d58f1 --- /dev/null +++ b/db/write_controller.h @@ -0,0 +1,78 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include + +#include + +namespace rocksdb { + +class WriteControllerToken; + +// WriteController is controlling write stalls in our write code-path. Write +// stalls happen when compaction can't keep up with write rate. 
+// All of the methods here (including WriteControllerToken's destructors) need +// to be called while holding DB mutex +class WriteController { + public: + WriteController() : total_stopped_(0), total_delay_us_(0) {} + ~WriteController() = default; + + // When an actor (column family) requests a stop token, all writes will be + // stopped until the stop token is released (deleted) + std::unique_ptr GetStopToken(); + // When an actor (column family) requests a delay token, total delay for all + // writes will be increased by delay_us. The delay will last until delay token + // is released + std::unique_ptr GetDelayToken(uint64_t delay_us); + + // these two metods are querying the state of the WriteController + bool IsStopped() const; + uint64_t GetDelay() const; + + private: + friend class WriteControllerToken; + friend class StopWriteToken; + friend class DelayWriteToken; + + int total_stopped_; + uint64_t total_delay_us_; +}; + +class WriteControllerToken { + public: + explicit WriteControllerToken(WriteController* controller) + : controller_(controller) {} + virtual ~WriteControllerToken() {} + + protected: + WriteController* controller_; + + private: + // no copying allowed + WriteControllerToken(const WriteControllerToken&) = delete; + void operator=(const WriteControllerToken&) = delete; +}; + +class StopWriteToken : public WriteControllerToken { + public: + explicit StopWriteToken(WriteController* controller) + : WriteControllerToken(controller) {} + virtual ~StopWriteToken(); +}; + +class DelayWriteToken : public WriteControllerToken { + public: + DelayWriteToken(WriteController* controller, uint64_t delay_us) + : WriteControllerToken(controller), delay_us_(delay_us) {} + virtual ~DelayWriteToken(); + + private: + uint64_t delay_us_; +}; + +} // namespace rocksdb diff --git a/db/write_controller_test.cc b/db/write_controller_test.cc new file mode 100644 index 000000000..1cec9658d --- /dev/null +++ b/db/write_controller_test.cc @@ -0,0 +1,40 @@ +// Copyright 
(c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include "db/write_controller.h" + +#include "util/testharness.h" + +namespace rocksdb { + +class WriteControllerTest {}; + +TEST(WriteControllerTest, SanityTest) { + WriteController controller; + auto stop_token_1 = controller.GetStopToken(); + auto stop_token_2 = controller.GetStopToken(); + + ASSERT_EQ(true, controller.IsStopped()); + stop_token_1.reset(); + ASSERT_EQ(true, controller.IsStopped()); + stop_token_2.reset(); + ASSERT_EQ(false, controller.IsStopped()); + + auto delay_token_1 = controller.GetDelayToken(5); + ASSERT_EQ(static_cast(5), controller.GetDelay()); + auto delay_token_2 = controller.GetDelayToken(8); + ASSERT_EQ(static_cast(13), controller.GetDelay()); + + delay_token_2.reset(); + ASSERT_EQ(static_cast(5), controller.GetDelay()); + delay_token_1.reset(); + ASSERT_EQ(static_cast(0), controller.GetDelay()); + delay_token_1.reset(); + ASSERT_EQ(false, controller.IsStopped()); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/db/write_thread.cc b/db/write_thread.cc new file mode 100644 index 000000000..052e1209e --- /dev/null +++ b/db/write_thread.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "db/write_thread.h" + +namespace rocksdb { + +Status WriteThread::EnterWriteThread(WriteThread::Writer* w, + uint64_t expiration_time) { + // the following code block pushes the current writer "w" into the writer + // queue "writers_" and wait until one of the following conditions met: + // 1. the job of "w" has been done by some other writers. + // 2. "w" becomes the first writer in "writers_" + // 3. "w" timed-out. + writers_.push_back(w); + + bool timed_out = false; + while (!w->done && w != writers_.front()) { + if (expiration_time == 0) { + w->cv.Wait(); + } else if (w->cv.TimedWait(expiration_time)) { + if (w->in_batch_group) { + // then it means the front writer is currently doing the + // write on behalf of this "timed-out" writer. Then it + // should wait until the write completes. + expiration_time = 0; + } else { + timed_out = true; + break; + } + } + } + + if (timed_out) { +#ifndef NDEBUG + bool found = false; +#endif + for (auto iter = writers_.begin(); iter != writers_.end(); iter++) { + if (*iter == w) { + writers_.erase(iter); +#ifndef NDEBUG + found = true; +#endif + break; + } + } +#ifndef NDEBUG + assert(found); +#endif + // writers_.front() might still be in cond_wait without a time-out. + // As a result, we need to signal it to wake it up. Otherwise no + // one else will wake him up, and RocksDB will hang. + if (!writers_.empty()) { + writers_.front()->cv.Signal(); + } + return Status::TimedOut(); + } + return Status::OK(); +} + +void WriteThread::ExitWriteThread(WriteThread::Writer* w, + WriteThread::Writer* last_writer, + Status status) { + // Pop out the current writer and all writers being pushed before the + // current writer from the writer queue. 
+ while (!writers_.empty()) { + Writer* ready = writers_.front(); + writers_.pop_front(); + if (ready != w) { + ready->status = status; + ready->done = true; + ready->cv.Signal(); + } + if (ready == last_writer) break; + } + + // Notify new head of write queue + if (!writers_.empty()) { + writers_.front()->cv.Signal(); + } +} + +// This function will be called only when the first writer succeeds. +// All writers in the to-be-built batch group will be processed. +// +// REQUIRES: Writer list must be non-empty +// REQUIRES: First writer must have a non-nullptr batch +void WriteThread::BuildBatchGroup(WriteThread::Writer** last_writer, + autovector* write_batch_group) { + assert(!writers_.empty()); + Writer* first = writers_.front(); + assert(first->batch != nullptr); + + size_t size = WriteBatchInternal::ByteSize(first->batch); + write_batch_group->push_back(first->batch); + + // Allow the group to grow up to a maximum size, but if the + // original write is small, limit the growth so we do not slow + // down the small write too much. + size_t max_size = 1 << 20; + if (size <= (128<<10)) { + max_size = size + (128<<10); + } + + *last_writer = first; + std::deque::iterator iter = writers_.begin(); + ++iter; // Advance past "first" + for (; iter != writers_.end(); ++iter) { + Writer* w = *iter; + if (w->sync && !first->sync) { + // Do not include a sync write into a batch handled by a non-sync write. + break; + } + + if (!w->disableWAL && first->disableWAL) { + // Do not include a write that needs WAL into a batch that has + // WAL disabled. + break; + } + + if (w->timeout_hint_us < first->timeout_hint_us) { + // Do not include those writes with shorter timeout. Otherwise, we might + // execute a write that should instead be aborted because of timeout. + break; + } + + if (w->batch == nullptr) { + // Do not include those writes with nullptr batch. Those are not writes, + // those are something else. 
They want to be alone + break; + } + + size += WriteBatchInternal::ByteSize(w->batch); + if (size > max_size) { + // Do not make batch too big + break; + } + + write_batch_group->push_back(w->batch); + w->in_batch_group = true; + *last_writer = w; + } +} + +} // namespace rocksdb diff --git a/db/write_thread.h b/db/write_thread.h new file mode 100644 index 000000000..db3520244 --- /dev/null +++ b/db/write_thread.h @@ -0,0 +1,81 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include +#include +#include "rocksdb/status.h" +#include "db/write_batch_internal.h" +#include "util/autovector.h" +#include "port/port.h" +#include "util/instrumented_mutex.h" + +namespace rocksdb { + +class WriteThread { + public: + static const uint64_t kNoTimeOut = std::numeric_limits::max(); + // Information kept for every waiting writer + struct Writer { + Status status; + WriteBatch* batch; + bool sync; + bool disableWAL; + bool in_batch_group; + bool done; + uint64_t timeout_hint_us; + InstrumentedCondVar cv; + + explicit Writer(InstrumentedMutex* mu) + : batch(nullptr), + sync(false), + disableWAL(false), + in_batch_group(false), + done(false), + timeout_hint_us(kNoTimeOut), + cv(mu) {} + }; + + WriteThread() = default; + ~WriteThread() = default; + + // Before applying write operation (such as DBImpl::Write, DBImpl::Flush) + // thread should grab the mutex_ and be the first on writers queue. + // EnterWriteThread is used for it. + // Be aware! Writer's job can be done by other thread (see DBImpl::Write + // for examples), so check it via w.done before applying changes. 
+ // + // Writer* w: writer to be placed in the queue + // uint64_t expiration_time: maximum time to be in the queue + // See also: ExitWriteThread + // REQUIRES: db mutex held + Status EnterWriteThread(Writer* w, uint64_t expiration_time); + + // After doing write job, we need to remove already used writers from + // writers_ queue and notify head of the queue about it. + // ExitWriteThread is used for this. + // + // Writer* w: Writer, that was added by EnterWriteThread function + // Writer* last_writer: Since we can join a few Writers (as DBImpl::Write + // does) + // we should pass last_writer as a parameter to + // ExitWriteThread + // (if you don't touch other writers, just pass w) + // Status status: Status of write operation + // See also: EnterWriteThread + // REQUIRES: db mutex held + void ExitWriteThread(Writer* w, Writer* last_writer, Status status); + + void BuildBatchGroup(Writer** last_writer, + autovector* write_batch_group); + + private: + // Queue of writers. + std::deque writers_; +}; + +} // namespace rocksdb diff --git a/db/writebuffer.h b/db/writebuffer.h new file mode 100644 index 000000000..7047a9244 --- /dev/null +++ b/db/writebuffer.h @@ -0,0 +1,44 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// WriteBuffer is for managing memory allocation for one or more MemTables. 
+ +#pragma once + +namespace rocksdb { + +class WriteBuffer { + public: + explicit WriteBuffer(size_t _buffer_size) + : buffer_size_(_buffer_size), memory_used_(0) {} + + ~WriteBuffer() {} + + size_t memory_usage() const { return memory_used_; } + size_t buffer_size() const { return buffer_size_; } + + // Should only be called from write thread + bool ShouldFlush() const { + return buffer_size() > 0 && memory_usage() >= buffer_size(); + } + + // Should only be called from write thread + void ReserveMem(size_t mem) { memory_used_ += mem; } + void FreeMem(size_t mem) { memory_used_ -= mem; } + + private: + const size_t buffer_size_; + size_t memory_used_; + + // No copying allowed + WriteBuffer(const WriteBuffer&); + void operator=(const WriteBuffer&); +}; + +} // namespace rocksdb diff --git a/examples/.gitignore b/examples/.gitignore index d3c22099a..5cb04d4b6 100644 --- a/examples/.gitignore +++ b/examples/.gitignore @@ -1,2 +1,4 @@ column_families_example simple_example +c_simple_example +compact_files_example diff --git a/examples/Makefile b/examples/Makefile index 2567fdf86..7bd88fbf0 100644 --- a/examples/Makefile +++ b/examples/Makefile @@ -1,9 +1,23 @@ -include ../build_config.mk +include ../make_config.mk -all: simple_example column_families_example +.PHONY: clean + +all: simple_example column_families_example compact_files_example c_simple_example simple_example: simple_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) column_families_example: column_families_example.cc $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + +compact_files_example: compact_files_example.cc + $(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS) + +.c.o: + $(CC) $(CFLAGS) -c $< -o $@ -I../include + +c_simple_example: 
c_simple_example.o + $(CXX) $@.o -o$@ ../librocksdb.a $(PLATFORM_LDFLAGS) $(EXEC_LDFLAGS) + +clean: + rm -rf ./simple_example ./column_families_example ./compact_files_example ./c_simple_example c_simple_example.o diff --git a/examples/c_simple_example.c b/examples/c_simple_example.c new file mode 100644 index 000000000..7a6382765 --- /dev/null +++ b/examples/c_simple_example.c @@ -0,0 +1,74 @@ +#include +#include +#include +#include + +#include "rocksdb/c.h" + +#include // sysconf() - get CPU count + +const char DBPath[] = "/tmp/rocksdb_simple_example"; +const char DBBackupPath[] = "/tmp/rocksdb_simple_example_backup"; + +int main(int argc, char **argv) { + rocksdb_t *db; + rocksdb_backup_engine_t *be; + rocksdb_options_t *options = rocksdb_options_create(); + // Optimize RocksDB. This is the easiest way to + // get RocksDB to perform well + long cpus = sysconf(_SC_NPROCESSORS_ONLN); // get # of online cores + rocksdb_options_increase_parallelism(options, (int)(cpus)); + rocksdb_options_optimize_level_style_compaction(options, 0); + // create the DB if it's not already present + rocksdb_options_set_create_if_missing(options, 1); + + // open DB + char *err = NULL; + db = rocksdb_open(options, DBPath, &err); + assert(!err); + + // open Backup Engine that we will use for backing up or database + be = rocksdb_backup_engine_open(options, DBBackupPath, &err); + assert(!err); + + // Put key-value + rocksdb_writeoptions_t *writeoptions = rocksdb_writeoptions_create(); + const char key[] = "key"; + const char *value = "value"; + rocksdb_put(db, writeoptions, key, strlen(key), value, strlen(value) + 1, + &err); + assert(!err); + // Get value + rocksdb_readoptions_t *readoptions = rocksdb_readoptions_create(); + size_t len; + char *returned_value = + rocksdb_get(db, readoptions, key, strlen(key), &len, &err); + assert(!err); + assert(strcmp(returned_value, "value") == 0); + free(returned_value); + + // create new backup in a directory specified by DBBackupPath + 
rocksdb_backup_engine_create_new_backup(be, db, &err); + assert(!err); + + rocksdb_close(db); + + // If something is wrong, you might want to restore data from last backup + rocksdb_restore_options_t *restore_options = rocksdb_restore_options_create(); + rocksdb_backup_engine_restore_db_from_latest_backup(be, DBPath, DBPath, + restore_options, &err); + assert(!err); + rocksdb_restore_options_destroy(restore_options); + + db = rocksdb_open(options, DBPath, &err); + assert(!err); + + // cleanup + rocksdb_writeoptions_destroy(writeoptions); + rocksdb_readoptions_destroy(readoptions); + rocksdb_options_destroy(options); + rocksdb_backup_engine_close(be); + rocksdb_close(db); + + return 0; +} diff --git a/examples/column_families_example.cc b/examples/column_families_example.cc index 2bdf6ec42..3ffac064d 100644 --- a/examples/column_families_example.cc +++ b/examples/column_families_example.cc @@ -33,7 +33,7 @@ int main() { // open DB with two column families std::vector column_families; - // have to open default column familiy + // have to open default column family column_families.push_back(ColumnFamilyDescriptor( kDefaultColumnFamilyName, ColumnFamilyOptions())); // open the new one, too diff --git a/examples/compact_files_example.cc b/examples/compact_files_example.cc new file mode 100644 index 000000000..3e7638b7e --- /dev/null +++ b/examples/compact_files_example.cc @@ -0,0 +1,175 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// An example code demonstrating how to use CompactFiles, EventListener, +// and GetColumnFamilyMetaData APIs to implement custom compaction algorithm. 
+ +#include +#include +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/options.h" + +using namespace rocksdb; +std::string kDBPath = "/tmp/rocksdb_compact_files_example"; +class CompactionTask; + +// This is an example interface of external-compaction algorithm. +// Compaction algorithm can be implemented outside the core-RocksDB +// code by using the pluggable compaction APIs that RocksDb provides. +class Compactor : public EventListener { + public: + // Picks and returns a compaction task given the specified DB + // and column family. It is the caller's responsibility to + // destroy the returned CompactionTask. Returns "nullptr" + // if it cannot find a proper compaction task. + virtual CompactionTask* PickCompaction( + DB* db, const std::string& cf_name) = 0; + + // Schedule and run the specified compaction task in background. + virtual void ScheduleCompaction(CompactionTask *task) = 0; +}; + +// Example structure that describes a compaction task. +struct CompactionTask { + CompactionTask( + DB* db, Compactor* compactor, + const std::string& column_family_name, + const std::vector& input_file_names, + const int output_level, + const CompactionOptions& compact_options, + bool retry_on_fail) + : db(db), + compactor(compactor), + column_family_name(column_family_name), + input_file_names(input_file_names), + output_level(output_level), + compact_options(compact_options), + retry_on_fail(false) {} + DB* db; + Compactor* compactor; + const std::string& column_family_name; + std::vector input_file_names; + int output_level; + CompactionOptions compact_options; + bool retry_on_fail; +}; + +// A simple compaction algorithm that always compacts everything +// to the highest level whenever possible. 
+class FullCompactor : public Compactor { + public: + explicit FullCompactor(const Options options) : options_(options) { + compact_options_.compression = options_.compression; + compact_options_.output_file_size_limit = + options_.target_file_size_base; + } + + // When flush happens, it determins whether to trigger compaction. + // If triggered_writes_stop is true, it will also set the retry + // flag of compaction-task to true. + void OnFlushCompleted( + DB* db, const std::string& cf_name, + const std::string& file_path, + bool triggered_writes_slowdown, + bool triggered_writes_stop) override { + CompactionTask* task = PickCompaction(db, cf_name); + if (task != nullptr) { + if (triggered_writes_stop) { + task->retry_on_fail = true; + } + // Schedule compaction in a different thread. + ScheduleCompaction(task); + } + } + + // Always pick a compaction which includes all files whenever possible. + CompactionTask* PickCompaction( + DB* db, const std::string& cf_name) override { + ColumnFamilyMetaData cf_meta; + db->GetColumnFamilyMetaData(&cf_meta); + + std::vector input_file_names; + for (auto level : cf_meta.levels) { + for (auto file : level.files) { + if (file.being_compacted) { + return nullptr; + } + input_file_names.push_back(file.name); + } + } + return new CompactionTask( + db, this, cf_name, input_file_names, + options_.num_levels - 1, compact_options_, false); + } + + // Schedule the specified compaction task in background. 
+ void ScheduleCompaction(CompactionTask* task) override { + options_.env->Schedule(&FullCompactor::CompactFiles, task); + } + + static void CompactFiles(void* arg) { + CompactionTask* task = reinterpret_cast(arg); + assert(task); + assert(task->db); + Status s = task->db->CompactFiles( + task->compact_options, + task->input_file_names, + task->output_level); + printf("CompactFiles() finished with status %s\n", s.ToString().c_str()); + if (!s.ok() && !s.IsIOError() && task->retry_on_fail) { + // If a compaction task with its retry_on_fail=true failed, + // try to schedule another compaction in case the reason + // is not an IO error. + CompactionTask* new_task = task->compactor->PickCompaction( + task->db, task->column_family_name); + task->compactor->ScheduleCompaction(new_task); + } + // release the task + delete task; + } + + private: + Options options_; + CompactionOptions compact_options_; +}; + +int main() { + Options options; + options.create_if_missing = true; + // Disable RocksDB background compaction. + options.compaction_style = kCompactionStyleNone; + // Small slowdown and stop trigger for experimental purpose. + options.level0_slowdown_writes_trigger = 3; + options.level0_stop_writes_trigger = 5; + options.IncreaseParallelism(5); + options.listeners.emplace_back(new FullCompactor(options)); + + DB* db = nullptr; + DestroyDB(kDBPath, options); + Status s = DB::Open(options, kDBPath, &db); + assert(s.ok()); + assert(db); + + // if background compaction is not working, write will stall + // because of options.level0_stop_writes_trigger + for (int i = 1000; i < 99999; ++i) { + db->Put(WriteOptions(), std::to_string(i), + std::string(500, 'a' + (i % 26))); + } + + // verify the values are still there + std::string value; + for (int i = 1000; i < 99999; ++i) { + db->Get(ReadOptions(), std::to_string(i), + &value); + assert(value == std::string(500, 'a' + (i % 26))); + } + + // close the db. 
+ delete db; + + return 0; +} diff --git a/examples/simple_example.cc b/examples/simple_example.cc index 20e7faa4b..28a7c9e8b 100644 --- a/examples/simple_example.cc +++ b/examples/simple_example.cc @@ -27,14 +27,28 @@ int main() { assert(s.ok()); // Put key-value - s = db->Put(WriteOptions(), "key", "value"); + s = db->Put(WriteOptions(), "key1", "value"); assert(s.ok()); std::string value; // get value - s = db->Get(ReadOptions(), "key", &value); + s = db->Get(ReadOptions(), "key1", &value); assert(s.ok()); assert(value == "value"); + // atomically apply a set of updates + { + WriteBatch batch; + batch.Delete("key1"); + batch.Put("key2", value); + s = db->Write(WriteOptions(), &batch); + } + + s = db->Get(ReadOptions(), "key1", &value); + assert(s.IsNotFound()); + + db->Get(ReadOptions(), "key2", &value); + assert(value == "value"); + delete db; return 0; diff --git a/hdfs/env_hdfs.h b/hdfs/env_hdfs.h index 5e7de77d3..67f5613de 100644 --- a/hdfs/env_hdfs.h +++ b/hdfs/env_hdfs.h @@ -93,6 +93,8 @@ class HdfsEnv : public Env { virtual Status RenameFile(const std::string& src, const std::string& target); + virtual Status LinkFile(const std::string& src, const std::string& target); + virtual Status LockFile(const std::string& fname, FileLock** lock); virtual Status UnlockFile(FileLock* lock); @@ -145,6 +147,10 @@ class HdfsEnv : public Env { posixEnv->SetBackgroundThreads(number, pri); } + virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) override { + posixEnv->IncBackgroundThreadsIfNeeded(number, pri); + } + virtual std::string TimeToString(uint64_t number) { return posixEnv->TimeToString(number); } @@ -232,7 +238,7 @@ class HdfsEnv : public Env { explicit HdfsEnv(const std::string& fsname) { fprintf(stderr, "You have not build rocksdb with HDFS support\n"); fprintf(stderr, "Please see hdfs/README for details\n"); - throw std::exception(); + abort(); } virtual ~HdfsEnv() { @@ -287,6 +293,10 @@ class HdfsEnv : public Env { virtual Status 
RenameFile(const std::string& src, const std::string& target){return notsup;} + virtual Status LinkFile(const std::string& src, const std::string& target) { + return notsup; + } + virtual Status LockFile(const std::string& fname, FileLock** lock){return notsup;} virtual Status UnlockFile(FileLock* lock){return notsup;} @@ -319,7 +329,7 @@ class HdfsEnv : public Env { std::string* outputpath) {return notsup;} virtual void SetBackgroundThreads(int number, Priority pri = LOW) {} - + virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) {} virtual std::string TimeToString(uint64_t number) { return "";} }; } diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index c54e6707f..ac5f612a0 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -55,6 +55,9 @@ extern "C" { /* Exported types */ typedef struct rocksdb_t rocksdb_t; +typedef struct rocksdb_backup_engine_t rocksdb_backup_engine_t; +typedef struct rocksdb_backup_engine_info_t rocksdb_backup_engine_info_t; +typedef struct rocksdb_restore_options_t rocksdb_restore_options_t; typedef struct rocksdb_cache_t rocksdb_cache_t; typedef struct rocksdb_compactionfilter_t rocksdb_compactionfilter_t; typedef struct rocksdb_compactionfiltercontext_t @@ -77,6 +80,8 @@ typedef struct rocksdb_mergeoperator_t rocksdb_mergeoperator_t; typedef struct rocksdb_options_t rocksdb_options_t; typedef struct rocksdb_block_based_table_options_t rocksdb_block_based_table_options_t; +typedef struct rocksdb_cuckoo_table_options_t + rocksdb_cuckoo_table_options_t; typedef struct rocksdb_randomfile_t rocksdb_randomfile_t; typedef struct rocksdb_readoptions_t rocksdb_readoptions_t; typedef struct rocksdb_seqfile_t rocksdb_seqfile_t; @@ -102,6 +107,56 @@ extern rocksdb_t* rocksdb_open_for_read_only( unsigned char error_if_log_file_exist, char** errptr); +extern rocksdb_backup_engine_t* rocksdb_backup_engine_open( + const rocksdb_options_t* options, + const char* path, + char** errptr); + +extern void 
rocksdb_backup_engine_create_new_backup( + rocksdb_backup_engine_t* be, + rocksdb_t* db, + char** errptr); + +extern rocksdb_restore_options_t* rocksdb_restore_options_create(); +extern void rocksdb_restore_options_destroy(rocksdb_restore_options_t* opt); +extern void rocksdb_restore_options_set_keep_log_files( + rocksdb_restore_options_t* opt, int v); + +extern void rocksdb_backup_engine_restore_db_from_latest_backup( + rocksdb_backup_engine_t *be, + const char* db_dir, + const char* wal_dir, + const rocksdb_restore_options_t *restore_options, + char** errptr); + +extern const rocksdb_backup_engine_info_t* rocksdb_backup_engine_get_backup_info( + rocksdb_backup_engine_t* be); + +extern int rocksdb_backup_engine_info_count( + const rocksdb_backup_engine_info_t* info); + +extern const int64_t rocksdb_backup_engine_info_timestamp( + const rocksdb_backup_engine_info_t* info, + int index); + +extern const uint32_t rocksdb_backup_engine_info_backup_id( + const rocksdb_backup_engine_info_t* info, + int index); + +extern const uint64_t rocksdb_backup_engine_info_size( + const rocksdb_backup_engine_info_t* info, + int index); + +extern const uint32_t rocksdb_backup_engine_info_number_files( + const rocksdb_backup_engine_info_t* info, + int index); + +extern void rocksdb_backup_engine_info_destroy( + const rocksdb_backup_engine_info_t *info); + +extern void rocksdb_backup_engine_close( + rocksdb_backup_engine_t* be); + extern rocksdb_t* rocksdb_open_column_families( const rocksdb_options_t* options, const char* name, @@ -376,6 +431,25 @@ extern void rocksdb_block_based_options_set_whole_key_filtering( extern void rocksdb_options_set_block_based_table_factory( rocksdb_options_t *opt, rocksdb_block_based_table_options_t* table_options); +/* Cuckoo table options */ + +extern rocksdb_cuckoo_table_options_t* + rocksdb_cuckoo_options_create(); +extern void rocksdb_cuckoo_options_destroy( + rocksdb_cuckoo_table_options_t* options); +extern void 
rocksdb_cuckoo_options_set_hash_ratio( + rocksdb_cuckoo_table_options_t* options, double v); +extern void rocksdb_cuckoo_options_set_max_search_depth( + rocksdb_cuckoo_table_options_t* options, uint32_t v); +extern void rocksdb_cuckoo_options_set_cuckoo_block_size( + rocksdb_cuckoo_table_options_t* options, uint32_t v); +extern void rocksdb_cuckoo_options_set_identity_as_first_hash( + rocksdb_cuckoo_table_options_t* options, unsigned char v); +extern void rocksdb_cuckoo_options_set_use_module_hash( + rocksdb_cuckoo_table_options_t* options, unsigned char v); +extern void rocksdb_options_set_cuckoo_table_factory( + rocksdb_options_t *opt, rocksdb_cuckoo_table_options_t* table_options); + /* Options */ extern rocksdb_options_t* rocksdb_options_create(); @@ -419,6 +493,7 @@ extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*); extern void rocksdb_options_set_info_log_level(rocksdb_options_t*, int); extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t); extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int); +extern void rocksdb_options_set_max_total_wal_size(rocksdb_options_t* opt, uint64_t n); extern void rocksdb_options_set_compression_options( rocksdb_options_t*, int, int, int); extern void rocksdb_options_set_prefix_extractor( @@ -522,9 +597,6 @@ extern void rocksdb_options_set_hash_skip_list_rep(rocksdb_options_t*, size_t, i extern void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t*, size_t); extern void rocksdb_options_set_plain_table_factory(rocksdb_options_t*, uint32_t, int, double, size_t); -extern void rocksdb_options_set_max_bytes_for_level_base(rocksdb_options_t* opt, uint64_t n); -extern void rocksdb_options_set_stats_dump_period_sec(rocksdb_options_t* opt, unsigned int sec); - extern void rocksdb_options_set_min_level_to_compress(rocksdb_options_t* opt, int level); extern void rocksdb_options_set_memtable_prefix_bloom_bits( @@ -537,8 +609,6 @@ extern void 
rocksdb_options_set_min_partial_merge_operands( rocksdb_options_t*, uint32_t); extern void rocksdb_options_set_bloom_locality( rocksdb_options_t*, uint32_t); -extern void rocksdb_options_set_allow_thread_local( - rocksdb_options_t*, unsigned char); extern void rocksdb_options_set_inplace_update_support( rocksdb_options_t*, unsigned char); extern void rocksdb_options_set_inplace_update_num_locks( @@ -698,6 +768,10 @@ extern void rocksdb_readoptions_set_fill_cache( extern void rocksdb_readoptions_set_snapshot( rocksdb_readoptions_t*, const rocksdb_snapshot_t*); +extern void rocksdb_readoptions_set_iterate_upper_bound( + rocksdb_readoptions_t*, + const char* key, + size_t keylen); extern void rocksdb_readoptions_set_read_tier( rocksdb_readoptions_t*, int); extern void rocksdb_readoptions_set_tailing( diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 65d44b6cb..a8a6f9b73 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -127,9 +127,6 @@ class Cache { void LRU_Append(Handle* e); void Unref(Handle* e); - struct Rep; - Rep* rep_; - // No copying allowed Cache(const Cache&); void operator=(const Cache&); diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index f3a8499a8..5b7dc1021 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -62,6 +62,10 @@ class Comparator { // must not be deleted. extern const Comparator* BytewiseComparator(); +// Return a builtin comparator that uses reverse lexicographic byte-wise +// ordering. 
+extern const Comparator* ReverseBytewiseComparator(); + } // namespace rocksdb #endif // STORAGE_ROCKSDB_INCLUDE_COMPARATOR_H_ diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index d9be6b427..7cba31488 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -15,19 +15,36 @@ #include #include #include +#include "rocksdb/metadata.h" #include "rocksdb/version.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "rocksdb/types.h" #include "rocksdb/transaction_log.h" +#include "rocksdb/listener.h" +#include "rocksdb/thread_status.h" namespace rocksdb { +struct Options; +struct DBOptions; +struct ColumnFamilyOptions; +struct ReadOptions; +struct WriteOptions; +struct FlushOptions; +struct CompactionOptions; +struct TableProperties; +class WriteBatch; +class Env; +class EventListener; + using std::unique_ptr; class ColumnFamilyHandle { public: virtual ~ColumnFamilyHandle() {} + virtual const std::string& GetName() const = 0; + virtual uint32_t GetID() const = 0; }; extern const std::string kDefaultColumnFamilyName; @@ -44,31 +61,14 @@ struct ColumnFamilyDescriptor { static const int kMajorVersion = __ROCKSDB_MAJOR__; static const int kMinorVersion = __ROCKSDB_MINOR__; -struct Options; -struct ReadOptions; -struct WriteOptions; -struct FlushOptions; -struct TableProperties; -class WriteBatch; -class Env; - -// Metadata associated with each SST file. -struct LiveFileMetaData { - std::string column_family_name; // Name of the column family - std::string db_path; - std::string name; // Name of the file - int level; // Level at which this file resides. - size_t size; // File size in bytes. - std::string smallestkey; // Smallest user defined key in the file. - std::string largestkey; // Largest user defined key in the file. - SequenceNumber smallest_seqno; // smallest seqno in file - SequenceNumber largest_seqno; // largest seqno in file -}; - // Abstract handle to particular state of a DB. 
// A Snapshot is an immutable object and can therefore be safely // accessed from multiple threads without any external synchronization. class Snapshot { + public: + // returns Snapshot's sequence number + virtual SequenceNumber GetSequenceNumber() const = 0; + protected: virtual ~Snapshot(); }; @@ -106,6 +106,9 @@ class DB { // that modify data, like put/delete, will return error. // If the db is opened in read only mode, then no compactions // will happen. + // + // Not supported in ROCKSDB_LITE, in which case the function will + // return Status::NotSupported. static Status OpenForReadOnly(const Options& options, const std::string& name, DB** dbptr, bool error_if_log_file_exist = false); @@ -115,6 +118,9 @@ class DB { // database that should be opened. However, you always need to specify default // column family. The default column family name is 'default' and it's stored // in rocksdb::kDefaultColumnFamilyName + // + // Not supported in ROCKSDB_LITE, in which case the function will + // return Status::NotSupported. static Status OpenForReadOnly( const DBOptions& db_options, const std::string& name, const std::vector& column_families, @@ -123,7 +129,7 @@ class DB { // Open DB with column families. // db_options specify database specific options - // column_families is the vector of all column families in the databse, + // column_families is the vector of all column families in the database, // containing column family name and options. You need to open ALL column // families in the database. To get the list of column families, you can use // ListColumnFamilies(). Also, you can open only a subset of column families @@ -195,6 +201,8 @@ class DB { } // Apply the specified updates to the database. + // If `updates` contains no update, WAL will still be synced if + // options.sync=true. // Returns OK on success, non-OK on failure. // Note: consider setting options.sync = true. 
virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0; @@ -301,6 +309,22 @@ class DB { // about the internal operation of the DB. // "rocksdb.sstables" - returns a multi-line string that describes all // of the sstables that make up the db contents. + // "rocksdb.cfstats" + // "rocksdb.dbstats" + // "rocksdb.num-immutable-mem-table" + // "rocksdb.mem-table-flush-pending" + // "rocksdb.compaction-pending" - 1 if at least one compaction is pending + // "rocksdb.background-errors" - accumulated number of background errors + // "rocksdb.cur-size-active-mem-table" + // "rocksdb.cur-size-all-mem-tables" + // "rocksdb.num-entries-active-mem-table" + // "rocksdb.num-entries-imm-mem-tables" + // "rocksdb.estimate-num-keys" - estimated keys in the column family + // "rocksdb.estimate-table-readers-mem" - estimated memory used for reding + // SST tables, that is not counted as a part of block cache. + // "rocksdb.is-file-deletions-enabled" + // "rocksdb.num-snapshots" + // "rocksdb.oldest-snapshot-time" virtual bool GetProperty(ColumnFamilyHandle* column_family, const Slice& property, std::string* value) = 0; virtual bool GetProperty(const Slice& property, std::string* value) { @@ -308,7 +332,21 @@ class DB { } // Similar to GetProperty(), but only works for a subset of properties whose - // return value is an integer. Return the value by integer. + // return value is an integer. Return the value by integer. 
Supported + // properties: + // "rocksdb.num-immutable-mem-table" + // "rocksdb.mem-table-flush-pending" + // "rocksdb.compaction-pending" + // "rocksdb.background-errors" + // "rocksdb.cur-size-active-mem-table" + // "rocksdb.cur-size-all-mem-tables" + // "rocksdb.num-entries-active-mem-table" + // "rocksdb.num-entries-imm-mem-tables" + // "rocksdb.estimate-num-keys" + // "rocksdb.estimate-table-readers-mem" + // "rocksdb.is-file-deletions-enabled" + // "rocksdb.num-snapshots" + // "rocksdb.oldest-snapshot-time" virtual bool GetIntProperty(ColumnFamilyHandle* column_family, const Slice& property, uint64_t* value) = 0; virtual bool GetIntProperty(const Slice& property, uint64_t* value) { @@ -359,7 +397,35 @@ class DB { return CompactRange(DefaultColumnFamily(), begin, end, reduce_level, target_level, target_path_id); } + virtual Status SetOptions(ColumnFamilyHandle* column_family, + const std::unordered_map& new_options) { + return Status::NotSupported("Not implemented"); + } + virtual Status SetOptions( + const std::unordered_map& new_options) { + return SetOptions(DefaultColumnFamily(), new_options); + } + // CompactFiles() inputs a list of files specified by file numbers + // and compacts them to the specified level. Note that the behavior + // is different from CompactRange in that CompactFiles() will + // perform the compaction job using the CURRENT thread. + // + // @see GetDataBaseMetaData + // @see GetColumnFamilyMetaData + virtual Status CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, + const int output_level, const int output_path_id = -1) = 0; + + virtual Status CompactFiles( + const CompactionOptions& compact_options, + const std::vector& input_file_names, + const int output_level, const int output_path_id = -1) { + return CompactFiles(compact_options, DefaultColumnFamily(), + input_file_names, output_level, output_path_id); + } // Number of levels used for this DB. 
virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0; virtual int NumberLevels() { return NumberLevels(DefaultColumnFamily()); } @@ -466,6 +532,21 @@ class DB { // and end key virtual void GetLiveFilesMetaData(std::vector* metadata) {} + // Obtains the meta data of the specified column family of the DB. + // Status::NotFound() will be returned if the current DB does not have + // any column family match the specified name. + // + // If cf_name is not specified, then the metadata of the default + // column family will be returned. + virtual void GetColumnFamilyMetaData( + ColumnFamilyHandle* column_family, + ColumnFamilyMetaData* metadata) {} + + // Get the metadata of the default column family. + void GetColumnFamilyMetaData( + ColumnFamilyMetaData* metadata) { + GetColumnFamilyMetaData(DefaultColumnFamily(), metadata); + } #endif // ROCKSDB_LITE // Sets the globally unique ID created at database creation time by invoking diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 70244bb31..dfc598ff6 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -20,9 +20,11 @@ #include #include #include +#include #include #include #include "rocksdb/status.h" +#include "rocksdb/thread_status.h" namespace rocksdb { @@ -36,6 +38,7 @@ class RandomRWFile; class Directory; struct DBOptions; class RateLimiter; +class ThreadStatusUpdater; using std::unique_ptr; using std::shared_ptr; @@ -82,7 +85,8 @@ struct EnvOptions { class Env { public: - Env() { } + Env() : thread_status_updater_(nullptr) {} + virtual ~Env(); // Return a default environment suitable for the current operating @@ -177,6 +181,11 @@ class Env { virtual Status RenameFile(const std::string& src, const std::string& target) = 0; + // Hard Link file src to target. + virtual Status LinkFile(const std::string& src, const std::string& target) { + return Status::NotSupported("LinkFile is not supported for this Env"); + } + // Lock the specified file. 
Used to prevent concurrent access to // the same db by multiple processes. On failure, stores nullptr in // *lock and returns non-OK. @@ -272,6 +281,11 @@ class Env { // default number: 1 virtual void SetBackgroundThreads(int number, Priority pri = LOW) = 0; + // Enlarge number of background worker threads of a specific thread pool + // for this environment if it is smaller than specified. 'LOW' is the default + // pool. + virtual void IncBackgroundThreadsIfNeeded(int number, Priority pri) = 0; + // Lower IO priority for threads from the specified pool. virtual void LowerThreadPoolIOPriority(Priority pool = LOW) {} @@ -291,12 +305,34 @@ class Env { virtual EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options) const; + // Returns the status of all threads that belong to the current Env. + virtual Status GetThreadList(std::vector* thread_list) { + return Status::NotSupported("Not supported."); + } + + // Returns the pointer to ThreadStatusUpdater. This function will be + // used in RocksDB internally to update thread status and supports + // GetThreadList(). + virtual ThreadStatusUpdater* GetThreadStatusUpdater() const { + return thread_status_updater_; + } + + protected: + // The pointer to an internal structure that will update the + // status of each thread. + ThreadStatusUpdater* thread_status_updater_; + private: // No copying allowed Env(const Env&); void operator=(const Env&); }; +// The factory function to construct a ThreadStatusUpdater. Any Env +// that supports GetThreadList() feature should call this function in its +// constructor to initialize thread_status_updater_. 
+ThreadStatusUpdater* CreateThreadStatusUpdater(); + // A file abstraction for reading sequentially through a file class SequentialFile { public: @@ -471,8 +507,8 @@ class WritableFile { if (new_last_preallocated_block > last_preallocated_block_) { size_t num_spanned_blocks = new_last_preallocated_block - last_preallocated_block_; - Allocate(block_size * last_preallocated_block_, - block_size * num_spanned_blocks); + Allocate(static_cast(block_size * last_preallocated_block_), + static_cast(block_size * num_spanned_blocks)); last_preallocated_block_ = new_last_preallocated_block; } } @@ -575,11 +611,21 @@ enum InfoLogLevel : unsigned char { // An interface for writing log messages. class Logger { public: - enum { DO_NOT_SUPPORT_GET_LOG_FILE_SIZE = -1 }; + size_t kDoNotSupportGetLogFileSize = std::numeric_limits::max(); + explicit Logger(const InfoLogLevel log_level = InfoLogLevel::INFO_LEVEL) : log_level_(log_level) {} virtual ~Logger(); + // Write a header to the log file with the specified format + // It is recommended that you log all header information at the start of the + // application. But it is not enforced. + virtual void LogHeader(const char* format, va_list ap) { + // Default implementation does a simple INFO level log write. + // Please override as per the logger class requirement. + Logv(format, ap); + } + // Write an entry to the log file with the specified format. virtual void Logv(const char* format, va_list ap) = 0; @@ -587,7 +633,7 @@ class Logger { // and format. Any log with level under the internal log level // of *this (see @SetInfoLogLevel and @GetInfoLogLevel) will not be // printed. 
- void Logv(const InfoLogLevel log_level, const char* format, va_list ap) { + virtual void Logv(const InfoLogLevel log_level, const char* format, va_list ap) { static const char* kInfoLogLevelNames[5] = {"DEBUG", "INFO", "WARN", "ERROR", "FATAL"}; if (log_level < log_level_) { @@ -608,9 +654,7 @@ class Logger { Logv(new_format, ap); } } - virtual size_t GetLogFileSize() const { - return DO_NOT_SUPPORT_GET_LOG_FILE_SIZE; - } + virtual size_t GetLogFileSize() const { return kDoNotSupportGetLogFileSize; } // Flush to the OS buffers virtual void Flush() {} virtual InfoLogLevel GetInfoLogLevel() const { return log_level_; } @@ -643,6 +687,7 @@ extern void Log(const InfoLogLevel log_level, const shared_ptr& info_log, const char* format, ...); // a set of log functions with different log levels. +extern void Header(const shared_ptr& info_log, const char* format, ...); extern void Debug(const shared_ptr& info_log, const char* format, ...); extern void Info(const shared_ptr& info_log, const char* format, ...); extern void Warn(const shared_ptr& info_log, const char* format, ...); @@ -670,6 +715,7 @@ extern void Log(Logger* info_log, const char* format, ...) ; // a set of log functions with different log levels. 
+extern void Header(Logger* info_log, const char* format, ...); extern void Debug(Logger* info_log, const char* format, ...); extern void Info(Logger* info_log, const char* format, ...); extern void Warn(Logger* info_log, const char* format, ...); @@ -742,6 +788,11 @@ class EnvWrapper : public Env { Status RenameFile(const std::string& s, const std::string& t) { return target_->RenameFile(s, t); } + + Status LinkFile(const std::string& s, const std::string& t) { + return target_->LinkFile(s, t); + } + Status LockFile(const std::string& f, FileLock** l) { return target_->LockFile(f, l); } @@ -782,13 +833,27 @@ class EnvWrapper : public Env { void SetBackgroundThreads(int num, Priority pri) { return target_->SetBackgroundThreads(num, pri); } + + void IncBackgroundThreadsIfNeeded(int num, Priority pri) { + return target_->IncBackgroundThreadsIfNeeded(num, pri); + } + void LowerThreadPoolIOPriority(Priority pool = LOW) override { target_->LowerThreadPoolIOPriority(pool); } + std::string TimeToString(uint64_t time) { return target_->TimeToString(time); } + Status GetThreadList(std::vector* thread_list) { + return target_->GetThreadList(thread_list); + } + + ThreadStatusUpdater* GetThreadStatusUpdater() const override { + return target_->GetThreadStatusUpdater(); + } + private: Env* target_; }; diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h index fa44db45f..90aefb388 100644 --- a/include/rocksdb/filter_policy.h +++ b/include/rocksdb/filter_policy.h @@ -21,11 +21,52 @@ #define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ #include +#include namespace rocksdb { class Slice; +// A class that takes a bunch of keys, then generates filter +class FilterBitsBuilder { + public: + virtual ~FilterBitsBuilder() {} + + // Add Key to filter, you could use any way to store the key. + // Such as: storing hashes or original keys + // Keys are in sorted order and duplicated keys are possible. 
+ virtual void AddKey(const Slice& key) = 0; + + // Generate the filter using the keys that are added + // The return value of this function would be the filter bits, + // The ownership of actual data is set to buf + virtual Slice Finish(std::unique_ptr* buf) = 0; +}; + +// A class that checks if a key can be in filter +// It should be initialized by Slice generated by BitsBuilder +class FilterBitsReader { + public: + virtual ~FilterBitsReader() {} + + // Check if the entry match the bits in filter + virtual bool MayMatch(const Slice& entry) = 0; +}; + +// We add a new format of filter block called full filter block +// This new interface gives you more space of customization +// +// For the full filter block, you can plug in your version by implement +// the FilterBitsBuilder and FilterBitsReader +// +// There are two sets of interface in FilterPolicy +// Set 1: CreateFilter, KeyMayMatch: used for blockbased filter +// Set 2: GetFilterBitsBuilder, GetFilterBitsReader, they are used for +// full filter. +// Set 1 MUST be implemented correctly, Set 2 is optional +// RocksDB would first try using functions in Set 2. if they return nullptr, +// it would use Set 1 instead. +// You can choose filter type in NewBloomFilterPolicy class FilterPolicy { public: virtual ~FilterPolicy(); @@ -51,11 +92,28 @@ class FilterPolicy { // This method may return true or false if the key was not on the // list, but it should aim to return false with a high probability. 
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0; + + // Get the FilterBitsBuilder, which is ONLY used for full filter block + // It contains interface to take individual key, then generate filter + virtual FilterBitsBuilder* GetFilterBitsBuilder() const { + return nullptr; + } + + // Get the FilterBitsReader, which is ONLY used for full filter block + // It contains interface to tell if key can be in filter + // The input slice should NOT be deleted by FilterPolicy + virtual FilterBitsReader* GetFilterBitsReader(const Slice& contents) const { + return nullptr; + } }; // Return a new filter policy that uses a bloom filter with approximately -// the specified number of bits per key. A good value for bits_per_key +// the specified number of bits per key. +// +// bits_per_key: bits per key in bloom filter. A good value for bits_per_key // is 10, which yields a filter with ~ 1% false positive rate. +// use_block_based_builder: use block based filter rather than full fiter. +// If you want to builder full filter, it needs to be set to false. // // Callers must delete the result after any database that is using the // result has been closed. @@ -67,8 +125,8 @@ class FilterPolicy { // ignores trailing spaces, it would be incorrect to use a // FilterPolicy (like NewBloomFilterPolicy) that does not ignore // trailing spaces in keys. -extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key); - +extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key, + bool use_block_based_builder = true); } #endif // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ diff --git a/include/rocksdb/immutable_options.h b/include/rocksdb/immutable_options.h new file mode 100644 index 000000000..adf46d647 --- /dev/null +++ b/include/rocksdb/immutable_options.h @@ -0,0 +1,101 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include +#include "rocksdb/options.h" + +namespace rocksdb { + +// ImmutableCFOptions is a data struct used by RocksDB internal. It contains a +// subset of Options that should not be changed during the entire lifetime +// of DB. You shouldn't need to access this data structure unless you are +// implementing a new TableFactory. Raw pointers defined in this struct do +// not have ownership to the data they point to. Options contains shared_ptr +// to these data. +struct ImmutableCFOptions { + explicit ImmutableCFOptions(const Options& options); + + CompactionStyle compaction_style; + + CompactionOptionsUniversal compaction_options_universal; + CompactionOptionsFIFO compaction_options_fifo; + + const SliceTransform* prefix_extractor; + + const Comparator* comparator; + + MergeOperator* merge_operator; + + const CompactionFilter* compaction_filter; + + CompactionFilterFactory* compaction_filter_factory; + + CompactionFilterFactoryV2* compaction_filter_factory_v2; + + bool inplace_update_support; + + UpdateStatus (*inplace_callback)(char* existing_value, + uint32_t* existing_value_size, + Slice delta_value, + std::string* merged_value); + + Logger* info_log; + + Statistics* statistics; + + InfoLogLevel info_log_level; + + Env* env; + + // Allow the OS to mmap file for reading sst tables. Default: false + bool allow_mmap_reads; + + // Allow the OS to mmap file for writing. Default: false + bool allow_mmap_writes; + + std::vector db_paths; + + MemTableRepFactory* memtable_factory; + + TableFactory* table_factory; + + Options::TablePropertiesCollectorFactories + table_properties_collector_factories; + + bool advise_random_on_open; + + // This options is required by PlainTableReader. 
May need to move it + // to PlainTalbeOptions just like bloom_bits_per_key + uint32_t bloom_locality; + + bool purge_redundant_kvs_while_flush; + + uint32_t min_partial_merge_operands; + + bool disable_data_sync; + + bool use_fsync; + + CompressionType compression; + + std::vector compression_per_level; + + CompressionOptions compression_opts; + + Options::AccessHint access_hint_on_compaction_start; + + int num_levels; + +#ifndef ROCKSDB_LITE + // A vector of EventListeners which call-back functions will be called + // when specific RocksDB event happens. + std::vector> listeners; +#endif // ROCKSDB_LITE +}; + +} // namespace rocksdb diff --git a/include/rocksdb/listener.h b/include/rocksdb/listener.h new file mode 100644 index 000000000..be5b96032 --- /dev/null +++ b/include/rocksdb/listener.h @@ -0,0 +1,95 @@ +// Copyright (c) 2014 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include "rocksdb/status.h" + +namespace rocksdb { + +class DB; +class Status; + +struct CompactionJobInfo { + // the name of the column family where the compaction happened. + std::string cf_name; + // the status indicating whether the compaction was successful or not. + Status status; + // the output level of the compaction. + int output_level; + // the names of the compaction input files. + std::vector input_files; + // the names of the compaction output files. + std::vector output_files; +}; + +// EventListener class contains a set of call-back functions that will +// be called when specific RocksDB event happens such as flush. It can +// be used as a building block for developing custom features such as +// stats-collector or external compaction algorithm. 
+// +// Note that call-back functions should not run for an extended period of +// time before the function returns, otherwise RocksDB may be blocked. +// For example, it is not suggested to do DB::CompactFiles() (as it may +// run for a long while) or issue many of DB::Put() (as Put may be blocked +// in certain cases) in the same thread in the EventListener callback. +// However, doing DB::CompactFiles() and DB::Put() in another thread is +// considered safe. +// +// [Threading] All EventListener callback will be called using the +// actual thread that involves in that specific event. For example, it +// is the RocksDB background flush thread that does the actual flush to +// call EventListener::OnFlushCompleted(). +class EventListener { + public: + // A call-back function to RocksDB which will be called whenever a + // registered RocksDB flushes a file. The default implementation is + // no-op. + // + // Note that the this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + // + // @param db a pointer to the rocksdb instance which just flushed + // a memtable to disk. + // @param column_family_id the id of the flushed column family. + // @param file_path the path to the newly created file. + // @param triggered_writes_slowdown true when rocksdb is currently + // slowing-down all writes to prevent creating too many Level 0 + // files as compaction seems not able to catch up the write request + // speed. This indicates that there're too many files in Level 0. + // @param triggered_writes_stop true when rocksdb is currently blocking + // any writes to prevent creating more L0 files. This indicates that + // there're too many files in level 0. Compactions should try to + // compact L0 files down to lower levels as soon as possible. 
+ virtual void OnFlushCompleted( + DB* db, const std::string& column_family_name, + const std::string& file_path, + bool triggered_writes_slowdown, + bool triggered_writes_stop) {} + + // A call-back function for RocksDB which will be called whenever + // a registered RocksDB compacts a file. The default implementation + // is a no-op. + // + // Note that this function must be implemented in a way such that + // it should not run for an extended period of time before the function + // returns. Otherwise, RocksDB may be blocked. + // + // @param db a pointer to the rocksdb instance which just compacted + // a file. + // @param ci a reference to a CompactionJobInfo struct. 'ci' is released + // after this function is returned, and must be copied if it is needed + // outside of this function. + virtual void OnCompactionCompleted(DB *db, const CompactionJobInfo& ci) {} + virtual ~EventListener() {} +}; + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index b7fc39c81..97141cc73 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -14,8 +14,8 @@ // (4) Items are never deleted. // The liberal use of assertions is encouraged to enforce (1). // -// The factory will be passed an Arena object when a new MemTableRep is -// requested. The API for this object is in rocksdb/arena.h. +// The factory will be passed an MemTableAllocator object when a new MemTableRep +// is requested. // // Users can implement their own memtable representations. We include three // types built in: @@ -41,6 +41,7 @@ namespace rocksdb { class Arena; +class MemTableAllocator; class LookupKey; class Slice; class SliceTransform; @@ -65,7 +66,7 @@ class MemTableRep { virtual ~KeyComparator() { } }; - explicit MemTableRep(Arena* arena) : arena_(arena) {} + explicit MemTableRep(MemTableAllocator* allocator) : allocator_(allocator) {} // Allocate a buf of len size for storing key. 
The idea is that a specific // memtable representation knows its underlying data structure better. By @@ -101,7 +102,7 @@ class MemTableRep { bool (*callback_func)(void* arg, const char* entry)); // Report an approximation of how much memory has been used other than memory - // that was allocated through the arena. + // that was allocated through the allocator. virtual size_t ApproximateMemoryUsage() = 0; virtual ~MemTableRep() { } @@ -150,7 +151,7 @@ class MemTableRep { // Return an iterator that has a special Seek semantics. The result of // a Seek might only include keys with the same prefix as the target key. - // arena: If not null, the arena needs to be used to allocate the Iterator. + // arena: If not null, the arena is used to allocate the Iterator. // When destroying the iterator, the caller will not call "delete" // but Iterator::~Iterator() directly. The destructor needs to destroy // all the states but those allocated in arena. @@ -171,7 +172,7 @@ class MemTableRep { // user key. virtual Slice UserKey(const char* key) const; - Arena* arena_; + MemTableAllocator* allocator_; }; // This is the base class for all factories that are used by RocksDB to create @@ -180,18 +181,31 @@ class MemTableRepFactory { public: virtual ~MemTableRepFactory() {} virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, - Arena*, const SliceTransform*, + MemTableAllocator*, + const SliceTransform*, Logger* logger) = 0; virtual const char* Name() const = 0; }; // This uses a skip list to store keys. It is the default. +// +// Parameters: +// lookahead: If non-zero, each iterator's seek operation will start the +// search from the previously visited record (doing at most 'lookahead' +// steps). This is an optimization for the access pattern including many +// seeks with consecutive keys. 
class SkipListFactory : public MemTableRepFactory { public: + explicit SkipListFactory(size_t lookahead = 0) : lookahead_(lookahead) {} + virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, - Arena*, const SliceTransform*, + MemTableAllocator*, + const SliceTransform*, Logger* logger) override; virtual const char* Name() const override { return "SkipListFactory"; } + + private: + const size_t lookahead_; }; #ifndef ROCKSDB_LITE @@ -209,7 +223,8 @@ class VectorRepFactory : public MemTableRepFactory { public: explicit VectorRepFactory(size_t count = 0) : count_(count) { } virtual MemTableRep* CreateMemTableRep(const MemTableRep::KeyComparator&, - Arena*, const SliceTransform*, + MemTableAllocator*, + const SliceTransform*, Logger* logger) override; virtual const char* Name() const override { return "VectorRepFactory"; diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h new file mode 100644 index 000000000..e026fa96e --- /dev/null +++ b/include/rocksdb/metadata.h @@ -0,0 +1,90 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include +#include +#include + +#include "rocksdb/types.h" + +#pragma once + +namespace rocksdb { +struct ColumnFamilyMetaData; +struct LevelMetaData; +struct SstFileMetaData; + +// The metadata that describes a column family. +struct ColumnFamilyMetaData { + ColumnFamilyMetaData() : size(0), name("") {} + ColumnFamilyMetaData(const std::string& _name, uint64_t _size, + const std::vector&& _levels) : + size(_size), name(_name), levels(_levels) {} + + // The size of this column family in bytes, which is equal to the sum of + // the file size of its "levels". + uint64_t size; + // The number of files in this column family. 
+ size_t file_count; + // The name of the column family. + std::string name; + // The metadata of all levels in this column family. + std::vector levels; +}; + +// The metadata that describes a level. +struct LevelMetaData { + LevelMetaData(int _level, uint64_t _size, + const std::vector&& _files) : + level(_level), size(_size), + files(_files) {} + + // The level which this meta data describes. + const int level; + // The size of this level in bytes, which is equal to the sum of + // the file size of its "files". + const uint64_t size; + // The metadata of all sst files in this level. + const std::vector files; +}; + +// The metadata that describes a SST file. +struct SstFileMetaData { + SstFileMetaData() {} + SstFileMetaData(const std::string& _file_name, + const std::string& _path, uint64_t _size, + SequenceNumber _smallest_seqno, + SequenceNumber _largest_seqno, + const std::string& _smallestkey, + const std::string& _largestkey, + bool _being_compacted) : + size(_size), name(_file_name), + db_path(_path), smallest_seqno(_smallest_seqno), largest_seqno(_largest_seqno), + smallestkey(_smallestkey), largestkey(_largestkey), + being_compacted(_being_compacted) {} + + // File size in bytes. + uint64_t size; + // The name of the file. + std::string name; + // The full path where the file locates. + std::string db_path; + + SequenceNumber smallest_seqno; // Smallest sequence number in file. + SequenceNumber largest_seqno; // Largest sequence number in file. + std::string smallestkey; // Smallest user defined key in the file. + std::string largestkey; // Largest user defined key in the file. + bool being_compacted; // true if the file is currently being compacted. +}; + +// The full set of metadata associated with each SST file. +struct LiveFileMetaData : SstFileMetaData { + std::string column_family_name; // Name of the column family + int level; // Level at which this file resides. 
+}; + + + +} // namespace rocksdb diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 3569409c4..0541a7b34 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -10,12 +10,16 @@ #define STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ #include +#include #include #include #include +#include #include +#include #include "rocksdb/version.h" +#include "rocksdb/listener.h" #include "rocksdb/universal_compaction.h" namespace rocksdb { @@ -52,9 +56,18 @@ enum CompressionType : char { }; enum CompactionStyle : char { - kCompactionStyleLevel = 0x0, // level based compaction style - kCompactionStyleUniversal = 0x1, // Universal compaction style - kCompactionStyleFIFO = 0x2, // FIFO compaction style + // level based compaction style + kCompactionStyleLevel = 0x0, + // Universal compaction style + // Not supported in ROCKSDB_LITE. + kCompactionStyleUniversal = 0x1, + // FIFO compaction style + // Not supported in ROCKSDB_LITE + kCompactionStyleFIFO = 0x2, + // Disable background compaction. Compaction jobs are submitted + // via CompactFiles(). + // Not supported in ROCKSDB_LITE + kCompactionStyleNone = 0x3, }; struct CompactionOptionsFIFO { @@ -97,6 +110,8 @@ struct ColumnFamilyOptions { // Use this if you don't need to keep the data sorted, i.e. you'll never use // an iterator, only Put() and Get() API calls + // + // Not supported in ROCKSDB_LITE ColumnFamilyOptions* OptimizeForPointLookup( uint64_t block_cache_size_mb); @@ -114,6 +129,8 @@ struct ColumnFamilyOptions { // biggest performance gains. 
// Note: we might use more memory than memtable_memory_budget during high // write rate period + // + // OptimizeUniversalStyleCompaction is not supported in ROCKSDB_LITE ColumnFamilyOptions* OptimizeLevelStyleCompaction( uint64_t memtable_memory_budget = 512 * 1024 * 1024); ColumnFamilyOptions* OptimizeUniversalStyleCompaction( @@ -188,14 +205,22 @@ struct ColumnFamilyOptions { // Also, a larger write buffer will result in a longer recovery time // the next time the database is opened. // + // Note that write_buffer_size is enforced per column family. + // See db_write_buffer_size for sharing memory across column families. + // // Default: 4MB + // + // Dynamically changeable through SetOptions() API size_t write_buffer_size; // The maximum number of write buffers that are built up in memory. // The default and the minimum number is 2, so that when 1 write buffer // is being flushed to storage, new writes can continue to the other // write buffer. + // // Default: 2 + // + // Dynamically changeable through SetOptions() API int max_write_buffer_number; // The minimum number of write buffers that will be merged together @@ -224,17 +249,12 @@ struct ColumnFamilyOptions { CompressionType compression; // Different levels can have different compression policies. There - // are cases where most lower levels would like to quick compression - // algorithm while the higher levels (which have more data) use + // are cases where most lower levels would like to use quick compression + // algorithms while the higher levels (which have more data) use // compression algorithms that have better compression but could - // be slower. This array, if non nullptr, should have an entry for - // each level of the database. This array, if non nullptr, overides the - // value specified in the previous field 'compression'. The caller is - // reponsible for allocating memory and initializing the values in it - // before invoking Open(). 
The caller is responsible for freeing this - // array and it could be freed anytime after the return from Open(). - // This could have been a std::vector but that makes the equivalent - // java/C api hard to construct. + // be slower. This array, if non-empty, should have an entry for + // each level of the database; these override the value specified in + // the previous field 'compression'. std::vector compression_per_level; // different options for compression algorithms @@ -263,14 +283,20 @@ struct ColumnFamilyOptions { // level-0 compaction will not be triggered by number of files at all. // // Default: 4 + // + // Dynamically changeable through SetOptions() API int level0_file_num_compaction_trigger; // Soft limit on number of level-0 files. We start slowing down writes at this // point. A value <0 means that no writing slow down will be triggered by // number of files in level-0. + // + // Dynamically changeable through SetOptions() API int level0_slowdown_writes_trigger; // Maximum number of level-0 files. We stop writes at this point. + // + // Dynamically changeable through SetOptions() API int level0_stop_writes_trigger; // Maximum level to which a new compacted memtable is pushed if it @@ -279,6 +305,8 @@ struct ColumnFamilyOptions { // expensive manifest file operations. We do not push all the way to // the largest level since that can generate a lot of wasted disk // space if the same key space is being repeatedly overwritten. + // + // Dynamically changeable through SetOptions() API int max_mem_compaction_level; // Target file size for compaction. @@ -289,11 +317,16 @@ struct ColumnFamilyOptions { // target_file_size_multiplier is 10, then each file on level-1 will // be 2MB, and each file on level 2 will be 20MB, // and each file on level-3 will be 200MB. + // + // Default: 2MB. + // + // Dynamically changeable through SetOptions() API + uint64_t target_file_size_base; - // by default target_file_size_base is 2MB. 
- int target_file_size_base; - // by default target_file_size_multiplier is 1, which means + // By default target_file_size_multiplier is 1, which means // by default files in different levels will have similar size. + // + // Dynamically changeable through SetOptions() API int target_file_size_multiplier; // Control maximum total data size for a level. @@ -304,22 +337,32 @@ struct ColumnFamilyOptions { // max_bytes_for_level_multiplier is 10, total data size for level-1 // will be 20MB, total file size for level-2 will be 200MB, // and total file size for level-3 will be 2GB. - - // by default 'max_bytes_for_level_base' is 10MB. + // + // Default: 10MB. + // + // Dynamically changeable through SetOptions() API uint64_t max_bytes_for_level_base; - // by default 'max_bytes_for_level_base' is 10. + + // Default: 10. + // + // Dynamically changeable through SetOptions() API int max_bytes_for_level_multiplier; // Different max-size multipliers for different levels. // These are multiplied by max_bytes_for_level_multiplier to arrive // at the max-size of each level. + // // Default: 1 + // + // Dynamically changeable through SetOptions() API std::vector max_bytes_for_level_multiplier_additional; // Maximum number of bytes in all compacted files. We avoid expanding // the lower level file set of a compaction if it would make the // total compaction cover more than // (expanded_compaction_factor * targetFileSizeLevel()) many bytes. + // + // Dynamically changeable through SetOptions() API int expanded_compaction_factor; // Maximum number of bytes in all source files to be compacted in a @@ -329,27 +372,35 @@ struct ColumnFamilyOptions { // (source_compaction_factor * targetFileSizeLevel()) many bytes. // Default:1, i.e. pick maxfilesize amount of data as the source of // a compaction. 
+ // + // Dynamically changeable through SetOptions() API int source_compaction_factor; // Control maximum bytes of overlaps in grandparent (i.e., level+2) before we // stop building a single file in a level->level+1 compaction. + // + // Dynamically changeable through SetOptions() API int max_grandparent_overlap_factor; // Puts are delayed 0-1 ms when any level has a compaction score that exceeds // soft_rate_limit. This is ignored when == 0.0. // CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not // hold, RocksDB will set soft_rate_limit = hard_rate_limit + // // Default: 0 (disabled) + // + // Dynamically changeable through SetOptions() API double soft_rate_limit; // Puts are delayed 1ms at a time when any level has a compaction score that // exceeds hard_rate_limit. This is ignored when <= 1.0. + // // Default: 0 (disabled) + // + // Dynamically changeable through SetOptions() API double hard_rate_limit; - // Max time a put will be stalled when hard_rate_limit is enforced. If 0, then - // there is no limit. - // Default: 1000 + // DEPRECATED -- this options is no longer used unsigned int rate_limit_delay_max_milliseconds; // size of one block in arena memory allocation. @@ -365,10 +416,14 @@ struct ColumnFamilyOptions { // conforms to the restrictions. // // Default: 0 + // + // Dynamically changeable through SetOptions() API size_t arena_block_size; // Disable automatic compactions. Manual compactions can still // be issued on this column family + // + // Dynamically changeable through SetOptions() API bool disable_auto_compactions; // Purge duplicate/deleted keys when a memtable is flushed to storage. 
@@ -380,7 +435,10 @@ struct ColumnFamilyOptions { // If true, compaction will verify checksum on every read that happens // as part of compaction + // // Default: true + // + // Dynamically changeable through SetOptions() API bool verify_checksums_in_compaction; // The options needed to support Universal Style compactions @@ -393,14 +451,20 @@ struct ColumnFamilyOptions { // If KeyMayExist returns false, i.e. the key definitely does not exist, then // the delete is a noop. KeyMayExist only incurs in-memory look up. // This optimization avoids writing the delete to storage when appropriate. + // // Default: false + // + // Dynamically changeable through SetOptions() API bool filter_deletes; // An iteration->Next() sequentially skips over keys with the same // user-key unless this option is set. This number specifies the number // of keys (with the same userkey) that will be sequentially // skipped before a reseek is issued. + // // Default: 8 + // + // Dynamically changeable through SetOptions() API uint64_t max_sequential_skip_in_iterations; // This is a factory that provides MemTableRep objects. @@ -409,10 +473,24 @@ struct ColumnFamilyOptions { std::shared_ptr memtable_factory; // This is a factory that provides TableFactory objects. - // Default: a factory that provides a default implementation of - // Table and TableBuilder. + // Default: a block-based table factory that provides a default + // implementation of TableBuilder and TableReader with default + // BlockBasedTableOptions. std::shared_ptr table_factory; + // Block-based table related options are moved to BlockBasedTableOptions. 
+ // Related options that were originally here but now moved include: + // no_block_cache + // block_cache + // block_cache_compressed + // block_size + // block_size_deviation + // block_restart_interval + // filter_policy + // whole_key_filtering + // If you'd like to customize some of these options, you will need to + // use NewBlockBasedTableFactory() to construct a new table factory. + // This option allows user to to collect their own interested statistics of // the tables. // Default: empty vector -- no user-defined statistics collection will be @@ -423,7 +501,8 @@ struct ColumnFamilyOptions { // Allows thread-safe inplace updates. If this is true, there is no way to // achieve point-in-time consistency using snapshot or iterator (assuming - // concurrent updates). + // concurrent updates). Hence iterator and multi-get will return results + // which are not consistent as of any point-in-time. // If inplace_callback function is not set, // Put(key, new_value) will update inplace the existing_value iff // * key exists in current memtable @@ -435,6 +514,8 @@ struct ColumnFamilyOptions { // Number of locks used for inplace update // Default: 10000, if inplace_update_support = true, else 0. + // + // Dynamically changeable through SetOptions() API size_t inplace_update_num_locks; // existing_value - pointer to previous value (from both memtable and sst). @@ -481,9 +562,13 @@ struct ColumnFamilyOptions { // if prefix_extractor is set and bloom_bits is not 0, create prefix bloom // for memtable + // + // Dynamically changeable through SetOptions() API uint32_t memtable_prefix_bloom_bits; // number of hash probes per key + // + // Dynamically changeable through SetOptions() API uint32_t memtable_prefix_bloom_probes; // Page size for huge page TLB for bloom in memtable. If <=0, not allocate @@ -491,7 +576,8 @@ struct ColumnFamilyOptions { // Need to reserve huge pages for it to be allocated. 
For example: // sysctl -w vm.nr_hugepages=20 // See linux doc Documentation/vm/hugetlbpage.txt - + // + // Dynamically changeable through SetOptions() API size_t memtable_prefix_bloom_huge_page_tlb_size; // Control locality of bloom filter probes to improve cache miss rate. @@ -511,6 +597,8 @@ struct ColumnFamilyOptions { // operations in the memtable. // // Default: 0 (disabled) + // + // Dynamically changeable through SetOptions() API size_t max_successive_merges; // The number of partial merge operands to accumulate before partial @@ -522,6 +610,12 @@ struct ColumnFamilyOptions { // Default: 2 uint32_t min_partial_merge_operands; +#ifndef ROCKSDB_LITE + // A vector of EventListeners which call-back functions will be called + // when specific RocksDB event happens. + std::vector> listeners; +#endif // ROCKSDB_LITE + // Create ColumnFamilyOptions with default values for all fields ColumnFamilyOptions(); // Create ColumnFamilyOptions from Options @@ -533,12 +627,14 @@ struct ColumnFamilyOptions { struct DBOptions { // Some functions that make it easier to optimize RocksDB +#ifndef ROCKSDB_LITE // By default, RocksDB uses only one background thread for flush and // compaction. Calling this function will set it up such that total of // `total_threads` is used. Good value for `total_threads` is the number of // cores. You almost definitely want to call this function if your system is // bottlenecked by RocksDB. DBOptions* IncreaseParallelism(int total_threads = 16); +#endif // ROCKSDB_LITE // If true, the database will be created if it is missing. // Default: false @@ -552,14 +648,11 @@ struct DBOptions { // Default: false bool error_if_exists; - // If true, the implementation will do aggressive checking of the - // data it is processing and will stop early if it detects any - // errors. 
This may have unforeseen ramifications: for example, a - // corruption of one DB entry may cause a large number of entries to - // become unreadable or for the entire DB to become unopenable. - // If any of the writes to the database fails (Put, Delete, Merge, Write), - // the database will switch to read-only mode and fail all other + // If true, RocksDB will aggressively check consistency of the data. + // Also, if any of the writes to the database fails (Put, Delete, Merge, + // Write), the database will switch to read-only mode and fail all other // Write operations. + // In most cases you want this to be set to true. // Default: true bool paranoid_checks; @@ -594,7 +687,7 @@ struct DBOptions { // column families whose memtables are backed by the oldest live WAL file // (i.e. the ones that are causing all the space amplification). If set to 0 // (default), we will dynamically choose the WAL size limit to be - // [sum of all write_buffer_size * max_write_buffer_number] * 2 + // [sum of all write_buffer_size * max_write_buffer_number] * 4 // Default: 0 uint64_t max_total_wal_size; @@ -603,7 +696,7 @@ struct DBOptions { // it does not use any locks to prevent concurrent updates. std::shared_ptr statistics; - // If true, then the contents of data files are not synced + // If true, then the contents of manifest and data files are not synced // to stable storage. Their contents remain in the OS buffers till the // OS decides to flush them. This option is good for bulk-loading // of data. Once the bulk-loading is complete, please issue a @@ -758,9 +851,7 @@ struct DBOptions { // Disable child process inherit open files. 
Default: true bool is_fd_close_on_exec; - // Skip log corruption error on recovery (If client is ok with - // losing most recent changes) - // Default: false + // DEPRECATED -- this options is no longer used bool skip_log_error_on_recovery; // if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec @@ -772,15 +863,28 @@ struct DBOptions { // Default: true bool advise_random_on_open; + // Amount of data to build up in memtables across all column + // families before writing to disk. + // + // This is distinct from write_buffer_size, which enforces a limit + // for a single memtable. + // + // This feature is disabled by default. Specify a non-zero value + // to enable it. + // + // Default: 0 (disabled) + size_t db_write_buffer_size; + // Specify the file access pattern once a compaction is started. // It will be applied to all input files of a compaction. // Default: NORMAL - enum { - NONE, - NORMAL, - SEQUENTIAL, - WILLNEED - } access_hint_on_compaction_start; + enum AccessHint { + NONE, + NORMAL, + SEQUENTIAL, + WILLNEED + }; + AccessHint access_hint_on_compaction_start; // Use adaptive mutex, which spins in the user space before resorting // to kernel. This could reduce context switch when the mutex is not @@ -789,10 +893,6 @@ struct DBOptions { // Default: false bool use_adaptive_mutex; - // Allow RocksDB to use thread local storage to optimize performance. - // Default: true - bool allow_thread_local; - // Create DBOptions with default values for all fields DBOptions(); // Create DBOptions from Options @@ -809,6 +909,12 @@ struct DBOptions { // When rate limiter is enabled, it automatically enables bytes_per_sync // to 1MB. uint64_t bytes_per_sync; + + // If true, then the status of the threads involved in this DB will + // be tracked and available via GetThreadList() API. + // + // Default: false + bool enable_thread_tracking; }; // Options to control the behavior of a database (passed to DB::Open) @@ -889,6 +995,18 @@ struct ReadOptions { // ! 
DEPRECATED // const Slice* prefix; + // "iterate_upper_bound" defines the extent upto which the forward iterator + // can returns entries. Once the bound is reached, Valid() will be false. + // "iterate_upper_bound" is exclusive ie the bound value is + // not a valid entry. If iterator_extractor is not null, the Seek target + // and iterator_upper_bound need to have the same prefix. + // This is because ordering is not guaranteed outside of prefix domain. + // There is no lower bound on the iterator. If needed, that can be easily + // implemented + // + // Default: nullptr + const Slice* iterate_upper_bound; + // Specify if this read request should process data that ALREADY // resides on a particular cache. If the required data is not // found at the specified cache, then Status::Incomplete is returned. @@ -912,6 +1030,7 @@ struct ReadOptions { : verify_checksums(true), fill_cache(true), snapshot(nullptr), + iterate_upper_bound(nullptr), read_tier(kReadAllTier), tailing(false), total_order_seek(false) {} @@ -919,6 +1038,7 @@ struct ReadOptions { : verify_checksums(cksum), fill_cache(cache), snapshot(nullptr), + iterate_upper_bound(nullptr), read_tier(kReadAllTier), tailing(false), total_order_seek(false) {} @@ -959,7 +1079,17 @@ struct WriteOptions { // Default: 0 uint64_t timeout_hint_us; - WriteOptions() : sync(false), disableWAL(false), timeout_hint_us(0) {} + // If true and if user is trying to write to column families that don't exist + // (they were dropped), ignore the write (don't return an error). If there + // are multiple writes in a WriteBatch, other writes will succeed. 
+ // Default: false + bool ignore_missing_column_families; + + WriteOptions() + : sync(false), + disableWAL(false), + timeout_hint_us(0), + ignore_missing_column_families(false) {} }; // Options that control flush operations @@ -981,6 +1111,20 @@ extern Options GetOptions(size_t total_write_buffer_limit, int read_amplification_threshold = 8, int write_amplification_threshold = 32, uint64_t target_db_size = 68719476736 /* 64GB */); + +// CompactionOptions are used in CompactFiles() call. +struct CompactionOptions { + // Compaction output compression type + // Default: snappy + CompressionType compression; + // Compaction will create files of size `output_file_size_limit`. + // Default: MAX, which means that compaction will create a single file + uint64_t output_file_size_limit; + + CompactionOptions() + : compression(kSnappyCompression), + output_file_size_limit(std::numeric_limits::max()) {} +}; } // namespace rocksdb #endif // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index e96d09d2a..18c186a95 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -65,6 +65,9 @@ struct PerfContext { uint64_t write_wal_time; // total time spent on writing to WAL // total time spent on writing to mem tables uint64_t write_memtable_time; + uint64_t db_mutex_lock_nanos; // time spent on acquiring DB mutex. + // Time spent on waiting with a condition variable created with DB mutex. + uint64_t db_condition_wait_nanos; }; #if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE) diff --git a/include/rocksdb/slice.h b/include/rocksdb/slice.h index 406a8abb9..05d0f9df6 100644 --- a/include/rocksdb/slice.h +++ b/include/rocksdb/slice.h @@ -123,7 +123,7 @@ inline bool operator!=(const Slice& x, const Slice& y) { } inline int Slice::compare(const Slice& b) const { - const int min_len = (size_ < b.size_) ? size_ : b.size_; + const size_t min_len = (size_ < b.size_) ? 
size_ : b.size_; int r = memcmp(data_, b.data_, min_len); if (r == 0) { if (size_ < b.size_) r = -1; diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h index a78455001..3694c5802 100644 --- a/include/rocksdb/slice_transform.h +++ b/include/rocksdb/slice_transform.h @@ -36,10 +36,39 @@ class SliceTransform { // determine whether dst=Transform(src) for some src virtual bool InRange(const Slice& dst) const = 0; + + // Transform(s)=Transform(`prefix`) for any s with `prefix` as a prefix. + // + // This function is not used by RocksDB, but for users. If users pass + // Options by string to RocksDB, they might not know what prefix extractor + // they are using. This function is to help users determine: + // if they want to iterate all keys prefixing `prefix`, whether it is + // safe to use prefix bloom filter and seek to key `prefix`. + // If this function returns true, this means a user can Seek() to a prefix + // using the bloom filter. Otherwise, user needs to skip the bloom filter + // by setting ReadOptions.total_order_seek = true. + // + // Here is an example: Suppose we implement a slice transform that returns + // the first part of the string after splitting it using delimiter ",": + // 1. SameResultWhenAppended("abc,") should return true. If applying prefix + // bloom filter using it, all slices matching "abc,.*" will be extracted + // to "abc,", so any SST file or memtable containing any of those keys + // will not be filtered out. + // 2. SameResultWhenAppended("abc") should return false. A user is not + // guaranteed to see all the keys matching "abc.*" if a user seeks to "abc" + // against a DB with the same setting. If one SST file only contains + // "abcd,e", the file can be filtered out and the key will be invisible. + // + // i.e., an implementation always returning false is safe. 
+ virtual bool SameResultWhenAppended(const Slice& prefix) const { + return false; + } }; extern const SliceTransform* NewFixedPrefixTransform(size_t prefix_len); +extern const SliceTransform* NewCappedPrefixTransform(size_t cap_len); + extern const SliceTransform* NewNoopTransform(); } diff --git a/include/rocksdb/sst_dump_tool.h b/include/rocksdb/sst_dump_tool.h new file mode 100644 index 000000000..39bfb519b --- /dev/null +++ b/include/rocksdb/sst_dump_tool.h @@ -0,0 +1,17 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#ifndef ROCKSDB_LITE +#pragma once + +namespace rocksdb { + +class SSTDumpTool { + public: + int Run(int argc, char** argv); +}; + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 6785833b4..c5b364a0c 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -53,6 +53,13 @@ enum Tickers : uint32_t { // # of memtable misses. MEMTABLE_MISS, + // # of Get() queries served by L0 + GET_HIT_L0, + // # of Get() queries served by L1 + GET_HIT_L1, + // # of Get() queries served by L2 and up + GET_HIT_L2_AND_UP, + /** * COMPACTION_KEY_DROP_* count the reasons for key drop during compaction * There are 3 reasons currently. @@ -73,12 +80,16 @@ enum Tickers : uint32_t { NO_FILE_CLOSES, NO_FILE_OPENS, NO_FILE_ERRORS, - // Time system had to wait to do LO-L1 compactions + // DEPRECATED Time system had to wait to do LO-L1 compactions STALL_L0_SLOWDOWN_MICROS, - // Time system had to wait to move memtable to L1. + // DEPRECATED Time system had to wait to move memtable to L1. 
STALL_MEMTABLE_COMPACTION_MICROS, - // write throttle because of too many files in L0 + // DEPRECATED write throttle because of too many files in L0 STALL_L0_NUM_FILES_MICROS, + // Writer has to wait for compaction or flush to finish. + STALL_MICROS, + // The wait time for db mutex. + DB_MUTEX_WAIT_MICROS, RATE_LIMIT_DELAY_MILLIS, NO_ITERATORS, // number of iterators currently open @@ -115,7 +126,7 @@ enum Tickers : uint32_t { // head of the writers queue. WRITE_DONE_BY_SELF, WRITE_DONE_BY_OTHER, - WRITE_TIMEDOUT, // Number of writes ending up with timed-out. + WRITE_TIMEDOUT, // Number of writes ending up with timed-out. WRITE_WITH_WAL, // Number of Write calls that request WAL COMPACT_READ_BYTES, // Bytes read during compaction COMPACT_WRITE_BYTES, // Bytes written during compaction @@ -146,6 +157,9 @@ const std::vector> TickersNameMap = { {BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"}, {MEMTABLE_HIT, "rocksdb.memtable.hit"}, {MEMTABLE_MISS, "rocksdb.memtable.miss"}, + {GET_HIT_L0, "rocksdb.l0.hit"}, + {GET_HIT_L1, "rocksdb.l1.hit"}, + {GET_HIT_L2_AND_UP, "rocksdb.l2andup.hit"}, {COMPACTION_KEY_DROP_NEWER_ENTRY, "rocksdb.compaction.key.drop.new"}, {COMPACTION_KEY_DROP_OBSOLETE, "rocksdb.compaction.key.drop.obsolete"}, {COMPACTION_KEY_DROP_USER, "rocksdb.compaction.key.drop.user"}, @@ -160,6 +174,8 @@ const std::vector> TickersNameMap = { {STALL_L0_SLOWDOWN_MICROS, "rocksdb.l0.slowdown.micros"}, {STALL_MEMTABLE_COMPACTION_MICROS, "rocksdb.memtable.compaction.micros"}, {STALL_L0_NUM_FILES_MICROS, "rocksdb.l0.num.files.stall.micros"}, + {STALL_MICROS, "rocksdb.stall.micros"}, + {DB_MUTEX_WAIT_MICROS, "rocksdb.db.mutex.wait.micros"}, {RATE_LIMIT_DELAY_MILLIS, "rocksdb.rate.limit.delay.millis"}, {NO_ITERATORS, "rocksdb.num.iterators"}, {NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get"}, @@ -212,7 +228,6 @@ enum Histograms : uint32_t { READ_BLOCK_COMPACTION_MICROS, READ_BLOCK_GET_MICROS, WRITE_RAW_BLOCK_MICROS, - STALL_L0_SLOWDOWN_COUNT, 
STALL_MEMTABLE_COMPACTION_COUNT, STALL_L0_NUM_FILES_COUNT, @@ -220,6 +235,7 @@ enum Histograms : uint32_t { SOFT_RATE_LIMIT_DELAY_COUNT, NUM_FILES_IN_SINGLE_COMPACTION, DB_SEEK, + WRITE_STALL, HISTOGRAM_ENUM_MAX, }; diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index b20689a77..177d705f3 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -61,6 +61,9 @@ class Status { static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kIncomplete, msg, msg2); } + static Status ShutdownInProgress() { + return Status(kShutdownInProgress); + } static Status ShutdownInProgress(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kShutdownInProgress, msg, msg2); @@ -71,6 +74,12 @@ class Status { static Status TimedOut(const Slice& msg, const Slice& msg2 = Slice()) { return Status(kTimedOut, msg, msg2); } + static Status Aborted() { + return Status(kAborted); + } + static Status Aborted(const Slice& msg, const Slice& msg2 = Slice()) { + return Status(kAborted, msg, msg2); + } // Returns true iff the status indicates success. bool ok() const { return code() == kOk; } @@ -96,11 +105,13 @@ class Status { // Returns true iff the status indicates Incomplete bool IsIncomplete() const { return code() == kIncomplete; } - // Returns true iff the status indicates Incomplete + // Returns true iff the status indicates Shutdown In progress bool IsShutdownInProgress() const { return code() == kShutdownInProgress; } bool IsTimedOut() const { return code() == kTimedOut; } + bool IsAborted() const { return code() == kAborted; } + // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. 
std::string ToString() const; @@ -115,7 +126,8 @@ class Status { kMergeInProgress = 6, kIncomplete = 7, kShutdownInProgress = 8, - kTimedOut = 9 + kTimedOut = 9, + kAborted = 10 }; Code code() const { @@ -130,8 +142,8 @@ class Status { Code code_; const char* state_; - explicit Status(Code code) : code_(code), state_(nullptr) { } - Status(Code code, const Slice& msg, const Slice& msg2); + explicit Status(Code _code) : code_(_code), state_(nullptr) {} + Status(Code _code, const Slice& msg, const Slice& msg2); static const char* CopyState(const char* s); }; diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 3a47ed939..b67eeffef 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -23,6 +23,7 @@ #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/options.h" +#include "rocksdb/immutable_options.h" #include "rocksdb/status.h" namespace rocksdb { @@ -124,6 +125,22 @@ struct BlockBasedTableOptions { // If true, place whole keys in the filter (not just prefixes). // This must generally be true for gets to be efficient. bool whole_key_filtering = true; + + // We currently have three versions: + // 0 -- This version is currently written out by all RocksDB's versions by + // default. Can be read by really old RocksDB's. Doesn't support changing + // checksum (default is CRC32). + // 1 -- Can be read by RocksDB's versions since 3.0. Supports non-default + // checksum, like xxHash. It is written by RocksDB when + // BlockBasedTableOptions::checksum is something other than kCRC32c. (version + // 0 is silently upconverted) + // 2 -- Can be read by RocksDB's versions since 3.10. Changes the way we + // encode compressed blocks with LZ4, BZip2 and Zlib compression. If you + // don't plan to run RocksDB before version 3.10, you should probably use + // this. + // This option only affects newly written tables. When reading exising tables, + // the information about version is read from the footer. 
+ uint32_t format_version = 0; }; // Table Properties that are specific to block-based table properties. @@ -250,23 +267,46 @@ struct CuckooTablePropertyNames { // Denotes if the key sorted in the file is Internal Key (if false) // or User Key only (if true). static const std::string kIsLastLevel; + // Indicate if using identity function for the first hash function. + static const std::string kIdentityAsFirstHash; + // Indicate if using module or bit and to calculate hash value + static const std::string kUseModuleHash; + // Fixed user key length + static const std::string kUserKeyLength; +}; + +struct CuckooTableOptions { + // Determines the utilization of hash tables. Smaller values + // result in larger hash tables with fewer collisions. + double hash_table_ratio = 0.9; + // A property used by builder to determine the depth to go to + // to search for a path to displace elements in case of + // collision. See Builder.MakeSpaceForKey method. Higher + // values result in more efficient hash tables with fewer + // lookups but take more time to build. + uint32_t max_search_depth = 100; + // In case of collision while inserting, the builder + // attempts to insert in the next cuckoo_block_size + // locations before skipping over to the next Cuckoo hash + // function. This makes lookups more cache friendly in case + // of collisions. + uint32_t cuckoo_block_size = 5; + // If this option is enabled, user key is treated as uint64_t and its value + // is used as hash value directly. This option changes builder's behavior. + // Readers ignore this option and behave according to what is specified in + // the table property. + bool identity_as_first_hash = false; + // If this option is set to true, module is used during hash calculation. + // This often yields better space efficiency at the cost of performance. + // If this option is set to false, # of entries in table is constrained to be + // power of two, and bit and is used to calculate hash, which is faster in + general. 
+ bool use_module_hash = true; }; // Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo Hashing -// @hash_table_ratio: Determines the utilization of hash tables. Smaller values -// result in larger hash tables with fewer collisions. -// @max_search_depth: A property used by builder to determine the depth to go to -// to search for a path to displace elements in case of -// collision. See Builder.MakeSpaceForKey method. Higher -// values result in more efficient hash tables with fewer -// lookups but take more time to build. -// @cuckoo_block_size: In case of collision while inserting, the builder -// attempts to insert in the next cuckoo_block_size -// locations before skipping over to the next Cuckoo hash -// function. This makes lookups more cache friendly in case -// of collisions. -extern TableFactory* NewCuckooTableFactory(double hash_table_ratio = 0.9, - uint32_t max_search_depth = 100, uint32_t cuckoo_block_size = 5); +extern TableFactory* NewCuckooTableFactory( + const CuckooTableOptions& table_options = CuckooTableOptions()); #endif // ROCKSDB_LITE @@ -293,14 +333,15 @@ class TableFactory { // and cache the table object returned. // (1) SstFileReader (for SST Dump) opens the table and dump the table // contents using the interator of the table. - // options and soptions are options. options is the general options. + // ImmutableCFOptions is a subset of Options that can not be altered. + // EnvOptions is a subset of Options that will be used by Env. // Multiple configured can be accessed from there, including and not // limited to block cache and key comparators. 
// file is a file handler to handle the file for the table // file_size is the physical file size of the file // table_reader is the output table reader virtual Status NewTableReader( - const Options& options, const EnvOptions& soptions, + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const = 0; @@ -318,20 +359,25 @@ class TableFactory { // (4) When running Repairer, it creates a table builder to convert logs to // SST files (In Repairer::ConvertLogToTable() by calling BuildTable()) // - // options is the general options. Multiple configured can be acceseed from - // there, including and not limited to compression options. - // file is a handle of a writable file. It is the caller's responsibility to - // keep the file open and close the file after closing the table builder. - // compression_type is the compression type to use in this table. + // ImmutableCFOptions is a subset of Options that can not be altered. + // Multiple configured can be acceseed from there, including and not limited + // to compression options. file is a handle of a writable file. + // It is the caller's responsibility to keep the file open and close the file + // after closing the table builder. compression_type is the compression type + // to use in this table. virtual TableBuilder* NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const = 0; + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts) const = 0; - // Sanitizes the specified DB Options. + // Sanitizes the specified DB Options and ColumnFamilyOptions. 
// // If the function cannot find a way to sanitize the input DB Options, // a non-ok Status will be returned. - virtual Status SanitizeDBOptions(DBOptions* db_opts) const = 0; + virtual Status SanitizeOptions( + const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const = 0; // Return a string that contains printable format of table configurations. // RocksDB prints configurations at DB Open(). @@ -339,13 +385,14 @@ class TableFactory { }; #ifndef ROCKSDB_LITE -// Create a special table factory that can open both of block based table format -// and plain table, based on setting inside the SST files. It should be used to +// Create a special table factory that can open either of the supported +// table formats, based on setting inside the SST files. It should be used to // convert a DB from one table format to another. // @table_factory_to_write: the table factory used when writing to new files. // @block_based_table_factory: block based table factory to use. If NULL, use // a default one. // @plain_table_factory: plain table factory to use. If NULL, use a default one. +// @cuckoo_table_factory: cuckoo table factory to use. If NULL, use a default one. extern TableFactory* NewAdaptiveTableFactory( std::shared_ptr table_factory_to_write = nullptr, std::shared_ptr block_based_table_factory = nullptr, diff --git a/include/rocksdb/thread_status.h b/include/rocksdb/thread_status.h new file mode 100644 index 000000000..36efd6f75 --- /dev/null +++ b/include/rocksdb/thread_status.h @@ -0,0 +1,105 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file defines the structures for exposing run-time status of any +// rocksdb-related thread. Such run-time status can be obtained via +// GetThreadList() API. 
+// +// Note that all thread-status features are still under-development, and +// thus APIs and class definitions might subject to change at this point. +// Will remove this comment once the APIs have been finalized. + +#pragma once + +#include +#include + +#ifndef ROCKSDB_USING_THREAD_STATUS +#define ROCKSDB_USING_THREAD_STATUS \ + !defined(ROCKSDB_LITE) && \ + !defined(NROCKSDB_THREAD_STATUS) && \ + !defined(OS_MACOSX) && \ + !defined(IOS_CROSS_COMPILE) +#endif + +namespace rocksdb { + +// A structure that describes the current status of a thread. +// The status of active threads can be fetched using +// rocksdb::GetThreadList(). +struct ThreadStatus { + // The type of a thread. + enum ThreadType : int { + HIGH_PRIORITY = 0, // RocksDB BG thread in high-pri thread pool + LOW_PRIORITY, // RocksDB BG thread in low-pri thread pool + USER, // User thread (Non-RocksDB BG thread) + NUM_THREAD_TYPES + }; + + // The type used to refer to a thread operation. + // A thread operation describes high-level action of a thread. + // Examples include compaction and flush. + enum OperationType : int { + OP_UNKNOWN = 0, + OP_COMPACTION, + OP_FLUSH, + NUM_OP_TYPES + }; + + // The type used to refer to a thread state. + // A state describes lower-level action of a thread + // such as reading / writing a file or waiting for a mutex. + enum StateType : int { + STATE_UNKNOWN = 0, + STATE_MUTEX_WAIT = 1, + NUM_STATE_TYPES + }; + + ThreadStatus(const uint64_t _id, + const ThreadType _thread_type, + const std::string& _db_name, + const std::string& _cf_name, + const OperationType _operation_type, + const StateType _state_type) : + thread_id(_id), thread_type(_thread_type), + db_name(_db_name), + cf_name(_cf_name), + operation_type(_operation_type), state_type(_state_type) {} + + // An unique ID for the thread. 
+ const uint64_t thread_id; + + // The type of the thread, it could be HIGH_PRIORITY, + // LOW_PRIORITY, and USER + const ThreadType thread_type; + + // The name of the DB instance where the thread is currently + // involved with. It would be set to empty string if the thread + // does not involve in any DB operation. + const std::string db_name; + + // The name of the column family where the thread is currently + // It would be set to empty string if the thread does not involve + // in any column family. + const std::string cf_name; + + // The operation (high-level action) that the current thread is involved. + const OperationType operation_type; + + // The state (lower-level action) that the current thread is involved. + const StateType state_type; + + // The followings are a set of utility functions for interpreting + // the information of ThreadStatus + + // Obtain the name of an operation given its type. + static const std::string& GetOperationName(OperationType op_type); + + // Obtain the name of a state given its type. 
+ static const std::string& GetStateName(StateType state_type); +}; + + +} // namespace rocksdb diff --git a/include/rocksdb/utilities/backupable_db.h b/include/rocksdb/utilities/backupable_db.h index 78365769d..4b4ba6079 100644 --- a/include/rocksdb/utilities/backupable_db.h +++ b/include/rocksdb/utilities/backupable_db.h @@ -10,7 +10,10 @@ #pragma once #ifndef ROCKSDB_LITE +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include @@ -127,9 +130,41 @@ struct BackupInfo { int64_t timestamp; uint64_t size; + uint32_t number_files; + BackupInfo() {} - BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size) - : backup_id(_backup_id), timestamp(_timestamp), size(_size) {} + + BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size, + uint32_t _number_files) + : backup_id(_backup_id), timestamp(_timestamp), size(_size), + number_files(_number_files) {} +}; + +class BackupStatistics { + public: + BackupStatistics() { + number_success_backup = 0; + number_fail_backup = 0; + } + + BackupStatistics(uint32_t _number_success_backup, + uint32_t _number_fail_backup) + : number_success_backup(_number_success_backup), + number_fail_backup(_number_fail_backup) {} + + ~BackupStatistics() {} + + void IncrementNumberSuccessBackup(); + void IncrementNumberFailBackup(); + + uint32_t GetNumberSuccessBackup() const; + uint32_t GetNumberFailBackup() const; + + std::string ToString() const; + + private: + uint32_t number_success_backup; + uint32_t number_fail_backup; }; class BackupEngineReadOnly { @@ -142,6 +177,8 @@ class BackupEngineReadOnly { // You can GetBackupInfo safely, even with other BackupEngine performing // backups on the same directory virtual void GetBackupInfo(std::vector* backup_info) = 0; + virtual void GetCorruptedBackups( + std::vector* corrupt_backup_ids) = 0; // Restoring DB from backup is NOT safe when there is another BackupEngine // running that might call DeleteBackup() or PurgeOldBackups(). 
It is caller's @@ -161,7 +198,12 @@ class BackupEngine { virtual ~BackupEngine() {} static BackupEngine* NewBackupEngine(Env* db_env, - const BackupableDBOptions& options); + const BackupableDBOptions& options) + __attribute__((deprecated("Please use Open() instead"))); + + static Status Open(Env* db_env, + const BackupableDBOptions& options, + BackupEngine** backup_engine_ptr); virtual Status CreateNewBackup(DB* db, bool flush_before_backup = false) = 0; virtual Status PurgeOldBackups(uint32_t num_backups_to_keep) = 0; @@ -169,12 +211,16 @@ class BackupEngine { virtual void StopBackup() = 0; virtual void GetBackupInfo(std::vector* backup_info) = 0; + virtual void GetCorruptedBackups( + std::vector* corrupt_backup_ids) = 0; virtual Status RestoreDBFromBackup( BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, const RestoreOptions& restore_options = RestoreOptions()) = 0; virtual Status RestoreDBFromLatestBackup( const std::string& db_dir, const std::string& wal_dir, const RestoreOptions& restore_options = RestoreOptions()) = 0; + + virtual Status GarbageCollect() = 0; }; // Stack your DB with BackupableDB to be able to backup the DB @@ -193,6 +239,8 @@ class BackupableDB : public StackableDB { Status CreateNewBackup(bool flush_before_backup = false); // Returns info about backups in backup_info void GetBackupInfo(std::vector* backup_info); + // Returns info about corrupt backups in corrupt_backups + void GetCorruptedBackups(std::vector* corrupt_backup_ids); // deletes old backups, keeping latest num_backups_to_keep alive Status PurgeOldBackups(uint32_t num_backups_to_keep); // deletes a specific backup @@ -206,6 +254,11 @@ class BackupableDB : public StackableDB { // next time you create BackupableDB or RestoreBackupableDB. void StopBackup(); + // Will delete all the files we don't need anymore + // It will do the full scan of the files/ directory and delete all the + // files that are not referenced. 
+ Status GarbageCollect(); + private: BackupEngine* backup_engine_; }; @@ -218,6 +271,8 @@ class RestoreBackupableDB { // Returns info about backups in backup_info void GetBackupInfo(std::vector* backup_info); + // Returns info about corrupt backups in corrupt_backups + void GetCorruptedBackups(std::vector* corrupt_backup_ids); // restore from backup with backup_id // IMPORTANT -- if options_.share_table_files == true and you restore DB @@ -244,6 +299,11 @@ class RestoreBackupableDB { // deletes a specific backup Status DeleteBackup(BackupID backup_id); + // Will delete all the files we don't need anymore + // It will do the full scan of the files/ directory and delete all the + // files that are not referenced. + Status GarbageCollect(); + private: BackupEngine* backup_engine_; }; diff --git a/include/rocksdb/utilities/checkpoint.h b/include/rocksdb/utilities/checkpoint.h new file mode 100644 index 000000000..b60f4ebc6 --- /dev/null +++ b/include/rocksdb/utilities/checkpoint.h @@ -0,0 +1,34 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// A checkpoint is an openable snapshot of a database at a point in time. 
+ +#pragma once + +#include +#include "rocksdb/status.h" + +namespace rocksdb { + +class DB; + +class Checkpoint { + public: + // Creates a Checkpoint object to be used for creating openable snapshots + static Status Create(DB* db, Checkpoint** checkpoint_ptr); + + // Builds an openable snapshot of RocksDB on the same disk, which + // accepts an output directory on the same disk, and under the directory + // (1) hard-linked SST files pointing to existing live SST files + // SST files will be copied if output directory is on a different filesystem + // (2) copied manifest files and other files + // The directory should not already exist and will be created by this API. + // The directory will be an absolute path + virtual Status CreateCheckpoint(const std::string& checkpoint_dir); + + virtual ~Checkpoint() {} +}; + +} // namespace rocksdb diff --git a/include/rocksdb/utilities/convenience.h b/include/rocksdb/utilities/convenience.h new file mode 100644 index 000000000..bf3942aae --- /dev/null +++ b/include/rocksdb/utilities/convenience.h @@ -0,0 +1,58 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#pragma once + +#include +#include +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +namespace rocksdb { + +#ifndef ROCKSDB_LITE +// Take a map of option name and option value, apply them into the +// base_options, and return the new options as a result +Status GetColumnFamilyOptionsFromMap( + const ColumnFamilyOptions& base_options, + const std::unordered_map& opts_map, + ColumnFamilyOptions* new_options); + +Status GetDBOptionsFromMap( + const DBOptions& base_options, + const std::unordered_map& opts_map, + DBOptions* new_options); + +Status GetBlockBasedTableOptionsFromMap( + const BlockBasedTableOptions& table_options, + const std::unordered_map& opts_map, + BlockBasedTableOptions* new_table_options); + +// Take a string representation of option names and values, apply them into the +// base_options, and return the new options as a result. The string has the +// following format: +// "write_buffer_size=1024;max_write_buffer_number=2" +// Nested options config is also possible. For example, you can define +// BlockBasedTableOptions as part of the string for block-based table factory: +// "write_buffer_size=1024;block_based_table_factory={block_size=4k};" +// "max_write_buffer_num=2" +Status GetColumnFamilyOptionsFromString( + const ColumnFamilyOptions& base_options, + const std::string& opts_str, + ColumnFamilyOptions* new_options); + +Status GetDBOptionsFromString( + const DBOptions& base_options, + const std::string& opts_str, + DBOptions* new_options); + +Status GetBlockBasedTableOptionsFromString( + const BlockBasedTableOptions& table_options, + const std::string& opts_str, + BlockBasedTableOptions* new_table_options); + +#endif // ROCKSDB_LITE + +} // namespace rocksdb diff --git a/include/rocksdb/utilities/leveldb_options.h b/include/rocksdb/utilities/leveldb_options.h new file mode 100644 index 000000000..8e2c3a1d5 --- /dev/null +++ b/include/rocksdb/utilities/leveldb_options.h @@ -0,0 +1,144 @@ +// Copyright (c) 2014, Facebook, Inc. 
All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include + +namespace rocksdb { + +class Cache; +class Comparator; +class Env; +class FilterPolicy; +class Logger; +struct Options; +class Snapshot; + +enum CompressionType : char; + +// Options to control the behavior of a database (passed to +// DB::Open). A LevelDBOptions object can be initialized as though +// it were a LevelDB Options object, and then it can be converted into +// a RocksDB Options object. +struct LevelDBOptions { + // ------------------- + // Parameters that affect behavior + + // Comparator used to define the order of keys in the table. + // Default: a comparator that uses lexicographic byte-wise ordering + // + // REQUIRES: The client must ensure that the comparator supplied + // here has the same name and orders keys *exactly* the same as the + // comparator provided to previous open calls on the same DB. + const Comparator* comparator; + + // If true, the database will be created if it is missing. + // Default: false + bool create_if_missing; + + // If true, an error is raised if the database already exists. + // Default: false + bool error_if_exists; + + // If true, the implementation will do aggressive checking of the + // data it is processing and will stop early if it detects any + // errors. This may have unforeseen ramifications: for example, a + // corruption of one DB entry may cause a large number of entries to + // become unreadable or for the entire DB to become unopenable. 
+ // Default: false + bool paranoid_checks; + + // Use the specified object to interact with the environment, + // e.g. to read/write files, schedule background work, etc. + // Default: Env::Default() + Env* env; + + // Any internal progress/error information generated by the db will + // be written to info_log if it is non-NULL, or to a file stored + // in the same directory as the DB contents if info_log is NULL. + // Default: NULL + Logger* info_log; + + // ------------------- + // Parameters that affect performance + + // Amount of data to build up in memory (backed by an unsorted log + // on disk) before converting to a sorted on-disk file. + // + // Larger values increase performance, especially during bulk loads. + // Up to two write buffers may be held in memory at the same time, + // so you may wish to adjust this parameter to control memory usage. + // Also, a larger write buffer will result in a longer recovery time + // the next time the database is opened. + // + // Default: 4MB + size_t write_buffer_size; + + // Number of open files that can be used by the DB. You may need to + // increase this if your database has a large working set (budget + // one open file per 2MB of working set). + // + // Default: 1000 + int max_open_files; + + // Control over blocks (user data is stored in a set of blocks, and + // a block is the unit of reading from disk). + + // If non-NULL, use the specified cache for blocks. + // If NULL, leveldb will automatically create and use an 8MB internal cache. + // Default: NULL + Cache* block_cache; + + // Approximate size of user data packed per block. Note that the + // block size specified here corresponds to uncompressed data. The + // actual size of the unit read from disk may be smaller if + // compression is enabled. This parameter can be changed dynamically. + // + // Default: 4K + size_t block_size; + + // Number of keys between restart points for delta encoding of keys. + // This parameter can be changed dynamically. 
Most clients should + // leave this parameter alone. + // + // Default: 16 + int block_restart_interval; + + // Compress blocks using the specified compression algorithm. This + // parameter can be changed dynamically. + // + // Default: kSnappyCompression, which gives lightweight but fast + // compression. + // + // Typical speeds of kSnappyCompression on an Intel(R) Core(TM)2 2.4GHz: + // ~200-500MB/s compression + // ~400-800MB/s decompression + // Note that these speeds are significantly faster than most + // persistent storage speeds, and therefore it is typically never + // worth switching to kNoCompression. Even if the input data is + // incompressible, the kSnappyCompression implementation will + // efficiently detect that and will switch to uncompressed mode. + CompressionType compression; + + // If non-NULL, use the specified filter policy to reduce disk reads. + // Many applications will benefit from passing the result of + // NewBloomFilterPolicy() here. + // + // Default: NULL + const FilterPolicy* filter_policy; + + // Create a LevelDBOptions object with default values for all fields. + LevelDBOptions(); +}; + +// Converts a LevelDBOptions object into a RocksDB Options object. +Options ConvertOptions(const LevelDBOptions& leveldb_options); + +} // namespace rocksdb diff --git a/include/rocksdb/utilities/spatial_db.h b/include/rocksdb/utilities/spatial_db.h index cba93cd5f..1beb5c7f1 100644 --- a/include/rocksdb/utilities/spatial_db.h +++ b/include/rocksdb/utilities/spatial_db.h @@ -222,7 +222,9 @@ class SpatialDB : public StackableDB { // Calling Compact() after inserting a bunch of elements should speed up // reading. This is especially useful if you use SpatialDBOptions::bulk_load - virtual Status Compact() = 0; + // Num threads determines how many threads we'll use for compactions. Setting + // this to bigger number will use more IO and CPU, but finish faster + virtual Status Compact(int num_threads = 1) = 0; // Query the specified spatial_index. 
Query will return all elements that // intersect bbox, but it may also return some extra elements. diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index 417378f5d..7bdf9928e 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -3,6 +3,7 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once +#include #include "rocksdb/db.h" namespace rocksdb { @@ -132,6 +133,17 @@ class StackableDB : public DB { target_level, target_path_id); } + using DB::CompactFiles; + virtual Status CompactFiles( + const CompactionOptions& compact_options, + ColumnFamilyHandle* column_family, + const std::vector& input_file_names, + const int output_level, const int output_path_id = -1) override { + return db_->CompactFiles( + compact_options, column_family, input_file_names, + output_level, output_path_id); + } + using DB::NumberLevels; virtual int NumberLevels(ColumnFamilyHandle* column_family) override { return db_->NumberLevels(column_family); @@ -169,6 +181,8 @@ class StackableDB : public DB { return db_->Flush(fopts, column_family); } +#ifndef ROCKSDB_LITE + virtual Status DisableFileDeletions() override { return db_->DisableFileDeletions(); } @@ -182,6 +196,14 @@ class StackableDB : public DB { db_->GetLiveFilesMetaData(metadata); } + virtual void GetColumnFamilyMetaData( + ColumnFamilyHandle *column_family, + ColumnFamilyMetaData* cf_meta) override { + db_->GetColumnFamilyMetaData(column_family, cf_meta); + } + +#endif // ROCKSDB_LITE + virtual Status GetLiveFiles(std::vector& vec, uint64_t* mfs, bool flush_memtable = true) override { return db_->GetLiveFiles(vec, mfs, flush_memtable); @@ -203,6 +225,12 @@ class StackableDB : public DB { return db_->GetDbIdentity(identity); } + using DB::SetOptions; + virtual Status SetOptions( + const std::unordered_map& new_options) override { + return db_->SetOptions(new_options); + } + using 
DB::GetPropertiesOfAllTables; virtual Status GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, TablePropertiesCollection* props) { diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h index c09f53d11..566934b70 100644 --- a/include/rocksdb/utilities/write_batch_with_index.h +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -11,14 +11,15 @@ #pragma once -#include "rocksdb/status.h" +#include "rocksdb/comparator.h" +#include "rocksdb/iterator.h" #include "rocksdb/slice.h" +#include "rocksdb/status.h" #include "rocksdb/write_batch.h" namespace rocksdb { class ColumnFamilyHandle; -struct SliceParts; class Comparator; enum WriteType { kPutRecord, kMergeRecord, kDeleteRecord, kLogDataRecord }; @@ -38,10 +39,16 @@ class WBWIIterator { virtual bool Valid() const = 0; + virtual void SeekToFirst() = 0; + + virtual void SeekToLast() = 0; + virtual void Seek(const Slice& key) = 0; virtual void Next() = 0; + virtual void Prev() = 0; + virtual const WriteEntry& Entry() const = 0; virtual Status status() const = 0; @@ -56,43 +63,51 @@ class WBWIIterator { // A user can call NewIterator() to create an iterator. class WriteBatchWithIndex { public: - // index_comparator indicates the order when iterating data in the write - // batch. Technically, it doesn't have to be the same as the one used in - // the DB. + // backup_index_comparator: the backup comparator used to compare keys + // within the same column family, if column family is not given in the + // interface, or we can't find a column family from the column family handle + // passed in, backup_index_comparator will be used for the column family. 
// reserved_bytes: reserved bytes in underlying WriteBatch - explicit WriteBatchWithIndex(const Comparator* index_comparator, - size_t reserved_bytes = 0); + // overwrite_key: if true, overwrite the key in the index when inserting + // the same key as previously, so iterator will never + // show two entries with the same key. + explicit WriteBatchWithIndex( + const Comparator* backup_index_comparator = BytewiseComparator(), + size_t reserved_bytes = 0, bool overwrite_key = false); virtual ~WriteBatchWithIndex(); WriteBatch* GetWriteBatch(); - virtual void Put(ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value); - - virtual void Put(const Slice& key, const Slice& value); - - virtual void Merge(ColumnFamilyHandle* column_family, const Slice& key, - const Slice& value); + void Put(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value); - virtual void Merge(const Slice& key, const Slice& value); + void Put(const Slice& key, const Slice& value); - virtual void PutLogData(const Slice& blob); + void Merge(ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value); - virtual void Delete(ColumnFamilyHandle* column_family, const Slice& key); - virtual void Delete(const Slice& key); + void Merge(const Slice& key, const Slice& value); - virtual void Delete(ColumnFamilyHandle* column_family, const SliceParts& key); + void PutLogData(const Slice& blob); - virtual void Delete(const SliceParts& key); + void Delete(ColumnFamilyHandle* column_family, const Slice& key); + void Delete(const Slice& key); // Create an iterator of a column family. User can call iterator.Seek() to // search to the next entry of or after a key. Keys will be iterated in the // order given by index_comparator. For multiple updates on the same key, // each update will be returned as a separate entry, in the order of update // time. 
- virtual WBWIIterator* NewIterator(ColumnFamilyHandle* column_family); + WBWIIterator* NewIterator(ColumnFamilyHandle* column_family); // Create an iterator of the default column family. - virtual WBWIIterator* NewIterator(); + WBWIIterator* NewIterator(); + + // Will create a new Iterator that will use WBWIIterator as a delta and + // base_iterator as base + Iterator* NewIteratorWithBase(ColumnFamilyHandle* column_family, + Iterator* base_iterator); + // default column family + Iterator* NewIteratorWithBase(Iterator* base_iterator); private: struct Rep; diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index d6ccaeda5..2e76fe5be 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -4,9 +4,8 @@ // of patent rights can be found in the PATENTS file in the same directory. #pragma once -// Also update Makefile if you change these #define ROCKSDB_MAJOR 3 -#define ROCKSDB_MINOR 5 +#define ROCKSDB_MINOR 10 #define ROCKSDB_PATCH 0 // Do not use these. We made the mistake of declaring macros starting with diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index db440be02..462a54a59 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -105,10 +105,11 @@ class WriteBatch { return Status::InvalidArgument( "non-default column family and PutCF not implemented"); } - virtual void Put(const Slice& key, const Slice& value); + virtual void Put(const Slice& key, const Slice& value) {} + // Merge and LogData are not pure virtual. Otherwise, we would break // existing clients of Handler on a source code level. The default - // implementation of Merge simply throws a runtime exception. + // implementation of Merge does nothing. 
virtual Status MergeCF(uint32_t column_family_id, const Slice& key, const Slice& value) { if (column_family_id == 0) { @@ -118,7 +119,8 @@ class WriteBatch { return Status::InvalidArgument( "non-default column family and MergeCF not implemented"); } - virtual void Merge(const Slice& key, const Slice& value); + virtual void Merge(const Slice& key, const Slice& value) {} + // The default implementation of LogData does nothing. virtual void LogData(const Slice& blob); virtual Status DeleteCF(uint32_t column_family_id, const Slice& key) { @@ -129,7 +131,8 @@ class WriteBatch { return Status::InvalidArgument( "non-default column family and DeleteCF not implemented"); } - virtual void Delete(const Slice& key); + virtual void Delete(const Slice& key) {} + // Continue is called by WriteBatch::Iterate. If it returns false, // iteration is halted. Otherwise, it continues iterating. The default // implementation always returns true. diff --git a/java/HISTORY-JAVA.md b/java/HISTORY-JAVA.md index 4cf0f7d18..9bced168e 100644 --- a/java/HISTORY-JAVA.md +++ b/java/HISTORY-JAVA.md @@ -1,5 +1,20 @@ # RocksJava Change Log +## By 01/31/2015 +### New Features +* WriteBatchWithIndex support. +* Iterator support for WriteBatch and WriteBatchWithIndex +* GetUpdatesSince support. +* Snapshots carry now information about the related sequence number. +* TTL DB support. + +## By 11/14/2014 +### New Features +* Full support for Column Family. +* Slice and Comparator support. +* Default merge operator support. +* RateLimiter support. + ## By 06/15/2014 ### New Features * Added basic Java binding for rocksdb::Env such that multiple RocksDB can share the same thread pool and environment. 
diff --git a/java/Makefile b/java/Makefile index 47b2afb9e..a07afb20a 100644 --- a/java/Makefile +++ b/java/Makefile @@ -1,34 +1,178 @@ -NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv +NATIVE_JAVA_CLASSES = org.rocksdb.AbstractComparator\ + org.rocksdb.AbstractSlice\ + org.rocksdb.BackupableDB\ + org.rocksdb.BackupableDBOptions\ + org.rocksdb.BlockBasedTableConfig\ + org.rocksdb.BloomFilter\ + org.rocksdb.Checkpoint\ + org.rocksdb.ColumnFamilyHandle\ + org.rocksdb.ColumnFamilyOptions\ + org.rocksdb.Comparator\ + org.rocksdb.ComparatorOptions\ + org.rocksdb.DBOptions\ + org.rocksdb.DirectComparator\ + org.rocksdb.DirectSlice\ + org.rocksdb.FlushOptions\ + org.rocksdb.Filter\ + org.rocksdb.GenericRateLimiterConfig\ + org.rocksdb.HashLinkedListMemTableConfig\ + org.rocksdb.HashSkipListMemTableConfig\ + org.rocksdb.MergeOperator\ + org.rocksdb.Options\ + org.rocksdb.PlainTableConfig\ + org.rocksdb.ReadOptions\ + org.rocksdb.RestoreBackupableDB\ + org.rocksdb.RestoreOptions\ + org.rocksdb.RocksDB\ + org.rocksdb.RocksEnv\ + org.rocksdb.RocksIterator\ + org.rocksdb.SkipListMemTableConfig\ + org.rocksdb.Slice\ + org.rocksdb.Statistics\ + org.rocksdb.TransactionLogIterator\ + org.rocksdb.TtlDB\ + org.rocksdb.VectorMemTableConfig\ + org.rocksdb.Snapshot\ + org.rocksdb.StringAppendOperator\ + org.rocksdb.WriteBatch\ + org.rocksdb.WriteBatch.Handler\ + org.rocksdb.WriteOptions\ + 
org.rocksdb.WriteBatchWithIndex\ + org.rocksdb.WBWIRocksIterator + +NATIVE_JAVA_TEST_CLASSES = org.rocksdb.WriteBatchTest\ + org.rocksdb.WriteBatchTestInternalHelper + +ROCKSDB_MAJOR = $(shell egrep "ROCKSDB_MAJOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) +ROCKSDB_MINOR = $(shell egrep "ROCKSDB_MINOR.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) +ROCKSDB_PATCH = $(shell egrep "ROCKSDB_PATCH.[0-9]" ../include/rocksdb/version.h | cut -d ' ' -f 3) NATIVE_INCLUDE = ./include -ROCKSDB_JAR = rocksdbjni.jar +ARCH := $(shell getconf LONG_BIT) +ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar +ifeq ($(PLATFORM), OS_MACOSX) +ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar +endif + +JAVA_TESTS = org.rocksdb.BackupableDBOptionsTest\ + org.rocksdb.BackupableDBTest\ + org.rocksdb.BlockBasedTableConfigTest\ + org.rocksdb.CheckPointTest\ + org.rocksdb.ColumnFamilyOptionsTest\ + org.rocksdb.ColumnFamilyTest\ + org.rocksdb.ComparatorOptionsTest\ + org.rocksdb.ComparatorTest\ + org.rocksdb.CompressionOptionsTest\ + org.rocksdb.DBOptionsTest\ + org.rocksdb.DirectComparatorTest\ + org.rocksdb.DirectSliceTest\ + org.rocksdb.util.EnvironmentTest\ + org.rocksdb.FilterTest\ + org.rocksdb.FlushTest\ + org.rocksdb.InfoLogLevelTest\ + org.rocksdb.KeyMayExistTest\ + org.rocksdb.MemTableTest\ + org.rocksdb.MergeTest\ + org.rocksdb.MixedOptionsTest\ + org.rocksdb.OptionsTest\ + org.rocksdb.PlainTableConfigTest\ + org.rocksdb.ReadOnlyTest\ + org.rocksdb.ReadOptionsTest\ + org.rocksdb.RocksDBTest\ + org.rocksdb.RocksEnvTest\ + org.rocksdb.RocksIteratorTest\ + org.rocksdb.util.SizeUnitTest\ + org.rocksdb.SliceTest\ + org.rocksdb.SnapshotTest\ + org.rocksdb.TransactionLogIteratorTest\ + org.rocksdb.TtlDBTest\ + org.rocksdb.StatisticsCollectorTest\ + org.rocksdb.WriteBatchHandlerTest\ + org.rocksdb.WriteBatchTest\ + org.rocksdb.WriteOptionsTest\ + org.rocksdb.WriteBatchWithIndexTest + +MAIN_SRC = 
src/main/java +TEST_SRC = src/test/java +OUTPUT = target +MAIN_CLASSES = $(OUTPUT)/classes +TEST_CLASSES = $(OUTPUT)/test-classes +JAVADOC = $(OUTPUT)/apidocs + +BENCHMARK_MAIN_SRC = benchmark/src/main/java +BENCHMARK_OUTPUT = benchmark/target +BENCHMARK_MAIN_CLASSES = $(BENCHMARK_OUTPUT)/classes + +SAMPLES_MAIN_SRC = samples/src/main/java +SAMPLES_OUTPUT = samples/target +SAMPLES_MAIN_CLASSES = $(SAMPLES_OUTPUT)/classes + +JAVA_TEST_LIBDIR = test-libs +JAVA_JUNIT_JAR = $(JAVA_TEST_LIBDIR)/junit-4.12.jar +JAVA_HAMCR_JAR = $(JAVA_TEST_LIBDIR)/hamcrest-core-1.3.jar +JAVA_MOCKITO_JAR = $(JAVA_TEST_LIBDIR)/mockito-all-1.10.19.jar +JAVA_CGLIB_JAR = $(JAVA_TEST_LIBDIR)/cglib-2.2.2.jar +JAVA_ASSERTJ_JAR = $(JAVA_TEST_LIBDIR)/assertj-core-1.7.1.jar +JAVA_TESTCLASSPATH = $(JAVA_JUNIT_JAR):$(JAVA_HAMCR_JAR):$(JAVA_MOCKITO_JAR):$(JAVA_CGLIB_JAR):$(JAVA_ASSERTJ_JAR) clean: - -find . -name "*.class" -exec rm {} \; - -find . -name "hs*.log" -exec rm {} \; - rm -f $(ROCKSDB_JAR) + rm -rf include/* + rm -rf test-libs/ + rm -rf $(OUTPUT) + rm -rf $(BENCHMARK_OUTPUT) + rm -rf $(SAMPLES_OUTPUT) + + +javadocs: + mkdir -p $(JAVADOC) + javadoc -d $(JAVADOC) -sourcepath $(MAIN_SRC) -subpackages org + +javalib: java java_test javadocs java: - javac org/rocksdb/util/*.java org/rocksdb/*.java + mkdir -p $(MAIN_CLASSES) + javac -d $(MAIN_CLASSES)\ + $(MAIN_SRC)/org/rocksdb/util/*.java\ + $(MAIN_SRC)/org/rocksdb/*.java @cp ../HISTORY.md ./HISTORY-CPP.md @rm -f ./HISTORY-CPP.md - javah -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_CLASSES) + javah -cp $(MAIN_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_CLASSES) sample: java - javac -cp $(ROCKSDB_JAR) RocksDBSample.java + mkdir -p $(SAMPLES_MAIN_CLASSES) + javac -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/RocksDBSample.java @rm -rf /tmp/rocksdbjni @rm -rf /tmp/rocksdbjni_not_found - java -ea -Djava.library.path=.:../ -cp ".:./*" -Xcheck:jni RocksDBSample /tmp/rocksdbjni + java -ea -Xcheck:jni -Djava.library.path=target -cp 
$(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBSample /tmp/rocksdbjni @rm -rf /tmp/rocksdbjni @rm -rf /tmp/rocksdbjni_not_found -test: java - javac org/rocksdb/test/*.java - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.WriteBatchTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.BackupableDBTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.OptionsTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.ReadOptionsTest - java -ea -Djava.library.path=.:../ -cp "$(ROCKSDB_JAR):.:./*" org.rocksdb.test.StatisticsCollectorTest +column_family_sample: java + mkdir -p $(SAMPLES_MAIN_CLASSES) + javac -cp $(MAIN_CLASSES) -d $(SAMPLES_MAIN_CLASSES) $(SAMPLES_MAIN_SRC)/RocksDBColumnFamilySample.java + @rm -rf /tmp/rocksdbjni + java -ea -Xcheck:jni -Djava.library.path=target -cp $(MAIN_CLASSES):$(SAMPLES_MAIN_CLASSES) RocksDBColumnFamilySample /tmp/rocksdbjni + @rm -rf /tmp/rocksdbjni + +resolve_test_deps: + mkdir -p "$(JAVA_TEST_LIBDIR)" + test -s "$(JAVA_JUNIT_JAR)" || curl -k -L -o $(JAVA_JUNIT_JAR) http://search.maven.org/remotecontent?filepath=junit/junit/4.12/junit-4.12.jar + test -s "$(JAVA_HAMCR_JAR)" || curl -k -L -o $(JAVA_HAMCR_JAR) http://search.maven.org/remotecontent?filepath=org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar + test -s "$(JAVA_MOCKITO_JAR)" || curl -k -L -o "$(JAVA_MOCKITO_JAR)" http://search.maven.org/remotecontent?filepath=org/mockito/mockito-all/1.10.19/mockito-all-1.10.19.jar + test -s "$(JAVA_CGLIB_JAR)" || curl -k -L -o "$(JAVA_CGLIB_JAR)" http://search.maven.org/remotecontent?filepath=cglib/cglib/2.2.2/cglib-2.2.2.jar + test -s "$(JAVA_ASSERTJ_JAR)" || curl -k -L -o "$(JAVA_ASSERTJ_JAR)" http://central.maven.org/maven2/org/assertj/assertj-core/1.7.1/assertj-core-1.7.1.jar + +java_test: resolve_test_deps + mkdir -p $(TEST_CLASSES) + javac -cp $(MAIN_CLASSES):$(JAVA_TESTCLASSPATH) -d $(TEST_CLASSES)\ + 
$(TEST_SRC)/org/rocksdb/test/*.java\ + $(TEST_SRC)/org/rocksdb/util/*.java\ + $(TEST_SRC)/org/rocksdb/*.java + javah -cp $(MAIN_CLASSES):$(TEST_CLASSES) -d $(NATIVE_INCLUDE) -jni $(NATIVE_JAVA_TEST_CLASSES) + +test: java resolve_test_deps java_test + java -ea -Xcheck:jni -Djava.library.path=target -cp "$(MAIN_CLASSES):$(TEST_CLASSES):$(JAVA_TESTCLASSPATH)" org.rocksdb.test.RocksJunitRunner $(JAVA_TESTS) db_bench: java - javac org/rocksdb/benchmark/*.java + mkdir -p $(BENCHMARK_MAIN_CLASSES) + javac -cp $(MAIN_CLASSES) -d $(BENCHMARK_MAIN_CLASSES) $(BENCHMARK_MAIN_SRC)/org/rocksdb/benchmark/*.java diff --git a/java/RELEASE.md b/java/RELEASE.md new file mode 100644 index 000000000..084460c88 --- /dev/null +++ b/java/RELEASE.md @@ -0,0 +1,54 @@ +## Cross-building + +RocksDB can be built as a single self contained cross-platform JAR. The cross-platform jar can be usd on any 64-bit OSX system, 32-bit Linux system, or 64-bit Linux system. + +Building a cross-platform JAR requires: + + * [Vagrant](https://www.vagrantup.com/) + * [Virtualbox](https://www.virtualbox.org/) + * A Mac OSX machine that can compile RocksDB. + * Java 7 set as JAVA_HOME. + +Once you have these items, run this make command from RocksDB's root source directory: + + make jclean clean rocksdbjavastaticrelease + +This command will build RocksDB natively on OSX, and will then spin up two Vagrant Virtualbox Ubuntu images to build RocksDB for both 32-bit and 64-bit Linux. 
+ +You can find all native binaries and JARs in the java directory upon completion: + + librocksdbjni-linux32.so + librocksdbjni-linux64.so + librocksdbjni-osx.jnilib + rocksdbjni-3.5.0-javadoc.jar + rocksdbjni-3.5.0-linux32.jar + rocksdbjni-3.5.0-linux64.jar + rocksdbjni-3.5.0-osx.jar + rocksdbjni-3.5.0-sources.jar + rocksdbjni-3.5.0.jar + +## Maven publication + +Set ~/.m2/settings.xml to contain: + + + + + sonatype-nexus-staging + your-sonatype-jira-username + your-sonatype-jira-password + + + + +From RocksDB's root directory, first build the Java static JARs: + + make jclean clean rocksdbjavastaticpublish + +This command will [stage the JAR artifacts on the Sonatype staging repository](http://central.sonatype.org/pages/manual-staging-bundle-creation-and-deployment.html). To release the staged artifacts. + +1. Go to [https://oss.sonatype.org/#stagingRepositories](https://oss.sonatype.org/#stagingRepositories) and search for "rocksdb" in the upper right hand search box. +2. Select the rocksdb staging repository, and inspect its contents. +3. If all is well, follow [these steps](https://oss.sonatype.org/#stagingRepositories) to close the repository and release it. + +After the release has occurred, the artifacts will be synced to Maven central within 24-48 hours. 
diff --git a/java/org/rocksdb/benchmark/DbBenchmark.java b/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java similarity index 93% rename from java/org/rocksdb/benchmark/DbBenchmark.java rename to java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java index b715f9af1..64fc5f0a7 100644 --- a/java/org/rocksdb/benchmark/DbBenchmark.java +++ b/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java @@ -163,15 +163,6 @@ public class DbBenchmark { EXISTING } - enum CompressionType { - NONE, - SNAPPY, - ZLIB, - BZIP2, - LZ4, - LZ4HC - } - static { RocksDB.loadLibrary(); } @@ -255,7 +246,7 @@ public class DbBenchmark { for (long j = 0; j < entriesPerBatch_; j++) { getKey(key, i + j, keyRange_); DbBenchmark.this.gen_.generate(value); - db_.put(writeOpt_, key, value); + batch.put(key, value); stats_.finishedSingleOp(keySize_ + valueSize_); } db_.write(writeOpt_, batch); @@ -457,18 +448,16 @@ public class DbBenchmark { // options.setPrefixSize((Integer)flags_.get(Flag.prefix_size)); // options.setKeysPerPrefix((Long)flags_.get(Flag.keys_per_prefix)); compressionType_ = (String) flags.get(Flag.compression_type); - compression_ = CompressionType.NONE; + compression_ = CompressionType.NO_COMPRESSION; try { - if (compressionType_.equals("snappy")) { - System.loadLibrary("snappy"); - } else if (compressionType_.equals("zlib")) { - System.loadLibrary("z"); - } else if (compressionType_.equals("bzip2")) { - System.loadLibrary("bzip2"); - } else if (compressionType_.equals("lz4")) { - System.loadLibrary("lz4"); - } else if (compressionType_.equals("lz4hc")) { - System.loadLibrary("lz4hc"); + if (compressionType_!=null) { + final CompressionType compressionType = + CompressionType.getCompressionType(compressionType_); + if (compressionType != null && + compressionType != CompressionType.NO_COMPRESSION) { + System.loadLibrary(compressionType.getLibraryName()); + } + } } catch (UnsatisfiedLinkError e) { System.err.format("Unable to load %s 
library:%s%n" + @@ -489,32 +478,38 @@ public class DbBenchmark { options.setDisableWAL((Boolean)flags_.get(Flag.disable_wal)); } - private void prepareOptions(Options options) { + private void prepareOptions(Options options) throws RocksDBException { if (!useExisting_) { options.setCreateIfMissing(true); } else { options.setCreateIfMissing(false); } - if (memtable_.equals("skip_list")) { - options.setMemTableConfig(new SkipListMemTableConfig()); - } else if (memtable_.equals("vector")) { - options.setMemTableConfig(new VectorMemTableConfig()); - } else if (memtable_.equals("hash_linkedlist")) { - options.setMemTableConfig( - new HashLinkedListMemTableConfig() - .setBucketCount(hashBucketCount_)); - options.useFixedLengthPrefixExtractor(prefixSize_); - } else if (memtable_.equals("hash_skiplist") || - memtable_.equals("prefix_hash")) { - options.setMemTableConfig( - new HashSkipListMemTableConfig() - .setBucketCount(hashBucketCount_)); - options.useFixedLengthPrefixExtractor(prefixSize_); - } else { - System.err.format( - "unable to detect the specified memtable, " + - "use the default memtable factory %s%n", - options.memTableFactoryName()); + switch (memtable_) { + case "skip_list": + options.setMemTableConfig(new SkipListMemTableConfig()); + break; + case "vector": + options.setMemTableConfig(new VectorMemTableConfig()); + break; + case "hash_linkedlist": + options.setMemTableConfig( + new HashLinkedListMemTableConfig() + .setBucketCount(hashBucketCount_)); + options.useFixedLengthPrefixExtractor(prefixSize_); + break; + case "hash_skiplist": + case "prefix_hash": + options.setMemTableConfig( + new HashSkipListMemTableConfig() + .setBucketCount(hashBucketCount_)); + options.useFixedLengthPrefixExtractor(prefixSize_); + break; + default: + System.err.format( + "unable to detect the specified memtable, " + + "use the default memtable factory %s%n", + options.memTableFactoryName()); + break; } if (usePlainTable_) { options.setTableFormatConfig( @@ -523,8 +518,8 @@ 
public class DbBenchmark { BlockBasedTableConfig table_options = new BlockBasedTableConfig(); table_options.setBlockSize((Long)flags_.get(Flag.block_size)) .setBlockCacheSize((Long)flags_.get(Flag.cache_size)) - .setFilterBitsPerKey((Integer)flags_.get(Flag.bloom_bits)) - .setCacheNumShardBits((Integer)flags_.get(Flag.cache_numshardbits)); + .setCacheNumShardBits( + (Integer)flags_.get(Flag.cache_numshardbits)); options.setTableFormatConfig(table_options); } options.setWriteBufferSize( @@ -645,53 +640,65 @@ public class DbBenchmark { int currentTaskId = 0; boolean known = true; - if (benchmark.equals("fillseq")) { - tasks.add(new WriteSequentialTask( - currentTaskId++, randSeed_, num_, num_, writeOpt, 1)); - } else if (benchmark.equals("fillbatch")) { - tasks.add(new WriteRandomTask( - currentTaskId++, randSeed_, num_ / 1000, num_, writeOpt, 1000)); - } else if (benchmark.equals("fillrandom")) { - tasks.add(new WriteRandomTask( - currentTaskId++, randSeed_, num_, num_, writeOpt, 1)); - } else if (benchmark.equals("filluniquerandom")) { - tasks.add(new WriteUniqueRandomTask( - currentTaskId++, randSeed_, num_, num_, writeOpt, 1)); - } else if (benchmark.equals("fillsync")) { - writeOpt.setSync(true); - tasks.add(new WriteRandomTask( - currentTaskId++, randSeed_, num_ / 1000, num_ / 1000, - writeOpt, 1)); - } else if (benchmark.equals("readseq")) { - for (int t = 0; t < threadNum_; ++t) { - tasks.add(new ReadSequentialTask( - currentTaskId++, randSeed_, reads_ / threadNum_, num_)); - } - } else if (benchmark.equals("readrandom")) { - for (int t = 0; t < threadNum_; ++t) { - tasks.add(new ReadRandomTask( - currentTaskId++, randSeed_, reads_ / threadNum_, num_)); - } - } else if (benchmark.equals("readwhilewriting")) { - WriteTask writeTask = new WriteRandomTask( - -1, randSeed_, Long.MAX_VALUE, num_, writeOpt, 1, writesPerSeconds_); - writeTask.stats_.setExcludeFromMerge(); - bgTasks.add(writeTask); - for (int t = 0; t < threadNum_; ++t) { - tasks.add(new 
ReadRandomTask( - currentTaskId++, randSeed_, reads_ / threadNum_, num_)); - } - } else if (benchmark.equals("readhot")) { - for (int t = 0; t < threadNum_; ++t) { - tasks.add(new ReadRandomTask( - currentTaskId++, randSeed_, reads_ / threadNum_, num_ / 100)); - } - } else if (benchmark.equals("delete")) { - destroyDb(); - open(options); - } else { - known = false; - System.err.println("Unknown benchmark: " + benchmark); + switch (benchmark) { + case "fillseq": + tasks.add(new WriteSequentialTask( + currentTaskId++, randSeed_, num_, num_, writeOpt, 1)); + break; + case "fillbatch": + tasks.add(new WriteRandomTask( + currentTaskId++, randSeed_, num_ / 1000, num_, writeOpt, 1000)); + break; + case "fillrandom": + tasks.add(new WriteRandomTask( + currentTaskId++, randSeed_, num_, num_, writeOpt, 1)); + break; + case "filluniquerandom": + tasks.add(new WriteUniqueRandomTask( + currentTaskId++, randSeed_, num_, num_, writeOpt, 1)); + break; + case "fillsync": + writeOpt.setSync(true); + tasks.add(new WriteRandomTask( + currentTaskId++, randSeed_, num_ / 1000, num_ / 1000, + writeOpt, 1)); + break; + case "readseq": + for (int t = 0; t < threadNum_; ++t) { + tasks.add(new ReadSequentialTask( + currentTaskId++, randSeed_, reads_ / threadNum_, num_)); + } + break; + case "readrandom": + for (int t = 0; t < threadNum_; ++t) { + tasks.add(new ReadRandomTask( + currentTaskId++, randSeed_, reads_ / threadNum_, num_)); + } + break; + case "readwhilewriting": + WriteTask writeTask = new WriteRandomTask( + -1, randSeed_, Long.MAX_VALUE, num_, writeOpt, 1, writesPerSeconds_); + writeTask.stats_.setExcludeFromMerge(); + bgTasks.add(writeTask); + for (int t = 0; t < threadNum_; ++t) { + tasks.add(new ReadRandomTask( + currentTaskId++, randSeed_, reads_ / threadNum_, num_)); + } + break; + case "readhot": + for (int t = 0; t < threadNum_; ++t) { + tasks.add(new ReadRandomTask( + currentTaskId++, randSeed_, reads_ / threadNum_, num_ / 100)); + } + break; + case "delete": + 
destroyDb(); + open(options); + break; + default: + known = false; + System.err.println("Unknown benchmark: " + benchmark); + break; } if (known) { ExecutorService executor = Executors.newCachedThreadPool(); @@ -800,7 +807,7 @@ public class DbBenchmark { System.out.printf( "%-16s : %11.5f micros/op; %6.1f MB/s;%s %d / %d task(s) finished.\n", - benchmark, (double) elapsedSeconds / stats.done_ * 1e6, + benchmark, elapsedSeconds / stats.done_ * 1e6, (stats.bytes_ / 1048576.0) / elapsedSeconds, extra, taskFinishedCount, concurrentThreads); } diff --git a/java/crossbuild/Vagrantfile b/java/crossbuild/Vagrantfile new file mode 100644 index 000000000..c4b1b1df2 --- /dev/null +++ b/java/crossbuild/Vagrantfile @@ -0,0 +1,25 @@ +# -*- mode: ruby -*- +# vi: set ft=ruby : + +# Vagrantfile API/syntax version. Don't touch unless you know what you're doing! +VAGRANTFILE_API_VERSION = "2" + +Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| + + config.vm.define "linux32" do |linux32| + linux32.vm.box = "hansode/centos-5.6-i386" + end + + config.vm.define "linux64" do |linux64| + linux64.vm.box = "hansode/centos-5.6-x86_64" + end + + config.vm.provider "virtualbox" do |v| + v.memory = 2048 + v.cpus = 4 + end + + config.vm.provision :shell, path: "build-linux-centos.sh" + config.vm.synced_folder "../", "/rocksdb-build" + config.vm.synced_folder "../..", "/rocksdb", type: "rsync" +end diff --git a/java/crossbuild/build-linux-centos.sh b/java/crossbuild/build-linux-centos.sh new file mode 100755 index 000000000..158303069 --- /dev/null +++ b/java/crossbuild/build-linux-centos.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# install all required packages for rocksdb that are available through yum +ARCH=$(uname -i) +sudo yum -y install java-1.7.0-openjdk-devel.$ARCH zlib zlib-devel bzip2 bzip2-devel + +# install gcc/g++ 4.7 via CERN (http://linux.web.cern.ch/linux/devtoolset/) +sudo wget -O /etc/yum.repos.d/slc5-devtoolset.repo 
http://linuxsoft.cern.ch/cern/devtoolset/slc5-devtoolset.repo +sudo wget -O /etc/pki/rpm-gpg/RPM-GPG-KEY-cern http://ftp.mirrorservice.org/sites/ftp.scientificlinux.org/linux/scientific/51/i386/RPM-GPG-KEYs/RPM-GPG-KEY-cern +sudo yum -y install devtoolset-1.1 +wget http://gflags.googlecode.com/files/gflags-1.6.tar.gz +tar xvfz gflags-1.6.tar.gz; cd gflags-1.6; scl enable devtoolset-1.1 ./configure; scl enable devtoolset-1.1 make; sudo make install +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib + +# set java home so we can build rocksdb jars +export JAVA_HOME=/usr/lib/jvm/java-1.7.0 + +# build rocksdb +cd /rocksdb +scl enable devtoolset-1.1 'make jclean clean' +scl enable devtoolset-1.1 'make -j 4 rocksdbjavastatic' +cp /rocksdb/java/target/librocksdbjni-* /rocksdb-build +cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build + diff --git a/java/crossbuild/build-linux.sh b/java/crossbuild/build-linux.sh new file mode 100755 index 000000000..48d1c28d9 --- /dev/null +++ b/java/crossbuild/build-linux.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash +# install all required packages for rocksdb +sudo apt-get update +sudo apt-get -y install git make gcc g++ libgflags-dev libsnappy-dev zlib1g-dev libbz2-dev default-jdk + +# set java home so we can build rocksdb jars +export JAVA_HOME=$(echo /usr/lib/jvm/java-7-openjdk*) +cd /rocksdb +make jclean clean +make -j 4 rocksdbjavastatic +cp /rocksdb/java/target/librocksdbjni-* /rocksdb-build +cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build +sudo shutdown -h now + diff --git a/java/jdb_bench.sh b/java/jdb_bench.sh index dba7dbd31..9665de785 100755 --- a/java/jdb_bench.sh +++ b/java/jdb_bench.sh @@ -1 +1,10 @@ -java -server -d64 -XX:NewSize=4m -XX:+AggressiveOpts -Djava.library.path=.:../ -cp "rocksdbjni.jar:.:./*" org.rocksdb.benchmark.DbBenchmark $@ +PLATFORM=64 +if [ `getconf LONG_BIT` != "64" ] +then + PLATFORM=32 +fi + +ROCKS_JAR=`find target -name rocksdbjni*.jar` + +echo "Running benchmark in $PLATFORM-Bit mode." 
+java -server -d$PLATFORM -XX:NewSize=4m -XX:+AggressiveOpts -Djava.library.path=target -cp "${ROCKS_JAR}:benchmark/target/classes" org.rocksdb.benchmark.DbBenchmark $@ diff --git a/java/org/rocksdb/BackupableDB.java b/java/org/rocksdb/BackupableDB.java deleted file mode 100644 index 108c4deb5..000000000 --- a/java/org/rocksdb/BackupableDB.java +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -package org.rocksdb; - -/** - * A subclass of RocksDB which supports backup-related operations. - * - * @see BackupableDBOptions - */ -public class BackupableDB extends RocksDB { - /** - * Open a BackupableDB under the specified path. - * Note that the backup path should be set properly in the - * input BackupableDBOptions. - * - * @param opt options for db. - * @param bopt backup related options. - * @param the db path for storing data. The path for storing - * backup should be specified in the BackupableDBOptions. - * @return reference to the opened BackupableDB. - */ - public static BackupableDB open( - Options opt, BackupableDBOptions bopt, String db_path) - throws RocksDBException { - - RocksDB db = RocksDB.open(opt, db_path); - BackupableDB bdb = new BackupableDB(); - bdb.open(db.nativeHandle_, bopt.nativeHandle_); - - // Prevent the RocksDB object from attempting to delete - // the underly C++ DB object. - db.disOwnNativeHandle(); - - return bdb; - } - - /** - * Captures the state of the database in the latest backup. - * Note that this function is not thread-safe. - * - * @param flushBeforeBackup if true, then all data will be flushed - * before creating backup. 
- */ - public void createNewBackup(boolean flushBeforeBackup) { - createNewBackup(nativeHandle_, flushBeforeBackup); - } - - /** - * Deletes old backups, keeping latest numBackupsToKeep alive. - * - * @param numBackupsToKeep Number of latest backups to keep. - */ - public void purgeOldBackups(int numBackupsToKeep) { - purgeOldBackups(nativeHandle_, numBackupsToKeep); - } - - - /** - * Close the BackupableDB instance and release resource. - * - * Internally, BackupableDB owns the rocksdb::DB pointer to its - * associated RocksDB. The release of that RocksDB pointer is - * handled in the destructor of the c++ rocksdb::BackupableDB and - * should be transparent to Java developers. - */ - @Override public synchronized void close() { - if (isInitialized()) { - super.close(); - } - } - - /** - * A protected construction that will be used in the static factory - * method BackupableDB.open(). - */ - protected BackupableDB() { - super(); - } - - @Override protected void finalize() { - close(); - } - - protected native void open(long rocksDBHandle, long backupDBOptionsHandle); - protected native void createNewBackup(long handle, boolean flag); - protected native void purgeOldBackups(long handle, int numBackupsToKeep); -} diff --git a/java/org/rocksdb/BackupableDBOptions.java b/java/org/rocksdb/BackupableDBOptions.java deleted file mode 100644 index 2c5047f77..000000000 --- a/java/org/rocksdb/BackupableDBOptions.java +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -package org.rocksdb; - -/** - * BackupableDBOptions to control the behavior of a backupable database. - * It will be used during the creation of a BackupableDB. 
- * - * Note that dispose() must be called before an Options instance - * become out-of-scope to release the allocated memory in c++. - * - * @param path Where to keep the backup files. Has to be different than dbname. - Best to set this to dbname_ + "/backups" - * @param shareTableFiles If share_table_files == true, backup will assume that - * table files with same name have the same contents. This enables - * incremental backups and avoids unnecessary data copies. If - * share_table_files == false, each backup will be on its own and will not - * share any data with other backups. default: true - * @param sync If sync == true, we can guarantee you'll get consistent backup - * even on a machine crash/reboot. Backup process is slower with sync - * enabled. If sync == false, we don't guarantee anything on machine reboot. - * However, chances are some of the backups are consistent. Default: true - * @param destroyOldData If true, it will delete whatever backups there are - * already. Default: false - * @param backupLogFiles If false, we won't backup log files. This option can be - * useful for backing up in-memory databases where log file are persisted, - * but table files are in memory. Default: true - * @param backupRateLimit Max bytes that can be transferred in a second during - * backup. If 0 or negative, then go as fast as you can. Default: 0 - * @param restoreRateLimit Max bytes that can be transferred in a second during - * restore. If 0 or negative, then go as fast as you can. Default: 0 - */ -public class BackupableDBOptions extends RocksObject { - public BackupableDBOptions(String path, boolean shareTableFiles, boolean sync, - boolean destroyOldData, boolean backupLogFiles, long backupRateLimit, - long restoreRateLimit) { - super(); - - backupRateLimit = (backupRateLimit <= 0) ? 0 : backupRateLimit; - restoreRateLimit = (restoreRateLimit <= 0) ? 
0 : restoreRateLimit; - - newBackupableDBOptions(path, shareTableFiles, sync, destroyOldData, - backupLogFiles, backupRateLimit, restoreRateLimit); - } - - /** - * Returns the path to the BackupableDB directory. - * - * @return the path to the BackupableDB directory. - */ - public String backupDir() { - assert(isInitialized()); - return backupDir(nativeHandle_); - } - - /** - * Release the memory allocated for the current instance - * in the c++ side. - */ - @Override protected void disposeInternal() { - assert(isInitialized()); - disposeInternal(nativeHandle_); - } - - private native void newBackupableDBOptions(String path, - boolean shareTableFiles, boolean sync, boolean destroyOldData, - boolean backupLogFiles, long backupRateLimit, long restoreRateLimit); - private native String backupDir(long handle); - private native void disposeInternal(long handle); -} diff --git a/java/org/rocksdb/BlockBasedTableConfig.java b/java/org/rocksdb/BlockBasedTableConfig.java deleted file mode 100644 index 523a57691..000000000 --- a/java/org/rocksdb/BlockBasedTableConfig.java +++ /dev/null @@ -1,210 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb; - -/** - * The config for plain table sst format. - * - * BlockBasedTable is a RocksDB's default SST file format. - */ -public class BlockBasedTableConfig extends TableFormatConfig { - - public BlockBasedTableConfig() { - noBlockCache_ = false; - blockCacheSize_ = 8 * 1024 * 1024; - blockSize_ = 4 * 1024; - blockSizeDeviation_ =10; - blockRestartInterval_ =16; - wholeKeyFiltering_ = true; - bitsPerKey_ = 0; - } - - /** - * Disable block cache. If this is set to true, - * then no block cache should be used, and the block_cache should - * point to a nullptr object. 
- * Default: false - * - * @param noBlockCache if use block cache - * @return the reference to the current config. - */ - public BlockBasedTableConfig setNoBlockCache(boolean noBlockCache) { - noBlockCache_ = noBlockCache; - return this; - } - - /** - * @return if block cache is disabled - */ - public boolean noBlockCache() { - return noBlockCache_; - } - - /** - * Set the amount of cache in bytes that will be used by RocksDB. - * If cacheSize is non-positive, then cache will not be used. - * DEFAULT: 8M - * - * @param blockCacheSize block cache size in bytes - * @return the reference to the current config. - */ - public BlockBasedTableConfig setBlockCacheSize(long blockCacheSize) { - blockCacheSize_ = blockCacheSize; - return this; - } - - /** - * @return block cache size in bytes - */ - public long blockCacheSize() { - return blockCacheSize_; - } - - /** - * Controls the number of shards for the block cache. - * This is applied only if cacheSize is set to non-negative. - * - * @param numShardBits the number of shard bits. The resulting - * number of shards would be 2 ^ numShardBits. Any negative - * number means use default settings." - * @return the reference to the current option. - */ - public BlockBasedTableConfig setCacheNumShardBits(int numShardBits) { - numShardBits_ = numShardBits; - return this; - } - - /** - * Returns the number of shard bits used in the block cache. - * The resulting number of shards would be 2 ^ (returned value). - * Any negative number means use default settings. - * - * @return the number of shard bits used in the block cache. - */ - public int cacheNumShardBits() { - return numShardBits_; - } - - /** - * Approximate size of user data packed per block. Note that the - * block size specified here corresponds to uncompressed data. The - * actual size of the unit read from disk may be smaller if - * compression is enabled. This parameter can be changed dynamically. 
- * Default: 4K - * - * @param blockSize block size in bytes - * @return the reference to the current config. - */ - public BlockBasedTableConfig setBlockSize(long blockSize) { - blockSize_ = blockSize; - return this; - } - - /** - * @return block size in bytes - */ - public long blockSize() { - return blockSize_; - } - - /** - * This is used to close a block before it reaches the configured - * 'block_size'. If the percentage of free space in the current block is less - * than this specified number and adding a new record to the block will - * exceed the configured block size, then this block will be closed and the - * new record will be written to the next block. - * Default is 10. - * - * @param blockSizeDeviation the deviation to block size allowed - * @return the reference to the current config. - */ - public BlockBasedTableConfig setBlockSizeDeviation(int blockSizeDeviation) { - blockSizeDeviation_ = blockSizeDeviation; - return this; - } - - /** - * @return the hash table ratio. - */ - public int blockSizeDeviation() { - return blockSizeDeviation_; - } - - /** - * Set block restart interval - * - * @param restartInterval block restart interval. - * @return the reference to the current config. - */ - public BlockBasedTableConfig setBlockRestartInterval(int restartInterval) { - blockRestartInterval_ = restartInterval; - return this; - } - - /** - * @return block restart interval - */ - public int blockRestartInterval() { - return blockRestartInterval_; - } - - /** - * If true, place whole keys in the filter (not just prefixes). - * This must generally be true for gets to be efficient. - * Default: true - * - * @param wholeKeyFiltering if enable whole key filtering - * @return the reference to the current config. 
- */ - public BlockBasedTableConfig setWholeKeyFiltering(boolean wholeKeyFiltering) { - wholeKeyFiltering_ = wholeKeyFiltering; - return this; - } - - /** - * @return if whole key filtering is enabled - */ - public boolean wholeKeyFiltering() { - return wholeKeyFiltering_; - } - - /** - * Use the specified filter policy to reduce disk reads. - * - * Filter should not be disposed before options instances using this filter is - * disposed. If dispose() function is not called, then filter object will be - * GC'd automatically. - * - * Filter instance can be re-used in multiple options instances. - * - * @param Filter policy java instance. - * @return the reference to the current config. - */ - public BlockBasedTableConfig setFilterBitsPerKey(int bitsPerKey) { - bitsPerKey_ = bitsPerKey; - return this; - } - - @Override protected long newTableFactoryHandle() { - return newTableFactoryHandle(noBlockCache_, blockCacheSize_, numShardBits_, - blockSize_, blockSizeDeviation_, blockRestartInterval_, - wholeKeyFiltering_, bitsPerKey_); - } - - private native long newTableFactoryHandle( - boolean noBlockCache, long blockCacheSize, int numShardbits, - long blockSize, int blockSizeDeviation, int blockRestartInterval, - boolean wholeKeyFiltering, int bitsPerKey); - - private boolean noBlockCache_; - private long blockCacheSize_; - private int numShardBits_; - private long shard; - private long blockSize_; - private int blockSizeDeviation_; - private int blockRestartInterval_; - private boolean wholeKeyFiltering_; - private int bitsPerKey_; -} diff --git a/java/org/rocksdb/BloomFilter.java b/java/org/rocksdb/BloomFilter.java deleted file mode 100644 index 9c4913a8c..000000000 --- a/java/org/rocksdb/BloomFilter.java +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -package org.rocksdb; - -/** - * This class creates a new filter policy that uses a bloom filter - * with approximately the specified number of bits per key. - * A good value for bitsPerKey is 10, which yields a filter - * with ~ 1% false positive rate. - * - * Default value of bits per key is 10. - */ -public class BloomFilter extends Filter { - private static final int DEFAULT_BITS_PER_KEY = 10; - private final int bitsPerKey_; - - public BloomFilter() { - this(DEFAULT_BITS_PER_KEY); - } - - public BloomFilter(int bitsPerKey) { - super(); - bitsPerKey_ = bitsPerKey; - - createNewFilter(); - } - - @Override - protected void createNewFilter() { - createNewFilter0(bitsPerKey_); - } - - private native void createNewFilter0(int bitsKeyKey); -} diff --git a/java/org/rocksdb/CompressionType.java b/java/org/rocksdb/CompressionType.java deleted file mode 100644 index c5d6253a9..000000000 --- a/java/org/rocksdb/CompressionType.java +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. 
- -package org.rocksdb; - -public enum CompressionType { - NO_COMPRESSION((byte) 0), - SNAPPY_COMPRESSION((byte) 1), - ZLIB_COMPRESSION((byte) 2), - BZLIB2_COMPRESSION((byte) 3), - LZ4_COMPRESSION((byte) 4), - LZ4HC_COMPRESSION((byte) 5); - - private final byte value_; - - private CompressionType(byte value) { - value_ = value; - } - - public byte getValue() { - return value_; - } -} diff --git a/java/org/rocksdb/HashLinkedListMemTableConfig.java b/java/org/rocksdb/HashLinkedListMemTableConfig.java deleted file mode 100644 index 24fcd8b52..000000000 --- a/java/org/rocksdb/HashLinkedListMemTableConfig.java +++ /dev/null @@ -1,52 +0,0 @@ -package org.rocksdb; - -/** - * The config for hash linked list memtable representation - * Such memtable contains a fix-sized array of buckets, where - * each bucket points to a sorted singly-linked - * list (or null if the bucket is empty). - * - * Note that since this mem-table representation relies on the - * key prefix, it is required to invoke one of the usePrefixExtractor - * functions to specify how to extract key prefix given a key. - * If proper prefix-extractor is not set, then RocksDB will - * use the default memtable representation (SkipList) instead - * and post a warning in the LOG. - */ -public class HashLinkedListMemTableConfig extends MemTableConfig { - public static final long DEFAULT_BUCKET_COUNT = 50000; - - public HashLinkedListMemTableConfig() { - bucketCount_ = DEFAULT_BUCKET_COUNT; - } - - /** - * Set the number of buckets in the fixed-size array used - * in the hash linked-list mem-table. - * - * @param count the number of hash buckets. - * @return the reference to the current HashLinkedListMemTableConfig. - */ - public HashLinkedListMemTableConfig setBucketCount(long count) { - bucketCount_ = count; - return this; - } - - /** - * Returns the number of buckets that will be used in the memtable - * created based on this config. 
- * - * @return the number of buckets - */ - public long bucketCount() { - return bucketCount_; - } - - @Override protected long newMemTableFactoryHandle() { - return newMemTableFactoryHandle(bucketCount_); - } - - private native long newMemTableFactoryHandle(long bucketCount); - - private long bucketCount_; -} diff --git a/java/org/rocksdb/Options.java b/java/org/rocksdb/Options.java deleted file mode 100644 index 125f06afd..000000000 --- a/java/org/rocksdb/Options.java +++ /dev/null @@ -1,2240 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -package org.rocksdb; - -/** - * Options to control the behavior of a database. It will be used - * during the creation of a RocksDB (i.e., RocksDB.open()). - * - * If dispose() function is not called, then it will be GC'd automatically and - * native resources will be released as part of the process. - */ -public class Options extends RocksObject { - static final long DEFAULT_CACHE_SIZE = 8 << 20; - static final int DEFAULT_NUM_SHARD_BITS = -1; - /** - * Construct options for opening a RocksDB. - * - * This constructor will create (by allocating a block of memory) - * an rocksdb::Options in the c++ side. - */ - public Options() { - super(); - cacheSize_ = DEFAULT_CACHE_SIZE; - numShardBits_ = DEFAULT_NUM_SHARD_BITS; - newOptions(); - env_ = RocksEnv.getDefault(); - } - - /** - * If this value is set to true, then the database will be created - * if it is missing during RocksDB.open(). - * Default: false - * - * @param flag a flag indicating whether to create a database the - * specified database in RocksDB.open() operation is missing. - * @return the instance of the current Options. 
- * @see RocksDB.open() - */ - public Options setCreateIfMissing(boolean flag) { - assert(isInitialized()); - setCreateIfMissing(nativeHandle_, flag); - return this; - } - - /** - * Use the specified object to interact with the environment, - * e.g. to read/write files, schedule background work, etc. - * Default: RocksEnv.getDefault() - */ - public Options setEnv(RocksEnv env) { - assert(isInitialized()); - setEnv(nativeHandle_, env.nativeHandle_); - env_ = env; - return this; - } - private native void setEnv(long optHandle, long envHandle); - - public RocksEnv getEnv() { - return env_; - } - private native long getEnvHandle(long handle); - - /** - * Return true if the create_if_missing flag is set to true. - * If true, the database will be created if it is missing. - * - * @return true if the createIfMissing option is set to true. - * @see setCreateIfMissing() - */ - public boolean createIfMissing() { - assert(isInitialized()); - return createIfMissing(nativeHandle_); - } - - /** - * Amount of data to build up in memory (backed by an unsorted log - * on disk) before converting to a sorted on-disk file. - * - * Larger values increase performance, especially during bulk loads. - * Up to max_write_buffer_number write buffers may be held in memory - * at the same time, so you may wish to adjust this parameter - * to control memory usage. - * - * Also, a larger write buffer will result in a longer recovery time - * the next time the database is opened. - * - * Default: 4MB - * @param writeBufferSize the size of write buffer. - * @return the instance of the current Options. - * @see RocksDB.open() - */ - public Options setWriteBufferSize(long writeBufferSize) { - assert(isInitialized()); - setWriteBufferSize(nativeHandle_, writeBufferSize); - return this; - } - - /** - * Return size of write buffer size. - * - * @return size of write buffer. 
- * @see setWriteBufferSize() - */ - public long writeBufferSize() { - assert(isInitialized()); - return writeBufferSize(nativeHandle_); - } - - /** - * The maximum number of write buffers that are built up in memory. - * The default is 2, so that when 1 write buffer is being flushed to - * storage, new writes can continue to the other write buffer. - * Default: 2 - * - * @param maxWriteBufferNumber maximum number of write buffers. - * @return the instance of the current Options. - * @see RocksDB.open() - */ - public Options setMaxWriteBufferNumber(int maxWriteBufferNumber) { - assert(isInitialized()); - setMaxWriteBufferNumber(nativeHandle_, maxWriteBufferNumber); - return this; - } - - /** - * Returns maximum number of write buffers. - * - * @return maximum number of write buffers. - * @see setMaxWriteBufferNumber() - */ - public int maxWriteBufferNumber() { - assert(isInitialized()); - return maxWriteBufferNumber(nativeHandle_); - } - - /** - * If true, an error will be thrown during RocksDB.open() if the - * database already exists. - * - * @return if true, an error is raised when the specified database - * already exists before open. - */ - public boolean errorIfExists() { - assert(isInitialized()); - return errorIfExists(nativeHandle_); - } - private native boolean errorIfExists(long handle); - - /** - * If true, an error will be thrown during RocksDB.open() if the - * database already exists. - * Default: false - * - * @param errorIfExists if true, an exception will be thrown - * during RocksDB.open() if the database already exists. - * @return the reference to the current option. 
- * @see RocksDB.open() - */ - public Options setErrorIfExists(boolean errorIfExists) { - assert(isInitialized()); - setErrorIfExists(nativeHandle_, errorIfExists); - return this; - } - private native void setErrorIfExists(long handle, boolean errorIfExists); - - /** - * If true, the implementation will do aggressive checking of the - * data it is processing and will stop early if it detects any - * errors. This may have unforeseen ramifications: for example, a - * corruption of one DB entry may cause a large number of entries to - * become unreadable or for the entire DB to become unopenable. - * If any of the writes to the database fails (Put, Delete, Merge, Write), - * the database will switch to read-only mode and fail all other - * Write operations. - * - * @return a boolean indicating whether paranoid-check is on. - */ - public boolean paranoidChecks() { - assert(isInitialized()); - return paranoidChecks(nativeHandle_); - } - private native boolean paranoidChecks(long handle); - - /** - * If true, the implementation will do aggressive checking of the - * data it is processing and will stop early if it detects any - * errors. This may have unforeseen ramifications: for example, a - * corruption of one DB entry may cause a large number of entries to - * become unreadable or for the entire DB to become unopenable. - * If any of the writes to the database fails (Put, Delete, Merge, Write), - * the database will switch to read-only mode and fail all other - * Write operations. - * Default: true - * - * @param paranoidChecks a flag to indicate whether paranoid-check - * is on. - * @return the reference to the current option. - */ - public Options setParanoidChecks(boolean paranoidChecks) { - assert(isInitialized()); - setParanoidChecks(nativeHandle_, paranoidChecks); - return this; - } - private native void setParanoidChecks( - long handle, boolean paranoidChecks); - - /** - * Number of open files that can be used by the DB. 
You may need to - * increase this if your database has a large working set. Value -1 means - * files opened are always kept open. You can estimate number of files based - * on target_file_size_base and target_file_size_multiplier for level-based - * compaction. For universal-style compaction, you can usually set it to -1. - * - * @return the maximum number of open files. - */ - public int maxOpenFiles() { - assert(isInitialized()); - return maxOpenFiles(nativeHandle_); - } - private native int maxOpenFiles(long handle); - - /** - * Number of open files that can be used by the DB. You may need to - * increase this if your database has a large working set. Value -1 means - * files opened are always kept open. You can estimate number of files based - * on target_file_size_base and target_file_size_multiplier for level-based - * compaction. For universal-style compaction, you can usually set it to -1. - * Default: 5000 - * - * @param maxOpenFiles the maximum number of open files. - * @return the reference to the current option. - */ - public Options setMaxOpenFiles(int maxOpenFiles) { - assert(isInitialized()); - setMaxOpenFiles(nativeHandle_, maxOpenFiles); - return this; - } - private native void setMaxOpenFiles(long handle, int maxOpenFiles); - - /** - * If true, then the contents of data files are not synced - * to stable storage. Their contents remain in the OS buffers till the - * OS decides to flush them. This option is good for bulk-loading - * of data. Once the bulk-loading is complete, please issue a - * sync to the OS to flush all dirty buffesrs to stable storage. - * - * @return if true, then data-sync is disabled. - */ - public boolean disableDataSync() { - assert(isInitialized()); - return disableDataSync(nativeHandle_); - } - private native boolean disableDataSync(long handle); - - /** - * If true, then the contents of data files are not synced - * to stable storage. Their contents remain in the OS buffers till the - * OS decides to flush them. 
This option is good for bulk-loading - * of data. Once the bulk-loading is complete, please issue a - * sync to the OS to flush all dirty buffesrs to stable storage. - * Default: false - * - * @param disableDataSync a boolean flag to specify whether to - * disable data sync. - * @return the reference to the current option. - */ - public Options setDisableDataSync(boolean disableDataSync) { - assert(isInitialized()); - setDisableDataSync(nativeHandle_, disableDataSync); - return this; - } - private native void setDisableDataSync(long handle, boolean disableDataSync); - - /** - * If true, then every store to stable storage will issue a fsync. - * If false, then every store to stable storage will issue a fdatasync. - * This parameter should be set to true while storing data to - * filesystem like ext3 that can lose files after a reboot. - * - * @return true if fsync is used. - */ - public boolean useFsync() { - assert(isInitialized()); - return useFsync(nativeHandle_); - } - private native boolean useFsync(long handle); - - /** - * If true, then every store to stable storage will issue a fsync. - * If false, then every store to stable storage will issue a fdatasync. - * This parameter should be set to true while storing data to - * filesystem like ext3 that can lose files after a reboot. - * Default: false - * - * @param useFsync a boolean flag to specify whether to use fsync - * @return the reference to the current option. - */ - public Options setUseFsync(boolean useFsync) { - assert(isInitialized()); - setUseFsync(nativeHandle_, useFsync); - return this; - } - private native void setUseFsync(long handle, boolean useFsync); - - /** - * The time interval in seconds between each two consecutive stats logs. - * This number controls how often a new scribe log about - * db deploy stats is written out. - * -1 indicates no logging at all. - * - * @return the time interval in seconds between each two consecutive - * stats logs. 
- */ - public int dbStatsLogInterval() { - assert(isInitialized()); - return dbStatsLogInterval(nativeHandle_); - } - private native int dbStatsLogInterval(long handle); - - /** - * The time interval in seconds between each two consecutive stats logs. - * This number controls how often a new scribe log about - * db deploy stats is written out. - * -1 indicates no logging at all. - * Default value is 1800 (half an hour). - * - * @param dbStatsLogInterval the time interval in seconds between each - * two consecutive stats logs. - * @return the reference to the current option. - */ - public Options setDbStatsLogInterval(int dbStatsLogInterval) { - assert(isInitialized()); - setDbStatsLogInterval(nativeHandle_, dbStatsLogInterval); - return this; - } - private native void setDbStatsLogInterval( - long handle, int dbStatsLogInterval); - - /** - * Returns the directory of info log. - * - * If it is empty, the log files will be in the same dir as data. - * If it is non empty, the log files will be in the specified dir, - * and the db data dir's absolute path will be used as the log file - * name's prefix. - * - * @return the path to the info log directory - */ - public String dbLogDir() { - assert(isInitialized()); - return dbLogDir(nativeHandle_); - } - private native String dbLogDir(long handle); - - /** - * This specifies the info LOG dir. - * If it is empty, the log files will be in the same dir as data. - * If it is non empty, the log files will be in the specified dir, - * and the db data dir's absolute path will be used as the log file - * name's prefix. - * - * @param dbLogDir the path to the info log directory - * @return the reference to the current option. - */ - public Options setDbLogDir(String dbLogDir) { - assert(isInitialized()); - setDbLogDir(nativeHandle_, dbLogDir); - return this; - } - private native void setDbLogDir(long handle, String dbLogDir); - - /** - * Returns the path to the write-ahead-logs (WAL) directory. 
- * - * If it is empty, the log files will be in the same dir as data, - * dbname is used as the data dir by default - * If it is non empty, the log files will be in kept the specified dir. - * When destroying the db, - * all log files in wal_dir and the dir itself is deleted - * - * @return the path to the write-ahead-logs (WAL) directory. - */ - public String walDir() { - assert(isInitialized()); - return walDir(nativeHandle_); - } - private native String walDir(long handle); - - /** - * This specifies the absolute dir path for write-ahead logs (WAL). - * If it is empty, the log files will be in the same dir as data, - * dbname is used as the data dir by default - * If it is non empty, the log files will be in kept the specified dir. - * When destroying the db, - * all log files in wal_dir and the dir itself is deleted - * - * @param walDir the path to the write-ahead-log directory. - * @return the reference to the current option. - */ - public Options setWalDir(String walDir) { - assert(isInitialized()); - setWalDir(nativeHandle_, walDir); - return this; - } - private native void setWalDir(long handle, String walDir); - - /** - * The periodicity when obsolete files get deleted. The default - * value is 6 hours. The files that get out of scope by compaction - * process will still get automatically delete on every compaction, - * regardless of this setting - * - * @return the time interval in micros when obsolete files will be deleted. - */ - public long deleteObsoleteFilesPeriodMicros() { - assert(isInitialized()); - return deleteObsoleteFilesPeriodMicros(nativeHandle_); - } - private native long deleteObsoleteFilesPeriodMicros(long handle); - - /** - * The periodicity when obsolete files get deleted. The default - * value is 6 hours. 
The files that get out of scope by compaction - * process will still get automatically delete on every compaction, - * regardless of this setting - * - * @param micros the time interval in micros - * @return the reference to the current option. - */ - public Options setDeleteObsoleteFilesPeriodMicros(long micros) { - assert(isInitialized()); - setDeleteObsoleteFilesPeriodMicros(nativeHandle_, micros); - return this; - } - private native void setDeleteObsoleteFilesPeriodMicros( - long handle, long micros); - - /** - * Returns the maximum number of concurrent background compaction jobs, - * submitted to the default LOW priority thread pool. - * When increasing this number, we may also want to consider increasing - * number of threads in LOW priority thread pool. - * Default: 1 - * - * @return the maximum number of concurrent background compaction jobs. - * @see Env.setBackgroundThreads() - */ - public int maxBackgroundCompactions() { - assert(isInitialized()); - return maxBackgroundCompactions(nativeHandle_); - } - - /** - * Creates statistics object which collects metrics about database operations. - Statistics objects should not be shared between DB instances as - it does not use any locks to prevent concurrent updates. - * - * @return the instance of the current Options. - * @see RocksDB.open() - */ - public Options createStatistics() { - assert(isInitialized()); - createStatistics(nativeHandle_); - return this; - } - - /** - * Returns statistics object. Calls createStatistics() if - * C++ returns NULL pointer for statistics. - * - * @return the instance of the statistics object. 
- * @see createStatistics() - */ - public Statistics statisticsPtr() { - assert(isInitialized()); - - long statsPtr = statisticsPtr(nativeHandle_); - if(statsPtr == 0) { - createStatistics(); - statsPtr = statisticsPtr(nativeHandle_); - } - - return new Statistics(statsPtr); - } - - /** - * Specifies the maximum number of concurrent background compaction jobs, - * submitted to the default LOW priority thread pool. - * If you're increasing this, also consider increasing number of threads in - * LOW priority thread pool. For more information, see - * Default: 1 - * - * @param maxBackgroundCompactions the maximum number of background - * compaction jobs. - * @return the reference to the current option. - * - * @see Env.setBackgroundThreads() - * @see maxBackgroundFlushes() - */ - public Options setMaxBackgroundCompactions(int maxBackgroundCompactions) { - assert(isInitialized()); - setMaxBackgroundCompactions(nativeHandle_, maxBackgroundCompactions); - return this; - } - - /** - * Returns the maximum number of concurrent background flush jobs. - * If you're increasing this, also consider increasing number of threads in - * HIGH priority thread pool. For more information, see - * Default: 1 - * - * @return the maximum number of concurrent background flush jobs. - * @see Env.setBackgroundThreads() - */ - public int maxBackgroundFlushes() { - assert(isInitialized()); - return maxBackgroundFlushes(nativeHandle_); - } - private native int maxBackgroundFlushes(long handle); - - /** - * Specifies the maximum number of concurrent background flush jobs. - * If you're increasing this, also consider increasing number of threads in - * HIGH priority thread pool. For more information, see - * Default: 1 - * - * @param maxBackgroundFlushes - * @return the reference to the current option. 
- * - * @see Env.setBackgroundThreads() - * @see maxBackgroundCompactions() - */ - public Options setMaxBackgroundFlushes(int maxBackgroundFlushes) { - assert(isInitialized()); - setMaxBackgroundFlushes(nativeHandle_, maxBackgroundFlushes); - return this; - } - private native void setMaxBackgroundFlushes( - long handle, int maxBackgroundFlushes); - - /** - * Returns the maximum size of a info log file. If the current log file - * is larger than this size, a new info log file will be created. - * If 0, all logs will be written to one log file. - * - * @return the maximum size of the info log file. - */ - public long maxLogFileSize() { - assert(isInitialized()); - return maxLogFileSize(nativeHandle_); - } - private native long maxLogFileSize(long handle); - - /** - * Specifies the maximum size of a info log file. If the current log file - * is larger than `max_log_file_size`, a new info log file will - * be created. - * If 0, all logs will be written to one log file. - * - * @param maxLogFileSize the maximum size of a info log file. - * @return the reference to the current option. - */ - public Options setMaxLogFileSize(long maxLogFileSize) { - assert(isInitialized()); - setMaxLogFileSize(nativeHandle_, maxLogFileSize); - return this; - } - private native void setMaxLogFileSize(long handle, long maxLogFileSize); - - /** - * Returns the time interval for the info log file to roll (in seconds). - * If specified with non-zero value, log file will be rolled - * if it has been active longer than `log_file_time_to_roll`. - * Default: 0 (disabled) - * - * @return the time interval in seconds. - */ - public long logFileTimeToRoll() { - assert(isInitialized()); - return logFileTimeToRoll(nativeHandle_); - } - private native long logFileTimeToRoll(long handle); - - /** - * Specifies the time interval for the info log file to roll (in seconds). - * If specified with non-zero value, log file will be rolled - * if it has been active longer than `log_file_time_to_roll`. 
- * Default: 0 (disabled) - * - * @param logFileTimeToRoll the time interval in seconds. - * @return the reference to the current option. - */ - public Options setLogFileTimeToRoll(long logFileTimeToRoll) { - assert(isInitialized()); - setLogFileTimeToRoll(nativeHandle_, logFileTimeToRoll); - return this; - } - private native void setLogFileTimeToRoll( - long handle, long logFileTimeToRoll); - - /** - * Returns the maximum number of info log files to be kept. - * Default: 1000 - * - * @return the maximum number of info log files to be kept. - */ - public long keepLogFileNum() { - assert(isInitialized()); - return keepLogFileNum(nativeHandle_); - } - private native long keepLogFileNum(long handle); - - /** - * Specifies the maximum number of info log files to be kept. - * Default: 1000 - * - * @param keepLogFileNum the maximum number of info log files to be kept. - * @return the reference to the current option. - */ - public Options setKeepLogFileNum(long keepLogFileNum) { - assert(isInitialized()); - setKeepLogFileNum(nativeHandle_, keepLogFileNum); - return this; - } - private native void setKeepLogFileNum(long handle, long keepLogFileNum); - - /** - * Manifest file is rolled over on reaching this limit. - * The older manifest file be deleted. - * The default value is MAX_INT so that roll-over does not take place. - * - * @return the size limit of a manifest file. - */ - public long maxManifestFileSize() { - assert(isInitialized()); - return maxManifestFileSize(nativeHandle_); - } - private native long maxManifestFileSize(long handle); - - /** - * Manifest file is rolled over on reaching this limit. - * The older manifest file be deleted. - * The default value is MAX_INT so that roll-over does not take place. - * - * @param maxManifestFileSize the size limit of a manifest file. - * @return the reference to the current option. 
- */ - public Options setMaxManifestFileSize(long maxManifestFileSize) { - assert(isInitialized()); - setMaxManifestFileSize(nativeHandle_, maxManifestFileSize); - return this; - } - private native void setMaxManifestFileSize( - long handle, long maxManifestFileSize); - - /** - * Number of shards used for table cache. - * - * @return the number of shards used for table cache. - */ - public int tableCacheNumshardbits() { - assert(isInitialized()); - return tableCacheNumshardbits(nativeHandle_); - } - private native int tableCacheNumshardbits(long handle); - - /** - * Number of shards used for table cache. - * - * @param tableCacheNumshardbits the number of chards - * @return the reference to the current option. - */ - public Options setTableCacheNumshardbits(int tableCacheNumshardbits) { - assert(isInitialized()); - setTableCacheNumshardbits(nativeHandle_, tableCacheNumshardbits); - return this; - } - private native void setTableCacheNumshardbits( - long handle, int tableCacheNumshardbits); - - /** - * During data eviction of table's LRU cache, it would be inefficient - * to strictly follow LRU because this piece of memory will not really - * be released unless its refcount falls to zero. Instead, make two - * passes: the first pass will release items with refcount = 1, - * and if not enough space releases after scanning the number of - * elements specified by this parameter, we will remove items in LRU - * order. - * - * @return scan count limit - */ - public int tableCacheRemoveScanCountLimit() { - assert(isInitialized()); - return tableCacheRemoveScanCountLimit(nativeHandle_); - } - private native int tableCacheRemoveScanCountLimit(long handle); - - /** - * During data eviction of table's LRU cache, it would be inefficient - * to strictly follow LRU because this piece of memory will not really - * be released unless its refcount falls to zero. 
Instead, make two - * passes: the first pass will release items with refcount = 1, - * and if not enough space releases after scanning the number of - * elements specified by this parameter, we will remove items in LRU - * order. - * - * @param limit scan count limit - * @return the reference to the current option. - */ - public Options setTableCacheRemoveScanCountLimit(int limit) { - assert(isInitialized()); - setTableCacheRemoveScanCountLimit(nativeHandle_, limit); - return this; - } - private native void setTableCacheRemoveScanCountLimit( - long handle, int limit); - - /** - * WalTtlSeconds() and walSizeLimitMB() affect how archived logs - * will be deleted. - * 1. If both set to 0, logs will be deleted asap and will not get into - * the archive. - * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, - * WAL files will be checked every 10 min and if total size is greater - * then WAL_size_limit_MB, they will be deleted starting with the - * earliest until size_limit is met. All empty files will be deleted. - * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then - * WAL files will be checked every WAL_ttl_secondsi / 2 and those that - * are older than WAL_ttl_seconds will be deleted. - * 4. If both are not 0, WAL files will be checked every 10 min and both - * checks will be performed with ttl being first. - * - * @return the wal-ttl seconds - * @see walSizeLimitMB() - */ - public long walTtlSeconds() { - assert(isInitialized()); - return walTtlSeconds(nativeHandle_); - } - private native long walTtlSeconds(long handle); - - /** - * WalTtlSeconds() and walSizeLimitMB() affect how archived logs - * will be deleted. - * 1. If both set to 0, logs will be deleted asap and will not get into - * the archive. - * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, - * WAL files will be checked every 10 min and if total size is greater - * then WAL_size_limit_MB, they will be deleted starting with the - * earliest until size_limit is met. 
All empty files will be deleted. - * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then - * WAL files will be checked every WAL_ttl_secondsi / 2 and those that - * are older than WAL_ttl_seconds will be deleted. - * 4. If both are not 0, WAL files will be checked every 10 min and both - * checks will be performed with ttl being first. - * - * @param walTtlSeconds the ttl seconds - * @return the reference to the current option. - * @see setWalSizeLimitMB() - */ - public Options setWalTtlSeconds(long walTtlSeconds) { - assert(isInitialized()); - setWalTtlSeconds(nativeHandle_, walTtlSeconds); - return this; - } - private native void setWalTtlSeconds(long handle, long walTtlSeconds); - - /** - * WalTtlSeconds() and walSizeLimitMB() affect how archived logs - * will be deleted. - * 1. If both set to 0, logs will be deleted asap and will not get into - * the archive. - * 2. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, - * WAL files will be checked every 10 min and if total size is greater - * then WAL_size_limit_MB, they will be deleted starting with the - * earliest until size_limit is met. All empty files will be deleted. - * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then - * WAL files will be checked every WAL_ttl_secondsi / 2 and those that - * are older than WAL_ttl_seconds will be deleted. - * 4. If both are not 0, WAL files will be checked every 10 min and both - * checks will be performed with ttl being first. - * - * @return size limit in mega-bytes. - * @see walSizeLimitMB() - */ - public long walSizeLimitMB() { - assert(isInitialized()); - return walSizeLimitMB(nativeHandle_); - } - private native long walSizeLimitMB(long handle); - - /** - * WalTtlSeconds() and walSizeLimitMB() affect how archived logs - * will be deleted. - * 1. If both set to 0, logs will be deleted asap and will not get into - * the archive. - * 2. 
If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, - * WAL files will be checked every 10 min and if total size is greater - * then WAL_size_limit_MB, they will be deleted starting with the - * earliest until size_limit is met. All empty files will be deleted. - * 3. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then - * WAL files will be checked every WAL_ttl_secondsi / 2 and those that - * are older than WAL_ttl_seconds will be deleted. - * 4. If both are not 0, WAL files will be checked every 10 min and both - * checks will be performed with ttl being first. - * - * @param sizeLimitMB size limit in mega-bytes. - * @return the reference to the current option. - * @see setWalSizeLimitMB() - */ - public Options setWalSizeLimitMB(long sizeLimitMB) { - assert(isInitialized()); - setWalSizeLimitMB(nativeHandle_, sizeLimitMB); - return this; - } - private native void setWalSizeLimitMB(long handle, long sizeLimitMB); - - /** - * Number of bytes to preallocate (via fallocate) the manifest - * files. Default is 4mb, which is reasonable to reduce random IO - * as well as prevent overallocation for mounts that preallocate - * large amounts of data (such as xfs's allocsize option). - * - * @return size in bytes. - */ - public long manifestPreallocationSize() { - assert(isInitialized()); - return manifestPreallocationSize(nativeHandle_); - } - private native long manifestPreallocationSize(long handle); - - /** - * Number of bytes to preallocate (via fallocate) the manifest - * files. Default is 4mb, which is reasonable to reduce random IO - * as well as prevent overallocation for mounts that preallocate - * large amounts of data (such as xfs's allocsize option). - * - * @param size the size in byte - * @return the reference to the current option. 
- */ - public Options setManifestPreallocationSize(long size) { - assert(isInitialized()); - setManifestPreallocationSize(nativeHandle_, size); - return this; - } - private native void setManifestPreallocationSize( - long handle, long size); - - /** - * Data being read from file storage may be buffered in the OS - * Default: true - * - * @return if true, then OS buffering is allowed. - */ - public boolean allowOsBuffer() { - assert(isInitialized()); - return allowOsBuffer(nativeHandle_); - } - private native boolean allowOsBuffer(long handle); - - /** - * Data being read from file storage may be buffered in the OS - * Default: true - * - * @param allowOsBufferif true, then OS buffering is allowed. - * @return the reference to the current option. - */ - public Options setAllowOsBuffer(boolean allowOsBuffer) { - assert(isInitialized()); - setAllowOsBuffer(nativeHandle_, allowOsBuffer); - return this; - } - private native void setAllowOsBuffer( - long handle, boolean allowOsBuffer); - - /** - * Allow the OS to mmap file for reading sst tables. - * Default: false - * - * @return true if mmap reads are allowed. - */ - public boolean allowMmapReads() { - assert(isInitialized()); - return allowMmapReads(nativeHandle_); - } - private native boolean allowMmapReads(long handle); - - /** - * Allow the OS to mmap file for reading sst tables. - * Default: false - * - * @param allowMmapReads true if mmap reads are allowed. - * @return the reference to the current option. - */ - public Options setAllowMmapReads(boolean allowMmapReads) { - assert(isInitialized()); - setAllowMmapReads(nativeHandle_, allowMmapReads); - return this; - } - private native void setAllowMmapReads( - long handle, boolean allowMmapReads); - - /** - * Allow the OS to mmap file for writing. Default: false - * - * @return true if mmap writes are allowed. 
- */ - public boolean allowMmapWrites() { - assert(isInitialized()); - return allowMmapWrites(nativeHandle_); - } - private native boolean allowMmapWrites(long handle); - - /** - * Allow the OS to mmap file for writing. Default: false - * - * @param allowMmapWrites true if mmap writes are allowd. - * @return the reference to the current option. - */ - public Options setAllowMmapWrites(boolean allowMmapWrites) { - assert(isInitialized()); - setAllowMmapWrites(nativeHandle_, allowMmapWrites); - return this; - } - private native void setAllowMmapWrites( - long handle, boolean allowMmapWrites); - - /** - * Disable child process inherit open files. Default: true - * - * @return true if child process inheriting open files is disabled. - */ - public boolean isFdCloseOnExec() { - assert(isInitialized()); - return isFdCloseOnExec(nativeHandle_); - } - private native boolean isFdCloseOnExec(long handle); - - /** - * Disable child process inherit open files. Default: true - * - * @param isFdCloseOnExec true if child process inheriting open - * files is disabled. - * @return the reference to the current option. - */ - public Options setIsFdCloseOnExec(boolean isFdCloseOnExec) { - assert(isInitialized()); - setIsFdCloseOnExec(nativeHandle_, isFdCloseOnExec); - return this; - } - private native void setIsFdCloseOnExec( - long handle, boolean isFdCloseOnExec); - - /** - * Skip log corruption error on recovery (If client is ok with - * losing most recent changes) - * Default: false - * - * @return true if log corruption errors are skipped during recovery. - */ - public boolean skipLogErrorOnRecovery() { - assert(isInitialized()); - return skipLogErrorOnRecovery(nativeHandle_); - } - private native boolean skipLogErrorOnRecovery(long handle); - - /** - * Skip log corruption error on recovery (If client is ok with - * losing most recent changes) - * Default: false - * - * @param skip true if log corruption errors are skipped during recovery. 
- * @return the reference to the current option. - */ - public Options setSkipLogErrorOnRecovery(boolean skip) { - assert(isInitialized()); - setSkipLogErrorOnRecovery(nativeHandle_, skip); - return this; - } - private native void setSkipLogErrorOnRecovery( - long handle, boolean skip); - - /** - * If not zero, dump rocksdb.stats to LOG every stats_dump_period_sec - * Default: 3600 (1 hour) - * - * @return time interval in seconds. - */ - public int statsDumpPeriodSec() { - assert(isInitialized()); - return statsDumpPeriodSec(nativeHandle_); - } - private native int statsDumpPeriodSec(long handle); - - /** - * if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec - * Default: 3600 (1 hour) - * - * @param statsDumpPeriodSec time interval in seconds. - * @return the reference to the current option. - */ - public Options setStatsDumpPeriodSec(int statsDumpPeriodSec) { - assert(isInitialized()); - setStatsDumpPeriodSec(nativeHandle_, statsDumpPeriodSec); - return this; - } - private native void setStatsDumpPeriodSec( - long handle, int statsDumpPeriodSec); - - /** - * If set true, will hint the underlying file system that the file - * access pattern is random, when a sst file is opened. - * Default: true - * - * @return true if hinting random access is on. - */ - public boolean adviseRandomOnOpen() { - return adviseRandomOnOpen(nativeHandle_); - } - private native boolean adviseRandomOnOpen(long handle); - - /** - * If set true, will hint the underlying file system that the file - * access pattern is random, when a sst file is opened. - * Default: true - * - * @param adviseRandomOnOpen true if hinting random access is on. - * @return the reference to the current option. 
- */ - public Options setAdviseRandomOnOpen(boolean adviseRandomOnOpen) { - assert(isInitialized()); - setAdviseRandomOnOpen(nativeHandle_, adviseRandomOnOpen); - return this; - } - private native void setAdviseRandomOnOpen( - long handle, boolean adviseRandomOnOpen); - - /** - * Use adaptive mutex, which spins in the user space before resorting - * to kernel. This could reduce context switch when the mutex is not - * heavily contended. However, if the mutex is hot, we could end up - * wasting spin time. - * Default: false - * - * @return true if adaptive mutex is used. - */ - public boolean useAdaptiveMutex() { - assert(isInitialized()); - return useAdaptiveMutex(nativeHandle_); - } - private native boolean useAdaptiveMutex(long handle); - - /** - * Use adaptive mutex, which spins in the user space before resorting - * to kernel. This could reduce context switch when the mutex is not - * heavily contended. However, if the mutex is hot, we could end up - * wasting spin time. - * Default: false - * - * @param useAdaptiveMutex true if adaptive mutex is used. - * @return the reference to the current option. - */ - public Options setUseAdaptiveMutex(boolean useAdaptiveMutex) { - assert(isInitialized()); - setUseAdaptiveMutex(nativeHandle_, useAdaptiveMutex); - return this; - } - private native void setUseAdaptiveMutex( - long handle, boolean useAdaptiveMutex); - - /** - * Allows OS to incrementally sync files to disk while they are being - * written, asynchronously, in the background. - * Issue one request for every bytes_per_sync written. 0 turns it off. - * Default: 0 - * - * @return size in bytes - */ - public long bytesPerSync() { - return bytesPerSync(nativeHandle_); - } - private native long bytesPerSync(long handle); - - /** - * Allows OS to incrementally sync files to disk while they are being - * written, asynchronously, in the background. - * Issue one request for every bytes_per_sync written. 0 turns it off. 
- * Default: 0 - * - * @param bytesPerSync size in bytes - * @return the reference to the current option. - */ - public Options setBytesPerSync(long bytesPerSync) { - assert(isInitialized()); - setBytesPerSync(nativeHandle_, bytesPerSync); - return this; - } - private native void setBytesPerSync( - long handle, long bytesPerSync); - - /** - * Allow RocksDB to use thread local storage to optimize performance. - * Default: true - * - * @return true if thread-local storage is allowed - */ - public boolean allowThreadLocal() { - assert(isInitialized()); - return allowThreadLocal(nativeHandle_); - } - private native boolean allowThreadLocal(long handle); - - /** - * Allow RocksDB to use thread local storage to optimize performance. - * Default: true - * - * @param allowThreadLocal true if thread-local storage is allowed. - * @return the reference to the current option. - */ - public Options setAllowThreadLocal(boolean allowThreadLocal) { - assert(isInitialized()); - setAllowThreadLocal(nativeHandle_, allowThreadLocal); - return this; - } - private native void setAllowThreadLocal( - long handle, boolean allowThreadLocal); - - /** - * Set the config for mem-table. - * - * @param config the mem-table config. - * @return the instance of the current Options. - */ - public Options setMemTableConfig(MemTableConfig config) { - setMemTableFactory(nativeHandle_, config.newMemTableFactoryHandle()); - return this; - } - - /** - * Returns the name of the current mem table representation. - * Memtable format can be set using setTableFormatConfig. - * - * @return the name of the currently-used memtable factory. - * @see setTableFormatConfig() - */ - public String memTableFactoryName() { - assert(isInitialized()); - return memTableFactoryName(nativeHandle_); - } - - /** - * Set the config for table format. - * - * @param config the table format config. - * @return the reference of the current Options. 
- */ - public Options setTableFormatConfig(TableFormatConfig config) { - setTableFactory(nativeHandle_, config.newTableFactoryHandle()); - return this; - } - - /** - * @return the name of the currently used table factory. - */ - public String tableFactoryName() { - assert(isInitialized()); - return tableFactoryName(nativeHandle_); - } - - /** - * This prefix-extractor uses the first n bytes of a key as its prefix. - * - * In some hash-based memtable representation such as HashLinkedList - * and HashSkipList, prefixes are used to partition the keys into - * several buckets. Prefix extractor is used to specify how to - * extract the prefix given a key. - * - * @param n use the first n bytes of a key as its prefix. - */ - public Options useFixedLengthPrefixExtractor(int n) { - assert(isInitialized()); - useFixedLengthPrefixExtractor(nativeHandle_, n); - return this; - } - -/////////////////////////////////////////////////////////////////////// - /** - * Number of keys between restart points for delta encoding of keys. - * This parameter can be changed dynamically. Most clients should - * leave this parameter alone. - * Default: 16 - * - * @return the number of keys between restart points. - */ - public int blockRestartInterval() { - return blockRestartInterval(nativeHandle_); - } - private native int blockRestartInterval(long handle); - - /** - * Number of keys between restart points for delta encoding of keys. - * This parameter can be changed dynamically. Most clients should - * leave this parameter alone. - * Default: 16 - * - * @param blockRestartInterval the number of keys between restart points. - * @return the reference to the current option. - */ - public Options setBlockRestartInterval(int blockRestartInterval) { - setBlockRestartInterval(nativeHandle_, blockRestartInterval); - return this; - } - private native void setBlockRestartInterval( - long handle, int blockRestartInterval); - - /** - * Compress blocks using the specified compression algorithm. 
This - parameter can be changed dynamically. - * - * Default: SNAPPY_COMPRESSION, which gives lightweight but fast compression. - * - * @return Compression type. - */ - public CompressionType compressionType() { - return CompressionType.values()[compressionType(nativeHandle_)]; - } - private native byte compressionType(long handle); - - /** - * Compress blocks using the specified compression algorithm. This - parameter can be changed dynamically. - * - * Default: SNAPPY_COMPRESSION, which gives lightweight but fast compression. - * - * @param compressionType Compression Type. - * @return the reference to the current option. - */ - public Options setCompressionType(CompressionType compressionType) { - setCompressionType(nativeHandle_, compressionType.getValue()); - return this; - } - private native void setCompressionType(long handle, byte compressionType); - - /** - * Compaction style for DB. - * - * @return Compaction style. - */ - public CompactionStyle compactionStyle() { - return CompactionStyle.values()[compactionStyle(nativeHandle_)]; - } - private native byte compactionStyle(long handle); - - /** - * Set compaction style for DB. - * - * Default: LEVEL. - * - * @param compactionStyle Compaction style. - * @return the reference to the current option. - */ - public Options setCompactionStyle(CompactionStyle compactionStyle) { - setCompactionStyle(nativeHandle_, compactionStyle.getValue()); - return this; - } - private native void setCompactionStyle(long handle, byte compactionStyle); - - /** - * If level-styled compaction is used, then this number determines - * the total number of levels. - * - * @return the number of levels. - */ - public int numLevels() { - return numLevels(nativeHandle_); - } - private native int numLevels(long handle); - - /** - * Set the number of levels for this database - * If level-styled compaction is used, then this number determines - * the total number of levels. - * - * @param numLevels the number of levels. 
- * @return the reference to the current option. - */ - public Options setNumLevels(int numLevels) { - setNumLevels(nativeHandle_, numLevels); - return this; - } - private native void setNumLevels( - long handle, int numLevels); - - /** - * The number of files in leve 0 to trigger compaction from level-0 to - * level-1. A value < 0 means that level-0 compaction will not be - * triggered by number of files at all. - * Default: 4 - * - * @return the number of files in level 0 to trigger compaction. - */ - public int levelZeroFileNumCompactionTrigger() { - return levelZeroFileNumCompactionTrigger(nativeHandle_); - } - private native int levelZeroFileNumCompactionTrigger(long handle); - - /** - * Number of files to trigger level-0 compaction. A value <0 means that - * level-0 compaction will not be triggered by number of files at all. - * Default: 4 - * - * @param numFiles the number of files in level-0 to trigger compaction. - * @return the reference to the current option. - */ - public Options setLevelZeroFileNumCompactionTrigger( - int numFiles) { - setLevelZeroFileNumCompactionTrigger( - nativeHandle_, numFiles); - return this; - } - private native void setLevelZeroFileNumCompactionTrigger( - long handle, int numFiles); - - /** - * Soft limit on the number of level-0 files. We start slowing down writes - * at this point. A value < 0 means that no writing slow down will be - * triggered by number of files in level-0. - * - * @return the soft limit on the number of level-0 files. - */ - public int levelZeroSlowdownWritesTrigger() { - return levelZeroSlowdownWritesTrigger(nativeHandle_); - } - private native int levelZeroSlowdownWritesTrigger(long handle); - - /** - * Soft limit on number of level-0 files. We start slowing down writes at this - * point. A value <0 means that no writing slow down will be triggered by - * number of files in level-0. - * - * @param numFiles soft limit on number of level-0 files. - * @return the reference to the current option. 
- */ - public Options setLevelZeroSlowdownWritesTrigger( - int numFiles) { - setLevelZeroSlowdownWritesTrigger(nativeHandle_, numFiles); - return this; - } - private native void setLevelZeroSlowdownWritesTrigger( - long handle, int numFiles); - - /** - * Maximum number of level-0 files. We stop writes at this point. - * - * @return the hard limit of the number of level-0 file. - */ - public int levelZeroStopWritesTrigger() { - return levelZeroStopWritesTrigger(nativeHandle_); - } - private native int levelZeroStopWritesTrigger(long handle); - - /** - * Maximum number of level-0 files. We stop writes at this point. - * - * @param numFiles the hard limit of the number of level-0 files. - * @return the reference to the current option. - */ - public Options setLevelZeroStopWritesTrigger(int numFiles) { - setLevelZeroStopWritesTrigger(nativeHandle_, numFiles); - return this; - } - private native void setLevelZeroStopWritesTrigger( - long handle, int numFiles); - - /** - * The highest level to which a new compacted memtable is pushed if it - * does not create overlap. We try to push to level 2 to avoid the - * relatively expensive level 0=>1 compactions and to avoid some - * expensive manifest file operations. We do not push all the way to - * the largest level since that can generate a lot of wasted disk - * space if the same key space is being repeatedly overwritten. - * - * @return the highest level where a new compacted memtable will be pushed. - */ - public int maxMemCompactionLevel() { - return maxMemCompactionLevel(nativeHandle_); - } - private native int maxMemCompactionLevel(long handle); - - /** - * The highest level to which a new compacted memtable is pushed if it - * does not create overlap. We try to push to level 2 to avoid the - * relatively expensive level 0=>1 compactions and to avoid some - * expensive manifest file operations. 
We do not push all the way to - * the largest level since that can generate a lot of wasted disk - * space if the same key space is being repeatedly overwritten. - * - * @param maxMemCompactionLevel the highest level to which a new compacted - * mem-table will be pushed. - * @return the reference to the current option. - */ - public Options setMaxMemCompactionLevel(int maxMemCompactionLevel) { - setMaxMemCompactionLevel(nativeHandle_, maxMemCompactionLevel); - return this; - } - private native void setMaxMemCompactionLevel( - long handle, int maxMemCompactionLevel); - - /** - * The target file size for compaction. - * This targetFileSizeBase determines a level-1 file size. - * Target file size for level L can be calculated by - * targetFileSizeBase * (targetFileSizeMultiplier ^ (L-1)) - * For example, if targetFileSizeBase is 2MB and - * target_file_size_multiplier is 10, then each file on level-1 will - * be 2MB, and each file on level 2 will be 20MB, - * and each file on level-3 will be 200MB. - * by default targetFileSizeBase is 2MB. - * - * @return the target size of a level-0 file. - * - * @see targetFileSizeMultiplier() - */ - public int targetFileSizeBase() { - return targetFileSizeBase(nativeHandle_); - } - private native int targetFileSizeBase(long handle); - - /** - * The target file size for compaction. - * This targetFileSizeBase determines a level-1 file size. - * Target file size for level L can be calculated by - * targetFileSizeBase * (targetFileSizeMultiplier ^ (L-1)) - * For example, if targetFileSizeBase is 2MB and - * target_file_size_multiplier is 10, then each file on level-1 will - * be 2MB, and each file on level 2 will be 20MB, - * and each file on level-3 will be 200MB. - * by default targetFileSizeBase is 2MB. - * - * @param targetFileSizeBase the target size of a level-0 file. - * @return the reference to the current option. 
- * - * @see setTargetFileSizeMultiplier() - */ - public Options setTargetFileSizeBase(int targetFileSizeBase) { - setTargetFileSizeBase(nativeHandle_, targetFileSizeBase); - return this; - } - private native void setTargetFileSizeBase( - long handle, int targetFileSizeBase); - - /** - * targetFileSizeMultiplier defines the size ratio between a - * level-(L+1) file and level-L file. - * By default targetFileSizeMultiplier is 1, meaning - * files in different levels have the same target. - * - * @return the size ratio between a level-(L+1) file and level-L file. - */ - public int targetFileSizeMultiplier() { - return targetFileSizeMultiplier(nativeHandle_); - } - private native int targetFileSizeMultiplier(long handle); - - /** - * targetFileSizeMultiplier defines the size ratio between a - * level-L file and level-(L+1) file. - * By default target_file_size_multiplier is 1, meaning - * files in different levels have the same target. - * - * @param multiplier the size ratio between a level-(L+1) file - * and level-L file. - * @return the reference to the current option. - */ - public Options setTargetFileSizeMultiplier(int multiplier) { - setTargetFileSizeMultiplier(nativeHandle_, multiplier); - return this; - } - private native void setTargetFileSizeMultiplier( - long handle, int multiplier); - - /** - * The upper-bound of the total size of level-1 files in bytes. - * Maximum number of bytes for level L can be calculated as - * (maxBytesForLevelBase) * (maxBytesForLevelMultiplier ^ (L-1)) - * For example, if maxBytesForLevelBase is 20MB, and if - * max_bytes_for_level_multiplier is 10, total data size for level-1 - * will be 20MB, total file size for level-2 will be 200MB, - * and total file size for level-3 will be 2GB. - * by default 'maxBytesForLevelBase' is 10MB. - * - * @return the upper-bound of the total size of leve-1 files in bytes. 
- * @see maxBytesForLevelMultiplier() - */ - public long maxBytesForLevelBase() { - return maxBytesForLevelBase(nativeHandle_); - } - private native long maxBytesForLevelBase(long handle); - - /** - * The upper-bound of the total size of level-1 files in bytes. - * Maximum number of bytes for level L can be calculated as - * (maxBytesForLevelBase) * (maxBytesForLevelMultiplier ^ (L-1)) - * For example, if maxBytesForLevelBase is 20MB, and if - * max_bytes_for_level_multiplier is 10, total data size for level-1 - * will be 20MB, total file size for level-2 will be 200MB, - * and total file size for level-3 will be 2GB. - * by default 'maxBytesForLevelBase' is 10MB. - * - * @return maxBytesForLevelBase the upper-bound of the total size of - * leve-1 files in bytes. - * @return the reference to the current option. - * @see setMaxBytesForLevelMultiplier() - */ - public Options setMaxBytesForLevelBase(long maxBytesForLevelBase) { - setMaxBytesForLevelBase(nativeHandle_, maxBytesForLevelBase); - return this; - } - private native void setMaxBytesForLevelBase( - long handle, long maxBytesForLevelBase); - - /** - * The ratio between the total size of level-(L+1) files and the total - * size of level-L files for all L. - * DEFAULT: 10 - * - * @return the ratio between the total size of level-(L+1) files and - * the total size of level-L files for all L. - * @see maxBytesForLevelBase() - */ - public int maxBytesForLevelMultiplier() { - return maxBytesForLevelMultiplier(nativeHandle_); - } - private native int maxBytesForLevelMultiplier(long handle); - - /** - * The ratio between the total size of level-(L+1) files and the total - * size of level-L files for all L. - * DEFAULT: 10 - * - * @param multiplier the ratio between the total size of level-(L+1) - * files and the total size of level-L files for all L. - * @return the reference to the current option. 
- * @see setMaxBytesForLevelBase() - */ - public Options setMaxBytesForLevelMultiplier(int multiplier) { - setMaxBytesForLevelMultiplier(nativeHandle_, multiplier); - return this; - } - private native void setMaxBytesForLevelMultiplier( - long handle, int multiplier); - - /** - * Maximum number of bytes in all compacted files. We avoid expanding - * the lower level file set of a compaction if it would make the - * total compaction cover more than - * (expanded_compaction_factor * targetFileSizeLevel()) many bytes. - * - * @return the maximum number of bytes in all compacted files. - * @see sourceCompactionFactor() - */ - public int expandedCompactionFactor() { - return expandedCompactionFactor(nativeHandle_); - } - private native int expandedCompactionFactor(long handle); - - /** - * Maximum number of bytes in all compacted files. We avoid expanding - * the lower level file set of a compaction if it would make the - * total compaction cover more than - * (expanded_compaction_factor * targetFileSizeLevel()) many bytes. - * - * @param expandedCompactionFactor the maximum number of bytes in all - * compacted files. - * @return the reference to the current option. - * @see setSourceCompactionFactor() - */ - public Options setExpandedCompactionFactor(int expandedCompactionFactor) { - setExpandedCompactionFactor(nativeHandle_, expandedCompactionFactor); - return this; - } - private native void setExpandedCompactionFactor( - long handle, int expandedCompactionFactor); - - /** - * Maximum number of bytes in all source files to be compacted in a - * single compaction run. We avoid picking too many files in the - * source level so that we do not exceed the total source bytes - * for compaction to exceed - * (source_compaction_factor * targetFileSizeLevel()) many bytes. - * Default:1, i.e. pick maxfilesize amount of data as the source of - * a compaction. - * - * @return the maximum number of bytes in all source files to be compactedo. 
- * @see expendedCompactionFactor() - */ - public int sourceCompactionFactor() { - return sourceCompactionFactor(nativeHandle_); - } - private native int sourceCompactionFactor(long handle); - - /** - * Maximum number of bytes in all source files to be compacted in a - * single compaction run. We avoid picking too many files in the - * source level so that we do not exceed the total source bytes - * for compaction to exceed - * (source_compaction_factor * targetFileSizeLevel()) many bytes. - * Default:1, i.e. pick maxfilesize amount of data as the source of - * a compaction. - * - * @param sourceCompactionFactor the maximum number of bytes in all - * source files to be compacted in a single compaction run. - * @return the reference to the current option. - * @see setExpendedCompactionFactor() - */ - public Options setSourceCompactionFactor(int sourceCompactionFactor) { - setSourceCompactionFactor(nativeHandle_, sourceCompactionFactor); - return this; - } - private native void setSourceCompactionFactor( - long handle, int sourceCompactionFactor); - - /** - * Control maximum bytes of overlaps in grandparent (i.e., level+2) before we - * stop building a single file in a level->level+1 compaction. - * - * @return maximum bytes of overlaps in "grandparent" level. - */ - public int maxGrandparentOverlapFactor() { - return maxGrandparentOverlapFactor(nativeHandle_); - } - private native int maxGrandparentOverlapFactor(long handle); - - /** - * Control maximum bytes of overlaps in grandparent (i.e., level+2) before we - * stop building a single file in a level->level+1 compaction. - * - * @param maxGrandparentOverlapFactor maximum bytes of overlaps in - * "grandparent" level. - * @return the reference to the current option. 
- */ - public Options setMaxGrandparentOverlapFactor( - int maxGrandparentOverlapFactor) { - setMaxGrandparentOverlapFactor(nativeHandle_, maxGrandparentOverlapFactor); - return this; - } - private native void setMaxGrandparentOverlapFactor( - long handle, int maxGrandparentOverlapFactor); - - /** - * Puts are delayed 0-1 ms when any level has a compaction score that exceeds - * soft_rate_limit. This is ignored when == 0.0. - * CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not - * hold, RocksDB will set soft_rate_limit = hard_rate_limit - * Default: 0 (disabled) - * - * @return soft-rate-limit for put delay. - */ - public double softRateLimit() { - return softRateLimit(nativeHandle_); - } - private native double softRateLimit(long handle); - - /** - * Puts are delayed 0-1 ms when any level has a compaction score that exceeds - * soft_rate_limit. This is ignored when == 0.0. - * CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not - * hold, RocksDB will set soft_rate_limit = hard_rate_limit - * Default: 0 (disabled) - * - * @param softRateLimit the soft-rate-limit of a compaction score - * for put delay. - * @return the reference to the current option. - */ - public Options setSoftRateLimit(double softRateLimit) { - setSoftRateLimit(nativeHandle_, softRateLimit); - return this; - } - private native void setSoftRateLimit( - long handle, double softRateLimit); - - /** - * Puts are delayed 1ms at a time when any level has a compaction score that - * exceeds hard_rate_limit. This is ignored when <= 1.0. - * Default: 0 (disabled) - * - * @return the hard-rate-limit of a compaction score for put delay. - */ - public double hardRateLimit() { - return hardRateLimit(nativeHandle_); - } - private native double hardRateLimit(long handle); - - /** - * Puts are delayed 1ms at a time when any level has a compaction score that - * exceeds hard_rate_limit. This is ignored when <= 1.0. 
- * Default: 0 (disabled) - * - * @param hardRateLimit the hard-rate-limit of a compaction score for put - * delay. - * @return the reference to the current option. - */ - public Options setHardRateLimit(double hardRateLimit) { - setHardRateLimit(nativeHandle_, hardRateLimit); - return this; - } - private native void setHardRateLimit( - long handle, double hardRateLimit); - - /** - * The maximum time interval a put will be stalled when hard_rate_limit - * is enforced. If 0, then there is no limit. - * Default: 1000 - * - * @return the maximum time interval a put will be stalled when - * hard_rate_limit is enforced. - */ - public int rateLimitDelayMaxMilliseconds() { - return rateLimitDelayMaxMilliseconds(nativeHandle_); - } - private native int rateLimitDelayMaxMilliseconds(long handle); - - /** - * The maximum time interval a put will be stalled when hard_rate_limit - * is enforced. If 0, then there is no limit. - * Default: 1000 - * - * @param rateLimitDelayMaxMilliseconds the maximum time interval a put - * will be stalled. - * @return the reference to the current option. - */ - public Options setRateLimitDelayMaxMilliseconds( - int rateLimitDelayMaxMilliseconds) { - setRateLimitDelayMaxMilliseconds( - nativeHandle_, rateLimitDelayMaxMilliseconds); - return this; - } - private native void setRateLimitDelayMaxMilliseconds( - long handle, int rateLimitDelayMaxMilliseconds); - - /** - * The size of one block in arena memory allocation. - * If <= 0, a proper value is automatically calculated (usually 1/10 of - * writer_buffer_size). - * - * There are two additonal restriction of the The specified size: - * (1) size should be in the range of [4096, 2 << 30] and - * (2) be the multiple of the CPU word (which helps with the memory - * alignment). - * - * We'll automatically check and adjust the size number to make sure it - * conforms to the restrictions. 
- * Default: 0 - * - * @return the size of an arena block - */ - public long arenaBlockSize() { - return arenaBlockSize(nativeHandle_); - } - private native long arenaBlockSize(long handle); - - /** - * The size of one block in arena memory allocation. - * If <= 0, a proper value is automatically calculated (usually 1/10 of - * writer_buffer_size). - * - * There are two additonal restriction of the The specified size: - * (1) size should be in the range of [4096, 2 << 30] and - * (2) be the multiple of the CPU word (which helps with the memory - * alignment). - * - * We'll automatically check and adjust the size number to make sure it - * conforms to the restrictions. - * Default: 0 - * - * @param arenaBlockSize the size of an arena block - * @return the reference to the current option. - */ - public Options setArenaBlockSize(long arenaBlockSize) { - setArenaBlockSize(nativeHandle_, arenaBlockSize); - return this; - } - private native void setArenaBlockSize( - long handle, long arenaBlockSize); - - /** - * Disable automatic compactions. Manual compactions can still - * be issued on this column family - * - * @return true if auto-compactions are disabled. - */ - public boolean disableAutoCompactions() { - return disableAutoCompactions(nativeHandle_); - } - private native boolean disableAutoCompactions(long handle); - - /** - * Disable automatic compactions. Manual compactions can still - * be issued on this column family - * - * @param disableAutoCompactions true if auto-compactions are disabled. - * @return the reference to the current option. - */ - public Options setDisableAutoCompactions(boolean disableAutoCompactions) { - setDisableAutoCompactions(nativeHandle_, disableAutoCompactions); - return this; - } - private native void setDisableAutoCompactions( - long handle, boolean disableAutoCompactions); - - /** - * Purge duplicate/deleted keys when a memtable is flushed to storage. - * Default: true - * - * @return true if purging keys is disabled. 
- */ - public boolean purgeRedundantKvsWhileFlush() { - return purgeRedundantKvsWhileFlush(nativeHandle_); - } - private native boolean purgeRedundantKvsWhileFlush(long handle); - - /** - * Purge duplicate/deleted keys when a memtable is flushed to storage. - * Default: true - * - * @param purgeRedundantKvsWhileFlush true if purging keys is disabled. - * @return the reference to the current option. - */ - public Options setPurgeRedundantKvsWhileFlush( - boolean purgeRedundantKvsWhileFlush) { - setPurgeRedundantKvsWhileFlush( - nativeHandle_, purgeRedundantKvsWhileFlush); - return this; - } - private native void setPurgeRedundantKvsWhileFlush( - long handle, boolean purgeRedundantKvsWhileFlush); - - /** - * If true, compaction will verify checksum on every read that happens - * as part of compaction - * Default: true - * - * @return true if compaction verifies checksum on every read. - */ - public boolean verifyChecksumsInCompaction() { - return verifyChecksumsInCompaction(nativeHandle_); - } - private native boolean verifyChecksumsInCompaction(long handle); - - /** - * If true, compaction will verify checksum on every read that happens - * as part of compaction - * Default: true - * - * @param verifyChecksumsInCompaction true if compaction verifies - * checksum on every read. - * @return the reference to the current option. - */ - public Options setVerifyChecksumsInCompaction( - boolean verifyChecksumsInCompaction) { - setVerifyChecksumsInCompaction( - nativeHandle_, verifyChecksumsInCompaction); - return this; - } - private native void setVerifyChecksumsInCompaction( - long handle, boolean verifyChecksumsInCompaction); - - /** - * Use KeyMayExist API to filter deletes when this is true. - * If KeyMayExist returns false, i.e. the key definitely does not exist, then - * the delete is a noop. KeyMayExist only incurs in-memory look up. - * This optimization avoids writing the delete to storage when appropriate. 
- * Default: false - * - * @return true if filter-deletes behavior is on. - */ - public boolean filterDeletes() { - return filterDeletes(nativeHandle_); - } - private native boolean filterDeletes(long handle); - - /** - * Use KeyMayExist API to filter deletes when this is true. - * If KeyMayExist returns false, i.e. the key definitely does not exist, then - * the delete is a noop. KeyMayExist only incurs in-memory look up. - * This optimization avoids writing the delete to storage when appropriate. - * Default: false - * - * @param filterDeletes true if filter-deletes behavior is on. - * @return the reference to the current option. - */ - public Options setFilterDeletes(boolean filterDeletes) { - setFilterDeletes(nativeHandle_, filterDeletes); - return this; - } - private native void setFilterDeletes( - long handle, boolean filterDeletes); - - /** - * An iteration->Next() sequentially skips over keys with the same - * user-key unless this option is set. This number specifies the number - * of keys (with the same userkey) that will be sequentially - * skipped before a reseek is issued. - * Default: 8 - * - * @return the number of keys could be skipped in a iteration. - */ - public long maxSequentialSkipInIterations() { - return maxSequentialSkipInIterations(nativeHandle_); - } - private native long maxSequentialSkipInIterations(long handle); - - /** - * An iteration->Next() sequentially skips over keys with the same - * user-key unless this option is set. This number specifies the number - * of keys (with the same userkey) that will be sequentially - * skipped before a reseek is issued. - * Default: 8 - * - * @param maxSequentialSkipInIterations the number of keys could - * be skipped in a iteration. - * @return the reference to the current option. 
- */ - public Options setMaxSequentialSkipInIterations(long maxSequentialSkipInIterations) { - setMaxSequentialSkipInIterations(nativeHandle_, maxSequentialSkipInIterations); - return this; - } - private native void setMaxSequentialSkipInIterations( - long handle, long maxSequentialSkipInIterations); - - /** - * Allows thread-safe inplace updates. - * If inplace_callback function is not set, - * Put(key, new_value) will update inplace the existing_value iff - * * key exists in current memtable - * * new sizeof(new_value) <= sizeof(existing_value) - * * existing_value for that key is a put i.e. kTypeValue - * If inplace_callback function is set, check doc for inplace_callback. - * Default: false. - * - * @return true if thread-safe inplace updates are allowed. - */ - public boolean inplaceUpdateSupport() { - return inplaceUpdateSupport(nativeHandle_); - } - private native boolean inplaceUpdateSupport(long handle); - - /** - * Allows thread-safe inplace updates. - * If inplace_callback function is not set, - * Put(key, new_value) will update inplace the existing_value iff - * * key exists in current memtable - * * new sizeof(new_value) <= sizeof(existing_value) - * * existing_value for that key is a put i.e. kTypeValue - * If inplace_callback function is set, check doc for inplace_callback. - * Default: false. - * - * @param inplaceUpdateSupport true if thread-safe inplace updates - * are allowed. - * @return the reference to the current option. - */ - public Options setInplaceUpdateSupport(boolean inplaceUpdateSupport) { - setInplaceUpdateSupport(nativeHandle_, inplaceUpdateSupport); - return this; - } - private native void setInplaceUpdateSupport( - long handle, boolean inplaceUpdateSupport); - - /** - * Number of locks used for inplace update - * Default: 10000, if inplace_update_support = true, else 0. - * - * @return the number of locks used for inplace update. 
- */ - public long inplaceUpdateNumLocks() { - return inplaceUpdateNumLocks(nativeHandle_); - } - private native long inplaceUpdateNumLocks(long handle); - - /** - * Number of locks used for inplace update - * Default: 10000, if inplace_update_support = true, else 0. - * - * @param inplaceUpdateNumLocks the number of locks used for - * inplace updates. - * @return the reference to the current option. - */ - public Options setInplaceUpdateNumLocks(long inplaceUpdateNumLocks) { - setInplaceUpdateNumLocks(nativeHandle_, inplaceUpdateNumLocks); - return this; - } - private native void setInplaceUpdateNumLocks( - long handle, long inplaceUpdateNumLocks); - - /** - * Returns the number of bits used in the prefix bloom filter. - * - * This value will be used only when a prefix-extractor is specified. - * - * @return the number of bloom-bits. - * @see useFixedLengthPrefixExtractor() - */ - public int memtablePrefixBloomBits() { - return memtablePrefixBloomBits(nativeHandle_); - } - private native int memtablePrefixBloomBits(long handle); - - /** - * Sets the number of bits used in the prefix bloom filter. - * - * This value will be used only when a prefix-extractor is specified. - * - * @param memtablePrefixBloomBits the number of bits used in the - * prefix bloom filter. - * @return the reference to the current option. - */ - public Options setMemtablePrefixBloomBits(int memtablePrefixBloomBits) { - setMemtablePrefixBloomBits(nativeHandle_, memtablePrefixBloomBits); - return this; - } - private native void setMemtablePrefixBloomBits( - long handle, int memtablePrefixBloomBits); - - /** - * The number of hash probes per key used in the mem-table. - * - * @return the number of hash probes per key. - */ - public int memtablePrefixBloomProbes() { - return memtablePrefixBloomProbes(nativeHandle_); - } - private native int memtablePrefixBloomProbes(long handle); - - /** - * The number of hash probes per key used in the mem-table. 
- * - * @param memtablePrefixBloomProbes the number of hash probes per key. - * @return the reference to the current option. - */ - public Options setMemtablePrefixBloomProbes(int memtablePrefixBloomProbes) { - setMemtablePrefixBloomProbes(nativeHandle_, memtablePrefixBloomProbes); - return this; - } - private native void setMemtablePrefixBloomProbes( - long handle, int memtablePrefixBloomProbes); - - /** - * Control locality of bloom filter probes to improve cache miss rate. - * This option only applies to memtable prefix bloom and plaintable - * prefix bloom. It essentially limits the max number of cache lines each - * bloom filter check can touch. - * This optimization is turned off when set to 0. The number should never - * be greater than number of probes. This option can boost performance - * for in-memory workload but should use with care since it can cause - * higher false positive rate. - * Default: 0 - * - * @return the level of locality of bloom-filter probes. - * @see setMemTablePrefixBloomProbes - */ - public int bloomLocality() { - return bloomLocality(nativeHandle_); - } - private native int bloomLocality(long handle); - - /** - * Control locality of bloom filter probes to improve cache miss rate. - * This option only applies to memtable prefix bloom and plaintable - * prefix bloom. It essentially limits the max number of cache lines each - * bloom filter check can touch. - * This optimization is turned off when set to 0. The number should never - * be greater than number of probes. This option can boost performance - * for in-memory workload but should use with care since it can cause - * higher false positive rate. - * Default: 0 - * - * @param bloomLocality the level of locality of bloom-filter probes. - * @return the reference to the current option. 
- */ - public Options setBloomLocality(int bloomLocality) { - setBloomLocality(nativeHandle_, bloomLocality); - return this; - } - private native void setBloomLocality( - long handle, int bloomLocality); - - /** - * Maximum number of successive merge operations on a key in the memtable. - * - * When a merge operation is added to the memtable and the maximum number of - * successive merges is reached, the value of the key will be calculated and - * inserted into the memtable instead of the merge operation. This will - * ensure that there are never more than max_successive_merges merge - * operations in the memtable. - * - * Default: 0 (disabled) - * - * @return the maximum number of successive merges. - */ - public long maxSuccessiveMerges() { - return maxSuccessiveMerges(nativeHandle_); - } - private native long maxSuccessiveMerges(long handle); - - /** - * Maximum number of successive merge operations on a key in the memtable. - * - * When a merge operation is added to the memtable and the maximum number of - * successive merges is reached, the value of the key will be calculated and - * inserted into the memtable instead of the merge operation. This will - * ensure that there are never more than max_successive_merges merge - * operations in the memtable. - * - * Default: 0 (disabled) - * - * @param maxSuccessiveMerges the maximum number of successive merges. - * @return the reference to the current option. - */ - public Options setMaxSuccessiveMerges(long maxSuccessiveMerges) { - setMaxSuccessiveMerges(nativeHandle_, maxSuccessiveMerges); - return this; - } - private native void setMaxSuccessiveMerges( - long handle, long maxSuccessiveMerges); - - /** - * The minimum number of write buffers that will be merged together - * before writing to storage. If set to 1, then - * all write buffers are fushed to L0 as individual files and this increases - * read amplification because a get request has to check in all of these - * files. 
Also, an in-memory merge may result in writing lesser - * data to storage if there are duplicate records in each of these - * individual write buffers. Default: 1 - * - * @return the minimum number of write buffers that will be merged together. - */ - public int minWriteBufferNumberToMerge() { - return minWriteBufferNumberToMerge(nativeHandle_); - } - private native int minWriteBufferNumberToMerge(long handle); - - /** - * The minimum number of write buffers that will be merged together - * before writing to storage. If set to 1, then - * all write buffers are fushed to L0 as individual files and this increases - * read amplification because a get request has to check in all of these - * files. Also, an in-memory merge may result in writing lesser - * data to storage if there are duplicate records in each of these - * individual write buffers. Default: 1 - * - * @param minWriteBufferNumberToMerge the minimum number of write buffers - * that will be merged together. - * @return the reference to the current option. - */ - public Options setMinWriteBufferNumberToMerge(int minWriteBufferNumberToMerge) { - setMinWriteBufferNumberToMerge(nativeHandle_, minWriteBufferNumberToMerge); - return this; - } - private native void setMinWriteBufferNumberToMerge( - long handle, int minWriteBufferNumberToMerge); - - /** - * The number of partial merge operands to accumulate before partial - * merge will be performed. Partial merge will not be called - * if the list of values to merge is less than min_partial_merge_operands. - * - * If min_partial_merge_operands < 2, then it will be treated as 2. - * - * Default: 2 - * - * @return - */ - public int minPartialMergeOperands() { - return minPartialMergeOperands(nativeHandle_); - } - private native int minPartialMergeOperands(long handle); - - /** - * The number of partial merge operands to accumulate before partial - * merge will be performed. 
Partial merge will not be called - * if the list of values to merge is less than min_partial_merge_operands. - * - * If min_partial_merge_operands < 2, then it will be treated as 2. - * - * Default: 2 - * - * @param minPartialMergeOperands - * @return the reference to the current option. - */ - public Options setMinPartialMergeOperands(int minPartialMergeOperands) { - setMinPartialMergeOperands(nativeHandle_, minPartialMergeOperands); - return this; - } - private native void setMinPartialMergeOperands( - long handle, int minPartialMergeOperands); - - /** - * Release the memory allocated for the current instance - * in the c++ side. - */ - @Override protected void disposeInternal() { - assert(isInitialized()); - disposeInternal(nativeHandle_); - } - - static final int DEFAULT_PLAIN_TABLE_BLOOM_BITS_PER_KEY = 10; - static final double DEFAULT_PLAIN_TABLE_HASH_TABLE_RATIO = 0.75; - static final int DEFAULT_PLAIN_TABLE_INDEX_SPARSENESS = 16; - - private native void newOptions(); - private native void disposeInternal(long handle); - private native void setCreateIfMissing(long handle, boolean flag); - private native boolean createIfMissing(long handle); - private native void setWriteBufferSize(long handle, long writeBufferSize); - private native long writeBufferSize(long handle); - private native void setMaxWriteBufferNumber( - long handle, int maxWriteBufferNumber); - private native int maxWriteBufferNumber(long handle); - private native void setMaxBackgroundCompactions( - long handle, int maxBackgroundCompactions); - private native int maxBackgroundCompactions(long handle); - private native void createStatistics(long optHandle); - private native long statisticsPtr(long optHandle); - - private native void setMemTableFactory(long handle, long factoryHandle); - private native String memTableFactoryName(long handle); - - private native void setTableFactory(long handle, long factoryHandle); - private native String tableFactoryName(long handle); - - private native void 
useFixedLengthPrefixExtractor( - long handle, int prefixLength); - - long cacheSize_; - int numShardBits_; - RocksEnv env_; -} diff --git a/java/org/rocksdb/PlainTableConfig.java b/java/org/rocksdb/PlainTableConfig.java deleted file mode 100644 index 554ce3840..000000000 --- a/java/org/rocksdb/PlainTableConfig.java +++ /dev/null @@ -1,123 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb; - -/** - * The config for plain table sst format. - * - * PlainTable is a RocksDB's SST file format optimized for low query latency - * on pure-memory or really low-latency media. It also support prefix - * hash feature. - */ -public class PlainTableConfig extends TableFormatConfig { - public static final int VARIABLE_LENGTH = 0; - public static final int DEFAULT_BLOOM_BITS_PER_KEY = 10; - public static final double DEFAULT_HASH_TABLE_RATIO = 0.75; - public static final int DEFAULT_INDEX_SPARSENESS = 16; - - public PlainTableConfig() { - keySize_ = VARIABLE_LENGTH; - bloomBitsPerKey_ = DEFAULT_BLOOM_BITS_PER_KEY; - hashTableRatio_ = DEFAULT_HASH_TABLE_RATIO; - indexSparseness_ = DEFAULT_INDEX_SPARSENESS; - } - - /** - * Set the length of the user key. If it is set to be VARIABLE_LENGTH, - * then it indicates the user keys are variable-lengthed. Otherwise, - * all the keys need to have the same length in byte. - * DEFAULT: VARIABLE_LENGTH - * - * @param keySize the length of the user key. - * @return the reference to the current config. - */ - public PlainTableConfig setKeySize(int keySize) { - keySize_ = keySize; - return this; - } - - /** - * @return the specified size of the user key. If VARIABLE_LENGTH, - * then it indicates variable-length key. 
- */ - public int keySize() { - return keySize_; - } - - /** - * Set the number of bits per key used by the internal bloom filter - * in the plain table sst format. - * - * @param bitsPerKey the number of bits per key for bloom filer. - * @return the reference to the current config. - */ - public PlainTableConfig setBloomBitsPerKey(int bitsPerKey) { - bloomBitsPerKey_ = bitsPerKey; - return this; - } - - /** - * @return the number of bits per key used for the bloom filter. - */ - public int bloomBitsPerKey() { - return bloomBitsPerKey_; - } - - /** - * hashTableRatio is the desired utilization of the hash table used - * for prefix hashing. The ideal ratio would be the number of - * prefixes / the number of hash buckets. If this value is set to - * zero, then hash table will not be used. - * - * @param ratio the hash table ratio. - * @return the reference to the current config. - */ - public PlainTableConfig setHashTableRatio(double ratio) { - hashTableRatio_ = ratio; - return this; - } - - /** - * @return the hash table ratio. - */ - public double hashTableRatio() { - return hashTableRatio_; - } - - /** - * Index sparseness determines the index interval for keys inside the - * same prefix. This number is equal to the maximum number of linear - * search required after hash and binary search. If it's set to 0, - * then each key will be indexed. - * - * @param sparseness the index sparseness. - * @return the reference to the current config. - */ - public PlainTableConfig setIndexSparseness(int sparseness) { - indexSparseness_ = sparseness; - return this; - } - - /** - * @return the index sparseness. 
- */ - public int indexSparseness() { - return indexSparseness_; - } - - @Override protected long newTableFactoryHandle() { - return newTableFactoryHandle(keySize_, bloomBitsPerKey_, - hashTableRatio_, indexSparseness_); - } - - private native long newTableFactoryHandle( - int keySize, int bloomBitsPerKey, - double hashTableRatio, int indexSparseness); - - private int keySize_; - private int bloomBitsPerKey_; - private double hashTableRatio_; - private int indexSparseness_; -} diff --git a/java/org/rocksdb/RestoreBackupableDB.java b/java/org/rocksdb/RestoreBackupableDB.java deleted file mode 100644 index dbde447a0..000000000 --- a/java/org/rocksdb/RestoreBackupableDB.java +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -package org.rocksdb; - -/** - * This class is used to access information about backups and restore from them. - * - * Note that dispose() must be called before this instance become out-of-scope - * to release the allocated memory in c++. - * - * @param options Instance of BackupableDBOptions. - */ -public class RestoreBackupableDB extends RocksObject { - public RestoreBackupableDB(BackupableDBOptions options) { - super(); - nativeHandle_ = newRestoreBackupableDB(options.nativeHandle_); - } - - /** - * Restore from backup with backup_id - * IMPORTANT -- if options_.share_table_files == true and you restore DB - * from some backup that is not the latest, and you start creating new - * backups from the new DB, they will probably fail. - * - * Example: Let's say you have backups 1, 2, 3, 4, 5 and you restore 3. - * If you add new data to the DB and try creating a new backup now, the - * database will diverge from backups 4 and 5 and the new backup will fail. 
- * If you want to create new backup, you will first have to delete backups 4 - * and 5. - */ - public void restoreDBFromBackup(long backupId, String dbDir, String walDir, - RestoreOptions restoreOptions) throws RocksDBException { - restoreDBFromBackup0(nativeHandle_, backupId, dbDir, walDir, - restoreOptions.nativeHandle_); - } - - /** - * Restore from the latest backup. - */ - public void restoreDBFromLatestBackup(String dbDir, String walDir, - RestoreOptions restoreOptions) throws RocksDBException { - restoreDBFromLatestBackup0(nativeHandle_, dbDir, walDir, - restoreOptions.nativeHandle_); - } - - /** - * Deletes old backups, keeping latest numBackupsToKeep alive. - * - * @param Number of latest backups to keep - */ - public void purgeOldBackups(int numBackupsToKeep) throws RocksDBException { - purgeOldBackups0(nativeHandle_, numBackupsToKeep); - } - - /** - * Deletes a specific backup. - * - * @param ID of backup to delete. - */ - public void deleteBackup(long backupId) throws RocksDBException { - deleteBackup0(nativeHandle_, backupId); - } - - /** - * Release the memory allocated for the current instance - * in the c++ side. 
- */ - @Override public synchronized void disposeInternal() { - assert(isInitialized()); - dispose(nativeHandle_); - } - - private native long newRestoreBackupableDB(long options); - private native void restoreDBFromBackup0(long nativeHandle, long backupId, - String dbDir, String walDir, long restoreOptions) throws RocksDBException; - private native void restoreDBFromLatestBackup0(long nativeHandle, - String dbDir, String walDir, long restoreOptions) throws RocksDBException; - private native void purgeOldBackups0(long nativeHandle, int numBackupsToKeep); - private native void deleteBackup0(long nativeHandle, long backupId); - private native void dispose(long nativeHandle); -} diff --git a/java/org/rocksdb/RocksDB.java b/java/org/rocksdb/RocksDB.java deleted file mode 100644 index f8968d14d..000000000 --- a/java/org/rocksdb/RocksDB.java +++ /dev/null @@ -1,370 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -package org.rocksdb; - -import java.util.List; -import java.util.Map; -import java.util.HashMap; -import java.io.Closeable; -import java.io.IOException; -import org.rocksdb.util.Environment; - -/** - * A RocksDB is a persistent ordered map from keys to values. It is safe for - * concurrent access from multiple threads without any external synchronization. - * All methods of this class could potentially throw RocksDBException, which - * indicates sth wrong at the rocksdb library side and the call failed. - */ -public class RocksDB extends RocksObject { - public static final int NOT_FOUND = -1; - private static final String[] compressionLibs_ = { - "snappy", "z", "bzip2", "lz4", "lz4hc"}; - - /** - * Loads the necessary library files. - * Calling this method twice will have no effect. 
- */ - public static synchronized void loadLibrary() { - // loading possibly necessary libraries. - for (String lib : compressionLibs_) { - try { - System.loadLibrary(lib); - } catch (UnsatisfiedLinkError e) { - // since it may be optional, we ignore its loading failure here. - } - } - // However, if any of them is required. We will see error here. - System.loadLibrary("rocksdbjni"); - } - - /** - * Tries to load the necessary library files from the given list of - * directories. - * - * @param paths a list of strings where each describes a directory - * of a library. - */ - public static synchronized void loadLibrary(List paths) { - for (String lib : compressionLibs_) { - for (String path : paths) { - try { - System.load(path + "/" + Environment.getSharedLibraryName(lib)); - break; - } catch (UnsatisfiedLinkError e) { - // since they are optional, we ignore loading fails. - } - } - } - boolean success = false; - UnsatisfiedLinkError err = null; - for (String path : paths) { - try { - System.load(path + "/" + Environment.getJniLibraryName("rocksdbjni")); - success = true; - break; - } catch (UnsatisfiedLinkError e) { - err = e; - } - } - if (success == false) { - throw err; - } - } - - /** - * The factory constructor of RocksDB that opens a RocksDB instance given - * the path to the database using the default options w/ createIfMissing - * set to true. - * - * @param path the path to the rocksdb. - * @param status an out value indicating the status of the Open(). - * @return a rocksdb instance on success, null if the specified rocksdb can - * not be opened. - * - * @see Options.setCreateIfMissing() - * @see Options.createIfMissing() - */ - public static RocksDB open(String path) throws RocksDBException { - RocksDB db = new RocksDB(); - - // This allows to use the rocksjni default Options instead of - // the c++ one. 
- Options options = new Options(); - return open(options, path); - } - - /** - * The factory constructor of RocksDB that opens a RocksDB instance given - * the path to the database using the specified options and db path. - * - * Options instance *should* not be disposed before all DBs using this options - * instance have been closed. If user doesn't call options dispose explicitly, - * then this options instance will be GC'd automatically. - * - * Options instance can be re-used to open multiple DBs if DB statistics is - * not used. If DB statistics are required, then its recommended to open DB - * with new Options instance as underlying native statistics instance does not - * use any locks to prevent concurrent updates. - */ - public static RocksDB open(Options options, String path) - throws RocksDBException { - // when non-default Options is used, keeping an Options reference - // in RocksDB can prevent Java to GC during the life-time of - // the currently-created RocksDB. - RocksDB db = new RocksDB(); - db.open(options.nativeHandle_, path); - - db.storeOptionsInstance(options); - return db; - } - - private void storeOptionsInstance(Options options) { - options_ = options; - } - - @Override protected void disposeInternal() { - assert(isInitialized()); - disposeInternal(nativeHandle_); - } - - /** - * Close the RocksDB instance. - * This function is equivalent to dispose(). - */ - public void close() { - dispose(); - } - - /** - * Set the database entry for "key" to "value". - * - * @param key the specified key to be inserted. - * @param value the value associated with the specified key. - */ - public void put(byte[] key, byte[] value) throws RocksDBException { - put(nativeHandle_, key, key.length, value, value.length); - } - - /** - * Set the database entry for "key" to "value". - * - * @param key the specified key to be inserted. - * @param value the value associated with the specified key. 
- */ - public void put(WriteOptions writeOpts, byte[] key, byte[] value) - throws RocksDBException { - put(nativeHandle_, writeOpts.nativeHandle_, - key, key.length, value, value.length); - } - - /** - * Apply the specified updates to the database. - */ - public void write(WriteOptions writeOpts, WriteBatch updates) - throws RocksDBException { - write(writeOpts.nativeHandle_, updates.nativeHandle_); - } - - /** - * Get the value associated with the specified key. - * - * @param key the key to retrieve the value. - * @param value the out-value to receive the retrieved value. - * @return The size of the actual value that matches the specified - * {@code key} in byte. If the return value is greater than the - * length of {@code value}, then it indicates that the size of the - * input buffer {@code value} is insufficient and partial result will - * be returned. RocksDB.NOT_FOUND will be returned if the value not - * found. - */ - public int get(byte[] key, byte[] value) throws RocksDBException { - return get(nativeHandle_, key, key.length, value, value.length); - } - - /** - * Get the value associated with the specified key. - * - * @param key the key to retrieve the value. - * @param value the out-value to receive the retrieved value. - * @return The size of the actual value that matches the specified - * {@code key} in byte. If the return value is greater than the - * length of {@code value}, then it indicates that the size of the - * input buffer {@code value} is insufficient and partial result will - * be returned. RocksDB.NOT_FOUND will be returned if the value not - * found. - */ - public int get(ReadOptions opt, byte[] key, byte[] value) - throws RocksDBException { - return get(nativeHandle_, opt.nativeHandle_, - key, key.length, value, value.length); - } - - /** - * The simplified version of get which returns a new byte array storing - * the value associated with the specified input key if any. null will be - * returned if the specified key is not found. 
- * - * @param key the key retrieve the value. - * @return a byte array storing the value associated with the input key if - * any. null if it does not find the specified key. - * - * @see RocksDBException - */ - public byte[] get(byte[] key) throws RocksDBException { - return get(nativeHandle_, key, key.length); - } - - /** - * The simplified version of get which returns a new byte array storing - * the value associated with the specified input key if any. null will be - * returned if the specified key is not found. - * - * @param key the key retrieve the value. - * @param opt Read options. - * @return a byte array storing the value associated with the input key if - * any. null if it does not find the specified key. - * - * @see RocksDBException - */ - public byte[] get(ReadOptions opt, byte[] key) throws RocksDBException { - return get(nativeHandle_, opt.nativeHandle_, key, key.length); - } - - /** - * Returns a map of keys for which values were found in DB. - * - * @param keys List of keys for which values need to be retrieved. - * @return Map where key of map is the key passed by user and value for map - * entry is the corresponding value in DB. - * - * @see RocksDBException - */ - public Map multiGet(List keys) - throws RocksDBException { - assert(keys.size() != 0); - - List values = multiGet( - nativeHandle_, keys, keys.size()); - - Map keyValueMap = new HashMap(); - for(int i = 0; i < values.size(); i++) { - if(values.get(i) == null) { - continue; - } - - keyValueMap.put(keys.get(i), values.get(i)); - } - - return keyValueMap; - } - - - /** - * Returns a map of keys for which values were found in DB. - * - * @param List of keys for which values need to be retrieved. - * @param opt Read options. - * @return Map where key of map is the key passed by user and value for map - * entry is the corresponding value in DB. 
- * - * @see RocksDBException - */ - public Map multiGet(ReadOptions opt, List keys) - throws RocksDBException { - assert(keys.size() != 0); - - List values = multiGet( - nativeHandle_, opt.nativeHandle_, keys, keys.size()); - - Map keyValueMap = new HashMap(); - for(int i = 0; i < values.size(); i++) { - if(values.get(i) == null) { - continue; - } - - keyValueMap.put(keys.get(i), values.get(i)); - } - - return keyValueMap; - } - - /** - * Remove the database entry (if any) for "key". Returns OK on - * success, and a non-OK status on error. It is not an error if "key" - * did not exist in the database. - */ - public void remove(byte[] key) throws RocksDBException { - remove(nativeHandle_, key, key.length); - } - - /** - * Remove the database entry (if any) for "key". Returns OK on - * success, and a non-OK status on error. It is not an error if "key" - * did not exist in the database. - */ - public void remove(WriteOptions writeOpt, byte[] key) - throws RocksDBException { - remove(nativeHandle_, writeOpt.nativeHandle_, key, key.length); - } - - /** - * Return a heap-allocated iterator over the contents of the database. - * The result of newIterator() is initially invalid (caller must - * call one of the Seek methods on the iterator before using it). - * - * Caller should close the iterator when it is no longer needed. - * The returned iterator should be closed before this db is closed. - * - * @return instance of iterator object. - */ - public RocksIterator newIterator() { - return new RocksIterator(iterator0(nativeHandle_)); - } - - /** - * Private constructor. 
- */ - protected RocksDB() { - super(); - } - - // native methods - protected native void open( - long optionsHandle, String path) throws RocksDBException; - protected native void put( - long handle, byte[] key, int keyLen, - byte[] value, int valueLen) throws RocksDBException; - protected native void put( - long handle, long writeOptHandle, - byte[] key, int keyLen, - byte[] value, int valueLen) throws RocksDBException; - protected native void write( - long writeOptHandle, long batchHandle) throws RocksDBException; - protected native int get( - long handle, byte[] key, int keyLen, - byte[] value, int valueLen) throws RocksDBException; - protected native int get( - long handle, long readOptHandle, byte[] key, int keyLen, - byte[] value, int valueLen) throws RocksDBException; - protected native List multiGet( - long dbHandle, List keys, int keysCount); - protected native List multiGet( - long dbHandle, long rOptHandle, List keys, int keysCount); - protected native byte[] get( - long handle, byte[] key, int keyLen) throws RocksDBException; - protected native byte[] get( - long handle, long readOptHandle, - byte[] key, int keyLen) throws RocksDBException; - protected native void remove( - long handle, byte[] key, int keyLen) throws RocksDBException; - protected native void remove( - long handle, long writeOptHandle, - byte[] key, int keyLen) throws RocksDBException; - protected native long iterator0(long optHandle); - private native void disposeInternal(long handle); - - protected Options options_; -} diff --git a/java/org/rocksdb/RocksIterator.java b/java/org/rocksdb/RocksIterator.java deleted file mode 100644 index 9ef2e8c24..000000000 --- a/java/org/rocksdb/RocksIterator.java +++ /dev/null @@ -1,136 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -package org.rocksdb; - -/** - * An iterator yields a sequence of key/value pairs from a source. - * The following class defines the interface. Multiple implementations - * are provided by this library. In particular, iterators are provided - * to access the contents of a Table or a DB. - * - * Multiple threads can invoke const methods on an RocksIterator without - * external synchronization, but if any of the threads may call a - * non-const method, all threads accessing the same RocksIterator must use - * external synchronization. - */ -public class RocksIterator extends RocksObject { - public RocksIterator(long nativeHandle) { - super(); - nativeHandle_ = nativeHandle; - } - - /** - * An iterator is either positioned at a key/value pair, or - * not valid. This method returns true iff the iterator is valid. - * @return true if iterator is valid. - */ - public boolean isValid() { - assert(isInitialized()); - return isValid0(nativeHandle_); - } - - /** - * Position at the first key in the source. The iterator is Valid() - * after this call iff the source is not empty. - */ - public void seekToFirst() { - assert(isInitialized()); - seekToFirst0(nativeHandle_); - } - - /** - * Position at the last key in the source. The iterator is - * Valid() after this call iff the source is not empty. - */ - public void seekToLast() { - assert(isInitialized()); - seekToLast0(nativeHandle_); - } - - /** - * Moves to the next entry in the source. After this call, Valid() is - * true iff the iterator was not positioned at the last entry in the source. - * REQUIRES: Valid() - */ - public void next() { - assert(isInitialized()); - next0(nativeHandle_); - } - - /** - * Moves to the previous entry in the source. After this call, Valid() is - * true iff the iterator was not positioned at the first entry in source. 
- * REQUIRES: Valid() - */ - public void prev() { - assert(isInitialized()); - prev0(nativeHandle_); - } - - /** - * Return the key for the current entry. The underlying storage for - * the returned slice is valid only until the next modification of - * the iterator. - * REQUIRES: Valid() - * @return key for the current entry. - */ - public byte[] key() { - assert(isInitialized()); - return key0(nativeHandle_); - } - - /** - * Return the value for the current entry. The underlying storage for - * the returned slice is valid only until the next modification of - * the iterator. - * REQUIRES: !AtEnd() && !AtStart() - * @return value for the current entry. - */ - public byte[] value() { - assert(isInitialized()); - return value0(nativeHandle_); - } - - /** - * Position at the first key in the source that at or past target - * The iterator is Valid() after this call iff the source contains - * an entry that comes at or past target. - */ - public void seek(byte[] target) { - assert(isInitialized()); - seek0(nativeHandle_, target, target.length); - } - - /** - * If an error has occurred, return it. Else return an ok status. - * If non-blocking IO is requested and this operation cannot be - * satisfied without doing some IO, then this returns Status::Incomplete(). - * - */ - public void status() throws RocksDBException { - assert(isInitialized()); - status0(nativeHandle_); - } - - /** - * Deletes underlying C++ iterator pointer. 
- */ - @Override protected void disposeInternal() { - assert(isInitialized()); - disposeInternal(nativeHandle_); - } - - private native boolean isValid0(long handle); - private native void disposeInternal(long handle); - private native void seekToFirst0(long handle); - private native void seekToLast0(long handle); - private native void next0(long handle); - private native void prev0(long handle); - private native byte[] key0(long handle); - private native byte[] value0(long handle); - private native void seek0(long handle, byte[] target, int targetLen); - private native void status0(long handle); -} diff --git a/java/org/rocksdb/SkipListMemTableConfig.java b/java/org/rocksdb/SkipListMemTableConfig.java deleted file mode 100644 index 7f9f5cb5f..000000000 --- a/java/org/rocksdb/SkipListMemTableConfig.java +++ /dev/null @@ -1,15 +0,0 @@ -package org.rocksdb; - -/** - * The config for skip-list memtable representation. - */ -public class SkipListMemTableConfig extends MemTableConfig { - public SkipListMemTableConfig() { - } - - @Override protected long newMemTableFactoryHandle() { - return newMemTableFactoryHandle0(); - } - - private native long newMemTableFactoryHandle0(); -} diff --git a/java/org/rocksdb/WriteBatch.java b/java/org/rocksdb/WriteBatch.java deleted file mode 100644 index f538dc1a0..000000000 --- a/java/org/rocksdb/WriteBatch.java +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -package org.rocksdb; - -import java.util.*; - -/** - * WriteBatch holds a collection of updates to apply atomically to a DB. - * - * The updates are applied in the order in which they are added - * to the WriteBatch. 
For example, the value of "key" will be "v3" - * after the following batch is written: - * - * batch.put("key", "v1"); - * batch.remove("key"); - * batch.put("key", "v2"); - * batch.put("key", "v3"); - * - * Multiple threads can invoke const methods on a WriteBatch without - * external synchronization, but if any of the threads may call a - * non-const method, all threads accessing the same WriteBatch must use - * external synchronization. - */ -public class WriteBatch extends RocksObject { - public WriteBatch() { - super(); - newWriteBatch(0); - } - - public WriteBatch(int reserved_bytes) { - nativeHandle_ = 0; - newWriteBatch(reserved_bytes); - } - - /** - * Returns the number of updates in the batch. - */ - public native int count(); - - /** - * Store the mapping "key->value" in the database. - */ - public void put(byte[] key, byte[] value) { - put(key, key.length, value, value.length); - } - - /** - * Merge "value" with the existing value of "key" in the database. - * "key->merge(existing, value)" - */ - public void merge(byte[] key, byte[] value) { - merge(key, key.length, value, value.length); - } - - /** - * If the database contains a mapping for "key", erase it. Else do nothing. - */ - public void remove(byte[] key) { - remove(key, key.length); - } - - /** - * Append a blob of arbitrary size to the records in this batch. The blob will - * be stored in the transaction log but not in any other file. In particular, - * it will not be persisted to the SST files. When iterating over this - * WriteBatch, WriteBatch::Handler::LogData will be called with the contents - * of the blob as it is encountered. Blobs, puts, deletes, and merges will be - * encountered in the same order in thich they were inserted. The blob will - * NOT consume sequence number(s) and will NOT increase the count of the batch - * - * Example application: add timestamps to the transaction log for use in - * replication. 
- */ - public void putLogData(byte[] blob) { - putLogData(blob, blob.length); - } - - /** - * Clear all updates buffered in this batch - */ - public native void clear(); - - /** - * Delete the c++ side pointer. - */ - @Override protected void disposeInternal() { - assert(isInitialized()); - disposeInternal(nativeHandle_); - } - - private native void newWriteBatch(int reserved_bytes); - private native void put(byte[] key, int keyLen, - byte[] value, int valueLen); - private native void merge(byte[] key, int keyLen, - byte[] value, int valueLen); - private native void remove(byte[] key, int keyLen); - private native void putLogData(byte[] blob, int blobLen); - private native void disposeInternal(long handle); -} - -/** - * Package-private class which provides java api to access - * c++ WriteBatchInternal. - */ -class WriteBatchInternal { - static native void setSequence(WriteBatch batch, long sn); - static native long sequence(WriteBatch batch); - static native void append(WriteBatch b1, WriteBatch b2); -} diff --git a/java/org/rocksdb/WriteBatchTest.java b/java/org/rocksdb/WriteBatchTest.java deleted file mode 100644 index 03a866313..000000000 --- a/java/org/rocksdb/WriteBatchTest.java +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. -package org.rocksdb; - -import java.util.*; -import java.io.UnsupportedEncodingException; - -/** - * This class mimics the db/write_batch_test.cc in the c++ rocksdb library. 
- */ -public class WriteBatchTest { - static { - RocksDB.loadLibrary(); - } - - public static void main(String args[]) { - System.out.println("Testing WriteBatchTest.Empty ==="); - Empty(); - - System.out.println("Testing WriteBatchTest.Multiple ==="); - Multiple(); - - System.out.println("Testing WriteBatchTest.Append ==="); - Append(); - - System.out.println("Testing WriteBatchTest.Blob ==="); - Blob(); - - // The following tests have not yet ported. - // Continue(); - // PutGatherSlices(); - - System.out.println("Passed all WriteBatchTest!"); - } - - static void Empty() { - WriteBatch batch = new WriteBatch(); - assert(batch.count() == 0); - } - - static void Multiple() { - try { - WriteBatch batch = new WriteBatch(); - batch.put("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII")); - batch.remove("box".getBytes("US-ASCII")); - batch.put("baz".getBytes("US-ASCII"), "boo".getBytes("US-ASCII")); - WriteBatchInternal.setSequence(batch, 100); - assert(100 == WriteBatchInternal.sequence(batch)); - assert(3 == batch.count()); - assert(new String("Put(baz, boo)@102" + - "Delete(box)@101" + - "Put(foo, bar)@100") - .equals(new String(getContents(batch), "US-ASCII"))); - } catch (UnsupportedEncodingException e) { - System.err.println(e); - assert(false); - } - } - - static void Append() { - WriteBatch b1 = new WriteBatch(); - WriteBatch b2 = new WriteBatch(); - WriteBatchInternal.setSequence(b1, 200); - WriteBatchInternal.setSequence(b2, 300); - WriteBatchInternal.append(b1, b2); - assert(getContents(b1).length == 0); - assert(b1.count() == 0); - try { - b2.put("a".getBytes("US-ASCII"), "va".getBytes("US-ASCII")); - WriteBatchInternal.append(b1, b2); - assert("Put(a, va)@200".equals(new String(getContents(b1), "US-ASCII"))); - assert(1 == b1.count()); - b2.clear(); - b2.put("b".getBytes("US-ASCII"), "vb".getBytes("US-ASCII")); - WriteBatchInternal.append(b1, b2); - assert(new String("Put(a, va)@200" + - "Put(b, vb)@201") - .equals(new String(getContents(b1), 
"US-ASCII"))); - assert(2 == b1.count()); - b2.remove("foo".getBytes("US-ASCII")); - WriteBatchInternal.append(b1, b2); - assert(new String("Put(a, va)@200" + - "Put(b, vb)@202" + - "Put(b, vb)@201" + - "Delete(foo)@203") - .equals(new String(getContents(b1), "US-ASCII"))); - assert(4 == b1.count()); - } catch (UnsupportedEncodingException e) { - System.err.println(e); - assert(false); - } - } - - static void Blob() { - WriteBatch batch = new WriteBatch(); - try { - batch.put("k1".getBytes("US-ASCII"), "v1".getBytes("US-ASCII")); - batch.put("k2".getBytes("US-ASCII"), "v2".getBytes("US-ASCII")); - batch.put("k3".getBytes("US-ASCII"), "v3".getBytes("US-ASCII")); - batch.putLogData("blob1".getBytes("US-ASCII")); - batch.remove("k2".getBytes("US-ASCII")); - batch.putLogData("blob2".getBytes("US-ASCII")); - batch.merge("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII")); - assert(5 == batch.count()); - assert(new String("Merge(foo, bar)@4" + - "Put(k1, v1)@0" + - "Delete(k2)@3" + - "Put(k2, v2)@1" + - "Put(k3, v3)@2") - .equals(new String(getContents(batch), "US-ASCII"))); - } catch (UnsupportedEncodingException e) { - System.err.println(e); - assert(false); - } - } - - static native byte[] getContents(WriteBatch batch); -} diff --git a/java/org/rocksdb/test/BackupableDBTest.java b/java/org/rocksdb/test/BackupableDBTest.java deleted file mode 100644 index 9d3a64c06..000000000 --- a/java/org/rocksdb/test/BackupableDBTest.java +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. 
- -package org.rocksdb.test; - -import org.rocksdb.*; - -public class BackupableDBTest { - static final String db_path = "/tmp/backupablejni_db"; - static final String backup_path = "/tmp/backupablejni_db_backup"; - static { - RocksDB.loadLibrary(); - } - public static void main(String[] args) { - - Options opt = new Options(); - opt.setCreateIfMissing(true); - - BackupableDBOptions bopt = new BackupableDBOptions(backup_path, false, - true, false, true, 0, 0); - BackupableDB bdb = null; - - try { - bdb = BackupableDB.open(opt, bopt, db_path); - - bdb.put("abc".getBytes(), "def".getBytes()); - bdb.put("ghi".getBytes(), "jkl".getBytes()); - bdb.createNewBackup(true); - - // delete record after backup - bdb.remove("abc".getBytes()); - byte[] value = bdb.get("abc".getBytes()); - assert(value == null); - bdb.close(); - - // restore from backup - RestoreOptions ropt = new RestoreOptions(false); - RestoreBackupableDB rdb = new RestoreBackupableDB(bopt); - rdb.restoreDBFromLatestBackup(db_path, db_path, - ropt); - rdb.dispose(); - ropt.dispose(); - - // verify that backed up data contains deleted record - bdb = BackupableDB.open(opt, bopt, db_path); - value = bdb.get("abc".getBytes()); - assert(new String(value).equals("def")); - - System.out.println("Backup and restore test passed"); - } catch (RocksDBException e) { - System.err.format("[ERROR]: %s%n", e); - e.printStackTrace(); - } finally { - opt.dispose(); - bopt.dispose(); - if (bdb != null) { - bdb.close(); - } - } - } -} diff --git a/java/org/rocksdb/test/OptionsTest.java b/java/org/rocksdb/test/OptionsTest.java deleted file mode 100644 index b065c9023..000000000 --- a/java/org/rocksdb/test/OptionsTest.java +++ /dev/null @@ -1,388 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -package org.rocksdb.test; - -import java.util.Random; -import org.rocksdb.RocksDB; -import org.rocksdb.Options; - -public class OptionsTest { - static { - RocksDB.loadLibrary(); - } - public static void main(String[] args) { - Options opt = new Options(); - Random rand = new Random(); - { // CreateIfMissing test - boolean boolValue = rand.nextBoolean(); - opt.setCreateIfMissing(boolValue); - assert(opt.createIfMissing() == boolValue); - } - - { // ErrorIfExists test - boolean boolValue = rand.nextBoolean(); - opt.setErrorIfExists(boolValue); - assert(opt.errorIfExists() == boolValue); - } - - { // ParanoidChecks test - boolean boolValue = rand.nextBoolean(); - opt.setParanoidChecks(boolValue); - assert(opt.paranoidChecks() == boolValue); - } - - { // MaxOpenFiles test - int intValue = rand.nextInt(); - opt.setMaxOpenFiles(intValue); - assert(opt.maxOpenFiles() == intValue); - } - - { // DisableDataSync test - boolean boolValue = rand.nextBoolean(); - opt.setDisableDataSync(boolValue); - assert(opt.disableDataSync() == boolValue); - } - - { // UseFsync test - boolean boolValue = rand.nextBoolean(); - opt.setUseFsync(boolValue); - assert(opt.useFsync() == boolValue); - } - - { // DbStatsLogInterval test - int intValue = rand.nextInt(); - opt.setDbStatsLogInterval(intValue); - assert(opt.dbStatsLogInterval() == intValue); - } - - { // DbLogDir test - String str = "path/to/DbLogDir"; - opt.setDbLogDir(str); - assert(opt.dbLogDir().equals(str)); - } - - { // WalDir test - String str = "path/to/WalDir"; - opt.setWalDir(str); - assert(opt.walDir().equals(str)); - } - - { // DeleteObsoleteFilesPeriodMicros test - long longValue = rand.nextLong(); - opt.setDeleteObsoleteFilesPeriodMicros(longValue); - assert(opt.deleteObsoleteFilesPeriodMicros() == longValue); - } - - { // MaxBackgroundCompactions test - int intValue = rand.nextInt(); - 
opt.setMaxBackgroundCompactions(intValue); - assert(opt.maxBackgroundCompactions() == intValue); - } - - { // MaxBackgroundFlushes test - int intValue = rand.nextInt(); - opt.setMaxBackgroundFlushes(intValue); - assert(opt.maxBackgroundFlushes() == intValue); - } - - { // MaxLogFileSize test - long longValue = rand.nextLong(); - opt.setMaxLogFileSize(longValue); - assert(opt.maxLogFileSize() == longValue); - } - - { // LogFileTimeToRoll test - long longValue = rand.nextLong(); - opt.setLogFileTimeToRoll(longValue); - assert(opt.logFileTimeToRoll() == longValue); - } - - { // KeepLogFileNum test - long longValue = rand.nextLong(); - opt.setKeepLogFileNum(longValue); - assert(opt.keepLogFileNum() == longValue); - } - - { // MaxManifestFileSize test - long longValue = rand.nextLong(); - opt.setMaxManifestFileSize(longValue); - assert(opt.maxManifestFileSize() == longValue); - } - - { // TableCacheNumshardbits test - int intValue = rand.nextInt(); - opt.setTableCacheNumshardbits(intValue); - assert(opt.tableCacheNumshardbits() == intValue); - } - - { // TableCacheRemoveScanCountLimit test - int intValue = rand.nextInt(); - opt.setTableCacheRemoveScanCountLimit(intValue); - assert(opt.tableCacheRemoveScanCountLimit() == intValue); - } - - { // WalTtlSeconds test - long longValue = rand.nextLong(); - opt.setWalTtlSeconds(longValue); - assert(opt.walTtlSeconds() == longValue); - } - - { // ManifestPreallocationSize test - long longValue = rand.nextLong(); - opt.setManifestPreallocationSize(longValue); - assert(opt.manifestPreallocationSize() == longValue); - } - - { // AllowOsBuffer test - boolean boolValue = rand.nextBoolean(); - opt.setAllowOsBuffer(boolValue); - assert(opt.allowOsBuffer() == boolValue); - } - - { // AllowMmapReads test - boolean boolValue = rand.nextBoolean(); - opt.setAllowMmapReads(boolValue); - assert(opt.allowMmapReads() == boolValue); - } - - { // AllowMmapWrites test - boolean boolValue = rand.nextBoolean(); - opt.setAllowMmapWrites(boolValue); - 
assert(opt.allowMmapWrites() == boolValue); - } - - { // IsFdCloseOnExec test - boolean boolValue = rand.nextBoolean(); - opt.setIsFdCloseOnExec(boolValue); - assert(opt.isFdCloseOnExec() == boolValue); - } - - { // SkipLogErrorOnRecovery test - boolean boolValue = rand.nextBoolean(); - opt.setSkipLogErrorOnRecovery(boolValue); - assert(opt.skipLogErrorOnRecovery() == boolValue); - } - - { // StatsDumpPeriodSec test - int intValue = rand.nextInt(); - opt.setStatsDumpPeriodSec(intValue); - assert(opt.statsDumpPeriodSec() == intValue); - } - - { // AdviseRandomOnOpen test - boolean boolValue = rand.nextBoolean(); - opt.setAdviseRandomOnOpen(boolValue); - assert(opt.adviseRandomOnOpen() == boolValue); - } - - { // UseAdaptiveMutex test - boolean boolValue = rand.nextBoolean(); - opt.setUseAdaptiveMutex(boolValue); - assert(opt.useAdaptiveMutex() == boolValue); - } - - { // BytesPerSync test - long longValue = rand.nextLong(); - opt.setBytesPerSync(longValue); - assert(opt.bytesPerSync() == longValue); - } - - { // AllowThreadLocal test - boolean boolValue = rand.nextBoolean(); - opt.setAllowThreadLocal(boolValue); - assert(opt.allowThreadLocal() == boolValue); - } - - { // WriteBufferSize test - long longValue = rand.nextLong(); - opt.setWriteBufferSize(longValue); - assert(opt.writeBufferSize() == longValue); - } - - { // MaxWriteBufferNumber test - int intValue = rand.nextInt(); - opt.setMaxWriteBufferNumber(intValue); - assert(opt.maxWriteBufferNumber() == intValue); - } - - { // MinWriteBufferNumberToMerge test - int intValue = rand.nextInt(); - opt.setMinWriteBufferNumberToMerge(intValue); - assert(opt.minWriteBufferNumberToMerge() == intValue); - } - - { // NumLevels test - int intValue = rand.nextInt(); - opt.setNumLevels(intValue); - assert(opt.numLevels() == intValue); - } - - { // LevelFileNumCompactionTrigger test - int intValue = rand.nextInt(); - opt.setLevelZeroFileNumCompactionTrigger(intValue); - assert(opt.levelZeroFileNumCompactionTrigger() == 
intValue); - } - - { // LevelSlowdownWritesTrigger test - int intValue = rand.nextInt(); - opt.setLevelZeroSlowdownWritesTrigger(intValue); - assert(opt.levelZeroSlowdownWritesTrigger() == intValue); - } - - { // LevelStopWritesTrigger test - int intValue = rand.nextInt(); - opt.setLevelZeroStopWritesTrigger(intValue); - assert(opt.levelZeroStopWritesTrigger() == intValue); - } - - { // MaxMemCompactionLevel test - int intValue = rand.nextInt(); - opt.setMaxMemCompactionLevel(intValue); - assert(opt.maxMemCompactionLevel() == intValue); - } - - { // TargetFileSizeBase test - int intValue = rand.nextInt(); - opt.setTargetFileSizeBase(intValue); - assert(opt.targetFileSizeBase() == intValue); - } - - { // TargetFileSizeMultiplier test - int intValue = rand.nextInt(); - opt.setTargetFileSizeMultiplier(intValue); - assert(opt.targetFileSizeMultiplier() == intValue); - } - - { // MaxBytesForLevelBase test - long longValue = rand.nextLong(); - opt.setMaxBytesForLevelBase(longValue); - assert(opt.maxBytesForLevelBase() == longValue); - } - - { // MaxBytesForLevelMultiplier test - int intValue = rand.nextInt(); - opt.setMaxBytesForLevelMultiplier(intValue); - assert(opt.maxBytesForLevelMultiplier() == intValue); - } - - { // ExpandedCompactionFactor test - int intValue = rand.nextInt(); - opt.setExpandedCompactionFactor(intValue); - assert(opt.expandedCompactionFactor() == intValue); - } - - { // SourceCompactionFactor test - int intValue = rand.nextInt(); - opt.setSourceCompactionFactor(intValue); - assert(opt.sourceCompactionFactor() == intValue); - } - - { // MaxGrandparentOverlapFactor test - int intValue = rand.nextInt(); - opt.setMaxGrandparentOverlapFactor(intValue); - assert(opt.maxGrandparentOverlapFactor() == intValue); - } - - { // SoftRateLimit test - double doubleValue = rand.nextDouble(); - opt.setSoftRateLimit(doubleValue); - assert(opt.softRateLimit() == doubleValue); - } - - { // HardRateLimit test - double doubleValue = rand.nextDouble(); - 
opt.setHardRateLimit(doubleValue); - assert(opt.hardRateLimit() == doubleValue); - } - - { // RateLimitDelayMaxMilliseconds test - int intValue = rand.nextInt(); - opt.setRateLimitDelayMaxMilliseconds(intValue); - assert(opt.rateLimitDelayMaxMilliseconds() == intValue); - } - - { // ArenaBlockSize test - long longValue = rand.nextLong(); - opt.setArenaBlockSize(longValue); - assert(opt.arenaBlockSize() == longValue); - } - - { // DisableAutoCompactions test - boolean boolValue = rand.nextBoolean(); - opt.setDisableAutoCompactions(boolValue); - assert(opt.disableAutoCompactions() == boolValue); - } - - { // PurgeRedundantKvsWhileFlush test - boolean boolValue = rand.nextBoolean(); - opt.setPurgeRedundantKvsWhileFlush(boolValue); - assert(opt.purgeRedundantKvsWhileFlush() == boolValue); - } - - { // VerifyChecksumsInCompaction test - boolean boolValue = rand.nextBoolean(); - opt.setVerifyChecksumsInCompaction(boolValue); - assert(opt.verifyChecksumsInCompaction() == boolValue); - } - - { // FilterDeletes test - boolean boolValue = rand.nextBoolean(); - opt.setFilterDeletes(boolValue); - assert(opt.filterDeletes() == boolValue); - } - - { // MaxSequentialSkipInIterations test - long longValue = rand.nextLong(); - opt.setMaxSequentialSkipInIterations(longValue); - assert(opt.maxSequentialSkipInIterations() == longValue); - } - - { // InplaceUpdateSupport test - boolean boolValue = rand.nextBoolean(); - opt.setInplaceUpdateSupport(boolValue); - assert(opt.inplaceUpdateSupport() == boolValue); - } - - { // InplaceUpdateNumLocks test - long longValue = rand.nextLong(); - opt.setInplaceUpdateNumLocks(longValue); - assert(opt.inplaceUpdateNumLocks() == longValue); - } - - { // MemtablePrefixBloomBits test - int intValue = rand.nextInt(); - opt.setMemtablePrefixBloomBits(intValue); - assert(opt.memtablePrefixBloomBits() == intValue); - } - - { // MemtablePrefixBloomProbes test - int intValue = rand.nextInt(); - opt.setMemtablePrefixBloomProbes(intValue); - 
assert(opt.memtablePrefixBloomProbes() == intValue); - } - - { // BloomLocality test - int intValue = rand.nextInt(); - opt.setBloomLocality(intValue); - assert(opt.bloomLocality() == intValue); - } - - { // MaxSuccessiveMerges test - long longValue = rand.nextLong(); - opt.setMaxSuccessiveMerges(longValue); - assert(opt.maxSuccessiveMerges() == longValue); - } - - { // MinPartialMergeOperands test - int intValue = rand.nextInt(); - opt.setMinPartialMergeOperands(intValue); - assert(opt.minPartialMergeOperands() == intValue); - } - - opt.dispose(); - System.out.println("Passed OptionsTest"); - } -} diff --git a/java/org/rocksdb/test/ReadOptionsTest.java b/java/org/rocksdb/test/ReadOptionsTest.java deleted file mode 100644 index b3b5b2690..000000000 --- a/java/org/rocksdb/test/ReadOptionsTest.java +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. 
- -package org.rocksdb.test; - -import java.util.Random; -import org.rocksdb.RocksDB; -import org.rocksdb.ReadOptions; - -public class ReadOptionsTest { - static { - RocksDB.loadLibrary(); - } - public static void main(String[] args) { - ReadOptions opt = new ReadOptions(); - Random rand = new Random(); - { // VerifyChecksums test - boolean boolValue = rand.nextBoolean(); - opt.setVerifyChecksums(boolValue); - assert(opt.verifyChecksums() == boolValue); - } - - { // FillCache test - boolean boolValue = rand.nextBoolean(); - opt.setFillCache(boolValue); - assert(opt.fillCache() == boolValue); - } - - { // Tailing test - boolean boolValue = rand.nextBoolean(); - opt.setTailing(boolValue); - assert(opt.tailing() == boolValue); - } - - opt.dispose(); - System.out.println("Passed ReadOptionsTest"); - } -} diff --git a/java/org/rocksdb/test/StatisticsCollectorTest.java b/java/org/rocksdb/test/StatisticsCollectorTest.java deleted file mode 100644 index e497d14df..000000000 --- a/java/org/rocksdb/test/StatisticsCollectorTest.java +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. 
- -package org.rocksdb.test; - -import java.util.Collections; -import org.rocksdb.*; - -public class StatisticsCollectorTest { - static final String db_path = "/tmp/backupablejni_db"; - static { - RocksDB.loadLibrary(); - } - - public static void main(String[] args) - throws InterruptedException, RocksDBException { - Options opt = new Options().createStatistics().setCreateIfMissing(true); - Statistics stats = opt.statisticsPtr(); - - RocksDB db = RocksDB.open(db_path); - - StatsCallbackMock callback = new StatsCallbackMock(); - StatsCollectorInput statsInput = new StatsCollectorInput(stats, callback); - - StatisticsCollector statsCollector = new StatisticsCollector( - Collections.singletonList(statsInput), 100); - statsCollector.start(); - - Thread.sleep(1000); - - assert(callback.tickerCallbackCount > 0); - assert(callback.histCallbackCount > 0); - - statsCollector.shutDown(1000); - - db.close(); - opt.dispose(); - - System.out.println("Stats collector test passed.!"); - } -} diff --git a/java/org/rocksdb/util/Environment.java b/java/org/rocksdb/util/Environment.java deleted file mode 100644 index c2e3bc088..000000000 --- a/java/org/rocksdb/util/Environment.java +++ /dev/null @@ -1,37 +0,0 @@ -package org.rocksdb.util; - -public class Environment { - private static String OS = System.getProperty("os.name").toLowerCase(); - - public static boolean isWindows() { - return (OS.indexOf("win") >= 0); - } - - public static boolean isMac() { - return (OS.indexOf("mac") >= 0); - } - - public static boolean isUnix() { - return (OS.indexOf("nix") >= 0 || - OS.indexOf("nux") >= 0 || - OS.indexOf("aix") >= 0); - } - - public static String getSharedLibraryName(String name) { - if (isUnix()) { - return String.format("lib%s.so", name); - } else if (isMac()) { - return String.format("lib%s.dylib", name); - } - throw new UnsupportedOperationException(); - } - - public static String getJniLibraryName(String name) { - if (isUnix()) { - return String.format("lib%s.so", name); - } else 
if (isMac()) { - return String.format("lib%s.jnilib", name); - } - throw new UnsupportedOperationException(); - } -} diff --git a/java/rocksjni.pom b/java/rocksjni.pom new file mode 100644 index 000000000..74676fdf4 --- /dev/null +++ b/java/rocksjni.pom @@ -0,0 +1,140 @@ + + + 4.0.0 + RocksDB JNI + http://rocksdb.org/ + org.rocksdb + rocksdbjni + + - + RocksDB fat jar that contains .so files for linux32 and linux64, and jnilib files + for Mac OSX. + + + + Apache License 2.0 + http://www.apache.org/licenses/LICENSE-2.0.html + repo + + + + scm:git:git://github.com/dropwizard/metrics.git + scm:git:git@github.com:dropwizard/metrics.git + http://github.com/dropwizard/metrics/ + HEAD + + + + Facebook + help@facebook.com + America/New_York + + architect + + + + + + 1.7 + 1.7 + UTF-8 + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.2 + + ${project.build.source} + ${project.build.target} + ${project.build.sourceEncoding} + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.18.1 + + -ea -Xcheck:jni -Djava.library.path=${project.build.directory} + + + + org.jacoco + jacoco-maven-plugin + 0.7.2.201409121644 + + + + prepare-agent + + + + report + prepare-package + + report + + + + + + org.codehaus.gmaven + groovy-maven-plugin + 2.0 + + + process-classes + + execute + + + + Xenu + + + String fileContents = new File("${project.basedir}/../include/rocksdb/version.h").getText('UTF-8') + matcher = (fileContents =~ /(?s).*ROCKSDB_MAJOR ([0-9]+).*?/) + String major_version = matcher.getAt(0).getAt(1) + matcher = (fileContents =~ /(?s).*ROCKSDB_MINOR ([0-9]+).*?/) + String minor_version = matcher.getAt(0).getAt(1) + matcher = (fileContents =~ /(?s).*ROCKSDB_PATCH ([0-9]+).*?/) + String patch_version = matcher.getAt(0).getAt(1) + String version = String.format('%s.%s.%s', major_version, minor_version, patch_version) + // Set version to be used in pom.properties + project.version = version + // Set version to be set as jar name + project.build.finalName = 
project.artifactId + "-" + version + + + + + + + + + + + junit + junit + 4.12 + test + + + org.assertj + assertj-core + 1.7.1 + test + + + org.mockito + mockito-all + 1.10.19 + test + + + diff --git a/java/rocksjni/backupablejni.cc b/java/rocksjni/backupablejni.cc index 2aa1d0b1d..d26e46e88 100644 --- a/java/rocksjni/backupablejni.cc +++ b/java/rocksjni/backupablejni.cc @@ -11,6 +11,7 @@ #include #include #include +#include #include "include/org_rocksdb_BackupableDB.h" #include "include/org_rocksdb_BackupableDBOptions.h" @@ -53,7 +54,7 @@ void Java_org_rocksdb_BackupableDB_createNewBackup( * Signature: (JI)V */ void Java_org_rocksdb_BackupableDB_purgeOldBackups( - JNIEnv* env, jobject jbdb, jlong jhandle, jboolean jnumBackupsToKeep) { + JNIEnv* env, jobject jbdb, jlong jhandle, jint jnumBackupsToKeep) { rocksdb::Status s = reinterpret_cast(jhandle)-> PurgeOldBackups(jnumBackupsToKeep); @@ -62,6 +63,78 @@ void Java_org_rocksdb_BackupableDB_purgeOldBackups( } } +/* + * Class: org_rocksdb_BackupableDB + * Method: deleteBackup0 + * Signature: (JI)V + */ +void Java_org_rocksdb_BackupableDB_deleteBackup0(JNIEnv* env, + jobject jobj, jlong jhandle, jint jbackup_id) { + auto rdb = reinterpret_cast(jhandle); + rocksdb::Status s = rdb->DeleteBackup(jbackup_id); + + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_BackupableDB + * Method: getBackupInfo + * Signature: (J)Ljava/util/List; + */ +jobject Java_org_rocksdb_BackupableDB_getBackupInfo( + JNIEnv* env, jobject jbdb, jlong jhandle) { + std::vector backup_infos; + reinterpret_cast(jhandle)-> + GetBackupInfo(&backup_infos); + return rocksdb::BackupInfoListJni::getBackupInfo(env, + backup_infos); +} + +/* + * Class: org_rocksdb_BackupableDB + * Method: getCorruptedBackups + * Signature: (J)[I; + */ +jintArray Java_org_rocksdb_BackupableDB_getCorruptedBackups( + JNIEnv* env, jobject jbdb, jlong jhandle) { + std::vector backup_ids; + reinterpret_cast(jhandle)-> + 
GetCorruptedBackups(&backup_ids); + // store backupids in int array + const std::vector::size_type + kIdSize = backup_ids.size(); + int int_backup_ids[kIdSize]; + for (std::vector::size_type i = 0; + i != kIdSize; i++) { + int_backup_ids[i] = backup_ids[i]; + } + // Store ints in java array + jintArray ret_backup_ids; + // Its ok to loose precision here (64->32) + jsize ret_backup_ids_size = static_cast(kIdSize); + ret_backup_ids = env->NewIntArray(ret_backup_ids_size); + env->SetIntArrayRegion(ret_backup_ids, 0, ret_backup_ids_size, + int_backup_ids); + return ret_backup_ids; +} + +/* + * Class: org_rocksdb_BackupableDB + * Method: garbageCollect + * Signature: (J)V + */ +void Java_org_rocksdb_BackupableDB_garbageCollect(JNIEnv* env, + jobject jobj, jlong jhandle) { + auto db = reinterpret_cast(jhandle); + rocksdb::Status s = db->GarbageCollect(); + + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + /////////////////////////////////////////////////////////////////////////// // BackupDBOptions @@ -71,20 +144,10 @@ void Java_org_rocksdb_BackupableDB_purgeOldBackups( * Signature: (Ljava/lang/String;)V */ void Java_org_rocksdb_BackupableDBOptions_newBackupableDBOptions( - JNIEnv* env, jobject jobj, jstring jpath, jboolean jshare_table_files, - jboolean jsync, jboolean jdestroy_old_data, jboolean jbackup_log_files, - jlong jbackup_rate_limit, jlong jrestore_rate_limit) { - jbackup_rate_limit = (jbackup_rate_limit <= 0) ? 0 : jbackup_rate_limit; - jrestore_rate_limit = (jrestore_rate_limit <= 0) ? 
0 : jrestore_rate_limit; - + JNIEnv* env, jobject jobj, jstring jpath) { const char* cpath = env->GetStringUTFChars(jpath, 0); - - auto bopt = new rocksdb::BackupableDBOptions(cpath, nullptr, - jshare_table_files, nullptr, jsync, jdestroy_old_data, jbackup_log_files, - jbackup_rate_limit, jrestore_rate_limit); - + auto bopt = new rocksdb::BackupableDBOptions(cpath); env->ReleaseStringUTFChars(jpath, cpath); - rocksdb::BackupableDBOptionsJni::setHandle(env, jobj, bopt); } @@ -94,11 +157,165 @@ void Java_org_rocksdb_BackupableDBOptions_newBackupableDBOptions( * Signature: (J)Ljava/lang/String; */ jstring Java_org_rocksdb_BackupableDBOptions_backupDir( - JNIEnv* env, jobject jopt, jlong jhandle, jstring jpath) { + JNIEnv* env, jobject jopt, jlong jhandle) { auto bopt = reinterpret_cast(jhandle); return env->NewStringUTF(bopt->backup_dir.c_str()); } +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: setShareTableFiles + * Signature: (JZ)V + */ +void Java_org_rocksdb_BackupableDBOptions_setShareTableFiles( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + auto bopt = reinterpret_cast(jhandle); + bopt->share_table_files = flag; +} + +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: shareTableFiles + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_BackupableDBOptions_shareTableFiles( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto bopt = reinterpret_cast(jhandle); + return bopt->share_table_files; +} + +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: setSync + * Signature: (JZ)V + */ +void Java_org_rocksdb_BackupableDBOptions_setSync( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + auto bopt = reinterpret_cast(jhandle); + bopt->sync = flag; +} + +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: sync + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_BackupableDBOptions_sync( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto bopt = reinterpret_cast(jhandle); + return bopt->sync; +} + +/* + * Class: 
org_rocksdb_BackupableDBOptions + * Method: setDestroyOldData + * Signature: (JZ)V + */ +void Java_org_rocksdb_BackupableDBOptions_setDestroyOldData( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + auto bopt = reinterpret_cast(jhandle); + bopt->destroy_old_data = flag; +} + +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: destroyOldData + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_BackupableDBOptions_destroyOldData( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto bopt = reinterpret_cast(jhandle); + return bopt->destroy_old_data; +} + +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: setBackupLogFiles + * Signature: (JZ)V + */ +void Java_org_rocksdb_BackupableDBOptions_setBackupLogFiles( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + auto bopt = reinterpret_cast(jhandle); + bopt->backup_log_files = flag; +} + +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: backupLogFiles + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_BackupableDBOptions_backupLogFiles( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto bopt = reinterpret_cast(jhandle); + return bopt->backup_log_files; +} + +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: setBackupRateLimit + * Signature: (JJ)V + */ +void Java_org_rocksdb_BackupableDBOptions_setBackupRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jbackup_rate_limit) { + auto bopt = reinterpret_cast(jhandle); + bopt->backup_rate_limit = jbackup_rate_limit; +} + +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: backupRateLimit + * Signature: (J)J + */ +jlong Java_org_rocksdb_BackupableDBOptions_backupRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto bopt = reinterpret_cast(jhandle); + return bopt->backup_rate_limit; +} + +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: setRestoreRateLimit + * Signature: (JJ)V + */ +void Java_org_rocksdb_BackupableDBOptions_setRestoreRateLimit( + JNIEnv* env, jobject jobj, jlong 
jhandle, jlong jrestore_rate_limit) { + auto bopt = reinterpret_cast(jhandle); + bopt->restore_rate_limit = jrestore_rate_limit; +} + +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: restoreRateLimit + * Signature: (J)J + */ +jlong Java_org_rocksdb_BackupableDBOptions_restoreRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto bopt = reinterpret_cast(jhandle); + return bopt->restore_rate_limit; +} + +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: setShareFilesWithChecksum + * Signature: (JZ)V + */ +void Java_org_rocksdb_BackupableDBOptions_setShareFilesWithChecksum( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + auto bopt = reinterpret_cast(jhandle); + bopt->share_files_with_checksum = flag; +} + +/* + * Class: org_rocksdb_BackupableDBOptions + * Method: shareFilesWithChecksum + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_BackupableDBOptions_shareFilesWithChecksum( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto bopt = reinterpret_cast(jhandle); + return bopt->share_files_with_checksum; +} + /* * Class: org_rocksdb_BackupableDBOptions * Method: disposeInternal @@ -109,6 +326,5 @@ void Java_org_rocksdb_BackupableDBOptions_disposeInternal( auto bopt = reinterpret_cast(jhandle); assert(bopt); delete bopt; - rocksdb::BackupableDBOptionsJni::setHandle(env, jopt, nullptr); } diff --git a/java/rocksjni/checkpoint.cc b/java/rocksjni/checkpoint.cc new file mode 100644 index 000000000..72a40be00 --- /dev/null +++ b/java/rocksjni/checkpoint.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ and enables +// calling c++ rocksdb::Checkpoint methods from Java side. 
+ +#include +#include +#include +#include + +#include "include/org_rocksdb_Checkpoint.h" +#include "rocksjni/portal.h" +#include "rocksdb/db.h" +#include "rocksdb/utilities/checkpoint.h" +/* + * Class: org_rocksdb_Checkpoint + * Method: newCheckpoint + * Signature: (J)J + */ +jlong Java_org_rocksdb_Checkpoint_newCheckpoint(JNIEnv* env, + jclass jclazz, jlong jdb_handle) { + auto db = reinterpret_cast(jdb_handle); + rocksdb::Checkpoint* checkpoint; + rocksdb::Checkpoint::Create(db, &checkpoint); + return reinterpret_cast(checkpoint); +} + +/* + * Class: org_rocksdb_Checkpoint + * Method: dispose + * Signature: (J)V + */ +void Java_org_rocksdb_Checkpoint_disposeInternal(JNIEnv* env, jobject jobj, + jlong jhandle) { + auto checkpoint = reinterpret_cast(jhandle); + assert(checkpoint); + delete checkpoint; +} + +/* + * Class: org_rocksdb_Checkpoint + * Method: createCheckpoint + * Signature: (JLjava/lang/String;)V + */ +void Java_org_rocksdb_Checkpoint_createCheckpoint( + JNIEnv* env, jobject jobj, jlong jcheckpoint_handle, + jstring jcheckpoint_path) { + auto checkpoint = reinterpret_cast( + jcheckpoint_handle); + const char* checkpoint_path = env->GetStringUTFChars( + jcheckpoint_path, 0); + rocksdb::Status s = checkpoint->CreateCheckpoint( + checkpoint_path); + env->ReleaseStringUTFChars(jcheckpoint_path, checkpoint_path); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} diff --git a/java/rocksjni/columnfamilyhandle.cc b/java/rocksjni/columnfamilyhandle.cc new file mode 100644 index 000000000..be3b4c82f --- /dev/null +++ b/java/rocksjni/columnfamilyhandle.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +// This file implements the "bridge" between Java and C++ and enables +// calling c++ rocksdb::Iterator methods from Java side. + +#include +#include +#include + +#include "include/org_rocksdb_ColumnFamilyHandle.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_ColumnFamilyHandle + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_ColumnFamilyHandle_disposeInternal( + JNIEnv* env, jobject jobj, jlong handle) { + auto it = reinterpret_cast(handle); + delete it; +} diff --git a/java/rocksjni/comparator.cc b/java/rocksjni/comparator.cc new file mode 100644 index 000000000..196376235 --- /dev/null +++ b/java/rocksjni/comparator.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ for +// rocksdb::Comparator. 
+ +#include +#include +#include +#include +#include + +#include "include/org_rocksdb_AbstractComparator.h" +#include "include/org_rocksdb_Comparator.h" +#include "include/org_rocksdb_DirectComparator.h" +#include "rocksjni/comparatorjnicallback.h" +#include "rocksjni/portal.h" + +// + +/* + * Class: org_rocksdb_Comparator + * Method: createNewComparator0 + * Signature: ()V + */ +void Java_org_rocksdb_Comparator_createNewComparator0( + JNIEnv* env, jobject jobj, jlong copt_handle) { + const rocksdb::ComparatorJniCallbackOptions* copt = + reinterpret_cast(copt_handle); + const rocksdb::ComparatorJniCallback* c = + new rocksdb::ComparatorJniCallback(env, jobj, copt); + rocksdb::AbstractComparatorJni::setHandle(env, jobj, c); +} +// + +// use_adaptive_mutex)), + mtx_findShortestSeparator(new port::Mutex(copt->use_adaptive_mutex)) { + // Note: Comparator methods may be accessed by multiple threads, + // so we ref the jvm not the env + const jint rs = env->GetJavaVM(&m_jvm); + assert(rs == JNI_OK); + + // Note: we want to access the Java Comparator instance + // across multiple method calls, so we create a global ref + m_jComparator = env->NewGlobalRef(jComparator); + + // Note: The name of a Comparator will not change during it's lifetime, + // so we cache it in a global var + jmethodID jNameMethodId = AbstractComparatorJni::getNameMethodId(env); + jstring jsName = (jstring)env->CallObjectMethod(m_jComparator, jNameMethodId); + m_name = JniUtil::copyString(env, jsName); // also releases jsName + + m_jCompareMethodId = AbstractComparatorJni::getCompareMethodId(env); + m_jFindShortestSeparatorMethodId = + AbstractComparatorJni::getFindShortestSeparatorMethodId(env); + m_jFindShortSuccessorMethodId = + AbstractComparatorJni::getFindShortSuccessorMethodId(env); +} + +/** + * Attach/Get a JNIEnv for the current native thread + */ +JNIEnv* BaseComparatorJniCallback::getJniEnv() const { + JNIEnv *env; + jint rs = m_jvm->AttachCurrentThread(reinterpret_cast(&env), NULL); + 
assert(rs == JNI_OK); + return env; +} + +const char* BaseComparatorJniCallback::Name() const { + return m_name.c_str(); +} + +int BaseComparatorJniCallback::Compare(const Slice& a, const Slice& b) const { + JNIEnv* m_env = getJniEnv(); + + // TODO(adamretter): slice objects can potentially be cached using thread + // local variables to avoid locking. Could make this configurable depending on + // performance. + mtx_compare->Lock(); + + AbstractSliceJni::setHandle(m_env, m_jSliceA, &a); + AbstractSliceJni::setHandle(m_env, m_jSliceB, &b); + jint result = + m_env->CallIntMethod(m_jComparator, m_jCompareMethodId, m_jSliceA, + m_jSliceB); + + mtx_compare->Unlock(); + + m_jvm->DetachCurrentThread(); + + return result; +} + +void BaseComparatorJniCallback::FindShortestSeparator( + std::string* start, const Slice& limit) const { + if (start == nullptr) { + return; + } + + JNIEnv* m_env = getJniEnv(); + + const char* startUtf = start->c_str(); + jstring jsStart = m_env->NewStringUTF(startUtf); + + // TODO(adamretter): slice object can potentially be cached using thread local + // variable to avoid locking. Could make this configurable depending on + // performance. 
+ mtx_findShortestSeparator->Lock(); + + AbstractSliceJni::setHandle(m_env, m_jSliceLimit, &limit); + jstring jsResultStart = + (jstring)m_env->CallObjectMethod(m_jComparator, + m_jFindShortestSeparatorMethodId, jsStart, m_jSliceLimit); + + mtx_findShortestSeparator->Unlock(); + + m_env->DeleteLocalRef(jsStart); + + if (jsResultStart != nullptr) { + // update start with result + *start = + JniUtil::copyString(m_env, jsResultStart); // also releases jsResultStart + } + + m_jvm->DetachCurrentThread(); +} + +void BaseComparatorJniCallback::FindShortSuccessor(std::string* key) const { + if (key == nullptr) { + return; + } + + JNIEnv* m_env = getJniEnv(); + + const char* keyUtf = key->c_str(); + jstring jsKey = m_env->NewStringUTF(keyUtf); + + jstring jsResultKey = + (jstring)m_env->CallObjectMethod(m_jComparator, + m_jFindShortSuccessorMethodId, jsKey); + + m_env->DeleteLocalRef(jsKey); + + if (jsResultKey != nullptr) { + // updates key with result, also releases jsResultKey. + *key = JniUtil::copyString(m_env, jsResultKey); + } + + m_jvm->DetachCurrentThread(); +} + +BaseComparatorJniCallback::~BaseComparatorJniCallback() { + JNIEnv* m_env = getJniEnv(); + + m_env->DeleteGlobalRef(m_jComparator); + + // Note: do not need to explicitly detach, as this function is effectively + // called from the Java class's disposeInternal method, and so already + // has an attached thread, getJniEnv above is just a no-op Attach to get + // the env jvm->DetachCurrentThread(); +} + +ComparatorJniCallback::ComparatorJniCallback( + JNIEnv* env, jobject jComparator, + const ComparatorJniCallbackOptions* copt) : + BaseComparatorJniCallback(env, jComparator, copt) { + m_jSliceA = env->NewGlobalRef(SliceJni::construct0(env)); + m_jSliceB = env->NewGlobalRef(SliceJni::construct0(env)); + m_jSliceLimit = env->NewGlobalRef(SliceJni::construct0(env)); +} + +ComparatorJniCallback::~ComparatorJniCallback() { + JNIEnv* m_env = getJniEnv(); + m_env->DeleteGlobalRef(m_jSliceA); + 
m_env->DeleteGlobalRef(m_jSliceB); + m_env->DeleteGlobalRef(m_jSliceLimit); +} + +DirectComparatorJniCallback::DirectComparatorJniCallback( + JNIEnv* env, jobject jComparator, + const ComparatorJniCallbackOptions* copt) : + BaseComparatorJniCallback(env, jComparator, copt) { + m_jSliceA = env->NewGlobalRef(DirectSliceJni::construct0(env)); + m_jSliceB = env->NewGlobalRef(DirectSliceJni::construct0(env)); + m_jSliceLimit = env->NewGlobalRef(DirectSliceJni::construct0(env)); +} + +DirectComparatorJniCallback::~DirectComparatorJniCallback() { + JNIEnv* m_env = getJniEnv(); + m_env->DeleteGlobalRef(m_jSliceA); + m_env->DeleteGlobalRef(m_jSliceB); + m_env->DeleteGlobalRef(m_jSliceLimit); +} +} // namespace rocksdb diff --git a/java/rocksjni/comparatorjnicallback.h b/java/rocksjni/comparatorjnicallback.h new file mode 100644 index 000000000..65b986ca4 --- /dev/null +++ b/java/rocksjni/comparatorjnicallback.h @@ -0,0 +1,95 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the callback "bridge" between Java and C++ for +// rocksdb::Comparator and rocksdb::DirectComparator. + +#ifndef JAVA_ROCKSJNI_COMPARATORJNICALLBACK_H_ +#define JAVA_ROCKSJNI_COMPARATORJNICALLBACK_H_ + +#include +#include +#include "rocksdb/comparator.h" +#include "rocksdb/slice.h" +#include "port/port.h" + +namespace rocksdb { + +struct ComparatorJniCallbackOptions { + // Use adaptive mutex, which spins in the user space before resorting + // to kernel. This could reduce context switch when the mutex is not + // heavily contended. However, if the mutex is hot, we could end up + // wasting spin time. 
+ // Default: false + bool use_adaptive_mutex; + + ComparatorJniCallbackOptions() : use_adaptive_mutex(false) { + } +}; + +/** + * This class acts as a bridge between C++ + * and Java. The methods in this class will be + * called back from the RocksDB storage engine (C++) + * we then callback to the appropriate Java method + * this enables Comparators to be implemented in Java. + * + * The design of this Comparator caches the Java Slice + * objects that are used in the compare and findShortestSeparator + * method callbacks. Instead of creating new objects for each callback + * of those functions, by reuse via setHandle we are a lot + * faster; Unfortunately this means that we have to + * introduce independent locking in regions of each of those methods + * via the mutexs mtx_compare and mtx_findShortestSeparator respectively + */ +class BaseComparatorJniCallback : public Comparator { + public: + BaseComparatorJniCallback( + JNIEnv* env, jobject jComparator, + const ComparatorJniCallbackOptions* copt); + virtual ~BaseComparatorJniCallback(); + virtual const char* Name() const; + virtual int Compare(const Slice& a, const Slice& b) const; + virtual void FindShortestSeparator( + std::string* start, const Slice& limit) const; + virtual void FindShortSuccessor(std::string* key) const; + + private: + // used for synchronisation in compare method + port::Mutex* mtx_compare; + // used for synchronisation in findShortestSeparator method + port::Mutex* mtx_findShortestSeparator; + JavaVM* m_jvm; + jobject m_jComparator; + std::string m_name; + jmethodID m_jCompareMethodId; + jmethodID m_jFindShortestSeparatorMethodId; + jmethodID m_jFindShortSuccessorMethodId; + + protected: + JNIEnv* getJniEnv() const; + jobject m_jSliceA; + jobject m_jSliceB; + jobject m_jSliceLimit; +}; + +class ComparatorJniCallback : public BaseComparatorJniCallback { + public: + ComparatorJniCallback( + JNIEnv* env, jobject jComparator, + const ComparatorJniCallbackOptions* copt); + 
~ComparatorJniCallback(); +}; + +class DirectComparatorJniCallback : public BaseComparatorJniCallback { + public: + DirectComparatorJniCallback( + JNIEnv* env, jobject jComparator, + const ComparatorJniCallbackOptions* copt); + ~DirectComparatorJniCallback(); +}; +} // namespace rocksdb + +#endif // JAVA_ROCKSJNI_COMPARATORJNICALLBACK_H_ diff --git a/java/rocksjni/env.cc b/java/rocksjni/env.cc index 3aed9f5a0..c6c58e144 100644 --- a/java/rocksjni/env.cc +++ b/java/rocksjni/env.cc @@ -15,7 +15,7 @@ * Signature: ()J */ jlong Java_org_rocksdb_RocksEnv_getDefaultEnvInternal( - JNIEnv* env, jclass jclass) { + JNIEnv* env, jclass jclazz) { return reinterpret_cast(rocksdb::Env::Default()); } diff --git a/java/rocksjni/filter.cc b/java/rocksjni/filter.cc index 572b4a66d..2ce17d499 100644 --- a/java/rocksjni/filter.cc +++ b/java/rocksjni/filter.cc @@ -18,13 +18,18 @@ /* * Class: org_rocksdb_BloomFilter - * Method: createNewFilter0 - * Signature: (I)V + * Method: createBloomFilter + * Signature: (IZ)V */ -void Java_org_rocksdb_BloomFilter_createNewFilter0( - JNIEnv* env, jobject jobj, jint bits_per_key) { - const rocksdb::FilterPolicy* fp = rocksdb::NewBloomFilterPolicy(bits_per_key); - rocksdb::FilterJni::setHandle(env, jobj, fp); +void Java_org_rocksdb_BloomFilter_createNewBloomFilter( + JNIEnv* env, jobject jobj, jint bits_per_key, + jboolean use_block_base_builder) { + rocksdb::FilterPolicy* fp = const_cast( + rocksdb::NewBloomFilterPolicy(bits_per_key, use_block_base_builder)); + std::shared_ptr *pFilterPolicy = + new std::shared_ptr; + *pFilterPolicy = std::shared_ptr(fp); + rocksdb::FilterJni::setHandle(env, jobj, pFilterPolicy); } /* @@ -33,6 +38,9 @@ void Java_org_rocksdb_BloomFilter_createNewFilter0( * Signature: (J)V */ void Java_org_rocksdb_Filter_disposeInternal( - JNIEnv* env, jobject jobj, jlong handle) { - delete reinterpret_cast(handle); + JNIEnv* env, jobject jobj, jlong jhandle) { + + std::shared_ptr *handle = + reinterpret_cast *>(jhandle); + 
handle->reset(); } diff --git a/java/rocksjni/iterator.cc b/java/rocksjni/iterator.cc index 84b0b3133..e9eb0bb37 100644 --- a/java/rocksjni/iterator.cc +++ b/java/rocksjni/iterator.cc @@ -14,6 +14,17 @@ #include "rocksjni/portal.h" #include "rocksdb/iterator.h" +/* + * Class: org_rocksdb_RocksIterator + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_RocksIterator_disposeInternal( + JNIEnv* env, jobject jobj, jlong handle) { + auto it = reinterpret_cast(handle); + delete it; +} + /* * Class: org_rocksdb_RocksIterator * Method: isValid0 @@ -36,7 +47,7 @@ void Java_org_rocksdb_RocksIterator_seekToFirst0( /* * Class: org_rocksdb_RocksIterator - * Method: seekToFirst0 + * Method: seekToLast0 * Signature: (J)V */ void Java_org_rocksdb_RocksIterator_seekToLast0( @@ -46,7 +57,7 @@ void Java_org_rocksdb_RocksIterator_seekToLast0( /* * Class: org_rocksdb_RocksIterator - * Method: seekToLast0 + * Method: next0 * Signature: (J)V */ void Java_org_rocksdb_RocksIterator_next0( @@ -56,7 +67,7 @@ void Java_org_rocksdb_RocksIterator_next0( /* * Class: org_rocksdb_RocksIterator - * Method: next0 + * Method: prev0 * Signature: (J)V */ void Java_org_rocksdb_RocksIterator_prev0( @@ -66,42 +77,8 @@ void Java_org_rocksdb_RocksIterator_prev0( /* * Class: org_rocksdb_RocksIterator - * Method: prev0 - * Signature: (J)V - */ -jbyteArray Java_org_rocksdb_RocksIterator_key0( - JNIEnv* env, jobject jobj, jlong handle) { - auto it = reinterpret_cast(handle); - rocksdb::Slice key_slice = it->key(); - - jbyteArray jkey = env->NewByteArray(key_slice.size()); - env->SetByteArrayRegion( - jkey, 0, key_slice.size(), - reinterpret_cast(key_slice.data())); - return jkey; -} - -/* - * Class: org_rocksdb_RocksIterator - * Method: key0 - * Signature: (J)[B - */ -jbyteArray Java_org_rocksdb_RocksIterator_value0( - JNIEnv* env, jobject jobj, jlong handle) { - auto it = reinterpret_cast(handle); - rocksdb::Slice value_slice = it->value(); - - jbyteArray jvalue = 
env->NewByteArray(value_slice.size()); - env->SetByteArrayRegion( - jvalue, 0, value_slice.size(), - reinterpret_cast(value_slice.data())); - return jvalue; -} - -/* - * Class: org_rocksdb_RocksIterator - * Method: value0 - * Signature: (J)[B + * Method: seek0 + * Signature: (J[BI)V */ void Java_org_rocksdb_RocksIterator_seek0( JNIEnv* env, jobject jobj, jlong handle, @@ -118,8 +95,8 @@ void Java_org_rocksdb_RocksIterator_seek0( /* * Class: org_rocksdb_RocksIterator - * Method: seek0 - * Signature: (J[BI)V + * Method: status0 + * Signature: (J)V */ void Java_org_rocksdb_RocksIterator_status0( JNIEnv* env, jobject jobj, jlong handle) { @@ -135,11 +112,33 @@ void Java_org_rocksdb_RocksIterator_status0( /* * Class: org_rocksdb_RocksIterator - * Method: disposeInternal - * Signature: (J)V + * Method: key0 + * Signature: (J)[B */ -void Java_org_rocksdb_RocksIterator_disposeInternal( +jbyteArray Java_org_rocksdb_RocksIterator_key0( JNIEnv* env, jobject jobj, jlong handle) { auto it = reinterpret_cast(handle); - delete it; + rocksdb::Slice key_slice = it->key(); + + jbyteArray jkey = env->NewByteArray(static_cast(key_slice.size())); + env->SetByteArrayRegion(jkey, 0, static_cast(key_slice.size()), + reinterpret_cast(key_slice.data())); + return jkey; +} + +/* + * Class: org_rocksdb_RocksIterator + * Method: value0 + * Signature: (J)[B + */ +jbyteArray Java_org_rocksdb_RocksIterator_value0( + JNIEnv* env, jobject jobj, jlong handle) { + auto it = reinterpret_cast(handle); + rocksdb::Slice value_slice = it->value(); + + jbyteArray jkeyValue = + env->NewByteArray(static_cast(value_slice.size())); + env->SetByteArrayRegion(jkeyValue, 0, static_cast(value_slice.size()), + reinterpret_cast(value_slice.data())); + return jkeyValue; } diff --git a/java/rocksjni/memtablejni.cc b/java/rocksjni/memtablejni.cc index a0d50f5f5..fe83885c2 100644 --- a/java/rocksjni/memtablejni.cc +++ b/java/rocksjni/memtablejni.cc @@ -5,6 +5,7 @@ // // This file implements the "bridge" between Java and 
C++ for MemTables. +#include "rocksjni/portal.h" #include "include/org_rocksdb_HashSkipListMemTableConfig.h" #include "include/org_rocksdb_HashLinkedListMemTableConfig.h" #include "include/org_rocksdb_VectorMemTableConfig.h" @@ -19,21 +20,41 @@ jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle( JNIEnv* env, jobject jobj, jlong jbucket_count, jint jheight, jint jbranching_factor) { - return reinterpret_cast(rocksdb::NewHashSkipListRepFactory( - static_cast(jbucket_count), - static_cast(jheight), - static_cast(jbranching_factor))); + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jbucket_count); + if (s.ok()) { + return reinterpret_cast(rocksdb::NewHashSkipListRepFactory( + static_cast(jbucket_count), + static_cast(jheight), + static_cast(jbranching_factor))); + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return 0; } /* * Class: org_rocksdb_HashLinkedListMemTableConfig * Method: newMemTableFactoryHandle - * Signature: (J)J + * Signature: (JJIZI)J */ jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle( - JNIEnv* env, jobject jobj, jlong jbucket_count) { - return reinterpret_cast(rocksdb::NewHashLinkListRepFactory( - static_cast(jbucket_count))); + JNIEnv* env, jobject jobj, jlong jbucket_count, jlong jhuge_page_tlb_size, + jint jbucket_entries_logging_threshold, + jboolean jif_log_bucket_dist_when_flash, jint jthreshold_use_skiplist) { + rocksdb::Status statusBucketCount = + rocksdb::check_if_jlong_fits_size_t(jbucket_count); + rocksdb::Status statusHugePageTlb = + rocksdb::check_if_jlong_fits_size_t(jhuge_page_tlb_size); + if (statusBucketCount.ok() && statusHugePageTlb.ok()) { + return reinterpret_cast(rocksdb::NewHashLinkListRepFactory( + static_cast(jbucket_count), + static_cast(jhuge_page_tlb_size), + static_cast(jbucket_entries_logging_threshold), + static_cast(jif_log_bucket_dist_when_flash), + static_cast(jthreshold_use_skiplist))); + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, + 
!statusBucketCount.ok()?statusBucketCount:statusHugePageTlb); + return 0; } /* @@ -43,16 +64,27 @@ jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle( */ jlong Java_org_rocksdb_VectorMemTableConfig_newMemTableFactoryHandle( JNIEnv* env, jobject jobj, jlong jreserved_size) { - return reinterpret_cast(new rocksdb::VectorRepFactory( - static_cast(jreserved_size))); + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jreserved_size); + if (s.ok()) { + return reinterpret_cast(new rocksdb::VectorRepFactory( + static_cast(jreserved_size))); + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return 0; } /* * Class: org_rocksdb_SkipListMemTableConfig * Method: newMemTableFactoryHandle0 - * Signature: ()J + * Signature: (J)J */ jlong Java_org_rocksdb_SkipListMemTableConfig_newMemTableFactoryHandle0( - JNIEnv* env, jobject jobj) { - return reinterpret_cast(new rocksdb::SkipListFactory()); + JNIEnv* env, jobject jobj, jlong jlookahead) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jlookahead); + if (s.ok()) { + return reinterpret_cast(new rocksdb::SkipListFactory( + static_cast(jlookahead))); + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return 0; } diff --git a/java/rocksjni/merge_operator.cc b/java/rocksjni/merge_operator.cc new file mode 100644 index 000000000..68fe9b635 --- /dev/null +++ b/java/rocksjni/merge_operator.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2014, Vlad Balan (vlad.gm@gmail.com). All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ +// for rocksdb::MergeOperator. 
+ +#include +#include +#include +#include +#include + +#include "include/org_rocksdb_StringAppendOperator.h" +#include "rocksjni/portal.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/statistics.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/table.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/merge_operator.h" +#include "utilities/merge_operators.h" + +/* + * Class: org_rocksdb_StringAppendOperator + * Method: newMergeOperatorHandle + * Signature: ()J + */ +jlong Java_org_rocksdb_StringAppendOperator_newMergeOperatorHandleImpl +(JNIEnv* env, jobject jobj) { + std::shared_ptr *op = + new std::shared_ptr(); + *op = rocksdb::MergeOperators::CreateFromStringId("stringappend"); + return reinterpret_cast(op); +} diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index da420c78f..9f0875b32 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -8,30 +8,56 @@ #include #include #include -#include +#include #include #include "include/org_rocksdb_Options.h" +#include "include/org_rocksdb_DBOptions.h" +#include "include/org_rocksdb_ColumnFamilyOptions.h" #include "include/org_rocksdb_WriteOptions.h" #include "include/org_rocksdb_ReadOptions.h" +#include "include/org_rocksdb_ComparatorOptions.h" +#include "include/org_rocksdb_FlushOptions.h" + +#include "rocksjni/comparatorjnicallback.h" #include "rocksjni/portal.h" + #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/statistics.h" #include "rocksdb/memtablerep.h" #include "rocksdb/table.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/rate_limiter.h" +#include "rocksdb/comparator.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/utilities/convenience.h" +#include "utilities/merge_operators.h" /* * Class: org_rocksdb_Options * Method: newOptions * Signature: ()V */ -void Java_org_rocksdb_Options_newOptions(JNIEnv* env, jobject jobj) { +void Java_org_rocksdb_Options_newOptions__(JNIEnv* env, jobject jobj) { 
rocksdb::Options* op = new rocksdb::Options(); rocksdb::OptionsJni::setHandle(env, jobj, op); } +/* + * Class: org_rocksdb_Options + * Method: newOptions + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_newOptions__JJ(JNIEnv* env, jobject jobj, + jlong jdboptions, jlong jcfoptions) { + auto dbOpt = reinterpret_cast(jdboptions); + auto cfOpt = reinterpret_cast( + jcfoptions); + rocksdb::Options* op = new rocksdb::Options(*dbOpt, *cfOpt); + rocksdb::OptionsJni::setHandle(env, jobj, op); +} + /* * Class: org_rocksdb_Options * Method: disposeInternal @@ -42,6 +68,17 @@ void Java_org_rocksdb_Options_disposeInternal( delete reinterpret_cast(handle); } +/* + * Class: org_rocksdb_Options + * Method: setIncreaseParallelism + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setIncreaseParallelism( + JNIEnv * evnv, jobject jobj, jlong jhandle, jint totalThreads) { + reinterpret_cast + (jhandle)->IncreaseParallelism(static_cast(totalThreads)); +} + /* * Class: org_rocksdb_Options * Method: setCreateIfMissing @@ -62,6 +99,84 @@ jboolean Java_org_rocksdb_Options_createIfMissing( return reinterpret_cast(jhandle)->create_if_missing; } +/* + * Class: org_rocksdb_Options + * Method: setCreateMissingColumnFamilies + * Signature: (JZ)V + */ +void Java_org_rocksdb_Options_setCreateMissingColumnFamilies( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + reinterpret_cast + (jhandle)->create_missing_column_families = flag; +} + +/* + * Class: org_rocksdb_Options + * Method: createMissingColumnFamilies + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_Options_createMissingColumnFamilies( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast + (jhandle)->create_missing_column_families; +} + +/* + * Class: org_rocksdb_Options + * Method: setComparatorHandle + * Signature: (JI)V + */ +void Java_org_rocksdb_Options_setComparatorHandle__JI( + JNIEnv* env, jobject jobj, jlong jhandle, jint builtinComparator) { + switch (builtinComparator) { + case 
1: + reinterpret_cast(jhandle)->comparator = + rocksdb::ReverseBytewiseComparator(); + break; + default: + reinterpret_cast(jhandle)->comparator = + rocksdb::BytewiseComparator(); + break; + } +} + +/* + * Class: org_rocksdb_Options + * Method: setComparatorHandle + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setComparatorHandle__JJ( + JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jcomparator_handle) { + reinterpret_cast(jopt_handle)->comparator = + reinterpret_cast(jcomparator_handle); +} + +/* + * Class: org_rocksdb_Options + * Method: setMergeOperatorName + * Signature: (JJjava/lang/String)V + */ +void Java_org_rocksdb_Options_setMergeOperatorName( + JNIEnv* env, jobject jobj, jlong jhandle, jstring jop_name) { + auto options = reinterpret_cast(jhandle); + const char* op_name = env->GetStringUTFChars(jop_name, 0); + options->merge_operator = rocksdb::MergeOperators::CreateFromStringId( + op_name); + env->ReleaseStringUTFChars(jop_name, op_name); +} + +/* + * Class: org_rocksdb_Options + * Method: setMergeOperator + * Signature: (JJjava/lang/String)V + */ +void Java_org_rocksdb_Options_setMergeOperator( + JNIEnv* env, jobject jobj, jlong jhandle, jlong mergeOperatorHandle) { + reinterpret_cast(jhandle)->merge_operator = + *(reinterpret_cast*> + (mergeOperatorHandle)); +} + /* * Class: org_rocksdb_Options * Method: setWriteBufferSize @@ -69,11 +184,15 @@ jboolean Java_org_rocksdb_Options_createIfMissing( */ void Java_org_rocksdb_Options_setWriteBufferSize( JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_buffer_size) { - reinterpret_cast(jhandle)->write_buffer_size = - static_cast(jwrite_buffer_size); + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jwrite_buffer_size); + if (s.ok()) { + reinterpret_cast(jhandle)->write_buffer_size = + jwrite_buffer_size; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } } - /* * Class: org_rocksdb_Options * Method: writeBufferSize @@ -169,6 +288,40 @@ void 
Java_org_rocksdb_Options_setParanoidChecks( static_cast(paranoid_checks); } +/* + * Class: org_rocksdb_Options + * Method: setEnv + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setEnv( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jenv) { + reinterpret_cast(jhandle)->env = + reinterpret_cast(jenv); +} + +/* + * Class: org_rocksdb_Options + * Method: setMaxTotalWalSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setMaxTotalWalSize( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong jmax_total_wal_size) { + reinterpret_cast(jhandle)->max_total_wal_size = + static_cast(jmax_total_wal_size); +} + +/* + * Class: org_rocksdb_Options + * Method: maxTotalWalSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_maxTotalWalSize( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + max_total_wal_size; +} + /* * Class: org_rocksdb_Options * Method: maxOpenFiles @@ -361,8 +514,13 @@ jlong Java_org_rocksdb_Options_maxLogFileSize( */ void Java_org_rocksdb_Options_setMaxLogFileSize( JNIEnv* env, jobject jobj, jlong jhandle, jlong max_log_file_size) { - reinterpret_cast(jhandle)->max_log_file_size = - static_cast(max_log_file_size); + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(max_log_file_size); + if (s.ok()) { + reinterpret_cast(jhandle)->max_log_file_size = + max_log_file_size; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } } /* @@ -382,8 +540,14 @@ jlong Java_org_rocksdb_Options_logFileTimeToRoll( */ void Java_org_rocksdb_Options_setLogFileTimeToRoll( JNIEnv* env, jobject jobj, jlong jhandle, jlong log_file_time_to_roll) { - reinterpret_cast(jhandle)->log_file_time_to_roll = - static_cast(log_file_time_to_roll); + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t( + log_file_time_to_roll); + if (s.ok()) { + reinterpret_cast(jhandle)->log_file_time_to_roll = + log_file_time_to_roll; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } } /* @@ -403,8 +567,13 @@ 
jlong Java_org_rocksdb_Options_keepLogFileNum( */ void Java_org_rocksdb_Options_setKeepLogFileNum( JNIEnv* env, jobject jobj, jlong jhandle, jlong keep_log_file_num) { - reinterpret_cast(jhandle)->keep_log_file_num = - static_cast(keep_log_file_num); + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(keep_log_file_num); + if (s.ok()) { + reinterpret_cast(jhandle)->keep_log_file_num = + keep_log_file_num; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } } /* @@ -459,6 +628,39 @@ void Java_org_rocksdb_Options_setMemTableFactory( reinterpret_cast(jfactory_handle)); } +/* + * Class: org_rocksdb_Options + * Method: setRateLimiter + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setRateLimiter( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jrate_limiter_handle) { + reinterpret_cast(jhandle)->rate_limiter.reset( + reinterpret_cast(jrate_limiter_handle)); +} + +/* + * Class: org_rocksdb_Options + * Method: setInfoLogLevel + * Signature: (JB)V + */ +void Java_org_rocksdb_Options_setInfoLogLevel( + JNIEnv* env, jobject jobj, jlong jhandle, jbyte jlog_level) { + reinterpret_cast(jhandle)->info_log_level = + static_cast(jlog_level); +} + +/* + * Class: org_rocksdb_Options + * Method: infoLogLevel + * Signature: (J)B + */ +jbyte Java_org_rocksdb_Options_infoLogLevel( + JNIEnv* env, jobject jobj, jlong jhandle) { + return static_cast( + reinterpret_cast(jhandle)->info_log_level); +} + /* * Class: org_rocksdb_Options * Method: tableCacheNumshardbits @@ -509,7 +711,8 @@ void Java_org_rocksdb_Options_setTableCacheRemoveScanCountLimit( void Java_org_rocksdb_Options_useFixedLengthPrefixExtractor( JNIEnv* env, jobject jobj, jlong jhandle, jint jprefix_length) { reinterpret_cast(jhandle)->prefix_extractor.reset( - rocksdb::NewFixedPrefixTransform(static_cast(jprefix_length))); + rocksdb::NewFixedPrefixTransform( + static_cast(jprefix_length))); } /* @@ -572,8 +775,13 @@ jlong Java_org_rocksdb_Options_manifestPreallocationSize( */ void 
Java_org_rocksdb_Options_setManifestPreallocationSize( JNIEnv* env, jobject jobj, jlong jhandle, jlong preallocation_size) { - reinterpret_cast(jhandle)->manifest_preallocation_size = - static_cast(preallocation_size); + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(preallocation_size); + if (s.ok()) { + reinterpret_cast(jhandle)->manifest_preallocation_size = + preallocation_size; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } } /* @@ -776,27 +984,6 @@ void Java_org_rocksdb_Options_setBytesPerSync( static_cast(bytes_per_sync); } -/* - * Class: org_rocksdb_Options - * Method: allowThreadLocal - * Signature: (J)Z - */ -jboolean Java_org_rocksdb_Options_allowThreadLocal( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)->allow_thread_local; -} - -/* - * Class: org_rocksdb_Options - * Method: setAllowThreadLocal - * Signature: (JZ)V - */ -void Java_org_rocksdb_Options_setAllowThreadLocal( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_thread_local) { - reinterpret_cast(jhandle)->allow_thread_local = - static_cast(allow_thread_local); -} - /* * Method: tableFactoryName * Signature: (J)Ljava/lang/String @@ -998,9 +1185,9 @@ void Java_org_rocksdb_Options_setMaxMemCompactionLevel( /* * Class: org_rocksdb_Options * Method: targetFileSizeBase - * Signature: (J)I + * Signature: (J)J */ -jint Java_org_rocksdb_Options_targetFileSizeBase( +jlong Java_org_rocksdb_Options_targetFileSizeBase( JNIEnv* env, jobject jobj, jlong jhandle) { return reinterpret_cast(jhandle)->target_file_size_base; } @@ -1008,13 +1195,13 @@ jint Java_org_rocksdb_Options_targetFileSizeBase( /* * Class: org_rocksdb_Options * Method: setTargetFileSizeBase - * Signature: (JI)V + * Signature: (JJ)V */ void Java_org_rocksdb_Options_setTargetFileSizeBase( JNIEnv* env, jobject jobj, jlong jhandle, - jint jtarget_file_size_base) { + jlong jtarget_file_size_base) { reinterpret_cast(jhandle)->target_file_size_base = - 
static_cast(jtarget_file_size_base); + static_cast(jtarget_file_size_base); } /* @@ -1244,8 +1431,13 @@ jlong Java_org_rocksdb_Options_arenaBlockSize( */ void Java_org_rocksdb_Options_setArenaBlockSize( JNIEnv* env, jobject jobj, jlong jhandle, jlong jarena_block_size) { - reinterpret_cast(jhandle)->arena_block_size = - static_cast(jarena_block_size); + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jarena_block_size); + if (s.ok()) { + reinterpret_cast(jhandle)->arena_block_size = + jarena_block_size; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } } /* @@ -1408,9 +1600,14 @@ jlong Java_org_rocksdb_Options_inplaceUpdateNumLocks( void Java_org_rocksdb_Options_setInplaceUpdateNumLocks( JNIEnv* env, jobject jobj, jlong jhandle, jlong jinplace_update_num_locks) { - reinterpret_cast( - jhandle)->inplace_update_num_locks = - static_cast(jinplace_update_num_locks); + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t( + jinplace_update_num_locks); + if (s.ok()) { + reinterpret_cast(jhandle)->inplace_update_num_locks = + jinplace_update_num_locks; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } } /* @@ -1500,8 +1697,14 @@ jlong Java_org_rocksdb_Options_maxSuccessiveMerges( void Java_org_rocksdb_Options_setMaxSuccessiveMerges( JNIEnv* env, jobject jobj, jlong jhandle, jlong jmax_successive_merges) { - reinterpret_cast(jhandle)->max_successive_merges = - static_cast(jmax_successive_merges); + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t( + jmax_successive_merges); + if (s.ok()) { + reinterpret_cast(jhandle)->max_successive_merges = + jmax_successive_merges; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } } /* @@ -1528,150 +1731,1992 @@ void Java_org_rocksdb_Options_setMinPartialMergeOperands( static_cast(jmin_partial_merge_operands); } +/* + * Method: optimizeForPointLookup + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_optimizeForPointLookup( + JNIEnv* env, jobject jobj, jlong jhandle, + 
jlong block_cache_size_mb) { + reinterpret_cast(jhandle)-> + OptimizeForPointLookup(block_cache_size_mb); +} + +/* + * Method: optimizeLevelStyleCompaction + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_optimizeLevelStyleCompaction( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong memtable_memory_budget) { + reinterpret_cast(jhandle)-> + OptimizeLevelStyleCompaction(memtable_memory_budget); +} + +/* + * Class: org_rocksdb_Options + * Method: optimizeUniversalStyleCompaction + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_optimizeUniversalStyleCompaction( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong memtable_memory_budget) { + reinterpret_cast(jhandle)-> + OptimizeUniversalStyleCompaction(memtable_memory_budget); +} + +/* + * Class: org_rocksdb_Options + * Method: prepareForBulkLoad + * Signature: (J)V + */ +void Java_org_rocksdb_Options_prepareForBulkLoad( + JNIEnv* env, jobject jobj, jlong jhandle) { + reinterpret_cast(jhandle)-> + PrepareForBulkLoad(); +} + ////////////////////////////////////////////////////////////////////////////// -// WriteOptions +// rocksdb::ColumnFamilyOptions /* - * Class: org_rocksdb_WriteOptions - * Method: newWriteOptions + * Class: org_rocksdb_ColumnFamilyOptions + * Method: newColumnFamilyOptions * Signature: ()V */ -void Java_org_rocksdb_WriteOptions_newWriteOptions( - JNIEnv* env, jobject jwrite_options) { - rocksdb::WriteOptions* op = new rocksdb::WriteOptions(); - rocksdb::WriteOptionsJni::setHandle(env, jwrite_options, op); +void Java_org_rocksdb_ColumnFamilyOptions_newColumnFamilyOptions( + JNIEnv* env, jobject jobj) { + rocksdb::ColumnFamilyOptions* op = new rocksdb::ColumnFamilyOptions(); + rocksdb::ColumnFamilyOptionsJni::setHandle(env, jobj, op); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: getColumnFamilyOptionsFromProps + * Signature: (Ljava/util/String;)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_getColumnFamilyOptionsFromProps( + JNIEnv* env, jclass jclazz, jstring 
jopt_string) { + jlong ret_value = 0; + rocksdb::ColumnFamilyOptions* cf_options = + new rocksdb::ColumnFamilyOptions(); + const char* opt_string = env->GetStringUTFChars(jopt_string, 0); + rocksdb::Status status = rocksdb::GetColumnFamilyOptionsFromString( + rocksdb::ColumnFamilyOptions(), opt_string, cf_options); + env->ReleaseStringUTFChars(jopt_string, opt_string); + // Check if ColumnFamilyOptions creation was possible. + if (status.ok()) { + ret_value = reinterpret_cast(cf_options); + } else { + // if operation failed the ColumnFamilyOptions need to be deleted + // again to prevent a memory leak. + delete cf_options; + } + return ret_value; } /* - * Class: org_rocksdb_WriteOptions + * Class: org_rocksdb_ColumnFamilyOptions * Method: disposeInternal - * Signature: ()V + * Signature: (J)V */ -void Java_org_rocksdb_WriteOptions_disposeInternal( - JNIEnv* env, jobject jwrite_options, jlong jhandle) { - auto write_options = reinterpret_cast(jhandle); - delete write_options; - - rocksdb::WriteOptionsJni::setHandle(env, jwrite_options, nullptr); +void Java_org_rocksdb_ColumnFamilyOptions_disposeInternal( + JNIEnv* env, jobject jobj, jlong handle) { + delete reinterpret_cast(handle); } /* - * Class: org_rocksdb_WriteOptions - * Method: setSync - * Signature: (JZ)V + * Class: org_rocksdb_ColumnFamilyOptions + * Method: optimizeForPointLookup + * Signature: (JJ)V */ -void Java_org_rocksdb_WriteOptions_setSync( - JNIEnv* env, jobject jwrite_options, jlong jhandle, jboolean jflag) { - reinterpret_cast(jhandle)->sync = jflag; +void Java_org_rocksdb_ColumnFamilyOptions_optimizeForPointLookup( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong block_cache_size_mb) { + reinterpret_cast(jhandle)-> + OptimizeForPointLookup(block_cache_size_mb); } /* - * Class: org_rocksdb_WriteOptions - * Method: sync - * Signature: (J)Z + * Class: org_rocksdb_ColumnFamilyOptions + * Method: optimizeLevelStyleCompaction + * Signature: (JJ)V */ -jboolean Java_org_rocksdb_WriteOptions_sync( - 
JNIEnv* env, jobject jwrite_options, jlong jhandle) { - return reinterpret_cast(jhandle)->sync; +void Java_org_rocksdb_ColumnFamilyOptions_optimizeLevelStyleCompaction( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong memtable_memory_budget) { + reinterpret_cast(jhandle)-> + OptimizeLevelStyleCompaction(memtable_memory_budget); } /* - * Class: org_rocksdb_WriteOptions - * Method: setDisableWAL - * Signature: (JZ)V + * Class: org_rocksdb_ColumnFamilyOptions + * Method: optimizeUniversalStyleCompaction + * Signature: (JJ)V */ -void Java_org_rocksdb_WriteOptions_setDisableWAL( - JNIEnv* env, jobject jwrite_options, jlong jhandle, jboolean jflag) { - reinterpret_cast(jhandle)->disableWAL = jflag; +void Java_org_rocksdb_ColumnFamilyOptions_optimizeUniversalStyleCompaction( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong memtable_memory_budget) { + reinterpret_cast(jhandle)-> + OptimizeUniversalStyleCompaction(memtable_memory_budget); } /* - * Class: org_rocksdb_WriteOptions - * Method: disableWAL - * Signature: (J)Z + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setComparatorHandle + * Signature: (JI)V */ -jboolean Java_org_rocksdb_WriteOptions_disableWAL( - JNIEnv* env, jobject jwrite_options, jlong jhandle) { - return reinterpret_cast(jhandle)->disableWAL; +void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JI( + JNIEnv* env, jobject jobj, jlong jhandle, jint builtinComparator) { + switch (builtinComparator) { + case 1: + reinterpret_cast(jhandle)->comparator = + rocksdb::ReverseBytewiseComparator(); + break; + default: + reinterpret_cast(jhandle)->comparator = + rocksdb::BytewiseComparator(); + break; + } } -///////////////////////////////////////////////////////////////////// -// rocksdb::ReadOptions - /* - * Class: org_rocksdb_ReadOptions - * Method: newReadOptions - * Signature: ()V + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setComparatorHandle + * Signature: (JJ)V */ -void Java_org_rocksdb_ReadOptions_newReadOptions( - 
JNIEnv* env, jobject jobj) { - auto read_opt = new rocksdb::ReadOptions(); - rocksdb::ReadOptionsJni::setHandle(env, jobj, read_opt); +void Java_org_rocksdb_ColumnFamilyOptions_setComparatorHandle__JJ( + JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jcomparator_handle) { + reinterpret_cast(jopt_handle)->comparator = + reinterpret_cast(jcomparator_handle); } /* - * Class: org_rocksdb_ReadOptions - * Method: disposeInternal - * Signature: (J)V + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMergeOperatorName + * Signature: (JJjava/lang/String)V */ -void Java_org_rocksdb_ReadOptions_disposeInternal( - JNIEnv* env, jobject jobj, jlong jhandle) { - delete reinterpret_cast(jhandle); - rocksdb::ReadOptionsJni::setHandle(env, jobj, nullptr); +void Java_org_rocksdb_ColumnFamilyOptions_setMergeOperatorName( + JNIEnv* env, jobject jobj, jlong jhandle, jstring jop_name) { + auto options = reinterpret_cast(jhandle); + const char* op_name = env->GetStringUTFChars(jop_name, 0); + options->merge_operator = rocksdb::MergeOperators::CreateFromStringId( + op_name); + env->ReleaseStringUTFChars(jop_name, op_name); } /* - * Class: org_rocksdb_ReadOptions - * Method: verifyChecksums - * Signature: (J)Z + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMergeOperator + * Signature: (JJjava/lang/String)V */ -jboolean Java_org_rocksdb_ReadOptions_verifyChecksums( - JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast( - jhandle)->verify_checksums; +void Java_org_rocksdb_ColumnFamilyOptions_setMergeOperator( + JNIEnv* env, jobject jobj, jlong jhandle, jlong mergeOperatorHandle) { + reinterpret_cast(jhandle)->merge_operator = + *(reinterpret_cast*> + (mergeOperatorHandle)); } /* - * Class: org_rocksdb_ReadOptions - * Method: setVerifyChecksums - * Signature: (JZ)V + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setWriteBufferSize + * Signature: (JJ)I */ -void Java_org_rocksdb_ReadOptions_setVerifyChecksums( - JNIEnv* env, jobject jobj, jlong 
jhandle, - jboolean jverify_checksums) { - reinterpret_cast(jhandle)->verify_checksums = - static_cast(jverify_checksums); +void Java_org_rocksdb_ColumnFamilyOptions_setWriteBufferSize( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_buffer_size) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jwrite_buffer_size); + if (s.ok()) { + reinterpret_cast(jhandle)-> + write_buffer_size = jwrite_buffer_size; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } } /* - * Class: org_rocksdb_ReadOptions - * Method: fillCache - * Signature: (J)Z + * Class: org_rocksdb_ColumnFamilyOptions + * Method: writeBufferSize + * Signature: (J)J */ -jboolean Java_org_rocksdb_ReadOptions_fillCache( +jlong Java_org_rocksdb_ColumnFamilyOptions_writeBufferSize( JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)->fill_cache; + return reinterpret_cast(jhandle)-> + write_buffer_size; } /* - * Class: org_rocksdb_ReadOptions - * Method: setFillCache - * Signature: (JZ)V + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMaxWriteBufferNumber + * Signature: (JI)V */ -void Java_org_rocksdb_ReadOptions_setFillCache( - JNIEnv* env, jobject jobj, jlong jhandle, jboolean jfill_cache) { - reinterpret_cast(jhandle)->fill_cache = - static_cast(jfill_cache); +void Java_org_rocksdb_ColumnFamilyOptions_setMaxWriteBufferNumber( + JNIEnv* env, jobject jobj, jlong jhandle, jint jmax_write_buffer_number) { + reinterpret_cast(jhandle)-> + max_write_buffer_number = jmax_write_buffer_number; } /* - * Class: org_rocksdb_ReadOptions - * Method: tailing - * Signature: (J)Z + * Class: org_rocksdb_ColumnFamilyOptions + * Method: maxWriteBufferNumber + * Signature: (J)I */ -jboolean Java_org_rocksdb_ReadOptions_tailing( +jint Java_org_rocksdb_ColumnFamilyOptions_maxWriteBufferNumber( JNIEnv* env, jobject jobj, jlong jhandle) { - return reinterpret_cast(jhandle)->tailing; + return reinterpret_cast(jhandle)-> + max_write_buffer_number; +} + +/* + * Method: 
setMemTableFactory + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMemTableFactory( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jfactory_handle) { + reinterpret_cast(jhandle)-> + memtable_factory.reset( + reinterpret_cast(jfactory_handle)); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: memTableFactoryName + * Signature: (J)Ljava/lang/String + */ +jstring Java_org_rocksdb_ColumnFamilyOptions_memTableFactoryName( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto opt = reinterpret_cast(jhandle); + rocksdb::MemTableRepFactory* tf = opt->memtable_factory.get(); + + // Should never be nullptr. + // Default memtable factory is SkipListFactory + assert(tf); + + // temporarly fix for the historical typo + if (strcmp(tf->Name(), "HashLinkListRepFactory") == 0) { + return env->NewStringUTF("HashLinkedListRepFactory"); + } + + return env->NewStringUTF(tf->Name()); +} + +/* + * Method: useFixedLengthPrefixExtractor + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_useFixedLengthPrefixExtractor( + JNIEnv* env, jobject jobj, jlong jhandle, jint jprefix_length) { + reinterpret_cast(jhandle)-> + prefix_extractor.reset(rocksdb::NewFixedPrefixTransform( + static_cast(jprefix_length))); +} + +/* + * Method: setTableFactory + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setTableFactory( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jfactory_handle) { + reinterpret_cast(jhandle)-> + table_factory.reset(reinterpret_cast( + jfactory_handle)); +} + +/* + * Method: tableFactoryName + * Signature: (J)Ljava/lang/String + */ +jstring Java_org_rocksdb_ColumnFamilyOptions_tableFactoryName( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto opt = reinterpret_cast(jhandle); + rocksdb::TableFactory* tf = opt->table_factory.get(); + + // Should never be nullptr. 
+ // Default memtable factory is SkipListFactory + assert(tf); + + return env->NewStringUTF(tf->Name()); +} + + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: minWriteBufferNumberToMerge + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_minWriteBufferNumberToMerge( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->min_write_buffer_number_to_merge; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMinWriteBufferNumberToMerge + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMinWriteBufferNumberToMerge( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmin_write_buffer_number_to_merge) { + reinterpret_cast( + jhandle)->min_write_buffer_number_to_merge = + static_cast(jmin_write_buffer_number_to_merge); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setCompressionType + * Signature: (JB)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setCompressionType( + JNIEnv* env, jobject jobj, jlong jhandle, jbyte compression) { + reinterpret_cast(jhandle)-> + compression = static_cast(compression); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: compressionType + * Signature: (J)B + */ +jbyte Java_org_rocksdb_ColumnFamilyOptions_compressionType( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + compression; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setCompactionStyle + * Signature: (JB)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setCompactionStyle( + JNIEnv* env, jobject jobj, jlong jhandle, jbyte compaction_style) { + reinterpret_cast(jhandle)->compaction_style = + static_cast(compaction_style); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: compactionStyle + * Signature: (J)B + */ +jbyte Java_org_rocksdb_ColumnFamilyOptions_compactionStyle( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast + (jhandle)->compaction_style; +} + +/* + * 
Class: org_rocksdb_ColumnFamilyOptions + * Method: numLevels + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_numLevels( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->num_levels; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setNumLevels + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setNumLevels( + JNIEnv* env, jobject jobj, jlong jhandle, jint jnum_levels) { + reinterpret_cast(jhandle)->num_levels = + static_cast(jnum_levels); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: levelZeroFileNumCompactionTrigger + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroFileNumCompactionTrigger( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->level0_file_num_compaction_trigger; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setLevelZeroFileNumCompactionTrigger + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroFileNumCompactionTrigger( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jlevel0_file_num_compaction_trigger) { + reinterpret_cast( + jhandle)->level0_file_num_compaction_trigger = + static_cast(jlevel0_file_num_compaction_trigger); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: levelZeroSlowdownWritesTrigger + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroSlowdownWritesTrigger( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->level0_slowdown_writes_trigger; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setLevelSlowdownWritesTrigger + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroSlowdownWritesTrigger( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jlevel0_slowdown_writes_trigger) { + reinterpret_cast( + jhandle)->level0_slowdown_writes_trigger = + static_cast(jlevel0_slowdown_writes_trigger); +} + +/* + * Class: 
org_rocksdb_ColumnFamilyOptions + * Method: levelZeroStopWritesTrigger + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_levelZeroStopWritesTrigger( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->level0_stop_writes_trigger; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setLevelStopWritesTrigger + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setLevelZeroStopWritesTrigger( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jlevel0_stop_writes_trigger) { + reinterpret_cast(jhandle)-> + level0_stop_writes_trigger = static_cast( + jlevel0_stop_writes_trigger); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: maxMemCompactionLevel + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_maxMemCompactionLevel( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->max_mem_compaction_level; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMaxMemCompactionLevel + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMaxMemCompactionLevel( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmax_mem_compaction_level) { + reinterpret_cast(jhandle)-> + max_mem_compaction_level = static_cast(jmax_mem_compaction_level); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: targetFileSizeBase + * Signature: (J)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_targetFileSizeBase( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + target_file_size_base; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setTargetFileSizeBase + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setTargetFileSizeBase( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong jtarget_file_size_base) { + reinterpret_cast(jhandle)-> + target_file_size_base = static_cast(jtarget_file_size_base); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * 
Method: targetFileSizeMultiplier + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_targetFileSizeMultiplier( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->target_file_size_multiplier; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setTargetFileSizeMultiplier + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setTargetFileSizeMultiplier( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jtarget_file_size_multiplier) { + reinterpret_cast( + jhandle)->target_file_size_multiplier = + static_cast(jtarget_file_size_multiplier); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: maxBytesForLevelBase + * Signature: (J)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelBase( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->max_bytes_for_level_base; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMaxBytesForLevelBase + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelBase( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong jmax_bytes_for_level_base) { + reinterpret_cast( + jhandle)->max_bytes_for_level_base = + static_cast(jmax_bytes_for_level_base); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: maxBytesForLevelMultiplier + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_maxBytesForLevelMultiplier( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->max_bytes_for_level_multiplier; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMaxBytesForLevelMultiplier + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMaxBytesForLevelMultiplier( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmax_bytes_for_level_multiplier) { + reinterpret_cast( + jhandle)->max_bytes_for_level_multiplier = + static_cast(jmax_bytes_for_level_multiplier); +} + +/* + * Class: 
org_rocksdb_ColumnFamilyOptions + * Method: expandedCompactionFactor + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_expandedCompactionFactor( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->expanded_compaction_factor; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setExpandedCompactionFactor + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setExpandedCompactionFactor( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jexpanded_compaction_factor) { + reinterpret_cast( + jhandle)->expanded_compaction_factor = + static_cast(jexpanded_compaction_factor); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: sourceCompactionFactor + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_sourceCompactionFactor( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->source_compaction_factor; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setSourceCompactionFactor + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setSourceCompactionFactor( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jsource_compaction_factor) { + reinterpret_cast( + jhandle)->source_compaction_factor = + static_cast(jsource_compaction_factor); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: maxGrandparentOverlapFactor + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_maxGrandparentOverlapFactor( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->max_grandparent_overlap_factor; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMaxGrandparentOverlapFactor + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMaxGrandparentOverlapFactor( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmax_grandparent_overlap_factor) { + reinterpret_cast( + jhandle)->max_grandparent_overlap_factor = + 
static_cast(jmax_grandparent_overlap_factor); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: softRateLimit + * Signature: (J)D + */ +jdouble Java_org_rocksdb_ColumnFamilyOptions_softRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + soft_rate_limit; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setSoftRateLimit + * Signature: (JD)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setSoftRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle, jdouble jsoft_rate_limit) { + reinterpret_cast(jhandle)->soft_rate_limit = + static_cast(jsoft_rate_limit); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: hardRateLimit + * Signature: (J)D + */ +jdouble Java_org_rocksdb_ColumnFamilyOptions_hardRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + hard_rate_limit; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setHardRateLimit + * Signature: (JD)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setHardRateLimit( + JNIEnv* env, jobject jobj, jlong jhandle, jdouble jhard_rate_limit) { + reinterpret_cast(jhandle)->hard_rate_limit = + static_cast(jhard_rate_limit); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: rateLimitDelayMaxMilliseconds + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_rateLimitDelayMaxMilliseconds( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->rate_limit_delay_max_milliseconds; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setRateLimitDelayMaxMilliseconds + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setRateLimitDelayMaxMilliseconds( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jrate_limit_delay_max_milliseconds) { + reinterpret_cast( + jhandle)->rate_limit_delay_max_milliseconds = + static_cast(jrate_limit_delay_max_milliseconds); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * 
Method: arenaBlockSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_arenaBlockSize( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + arena_block_size; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setArenaBlockSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setArenaBlockSize( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jarena_block_size) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(jarena_block_size); + if (s.ok()) { + reinterpret_cast(jhandle)-> + arena_block_size = jarena_block_size; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: disableAutoCompactions + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ColumnFamilyOptions_disableAutoCompactions( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->disable_auto_compactions; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setDisableAutoCompactions + * Signature: (JZ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setDisableAutoCompactions( + JNIEnv* env, jobject jobj, jlong jhandle, + jboolean jdisable_auto_compactions) { + reinterpret_cast( + jhandle)->disable_auto_compactions = + static_cast(jdisable_auto_compactions); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: purgeRedundantKvsWhileFlush + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ColumnFamilyOptions_purgeRedundantKvsWhileFlush( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->purge_redundant_kvs_while_flush; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setPurgeRedundantKvsWhileFlush + * Signature: (JZ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setPurgeRedundantKvsWhileFlush( + JNIEnv* env, jobject jobj, jlong jhandle, + jboolean jpurge_redundant_kvs_while_flush) { + reinterpret_cast( + 
jhandle)->purge_redundant_kvs_while_flush = + static_cast(jpurge_redundant_kvs_while_flush); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: verifyChecksumsInCompaction + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ColumnFamilyOptions_verifyChecksumsInCompaction( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->verify_checksums_in_compaction; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setVerifyChecksumsInCompaction + * Signature: (JZ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setVerifyChecksumsInCompaction( + JNIEnv* env, jobject jobj, jlong jhandle, + jboolean jverify_checksums_in_compaction) { + reinterpret_cast( + jhandle)->verify_checksums_in_compaction = + static_cast(jverify_checksums_in_compaction); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: filterDeletes + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ColumnFamilyOptions_filterDeletes( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + filter_deletes; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setFilterDeletes + * Signature: (JZ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setFilterDeletes( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean jfilter_deletes) { + reinterpret_cast(jhandle)->filter_deletes = + static_cast(jfilter_deletes); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: maxSequentialSkipInIterations + * Signature: (J)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_maxSequentialSkipInIterations( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->max_sequential_skip_in_iterations; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMaxSequentialSkipInIterations + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMaxSequentialSkipInIterations( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong jmax_sequential_skip_in_iterations) { + 
reinterpret_cast( + jhandle)->max_sequential_skip_in_iterations = + static_cast(jmax_sequential_skip_in_iterations); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: inplaceUpdateSupport + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ColumnFamilyOptions_inplaceUpdateSupport( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->inplace_update_support; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setInplaceUpdateSupport + * Signature: (JZ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setInplaceUpdateSupport( + JNIEnv* env, jobject jobj, jlong jhandle, + jboolean jinplace_update_support) { + reinterpret_cast( + jhandle)->inplace_update_support = + static_cast(jinplace_update_support); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: inplaceUpdateNumLocks + * Signature: (J)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_inplaceUpdateNumLocks( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->inplace_update_num_locks; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setInplaceUpdateNumLocks + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setInplaceUpdateNumLocks( + JNIEnv* env, jobject jobj, jlong jhandle, + jlong jinplace_update_num_locks) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t( + jinplace_update_num_locks); + if (s.ok()) { + reinterpret_cast(jhandle)-> + inplace_update_num_locks = jinplace_update_num_locks; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: memtablePrefixBloomBits + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_memtablePrefixBloomBits( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->memtable_prefix_bloom_bits; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMemtablePrefixBloomBits + * Signature: (JI)V + */ +void 
Java_org_rocksdb_ColumnFamilyOptions_setMemtablePrefixBloomBits( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmemtable_prefix_bloom_bits) { + reinterpret_cast( + jhandle)->memtable_prefix_bloom_bits = + static_cast(jmemtable_prefix_bloom_bits); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: memtablePrefixBloomProbes + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_memtablePrefixBloomProbes( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->memtable_prefix_bloom_probes; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMemtablePrefixBloomProbes + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMemtablePrefixBloomProbes( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmemtable_prefix_bloom_probes) { + reinterpret_cast( + jhandle)->memtable_prefix_bloom_probes = + static_cast(jmemtable_prefix_bloom_probes); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: bloomLocality + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_bloomLocality( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + bloom_locality; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setBloomLocality + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setBloomLocality( + JNIEnv* env, jobject jobj, jlong jhandle, jint jbloom_locality) { + reinterpret_cast(jhandle)->bloom_locality = + static_cast(jbloom_locality); +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: maxSuccessiveMerges + * Signature: (J)J + */ +jlong Java_org_rocksdb_ColumnFamilyOptions_maxSuccessiveMerges( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + max_successive_merges; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMaxSuccessiveMerges + * Signature: (JJ)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMaxSuccessiveMerges( + JNIEnv* env, jobject 
jobj, jlong jhandle, + jlong jmax_successive_merges) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t( + jmax_successive_merges); + if (s.ok()) { + reinterpret_cast(jhandle)-> + max_successive_merges = jmax_successive_merges; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: minPartialMergeOperands + * Signature: (J)I + */ +jint Java_org_rocksdb_ColumnFamilyOptions_minPartialMergeOperands( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->min_partial_merge_operands; +} + +/* + * Class: org_rocksdb_ColumnFamilyOptions + * Method: setMinPartialMergeOperands + * Signature: (JI)V + */ +void Java_org_rocksdb_ColumnFamilyOptions_setMinPartialMergeOperands( + JNIEnv* env, jobject jobj, jlong jhandle, + jint jmin_partial_merge_operands) { + reinterpret_cast( + jhandle)->min_partial_merge_operands = + static_cast(jmin_partial_merge_operands); +} + +///////////////////////////////////////////////////////////////////// +// rocksdb::DBOptions + +/* + * Class: org_rocksdb_DBOptions + * Method: newDBOptions + * Signature: ()V + */ +void Java_org_rocksdb_DBOptions_newDBOptions(JNIEnv* env, + jobject jobj) { + rocksdb::DBOptions* dbop = new rocksdb::DBOptions(); + rocksdb::DBOptionsJni::setHandle(env, jobj, dbop); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: getDBOptionsFromProps + * Signature: (Ljava/util/String;)J + */ +jlong Java_org_rocksdb_DBOptions_getDBOptionsFromProps( + JNIEnv* env, jclass jclazz, jstring jopt_string) { + jlong ret_value = 0; + rocksdb::DBOptions* db_options = + new rocksdb::DBOptions(); + const char* opt_string = env->GetStringUTFChars(jopt_string, 0); + rocksdb::Status status = rocksdb::GetDBOptionsFromString( + rocksdb::DBOptions(), opt_string, db_options); + env->ReleaseStringUTFChars(jopt_string, opt_string); + // Check if DBOptions creation was possible. 
+ if (status.ok()) { + ret_value = reinterpret_cast(db_options); + } else { + // if operation failed the DBOptions need to be deleted + // again to prevent a memory leak. + delete db_options; + } + return ret_value; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_DBOptions_disposeInternal( + JNIEnv* env, jobject jobj, jlong handle) { + delete reinterpret_cast(handle); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setIncreaseParallelism + * Signature: (JI)V + */ +void Java_org_rocksdb_DBOptions_setIncreaseParallelism( + JNIEnv * env, jobject jobj, jlong jhandle, jint totalThreads) { + reinterpret_cast + (jhandle)->IncreaseParallelism(static_cast(totalThreads)); +} + + +/* + * Class: org_rocksdb_DBOptions + * Method: setCreateIfMissing + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setCreateIfMissing( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + reinterpret_cast(jhandle)-> + create_if_missing = flag; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: createIfMissing + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_createIfMissing( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->create_if_missing; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setCreateMissingColumnFamilies + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setCreateMissingColumnFamilies( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean flag) { + reinterpret_cast + (jhandle)->create_missing_column_families = flag; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: createMissingColumnFamilies + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_createMissingColumnFamilies( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast + (jhandle)->create_missing_column_families; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setErrorIfExists + * Signature: (JZ)V + */ +void 
Java_org_rocksdb_DBOptions_setErrorIfExists( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean error_if_exists) { + reinterpret_cast(jhandle)->error_if_exists = + static_cast(error_if_exists); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: errorIfExists + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_errorIfExists( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->error_if_exists; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setParanoidChecks + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setParanoidChecks( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean paranoid_checks) { + reinterpret_cast(jhandle)->paranoid_checks = + static_cast(paranoid_checks); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: paranoidChecks + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_paranoidChecks( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->paranoid_checks; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setRateLimiter + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setRateLimiter( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jrate_limiter_handle) { + reinterpret_cast(jhandle)->rate_limiter.reset( + reinterpret_cast(jrate_limiter_handle)); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setInfoLogLevel + * Signature: (JB)V + */ +void Java_org_rocksdb_DBOptions_setInfoLogLevel( + JNIEnv* env, jobject jobj, jlong jhandle, jbyte jlog_level) { + reinterpret_cast(jhandle)->info_log_level = + static_cast(jlog_level); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: infoLogLevel + * Signature: (J)B + */ +jbyte Java_org_rocksdb_DBOptions_infoLogLevel( + JNIEnv* env, jobject jobj, jlong jhandle) { + return static_cast( + reinterpret_cast(jhandle)->info_log_level); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setMaxTotalWalSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setMaxTotalWalSize( + JNIEnv* 
env, jobject jobj, jlong jhandle, + jlong jmax_total_wal_size) { + reinterpret_cast(jhandle)->max_total_wal_size = + static_cast(jmax_total_wal_size); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: maxTotalWalSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_maxTotalWalSize( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + max_total_wal_size; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setMaxOpenFiles + * Signature: (JI)V + */ +void Java_org_rocksdb_DBOptions_setMaxOpenFiles( + JNIEnv* env, jobject jobj, jlong jhandle, jint max_open_files) { + reinterpret_cast(jhandle)->max_open_files = + static_cast(max_open_files); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: maxOpenFiles + * Signature: (J)I + */ +jint Java_org_rocksdb_DBOptions_maxOpenFiles( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->max_open_files; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: createStatistics + * Signature: (J)V + */ +void Java_org_rocksdb_DBOptions_createStatistics( + JNIEnv* env, jobject jobj, jlong jOptHandle) { + reinterpret_cast(jOptHandle)->statistics = + rocksdb::CreateDBStatistics(); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: statisticsPtr + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_statisticsPtr( + JNIEnv* env, jobject jobj, jlong jOptHandle) { + auto st = reinterpret_cast(jOptHandle)-> + statistics.get(); + return reinterpret_cast(st); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setDisableDataSync + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setDisableDataSync( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean disableDataSync) { + reinterpret_cast(jhandle)->disableDataSync = + static_cast(disableDataSync); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: disableDataSync + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_disableDataSync( + JNIEnv* env, jobject jobj, jlong jhandle) { + return 
reinterpret_cast(jhandle)->disableDataSync; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setUseFsync + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setUseFsync( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean use_fsync) { + reinterpret_cast(jhandle)->use_fsync = + static_cast(use_fsync); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: useFsync + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_useFsync( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->use_fsync; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setDbLogDir + * Signature: (JLjava/lang/String)V + */ +void Java_org_rocksdb_DBOptions_setDbLogDir( + JNIEnv* env, jobject jobj, jlong jhandle, jstring jdb_log_dir) { + const char* log_dir = env->GetStringUTFChars(jdb_log_dir, 0); + reinterpret_cast(jhandle)->db_log_dir.assign(log_dir); + env->ReleaseStringUTFChars(jdb_log_dir, log_dir); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: dbLogDir + * Signature: (J)Ljava/lang/String + */ +jstring Java_org_rocksdb_DBOptions_dbLogDir( + JNIEnv* env, jobject jobj, jlong jhandle) { + return env->NewStringUTF( + reinterpret_cast(jhandle)->db_log_dir.c_str()); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setWalDir + * Signature: (JLjava/lang/String)V + */ +void Java_org_rocksdb_DBOptions_setWalDir( + JNIEnv* env, jobject jobj, jlong jhandle, jstring jwal_dir) { + const char* wal_dir = env->GetStringUTFChars(jwal_dir, 0); + reinterpret_cast(jhandle)->wal_dir.assign(wal_dir); + env->ReleaseStringUTFChars(jwal_dir, wal_dir); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: walDir + * Signature: (J)Ljava/lang/String + */ +jstring Java_org_rocksdb_DBOptions_walDir( + JNIEnv* env, jobject jobj, jlong jhandle) { + return env->NewStringUTF( + reinterpret_cast(jhandle)->wal_dir.c_str()); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setDeleteObsoleteFilesPeriodMicros + * Signature: (JJ)V + */ +void 
Java_org_rocksdb_DBOptions_setDeleteObsoleteFilesPeriodMicros( + JNIEnv* env, jobject jobj, jlong jhandle, jlong micros) { + reinterpret_cast(jhandle) + ->delete_obsolete_files_period_micros = + static_cast(micros); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: deleteObsoleteFilesPeriodMicros + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_deleteObsoleteFilesPeriodMicros( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle) + ->delete_obsolete_files_period_micros; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setMaxBackgroundCompactions + * Signature: (JI)V + */ +void Java_org_rocksdb_DBOptions_setMaxBackgroundCompactions( + JNIEnv* env, jobject jobj, jlong jhandle, jint max) { + reinterpret_cast(jhandle) + ->max_background_compactions = static_cast(max); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: maxBackgroundCompactions + * Signature: (J)I + */ +jint Java_org_rocksdb_DBOptions_maxBackgroundCompactions( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->max_background_compactions; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setMaxBackgroundFlushes + * Signature: (JI)V + */ +void Java_org_rocksdb_DBOptions_setMaxBackgroundFlushes( + JNIEnv* env, jobject jobj, jlong jhandle, jint max_background_flushes) { + reinterpret_cast(jhandle)->max_background_flushes = + static_cast(max_background_flushes); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: maxBackgroundFlushes + * Signature: (J)I + */ +jint Java_org_rocksdb_DBOptions_maxBackgroundFlushes( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + max_background_flushes; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setMaxLogFileSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setMaxLogFileSize( + JNIEnv* env, jobject jobj, jlong jhandle, jlong max_log_file_size) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(max_log_file_size); + if 
(s.ok()) { + reinterpret_cast(jhandle)->max_log_file_size = + max_log_file_size; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_DBOptions + * Method: maxLogFileSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_maxLogFileSize( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->max_log_file_size; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setLogFileTimeToRoll + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setLogFileTimeToRoll( + JNIEnv* env, jobject jobj, jlong jhandle, jlong log_file_time_to_roll) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t( + log_file_time_to_roll); + if (s.ok()) { + reinterpret_cast(jhandle)->log_file_time_to_roll = + log_file_time_to_roll; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_DBOptions + * Method: logFileTimeToRoll + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_logFileTimeToRoll( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->log_file_time_to_roll; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setKeepLogFileNum + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setKeepLogFileNum( + JNIEnv* env, jobject jobj, jlong jhandle, jlong keep_log_file_num) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(keep_log_file_num); + if (s.ok()) { + reinterpret_cast(jhandle)->keep_log_file_num = + keep_log_file_num; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_DBOptions + * Method: keepLogFileNum + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_keepLogFileNum( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->keep_log_file_num; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setMaxManifestFileSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setMaxManifestFileSize( + JNIEnv* env, jobject 
jobj, jlong jhandle, jlong max_manifest_file_size) { + reinterpret_cast(jhandle)->max_manifest_file_size = + static_cast(max_manifest_file_size); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: maxManifestFileSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_maxManifestFileSize( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + max_manifest_file_size; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setTableCacheNumshardbits + * Signature: (JI)V + */ +void Java_org_rocksdb_DBOptions_setTableCacheNumshardbits( + JNIEnv* env, jobject jobj, jlong jhandle, jint table_cache_numshardbits) { + reinterpret_cast(jhandle)->table_cache_numshardbits = + static_cast(table_cache_numshardbits); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: tableCacheNumshardbits + * Signature: (J)I + */ +jint Java_org_rocksdb_DBOptions_tableCacheNumshardbits( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)-> + table_cache_numshardbits; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setTableCacheRemoveScanCountLimit + * Signature: (JI)V + */ +void Java_org_rocksdb_DBOptions_setTableCacheRemoveScanCountLimit( + JNIEnv* env, jobject jobj, jlong jhandle, jint limit) { + reinterpret_cast( + jhandle)->table_cache_remove_scan_count_limit = static_cast(limit); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: tableCacheRemoveScanCountLimit + * Signature: (J)I + */ +jint Java_org_rocksdb_DBOptions_tableCacheRemoveScanCountLimit( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->table_cache_remove_scan_count_limit; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setWalTtlSeconds + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setWalTtlSeconds( + JNIEnv* env, jobject jobj, jlong jhandle, jlong WAL_ttl_seconds) { + reinterpret_cast(jhandle)->WAL_ttl_seconds = + static_cast(WAL_ttl_seconds); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: 
walTtlSeconds + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_walTtlSeconds( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->WAL_ttl_seconds; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setWalSizeLimitMB + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setWalSizeLimitMB( + JNIEnv* env, jobject jobj, jlong jhandle, jlong WAL_size_limit_MB) { + reinterpret_cast(jhandle)->WAL_size_limit_MB = + static_cast(WAL_size_limit_MB); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: walTtlSeconds + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_walSizeLimitMB( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->WAL_size_limit_MB; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setManifestPreallocationSize + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setManifestPreallocationSize( + JNIEnv* env, jobject jobj, jlong jhandle, jlong preallocation_size) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(preallocation_size); + if (s.ok()) { + reinterpret_cast(jhandle)-> + manifest_preallocation_size = preallocation_size; + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_DBOptions + * Method: manifestPreallocationSize + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_manifestPreallocationSize( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle) + ->manifest_preallocation_size; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setAllowOsBuffer + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setAllowOsBuffer( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_os_buffer) { + reinterpret_cast(jhandle)->allow_os_buffer = + static_cast(allow_os_buffer); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: allowOsBuffer + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_allowOsBuffer( + JNIEnv* env, jobject jobj, jlong jhandle) { + return 
reinterpret_cast(jhandle)->allow_os_buffer; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setAllowMmapReads + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setAllowMmapReads( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_mmap_reads) { + reinterpret_cast(jhandle)->allow_mmap_reads = + static_cast(allow_mmap_reads); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: allowMmapReads + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_allowMmapReads( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->allow_mmap_reads; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setAllowMmapWrites + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setAllowMmapWrites( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean allow_mmap_writes) { + reinterpret_cast(jhandle)->allow_mmap_writes = + static_cast(allow_mmap_writes); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: allowMmapWrites + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_allowMmapWrites( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->allow_mmap_writes; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setIsFdCloseOnExec + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setIsFdCloseOnExec( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean is_fd_close_on_exec) { + reinterpret_cast(jhandle)->is_fd_close_on_exec = + static_cast(is_fd_close_on_exec); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: isFdCloseOnExec + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_isFdCloseOnExec( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->is_fd_close_on_exec; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setSkipLogErrorOnRecovery + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setSkipLogErrorOnRecovery( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean skip) { + 
reinterpret_cast(jhandle)->skip_log_error_on_recovery = + static_cast(skip); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: skipLogErrorOnRecovery + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_skipLogErrorOnRecovery( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle) + ->skip_log_error_on_recovery; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setStatsDumpPeriodSec + * Signature: (JI)V + */ +void Java_org_rocksdb_DBOptions_setStatsDumpPeriodSec( + JNIEnv* env, jobject jobj, jlong jhandle, jint stats_dump_period_sec) { + reinterpret_cast(jhandle)->stats_dump_period_sec = + static_cast(stats_dump_period_sec); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: statsDumpPeriodSec + * Signature: (J)I + */ +jint Java_org_rocksdb_DBOptions_statsDumpPeriodSec( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->stats_dump_period_sec; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setAdviseRandomOnOpen + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setAdviseRandomOnOpen( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean advise_random_on_open) { + reinterpret_cast(jhandle)->advise_random_on_open = + static_cast(advise_random_on_open); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: adviseRandomOnOpen + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_adviseRandomOnOpen( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->advise_random_on_open; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setUseAdaptiveMutex + * Signature: (JZ)V + */ +void Java_org_rocksdb_DBOptions_setUseAdaptiveMutex( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean use_adaptive_mutex) { + reinterpret_cast(jhandle)->use_adaptive_mutex = + static_cast(use_adaptive_mutex); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: useAdaptiveMutex + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_DBOptions_useAdaptiveMutex( + JNIEnv* env, 
jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->use_adaptive_mutex; +} + +/* + * Class: org_rocksdb_DBOptions + * Method: setBytesPerSync + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setBytesPerSync( + JNIEnv* env, jobject jobj, jlong jhandle, jlong bytes_per_sync) { + reinterpret_cast(jhandle)->bytes_per_sync = + static_cast(bytes_per_sync); +} + +/* + * Class: org_rocksdb_DBOptions + * Method: bytesPerSync + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_bytesPerSync( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->bytes_per_sync; +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::WriteOptions + +/* + * Class: org_rocksdb_WriteOptions + * Method: newWriteOptions + * Signature: ()V + */ +void Java_org_rocksdb_WriteOptions_newWriteOptions( + JNIEnv* env, jobject jwrite_options) { + rocksdb::WriteOptions* op = new rocksdb::WriteOptions(); + rocksdb::WriteOptionsJni::setHandle(env, jwrite_options, op); +} + +/* + * Class: org_rocksdb_WriteOptions + * Method: disposeInternal + * Signature: ()V + */ +void Java_org_rocksdb_WriteOptions_disposeInternal( + JNIEnv* env, jobject jwrite_options, jlong jhandle) { + auto write_options = reinterpret_cast(jhandle); + delete write_options; + + rocksdb::WriteOptionsJni::setHandle(env, jwrite_options, nullptr); +} + +/* + * Class: org_rocksdb_WriteOptions + * Method: setSync + * Signature: (JZ)V + */ +void Java_org_rocksdb_WriteOptions_setSync( + JNIEnv* env, jobject jwrite_options, jlong jhandle, jboolean jflag) { + reinterpret_cast(jhandle)->sync = jflag; +} + +/* + * Class: org_rocksdb_WriteOptions + * Method: sync + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_WriteOptions_sync( + JNIEnv* env, jobject jwrite_options, jlong jhandle) { + return reinterpret_cast(jhandle)->sync; +} + +/* + * Class: org_rocksdb_WriteOptions + * Method: setDisableWAL + * Signature: (JZ)V + */ +void 
Java_org_rocksdb_WriteOptions_setDisableWAL( + JNIEnv* env, jobject jwrite_options, jlong jhandle, jboolean jflag) { + reinterpret_cast(jhandle)->disableWAL = jflag; +} + +/* + * Class: org_rocksdb_WriteOptions + * Method: disableWAL + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_WriteOptions_disableWAL( + JNIEnv* env, jobject jwrite_options, jlong jhandle) { + return reinterpret_cast(jhandle)->disableWAL; +} + +///////////////////////////////////////////////////////////////////// +// rocksdb::ReadOptions + +/* + * Class: org_rocksdb_ReadOptions + * Method: newReadOptions + * Signature: ()V + */ +void Java_org_rocksdb_ReadOptions_newReadOptions( + JNIEnv* env, jobject jobj) { + auto read_opt = new rocksdb::ReadOptions(); + rocksdb::ReadOptionsJni::setHandle(env, jobj, read_opt); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_ReadOptions_disposeInternal( + JNIEnv* env, jobject jobj, jlong jhandle) { + delete reinterpret_cast(jhandle); + rocksdb::ReadOptionsJni::setHandle(env, jobj, nullptr); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setVerifyChecksums + * Signature: (JZ)V + */ +void Java_org_rocksdb_ReadOptions_setVerifyChecksums( + JNIEnv* env, jobject jobj, jlong jhandle, + jboolean jverify_checksums) { + reinterpret_cast(jhandle)->verify_checksums = + static_cast(jverify_checksums); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: verifyChecksums + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ReadOptions_verifyChecksums( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast( + jhandle)->verify_checksums; +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setFillCache + * Signature: (JZ)V + */ +void Java_org_rocksdb_ReadOptions_setFillCache( + JNIEnv* env, jobject jobj, jlong jhandle, jboolean jfill_cache) { + reinterpret_cast(jhandle)->fill_cache = + static_cast(jfill_cache); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: 
fillCache + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ReadOptions_fillCache( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->fill_cache; } /* @@ -1684,3 +3729,130 @@ void Java_org_rocksdb_ReadOptions_setTailing( reinterpret_cast(jhandle)->tailing = static_cast(jtailing); } + +/* + * Class: org_rocksdb_ReadOptions + * Method: tailing + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ReadOptions_tailing( + JNIEnv* env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle)->tailing; +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: setSnapshot + * Signature: (JJ)V + */ +void Java_org_rocksdb_ReadOptions_setSnapshot( + JNIEnv* env, jobject jobj, jlong jhandle, jlong jsnapshot) { + reinterpret_cast(jhandle)->snapshot = + reinterpret_cast(jsnapshot); +} + +/* + * Class: org_rocksdb_ReadOptions + * Method: snapshot + * Signature: (J)J + */ +jlong Java_org_rocksdb_ReadOptions_snapshot( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto& snapshot = + reinterpret_cast(jhandle)->snapshot; + return reinterpret_cast(snapshot); +} + +///////////////////////////////////////////////////////////////////// +// rocksdb::ComparatorOptions + +/* + * Class: org_rocksdb_ComparatorOptions + * Method: newComparatorOptions + * Signature: ()V + */ +void Java_org_rocksdb_ComparatorOptions_newComparatorOptions( + JNIEnv* env, jobject jobj) { + auto comparator_opt = new rocksdb::ComparatorJniCallbackOptions(); + rocksdb::ComparatorOptionsJni::setHandle(env, jobj, comparator_opt); +} + +/* + * Class: org_rocksdb_ComparatorOptions + * Method: useAdaptiveMutex + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_ComparatorOptions_useAdaptiveMutex( + JNIEnv * env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle) + ->use_adaptive_mutex; +} + +/* + * Class: org_rocksdb_ComparatorOptions + * Method: setUseAdaptiveMutex + * Signature: (JZ)V + */ +void Java_org_rocksdb_ComparatorOptions_setUseAdaptiveMutex( + JNIEnv * env, 
jobject jobj, jlong jhandle, jboolean juse_adaptive_mutex) { + reinterpret_cast(jhandle) + ->use_adaptive_mutex = static_cast(juse_adaptive_mutex); +} + +/* + * Class: org_rocksdb_ComparatorOptions + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_ComparatorOptions_disposeInternal( + JNIEnv * env, jobject jobj, jlong jhandle) { + delete reinterpret_cast(jhandle); + rocksdb::ComparatorOptionsJni::setHandle(env, jobj, nullptr); +} + +///////////////////////////////////////////////////////////////////// +// rocksdb::FlushOptions + +/* + * Class: org_rocksdb_FlushOptions + * Method: newFlushOptions + * Signature: ()V + */ +void Java_org_rocksdb_FlushOptions_newFlushOptions( + JNIEnv* env, jobject jobj) { + auto flush_opt = new rocksdb::FlushOptions(); + rocksdb::FlushOptionsJni::setHandle(env, jobj, flush_opt); +} + +/* + * Class: org_rocksdb_FlushOptions + * Method: setWaitForFlush + * Signature: (JZ)V + */ +void Java_org_rocksdb_FlushOptions_setWaitForFlush( + JNIEnv * env, jobject jobj, jlong jhandle, jboolean jwait) { + reinterpret_cast(jhandle) + ->wait = static_cast(jwait); +} + +/* + * Class: org_rocksdb_FlushOptions + * Method: waitForFlush + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_FlushOptions_waitForFlush( + JNIEnv * env, jobject jobj, jlong jhandle) { + return reinterpret_cast(jhandle) + ->wait; +} + +/* + * Class: org_rocksdb_FlushOptions + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_FlushOptions_disposeInternal( + JNIEnv * env, jobject jobj, jlong jhandle) { + delete reinterpret_cast(jhandle); + rocksdb::FlushOptionsJni::setHandle(env, jobj, nullptr); +} diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 28fe754f0..0c35eef4e 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -11,42 +11,68 @@ #define JAVA_ROCKSJNI_PORTAL_H_ #include +#include +#include +#include + #include "rocksdb/db.h" #include "rocksdb/filter_policy.h" +#include "rocksdb/status.h" #include 
"rocksdb/utilities/backupable_db.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "rocksjni/comparatorjnicallback.h" +#include "rocksjni/writebatchhandlerjnicallback.h" namespace rocksdb { -// The portal class for org.rocksdb.RocksDB -class RocksDBJni { +// detect if jlong overflows size_t +inline Status check_if_jlong_fits_size_t(const jlong& jvalue) { + Status s = Status::OK(); + if (static_cast(jvalue) > std::numeric_limits::max()) { + s = Status::InvalidArgument(Slice("jlong overflows 32 bit value.")); + } + return s; +} + +// Native class template +template class RocksDBNativeClass { public: - // Get the java class id of org.rocksdb.RocksDB. - static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/RocksDB"); + // Get the java class id + static jclass getJClass(JNIEnv* env, const char* jclazz_name) { + jclass jclazz = env->FindClass(jclazz_name); assert(jclazz != nullptr); return jclazz; } - // Get the field id of the member variable of org.rocksdb.RocksDB - // that stores the pointer to rocksdb::DB. + // Get the field id of the member variable to store + // the ptr static jfieldID getHandleFieldID(JNIEnv* env) { static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); + DERIVED::getJClass(env), "nativeHandle_", "J"); assert(fid != nullptr); return fid; } - // Get the pointer to rocksdb::DB of the specified org.rocksdb.RocksDB. - static rocksdb::DB* getHandle(JNIEnv* env, jobject jdb) { - return reinterpret_cast( - env->GetLongField(jdb, getHandleFieldID(env))); + // Get the pointer from Java + static PTR getHandle(JNIEnv* env, jobject jobj) { + return reinterpret_cast( + env->GetLongField(jobj, getHandleFieldID(env))); } - // Pass the rocksdb::DB pointer to the java side. - static void setHandle(JNIEnv* env, jobject jdb, rocksdb::DB* db) { + // Pass the pointer to the java side. 
+ static void setHandle(JNIEnv* env, jobject jdb, PTR ptr) { env->SetLongField( jdb, getHandleFieldID(env), - reinterpret_cast(db)); + reinterpret_cast(ptr)); + } +}; + +// The portal class for org.rocksdb.RocksDB +class RocksDBJni : public RocksDBNativeClass { + public: + // Get the java class id of org.rocksdb.RocksDB. + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/RocksDB"); } }; @@ -79,132 +105,150 @@ class RocksDBExceptionJni { } }; -class OptionsJni { +// The portal class for org.rocksdb.Options +class OptionsJni : public RocksDBNativeClass< + rocksdb::Options*, OptionsJni> { public: - // Get the java class id of org.rocksdb.Options. static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/Options"); - assert(jclazz != nullptr); - return jclazz; - } - - // Get the field id of the member variable of org.rocksdb.Options - // that stores the pointer to rocksdb::Options - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; - } - - // Get the pointer to rocksdb::Options - static rocksdb::Options* getHandle(JNIEnv* env, jobject jobj) { - return reinterpret_cast( - env->GetLongField(jobj, getHandleFieldID(env))); + return RocksDBNativeClass::getJClass(env, "org/rocksdb/Options"); } +}; - // Pass the rocksdb::Options pointer to the java side. - static void setHandle(JNIEnv* env, jobject jobj, rocksdb::Options* op) { - env->SetLongField( - jobj, getHandleFieldID(env), - reinterpret_cast(op)); +// The portal class for org.rocksdb.DBOptions +class DBOptionsJni : public RocksDBNativeClass< + rocksdb::DBOptions*, DBOptionsJni> { + public: + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, "org/rocksdb/DBOptions"); } }; -class WriteOptionsJni { +class ColumnFamilyDescriptorJni { public: - // Get the java class id of org.rocksdb.WriteOptions. 
- static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/WriteOptions"); + // Get the java class id of org.rocksdb.ColumnFamilyDescriptor + static jclass getColumnFamilyDescriptorClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/ColumnFamilyDescriptor"); assert(jclazz != nullptr); return jclazz; } - // Get the field id of the member variable of org.rocksdb.WriteOptions - // that stores the pointer to rocksdb::WriteOptions - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; + // Get the java method id of columnFamilyName + static jmethodID getColumnFamilyNameMethod(JNIEnv* env) { + static jmethodID mid = env->GetMethodID( + getColumnFamilyDescriptorClass(env), + "columnFamilyName", "()[B"); + assert(mid != nullptr); + return mid; } - // Get the pointer to rocksdb::WriteOptions - static rocksdb::WriteOptions* getHandle(JNIEnv* env, jobject jobj) { - return reinterpret_cast( - env->GetLongField(jobj, getHandleFieldID(env))); + // Get the java method id of columnFamilyOptions + static jmethodID getColumnFamilyOptionsMethod(JNIEnv* env) { + static jmethodID mid = env->GetMethodID( + getColumnFamilyDescriptorClass(env), + "columnFamilyOptions", "()Lorg/rocksdb/ColumnFamilyOptions;"); + assert(mid != nullptr); + return mid; } +}; - // Pass the rocksdb::WriteOptions pointer to the java side. 
- static void setHandle(JNIEnv* env, jobject jobj, rocksdb::WriteOptions* op) { - env->SetLongField( - jobj, getHandleFieldID(env), - reinterpret_cast(op)); +// The portal class for org.rocksdb.ColumnFamilyOptions +class ColumnFamilyOptionsJni : public RocksDBNativeClass< + rocksdb::ColumnFamilyOptions*, ColumnFamilyOptionsJni> { + public: + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/ColumnFamilyOptions"); } }; +// The portal class for org.rocksdb.WriteOptions +class WriteOptionsJni : public RocksDBNativeClass< + rocksdb::WriteOptions*, WriteOptionsJni> { + public: + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/WriteOptions"); + } +}; -class ReadOptionsJni { +// The portal class for org.rocksdb.ReadOptions +class ReadOptionsJni : public RocksDBNativeClass< + rocksdb::ReadOptions*, ReadOptionsJni> { public: - // Get the java class id of org.rocksdb.ReadOptions. static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/ReadOptions"); - assert(jclazz != nullptr); - return jclazz; + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/ReadOptions"); } +}; - // Get the field id of the member variable of org.rocksdb.ReadOptions - // that stores the pointer to rocksdb::ReadOptions - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; +// The portal class for org.rocksdb.ReadOptions +class WriteBatchJni : public RocksDBNativeClass< + rocksdb::WriteBatch*, WriteBatchJni> { + public: + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/WriteBatch"); } +}; - // Get the pointer to rocksdb::ReadOptions - static rocksdb::ReadOptions* getHandle(JNIEnv* env, jobject jobj) { - return reinterpret_cast( - env->GetLongField(jobj, getHandleFieldID(env))); +// The portal class for 
org.rocksdb.WriteBatch.Handler +class WriteBatchHandlerJni : public RocksDBNativeClass< + const rocksdb::WriteBatchHandlerJniCallback*, + WriteBatchHandlerJni> { + public: + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/WriteBatch$Handler"); } - // Pass the rocksdb::ReadOptions pointer to the java side. - static void setHandle(JNIEnv* env, jobject jobj, - rocksdb::ReadOptions* op) { - env->SetLongField( - jobj, getHandleFieldID(env), - reinterpret_cast(op)); + // Get the java method `put` of org.rocksdb.WriteBatch.Handler. + static jmethodID getPutMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID( + getJClass(env), "put", "([B[B)V"); + assert(mid != nullptr); + return mid; } -}; + // Get the java method `merge` of org.rocksdb.WriteBatch.Handler. + static jmethodID getMergeMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID( + getJClass(env), "merge", "([B[B)V"); + assert(mid != nullptr); + return mid; + } -class WriteBatchJni { - public: - static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/WriteBatch"); - assert(jclazz != nullptr); - return jclazz; + // Get the java method `delete` of org.rocksdb.WriteBatch.Handler. + static jmethodID getDeleteMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID( + getJClass(env), "delete", "([B)V"); + assert(mid != nullptr); + return mid; } - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; + // Get the java method `logData` of org.rocksdb.WriteBatch.Handler. + static jmethodID getLogDataMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID( + getJClass(env), "logData", "([B)V"); + assert(mid != nullptr); + return mid; } - // Get the pointer to rocksdb::WriteBatch of the specified - // org.rocksdb.WriteBatch. 
- static rocksdb::WriteBatch* getHandle(JNIEnv* env, jobject jwb) { - return reinterpret_cast( - env->GetLongField(jwb, getHandleFieldID(env))); + // Get the java method `shouldContinue` of org.rocksdb.WriteBatch.Handler. + static jmethodID getContinueMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID( + getJClass(env), "shouldContinue", "()Z"); + assert(mid != nullptr); + return mid; } +}; - // Pass the rocksdb::WriteBatch pointer to the java side. - static void setHandle(JNIEnv* env, jobject jwb, rocksdb::WriteBatch* wb) { - env->SetLongField( - jwb, getHandleFieldID(env), - reinterpret_cast(wb)); +// The portal class for org.rocksdb.WriteBatchWithIndex +class WriteBatchWithIndexJni : public RocksDBNativeClass< + rocksdb::WriteBatchWithIndex*, WriteBatchWithIndexJni> { + public: + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/WriteBatch"); } }; @@ -217,102 +261,151 @@ class HistogramDataJni { } }; -class BackupableDBOptionsJni { +// The portal class for org.rocksdb.WriteBatchWithIndex +class BackupableDBOptionsJni : public RocksDBNativeClass< + rocksdb::BackupableDBOptions*, BackupableDBOptionsJni> { public: - // Get the java class id of org.rocksdb.BackupableDBOptions. 
static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/BackupableDBOptions"); - assert(jclazz != nullptr); - return jclazz; + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/BackupableDBOptions"); } +}; - // Get the field id of the member variable of org.rocksdb.BackupableDBOptions - // that stores the pointer to rocksdb::BackupableDBOptions - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; +// The portal class for org.rocksdb.RocksIterator +class IteratorJni : public RocksDBNativeClass< + rocksdb::Iterator*, IteratorJni> { + public: + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/RocksIterator"); } +}; - // Get the pointer to rocksdb::BackupableDBOptions - static rocksdb::BackupableDBOptions* getHandle(JNIEnv* env, jobject jobj) { - return reinterpret_cast( - env->GetLongField(jobj, getHandleFieldID(env))); +// The portal class for org.rocksdb.Filter +class FilterJni : public RocksDBNativeClass< + std::shared_ptr*, FilterJni> { + public: + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/Filter"); } +}; - // Pass the rocksdb::BackupableDBOptions pointer to the java side. 
- static void setHandle( - JNIEnv* env, jobject jobj, rocksdb::BackupableDBOptions* op) { - env->SetLongField( - jobj, getHandleFieldID(env), - reinterpret_cast(op)); +// The portal class for org.rocksdb.ColumnFamilyHandle +class ColumnFamilyHandleJni : public RocksDBNativeClass< + rocksdb::ColumnFamilyHandle*, ColumnFamilyHandleJni> { + public: + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/ColumnFamilyHandle"); } }; -class IteratorJni { +// The portal class for org.rocksdb.FlushOptions +class FlushOptionsJni : public RocksDBNativeClass< + rocksdb::FlushOptions*, FlushOptionsJni> { public: - // Get the java class id of org.rocksdb.Iteartor. static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/RocksIterator"); - assert(jclazz != nullptr); - return jclazz; + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/FlushOptions"); } +}; - // Get the field id of the member variable of org.rocksdb.Iterator - // that stores the pointer to rocksdb::Iterator. - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; +// The portal class for org.rocksdb.ComparatorOptions +class ComparatorOptionsJni : public RocksDBNativeClass< + rocksdb::ComparatorJniCallbackOptions*, ComparatorOptionsJni> { + public: + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/ComparatorOptions"); } +}; - // Get the pointer to rocksdb::Iterator. 
- static rocksdb::Iterator* getHandle(JNIEnv* env, jobject jobj) { - return reinterpret_cast( - env->GetLongField(jobj, getHandleFieldID(env))); +// The portal class for org.rocksdb.AbstractComparator +class AbstractComparatorJni : public RocksDBNativeClass< + const rocksdb::BaseComparatorJniCallback*, + AbstractComparatorJni> { + public: + static jclass getJClass(JNIEnv* env) { + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/AbstractComparator"); } - // Pass the rocksdb::Iterator pointer to the java side. - static void setHandle( - JNIEnv* env, jobject jobj, rocksdb::Iterator* op) { - env->SetLongField( - jobj, getHandleFieldID(env), - reinterpret_cast(op)); + // Get the java method `name` of org.rocksdb.Comparator. + static jmethodID getNameMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID( + getJClass(env), "name", "()Ljava/lang/String;"); + assert(mid != nullptr); + return mid; + } + + // Get the java method `compare` of org.rocksdb.Comparator. + static jmethodID getCompareMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID(getJClass(env), + "compare", + "(Lorg/rocksdb/AbstractSlice;Lorg/rocksdb/AbstractSlice;)I"); + assert(mid != nullptr); + return mid; + } + + // Get the java method `findShortestSeparator` of org.rocksdb.Comparator. + static jmethodID getFindShortestSeparatorMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID(getJClass(env), + "findShortestSeparator", + "(Ljava/lang/String;Lorg/rocksdb/AbstractSlice;)Ljava/lang/String;"); + assert(mid != nullptr); + return mid; + } + + // Get the java method `findShortSuccessor` of org.rocksdb.Comparator. 
+ static jmethodID getFindShortSuccessorMethodId(JNIEnv* env) { + static jmethodID mid = env->GetMethodID(getJClass(env), + "findShortSuccessor", + "(Ljava/lang/String;)Ljava/lang/String;"); + assert(mid != nullptr); + return mid; } }; -class FilterJni { +// The portal class for org.rocksdb.AbstractSlice +class AbstractSliceJni : public RocksDBNativeClass< + const rocksdb::Slice*, AbstractSliceJni> { public: - // Get the java class id of org.rocksdb.FilterPolicy. static jclass getJClass(JNIEnv* env) { - jclass jclazz = env->FindClass("org/rocksdb/Filter"); + return RocksDBNativeClass::getJClass(env, + "org/rocksdb/AbstractSlice"); + } +}; + +class SliceJni { + public: + // Get the java class id of org.rocksdb.Slice. + static jclass getJClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/Slice"); assert(jclazz != nullptr); return jclazz; } - // Get the field id of the member variable of org.rocksdb.Filter - // that stores the pointer to rocksdb::FilterPolicy. - static jfieldID getHandleFieldID(JNIEnv* env) { - static jfieldID fid = env->GetFieldID( - getJClass(env), "nativeHandle_", "J"); - assert(fid != nullptr); - return fid; + static jobject construct0(JNIEnv* env) { + static jmethodID mid = env->GetMethodID(getJClass(env), "", "()V"); + assert(mid != nullptr); + return env->NewObject(getJClass(env), mid); } +}; - // Get the pointer to rocksdb::FilterPolicy. - static rocksdb::FilterPolicy* getHandle(JNIEnv* env, jobject jobj) { - return reinterpret_cast( - env->GetLongField(jobj, getHandleFieldID(env))); +class DirectSliceJni { + public: + // Get the java class id of org.rocksdb.DirectSlice. + static jclass getJClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/DirectSlice"); + assert(jclazz != nullptr); + return jclazz; } - // Pass the rocksdb::FilterPolicy pointer to the java side. 
- static void setHandle( - JNIEnv* env, jobject jobj, const rocksdb::FilterPolicy* op) { - env->SetLongField( - jobj, getHandleFieldID(env), - reinterpret_cast(op)); + static jobject construct0(JNIEnv* env) { + static jmethodID mid = env->GetMethodID(getJClass(env), "", "()V"); + assert(mid != nullptr); + return env->NewObject(getJClass(env), mid); } }; @@ -379,5 +472,245 @@ class ListJni { return mid; } }; + +class BackupInfoJni { + public: + // Get the java class id of org.rocksdb.BackupInfo. + static jclass getJClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/BackupInfo"); + assert(jclazz != nullptr); + return jclazz; + } + + static jobject construct0(JNIEnv* env, uint32_t backup_id, int64_t timestamp, + uint64_t size, uint32_t number_files) { + static jmethodID mid = env->GetMethodID(getJClass(env), "", + "(IJJI)V"); + assert(mid != nullptr); + return env->NewObject(getJClass(env), mid, + backup_id, timestamp, size, number_files); + } +}; + +class BackupInfoListJni { + public: + static jobject getBackupInfo(JNIEnv* env, + std::vector backup_infos) { + jclass jclazz = env->FindClass("java/util/ArrayList"); + jmethodID mid = rocksdb::ListJni::getArrayListConstructorMethodId( + env, jclazz); + jobject jbackup_info_handle_list = env->NewObject(jclazz, mid, + backup_infos.size()); + // insert in java list + for (std::vector::size_type i = 0; + i != backup_infos.size(); i++) { + rocksdb::BackupInfo backup_info = backup_infos[i]; + jobject obj = rocksdb::BackupInfoJni::construct0(env, + backup_info.backup_id, + backup_info.timestamp, + backup_info.size, + backup_info.number_files); + env->CallBooleanMethod(jbackup_info_handle_list, + rocksdb::ListJni::getListAddMethodId(env), obj); + } + return jbackup_info_handle_list; + } +}; + +class WBWIRocksIteratorJni { + public: + // Get the java class id of org.rocksdb.WBWIRocksIterator. 
+ static jclass getJClass(JNIEnv* env) { + static jclass jclazz = env->FindClass("org/rocksdb/WBWIRocksIterator"); + assert(jclazz != nullptr); + return jclazz; + } + + static jfieldID getWriteEntryField(JNIEnv* env) { + static jfieldID fid = + env->GetFieldID(getJClass(env), "entry", + "Lorg/rocksdb/WBWIRocksIterator$WriteEntry;"); + assert(fid != nullptr); + return fid; + } + + static jobject getWriteEntry(JNIEnv* env, jobject jwbwi_rocks_iterator) { + jobject jwe = + env->GetObjectField(jwbwi_rocks_iterator, getWriteEntryField(env)); + assert(jwe != nullptr); + return jwe; + } +}; + +class WriteTypeJni { + public: + // Get the PUT enum field of org.rocksdb.WBWIRocksIterator.WriteType + static jobject PUT(JNIEnv* env) { + return getEnum(env, "PUT"); + } + + // Get the MERGE enum field of org.rocksdb.WBWIRocksIterator.WriteType + static jobject MERGE(JNIEnv* env) { + return getEnum(env, "MERGE"); + } + + // Get the DELETE enum field of org.rocksdb.WBWIRocksIterator.WriteType + static jobject DELETE(JNIEnv* env) { + return getEnum(env, "DELETE"); + } + + // Get the LOG enum field of org.rocksdb.WBWIRocksIterator.WriteType + static jobject LOG(JNIEnv* env) { + return getEnum(env, "LOG"); + } + + private: + // Get the java class id of org.rocksdb.WBWIRocksIterator.WriteType. + static jclass getJClass(JNIEnv* env) { + jclass jclazz = env->FindClass("org/rocksdb/WBWIRocksIterator$WriteType"); + assert(jclazz != nullptr); + return jclazz; + } + + // Get an enum field of org.rocksdb.WBWIRocksIterator.WriteType + static jobject getEnum(JNIEnv* env, const char name[]) { + jclass jclazz = getJClass(env); + jfieldID jfid = + env->GetStaticFieldID(jclazz, name, + "Lorg/rocksdb/WBWIRocksIterator$WriteType;"); + assert(jfid != nullptr); + return env->GetStaticObjectField(jclazz, jfid); + } +}; + +class WriteEntryJni { + public: + // Get the java class id of org.rocksdb.WBWIRocksIterator.WriteEntry. 
+ static jclass getJClass(JNIEnv* env) { + static jclass jclazz = + env->FindClass("org/rocksdb/WBWIRocksIterator$WriteEntry"); + assert(jclazz != nullptr); + return jclazz; + } + + static void setWriteType(JNIEnv* env, jobject jwrite_entry, + WriteType write_type) { + jobject jwrite_type; + switch (write_type) { + case kPutRecord: + jwrite_type = WriteTypeJni::PUT(env); + break; + + case kMergeRecord: + jwrite_type = WriteTypeJni::MERGE(env); + break; + + case kDeleteRecord: + jwrite_type = WriteTypeJni::DELETE(env); + break; + + case kLogDataRecord: + jwrite_type = WriteTypeJni::LOG(env); + break; + + default: + jwrite_type = nullptr; + } + assert(jwrite_type != nullptr); + env->SetObjectField(jwrite_entry, getWriteTypeField(env), jwrite_type); + } + + static void setKey(JNIEnv* env, jobject jwrite_entry, + const rocksdb::Slice* slice) { + jobject jkey = env->GetObjectField(jwrite_entry, getKeyField(env)); + AbstractSliceJni::setHandle(env, jkey, slice); + } + + static void setValue(JNIEnv* env, jobject jwrite_entry, + const rocksdb::Slice* slice) { + jobject jvalue = env->GetObjectField(jwrite_entry, getValueField(env)); + AbstractSliceJni::setHandle(env, jvalue, slice); + } + + private: + static jfieldID getWriteTypeField(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "type", "Lorg/rocksdb/WBWIRocksIterator$WriteType;"); + assert(fid != nullptr); + return fid; + } + + static jfieldID getKeyField(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "key", "Lorg/rocksdb/DirectSlice;"); + assert(fid != nullptr); + return fid; + } + + static jfieldID getValueField(JNIEnv* env) { + static jfieldID fid = env->GetFieldID( + getJClass(env), "value", "Lorg/rocksdb/DirectSlice;"); + assert(fid != nullptr); + return fid; + } +}; + +class JniUtil { + public: + /* + * Copies a jstring to a std::string + * and releases the original jstring + */ + static std::string copyString(JNIEnv* env, jstring js) { + const char *utf = 
env->GetStringUTFChars(js, NULL); + std::string name(utf); + env->ReleaseStringUTFChars(js, utf); + return name; + } + + /* + * Helper for operations on a key and value + * for example WriteBatch->Put + * + * TODO(AR) could be extended to cover returning rocksdb::Status + * from `op` and used for RocksDB->Put etc. + */ + static void kv_op( + std::function op, + JNIEnv* env, jobject jobj, + jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len) { + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + jbyte* value = env->GetByteArrayElements(jentry_value, nullptr); + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); + rocksdb::Slice value_slice(reinterpret_cast(value), + jentry_value_len); + + op(key_slice, value_slice); + + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + env->ReleaseByteArrayElements(jentry_value, value, JNI_ABORT); + } + + /* + * Helper for operations on a key + * for example WriteBatch->Delete + * + * TODO(AR) could be extended to cover returning rocksdb::Status + * from `op` and used for RocksDB->Delete etc. + */ + static void k_op( + std::function op, + JNIEnv* env, jobject jobj, + jbyteArray jkey, jint jkey_len) { + jbyte* key = env->GetByteArrayElements(jkey, nullptr); + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); + + op(key_slice); + + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + } +}; + } // namespace rocksdb #endif // JAVA_ROCKSJNI_PORTAL_H_ diff --git a/java/rocksjni/ratelimiterjni.cc b/java/rocksjni/ratelimiterjni.cc new file mode 100644 index 000000000..ab6160e0d --- /dev/null +++ b/java/rocksjni/ratelimiterjni.cc @@ -0,0 +1,24 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +// This file implements the "bridge" between Java and C++ for RateLimiter. + +#include "rocksjni/portal.h" +#include "include/org_rocksdb_GenericRateLimiterConfig.h" +#include "rocksdb/rate_limiter.h" + +/* + * Class: org_rocksdb_GenericRateLimiterConfig + * Method: newRateLimiterHandle + * Signature: (JJI)J + */ +jlong Java_org_rocksdb_GenericRateLimiterConfig_newRateLimiterHandle( + JNIEnv* env, jobject jobj, jlong jrate_bytes_per_second, + jlong jrefill_period_micros, jint jfairness) { + return reinterpret_cast(rocksdb::NewGenericRateLimiter( + static_cast(jrate_bytes_per_second), + static_cast(jrefill_period_micros), + static_cast(jfairness))); +} diff --git a/java/rocksjni/restorejni.cc b/java/rocksjni/restorejni.cc index 942e707e6..a2341632b 100644 --- a/java/rocksjni/restorejni.cc +++ b/java/rocksjni/restorejni.cc @@ -10,7 +10,6 @@ #include #include #include -#include #include #include "include/org_rocksdb_RestoreOptions.h" @@ -66,13 +65,13 @@ void Java_org_rocksdb_RestoreBackupableDB_restoreDBFromBackup0(JNIEnv* env, const char* cwal_dir = env->GetStringUTFChars(jwal_dir, 0); auto rdb = reinterpret_cast(jhandle); - rocksdb::Status s = - rdb->RestoreDBFromBackup(jbackup_id, cdb_dir, cwal_dir, *opt); + rocksdb::Status s = rdb->RestoreDBFromBackup( + static_cast(jbackup_id), cdb_dir, cwal_dir, *opt); env->ReleaseStringUTFChars(jdb_dir, cdb_dir); env->ReleaseStringUTFChars(jwal_dir, cwal_dir); - if(!s.ok()) { + if (!s.ok()) { rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } } @@ -97,7 +96,7 @@ void Java_org_rocksdb_RestoreBackupableDB_restoreDBFromLatestBackup0( env->ReleaseStringUTFChars(jdb_dir, cdb_dir); env->ReleaseStringUTFChars(jwal_dir, cwal_dir); - if(!s.ok()) { + if (!s.ok()) { rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } } @@ -112,7 +111,7 @@ void Java_org_rocksdb_RestoreBackupableDB_purgeOldBackups0(JNIEnv* env, auto rdb = reinterpret_cast(jhandle); rocksdb::Status s = rdb->PurgeOldBackups(jnum_backups_to_keep); - if(!s.ok()) { + if 
(!s.ok()) { rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } } @@ -120,14 +119,73 @@ void Java_org_rocksdb_RestoreBackupableDB_purgeOldBackups0(JNIEnv* env, /* * Class: org_rocksdb_RestoreBackupableDB * Method: deleteBackup0 - * Signature: (JJ)V + * Signature: (JI)V */ void Java_org_rocksdb_RestoreBackupableDB_deleteBackup0(JNIEnv* env, - jobject jobj, jlong jhandle, jlong jbackup_id) { + jobject jobj, jlong jhandle, jint jbackup_id) { auto rdb = reinterpret_cast(jhandle); rocksdb::Status s = rdb->DeleteBackup(jbackup_id); - if(!s.ok()) { + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_RestoreBackupableDB + * Method: getBackupInfo + * Signature: (J)Ljava/util/List; + */ +jobject Java_org_rocksdb_RestoreBackupableDB_getBackupInfo( + JNIEnv* env, jobject jbdb, jlong jhandle) { + std::vector backup_infos; + reinterpret_cast(jhandle)-> + GetBackupInfo(&backup_infos); + return rocksdb::BackupInfoListJni::getBackupInfo(env, + backup_infos); +} + +/* + * Class: org_rocksdb_RestoreBackupableDB + * Method: getCorruptedBackups + * Signature: (J)[I; + */ +jintArray Java_org_rocksdb_RestoreBackupableDB_getCorruptedBackups( + JNIEnv* env, jobject jbdb, jlong jhandle) { + std::vector backup_ids; + reinterpret_cast(jhandle)-> + GetCorruptedBackups(&backup_ids); + // store backupids in int array + const std::vector::size_type + kIdSize = backup_ids.size(); + + int int_backup_ids[kIdSize]; + for (std::vector::size_type i = 0; + i != kIdSize; i++) { + int_backup_ids[i] = backup_ids[i]; + } + // Store ints in java array + jintArray ret_backup_ids; + // Its ok to loose precision here (64->32) + jsize ret_backup_ids_size = static_cast(kIdSize); + ret_backup_ids = env->NewIntArray(ret_backup_ids_size); + env->SetIntArrayRegion(ret_backup_ids, 0, ret_backup_ids_size, + int_backup_ids); + return ret_backup_ids; +} + +/* + * Class: org_rocksdb_RestoreBackupableDB + * Method: garbageCollect + * Signature: (J)V + */ +void 
Java_org_rocksdb_RestoreBackupableDB_garbageCollect( + JNIEnv* env, jobject jobj, jlong jhandle) { + auto db = reinterpret_cast( + jhandle); + rocksdb::Status s = db->GarbageCollect(); + + if (!s.ok()) { rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } } diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index f55290f64..54e449f53 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -6,16 +6,18 @@ // This file implements the "bridge" between Java and C++ and enables // calling c++ rocksdb::DB methods from Java side. +#include #include #include -#include +#include #include #include #include "include/org_rocksdb_RocksDB.h" -#include "rocksjni/portal.h" #include "rocksdb/db.h" #include "rocksdb/cache.h" +#include "rocksdb/types.h" +#include "rocksjni/portal.h" ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::Open @@ -25,7 +27,7 @@ * Method: open * Signature: (JLjava/lang/String;)V */ -void Java_org_rocksdb_RocksDB_open( +void Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2( JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path) { auto opt = reinterpret_cast(jopt_handle); rocksdb::DB* db = nullptr; @@ -40,26 +42,280 @@ void Java_org_rocksdb_RocksDB_open( rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } +/* + * Class: org_rocksdb_RocksDB + * Method: openROnly + * Signature: (JLjava/lang/String;)V + */ +void Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2( + JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path) { + auto opt = reinterpret_cast(jopt_handle); + rocksdb::DB* db = nullptr; + const char* db_path = env->GetStringUTFChars(jdb_path, 0); + rocksdb::Status s = rocksdb::DB::OpenForReadOnly(*opt, + db_path, &db); + env->ReleaseStringUTFChars(jdb_path, db_path); + + if (s.ok()) { + rocksdb::RocksDBJni::setHandle(env, jdb, db); + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: openROnly + * 
Signature: (JLjava/lang/String;Ljava/util/List;I)Ljava/util/List; + */ +jobject + Java_org_rocksdb_RocksDB_openROnly__JLjava_lang_String_2Ljava_util_List_2I( + JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path, + jobject jcfdesc_list, jint jcfdesc_count) { + auto opt = reinterpret_cast(jopt_handle); + rocksdb::DB* db = nullptr; + const char* db_path = env->GetStringUTFChars(jdb_path, 0); + + std::vector cfnames_to_free; + // the zero-terminated version of cfnames_to_free. + std::vector c_cfnames_to_free; + std::vector jcfnames_for_free; + + std::vector column_families; + std::vector handles; + // get iterator for ColumnFamilyDescriptors + jobject iteratorObj = env->CallObjectMethod( + jcfdesc_list, rocksdb::ListJni::getIteratorMethod(env)); + + // iterate over ColumnFamilyDescriptors + while (env->CallBooleanMethod( + iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { + // get ColumnFamilyDescriptor + jobject jcf_descriptor = env->CallObjectMethod(iteratorObj, + rocksdb::ListJni::getNextMethod(env)); + // get ColumnFamilyName + jbyteArray byteArray = static_cast(env->CallObjectMethod( + jcf_descriptor, + rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( + env))); + // get CF Options + jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor, + rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( + env)); + rocksdb::ColumnFamilyOptions* cfOptions = + rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); + + jbyte* cfname = env->GetByteArrayElements(byteArray, 0); + const int len = env->GetArrayLength(byteArray) + 1; + char* c_cfname = new char[len]; + memcpy(c_cfname, cfname, len - 1); + c_cfname[len - 1] = 0; + + // free allocated cfnames after call to open + cfnames_to_free.push_back(cfname); + c_cfnames_to_free.push_back(c_cfname); + jcfnames_for_free.push_back(byteArray); + column_families.push_back(rocksdb::ColumnFamilyDescriptor( + c_cfname, *cfOptions)); + } + + rocksdb::Status s = 
rocksdb::DB::OpenForReadOnly(*opt, + db_path, column_families, &handles, &db); + env->ReleaseStringUTFChars(jdb_path, db_path); + // free jbyte allocations + for (std::vector::size_type i = 0; + i != cfnames_to_free.size(); i++) { + // free cfnames + env->ReleaseByteArrayElements(jcfnames_for_free[i], cfnames_to_free[i], 0); + // free c_cfnames + delete[] c_cfnames_to_free[i]; + } + + // check if open operation was successful + if (s.ok()) { + rocksdb::RocksDBJni::setHandle(env, jdb, db); + jclass jListClazz = env->FindClass("java/util/ArrayList"); + jmethodID midList = rocksdb::ListJni::getArrayListConstructorMethodId( + env, jListClazz); + jobject jcfhandle_list = env->NewObject(jListClazz, + midList, handles.size()); + // insert in java list + for (std::vector::size_type i = 0; + i != handles.size(); i++) { + // jlong must be converted to Long due to collections restrictions + jclass jLongClazz = env->FindClass("java/lang/Long"); + jmethodID midLong = env->GetMethodID(jLongClazz, "", "(J)V"); + jobject obj = env->NewObject(jLongClazz, midLong, + reinterpret_cast(handles[i])); + env->CallBooleanMethod(jcfhandle_list, + rocksdb::ListJni::getListAddMethodId(env), obj); + } + + return jcfhandle_list; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return nullptr; +} + +/* + * Class: org_rocksdb_RocksDB + * Method: open + * Signature: (JLjava/lang/String;Ljava/util/List;I)Ljava/util/List; + */ +jobject Java_org_rocksdb_RocksDB_open__JLjava_lang_String_2Ljava_util_List_2I( + JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path, + jobject jcfdesc_list, jint jcfdesc_count) { + auto opt = reinterpret_cast(jopt_handle); + rocksdb::DB* db = nullptr; + const char* db_path = env->GetStringUTFChars(jdb_path, 0); + + std::vector cfnames_to_free; + // the zero-terminated version of cfnames_to_free. 
+ std::vector c_cfnames_to_free; + std::vector jcfnames_for_free; + + std::vector column_families; + std::vector handles; + // get iterator for ColumnFamilyDescriptors + jobject iteratorObj = env->CallObjectMethod( + jcfdesc_list, rocksdb::ListJni::getIteratorMethod(env)); + + // iterate over ColumnFamilyDescriptors + while (env->CallBooleanMethod( + iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { + // get ColumnFamilyDescriptor + jobject jcf_descriptor = env->CallObjectMethod(iteratorObj, + rocksdb::ListJni::getNextMethod(env)); + // get ColumnFamilyName + jbyteArray byteArray = static_cast(env->CallObjectMethod( + jcf_descriptor, + rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( + env))); + // get CF Options + jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor, + rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( + env)); + rocksdb::ColumnFamilyOptions* cfOptions = + rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); + + jbyte* cfname = env->GetByteArrayElements(byteArray, 0); + const int len = env->GetArrayLength(byteArray) + 1; + char* c_cfname = new char[len]; + memcpy(c_cfname, cfname, len - 1); + c_cfname[len - 1] = 0; + + // free allocated cfnames after call to open + cfnames_to_free.push_back(cfname); + c_cfnames_to_free.push_back(c_cfname); + jcfnames_for_free.push_back(byteArray); + column_families.push_back(rocksdb::ColumnFamilyDescriptor( + c_cfname, *cfOptions)); + } + + rocksdb::Status s = rocksdb::DB::Open(*opt, db_path, column_families, + &handles, &db); + env->ReleaseStringUTFChars(jdb_path, db_path); + // free jbyte allocations + for (std::vector::size_type i = 0; + i != cfnames_to_free.size(); i++) { + // free cfnames + env->ReleaseByteArrayElements(jcfnames_for_free[i], cfnames_to_free[i], 0); + // free c_cfnames + delete[] c_cfnames_to_free[i]; + } + + // check if open operation was successful + if (s.ok()) { + rocksdb::RocksDBJni::setHandle(env, jdb, db); + jclass 
jListClazz = env->FindClass("java/util/ArrayList"); + jmethodID midList = rocksdb::ListJni::getArrayListConstructorMethodId( + env, jListClazz); + jobject jcfhandle_list = env->NewObject(jListClazz, + midList, handles.size()); + // insert in java list + for (std::vector::size_type i = 0; + i != handles.size(); i++) { + // jlong must be converted to Long due to collections restrictions + jclass jLongClazz = env->FindClass("java/lang/Long"); + jmethodID midLong = env->GetMethodID(jLongClazz, "", "(J)V"); + jobject obj = env->NewObject(jLongClazz, midLong, + reinterpret_cast(handles[i])); + env->CallBooleanMethod(jcfhandle_list, + rocksdb::ListJni::getListAddMethodId(env), obj); + } + + return jcfhandle_list; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return nullptr; +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::ListColumnFamilies + +/* + * Class: org_rocksdb_RocksDB + * Method: listColumnFamilies + * Signature: (JLjava/lang/String;)Ljava/util/List; + */ +jobject Java_org_rocksdb_RocksDB_listColumnFamilies( + JNIEnv* env, jclass jclazz, jlong jopt_handle, jstring jdb_path) { + std::vector column_family_names; + auto opt = reinterpret_cast(jopt_handle); + const char* db_path = env->GetStringUTFChars(jdb_path, 0); + jobject jvalue_list = nullptr; + + rocksdb::Status s = rocksdb::DB::ListColumnFamilies(*opt, db_path, + &column_family_names); + env->ReleaseStringUTFChars(jdb_path, db_path); + if (s.ok()) { + // Don't reuse class pointer + jclass jListClazz = env->FindClass("java/util/ArrayList"); + jmethodID mid = rocksdb::ListJni::getArrayListConstructorMethodId(env, + jListClazz); + jvalue_list = env->NewObject(jListClazz, mid, column_family_names.size()); + + for (std::vector::size_type i = 0; + i < column_family_names.size(); i++) { + jbyteArray jcf_value = + env->NewByteArray(static_cast(column_family_names[i].size())); + env->SetByteArrayRegion( + jcf_value, 0, 
static_cast(column_family_names[i].size()), + reinterpret_cast(column_family_names[i].c_str())); + env->CallBooleanMethod(jvalue_list, + rocksdb::ListJni::getListAddMethodId(env), jcf_value); + } + } + return jvalue_list; +} + ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::Put void rocksdb_put_helper( JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options, - jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { + rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len) { jbyte* key = env->GetByteArrayElements(jkey, 0); - jbyte* value = env->GetByteArrayElements(jvalue, 0); + jbyte* value = env->GetByteArrayElements(jentry_value, 0); rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); - rocksdb::Slice value_slice(reinterpret_cast(value), jvalue_len); + rocksdb::Slice value_slice(reinterpret_cast(value), + jentry_value_len); - rocksdb::Status s = db->Put(write_options, key_slice, value_slice); + rocksdb::Status s; + if (cf_handle != nullptr) { + s = db->Put(write_options, cf_handle, key_slice, value_slice); + } else { + // backwards compatibility + s = db->Put(write_options, key_slice, value_slice); + } // trigger java unref on key and value. // by passing JNI_ABORT, it will simply release the reference without // copying the result back to the java byte array. 
env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); - env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT); + env->ReleaseByteArrayElements(jentry_value, value, JNI_ABORT); if (s.ok()) { return; @@ -75,14 +331,35 @@ void rocksdb_put_helper( void Java_org_rocksdb_RocksDB_put__J_3BI_3BI( JNIEnv* env, jobject jdb, jlong jdb_handle, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { + jbyteArray jentry_value, jint jentry_value_len) { auto db = reinterpret_cast(jdb_handle); static const rocksdb::WriteOptions default_write_options = rocksdb::WriteOptions(); - rocksdb_put_helper(env, db, default_write_options, + rocksdb_put_helper(env, db, default_write_options, nullptr, jkey, jkey_len, - jvalue, jvalue_len); + jentry_value, jentry_value_len); +} +/* + * Class: org_rocksdb_RocksDB + * Method: put + * Signature: (J[BI[BIJ)V + */ +void Java_org_rocksdb_RocksDB_put__J_3BI_3BIJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { + auto db = reinterpret_cast(jdb_handle); + static const rocksdb::WriteOptions default_write_options = + rocksdb::WriteOptions(); + auto cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + rocksdb_put_helper(env, db, default_write_options, cf_handle, + jkey, jkey_len, jentry_value, jentry_value_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + } } /* @@ -94,52 +371,205 @@ void Java_org_rocksdb_RocksDB_put__JJ_3BI_3BI( JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jwrite_options_handle, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { + jbyteArray jentry_value, jint jentry_value_len) { auto db = reinterpret_cast(jdb_handle); auto write_options = reinterpret_cast( jwrite_options_handle); - rocksdb_put_helper(env, db, *write_options, + rocksdb_put_helper(env, db, *write_options, nullptr, jkey, jkey_len, 
- jvalue, jvalue_len); + jentry_value, jentry_value_len); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: put + * Signature: (JJ[BI[BIJ)V + */ +void Java_org_rocksdb_RocksDB_put__JJ_3BI_3BIJ( + JNIEnv* env, jobject jdb, + jlong jdb_handle, jlong jwrite_options_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { + auto db = reinterpret_cast(jdb_handle); + auto write_options = reinterpret_cast( + jwrite_options_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + rocksdb_put_helper(env, db, *write_options, cf_handle, + jkey, jkey_len, jentry_value, jentry_value_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + } } ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::Write /* * Class: org_rocksdb_RocksDB - * Method: write + * Method: write0 * Signature: (JJ)V */ -void Java_org_rocksdb_RocksDB_write( +void Java_org_rocksdb_RocksDB_write0( JNIEnv* env, jobject jdb, - jlong jwrite_options_handle, jlong jbatch_handle) { + jlong jwrite_options_handle, jlong jwb_handle) { rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); - auto write_options = reinterpret_cast( + auto* write_options = reinterpret_cast( + jwrite_options_handle); + auto* wb = reinterpret_cast(jwb_handle); + + rocksdb::Status s = db->Write(*write_options, wb); + + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_RocksDB + * Method: write1 + * Signature: (JJ)V + */ +void Java_org_rocksdb_RocksDB_write1( + JNIEnv* env, jobject jdb, + jlong jwrite_options_handle, jlong jwbwi_handle) { + rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); + auto* write_options = reinterpret_cast( jwrite_options_handle); - auto batch = reinterpret_cast(jbatch_handle); + auto* wbwi = reinterpret_cast(jwbwi_handle); + auto* wb = 
wbwi->GetWriteBatch(); - rocksdb::Status s = db->Write(*write_options, batch); + rocksdb::Status s = db->Write(*write_options, wb); if (!s.ok()) { rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } } +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::KeyMayExist +jboolean key_may_exist_helper(JNIEnv* env, rocksdb::DB* db, + const rocksdb::ReadOptions& read_opt, + rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jkey, jint jkey_len, + jobject jstring_buffer) { + std::string value; + bool value_found = false; + jboolean isCopy; + jbyte* key = env->GetByteArrayElements(jkey, &isCopy); + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); + bool keyMayExist; + if (cf_handle != nullptr) { + keyMayExist = db->KeyMayExist(read_opt, cf_handle, key_slice, + &value, &value_found); + } else { + keyMayExist = db->KeyMayExist(read_opt, key_slice, + &value, &value_found); + } + + if (value_found && !value.empty()) { + jclass clazz = env->GetObjectClass(jstring_buffer); + jmethodID mid = env->GetMethodID(clazz, "append", + "(Ljava/lang/String;)Ljava/lang/StringBuffer;"); + jstring new_value_str = env->NewStringUTF(value.c_str()); + env->CallObjectMethod(jstring_buffer, mid, new_value_str); + } + env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + return static_cast(keyMayExist); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: keyMayExist + * Signature: ([BILjava/lang/StringBuffer;)Z + */ +jboolean Java_org_rocksdb_RocksDB_keyMayExist___3BILjava_lang_StringBuffer_2( + JNIEnv* env, jobject jdb, jbyteArray jkey, jint jkey_len, + jobject jstring_buffer) { + rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); + return key_may_exist_helper(env, db, rocksdb::ReadOptions(), + nullptr, jkey, jkey_len, jstring_buffer); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: keyMayExist + * Signature: ([BIJLjava/lang/StringBuffer;)Z + */ +jboolean Java_org_rocksdb_RocksDB_keyMayExist___3BIJLjava_lang_StringBuffer_2( + JNIEnv* 
env, jobject jdb, jbyteArray jkey, jint jkey_len, + jlong jcf_handle, jobject jstring_buffer) { + rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); + auto cf_handle = reinterpret_cast( + jcf_handle); + if (cf_handle != nullptr) { + return key_may_exist_helper(env, db, rocksdb::ReadOptions(), + cf_handle, jkey, jkey_len, jstring_buffer); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + } + return true; +} + +/* + * Class: org_rocksdb_RocksDB + * Method: keyMayExist + * Signature: (J[BILjava/lang/StringBuffer;)Z + */ +jboolean Java_org_rocksdb_RocksDB_keyMayExist__J_3BILjava_lang_StringBuffer_2( + JNIEnv* env, jobject jdb, jlong jread_options_handle, + jbyteArray jkey, jint jkey_len, jobject jstring_buffer) { + rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); + auto& read_options = *reinterpret_cast( + jread_options_handle); + return key_may_exist_helper(env, db, read_options, + nullptr, jkey, jkey_len, jstring_buffer); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: keyMayExist + * Signature: (J[BIJLjava/lang/StringBuffer;)Z + */ +jboolean Java_org_rocksdb_RocksDB_keyMayExist__J_3BIJLjava_lang_StringBuffer_2( + JNIEnv* env, jobject jdb, jlong jread_options_handle, + jbyteArray jkey, jint jkey_len, jlong jcf_handle, jobject jstring_buffer) { + rocksdb::DB* db = rocksdb::RocksDBJni::getHandle(env, jdb); + auto& read_options = *reinterpret_cast( + jread_options_handle); + auto cf_handle = reinterpret_cast( + jcf_handle); + if (cf_handle != nullptr) { + return key_may_exist_helper(env, db, read_options, cf_handle, + jkey, jkey_len, jstring_buffer); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + } + return true; +} + ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::Get jbyteArray rocksdb_get_helper( JNIEnv* env, rocksdb::DB* db, const 
rocksdb::ReadOptions& read_opt, - jbyteArray jkey, jint jkey_len) { + rocksdb::ColumnFamilyHandle* column_family_handle, jbyteArray jkey, + jint jkey_len) { jboolean isCopy; jbyte* key = env->GetByteArrayElements(jkey, &isCopy); rocksdb::Slice key_slice( reinterpret_cast(key), jkey_len); std::string value; - rocksdb::Status s = db->Get( - read_opt, key_slice, &value); + rocksdb::Status s; + if (column_family_handle != nullptr) { + s = db->Get(read_opt, column_family_handle, key_slice, &value); + } else { + // backwards compatibility + s = db->Get(read_opt, key_slice, &value); + } // trigger java unref on key. // by passing JNI_ABORT, it will simply release the reference without @@ -151,11 +581,10 @@ jbyteArray rocksdb_get_helper( } if (s.ok()) { - jbyteArray jvalue = env->NewByteArray(value.size()); - env->SetByteArrayRegion( - jvalue, 0, value.size(), - reinterpret_cast(value.c_str())); - return jvalue; + jbyteArray jret_value = env->NewByteArray(static_cast(value.size())); + env->SetByteArrayRegion(jret_value, 0, static_cast(value.size()), + reinterpret_cast(value.c_str())); + return jret_value; } rocksdb::RocksDBExceptionJni::ThrowNew(env, s); @@ -172,10 +601,31 @@ jbyteArray Java_org_rocksdb_RocksDB_get__J_3BI( jbyteArray jkey, jint jkey_len) { return rocksdb_get_helper(env, reinterpret_cast(jdb_handle), - rocksdb::ReadOptions(), + rocksdb::ReadOptions(), nullptr, jkey, jkey_len); } +/* + * Class: org_rocksdb_RocksDB + * Method: get + * Signature: (J[BIJ)[B + */ +jbyteArray Java_org_rocksdb_RocksDB_get__J_3BIJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jbyteArray jkey, jint jkey_len, jlong jcf_handle) { + auto db_handle = reinterpret_cast(jdb_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + return rocksdb_get_helper(env, db_handle, rocksdb::ReadOptions(), + cf_handle, jkey, jkey_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + // 
will never be evaluated + return env->NewByteArray(0); + } +} + /* * Class: org_rocksdb_RocksDB * Method: get @@ -186,14 +636,36 @@ jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BI( jbyteArray jkey, jint jkey_len) { return rocksdb_get_helper(env, reinterpret_cast(jdb_handle), - *reinterpret_cast(jropt_handle), + *reinterpret_cast(jropt_handle), nullptr, jkey, jkey_len); } +/* + * Class: org_rocksdb_RocksDB + * Method: get + * Signature: (JJ[BIJ)[B + */ +jbyteArray Java_org_rocksdb_RocksDB_get__JJ_3BIJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle, + jbyteArray jkey, jint jkey_len, jlong jcf_handle) { + auto db_handle = reinterpret_cast(jdb_handle); + auto& ro_opt = *reinterpret_cast(jropt_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + return rocksdb_get_helper(env, db_handle, ro_opt, cf_handle, + jkey, jkey_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + // will never be evaluated + return env->NewByteArray(0); + } +} + jint rocksdb_get_helper( JNIEnv* env, rocksdb::DB* db, const rocksdb::ReadOptions& read_options, - jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { + rocksdb::ColumnFamilyHandle* column_family_handle, jbyteArray jkey, + jint jkey_len, jbyteArray jentry_value, jint jentry_value_len) { static const int kNotFound = -1; static const int kStatusError = -2; @@ -204,8 +676,13 @@ jint rocksdb_get_helper( // TODO(yhchiang): we might save one memory allocation here by adding // a DB::Get() function which takes preallocated jbyte* as input. std::string cvalue; - rocksdb::Status s = db->Get( - read_options, key_slice, &cvalue); + rocksdb::Status s; + if (column_family_handle != nullptr) { + s = db->Get(read_options, column_family_handle, key_slice, &cvalue); + } else { + // backwards compatibility + s = db->Get(read_options, key_slice, &cvalue); + } // trigger java unref on key. 
// by passing JNI_ABORT, it will simply release the reference without @@ -228,25 +705,45 @@ jint rocksdb_get_helper( } int cvalue_len = static_cast(cvalue.size()); - int length = std::min(jvalue_len, cvalue_len); + int length = std::min(jentry_value_len, cvalue_len); env->SetByteArrayRegion( - jvalue, 0, length, + jentry_value, 0, length, reinterpret_cast(cvalue.c_str())); return cvalue_len; } +// cf multi get jobject multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db, - const rocksdb::ReadOptions& rOpt, jobject jkey_list, jint jkeys_count) { + const rocksdb::ReadOptions& rOpt, jobject jkey_list, jint jkeys_count, + jobject jcfhandle_list) { std::vector keys; std::vector keys_to_free; + std::vector cf_handles; + if (jcfhandle_list != nullptr) { + // get cf iterator + jobject cfIteratorObj = env->CallObjectMethod( + jcfhandle_list, rocksdb::ListJni::getIteratorMethod(env)); + + // iterate over keys and convert java byte array to slice + while (env->CallBooleanMethod( + cfIteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { + jobject jobj = (jbyteArray) env->CallObjectMethod( + cfIteratorObj, rocksdb::ListJni::getNextMethod(env)); + rocksdb::ColumnFamilyHandle* cfHandle = + rocksdb::ColumnFamilyHandleJni::getHandle(env, jobj); + cf_handles.push_back(cfHandle); + } + } + + // Process key list // get iterator jobject iteratorObj = env->CallObjectMethod( jkey_list, rocksdb::ListJni::getIteratorMethod(env)); // iterate over keys and convert java byte array to slice - while(env->CallBooleanMethod( + while (env->CallBooleanMethod( iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { jbyteArray jkey = (jbyteArray) env->CallObjectMethod( iteratorObj, rocksdb::ListJni::getNextMethod(env)); @@ -263,7 +760,12 @@ jobject multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db, } std::vector values; - std::vector s = db->MultiGet(rOpt, keys, &values); + std::vector s; + if (cf_handles.size() == 0) { + s = db->MultiGet(rOpt, keys, 
&values); + } else { + s = db->MultiGet(rOpt, cf_handles, keys, &values); + } // Don't reuse class pointer jclass jclazz = env->FindClass("java/util/ArrayList"); @@ -272,27 +774,26 @@ jobject multi_get_helper(JNIEnv* env, jobject jdb, rocksdb::DB* db, jobject jvalue_list = env->NewObject(jclazz, mid, jkeys_count); // insert in java list - for(std::vector::size_type i = 0; i != s.size(); i++) { - if(s[i].ok()) { - jbyteArray jvalue = env->NewByteArray(values[i].size()); + for (std::vector::size_type i = 0; i != s.size(); i++) { + if (s[i].ok()) { + jbyteArray jentry_value = + env->NewByteArray(static_cast(values[i].size())); env->SetByteArrayRegion( - jvalue, 0, values[i].size(), + jentry_value, 0, static_cast(values[i].size()), reinterpret_cast(values[i].c_str())); env->CallBooleanMethod( - jvalue_list, rocksdb::ListJni::getListAddMethodId(env), jvalue); - } - else { + jvalue_list, rocksdb::ListJni::getListAddMethodId(env), + jentry_value); + } else { env->CallBooleanMethod( jvalue_list, rocksdb::ListJni::getListAddMethodId(env), nullptr); } } - // free up allocated byte arrays - for(std::vector::size_type i = 0; i != keys_to_free.size(); i++) { + for (std::vector::size_type i = 0; i != keys_to_free.size(); i++) { delete[] keys_to_free[i]; } keys_to_free.clear(); - return jvalue_list; } @@ -305,7 +806,20 @@ jobject Java_org_rocksdb_RocksDB_multiGet__JLjava_util_List_2I( JNIEnv* env, jobject jdb, jlong jdb_handle, jobject jkey_list, jint jkeys_count) { return multi_get_helper(env, jdb, reinterpret_cast(jdb_handle), - rocksdb::ReadOptions(), jkey_list, jkeys_count); + rocksdb::ReadOptions(), jkey_list, jkeys_count, nullptr); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: multiGet + * Signature: (JLjava/util/List;ILjava/util/List;)Ljava/util/List; + */ +jobject + Java_org_rocksdb_RocksDB_multiGet__JLjava_util_List_2ILjava_util_List_2( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jobject jkey_list, jint jkeys_count, jobject jcfhandle_list) { + return 
multi_get_helper(env, jdb, reinterpret_cast(jdb_handle), + rocksdb::ReadOptions(), jkey_list, jkeys_count, jcfhandle_list); } /* @@ -318,7 +832,22 @@ jobject Java_org_rocksdb_RocksDB_multiGet__JJLjava_util_List_2I( jlong jropt_handle, jobject jkey_list, jint jkeys_count) { return multi_get_helper(env, jdb, reinterpret_cast(jdb_handle), *reinterpret_cast(jropt_handle), jkey_list, - jkeys_count); + jkeys_count, nullptr); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: multiGet + * Signature: (JJLjava/util/List;ILjava/util/List;)Ljava/util/List; + */ +jobject + Java_org_rocksdb_RocksDB_multiGet__JJLjava_util_List_2ILjava_util_List_2( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jlong jropt_handle, jobject jkey_list, jint jkeys_count, + jobject jcfhandle_list) { + return multi_get_helper(env, jdb, reinterpret_cast(jdb_handle), + *reinterpret_cast(jropt_handle), jkey_list, + jkeys_count, jcfhandle_list); } /* @@ -329,11 +858,33 @@ jobject Java_org_rocksdb_RocksDB_multiGet__JJLjava_util_List_2I( jint Java_org_rocksdb_RocksDB_get__J_3BI_3BI( JNIEnv* env, jobject jdb, jlong jdb_handle, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { + jbyteArray jentry_value, jint jentry_value_len) { return rocksdb_get_helper(env, reinterpret_cast(jdb_handle), - rocksdb::ReadOptions(), - jkey, jkey_len, jvalue, jvalue_len); + rocksdb::ReadOptions(), nullptr, + jkey, jkey_len, jentry_value, jentry_value_len); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: get + * Signature: (J[BI[BIJ)I + */ +jint Java_org_rocksdb_RocksDB_get__J_3BI_3BIJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { + auto db_handle = reinterpret_cast(jdb_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + return rocksdb_get_helper(env, db_handle, rocksdb::ReadOptions(), cf_handle, + jkey, jkey_len, jentry_value, jentry_value_len); + } else { + 
rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + // will never be evaluated + return 0; + } } /* @@ -344,23 +895,50 @@ jint Java_org_rocksdb_RocksDB_get__J_3BI_3BI( jint Java_org_rocksdb_RocksDB_get__JJ_3BI_3BI( JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { + jbyteArray jentry_value, jint jentry_value_len) { return rocksdb_get_helper(env, reinterpret_cast(jdb_handle), *reinterpret_cast(jropt_handle), - jkey, jkey_len, jvalue, jvalue_len); + nullptr, jkey, jkey_len, jentry_value, jentry_value_len); } +/* + * Class: org_rocksdb_RocksDB + * Method: get + * Signature: (JJ[BI[BIJ)I + */ +jint Java_org_rocksdb_RocksDB_get__JJ_3BI_3BIJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jropt_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { + auto db_handle = reinterpret_cast(jdb_handle); + auto& ro_opt = *reinterpret_cast(jropt_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + return rocksdb_get_helper(env, db_handle, ro_opt, cf_handle, jkey, + jkey_len, jentry_value, jentry_value_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + // will never be evaluated + return 0; + } +} ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::Delete() void rocksdb_remove_helper( JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options, - jbyteArray jkey, jint jkey_len) { + rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jkey, jint jkey_len) { jbyte* key = env->GetByteArrayElements(jkey, 0); rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); - rocksdb::Status s = db->Delete(write_options, key_slice); - + rocksdb::Status s; + if (cf_handle != nullptr) { + s = db->Delete(write_options, 
cf_handle, key_slice); + } else { + // backwards compatibility + s = db->Delete(write_options, key_slice); + } // trigger java unref on key and value. // by passing JNI_ABORT, it will simply release the reference without // copying the result back to the java byte array. @@ -383,45 +961,709 @@ void Java_org_rocksdb_RocksDB_remove__J_3BI( auto db = reinterpret_cast(jdb_handle); static const rocksdb::WriteOptions default_write_options = rocksdb::WriteOptions(); + rocksdb_remove_helper(env, db, default_write_options, nullptr, + jkey, jkey_len); +} - rocksdb_remove_helper(env, db, default_write_options, jkey, jkey_len); +/* + * Class: org_rocksdb_RocksDB + * Method: remove + * Signature: (J[BIJ)V + */ +void Java_org_rocksdb_RocksDB_remove__J_3BIJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jbyteArray jkey, jint jkey_len, jlong jcf_handle) { + auto db = reinterpret_cast(jdb_handle); + static const rocksdb::WriteOptions default_write_options = + rocksdb::WriteOptions(); + auto cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + rocksdb_remove_helper(env, db, default_write_options, cf_handle, + jkey, jkey_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + } } /* * Class: org_rocksdb_RocksDB * Method: remove - * Signature: (JJ[BI)V + * Signature: (JJ[BIJ)V */ void Java_org_rocksdb_RocksDB_remove__JJ_3BI( JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jwrite_options, jbyteArray jkey, jint jkey_len) { auto db = reinterpret_cast(jdb_handle); auto write_options = reinterpret_cast(jwrite_options); - - rocksdb_remove_helper(env, db, *write_options, jkey, jkey_len); + rocksdb_remove_helper(env, db, *write_options, nullptr, jkey, jkey_len); } -////////////////////////////////////////////////////////////////////////////// -// rocksdb::DB::~DB() - /* * Class: org_rocksdb_RocksDB - * Method: disposeInternal - * Signature: (J)V + * Method: remove + * Signature: (JJ[BIJ)V */ 
-void Java_org_rocksdb_RocksDB_disposeInternal( +void Java_org_rocksdb_RocksDB_remove__JJ_3BIJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jlong jwrite_options, jbyteArray jkey, jint jkey_len, + jlong jcf_handle) { + auto db = reinterpret_cast(jdb_handle); + auto write_options = reinterpret_cast(jwrite_options); + auto cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + rocksdb_remove_helper(env, db, *write_options, cf_handle, jkey, jkey_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + } +} +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::Merge + +void rocksdb_merge_helper( + JNIEnv* env, rocksdb::DB* db, const rocksdb::WriteOptions& write_options, + rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len) { + + jbyte* key = env->GetByteArrayElements(jkey, 0); + jbyte* value = env->GetByteArrayElements(jentry_value, 0); + rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); + rocksdb::Slice value_slice(reinterpret_cast(value), + jentry_value_len); + + rocksdb::Status s; + if (cf_handle != nullptr) { + s = db->Merge(write_options, cf_handle, key_slice, value_slice); + } else { + s = db->Merge(write_options, key_slice, value_slice); + } + + // trigger java unref on key and value. + // by passing JNI_ABORT, it will simply release the reference without + // copying the result back to the java byte array. 
+ env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); + env->ReleaseByteArrayElements(jentry_value, value, JNI_ABORT); + + if (s.ok()) { + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: merge + * Signature: (J[BI[BI)V + */ +void Java_org_rocksdb_RocksDB_merge__J_3BI_3BI( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len) { + auto db = reinterpret_cast(jdb_handle); + static const rocksdb::WriteOptions default_write_options = + rocksdb::WriteOptions(); + + rocksdb_merge_helper(env, db, default_write_options, + nullptr, jkey, jkey_len, jentry_value, jentry_value_len); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: merge + * Signature: (J[BI[BIJ)V + */ +void Java_org_rocksdb_RocksDB_merge__J_3BI_3BIJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { + auto db = reinterpret_cast(jdb_handle); + static const rocksdb::WriteOptions default_write_options = + rocksdb::WriteOptions(); + auto cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + rocksdb_merge_helper(env, db, default_write_options, + cf_handle, jkey, jkey_len, jentry_value, jentry_value_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + } +} + +/* + * Class: org_rocksdb_RocksDB + * Method: merge + * Signature: (JJ[BI[BI)V + */ +void Java_org_rocksdb_RocksDB_merge__JJ_3BI_3BI( + JNIEnv* env, jobject jdb, + jlong jdb_handle, jlong jwrite_options_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len) { + auto db = reinterpret_cast(jdb_handle); + auto write_options = reinterpret_cast( + jwrite_options_handle); + + rocksdb_merge_helper(env, db, *write_options, + nullptr, jkey, jkey_len, jentry_value, jentry_value_len); +} + +/* + * 
Class: org_rocksdb_RocksDB + * Method: merge + * Signature: (JJ[BI[BIJ)V + */ +void Java_org_rocksdb_RocksDB_merge__JJ_3BI_3BIJ( + JNIEnv* env, jobject jdb, + jlong jdb_handle, jlong jwrite_options_handle, + jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { + auto db = reinterpret_cast(jdb_handle); + auto write_options = reinterpret_cast( + jwrite_options_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + if (cf_handle != nullptr) { + rocksdb_merge_helper(env, db, *write_options, + cf_handle, jkey, jkey_len, jentry_value, jentry_value_len); + } else { + rocksdb::RocksDBExceptionJni::ThrowNew(env, + rocksdb::Status::InvalidArgument("Invalid ColumnFamilyHandle.")); + } +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::~DB() + +/* + * Class: org_rocksdb_RocksDB + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_RocksDB_disposeInternal( JNIEnv* env, jobject java_db, jlong jhandle) { delete reinterpret_cast(jhandle); } +jlong rocksdb_iterator_helper( + rocksdb::DB* db, rocksdb::ReadOptions read_options, + rocksdb::ColumnFamilyHandle* cf_handle) { + rocksdb::Iterator* iterator = nullptr; + if (cf_handle != nullptr) { + iterator = db->NewIterator(read_options, cf_handle); + } else { + iterator = db->NewIterator(read_options); + } + return reinterpret_cast(iterator); +} + /* * Class: org_rocksdb_RocksDB - * Method: iterator0 + * Method: iterator * Signature: (J)J */ -jlong Java_org_rocksdb_RocksDB_iterator0( +jlong Java_org_rocksdb_RocksDB_iterator__J( JNIEnv* env, jobject jdb, jlong db_handle) { auto db = reinterpret_cast(db_handle); - rocksdb::Iterator* iterator = db->NewIterator(rocksdb::ReadOptions()); - return reinterpret_cast(iterator); + return rocksdb_iterator_helper(db, rocksdb::ReadOptions(), + nullptr); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: iterator + * Signature: (JJ)J + */ +jlong 
Java_org_rocksdb_RocksDB_iterator__JJ( + JNIEnv* env, jobject jdb, jlong db_handle, + jlong jread_options_handle) { + auto db = reinterpret_cast(db_handle); + auto& read_options = *reinterpret_cast( + jread_options_handle); + return rocksdb_iterator_helper(db, read_options, + nullptr); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: iteratorCF + * Signature: (JJ)J + */ +jlong Java_org_rocksdb_RocksDB_iteratorCF__JJ( + JNIEnv* env, jobject jdb, jlong db_handle, jlong jcf_handle) { + auto db = reinterpret_cast(db_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + return rocksdb_iterator_helper(db, rocksdb::ReadOptions(), + cf_handle); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: iteratorCF + * Signature: (JJJ)J + */ +jlong Java_org_rocksdb_RocksDB_iteratorCF__JJJ( + JNIEnv* env, jobject jdb, jlong db_handle, jlong jcf_handle, + jlong jread_options_handle) { + auto db = reinterpret_cast(db_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + auto& read_options = *reinterpret_cast( + jread_options_handle); + return rocksdb_iterator_helper(db, read_options, + cf_handle); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: iterators + * Signature: (JLjava/util/List;J)[J + */ +jlongArray Java_org_rocksdb_RocksDB_iterators( + JNIEnv* env, jobject jdb, jlong db_handle, jobject jcfhandle_list, + jlong jread_options_handle) { + auto db = reinterpret_cast(db_handle); + auto& read_options = *reinterpret_cast( + jread_options_handle); + std::vector cf_handles; + std::vector iterators; + + if (jcfhandle_list != nullptr) { + // get cf iterator + jobject cfIteratorObj = env->CallObjectMethod( + jcfhandle_list, rocksdb::ListJni::getIteratorMethod(env)); + + // iterate over keys and convert java byte array to slice + while (env->CallBooleanMethod( + cfIteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { + jobject jobj = (jbyteArray) env->CallObjectMethod( + cfIteratorObj, rocksdb::ListJni::getNextMethod(env)); + rocksdb::ColumnFamilyHandle* 
cfHandle = + rocksdb::ColumnFamilyHandleJni::getHandle(env, jobj); + cf_handles.push_back(cfHandle); + } + } + + rocksdb::Status s = db->NewIterators(read_options, + cf_handles, &iterators); + if (s.ok()) { + jlongArray jLongArray = + env->NewLongArray(static_cast(iterators.size())); + for (std::vector::size_type i = 0; i < iterators.size(); + i++) { + env->SetLongArrayRegion(jLongArray, static_cast(i), 1, + reinterpret_cast(&iterators[i])); + } + return jLongArray; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return env->NewLongArray(0); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: getDefaultColumnFamily + * Signature: (J)J + */ +jlong Java_org_rocksdb_RocksDB_getDefaultColumnFamily( + JNIEnv* env, jobject jobj, jlong jdb_handle) { + auto* db_handle = reinterpret_cast(jdb_handle); + auto* cf_handle = db_handle->DefaultColumnFamily(); + return reinterpret_cast(cf_handle); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: createColumnFamily + * Signature: (JLorg/rocksdb/ColumnFamilyDescriptor;)J; + */ +jlong Java_org_rocksdb_RocksDB_createColumnFamily( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jobject jcf_descriptor) { + rocksdb::ColumnFamilyHandle* handle; + auto db_handle = reinterpret_cast(jdb_handle); + + // get ColumnFamilyName + jbyteArray byteArray = static_cast(env->CallObjectMethod( + jcf_descriptor, + rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( + env))); + // get CF Options + jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor, + rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( + env)); + rocksdb::ColumnFamilyOptions* cfOptions = + rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); + + jbyte* cfname = env->GetByteArrayElements(byteArray, 0); + const int len = env->GetArrayLength(byteArray) + 1; + char* c_cfname = new char[len]; + memcpy(c_cfname, cfname, len - 1); + c_cfname[len - 1] = 0; + + rocksdb::Status s = db_handle->CreateColumnFamily( + *cfOptions, c_cfname, &handle); + 
env->ReleaseByteArrayElements(byteArray, cfname, 0); + delete[] c_cfname; + + if (s.ok()) { + return reinterpret_cast(handle); + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return 0; +} + +/* + * Class: org_rocksdb_RocksDB + * Method: dropColumnFamily + * Signature: (JJ)V; + */ +void Java_org_rocksdb_RocksDB_dropColumnFamily( + JNIEnv* env, jobject jdb, jlong jdb_handle, jlong jcf_handle) { + auto cf_handle = reinterpret_cast(jcf_handle); + auto db_handle = reinterpret_cast(jdb_handle); + rocksdb::Status s = db_handle->DropColumnFamily(cf_handle); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Method: getSnapshot + * Signature: (J)J + */ +jlong Java_org_rocksdb_RocksDB_getSnapshot( + JNIEnv* env, jobject jdb, jlong db_handle) { + auto db = reinterpret_cast(db_handle); + const rocksdb::Snapshot* snapshot = db->GetSnapshot(); + return reinterpret_cast(snapshot); +} + +/* + * Method: releaseSnapshot + * Signature: (JJ)V + */ +void Java_org_rocksdb_RocksDB_releaseSnapshot( + JNIEnv* env, jobject jdb, jlong db_handle, jlong snapshot_handle) { + auto db = reinterpret_cast(db_handle); + auto snapshot = reinterpret_cast(snapshot_handle); + db->ReleaseSnapshot(snapshot); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: getProperty0 + * Signature: (JLjava/lang/String;I)Ljava/lang/String; + */ +jstring Java_org_rocksdb_RocksDB_getProperty0__JLjava_lang_String_2I( + JNIEnv* env, jobject jdb, jlong db_handle, jstring jproperty, + jint jproperty_len) { + auto db = reinterpret_cast(db_handle); + + const char* property = env->GetStringUTFChars(jproperty, 0); + rocksdb::Slice property_slice(property, jproperty_len); + + std::string property_value; + bool retCode = db->GetProperty(property_slice, &property_value); + env->ReleaseStringUTFChars(jproperty, property); + + if (!retCode) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound()); + } + + return env->NewStringUTF(property_value.data()); +} + +/* + * Class: 
org_rocksdb_RocksDB + * Method: getProperty0 + * Signature: (JJLjava/lang/String;I)Ljava/lang/String; + */ +jstring Java_org_rocksdb_RocksDB_getProperty0__JJLjava_lang_String_2I( + JNIEnv* env, jobject jdb, jlong db_handle, jlong jcf_handle, + jstring jproperty, jint jproperty_len) { + auto db = reinterpret_cast(db_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + + const char* property = env->GetStringUTFChars(jproperty, 0); + rocksdb::Slice property_slice(property, jproperty_len); + + std::string property_value; + bool retCode = db->GetProperty(cf_handle, property_slice, &property_value); + env->ReleaseStringUTFChars(jproperty, property); + + if (!retCode) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound()); + } + + return env->NewStringUTF(property_value.data()); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: getLongProperty + * Signature: (JLjava/lang/String;I)L; + */ +jlong Java_org_rocksdb_RocksDB_getLongProperty__JLjava_lang_String_2I( + JNIEnv* env, jobject jdb, jlong db_handle, jstring jproperty, + jint jproperty_len) { + auto db = reinterpret_cast(db_handle); + + const char* property = env->GetStringUTFChars(jproperty, 0); + rocksdb::Slice property_slice(property, jproperty_len); + + uint64_t property_value = 0; + bool retCode = db->GetIntProperty(property_slice, &property_value); + env->ReleaseStringUTFChars(jproperty, property); + + if (!retCode) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound()); + } + return property_value; +} + +/* + * Class: org_rocksdb_RocksDB + * Method: getLongProperty + * Signature: (JJLjava/lang/String;I)L; + */ +jlong Java_org_rocksdb_RocksDB_getLongProperty__JJLjava_lang_String_2I( + JNIEnv* env, jobject jdb, jlong db_handle, jlong jcf_handle, + jstring jproperty, jint jproperty_len) { + auto db = reinterpret_cast(db_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + + const char* property = env->GetStringUTFChars(jproperty, 0); + rocksdb::Slice 
property_slice(property, jproperty_len); + + uint64_t property_value; + bool retCode = db->GetIntProperty(cf_handle, property_slice, &property_value); + env->ReleaseStringUTFChars(jproperty, property); + + if (!retCode) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, rocksdb::Status::NotFound()); + } + return property_value; +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::Flush + +void rocksdb_flush_helper( + JNIEnv* env, rocksdb::DB* db, const rocksdb::FlushOptions& flush_options, + rocksdb::ColumnFamilyHandle* column_family_handle) { + rocksdb::Status s; + if (column_family_handle != nullptr) { + s = db->Flush(flush_options, column_family_handle); + } else { + s = db->Flush(flush_options); + } + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_RocksDB + * Method: flush + * Signature: (JJ)V + */ +void Java_org_rocksdb_RocksDB_flush__JJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jlong jflush_options) { + auto db = reinterpret_cast(jdb_handle); + auto flush_options = reinterpret_cast(jflush_options); + rocksdb_flush_helper(env, db, *flush_options, nullptr); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: flush + * Signature: (JJJ)V + */ +void Java_org_rocksdb_RocksDB_flush__JJJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jlong jflush_options, jlong jcf_handle) { + auto db = reinterpret_cast(jdb_handle); + auto flush_options = reinterpret_cast(jflush_options); + auto cf_handle = reinterpret_cast(jcf_handle); + rocksdb_flush_helper(env, db, *flush_options, cf_handle); +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::CompactRange - Full + +void rocksdb_compactrange_helper(JNIEnv* env, rocksdb::DB* db, + rocksdb::ColumnFamilyHandle* cf_handle, jboolean jreduce_level, + jint jtarget_level, jint jtarget_path_id) { + + rocksdb::Status s; + if (cf_handle != nullptr) { + s = db->CompactRange(cf_handle, 
nullptr, nullptr, jreduce_level, + jtarget_level, static_cast(jtarget_path_id)); + } else { + // backwards compatibility + s = db->CompactRange(nullptr, nullptr, jreduce_level, + jtarget_level, static_cast(jtarget_path_id)); + } + + if (s.ok()) { + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: compactRange0 + * Signature: (JZII)V + */ +void Java_org_rocksdb_RocksDB_compactRange0__JZII(JNIEnv* env, + jobject jdb, jlong jdb_handle, jboolean jreduce_level, + jint jtarget_level, jint jtarget_path_id) { + auto db = reinterpret_cast(jdb_handle); + rocksdb_compactrange_helper(env, db, nullptr, jreduce_level, + jtarget_level, jtarget_path_id); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: compactRange + * Signature: (JZIIJ)V + */ +void Java_org_rocksdb_RocksDB_compactRange__JZIIJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, + jboolean jreduce_level, jint jtarget_level, + jint jtarget_path_id, jlong jcf_handle) { + auto db = reinterpret_cast(jdb_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + rocksdb_compactrange_helper(env, db, cf_handle, jreduce_level, + jtarget_level, jtarget_path_id); +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::CompactRange - Range + +void rocksdb_compactrange_helper(JNIEnv* env, rocksdb::DB* db, + rocksdb::ColumnFamilyHandle* cf_handle, jbyteArray jbegin, jint jbegin_len, + jbyteArray jend, jint jend_len, jboolean jreduce_level, jint jtarget_level, + jint jtarget_path_id) { + + jbyte* begin = env->GetByteArrayElements(jbegin, 0); + jbyte* end = env->GetByteArrayElements(jend, 0); + const rocksdb::Slice begin_slice(reinterpret_cast(begin), jbegin_len); + const rocksdb::Slice end_slice(reinterpret_cast(end), jend_len); + + rocksdb::Status s; + if (cf_handle != nullptr) { + s = db->CompactRange(cf_handle, &begin_slice, &end_slice, jreduce_level, + jtarget_level, static_cast(jtarget_path_id)); + } else { + // 
backwards compatibility + s = db->CompactRange(&begin_slice, &end_slice, jreduce_level, + jtarget_level, static_cast(jtarget_path_id)); + } + + env->ReleaseByteArrayElements(jbegin, begin, JNI_ABORT); + env->ReleaseByteArrayElements(jend, end, JNI_ABORT); + + if (s.ok()) { + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: compactRange0 + * Signature: (J[BI[BIZII)V + */ +void Java_org_rocksdb_RocksDB_compactRange0__J_3BI_3BIZII(JNIEnv* env, + jobject jdb, jlong jdb_handle, jbyteArray jbegin, jint jbegin_len, + jbyteArray jend, jint jend_len, jboolean jreduce_level, + jint jtarget_level, jint jtarget_path_id) { + auto db = reinterpret_cast(jdb_handle); + rocksdb_compactrange_helper(env, db, nullptr, jbegin, jbegin_len, + jend, jend_len, jreduce_level, jtarget_level, jtarget_path_id); +} + +/* + * Class: org_rocksdb_RocksDB + * Method: compactRange + * Signature: (JJ[BI[BIZII)V + */ +void Java_org_rocksdb_RocksDB_compactRange__J_3BI_3BIZIIJ( + JNIEnv* env, jobject jdb, jlong jdb_handle, jbyteArray jbegin, + jint jbegin_len, jbyteArray jend, jint jend_len, + jboolean jreduce_level, jint jtarget_level, + jint jtarget_path_id, jlong jcf_handle) { + auto db = reinterpret_cast(jdb_handle); + auto cf_handle = reinterpret_cast(jcf_handle); + rocksdb_compactrange_helper(env, db, cf_handle, jbegin, jbegin_len, + jend, jend_len, jreduce_level, jtarget_level, jtarget_path_id); +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::GetLatestSequenceNumber + +/* + * Class: org_rocksdb_RocksDB + * Method: getLatestSequenceNumber + * Signature: (J)V + */ +jlong Java_org_rocksdb_RocksDB_getLatestSequenceNumber(JNIEnv* env, + jobject jdb, jlong jdb_handle) { + auto* db = reinterpret_cast(jdb_handle); + return db->GetLatestSequenceNumber(); +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB enable/disable file deletions + +/* 
+ * Class: org_rocksdb_RocksDB + * Method: enableFileDeletions + * Signature: (J)V + */ +void Java_org_rocksdb_RocksDB_disableFileDeletions(JNIEnv* env, + jobject jdb, jlong jdb_handle) { + auto* db = reinterpret_cast(jdb_handle); + rocksdb::Status s = db->DisableFileDeletions(); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_RocksDB + * Method: enableFileDeletions + * Signature: (JZ)V + */ +void Java_org_rocksdb_RocksDB_enableFileDeletions(JNIEnv* env, + jobject jdb, jlong jdb_handle, jboolean jforce) { + auto* db = reinterpret_cast(jdb_handle); + rocksdb::Status s = db->EnableFileDeletions(jforce); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +////////////////////////////////////////////////////////////////////////////// +// rocksdb::DB::GetUpdatesSince + +/* + * Class: org_rocksdb_RocksDB + * Method: getUpdatesSince + * Signature: (JJ)J + */ +jlong Java_org_rocksdb_RocksDB_getUpdatesSince(JNIEnv* env, + jobject jdb, jlong jdb_handle, jlong jsequence_number) { + auto* db = reinterpret_cast(jdb_handle); + rocksdb::SequenceNumber sequence_number = + static_cast(jsequence_number); + std::unique_ptr iter; + rocksdb::Status s = db->GetUpdatesSince(sequence_number, &iter); + if (s.ok()) { + return reinterpret_cast(iter.release()); + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return 0; } diff --git a/java/rocksjni/slice.cc b/java/rocksjni/slice.cc new file mode 100644 index 000000000..811117397 --- /dev/null +++ b/java/rocksjni/slice.cc @@ -0,0 +1,259 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ for +// rocksdb::Slice. 
+ +#include +#include +#include +#include + +#include "include/org_rocksdb_AbstractSlice.h" +#include "include/org_rocksdb_Slice.h" +#include "include/org_rocksdb_DirectSlice.h" +#include "rocksdb/slice.h" +#include "rocksjni/portal.h" + +// + +/* + * Class: org_rocksdb_Slice + * Method: createNewSlice0 + * Signature: ([BI)V + */ +void Java_org_rocksdb_Slice_createNewSlice0( + JNIEnv * env, jobject jobj, jbyteArray data, jint offset) { + + const jsize dataSize = env->GetArrayLength(data); + const int len = dataSize - offset; + jbyte* ptrData = new jbyte[len]; + env->GetByteArrayRegion(data, offset, len, ptrData); + + const auto* slice = new rocksdb::Slice((const char*)ptrData, len); + rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); +} + +/* + * Class: org_rocksdb_Slice + * Method: createNewSlice1 + * Signature: ([B)V + */ +void Java_org_rocksdb_Slice_createNewSlice1( + JNIEnv * env, jobject jobj, jbyteArray data) { + + const int len = env->GetArrayLength(data) + 1; + + jboolean isCopy; + jbyte* ptrData = env->GetByteArrayElements(data, &isCopy); + char* buf = new char[len]; + + memcpy(buf, ptrData, len - 1); + buf[len-1]='\0'; + + const auto* slice = + new rocksdb::Slice(buf, len - 1); + + rocksdb::AbstractSliceJni::setHandle(env, jobj, slice); + env->ReleaseByteArrayElements(data, ptrData, JNI_ABORT); + // NOTE: buf will be deleted in the org.rocksdb.Slice#dispose method +} + +/* + * Class: org_rocksdb_Slice + * Method: data0 + * Signature: (J)[B + */ +jbyteArray Java_org_rocksdb_Slice_data0( + JNIEnv* env, jobject jobj, jlong handle) { + const auto* slice = reinterpret_cast(handle); + const int len = static_cast(slice->size()); + const jbyteArray data = env->NewByteArray(len); + env->SetByteArrayRegion(data, 0, len, + reinterpret_cast(slice->data())); + return data; +} + +/* + * Class: org_rocksdb_Slice + * Method: disposeInternalBuf + * Signature: (J)V + */ +void Java_org_rocksdb_Slice_disposeInternalBuf( + JNIEnv * env, jobject jobj, jlong handle) { + 
const auto* slice = reinterpret_cast(handle); + delete [] slice->data_; +} + +// + +// +#include +#include + +#include "include/org_rocksdb_Snapshot.h" +#include "rocksdb/db.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_Snapshot + * Method: getSequenceNumber + * Signature: (J)J + */ +jlong Java_org_rocksdb_Snapshot_getSequenceNumber(JNIEnv* env, + jobject jobj, jlong jsnapshot_handle) { + auto* snapshot = reinterpret_cast( + jsnapshot_handle); + return snapshot->GetSequenceNumber(); +} diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc index ffda1a2ba..e78e7e0d7 100644 --- a/java/rocksjni/table.cc +++ b/java/rocksjni/table.cc @@ -15,36 +15,46 @@ /* * Class: org_rocksdb_PlainTableConfig * Method: newTableFactoryHandle - * Signature: (IIDI)J + * Signature: (IIDIIBZZ)J */ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle( JNIEnv* env, jobject jobj, jint jkey_size, jint jbloom_bits_per_key, - jdouble jhash_table_ratio, jint jindex_sparseness) { + jdouble jhash_table_ratio, jint jindex_sparseness, + jint jhuge_page_tlb_size, jbyte jencoding_type, + jboolean jfull_scan_mode, jboolean jstore_index_in_file) { rocksdb::PlainTableOptions options = rocksdb::PlainTableOptions(); options.user_key_len = jkey_size; options.bloom_bits_per_key = jbloom_bits_per_key; options.hash_table_ratio = jhash_table_ratio; options.index_sparseness = jindex_sparseness; + options.huge_page_tlb_size = jhuge_page_tlb_size; + options.encoding_type = static_cast( + jencoding_type); + options.full_scan_mode = jfull_scan_mode; + options.store_index_in_file = jstore_index_in_file; return reinterpret_cast(rocksdb::NewPlainTableFactory(options)); } /* * Class: org_rocksdb_BlockBasedTableConfig * Method: newTableFactoryHandle - * Signature: (ZJIJIIZI)J + * Signature: (ZJIJIIZIZZJIBBI)J */ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( JNIEnv* env, jobject jobj, jboolean no_block_cache, jlong block_cache_size, - jint num_shardbits, jlong 
block_size, jint block_size_deviation, + jint block_cache_num_shardbits, jlong block_size, jint block_size_deviation, jint block_restart_interval, jboolean whole_key_filtering, - jint bits_per_key) { + jlong jfilterPolicy, jboolean cache_index_and_filter_blocks, + jboolean hash_index_allow_collision, jlong block_cache_compressed_size, + jint block_cache_compressd_num_shard_bits, jbyte jchecksum_type, + jbyte jindex_type, jint jformat_version) { rocksdb::BlockBasedTableOptions options; options.no_block_cache = no_block_cache; if (!no_block_cache && block_cache_size > 0) { - if (num_shardbits > 0) { + if (block_cache_num_shardbits > 0) { options.block_cache = - rocksdb::NewLRUCache(block_cache_size, num_shardbits); + rocksdb::NewLRUCache(block_cache_size, block_cache_num_shardbits); } else { options.block_cache = rocksdb::NewLRUCache(block_cache_size); } @@ -53,8 +63,27 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( options.block_size_deviation = block_size_deviation; options.block_restart_interval = block_restart_interval; options.whole_key_filtering = whole_key_filtering; - if (bits_per_key > 0) { - options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(bits_per_key)); + if (jfilterPolicy > 0) { + std::shared_ptr *pFilterPolicy = + reinterpret_cast *>( + jfilterPolicy); + options.filter_policy = *pFilterPolicy; } + options.cache_index_and_filter_blocks = cache_index_and_filter_blocks; + options.hash_index_allow_collision = hash_index_allow_collision; + if (block_cache_compressed_size > 0) { + if (block_cache_compressd_num_shard_bits > 0) { + options.block_cache = + rocksdb::NewLRUCache(block_cache_compressed_size, + block_cache_compressd_num_shard_bits); + } else { + options.block_cache = rocksdb::NewLRUCache(block_cache_compressed_size); + } + } + options.checksum = static_cast(jchecksum_type); + options.index_type = static_cast< + rocksdb::BlockBasedTableOptions::IndexType>(jindex_type); + options.format_version = jformat_version; + 
return reinterpret_cast(rocksdb::NewBlockBasedTableFactory(options)); } diff --git a/java/rocksjni/transaction_log.cc b/java/rocksjni/transaction_log.cc new file mode 100644 index 000000000..1d3d7c100 --- /dev/null +++ b/java/rocksjni/transaction_log.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ and enables +// calling c++ rocksdb::Iterator methods from Java side. + +#include +#include +#include + +#include "include/org_rocksdb_TransactionLogIterator.h" +#include "rocksdb/transaction_log.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_TransactionLogIterator + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_TransactionLogIterator_disposeInternal( + JNIEnv* env, jobject jobj, jlong handle) { + delete reinterpret_cast(handle); +} + +/* + * Class: org_rocksdb_TransactionLogIterator + * Method: isValid + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_TransactionLogIterator_isValid( + JNIEnv* env, jobject jobj, jlong handle) { + return reinterpret_cast(handle)->Valid(); +} + +/* + * Class: org_rocksdb_TransactionLogIterator + * Method: next + * Signature: (J)V + */ +void Java_org_rocksdb_TransactionLogIterator_next( + JNIEnv* env, jobject jobj, jlong handle) { + reinterpret_cast(handle)->Next(); +} + +/* + * Class: org_rocksdb_TransactionLogIterator + * Method: status + * Signature: (J)V + */ +void Java_org_rocksdb_TransactionLogIterator_status( + JNIEnv* env, jobject jobj, jlong handle) { + rocksdb::Status s = reinterpret_cast< + rocksdb::TransactionLogIterator*>(handle)->status(); + if (!s.ok()) { + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_TransactionLogIterator 
+ * Method: getBatch + * Signature: (J)Lorg/rocksdb/TransactionLogIterator$BatchResult + */ +jobject Java_org_rocksdb_TransactionLogIterator_getBatch( + JNIEnv* env, jobject jobj, jlong handle) { + rocksdb::BatchResult batch_result = + reinterpret_cast(handle)->GetBatch(); + jclass jclazz = env->FindClass( + "org/rocksdb/TransactionLogIterator$BatchResult"); + assert(jclazz != nullptr); + jmethodID mid = env->GetMethodID( + jclazz, "", "(Lorg/rocksdb/TransactionLogIterator;JJ)V"); + assert(mid != nullptr); + return env->NewObject(jclazz, mid, jobj, + batch_result.sequence, batch_result.writeBatchPtr.release()); +} diff --git a/java/rocksjni/ttl.cc b/java/rocksjni/ttl.cc new file mode 100644 index 000000000..2992e930d --- /dev/null +++ b/java/rocksjni/ttl.cc @@ -0,0 +1,194 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ and enables +// calling c++ rocksdb::TtlDB methods. +// from Java side. + +#include +#include +#include +#include +#include + +#include "include/org_rocksdb_TtlDB.h" +#include "rocksdb/utilities/db_ttl.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_TtlDB + * Method: open + * Signature: (JLjava/lang/String;IZ)V + */ +void Java_org_rocksdb_TtlDB_open(JNIEnv* env, + jobject jttldb, jlong joptions_handle, jstring jdb_path, + jint jttl, jboolean jread_only) { + auto* opt = reinterpret_cast(joptions_handle); + rocksdb::DBWithTTL* db = nullptr; + const char* db_path = env->GetStringUTFChars(jdb_path, 0); + rocksdb::Status s = rocksdb::DBWithTTL::Open(*opt, db_path, &db, + jttl, jread_only); + env->ReleaseStringUTFChars(jdb_path, db_path); + + // as TTLDB extends RocksDB on the java side, we can reuse + // the RocksDB portal here. 
+ if (s.ok()) { + rocksdb::RocksDBJni::setHandle(env, jttldb, db); + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_TtlDB + * Method: openCF + * Signature: (JLjava/lang/String;Ljava/util/List; + * ILjava/util/List;Z)Ljava/util/List; + */ +jobject + Java_org_rocksdb_TtlDB_openCF( + JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path, + jobject jcfdesc_list, jint jcfdesc_count, jobject jttl_list, + jboolean jread_only) { + auto* opt = reinterpret_cast(jopt_handle); + rocksdb::DBWithTTL* db = nullptr; + const char* db_path = env->GetStringUTFChars(jdb_path, 0); + + std::vector cfnames_to_free; + // the zero-terminated version of cfnames_to_free. + std::vector c_cfnames_to_free; + std::vector jcfnames_for_free; + + std::vector column_families; + std::vector ttl_values; + std::vector handles; + // get iterator for ColumnFamilyDescriptors + jobject iteratorObj = env->CallObjectMethod( + jcfdesc_list, rocksdb::ListJni::getIteratorMethod(env)); + + // iterate over ColumnFamilyDescriptors + while (env->CallBooleanMethod( + iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { + // get ColumnFamilyDescriptor + jobject jcf_descriptor = env->CallObjectMethod(iteratorObj, + rocksdb::ListJni::getNextMethod(env)); + // get ColumnFamilyName + jbyteArray byteArray = static_cast(env->CallObjectMethod( + jcf_descriptor, + rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( + env))); + // get CF Options + jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor, + rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( + env)); + rocksdb::ColumnFamilyOptions* cfOptions = + rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); + + jbyte* cfname = env->GetByteArrayElements(byteArray, 0); + const int len = env->GetArrayLength(byteArray) + 1; + char* c_cfname = new char[len]; + memcpy(c_cfname, cfname, len - 1); + c_cfname[len - 1] = 0; + + // free allocated cfnames after call to open + 
cfnames_to_free.push_back(cfname); + c_cfnames_to_free.push_back(c_cfname); + jcfnames_for_free.push_back(byteArray); + column_families.push_back(rocksdb::ColumnFamilyDescriptor( + c_cfname, *cfOptions)); + } + // get iterator for TTL values + iteratorObj = env->CallObjectMethod( + jttl_list, rocksdb::ListJni::getIteratorMethod(env)); + // iterate over TTL values + while (env->CallBooleanMethod( + iteratorObj, rocksdb::ListJni::getHasNextMethod(env)) == JNI_TRUE) { + // get TTL object + jobject jttl_object = env->CallObjectMethod(iteratorObj, + rocksdb::ListJni::getNextMethod(env)); + // get Integer value + jclass jIntClazz = env->FindClass("java/lang/Integer"); + jmethodID getVal = env->GetMethodID(jIntClazz, "intValue", "()I"); + ttl_values.push_back(env->CallIntMethod(jttl_object, getVal)); + } + rocksdb::Status s = rocksdb::DBWithTTL::Open(*opt, db_path, column_families, + &handles, &db, ttl_values, jread_only); + + env->ReleaseStringUTFChars(jdb_path, db_path); + // free jbyte allocations + for (std::vector::size_type i = 0; + i != cfnames_to_free.size(); i++) { + // free cfnames + env->ReleaseByteArrayElements(jcfnames_for_free[i], cfnames_to_free[i], 0); + // free c_cfnames + delete[] c_cfnames_to_free[i]; + } + + // check if open operation was successful + if (s.ok()) { + rocksdb::RocksDBJni::setHandle(env, jdb, db); + jclass jListClazz = env->FindClass("java/util/ArrayList"); + jmethodID midList = rocksdb::ListJni::getArrayListConstructorMethodId( + env, jListClazz); + jobject jcfhandle_list = env->NewObject(jListClazz, + midList, handles.size()); + // insert in java list + for (std::vector::size_type i = 0; + i != handles.size(); i++) { + // jlong must be converted to Long due to collections restrictions + jclass jLongClazz = env->FindClass("java/lang/Long"); + jmethodID midLong = env->GetMethodID(jLongClazz, "", "(J)V"); + jobject obj = env->NewObject(jLongClazz, midLong, + reinterpret_cast(handles[i])); + env->CallBooleanMethod(jcfhandle_list, + 
rocksdb::ListJni::getListAddMethodId(env), obj); + } + + return jcfhandle_list; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return nullptr; +} + +/* + * Class: org_rocksdb_TtlDB + * Method: createColumnFamilyWithTtl + * Signature: (JLorg/rocksdb/ColumnFamilyDescriptor;I)J; + */ +jlong Java_org_rocksdb_TtlDB_createColumnFamilyWithTtl( + JNIEnv* env, jobject jobj, jlong jdb_handle, + jobject jcf_descriptor, jint jttl) { + rocksdb::ColumnFamilyHandle* handle; + auto* db_handle = reinterpret_cast(jdb_handle); + + // get ColumnFamilyName + jbyteArray byteArray = static_cast(env->CallObjectMethod( + jcf_descriptor, + rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyNameMethod( + env))); + // get CF Options + jobject jcf_opt_obj = env->CallObjectMethod(jcf_descriptor, + rocksdb::ColumnFamilyDescriptorJni::getColumnFamilyOptionsMethod( + env)); + rocksdb::ColumnFamilyOptions* cfOptions = + rocksdb::ColumnFamilyOptionsJni::getHandle(env, jcf_opt_obj); + + jbyte* cfname = env->GetByteArrayElements(byteArray, 0); + const int len = env->GetArrayLength(byteArray) + 1; + char* c_cfname = new char[len]; + memcpy(c_cfname, cfname, len - 1); + c_cfname[len - 1] = 0; + + rocksdb::Status s = db_handle->CreateColumnFamilyWithTtl( + *cfOptions, c_cfname, &handle, jttl); + env->ReleaseByteArrayElements(byteArray, cfname, 0); + delete[] c_cfname; + + if (s.ok()) { + return reinterpret_cast(handle); + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); + return 0; +} diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index e8b2456ee..aa0c2309a 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -8,16 +8,20 @@ #include #include "include/org_rocksdb_WriteBatch.h" -#include "include/org_rocksdb_WriteBatchInternal.h" -#include "include/org_rocksdb_WriteBatchTest.h" +#include "include/org_rocksdb_WriteBatch_Handler.h" #include "rocksjni/portal.h" +#include "rocksjni/writebatchhandlerjnicallback.h" #include "rocksdb/db.h" +#include 
"rocksdb/immutable_options.h" #include "db/memtable.h" #include "rocksdb/write_batch.h" +#include "rocksdb/status.h" #include "db/write_batch_internal.h" +#include "db/writebuffer.h" #include "rocksdb/env.h" #include "rocksdb/memtablerep.h" #include "util/logging.h" +#include "util/scoped_arena_iterator.h" #include "util/testharness.h" /* @@ -35,10 +39,10 @@ void Java_org_rocksdb_WriteBatch_newWriteBatch( /* * Class: org_rocksdb_WriteBatch - * Method: count + * Method: count0 * Signature: ()I */ -jint Java_org_rocksdb_WriteBatch_count(JNIEnv* env, jobject jobj) { +jint Java_org_rocksdb_WriteBatch_count0(JNIEnv* env, jobject jobj) { rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); assert(wb != nullptr); @@ -47,10 +51,10 @@ jint Java_org_rocksdb_WriteBatch_count(JNIEnv* env, jobject jobj) { /* * Class: org_rocksdb_WriteBatch - * Method: clear + * Method: clear0 * Signature: ()V */ -void Java_org_rocksdb_WriteBatch_clear(JNIEnv* env, jobject jobj) { +void Java_org_rocksdb_WriteBatch_clear0(JNIEnv* env, jobject jobj) { rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); assert(wb != nullptr); @@ -62,20 +66,37 @@ void Java_org_rocksdb_WriteBatch_clear(JNIEnv* env, jobject jobj) { * Method: put * Signature: ([BI[BI)V */ -void Java_org_rocksdb_WriteBatch_put( +void Java_org_rocksdb_WriteBatch_put___3BI_3BI( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { - rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + jbyteArray jentry_value, jint jentry_value_len) { + auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); assert(wb != nullptr); + auto put = [&wb] (rocksdb::Slice key, rocksdb::Slice value) { + wb->Put(key, value); + }; + rocksdb::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, jentry_value, + jentry_value_len); +} - jbyte* key = env->GetByteArrayElements(jkey, nullptr); - jbyte* value = env->GetByteArrayElements(jvalue, nullptr); - rocksdb::Slice 
key_slice(reinterpret_cast(key), jkey_len); - rocksdb::Slice value_slice(reinterpret_cast(value), jvalue_len); - wb->Put(key_slice, value_slice); - env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); - env->ReleaseByteArrayElements(jvalue, value, JNI_ABORT); +/* + * Class: org_rocksdb_WriteBatch + * Method: put + * Signature: ([BI[BIJ)V + */ +void Java_org_rocksdb_WriteBatch_put___3BI_3BIJ( + JNIEnv* env, jobject jobj, + jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { + auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(wb != nullptr); + auto* cf_handle = reinterpret_cast(jcf_handle); + assert(cf_handle != nullptr); + auto put = [&wb, &cf_handle] (rocksdb::Slice key, rocksdb::Slice value) { + wb->Put(cf_handle, key, value); + }; + rocksdb::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, jentry_value, + jentry_value_len); } /* @@ -83,20 +104,37 @@ void Java_org_rocksdb_WriteBatch_put( * Method: merge * Signature: ([BI[BI)V */ -JNIEXPORT void JNICALL Java_org_rocksdb_WriteBatch_merge( +void Java_org_rocksdb_WriteBatch_merge___3BI_3BI( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, - jbyteArray jvalue, jint jvalue_len) { - rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + jbyteArray jentry_value, jint jentry_value_len) { + auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); assert(wb != nullptr); + auto merge = [&wb] (rocksdb::Slice key, rocksdb::Slice value) { + wb->Merge(key, value); + }; + rocksdb::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, jentry_value, + jentry_value_len); +} - jbyte* key = env->GetByteArrayElements(jkey, nullptr); - jbyte* value = env->GetByteArrayElements(jvalue, nullptr); - rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); - rocksdb::Slice value_slice(reinterpret_cast(value), jvalue_len); - wb->Merge(key_slice, value_slice); - env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); - env->ReleaseByteArrayElements(jvalue, 
value, JNI_ABORT); +/* + * Class: org_rocksdb_WriteBatch + * Method: merge + * Signature: ([BI[BIJ)V + */ +void Java_org_rocksdb_WriteBatch_merge___3BI_3BIJ( + JNIEnv* env, jobject jobj, + jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { + auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(wb != nullptr); + auto* cf_handle = reinterpret_cast(jcf_handle); + assert(cf_handle != nullptr); + auto merge = [&wb, &cf_handle] (rocksdb::Slice key, rocksdb::Slice value) { + wb->Merge(cf_handle, key, value); + }; + rocksdb::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, jentry_value, + jentry_value_len); } /* @@ -104,16 +142,33 @@ JNIEXPORT void JNICALL Java_org_rocksdb_WriteBatch_merge( * Method: remove * Signature: ([BI)V */ -JNIEXPORT void JNICALL Java_org_rocksdb_WriteBatch_remove( +void Java_org_rocksdb_WriteBatch_remove___3BI( JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len) { - rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); assert(wb != nullptr); + auto remove = [&wb] (rocksdb::Slice key) { + wb->Delete(key); + }; + rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); +} - jbyte* key = env->GetByteArrayElements(jkey, nullptr); - rocksdb::Slice key_slice(reinterpret_cast(key), jkey_len); - wb->Delete(key_slice); - env->ReleaseByteArrayElements(jkey, key, JNI_ABORT); +/* + * Class: org_rocksdb_WriteBatch + * Method: remove + * Signature: ([BIJ)V + */ +void Java_org_rocksdb_WriteBatch_remove___3BIJ( + JNIEnv* env, jobject jobj, + jbyteArray jkey, jint jkey_len, jlong jcf_handle) { + auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(wb != nullptr); + auto* cf_handle = reinterpret_cast(jcf_handle); + assert(cf_handle != nullptr); + auto remove = [&wb, &cf_handle] (rocksdb::Slice key) { + wb->Delete(cf_handle, key); + }; + rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); } /* @@ 
-123,139 +178,61 @@ JNIEXPORT void JNICALL Java_org_rocksdb_WriteBatch_remove( */ void Java_org_rocksdb_WriteBatch_putLogData( JNIEnv* env, jobject jobj, jbyteArray jblob, jint jblob_len) { - rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + auto* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); assert(wb != nullptr); - - jbyte* blob = env->GetByteArrayElements(jblob, nullptr); - rocksdb::Slice blob_slice(reinterpret_cast(blob), jblob_len); - wb->PutLogData(blob_slice); - env->ReleaseByteArrayElements(jblob, blob, JNI_ABORT); + auto putLogData = [&wb] (rocksdb::Slice blob) { + wb->PutLogData(blob); + }; + rocksdb::JniUtil::k_op(putLogData, env, jobj, jblob, jblob_len); } /* * Class: org_rocksdb_WriteBatch - * Method: disposeInternal + * Method: iterate * Signature: (J)V */ -void Java_org_rocksdb_WriteBatch_disposeInternal( - JNIEnv* env, jobject jobj, jlong handle) { - delete reinterpret_cast(handle); -} - -/* - * Class: org_rocksdb_WriteBatchInternal - * Method: setSequence - * Signature: (Lorg/rocksdb/WriteBatch;J)V - */ -void Java_org_rocksdb_WriteBatchInternal_setSequence( - JNIEnv* env, jclass jclazz, jobject jobj, jlong jsn) { +void Java_org_rocksdb_WriteBatch_iterate( + JNIEnv* env , jobject jobj, jlong handlerHandle) { rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); assert(wb != nullptr); - rocksdb::WriteBatchInternal::SetSequence( - wb, static_cast(jsn)); + rocksdb::Status s = wb->Iterate( + reinterpret_cast(handlerHandle)); + + if (s.ok()) { + return; + } + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); } /* - * Class: org_rocksdb_WriteBatchInternal - * Method: sequence - * Signature: (Lorg/rocksdb/WriteBatch;)J + * Class: org_rocksdb_WriteBatch + * Method: disposeInternal + * Signature: (J)V */ -jlong Java_org_rocksdb_WriteBatchInternal_sequence( - JNIEnv* env, jclass jclazz, jobject jobj) { - rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); - assert(wb != nullptr); - - return 
static_cast(rocksdb::WriteBatchInternal::Sequence(wb)); +void Java_org_rocksdb_WriteBatch_disposeInternal( + JNIEnv* env, jobject jobj, jlong handle) { + delete reinterpret_cast(handle); } /* - * Class: org_rocksdb_WriteBatchInternal - * Method: append - * Signature: (Lorg/rocksdb/WriteBatch;Lorg/rocksdb/WriteBatch;)V + * Class: org_rocksdb_WriteBatch_Handler + * Method: createNewHandler0 + * Signature: ()V */ -void Java_org_rocksdb_WriteBatchInternal_append( - JNIEnv* env, jclass jclazz, jobject jwb1, jobject jwb2) { - rocksdb::WriteBatch* wb1 = rocksdb::WriteBatchJni::getHandle(env, jwb1); - assert(wb1 != nullptr); - rocksdb::WriteBatch* wb2 = rocksdb::WriteBatchJni::getHandle(env, jwb2); - assert(wb2 != nullptr); - - rocksdb::WriteBatchInternal::Append(wb1, wb2); +void Java_org_rocksdb_WriteBatch_00024Handler_createNewHandler0( + JNIEnv* env, jobject jobj) { + const rocksdb::WriteBatchHandlerJniCallback* h = + new rocksdb::WriteBatchHandlerJniCallback(env, jobj); + rocksdb::WriteBatchHandlerJni::setHandle(env, jobj, h); } /* - * Class: org_rocksdb_WriteBatchTest - * Method: getContents - * Signature: (Lorg/rocksdb/WriteBatch;)[B + * Class: org_rocksdb_WriteBatch_Handler + * Method: disposeInternal + * Signature: (J)V */ -jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( - JNIEnv* env, jclass jclazz, jobject jobj) { - rocksdb::WriteBatch* b = rocksdb::WriteBatchJni::getHandle(env, jobj); - assert(b != nullptr); - - // todo: Currently the following code is directly copied from - // db/write_bench_test.cc. It could be implemented in java once - // all the necessary components can be accessed via jni api. 
- - rocksdb::InternalKeyComparator cmp(rocksdb::BytewiseComparator()); - auto factory = std::make_shared(); - rocksdb::Options options; - options.memtable_factory = factory; - rocksdb::MemTable* mem = new rocksdb::MemTable(cmp, options); - mem->Ref(); - std::string state; - rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); - rocksdb::Status s = - rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default); - int count = 0; - rocksdb::Iterator* iter = mem->NewIterator(rocksdb::ReadOptions()); - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - rocksdb::ParsedInternalKey ikey; - memset(reinterpret_cast(&ikey), 0, sizeof(ikey)); - ASSERT_TRUE(rocksdb::ParseInternalKey(iter->key(), &ikey)); - switch (ikey.type) { - case rocksdb::kTypeValue: - state.append("Put("); - state.append(ikey.user_key.ToString()); - state.append(", "); - state.append(iter->value().ToString()); - state.append(")"); - count++; - break; - case rocksdb::kTypeMerge: - state.append("Merge("); - state.append(ikey.user_key.ToString()); - state.append(", "); - state.append(iter->value().ToString()); - state.append(")"); - count++; - break; - case rocksdb::kTypeDeletion: - state.append("Delete("); - state.append(ikey.user_key.ToString()); - state.append(")"); - count++; - break; - default: - assert(false); - break; - } - state.append("@"); - state.append(rocksdb::NumberToString(ikey.sequence)); - } - delete iter; - if (!s.ok()) { - state.append(s.ToString()); - } else if (count != rocksdb::WriteBatchInternal::Count(b)) { - state.append("CountMismatch()"); - } - delete mem->Unref(); - - jbyteArray jstate = env->NewByteArray(state.size()); - env->SetByteArrayRegion( - jstate, 0, state.size(), - reinterpret_cast(state.c_str())); - - return jstate; +void Java_org_rocksdb_WriteBatch_00024Handler_disposeInternal( + JNIEnv* env, jobject jobj, jlong handle) { + delete reinterpret_cast(handle); } diff --git a/java/rocksjni/write_batch_test.cc b/java/rocksjni/write_batch_test.cc new 
file mode 100644 index 000000000..d78178211 --- /dev/null +++ b/java/rocksjni/write_batch_test.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ and enables +// calling c++ rocksdb::WriteBatch methods testing from Java side. +#include + +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "db/writebuffer.h" +#include "include/org_rocksdb_WriteBatch.h" +#include "include/org_rocksdb_WriteBatch_Handler.h" +#include "include/org_rocksdb_WriteBatchTest.h" +#include "include/org_rocksdb_WriteBatchTestInternalHelper.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/immutable_options.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/status.h" +#include "rocksdb/write_batch.h" +#include "rocksjni/portal.h" +#include "util/logging.h" +#include "util/scoped_arena_iterator.h" +#include "util/testharness.h" + +/* + * Class: org_rocksdb_WriteBatchTest + * Method: getContents + * Signature: (Lorg/rocksdb/WriteBatch;)[B + */ +jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( + JNIEnv* env, jclass jclazz, jobject jobj) { + rocksdb::WriteBatch* b = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(b != nullptr); + + // todo: Currently the following code is directly copied from + // db/write_bench_test.cc. It could be implemented in java once + // all the necessary components can be accessed via jni api. 
+ + rocksdb::InternalKeyComparator cmp(rocksdb::BytewiseComparator()); + auto factory = std::make_shared(); + rocksdb::Options options; + rocksdb::WriteBuffer wb(options.db_write_buffer_size); + options.memtable_factory = factory; + rocksdb::MemTable* mem = new rocksdb::MemTable( + cmp, rocksdb::ImmutableCFOptions(options), + rocksdb::MutableCFOptions(options, rocksdb::ImmutableCFOptions(options)), + &wb); + mem->Ref(); + std::string state; + rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem); + rocksdb::Status s = + rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default); + int count = 0; + rocksdb::Arena arena; + rocksdb::ScopedArenaIterator iter(mem->NewIterator( + rocksdb::ReadOptions(), &arena)); + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + rocksdb::ParsedInternalKey ikey; + memset(reinterpret_cast(&ikey), 0, sizeof(ikey)); + ASSERT_TRUE(rocksdb::ParseInternalKey(iter->key(), &ikey)); + switch (ikey.type) { + case rocksdb::kTypeValue: + state.append("Put("); + state.append(ikey.user_key.ToString()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + count++; + break; + case rocksdb::kTypeMerge: + state.append("Merge("); + state.append(ikey.user_key.ToString()); + state.append(", "); + state.append(iter->value().ToString()); + state.append(")"); + count++; + break; + case rocksdb::kTypeDeletion: + state.append("Delete("); + state.append(ikey.user_key.ToString()); + state.append(")"); + count++; + break; + default: + assert(false); + break; + } + state.append("@"); + state.append(rocksdb::NumberToString(ikey.sequence)); + } + if (!s.ok()) { + state.append(s.ToString()); + } else if (count != rocksdb::WriteBatchInternal::Count(b)) { + state.append("CountMismatch()"); + } + delete mem->Unref(); + + jbyteArray jstate = env->NewByteArray(static_cast(state.size())); + env->SetByteArrayRegion(jstate, 0, static_cast(state.size()), + reinterpret_cast(state.c_str())); + + return jstate; +} + +/* + * Class: 
org_rocksdb_WriteBatchTestInternalHelper + * Method: setSequence + * Signature: (Lorg/rocksdb/WriteBatch;J)V + */ +void Java_org_rocksdb_WriteBatchTestInternalHelper_setSequence( + JNIEnv* env, jclass jclazz, jobject jobj, jlong jsn) { + rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(wb != nullptr); + + rocksdb::WriteBatchInternal::SetSequence( + wb, static_cast(jsn)); +} + +/* + * Class: org_rocksdb_WriteBatchTestInternalHelper + * Method: sequence + * Signature: (Lorg/rocksdb/WriteBatch;)J + */ +jlong Java_org_rocksdb_WriteBatchTestInternalHelper_sequence( + JNIEnv* env, jclass jclazz, jobject jobj) { + rocksdb::WriteBatch* wb = rocksdb::WriteBatchJni::getHandle(env, jobj); + assert(wb != nullptr); + + return static_cast(rocksdb::WriteBatchInternal::Sequence(wb)); +} + +/* + * Class: org_rocksdb_WriteBatchTestInternalHelper + * Method: append + * Signature: (Lorg/rocksdb/WriteBatch;Lorg/rocksdb/WriteBatch;)V + */ +void Java_org_rocksdb_WriteBatchTestInternalHelper_append( + JNIEnv* env, jclass jclazz, jobject jwb1, jobject jwb2) { + rocksdb::WriteBatch* wb1 = rocksdb::WriteBatchJni::getHandle(env, jwb1); + assert(wb1 != nullptr); + rocksdb::WriteBatch* wb2 = rocksdb::WriteBatchJni::getHandle(env, jwb2); + assert(wb2 != nullptr); + + rocksdb::WriteBatchInternal::Append(wb1, wb2); +} diff --git a/java/rocksjni/write_batch_with_index.cc b/java/rocksjni/write_batch_with_index.cc new file mode 100644 index 000000000..92f2ec068 --- /dev/null +++ b/java/rocksjni/write_batch_with_index.cc @@ -0,0 +1,378 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the "bridge" between Java and C++ and enables +// calling c++ rocksdb::WriteBatchWithIndex methods from Java side. 
+ +#include "include/org_rocksdb_WBWIRocksIterator.h" +#include "include/org_rocksdb_WriteBatchWithIndex.h" +#include "rocksdb/comparator.h" +#include "rocksdb/utilities/write_batch_with_index.h" +#include "rocksjni/portal.h" + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: newWriteBatchWithIndex + * Signature: ()V + */ +void Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__( + JNIEnv* env, jobject jobj) { + rocksdb::WriteBatchWithIndex* wbwi = new rocksdb::WriteBatchWithIndex(); + rocksdb::WriteBatchWithIndexJni::setHandle(env, jobj, wbwi); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: newWriteBatchWithIndex + * Signature: (Z)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__Z( + JNIEnv* env, jobject jobj, jboolean joverwrite_key) { + rocksdb::WriteBatchWithIndex* wbwi = + new rocksdb::WriteBatchWithIndex(rocksdb::BytewiseComparator(), 0, + static_cast(joverwrite_key)); + rocksdb::WriteBatchWithIndexJni::setHandle(env, jobj, wbwi); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: newWriteBatchWithIndex + * Signature: (JIZ)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_newWriteBatchWithIndex__JIZ( + JNIEnv* env, jobject jobj, jlong jfallback_index_comparator_handle, + jint jreserved_bytes, jboolean joverwrite_key) { + rocksdb::WriteBatchWithIndex* wbwi = + new rocksdb::WriteBatchWithIndex( + reinterpret_cast(jfallback_index_comparator_handle), + static_cast(jreserved_bytes), static_cast(joverwrite_key)); + rocksdb::WriteBatchWithIndexJni::setHandle(env, jobj, wbwi); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: count + * Signature: ()I + */ +jint Java_org_rocksdb_WriteBatchWithIndex_count0( + JNIEnv* env, jobject jobj) { + rocksdb::WriteBatchWithIndex* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + assert(wbwi != nullptr); + + return static_cast(wbwi->GetWriteBatch()->Count()); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: put + 
* Signature: ([BI[BI)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_put___3BI_3BI( + JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len) { + auto* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + assert(wbwi != nullptr); + auto put = [&wbwi] (rocksdb::Slice key, rocksdb::Slice value) { + wbwi->Put(key, value); + }; + rocksdb::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, jentry_value, + jentry_value_len); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: put + * Signature: ([BI[BIJ)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_put___3BI_3BIJ( + JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len, jlong jcf_handle) { + auto* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + assert(wbwi != nullptr); + auto* cf_handle = reinterpret_cast(jcf_handle); + assert(cf_handle != nullptr); + auto put = [&wbwi, &cf_handle] (rocksdb::Slice key, rocksdb::Slice value) { + wbwi->Put(cf_handle, key, value); + }; + rocksdb::JniUtil::kv_op(put, env, jobj, jkey, jkey_len, jentry_value, + jentry_value_len); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: merge + * Signature: ([BI[BI)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_merge___3BI_3BI( + JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint jentry_value_len) { + auto* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + assert(wbwi != nullptr); + auto merge = [&wbwi] (rocksdb::Slice key, rocksdb::Slice value) { + wbwi->Merge(key, value); + }; + rocksdb::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, jentry_value, + jentry_value_len); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: merge + * Signature: ([BI[BIJ)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_merge___3BI_3BIJ( + JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len, + jbyteArray jentry_value, jint 
jentry_value_len, jlong jcf_handle) { + auto* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + assert(wbwi != nullptr); + auto* cf_handle = reinterpret_cast(jcf_handle); + assert(cf_handle != nullptr); + auto merge = [&wbwi, &cf_handle] (rocksdb::Slice key, rocksdb::Slice value) { + wbwi->Merge(cf_handle, key, value); + }; + rocksdb::JniUtil::kv_op(merge, env, jobj, jkey, jkey_len, jentry_value, + jentry_value_len); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: remove + * Signature: ([BI)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_remove___3BI( + JNIEnv* env, jobject jobj, jbyteArray jkey, jint jkey_len) { + auto* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + assert(wbwi != nullptr); + auto remove = [&wbwi] (rocksdb::Slice key) { + wbwi->Delete(key); + }; + rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: remove + * Signature: ([BIJ)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_remove___3BIJ( + JNIEnv* env, jobject jobj, + jbyteArray jkey, jint jkey_len, jlong jcf_handle) { + auto* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + assert(wbwi != nullptr); + auto* cf_handle = reinterpret_cast(jcf_handle); + assert(cf_handle != nullptr); + auto remove = [&wbwi, &cf_handle] (rocksdb::Slice key) { + wbwi->Delete(cf_handle, key); + }; + rocksdb::JniUtil::k_op(remove, env, jobj, jkey, jkey_len); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: putLogData + * Signature: ([BI)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_putLogData( + JNIEnv* env, jobject jobj, jbyteArray jblob, jint jblob_len) { + auto* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + assert(wbwi != nullptr); + auto putLogData = [&wbwi] (rocksdb::Slice blob) { + wbwi->PutLogData(blob); + }; + rocksdb::JniUtil::k_op(putLogData, env, jobj, jblob, jblob_len); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * 
Method: clear + * Signature: ()V + */ +void Java_org_rocksdb_WriteBatchWithIndex_clear0( + JNIEnv* env, jobject jobj) { + rocksdb::WriteBatchWithIndex* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + assert(wbwi != nullptr); + + wbwi->GetWriteBatch()->Clear(); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: iterator0 + * Signature: ()J + */ +jlong Java_org_rocksdb_WriteBatchWithIndex_iterator0( + JNIEnv* env, jobject jobj) { + rocksdb::WriteBatchWithIndex* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + rocksdb::WBWIIterator* wbwi_iterator = wbwi->NewIterator(); + return reinterpret_cast(wbwi_iterator); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: iterator1 + * Signature: (J)J + */ +jlong Java_org_rocksdb_WriteBatchWithIndex_iterator1( + JNIEnv* env, jobject jobj, jlong jcf_handle) { + rocksdb::WriteBatchWithIndex* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + auto* cf_handle = reinterpret_cast(jcf_handle); + rocksdb::WBWIIterator* wbwi_iterator = wbwi->NewIterator(cf_handle); + return reinterpret_cast(wbwi_iterator); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: iteratorWithBase + * Signature: (JJ)J + */ +jlong Java_org_rocksdb_WriteBatchWithIndex_iteratorWithBase( + JNIEnv* env, jobject jobj, jlong jcf_handle, jlong jbi_handle) { + rocksdb::WriteBatchWithIndex* wbwi = + rocksdb::WriteBatchWithIndexJni::getHandle(env, jobj); + auto* cf_handle = reinterpret_cast(jcf_handle); + auto* base_iterator = reinterpret_cast(jbi_handle); + auto* iterator = wbwi->NewIteratorWithBase(cf_handle, base_iterator); + return reinterpret_cast(iterator); +} + +/* + * Class: org_rocksdb_WriteBatchWithIndex + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_WriteBatchWithIndex_disposeInternal( + JNIEnv* env, jobject jobj, jlong handle) { + auto* wbwi = reinterpret_cast(handle); + delete wbwi; +} + +/* WBWIRocksIterator below */ + +/* + * Class: 
org_rocksdb_WBWIRocksIterator + * Method: disposeInternal + * Signature: (J)V + */ +void Java_org_rocksdb_WBWIRocksIterator_disposeInternal( + JNIEnv* env, jobject jobj, jlong handle) { + auto* it = reinterpret_cast(handle); + delete it; +} + +/* + * Class: org_rocksdb_WBWIRocksIterator + * Method: isValid0 + * Signature: (J)Z + */ +jboolean Java_org_rocksdb_WBWIRocksIterator_isValid0( + JNIEnv* env, jobject jobj, jlong handle) { + return reinterpret_cast(handle)->Valid(); +} + +/* + * Class: org_rocksdb_WBWIRocksIterator + * Method: seekToFirst0 + * Signature: (J)V + */ +void Java_org_rocksdb_WBWIRocksIterator_seekToFirst0( + JNIEnv* env, jobject jobj, jlong handle) { + reinterpret_cast(handle)->SeekToFirst(); +} + +/* + * Class: org_rocksdb_WBWIRocksIterator + * Method: seekToLast0 + * Signature: (J)V + */ +void Java_org_rocksdb_WBWIRocksIterator_seekToLast0( + JNIEnv* env, jobject jobj, jlong handle) { + reinterpret_cast(handle)->SeekToLast(); +} + +/* + * Class: org_rocksdb_WBWIRocksIterator + * Method: next0 + * Signature: (J)V + */ +void Java_org_rocksdb_WBWIRocksIterator_next0( + JNIEnv* env, jobject jobj, jlong handle) { + reinterpret_cast(handle)->Next(); +} + +/* + * Class: org_rocksdb_WBWIRocksIterator + * Method: prev0 + * Signature: (J)V + */ +void Java_org_rocksdb_WBWIRocksIterator_prev0( + JNIEnv* env, jobject jobj, jlong handle) { + reinterpret_cast(handle)->Prev(); +} + +/* + * Class: org_rocksdb_WBWIRocksIterator + * Method: seek0 + * Signature: (J[BI)V + */ +void Java_org_rocksdb_WBWIRocksIterator_seek0( + JNIEnv* env, jobject jobj, jlong handle, jbyteArray jtarget, + jint jtarget_len) { + auto* it = reinterpret_cast(handle); + jbyte* target = env->GetByteArrayElements(jtarget, 0); + rocksdb::Slice target_slice( + reinterpret_cast(target), jtarget_len); + + it->Seek(target_slice); + + env->ReleaseByteArrayElements(jtarget, target, JNI_ABORT); +} + +/* + * Class: org_rocksdb_WBWIRocksIterator + * Method: status0 + * Signature: (J)V + */ +void 
Java_org_rocksdb_WBWIRocksIterator_status0( + JNIEnv* env, jobject jobj, jlong handle) { + auto* it = reinterpret_cast(handle); + rocksdb::Status s = it->status(); + + if (s.ok()) { + return; + } + + rocksdb::RocksDBExceptionJni::ThrowNew(env, s); +} + +/* + * Class: org_rocksdb_WBWIRocksIterator + * Method: entry1 + * Signature: (JLorg/rocksdb/WBWIRocksIterator/WriteEntry;)V + */ +void Java_org_rocksdb_WBWIRocksIterator_entry1( + JNIEnv* env, jobject jobj, jlong handle, jobject jwrite_entry) { + auto* it = reinterpret_cast(handle); + const rocksdb::WriteEntry& we = it->Entry(); + jobject jwe = rocksdb::WBWIRocksIteratorJni::getWriteEntry(env, jobj); + rocksdb::WriteEntryJni::setWriteType(env, jwe, we.type); + rocksdb::WriteEntryJni::setKey(env, jwe, &we.key); + if (we.type == rocksdb::kDeleteRecord || we.type == rocksdb::kLogDataRecord) { + // set native handle of value slice to null if no value available + rocksdb::WriteEntryJni::setValue(env, jwe, NULL); + } else { + rocksdb::WriteEntryJni::setValue(env, jwe, &we.value); + } +} diff --git a/java/rocksjni/writebatchhandlerjnicallback.cc b/java/rocksjni/writebatchhandlerjnicallback.cc new file mode 100644 index 000000000..b12e35544 --- /dev/null +++ b/java/rocksjni/writebatchhandlerjnicallback.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the callback "bridge" between Java and C++ for +// rocksdb::Comparator. 
+ +#include "rocksjni/writebatchhandlerjnicallback.h" +#include "rocksjni/portal.h" + +namespace rocksdb { +WriteBatchHandlerJniCallback::WriteBatchHandlerJniCallback( + JNIEnv* env, jobject jWriteBatchHandler) + : m_env(env) { + + // Note: we want to access the Java WriteBatchHandler instance + // across multiple method calls, so we create a global ref + m_jWriteBatchHandler = env->NewGlobalRef(jWriteBatchHandler); + + m_jPutMethodId = WriteBatchHandlerJni::getPutMethodId(env); + m_jMergeMethodId = WriteBatchHandlerJni::getMergeMethodId(env); + m_jDeleteMethodId = WriteBatchHandlerJni::getDeleteMethodId(env); + m_jLogDataMethodId = WriteBatchHandlerJni::getLogDataMethodId(env); + m_jContinueMethodId = WriteBatchHandlerJni::getContinueMethodId(env); +} + +void WriteBatchHandlerJniCallback::Put(const Slice& key, const Slice& value) { + const jbyteArray j_key = sliceToJArray(key); + const jbyteArray j_value = sliceToJArray(value); + + m_env->CallVoidMethod( + m_jWriteBatchHandler, + m_jPutMethodId, + j_key, + j_value); + + m_env->DeleteLocalRef(j_value); + m_env->DeleteLocalRef(j_key); +} + +void WriteBatchHandlerJniCallback::Merge(const Slice& key, const Slice& value) { + const jbyteArray j_key = sliceToJArray(key); + const jbyteArray j_value = sliceToJArray(value); + + m_env->CallVoidMethod( + m_jWriteBatchHandler, + m_jMergeMethodId, + j_key, + j_value); + + m_env->DeleteLocalRef(j_value); + m_env->DeleteLocalRef(j_key); +} + +void WriteBatchHandlerJniCallback::Delete(const Slice& key) { + const jbyteArray j_key = sliceToJArray(key); + + m_env->CallVoidMethod( + m_jWriteBatchHandler, + m_jDeleteMethodId, + j_key); + + m_env->DeleteLocalRef(j_key); +} + +void WriteBatchHandlerJniCallback::LogData(const Slice& blob) { + const jbyteArray j_blob = sliceToJArray(blob); + + m_env->CallVoidMethod( + m_jWriteBatchHandler, + m_jLogDataMethodId, + j_blob); + + m_env->DeleteLocalRef(j_blob); +} + +bool WriteBatchHandlerJniCallback::Continue() { + jboolean jContinue = 
m_env->CallBooleanMethod( + m_jWriteBatchHandler, + m_jContinueMethodId); + + return static_cast(jContinue == JNI_TRUE); +} + +/* + * Creates a Java Byte Array from the data in a Slice + * + * When calling this function + * you must remember to call env->DeleteLocalRef + * on the result after you have finished with it + */ +jbyteArray WriteBatchHandlerJniCallback::sliceToJArray(const Slice& s) { + jbyteArray ja = m_env->NewByteArray(static_cast(s.size())); + m_env->SetByteArrayRegion( + ja, 0, static_cast(s.size()), + reinterpret_cast(s.data())); + return ja; +} + +WriteBatchHandlerJniCallback::~WriteBatchHandlerJniCallback() { + m_env->DeleteGlobalRef(m_jWriteBatchHandler); +} +} // namespace rocksdb diff --git a/java/rocksjni/writebatchhandlerjnicallback.h b/java/rocksjni/writebatchhandlerjnicallback.h new file mode 100644 index 000000000..9a2a47e80 --- /dev/null +++ b/java/rocksjni/writebatchhandlerjnicallback.h @@ -0,0 +1,46 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file implements the callback "bridge" between Java and C++ for +// rocksdb::WriteBatch::Handler. + +#ifndef JAVA_ROCKSJNI_WRITEBATCHHANDLERJNICALLBACK_H_ +#define JAVA_ROCKSJNI_WRITEBATCHHANDLERJNICALLBACK_H_ + +#include +#include "rocksdb/write_batch.h" + +namespace rocksdb { +/** + * This class acts as a bridge between C++ + * and Java. The methods in this class will be + * called back from the RocksDB storage engine (C++) + * which calls the appropriate Java method. + * This enables Write Batch Handlers to be implemented in Java. 
+ */ +class WriteBatchHandlerJniCallback : public WriteBatch::Handler { + public: + WriteBatchHandlerJniCallback( + JNIEnv* env, jobject jWriteBackHandler); + ~WriteBatchHandlerJniCallback(); + void Put(const Slice& key, const Slice& value); + void Merge(const Slice& key, const Slice& value); + void Delete(const Slice& key); + void LogData(const Slice& blob); + bool Continue(); + + private: + JNIEnv* m_env; + jobject m_jWriteBatchHandler; + jbyteArray sliceToJArray(const Slice& s); + jmethodID m_jPutMethodId; + jmethodID m_jMergeMethodId; + jmethodID m_jDeleteMethodId; + jmethodID m_jLogDataMethodId; + jmethodID m_jContinueMethodId; +}; +} // namespace rocksdb + +#endif // JAVA_ROCKSJNI_WRITEBATCHHANDLERJNICALLBACK_H_ diff --git a/java/samples/src/main/java/RocksDBColumnFamilySample.java b/java/samples/src/main/java/RocksDBColumnFamilySample.java new file mode 100644 index 000000000..da9f4d28b --- /dev/null +++ b/java/samples/src/main/java/RocksDBColumnFamilySample.java @@ -0,0 +1,95 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +import org.rocksdb.*; + +import java.util.ArrayList; +import java.util.List; + +public class RocksDBColumnFamilySample { + static { + RocksDB.loadLibrary(); + } + + public static void main(String[] args) throws RocksDBException { + if (args.length < 1) { + System.out.println( + "usage: RocksDBColumnFamilySample db_path"); + return; + } + String db_path = args[0]; + + System.out.println("RocksDBColumnFamilySample"); + RocksDB db = null; + Options options = null; + ColumnFamilyHandle columnFamilyHandle = null; + WriteBatch wb = null; + try { + options = new Options().setCreateIfMissing(true); + db = RocksDB.open(options, db_path); + assert(db != null); + + // create column family + columnFamilyHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf".getBytes(), + new ColumnFamilyOptions())); + assert(columnFamilyHandle != null); + + } finally { + if (columnFamilyHandle != null) { + columnFamilyHandle.dispose(); + } + if (db != null) { + db.close(); + db = null; + } + if (options != null) { + options.dispose(); + } + } + + // open DB with two column families + List columnFamilyDescriptors = new ArrayList<>(); + // have to open default column family + columnFamilyDescriptors.add(new ColumnFamilyDescriptor( + RocksDB.DEFAULT_COLUMN_FAMILY, new ColumnFamilyOptions())); + // open the new one, too + columnFamilyDescriptors.add(new ColumnFamilyDescriptor( + "new_cf".getBytes(), new ColumnFamilyOptions())); + List columnFamilyHandles = new ArrayList<>(); + try { + db = RocksDB.open(new DBOptions(), db_path, + columnFamilyDescriptors, columnFamilyHandles); + assert(db != null); + + // put and get from non-default column family + db.put(columnFamilyHandles.get(0), new WriteOptions(), + "key".getBytes(), "value".getBytes()); + String value = new String(db.get(columnFamilyHandles.get(0), + "key".getBytes())); + + // atomic write + wb = new WriteBatch(); + wb.put(columnFamilyHandles.get(0), "key2".getBytes(), "value2".getBytes()); + 
wb.put(columnFamilyHandles.get(1), "key3".getBytes(), "value3".getBytes()); + wb.remove(columnFamilyHandles.get(0), "key".getBytes()); + db.write(new WriteOptions(), wb); + + // drop column family + db.dropColumnFamily(columnFamilyHandles.get(1)); + + } finally { + for (ColumnFamilyHandle handle : columnFamilyHandles){ + handle.dispose(); + } + if (db != null) { + db.close(); + } + if (wb != null) { + wb.dispose(); + } + } + } +} diff --git a/java/RocksDBSample.java b/java/samples/src/main/java/RocksDBSample.java similarity index 69% rename from java/RocksDBSample.java rename to java/samples/src/main/java/RocksDBSample.java index 72da4b5e8..84cf6404f 100644 --- a/java/RocksDBSample.java +++ b/java/samples/src/main/java/RocksDBSample.java @@ -35,13 +35,18 @@ public class RocksDBSample { assert(db == null); } - options.setCreateIfMissing(true) - .createStatistics() - .setWriteBufferSize(8 * SizeUnit.KB) - .setMaxWriteBufferNumber(3) - .setMaxBackgroundCompactions(10) - .setCompressionType(CompressionType.SNAPPY_COMPRESSION) - .setCompactionStyle(CompactionStyle.UNIVERSAL); + try { + options.setCreateIfMissing(true) + .createStatistics() + .setWriteBufferSize(8 * SizeUnit.KB) + .setMaxWriteBufferNumber(3) + .setMaxBackgroundCompactions(10) + .setCompressionType(CompressionType.SNAPPY_COMPRESSION) + .setCompactionStyle(CompactionStyle.UNIVERSAL); + } catch (RocksDBException e) { + assert(false); + } + Statistics stats = options.statisticsPtr(); assert(options.createIfMissing() == true); @@ -51,35 +56,60 @@ public class RocksDBSample { assert(options.compressionType() == CompressionType.SNAPPY_COMPRESSION); assert(options.compactionStyle() == CompactionStyle.UNIVERSAL); - assert(options.memTableFactoryName().equals("SkipListFactory")); - options.setMemTableConfig( - new HashSkipListMemTableConfig() - .setHeight(4) - .setBranchingFactor(4) - .setBucketCount(2000000)); - assert(options.memTableFactoryName().equals("HashSkipListRepFactory")); - - options.setMemTableConfig( 
- new HashLinkedListMemTableConfig() - .setBucketCount(100000)); - assert(options.memTableFactoryName().equals("HashLinkedListRepFactory")); - - options.setMemTableConfig( - new VectorMemTableConfig().setReservedSize(10000)); - assert(options.memTableFactoryName().equals("VectorRepFactory")); - - options.setMemTableConfig(new SkipListMemTableConfig()); - assert(options.memTableFactoryName().equals("SkipListFactory")); - - options.setTableFormatConfig(new PlainTableConfig()); - assert(options.tableFactoryName().equals("PlainTable")); + try { + assert(options.memTableFactoryName().equals("SkipListFactory")); + options.setMemTableConfig( + new HashSkipListMemTableConfig() + .setHeight(4) + .setBranchingFactor(4) + .setBucketCount(2000000)); + assert(options.memTableFactoryName().equals("HashSkipListRepFactory")); + + options.setMemTableConfig( + new HashLinkedListMemTableConfig() + .setBucketCount(100000)); + assert(options.memTableFactoryName().equals("HashLinkedListRepFactory")); + + options.setMemTableConfig( + new VectorMemTableConfig().setReservedSize(10000)); + assert(options.memTableFactoryName().equals("VectorRepFactory")); + + options.setMemTableConfig(new SkipListMemTableConfig()); + assert(options.memTableFactoryName().equals("SkipListFactory")); + + options.setTableFormatConfig(new PlainTableConfig()); + // Plain-Table requires mmap read + options.setAllowMmapReads(true); + assert(options.tableFactoryName().equals("PlainTable")); + + options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000, + 10000, 10)); + options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000)); + } catch (RocksDBException e) { + assert(false); + } + Filter bloomFilter = new BloomFilter(10); BlockBasedTableConfig table_options = new BlockBasedTableConfig(); table_options.setBlockCacheSize(64 * SizeUnit.KB) - .setFilterBitsPerKey(10) - .setCacheNumShardBits(6); + .setFilter(bloomFilter) + .setCacheNumShardBits(6) + .setBlockSizeDeviation(5) + 
.setBlockRestartInterval(10) + .setCacheIndexAndFilterBlocks(true) + .setHashIndexAllowCollision(false) + .setBlockCacheCompressedSize(64 * SizeUnit.KB) + .setBlockCacheCompressedNumShardBits(10); + assert(table_options.blockCacheSize() == 64 * SizeUnit.KB); assert(table_options.cacheNumShardBits() == 6); + assert(table_options.blockSizeDeviation() == 5); + assert(table_options.blockRestartInterval() == 10); + assert(table_options.cacheIndexAndFilterBlocks() == true); + assert(table_options.hashIndexAllowCollision() == false); + assert(table_options.blockCacheCompressedSize() == 64 * SizeUnit.KB); + assert(table_options.blockCacheCompressedNumShardBits() == 10); + options.setTableFormatConfig(table_options); assert(options.tableFactoryName().equals("BlockBasedTable")); @@ -88,6 +118,8 @@ public class RocksDBSample { db.put("hello".getBytes(), "world".getBytes()); byte[] value = db.get("hello".getBytes()); assert("world".equals(new String(value))); + String str = db.getProperty("rocksdb.stats"); + assert(str != null && !str.equals("")); } catch (RocksDBException e) { System.out.format("[ERROR] caught the unexpceted exception -- %s\n", e); assert(db == null); @@ -121,6 +153,29 @@ public class RocksDBSample { System.out.println(""); } + // write batch test + WriteOptions writeOpt = new WriteOptions(); + for (int i = 10; i <= 19; ++i) { + WriteBatch batch = new WriteBatch(); + for (int j = 10; j <= 19; ++j) { + batch.put(String.format("%dx%d", i, j).getBytes(), + String.format("%d", i * j).getBytes()); + } + db.write(writeOpt, batch); + batch.dispose(); + } + for (int i = 10; i <= 19; ++i) { + for (int j = 10; j <= 19; ++j) { + assert(new String( + db.get(String.format("%dx%d", i, j).getBytes())).equals( + String.format("%d", i * j))); + System.out.format("%s ", new String(db.get( + String.format("%dx%d", i, j).getBytes()))); + } + System.out.println(""); + } + writeOpt.dispose(); + value = db.get("1x1".getBytes()); assert(value != null); value = 
db.get("world".getBytes()); diff --git a/java/src/main/java/org/rocksdb/AbstractComparator.java b/java/src/main/java/org/rocksdb/AbstractComparator.java new file mode 100644 index 000000000..1abdb4774 --- /dev/null +++ b/java/src/main/java/org/rocksdb/AbstractComparator.java @@ -0,0 +1,100 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Comparators are used by RocksDB to determine + * the ordering of keys. + * + * This class is package private, implementers + * should extend either of the public abstract classes: + * @see org.rocksdb.Comparator + * @see org.rocksdb.DirectComparator + */ +public abstract class AbstractComparator + extends RocksObject { + + /** + * The name of the comparator. Used to check for comparator + * mismatches (i.e., a DB created with one comparator is + * accessed using a different comparator). + * + * A new name should be used whenever + * the comparator implementation changes in a way that will cause + * the relative ordering of any two keys to change. + * + * Names starting with "rocksdb." are reserved and should not be used. + * + * @return The name of this comparator implementation + */ + public abstract String name(); + + /** + * Three-way key comparison + * + * @param a Slice access to first key + * @param b Slice access to second key + * + * @return Should return either: + * 1) < 0 if "a" < "b" + * 2) == 0 if "a" == "b" + * 3) > 0 if "a" > "b" + */ + public abstract int compare(final T a, final T b); + + /** + *

Used to reduce the space requirements + * for internal data structures like index blocks.

+ * + *

If start < limit, you may return a new start which is a + * shorter string in [start, limit).

+ * + *

Simple comparator implementations may return null if they + * wish to use start unchanged. i.e., an implementation of + * this method that does nothing is correct.

+ * + * @param start String + * @param limit of type T + * + * @return a shorter start, or null + */ + public String findShortestSeparator(final String start, final T limit) { + return null; + } + + /** + *

Used to reduce the space requirements + * for internal data structures like index blocks.

+ * + *

You may return a new short key (key1) where + * key1 ≥ key.

+ * + *

Simple comparator implementations may return null if they + * wish to leave the key unchanged. i.e., an implementation of + * this method that does nothing is correct.

+ * + * @param key String + * + * @return a shorter key, or null + */ + public String findShortSuccessor(final String key) { + return null; + } + + /** + * Deletes underlying C++ comparator pointer. + * + * Note that this function should be called only after all + * RocksDB instances referencing the comparator are closed. + * Otherwise an undefined behavior will occur. + */ + @Override protected void disposeInternal() { + assert(isInitialized()); + disposeInternal(nativeHandle_); + } + + private native void disposeInternal(long handle); +} diff --git a/java/src/main/java/org/rocksdb/AbstractRocksIterator.java b/java/src/main/java/org/rocksdb/AbstractRocksIterator.java new file mode 100644 index 000000000..08bd9dc23 --- /dev/null +++ b/java/src/main/java/org/rocksdb/AbstractRocksIterator.java @@ -0,0 +1,105 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Base class implementation for Rocks Iterators + * in the Java API + * + *

Multiple threads can invoke const methods on an RocksIterator without + * external synchronization, but if any of the threads may call a + * non-const method, all threads accessing the same RocksIterator must use + * external synchronization.

+ * + * @param

The type of the Parent Object from which the Rocks Iterator was + * created. This is used by disposeInternal to avoid double-free + * issues with the underlying C++ object. + * @see org.rocksdb.RocksObject + */ +public abstract class AbstractRocksIterator

+ extends RocksObject implements RocksIteratorInterface { + final P parent_; + + protected AbstractRocksIterator(P parent, long nativeHandle) { + super(); + nativeHandle_ = nativeHandle; + // parent must point to a valid RocksDB instance. + assert (parent != null); + // RocksIterator must hold a reference to the related parent instance + // to guarantee that while a GC cycle starts RocksIterator instances + // are freed prior to parent instances. + parent_ = parent; + } + + @Override + public boolean isValid() { + assert (isInitialized()); + return isValid0(nativeHandle_); + } + + @Override + public void seekToFirst() { + assert (isInitialized()); + seekToFirst0(nativeHandle_); + } + + @Override + public void seekToLast() { + assert (isInitialized()); + seekToLast0(nativeHandle_); + } + + @Override + public void seek(byte[] target) { + assert (isInitialized()); + seek0(nativeHandle_, target, target.length); + } + + @Override + public void next() { + assert (isInitialized()); + next0(nativeHandle_); + } + + @Override + public void prev() { + assert (isInitialized()); + prev0(nativeHandle_); + } + + @Override + public void status() throws RocksDBException { + assert (isInitialized()); + status0(nativeHandle_); + } + + /** + *

Deletes underlying C++ iterator pointer.

+ * + *

Note: the underlying handle can only be safely deleted if the parent + * instance related to a certain RocksIterator is still valid and initialized. + * Therefore {@code disposeInternal()} checks if the parent is initialized + * before freeing the native handle.

+ */ + @Override + protected void disposeInternal() { + synchronized (parent_) { + assert (isInitialized()); + if (parent_.isInitialized()) { + disposeInternal(nativeHandle_); + } + } + } + + abstract void disposeInternal(long handle); + abstract boolean isValid0(long handle); + abstract void seekToFirst0(long handle); + abstract void seekToLast0(long handle); + abstract void next0(long handle); + abstract void prev0(long handle); + abstract void seek0(long handle, byte[] target, int targetLen); + abstract void status0(long handle) throws RocksDBException; +} diff --git a/java/src/main/java/org/rocksdb/AbstractSlice.java b/java/src/main/java/org/rocksdb/AbstractSlice.java new file mode 100644 index 000000000..2b0d80c6f --- /dev/null +++ b/java/src/main/java/org/rocksdb/AbstractSlice.java @@ -0,0 +1,166 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Slices are used by RocksDB to provide + * efficient access to keys and values. + * + * This class is package private, implementers + * should extend either of the public abstract classes: + * @see org.rocksdb.Slice + * @see org.rocksdb.DirectSlice + * + * Regards the lifecycle of Java Slices in RocksDB: + * At present when you configure a Comparator from Java, it creates an + * instance of a C++ BaseComparatorJniCallback subclass and + * passes that to RocksDB as the comparator. That subclass of + * BaseComparatorJniCallback creates the Java + * @see org.rocksdb.AbstractSlice subclass Objects. When you dispose + * the Java @see org.rocksdb.AbstractComparator subclass, it disposes the + * C++ BaseComparatorJniCallback subclass, which in turn destroys the + * Java @see org.rocksdb.AbstractSlice subclass Objects. 
+ */ +abstract class AbstractSlice extends RocksObject { + + /** + * Returns the data of the slice. + * + * @return The slice data. Note, the type of access is + * determined by the subclass + * @see org.rocksdb.AbstractSlice#data0(long) + */ + public T data() { + assert (isInitialized()); + return data0(nativeHandle_); + } + + /** + * Access to the data is provided by the + * subtype as it needs to handle the + * generic typing. + * + * @param handle The address of the underlying + * native object. + * + * @return Java typed access to the data. + */ + protected abstract T data0(long handle); + + /** + * Return the length (in bytes) of the data. + * + * @return The length in bytes. + */ + public int size() { + assert (isInitialized()); + return size0(nativeHandle_); + } + + /** + * Return true if the length of the + * data is zero. + * + * @return true if there is no data, false otherwise. + */ + public boolean empty() { + assert (isInitialized()); + return empty0(nativeHandle_); + } + + /** + * Creates a string representation of the data + * + * @param hex When true, the representation + * will be encoded in hexadecimal. + * + * @return The string representation of the data. + */ + public String toString(final boolean hex) { + assert (isInitialized()); + return toString0(nativeHandle_, hex); + } + + @Override + public String toString() { + return toString(false); + } + + /** + * Three-way key comparison + * + * @param other A slice to compare against + * + * @return Should return either: + * 1) < 0 if this < other + * 2) == 0 if this == other + * 3) > 0 if this > other + */ + public int compare(final AbstractSlice other) { + assert (other != null); + assert (isInitialized()); + return compare0(nativeHandle_, other.nativeHandle_); + } + + /** + * If other is a slice object, then + * we defer to {@link #compare(AbstractSlice) compare} + * to check equality, otherwise we return false. 
+ * + * @param other Object to test for equality + * + * @return true when {@code this.compare(other) == 0}, + * false otherwise. + */ + @Override + public boolean equals(final Object other) { + if (other != null && other instanceof AbstractSlice) { + return compare((AbstractSlice)other) == 0; + } else { + return false; + } + } + + /** + * Determines whether this slice starts with + * another slice + * + * @param prefix Another slice which may of may not + * be a prefix of this slice. + * + * @return true when this slice starts with the + * {@code prefix} slice + */ + public boolean startsWith(final AbstractSlice prefix) { + if (prefix != null) { + assert (isInitialized()); + return startsWith0(nativeHandle_, prefix.nativeHandle_); + } else { + return false; + } + } + + /** + * Deletes underlying C++ slice pointer. + * Note that this function should be called only after all + * RocksDB instances referencing the slice are closed. + * Otherwise an undefined behavior will occur. + */ + @Override + protected void disposeInternal() { + assert(isInitialized()); + disposeInternal(nativeHandle_); + } + + protected native void createNewSliceFromString(String str); + private native int size0(long handle); + private native boolean empty0(long handle); + private native String toString0(long handle, boolean hex); + private native int compare0(long handle, long otherHandle); + private native boolean startsWith0(long handle, long otherHandle); + private native void disposeInternal(long handle); + +} diff --git a/java/src/main/java/org/rocksdb/AbstractWriteBatch.java b/java/src/main/java/org/rocksdb/AbstractWriteBatch.java new file mode 100644 index 000000000..b380c5d8a --- /dev/null +++ b/java/src/main/java/org/rocksdb/AbstractWriteBatch.java @@ -0,0 +1,92 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +public abstract class AbstractWriteBatch extends RocksObject implements WriteBatchInterface { + + @Override + public int count() { + assert (isInitialized()); + return count0(); + } + + @Override + public void put(byte[] key, byte[] value) { + assert (isInitialized()); + put(key, key.length, value, value.length); + } + + @Override + public void put(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) { + assert (isInitialized()); + put(key, key.length, value, value.length, columnFamilyHandle.nativeHandle_); + } + + @Override + public void merge(byte[] key, byte[] value) { + assert (isInitialized()); + merge(key, key.length, value, value.length); + } + + @Override + public void merge(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) { + assert (isInitialized()); + merge(key, key.length, value, value.length, columnFamilyHandle.nativeHandle_); + } + + @Override + public void remove(byte[] key) { + assert (isInitialized()); + remove(key, key.length); + } + + @Override + public void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key) { + assert (isInitialized()); + remove(key, key.length, columnFamilyHandle.nativeHandle_); + } + + @Override + public void putLogData(byte[] blob) { + assert (isInitialized()); + putLogData(blob, blob.length); + } + + @Override + public void clear() { + assert (isInitialized()); + clear0(); + } + + /** + * Delete the c++ side pointer. 
+ */ + @Override + protected void disposeInternal() { + assert (isInitialized()); + disposeInternal(nativeHandle_); + } + + abstract void disposeInternal(long handle); + + abstract int count0(); + + abstract void put(byte[] key, int keyLen, byte[] value, int valueLen); + + abstract void put(byte[] key, int keyLen, byte[] value, int valueLen, long cfHandle); + + abstract void merge(byte[] key, int keyLen, byte[] value, int valueLen); + + abstract void merge(byte[] key, int keyLen, byte[] value, int valueLen, long cfHandle); + + abstract void remove(byte[] key, int keyLen); + + abstract void remove(byte[] key, int keyLen, long cfHandle); + + abstract void putLogData(byte[] blob, int blobLen); + + abstract void clear0(); +} diff --git a/java/src/main/java/org/rocksdb/BackupInfo.java b/java/src/main/java/org/rocksdb/BackupInfo.java new file mode 100644 index 000000000..407445473 --- /dev/null +++ b/java/src/main/java/org/rocksdb/BackupInfo.java @@ -0,0 +1,67 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +package org.rocksdb; + +/** + * Instances of this class describe a Backup made by + * {@link org.rocksdb.BackupableDB}. + */ +public class BackupInfo { + + /** + * Package private constructor used to create instances + * of BackupInfo by {@link org.rocksdb.BackupableDB} and + * {@link org.rocksdb.RestoreBackupableDB}. + * + * @param backupId id of backup + * @param timestamp timestamp of backup + * @param size size of backup + * @param numberFiles number of files related to this backup. + */ + BackupInfo(int backupId, long timestamp, long size, + int numberFiles) { + backupId_ = backupId; + timestamp_ = timestamp; + size_ = size; + numberFiles_ = numberFiles; + } + + /** + * + * @return the backup id. 
+ */ + public int backupId() { + return backupId_; + } + + /** + * + * @return the timestamp of the backup. + */ + public long timestamp() { + return timestamp_; + } + + /** + * + * @return the size of the backup + */ + public long size() { + return size_; + } + + /** + * + * @return the number of files of this backup. + */ + public int numberFiles() { + return numberFiles_; + } + + private int backupId_; + private long timestamp_; + private long size_; + private int numberFiles_; +} diff --git a/java/src/main/java/org/rocksdb/BackupableDB.java b/java/src/main/java/org/rocksdb/BackupableDB.java new file mode 100644 index 000000000..a743d861e --- /dev/null +++ b/java/src/main/java/org/rocksdb/BackupableDB.java @@ -0,0 +1,166 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import java.util.List; + +/** + *

A subclass of RocksDB which supports + * backup-related operations.

+ * + * @see org.rocksdb.BackupableDBOptions + */ +public class BackupableDB extends RocksDB { + /** + *

Open a {@code BackupableDB} under the specified path. + * Note that the backup path should be set properly in the + * input BackupableDBOptions.

+ * + * @param opt {@link org.rocksdb.Options} to set for the database. + * @param bopt {@link org.rocksdb.BackupableDBOptions} to use. + * @param db_path Path to store data to. The path for storing the backup should be + * specified in the {@link org.rocksdb.BackupableDBOptions}. + * + * @return {@link BackupableDB} reference to the opened database. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public static BackupableDB open( + Options opt, BackupableDBOptions bopt, String db_path) + throws RocksDBException { + + RocksDB db = RocksDB.open(opt, db_path); + BackupableDB bdb = new BackupableDB(); + bdb.open(db.nativeHandle_, bopt.nativeHandle_); + + // Prevent the RocksDB object from attempting to delete + // the underly C++ DB object. + db.disOwnNativeHandle(); + + return bdb; + } + + /** + *

Captures the state of the database in the latest backup. + * Note that this function is not thread-safe.

+ * + * @param flushBeforeBackup if true, then all data will be flushed + * before creating backup. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void createNewBackup(boolean flushBeforeBackup) + throws RocksDBException { + assert(isInitialized()); + createNewBackup(nativeHandle_, flushBeforeBackup); + } + + /** + *

Deletes old backups, keeping latest numBackupsToKeep alive.

+ * + * @param numBackupsToKeep Number of latest backups to keep. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void purgeOldBackups(int numBackupsToKeep) + throws RocksDBException { + assert(isInitialized()); + purgeOldBackups(nativeHandle_, numBackupsToKeep); + } + + /** + *

Deletes a specific backup.

+ * + * @param backupId of backup to delete. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void deleteBackup(int backupId) throws RocksDBException { + assert(isInitialized()); + deleteBackup0(nativeHandle_, backupId); + } + + /** + *

Returns a list of {@link BackupInfo} instances, which describe + * already made backups.

+ * + * @return List of {@link BackupInfo} instances. + */ + public List getBackupInfos() { + assert(isInitialized()); + return getBackupInfo(nativeHandle_); + } + + /** + *

Returns a list of corrupted backup ids. If there + * is no corrupted backup the method will return an + * empty list.

+ * + * @return array of backup ids as int ids. + */ + public int[] getCorruptedBackups() { + assert(isInitialized()); + return getCorruptedBackups(nativeHandle_); + } + + /** + *

Will delete all the files we don't need anymore. It will + * do the full scan of the files/ directory and delete all the + * files that are not referenced.

+ * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void garbageCollect() throws RocksDBException { + assert(isInitialized()); + garbageCollect(nativeHandle_); + } + + /** + *

Close the BackupableDB instance and release resource.

+ * + *

Internally, {@link BackupableDB} owns the {@code rocksdb::DB} + * pointer to its associated {@link org.rocksdb.RocksDB}. + * The release of that RocksDB pointer is handled in the destructor + * of the c++ {@code rocksdb::BackupableDB} and should be transparent + * to Java developers.

+ */ + @Override public synchronized void close() { + if (isInitialized()) { + super.close(); + } + } + + /** + *

A protected construction that will be used in the static + * factory method {@link #open(Options, BackupableDBOptions, String)}. + *

+ */ + protected BackupableDB() { + super(); + } + + @Override protected void finalize() throws Throwable { + close(); + super.finalize(); + } + + protected native void open(long rocksDBHandle, long backupDBOptionsHandle); + protected native void createNewBackup(long handle, boolean flag) + throws RocksDBException; + protected native void purgeOldBackups(long handle, int numBackupsToKeep) + throws RocksDBException; + private native void deleteBackup0(long nativeHandle, int backupId) + throws RocksDBException; + protected native List getBackupInfo(long handle); + private native int[] getCorruptedBackups(long handle); + private native void garbageCollect(long handle) + throws RocksDBException; +} diff --git a/java/src/main/java/org/rocksdb/BackupableDBOptions.java b/java/src/main/java/org/rocksdb/BackupableDBOptions.java new file mode 100644 index 000000000..ab532f282 --- /dev/null +++ b/java/src/main/java/org/rocksdb/BackupableDBOptions.java @@ -0,0 +1,271 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import java.io.File; +import java.nio.file.Path; + +/** + *

BackupableDBOptions to control the behavior of a backupable database. + * It will be used during the creation of a {@link org.rocksdb.BackupableDB}. + *

+ *

Note that dispose() must be called before an Options instance + * become out-of-scope to release the allocated memory in c++.

+ * + * @see org.rocksdb.BackupableDB + */ +public class BackupableDBOptions extends RocksObject { + + /** + *

BackupableDBOptions constructor.

+ * + * @param path Where to keep the backup files. Has to be different than db name. + * Best to set this to {@code db name_ + "/backups"} + * @throws java.lang.IllegalArgumentException if illegal path is used. + */ + public BackupableDBOptions(String path) { + super(); + File backupPath = path == null ? null : new File(path); + if (backupPath == null || !backupPath.isDirectory() || !backupPath.canWrite()) { + throw new IllegalArgumentException("Illegal path provided."); + } + newBackupableDBOptions(path); + } + + /** + *

Returns the path to the BackupableDB directory.

+ * + * @return the path to the BackupableDB directory. + */ + public String backupDir() { + assert(isInitialized()); + return backupDir(nativeHandle_); + } + + /** + *

Share table files between backups.

+ * + * @param shareTableFiles If {@code share_table_files == true}, backup will assume + * that table files with same name have the same contents. This enables incremental + * backups and avoids unnecessary data copies. If {@code share_table_files == false}, + * each backup will be on its own and will not share any data with other backups. + * + *

Default: true

+ * + * @return instance of current BackupableDBOptions. + */ + public BackupableDBOptions setShareTableFiles(boolean shareTableFiles) { + assert(isInitialized()); + setShareTableFiles(nativeHandle_, shareTableFiles); + return this; + } + + /** + *

Share table files between backups.

+ * + * @return boolean value indicating if SST files will be shared between + * backups. + */ + public boolean shareTableFiles() { + assert(isInitialized()); + return shareTableFiles(nativeHandle_); + } + + /** + *

Set synchronous backups.

+ * + * @param sync If {@code sync == true}, we can guarantee you'll get consistent backup + * even on a machine crash/reboot. Backup process is slower with sync enabled. + * If {@code sync == false}, we don't guarantee anything on machine reboot. + * However,chances are some of the backups are consistent. + * + *

Default: true

+ * + * @return instance of current BackupableDBOptions. + */ + public BackupableDBOptions setSync(boolean sync) { + assert(isInitialized()); + setSync(nativeHandle_, sync); + return this; + } + + /** + *

Are synchronous backups activated.

+ * + * @return boolean value if synchronous backups are configured. + */ + public boolean sync() { + assert(isInitialized()); + return sync(nativeHandle_); + } + + /** + *

Set if old data will be destroyed.

+ * + * @param destroyOldData If true, it will delete whatever backups there are already. + * + *

Default: false

+ * + * @return instance of current BackupableDBOptions. + */ + public BackupableDBOptions setDestroyOldData(boolean destroyOldData) { + assert(isInitialized()); + setDestroyOldData(nativeHandle_, destroyOldData); + return this; + } + + /** + *

Returns if old data will be destroyed will performing new backups.

+ * + * @return boolean value indicating if old data will be destroyed. + */ + public boolean destroyOldData() { + assert(isInitialized()); + return destroyOldData(nativeHandle_); + } + + /** + *

Set if log files shall be persisted.

+ * + * @param backupLogFiles If false, we won't backup log files. This option can be + * useful for backing up in-memory databases where log file are persisted,but table + * files are in memory. + * + *

Default: true

+ * + * @return instance of current BackupableDBOptions. + */ + public BackupableDBOptions setBackupLogFiles(boolean backupLogFiles) { + assert(isInitialized()); + setBackupLogFiles(nativeHandle_, backupLogFiles); + return this; + } + + /** + *

Return information if log files shall be persisted.

+ * + * @return boolean value indicating if log files will be persisted. + */ + public boolean backupLogFiles() { + assert(isInitialized()); + return backupLogFiles(nativeHandle_); + } + + /** + *

Set backup rate limit.

+ * + * @param backupRateLimit Max bytes that can be transferred in a second during backup. + * If 0 or negative, then go as fast as you can. + * + *

Default: 0

+ * + * @return instance of current BackupableDBOptions. + */ + public BackupableDBOptions setBackupRateLimit(long backupRateLimit) { + assert(isInitialized()); + backupRateLimit = (backupRateLimit <= 0) ? 0 : backupRateLimit; + setBackupRateLimit(nativeHandle_, backupRateLimit); + return this; + } + + /** + *

Return backup rate limit which described the max bytes that can be transferred in a + * second during backup.

+ * + * @return numerical value describing the backup transfer limit in bytes per second. + */ + public long backupRateLimit() { + assert(isInitialized()); + return backupRateLimit(nativeHandle_); + } + + /** + *

Set restore rate limit.

+ * + * @param restoreRateLimit Max bytes that can be transferred in a second during restore. + * If 0 or negative, then go as fast as you can. + * + *

Default: 0

+ * + * @return instance of current BackupableDBOptions. + */ + public BackupableDBOptions setRestoreRateLimit(long restoreRateLimit) { + assert(isInitialized()); + restoreRateLimit = (restoreRateLimit <= 0) ? 0 : restoreRateLimit; + setRestoreRateLimit(nativeHandle_, restoreRateLimit); + return this; + } + + /** + *

Return restore rate limit which described the max bytes that can be transferred in a + * second during restore.

+ * + * @return numerical value describing the restore transfer limit in bytes per second. + */ + public long restoreRateLimit() { + assert(isInitialized()); + return restoreRateLimit(nativeHandle_); + } + + /** + *

Only used if share_table_files is set to true. If true, will consider that + * backups can come from different databases, hence a sst is not uniquely + * identified by its name, but by the triple (file name, crc32, file length)

+ * + * @param shareFilesWithChecksum boolean value indicating if SST files are stored + * using the triple (file name, crc32, file length) and not its name. + * + *

Note: this is an experimental option, and you'll need to set it manually; + * turn it on only if you know what you're doing.

+ * + *

Default: false

+ * + * @return instance of current BackupableDBOptions. + */ + public BackupableDBOptions setShareFilesWithChecksum( + boolean shareFilesWithChecksum) { + assert(isInitialized()); + setShareFilesWithChecksum(nativeHandle_, shareFilesWithChecksum); + return this; + } + + /** + *

Return true if share files with checksum is active.

+ * + * @return boolean value indicating if share files with checksum + * is active. + */ + public boolean shareFilesWithChecksum() { + assert(isInitialized()); + return shareFilesWithChecksum(nativeHandle_); + } + + /** + * Release the memory allocated for the current instance + * in the c++ side. + */ + @Override protected void disposeInternal() { + disposeInternal(nativeHandle_); + } + + private native void newBackupableDBOptions(String path); + private native String backupDir(long handle); + private native void setShareTableFiles(long handle, boolean flag); + private native boolean shareTableFiles(long handle); + private native void setSync(long handle, boolean flag); + private native boolean sync(long handle); + private native void setDestroyOldData(long handle, boolean flag); + private native boolean destroyOldData(long handle); + private native void setBackupLogFiles(long handle, boolean flag); + private native boolean backupLogFiles(long handle); + private native void setBackupRateLimit(long handle, long rateLimit); + private native long backupRateLimit(long handle); + private native void setRestoreRateLimit(long handle, long rateLimit); + private native long restoreRateLimit(long handle); + private native void setShareFilesWithChecksum(long handle, boolean flag); + private native boolean shareFilesWithChecksum(long handle); + private native void disposeInternal(long handle); +} diff --git a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java new file mode 100644 index 000000000..302fc8a0b --- /dev/null +++ b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java @@ -0,0 +1,417 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+package org.rocksdb; + +/** + * The config for plain table sst format. + * + * BlockBasedTable is a RocksDB's default SST file format. + */ +public class BlockBasedTableConfig extends TableFormatConfig { + + public BlockBasedTableConfig() { + noBlockCache_ = false; + blockCacheSize_ = 8 * 1024 * 1024; + blockCacheNumShardBits_ = 0; + blockSize_ = 4 * 1024; + blockSizeDeviation_ = 10; + blockRestartInterval_ = 16; + wholeKeyFiltering_ = true; + filter_ = null; + cacheIndexAndFilterBlocks_ = false; + hashIndexAllowCollision_ = true; + blockCacheCompressedSize_ = 0; + blockCacheCompressedNumShardBits_ = 0; + checksumType_ = ChecksumType.kCRC32c; + indexType_ = IndexType.kBinarySearch; + formatVersion_ = 0; + } + + /** + * Disable block cache. If this is set to true, + * then no block cache should be used, and the block_cache should + * point to a {@code nullptr} object. + * Default: false + * + * @param noBlockCache if use block cache + * @return the reference to the current config. + */ + public BlockBasedTableConfig setNoBlockCache(boolean noBlockCache) { + noBlockCache_ = noBlockCache; + return this; + } + + /** + * @return if block cache is disabled + */ + public boolean noBlockCache() { + return noBlockCache_; + } + + /** + * Set the amount of cache in bytes that will be used by RocksDB. + * If cacheSize is non-positive, then cache will not be used. + * DEFAULT: 8M + * + * @param blockCacheSize block cache size in bytes + * @return the reference to the current config. + */ + public BlockBasedTableConfig setBlockCacheSize(long blockCacheSize) { + blockCacheSize_ = blockCacheSize; + return this; + } + + /** + * @return block cache size in bytes + */ + public long blockCacheSize() { + return blockCacheSize_; + } + + /** + * Controls the number of shards for the block cache. + * This is applied only if cacheSize is set to non-negative. + * + * @param blockCacheNumShardBits the number of shard bits. The resulting + * number of shards would be 2 ^ numShardBits. 
Any negative + * number means use default settings." + * @return the reference to the current option. + */ + public BlockBasedTableConfig setCacheNumShardBits(int blockCacheNumShardBits) { + blockCacheNumShardBits_ = blockCacheNumShardBits; + return this; + } + + /** + * Returns the number of shard bits used in the block cache. + * The resulting number of shards would be 2 ^ (returned value). + * Any negative number means use default settings. + * + * @return the number of shard bits used in the block cache. + */ + public int cacheNumShardBits() { + return blockCacheNumShardBits_; + } + + /** + * Approximate size of user data packed per block. Note that the + * block size specified here corresponds to uncompressed data. The + * actual size of the unit read from disk may be smaller if + * compression is enabled. This parameter can be changed dynamically. + * Default: 4K + * + * @param blockSize block size in bytes + * @return the reference to the current config. + */ + public BlockBasedTableConfig setBlockSize(long blockSize) { + blockSize_ = blockSize; + return this; + } + + /** + * @return block size in bytes + */ + public long blockSize() { + return blockSize_; + } + + /** + * This is used to close a block before it reaches the configured + * 'block_size'. If the percentage of free space in the current block is less + * than this specified number and adding a new record to the block will + * exceed the configured block size, then this block will be closed and the + * new record will be written to the next block. + * Default is 10. + * + * @param blockSizeDeviation the deviation to block size allowed + * @return the reference to the current config. + */ + public BlockBasedTableConfig setBlockSizeDeviation(int blockSizeDeviation) { + blockSizeDeviation_ = blockSizeDeviation; + return this; + } + + /** + * @return the hash table ratio. 
+ */ + public int blockSizeDeviation() { + return blockSizeDeviation_; + } + + /** + * Set block restart interval + * + * @param restartInterval block restart interval. + * @return the reference to the current config. + */ + public BlockBasedTableConfig setBlockRestartInterval(int restartInterval) { + blockRestartInterval_ = restartInterval; + return this; + } + + /** + * @return block restart interval + */ + public int blockRestartInterval() { + return blockRestartInterval_; + } + + /** + * If true, place whole keys in the filter (not just prefixes). + * This must generally be true for gets to be efficient. + * Default: true + * + * @param wholeKeyFiltering if enable whole key filtering + * @return the reference to the current config. + */ + public BlockBasedTableConfig setWholeKeyFiltering(boolean wholeKeyFiltering) { + wholeKeyFiltering_ = wholeKeyFiltering; + return this; + } + + /** + * @return if whole key filtering is enabled + */ + public boolean wholeKeyFiltering() { + return wholeKeyFiltering_; + } + + /** + * Use the specified filter policy to reduce disk reads. + * + * {@link org.rocksdb.Filter} should not be disposed before options instances + * using this filter is disposed. If {@link Filter#dispose()} function is not + * called, then filter object will be GC'd automatically. + * + * {@link org.rocksdb.Filter} instance can be re-used in multiple options + * instances. + * + * @param filter {@link org.rocksdb.Filter} Filter Policy java instance. + * @return the reference to the current config. + */ + public BlockBasedTableConfig setFilter(Filter filter) { + filter_ = filter; + return this; + } + + /** + * Indicating if we'd put index/filter blocks to the block cache. + If not specified, each "table reader" object will pre-load index/filter + block during table initialization. + * + * @return if index and filter blocks should be put in block cache. 
+ */ + public boolean cacheIndexAndFilterBlocks() { + return cacheIndexAndFilterBlocks_; + } + + /** + * Indicating if we'd put index/filter blocks to the block cache. + If not specified, each "table reader" object will pre-load index/filter + block during table initialization. + * + * @param cacheIndexAndFilterBlocks and filter blocks should be put in block cache. + * @return the reference to the current config. + */ + public BlockBasedTableConfig setCacheIndexAndFilterBlocks( + boolean cacheIndexAndFilterBlocks) { + cacheIndexAndFilterBlocks_ = cacheIndexAndFilterBlocks; + return this; + } + + /** + * Influence the behavior when kHashSearch is used. + if false, stores a precise prefix to block range mapping + if true, does not store prefix and allows prefix hash collision + (less memory consumption) + * + * @return if hash collisions should be allowed. + */ + public boolean hashIndexAllowCollision() { + return hashIndexAllowCollision_; + } + + /** + * Influence the behavior when kHashSearch is used. + if false, stores a precise prefix to block range mapping + if true, does not store prefix and allows prefix hash collision + (less memory consumption) + * + * @param hashIndexAllowCollision points out if hash collisions should be allowed. + * @return the reference to the current config. + */ + public BlockBasedTableConfig setHashIndexAllowCollision( + boolean hashIndexAllowCollision) { + hashIndexAllowCollision_ = hashIndexAllowCollision; + return this; + } + + /** + * Size of compressed block cache. If 0, then block_cache_compressed is set + * to null. + * + * @return size of compressed block cache. + */ + public long blockCacheCompressedSize() { + return blockCacheCompressedSize_; + } + + /** + * Size of compressed block cache. If 0, then block_cache_compressed is set + * to null. + * + * @param blockCacheCompressedSize of compressed block cache. + * @return the reference to the current config. 
+ */ + public BlockBasedTableConfig setBlockCacheCompressedSize( + long blockCacheCompressedSize) { + blockCacheCompressedSize_ = blockCacheCompressedSize; + return this; + } + + /** + * Controls the number of shards for the block compressed cache. + * This is applied only if blockCompressedCacheSize is set to non-negative. + * + * @return numShardBits the number of shard bits. The resulting + * number of shards would be 2 ^ numShardBits. Any negative + * number means use default settings. + */ + public int blockCacheCompressedNumShardBits() { + return blockCacheCompressedNumShardBits_; + } + + /** + * Controls the number of shards for the block compressed cache. + * This is applied only if blockCompressedCacheSize is set to non-negative. + * + * @param blockCacheCompressedNumShardBits the number of shard bits. The resulting + * number of shards would be 2 ^ numShardBits. Any negative + * number means use default settings." + * @return the reference to the current option. + */ + public BlockBasedTableConfig setBlockCacheCompressedNumShardBits( + int blockCacheCompressedNumShardBits) { + blockCacheCompressedNumShardBits_ = blockCacheCompressedNumShardBits; + return this; + } + + /** + * Sets the checksum type to be used with this table. + * + * @param checksumType {@link org.rocksdb.ChecksumType} value. + * @return the reference to the current option. + */ + public BlockBasedTableConfig setChecksumType(ChecksumType checksumType) { + checksumType_ = checksumType; + return this; + } + + /** + * + * @return the currently set checksum type + */ + public ChecksumType checksumType() { + return checksumType_; + } + + /** + * Sets the index type to used with this table. + * + * @param indexType {@link org.rocksdb.IndexType} value + * @return the reference to the current option. 
+ */ + public BlockBasedTableConfig setIndexType(IndexType indexType) { + indexType_ = indexType; + return this; + } + + /** + * + * @return the currently set index type + */ + public IndexType indexType() { + return indexType_; + } + + /** + *

We currently have three versions:

+ * + *
    + *
  • 0 - This version is currently written + * out by all RocksDB's versions by default. Can be read by really old + * RocksDB's. Doesn't support changing checksum (default is CRC32).
  • + *
  • 1 - Can be read by RocksDB's versions since 3.0. + * Supports non-default checksum, like xxHash. It is written by RocksDB when + * BlockBasedTableOptions::checksum is something other than kCRC32c. (version + * 0 is silently upconverted)
  • + *
  • 2 - Can be read by RocksDB's versions since 3.10. + * Changes the way we encode compressed blocks with LZ4, BZip2 and Zlib + * compression. If you don't plan to run RocksDB before version 3.10, + * you should probably use this.
  • + *
+ *

This option only affects newly written tables. When reading existing + * tables, the information about version is read from the footer.

+ * + * @param formatVersion integer representing the version to be used. + * @return the reference to the current option. + */ + public BlockBasedTableConfig setFormatVersion(int formatVersion) { + assert(formatVersion >= 0 && formatVersion <= 2); + formatVersion_ = formatVersion; + return this; + } + + /** + * + * @return the currently configured format version. + * See also: {@link #setFormatVersion(int)}. + */ + public int formatVersion() { + return formatVersion_; + } + + + + @Override protected long newTableFactoryHandle() { + long filterHandle = 0; + if (filter_ != null) { + filterHandle = filter_.nativeHandle_; + } + + return newTableFactoryHandle(noBlockCache_, blockCacheSize_, + blockCacheNumShardBits_, blockSize_, blockSizeDeviation_, + blockRestartInterval_, wholeKeyFiltering_, + filterHandle, cacheIndexAndFilterBlocks_, + hashIndexAllowCollision_, blockCacheCompressedSize_, + blockCacheCompressedNumShardBits_, + checksumType_.getValue(), indexType_.getValue(), + formatVersion_); + } + + private native long newTableFactoryHandle( + boolean noBlockCache, long blockCacheSize, int blockCacheNumShardBits, + long blockSize, int blockSizeDeviation, int blockRestartInterval, + boolean wholeKeyFiltering, long filterPolicyHandle, + boolean cacheIndexAndFilterBlocks, boolean hashIndexAllowCollision, + long blockCacheCompressedSize, int blockCacheCompressedNumShardBits, + byte checkSumType, byte indexType, int formatVersion); + + private boolean cacheIndexAndFilterBlocks_; + private IndexType indexType_; + private boolean hashIndexAllowCollision_; + private ChecksumType checksumType_; + private boolean noBlockCache_; + private long blockSize_; + private long blockCacheSize_; + private int blockCacheNumShardBits_; + private long blockCacheCompressedSize_; + private int blockCacheCompressedNumShardBits_; + private int blockSizeDeviation_; + private int blockRestartInterval_; + private Filter filter_; + private boolean wholeKeyFiltering_; + private int 
formatVersion_; +} diff --git a/java/src/main/java/org/rocksdb/BloomFilter.java b/java/src/main/java/org/rocksdb/BloomFilter.java new file mode 100644 index 000000000..dd2a511dd --- /dev/null +++ b/java/src/main/java/org/rocksdb/BloomFilter.java @@ -0,0 +1,89 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Bloom filter policy that uses a bloom filter with approximately + * the specified number of bits per key. + * + *

+ * Note: if you are using a custom comparator that ignores some parts + * of the keys being compared, you must not use this {@code BloomFilter} + * and must provide your own FilterPolicy that also ignores the + * corresponding parts of the keys. For example, if the comparator + * ignores trailing spaces, it would be incorrect to use a + * FilterPolicy (like {@code BloomFilter}) that does not ignore + * trailing spaces in keys.

+ */ +public class BloomFilter extends Filter { + + private static final int DEFAULT_BITS_PER_KEY = 10; + private static final boolean DEFAULT_MODE = true; + private final int bitsPerKey_; + private final boolean useBlockBasedMode_; + + /** + * BloomFilter constructor + * + *

+ * Callers must delete the result after any database that is using the + * result has been closed.

+ */ + public BloomFilter() { + this(DEFAULT_BITS_PER_KEY, DEFAULT_MODE); + } + + /** + * BloomFilter constructor + * + *

+ * bits_per_key: bits per key in bloom filter. A good value for bits_per_key + * is 10, which yields a filter with ~ 1% false positive rate. + *

+ *

+ * Callers must delete the result after any database that is using the + * result has been closed.

+ * + * @param bitsPerKey number of bits to use + */ + public BloomFilter(int bitsPerKey) { + this(bitsPerKey, DEFAULT_MODE); + } + + /** + * BloomFilter constructor + * + *

+ * bits_per_key: bits per key in bloom filter. A good value for bits_per_key + * is 10, which yields a filter with ~ 1% false positive rate. + *

default bits_per_key: 10

+ * + *

use_block_based_builder: use block based filter rather than full filter. + * If you want to build a full filter, it needs to be set to false. + *

+ *

default mode: block based filter

+ *

+ * Callers must delete the result after any database that is using the + * result has been closed.

+ * + * @param bitsPerKey number of bits to use + * @param useBlockBasedMode use block based mode or full filter mode + */ + public BloomFilter(int bitsPerKey, boolean useBlockBasedMode) { + super(); + bitsPerKey_ = bitsPerKey; + useBlockBasedMode_ = useBlockBasedMode; + createNewFilter(); + } + + @Override + protected void createNewFilter() { + createNewBloomFilter(bitsPerKey_, useBlockBasedMode_); + } + + private native void createNewBloomFilter(int bitsKeyKey, + boolean useBlockBasedMode); +} diff --git a/java/src/main/java/org/rocksdb/BuiltinComparator.java b/java/src/main/java/org/rocksdb/BuiltinComparator.java new file mode 100644 index 000000000..ee92e8dd9 --- /dev/null +++ b/java/src/main/java/org/rocksdb/BuiltinComparator.java @@ -0,0 +1,20 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Builtin RocksDB comparators + * + *
    + *
  1. BYTEWISE_COMPARATOR - Sorts all keys in ascending bytewise + * order.
  2. + *
  3. REVERSE_BYTEWISE_COMPARATOR - Sorts all keys in descending bytewise + * order
  4. + *
+ */ +public enum BuiltinComparator { + BYTEWISE_COMPARATOR, REVERSE_BYTEWISE_COMPARATOR +} diff --git a/java/src/main/java/org/rocksdb/Checkpoint.java b/java/src/main/java/org/rocksdb/Checkpoint.java new file mode 100644 index 000000000..2525bb08b --- /dev/null +++ b/java/src/main/java/org/rocksdb/Checkpoint.java @@ -0,0 +1,72 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Provides Checkpoint functionality. Checkpoints + * provide persistent snapshots of RocksDB databases. + */ +public class Checkpoint extends RocksObject { + + /** + * Creates a Checkpoint object to be used for creating open-able + * snapshots. + * + * @param db {@link RocksDB} instance. + * @return a Checkpoint instance. + * + * @throws java.lang.IllegalArgumentException if {@link RocksDB} + * instance is null. + * @throws java.lang.IllegalStateException if {@link RocksDB} + * instance is not initialized. + */ + public static Checkpoint create(RocksDB db) { + if (db == null) { + throw new IllegalArgumentException( + "RocksDB instance shall not be null."); + } else if (!db.isInitialized()) { + throw new IllegalStateException( + "RocksDB instance must be initialized."); + } + Checkpoint checkpoint = new Checkpoint(db); + return checkpoint; + } + + /** + *

Builds an open-able snapshot of RocksDB on the same disk, which + * accepts an output directory on the same disk, and under the directory + * (1) hard-linked SST files pointing to existing live SST files + * (2) copied manifest files and other files

+ * + * @param checkpointPath path to the folder where the snapshot is going + * to be stored. + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public void createCheckpoint(String checkpointPath) + throws RocksDBException { + createCheckpoint(nativeHandle_, checkpointPath); + } + + @Override + protected void disposeInternal() { + disposeInternal(nativeHandle_); + } + + private Checkpoint(RocksDB db) { + super(); + nativeHandle_ = newCheckpoint(db.nativeHandle_); + db_ = db; + } + + RocksDB db_; + + private static native long newCheckpoint(long dbHandle); + private native void disposeInternal(long handle); + + private native void createCheckpoint(long handle, String checkpointPath) + throws RocksDBException; +} diff --git a/java/org/rocksdb/CompactionStyle.java b/java/src/main/java/org/rocksdb/ChecksumType.java similarity index 51% rename from java/org/rocksdb/CompactionStyle.java rename to java/src/main/java/org/rocksdb/ChecksumType.java index 5c41dfdd2..e685376bf 100644 --- a/java/org/rocksdb/CompactionStyle.java +++ b/java/src/main/java/org/rocksdb/ChecksumType.java @@ -5,18 +5,35 @@ package org.rocksdb; -public enum CompactionStyle { - LEVEL((byte) 0), - UNIVERSAL((byte) 1), - FIFO((byte) 2); - - private final byte value_; - - private CompactionStyle(byte value) { - value_ = value; - } +/** + * Checksum types used in conjunction with BlockBasedTable. + */ +public enum ChecksumType { + /** + * Not implemented yet. 
+ */ + kNoChecksum((byte) 0), + /** + * CRC32 Checksum + */ + kCRC32c((byte) 1), + /** + * XX Hash + */ + kxxHash((byte) 2); + /** + * Returns the byte value of the enumerations value + * + * @return byte representation + */ public byte getValue() { return value_; } + + private ChecksumType(byte value) { + value_ = value; + } + + private final byte value_; } diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java b/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java new file mode 100644 index 000000000..4c0954740 --- /dev/null +++ b/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java @@ -0,0 +1,90 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + *

Describes a column family with a + * name and respective Options.

+ */ +public class ColumnFamilyDescriptor { + + /** + *

Creates a new Column Family using a name and default + * options.

+ * + * @param columnFamilyName name of column family. + * @deprecated will be removed in RocksDB 3.10.0. Use + * {@link #ColumnFamilyDescriptor(byte[])} instead. + */ + @Deprecated + public ColumnFamilyDescriptor(final String columnFamilyName){ + this(columnFamilyName.getBytes(), new ColumnFamilyOptions()); + } + + /** + *

Creates a new Column Family using a name and default + * options.

+ * + * @param columnFamilyName name of column family. + * @since 3.10.0 + */ + public ColumnFamilyDescriptor(final byte[] columnFamilyName) { + this(columnFamilyName, new ColumnFamilyOptions()); + } + + /** + *

Creates a new Column Family using a name and custom + * options.

+ * + * @param columnFamilyName name of column family. + * @param columnFamilyOptions options to be used with + * column family. + * @deprecated will be removed in RocksDB 3.10.0. Use + * {@link #ColumnFamilyDescriptor(byte[], ColumnFamilyOptions)} instead. + */ + @Deprecated + public ColumnFamilyDescriptor(final String columnFamilyName, + final ColumnFamilyOptions columnFamilyOptions) { + this(columnFamilyName.getBytes(), columnFamilyOptions); + } + + /** + *

Creates a new Column Family using a name and custom + * options.

+ * + * @param columnFamilyName name of column family. + * @param columnFamilyOptions options to be used with + * column family. + * @since 3.10.0 + */ + public ColumnFamilyDescriptor(final byte[] columnFamilyName, + final ColumnFamilyOptions columnFamilyOptions) { + columnFamilyName_ = columnFamilyName; + columnFamilyOptions_ = columnFamilyOptions; + } + + /** + * Retrieve name of column family. + * + * @return column family name. + * @since 3.10.0 + */ + public byte[] columnFamilyName() { + return columnFamilyName_; + } + + /** + * Retrieve assigned options instance. + * + * @return Options instance assigned to this instance. + */ + public ColumnFamilyOptions columnFamilyOptions() { + return columnFamilyOptions_; + } + + private final byte[] columnFamilyName_; + private final ColumnFamilyOptions columnFamilyOptions_; +} diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java b/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java new file mode 100644 index 000000000..835628702 --- /dev/null +++ b/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java @@ -0,0 +1,44 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * ColumnFamilyHandle class to hold handles to underlying rocksdb + * ColumnFamily Pointers. + */ +public class ColumnFamilyHandle extends RocksObject { + ColumnFamilyHandle(RocksDB rocksDB, long nativeHandle) { + super(); + nativeHandle_ = nativeHandle; + // rocksDB must point to a valid RocksDB instance; + assert(rocksDB != null); + // ColumnFamilyHandle must hold a reference to the related RocksDB instance + // to guarantee that while a GC cycle starts ColumnFamilyHandle instances + // are freed prior to RocksDB instances. + rocksDB_ = rocksDB; + } + + /** + *

Deletes underlying C++ column family handle pointer.

+ * + *

Note: the underlying handle can only be safely deleted if the RocksDB + * instance related to a certain ColumnFamilyHandle is still valid and initialized. + * Therefore {@code disposeInternal()} checks if the RocksDB is initialized + * before freeing the native handle.

+ */ + @Override protected void disposeInternal() { + synchronized (rocksDB_) { + assert (isInitialized()); + if (rocksDB_.isInitialized()) { + disposeInternal(nativeHandle_); + } + } + } + + private native void disposeInternal(long handle); + + private final RocksDB rocksDB_; +} diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java new file mode 100644 index 000000000..898a6cb45 --- /dev/null +++ b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java @@ -0,0 +1,709 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import java.util.Properties; + +/** + * ColumnFamilyOptions to control the behavior of a database. It will be used + * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()). + * + * If {@link #dispose()} function is not called, then it will be GC'd automatically + * and native resources will be released as part of the process. + */ +public class ColumnFamilyOptions extends RocksObject + implements ColumnFamilyOptionsInterface { + static { + RocksDB.loadLibrary(); + } + + /** + * Construct ColumnFamilyOptions. + * + * This constructor will create (by allocating a block of memory) + * an {@code rocksdb::DBOptions} in the c++ side. + */ + public ColumnFamilyOptions() { + super(); + newColumnFamilyOptions(); + } + + /** + *

Method to get a options instance by using pre-configured + * property values. If one or many values are undefined in + * the context of RocksDB the method will return a null + * value.

+ * + *

Note: Property keys can be derived from + * getter methods within the options class. Example: the method + * {@code writeBufferSize()} has a property key: + * {@code write_buffer_size}.

+ * + * @param properties {@link java.util.Properties} instance. + * + * @return {@link org.rocksdb.ColumnFamilyOptions instance} + * or null. + * + * @throws java.lang.IllegalArgumentException if null or empty + * {@link Properties} instance is passed to the method call. + */ + public static ColumnFamilyOptions getColumnFamilyOptionsFromProps( + Properties properties) { + if (properties == null || properties.size() == 0) { + throw new IllegalArgumentException( + "Properties value must contain at least one value."); + } + ColumnFamilyOptions columnFamilyOptions = null; + StringBuilder stringBuilder = new StringBuilder(); + for (final String name : properties.stringPropertyNames()){ + stringBuilder.append(name); + stringBuilder.append("="); + stringBuilder.append(properties.getProperty(name)); + stringBuilder.append(";"); + } + long handle = getColumnFamilyOptionsFromProps( + stringBuilder.toString()); + if (handle != 0){ + columnFamilyOptions = new ColumnFamilyOptions(handle); + } + return columnFamilyOptions; + } + + @Override + public ColumnFamilyOptions optimizeForPointLookup( + long blockCacheSizeMb) { + optimizeForPointLookup(nativeHandle_, + blockCacheSizeMb); + return this; + } + + @Override + public ColumnFamilyOptions optimizeLevelStyleCompaction() { + optimizeLevelStyleCompaction(nativeHandle_, + DEFAULT_COMPACTION_MEMTABLE_MEMORY_BUDGET); + return this; + } + + @Override + public ColumnFamilyOptions optimizeLevelStyleCompaction( + long memtableMemoryBudget) { + optimizeLevelStyleCompaction(nativeHandle_, + memtableMemoryBudget); + return this; + } + + @Override + public ColumnFamilyOptions optimizeUniversalStyleCompaction() { + optimizeUniversalStyleCompaction(nativeHandle_, + DEFAULT_COMPACTION_MEMTABLE_MEMORY_BUDGET); + return this; + } + + @Override + public ColumnFamilyOptions optimizeUniversalStyleCompaction( + long memtableMemoryBudget) { + optimizeUniversalStyleCompaction(nativeHandle_, + memtableMemoryBudget); + return this; + } + + @Override + 
public ColumnFamilyOptions setComparator(BuiltinComparator builtinComparator) { + assert(isInitialized()); + setComparatorHandle(nativeHandle_, builtinComparator.ordinal()); + return this; + } + + @Override + public ColumnFamilyOptions setComparator(AbstractComparator comparator) { + assert (isInitialized()); + setComparatorHandle(nativeHandle_, comparator.nativeHandle_); + comparator_ = comparator; + return this; + } + + @Override + public ColumnFamilyOptions setMergeOperatorName(String name) { + assert (isInitialized()); + if (name == null) { + throw new IllegalArgumentException( + "Merge operator name must not be null."); + } + setMergeOperatorName(nativeHandle_, name); + return this; + } + + @Override + public ColumnFamilyOptions setMergeOperator(MergeOperator mergeOperator) { + setMergeOperator(nativeHandle_, mergeOperator.newMergeOperatorHandle()); + return this; + } + + @Override + public ColumnFamilyOptions setWriteBufferSize(long writeBufferSize) + throws RocksDBException { + assert(isInitialized()); + setWriteBufferSize(nativeHandle_, writeBufferSize); + return this; + } + + @Override + public long writeBufferSize() { + assert(isInitialized()); + return writeBufferSize(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMaxWriteBufferNumber( + int maxWriteBufferNumber) { + assert(isInitialized()); + setMaxWriteBufferNumber(nativeHandle_, maxWriteBufferNumber); + return this; + } + + @Override + public int maxWriteBufferNumber() { + assert(isInitialized()); + return maxWriteBufferNumber(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMinWriteBufferNumberToMerge( + int minWriteBufferNumberToMerge) { + setMinWriteBufferNumberToMerge(nativeHandle_, minWriteBufferNumberToMerge); + return this; + } + + @Override + public int minWriteBufferNumberToMerge() { + return minWriteBufferNumberToMerge(nativeHandle_); + } + + @Override + public ColumnFamilyOptions useFixedLengthPrefixExtractor(int n) { + assert(isInitialized()); + 
useFixedLengthPrefixExtractor(nativeHandle_, n); + return this; + } + + @Override + public ColumnFamilyOptions setCompressionType(CompressionType compressionType) { + setCompressionType(nativeHandle_, compressionType.getValue()); + return this; + } + + @Override + public CompressionType compressionType() { + return CompressionType.values()[compressionType(nativeHandle_)]; + } + + @Override + public ColumnFamilyOptions setNumLevels(int numLevels) { + setNumLevels(nativeHandle_, numLevels); + return this; + } + + @Override + public int numLevels() { + return numLevels(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setLevelZeroFileNumCompactionTrigger( + int numFiles) { + setLevelZeroFileNumCompactionTrigger( + nativeHandle_, numFiles); + return this; + } + + @Override + public int levelZeroFileNumCompactionTrigger() { + return levelZeroFileNumCompactionTrigger(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setLevelZeroSlowdownWritesTrigger( + int numFiles) { + setLevelZeroSlowdownWritesTrigger(nativeHandle_, numFiles); + return this; + } + + @Override + public int levelZeroSlowdownWritesTrigger() { + return levelZeroSlowdownWritesTrigger(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setLevelZeroStopWritesTrigger(int numFiles) { + setLevelZeroStopWritesTrigger(nativeHandle_, numFiles); + return this; + } + + @Override + public int levelZeroStopWritesTrigger() { + return levelZeroStopWritesTrigger(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMaxMemCompactionLevel( + int maxMemCompactionLevel) { + setMaxMemCompactionLevel(nativeHandle_, maxMemCompactionLevel); + return this; + } + + @Override + public int maxMemCompactionLevel() { + return maxMemCompactionLevel(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setTargetFileSizeBase(long targetFileSizeBase) { + setTargetFileSizeBase(nativeHandle_, targetFileSizeBase); + return this; + } + + @Override + public long targetFileSizeBase() { + 
return targetFileSizeBase(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setTargetFileSizeMultiplier(int multiplier) { + setTargetFileSizeMultiplier(nativeHandle_, multiplier); + return this; + } + + @Override + public int targetFileSizeMultiplier() { + return targetFileSizeMultiplier(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMaxBytesForLevelBase( + long maxBytesForLevelBase) { + setMaxBytesForLevelBase(nativeHandle_, maxBytesForLevelBase); + return this; + } + + @Override + public long maxBytesForLevelBase() { + return maxBytesForLevelBase(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMaxBytesForLevelMultiplier(int multiplier) { + setMaxBytesForLevelMultiplier(nativeHandle_, multiplier); + return this; + } + + @Override + public int maxBytesForLevelMultiplier() { + return maxBytesForLevelMultiplier(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setExpandedCompactionFactor(int expandedCompactionFactor) { + setExpandedCompactionFactor(nativeHandle_, expandedCompactionFactor); + return this; + } + + @Override + public int expandedCompactionFactor() { + return expandedCompactionFactor(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setSourceCompactionFactor(int sourceCompactionFactor) { + setSourceCompactionFactor(nativeHandle_, sourceCompactionFactor); + return this; + } + + @Override + public int sourceCompactionFactor() { + return sourceCompactionFactor(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMaxGrandparentOverlapFactor( + int maxGrandparentOverlapFactor) { + setMaxGrandparentOverlapFactor(nativeHandle_, maxGrandparentOverlapFactor); + return this; + } + + @Override + public int maxGrandparentOverlapFactor() { + return maxGrandparentOverlapFactor(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setSoftRateLimit(double softRateLimit) { + setSoftRateLimit(nativeHandle_, softRateLimit); + return this; + } + + @Override + public double 
softRateLimit() { + return softRateLimit(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setHardRateLimit(double hardRateLimit) { + setHardRateLimit(nativeHandle_, hardRateLimit); + return this; + } + + @Override + public double hardRateLimit() { + return hardRateLimit(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setRateLimitDelayMaxMilliseconds( + int rateLimitDelayMaxMilliseconds) { + setRateLimitDelayMaxMilliseconds( + nativeHandle_, rateLimitDelayMaxMilliseconds); + return this; + } + + @Override + public int rateLimitDelayMaxMilliseconds() { + return rateLimitDelayMaxMilliseconds(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setArenaBlockSize(long arenaBlockSize) + throws RocksDBException { + setArenaBlockSize(nativeHandle_, arenaBlockSize); + return this; + } + + @Override + public long arenaBlockSize() { + return arenaBlockSize(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setDisableAutoCompactions(boolean disableAutoCompactions) { + setDisableAutoCompactions(nativeHandle_, disableAutoCompactions); + return this; + } + + @Override + public boolean disableAutoCompactions() { + return disableAutoCompactions(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setPurgeRedundantKvsWhileFlush( + boolean purgeRedundantKvsWhileFlush) { + setPurgeRedundantKvsWhileFlush( + nativeHandle_, purgeRedundantKvsWhileFlush); + return this; + } + + @Override + public boolean purgeRedundantKvsWhileFlush() { + return purgeRedundantKvsWhileFlush(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setCompactionStyle(CompactionStyle compactionStyle) { + setCompactionStyle(nativeHandle_, compactionStyle.getValue()); + return this; + } + + @Override + public CompactionStyle compactionStyle() { + return CompactionStyle.values()[compactionStyle(nativeHandle_)]; + } + + @Override + public ColumnFamilyOptions setVerifyChecksumsInCompaction( + boolean verifyChecksumsInCompaction) { + 
setVerifyChecksumsInCompaction( + nativeHandle_, verifyChecksumsInCompaction); + return this; + } + + @Override + public boolean verifyChecksumsInCompaction() { + return verifyChecksumsInCompaction(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setFilterDeletes(boolean filterDeletes) { + setFilterDeletes(nativeHandle_, filterDeletes); + return this; + } + + @Override + public boolean filterDeletes() { + return filterDeletes(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMaxSequentialSkipInIterations(long maxSequentialSkipInIterations) { + setMaxSequentialSkipInIterations(nativeHandle_, maxSequentialSkipInIterations); + return this; + } + + @Override + public long maxSequentialSkipInIterations() { + return maxSequentialSkipInIterations(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMemTableConfig(MemTableConfig config) + throws RocksDBException { + memTableConfig_ = config; + setMemTableFactory(nativeHandle_, config.newMemTableFactoryHandle()); + return this; + } + + @Override + public String memTableFactoryName() { + assert(isInitialized()); + return memTableFactoryName(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setTableFormatConfig(TableFormatConfig config) { + tableFormatConfig_ = config; + setTableFactory(nativeHandle_, config.newTableFactoryHandle()); + return this; + } + + @Override + public String tableFactoryName() { + assert(isInitialized()); + return tableFactoryName(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setInplaceUpdateSupport(boolean inplaceUpdateSupport) { + setInplaceUpdateSupport(nativeHandle_, inplaceUpdateSupport); + return this; + } + + @Override + public boolean inplaceUpdateSupport() { + return inplaceUpdateSupport(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setInplaceUpdateNumLocks(long inplaceUpdateNumLocks) + throws RocksDBException { + setInplaceUpdateNumLocks(nativeHandle_, inplaceUpdateNumLocks); + return this; + } + + 
@Override + public long inplaceUpdateNumLocks() { + return inplaceUpdateNumLocks(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMemtablePrefixBloomBits(int memtablePrefixBloomBits) { + setMemtablePrefixBloomBits(nativeHandle_, memtablePrefixBloomBits); + return this; + } + + @Override + public int memtablePrefixBloomBits() { + return memtablePrefixBloomBits(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMemtablePrefixBloomProbes(int memtablePrefixBloomProbes) { + setMemtablePrefixBloomProbes(nativeHandle_, memtablePrefixBloomProbes); + return this; + } + + @Override + public int memtablePrefixBloomProbes() { + return memtablePrefixBloomProbes(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setBloomLocality(int bloomLocality) { + setBloomLocality(nativeHandle_, bloomLocality); + return this; + } + + @Override + public int bloomLocality() { + return bloomLocality(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMaxSuccessiveMerges(long maxSuccessiveMerges) + throws RocksDBException { + setMaxSuccessiveMerges(nativeHandle_, maxSuccessiveMerges); + return this; + } + + @Override + public long maxSuccessiveMerges() { + return maxSuccessiveMerges(nativeHandle_); + } + + @Override + public ColumnFamilyOptions setMinPartialMergeOperands(int minPartialMergeOperands) { + setMinPartialMergeOperands(nativeHandle_, minPartialMergeOperands); + return this; + } + + @Override + public int minPartialMergeOperands() { + return minPartialMergeOperands(nativeHandle_); + } + + /** + * Release the memory allocated for the current instance + * in the c++ side. + */ + @Override protected void disposeInternal() { + assert(isInitialized()); + disposeInternal(nativeHandle_); + } + + /** + *

Private constructor to be used by + * {@link #getColumnFamilyOptionsFromProps(java.util.Properties)}

+ * + * @param handle native handle to ColumnFamilyOptions instance. + */ + private ColumnFamilyOptions(long handle) { + super(); + nativeHandle_ = handle; + } + + private static native long getColumnFamilyOptionsFromProps( + String optString); + + private native void newColumnFamilyOptions(); + private native void disposeInternal(long handle); + + private native void optimizeForPointLookup(long handle, + long blockCacheSizeMb); + private native void optimizeLevelStyleCompaction(long handle, + long memtableMemoryBudget); + private native void optimizeUniversalStyleCompaction(long handle, + long memtableMemoryBudget); + private native void setComparatorHandle(long handle, int builtinComparator); + private native void setComparatorHandle(long optHandle, long comparatorHandle); + private native void setMergeOperatorName( + long handle, String name); + private native void setMergeOperator( + long handle, long mergeOperatorHandle); + private native void setWriteBufferSize(long handle, long writeBufferSize) + throws RocksDBException; + private native long writeBufferSize(long handle); + private native void setMaxWriteBufferNumber( + long handle, int maxWriteBufferNumber); + private native int maxWriteBufferNumber(long handle); + private native void setMinWriteBufferNumberToMerge( + long handle, int minWriteBufferNumberToMerge); + private native int minWriteBufferNumberToMerge(long handle); + private native void setCompressionType(long handle, byte compressionType); + private native byte compressionType(long handle); + private native void useFixedLengthPrefixExtractor( + long handle, int prefixLength); + private native void setNumLevels( + long handle, int numLevels); + private native int numLevels(long handle); + private native void setLevelZeroFileNumCompactionTrigger( + long handle, int numFiles); + private native int levelZeroFileNumCompactionTrigger(long handle); + private native void setLevelZeroSlowdownWritesTrigger( + long handle, int numFiles); + private native 
int levelZeroSlowdownWritesTrigger(long handle); + private native void setLevelZeroStopWritesTrigger( + long handle, int numFiles); + private native int levelZeroStopWritesTrigger(long handle); + private native void setMaxMemCompactionLevel( + long handle, int maxMemCompactionLevel); + private native int maxMemCompactionLevel(long handle); + private native void setTargetFileSizeBase( + long handle, long targetFileSizeBase); + private native long targetFileSizeBase(long handle); + private native void setTargetFileSizeMultiplier( + long handle, int multiplier); + private native int targetFileSizeMultiplier(long handle); + private native void setMaxBytesForLevelBase( + long handle, long maxBytesForLevelBase); + private native long maxBytesForLevelBase(long handle); + private native void setMaxBytesForLevelMultiplier( + long handle, int multiplier); + private native int maxBytesForLevelMultiplier(long handle); + private native void setExpandedCompactionFactor( + long handle, int expandedCompactionFactor); + private native int expandedCompactionFactor(long handle); + private native void setSourceCompactionFactor( + long handle, int sourceCompactionFactor); + private native int sourceCompactionFactor(long handle); + private native void setMaxGrandparentOverlapFactor( + long handle, int maxGrandparentOverlapFactor); + private native int maxGrandparentOverlapFactor(long handle); + private native void setSoftRateLimit( + long handle, double softRateLimit); + private native double softRateLimit(long handle); + private native void setHardRateLimit( + long handle, double hardRateLimit); + private native double hardRateLimit(long handle); + private native void setRateLimitDelayMaxMilliseconds( + long handle, int rateLimitDelayMaxMilliseconds); + private native int rateLimitDelayMaxMilliseconds(long handle); + private native void setArenaBlockSize( + long handle, long arenaBlockSize) throws RocksDBException; + private native long arenaBlockSize(long handle); + private native 
void setDisableAutoCompactions( + long handle, boolean disableAutoCompactions); + private native boolean disableAutoCompactions(long handle); + private native void setCompactionStyle(long handle, byte compactionStyle); + private native byte compactionStyle(long handle); + private native void setPurgeRedundantKvsWhileFlush( + long handle, boolean purgeRedundantKvsWhileFlush); + private native boolean purgeRedundantKvsWhileFlush(long handle); + private native void setVerifyChecksumsInCompaction( + long handle, boolean verifyChecksumsInCompaction); + private native boolean verifyChecksumsInCompaction(long handle); + private native void setFilterDeletes( + long handle, boolean filterDeletes); + private native boolean filterDeletes(long handle); + private native void setMaxSequentialSkipInIterations( + long handle, long maxSequentialSkipInIterations); + private native long maxSequentialSkipInIterations(long handle); + private native void setMemTableFactory(long handle, long factoryHandle); + private native String memTableFactoryName(long handle); + private native void setTableFactory(long handle, long factoryHandle); + private native String tableFactoryName(long handle); + private native void setInplaceUpdateSupport( + long handle, boolean inplaceUpdateSupport); + private native boolean inplaceUpdateSupport(long handle); + private native void setInplaceUpdateNumLocks( + long handle, long inplaceUpdateNumLocks) throws RocksDBException; + private native long inplaceUpdateNumLocks(long handle); + private native void setMemtablePrefixBloomBits( + long handle, int memtablePrefixBloomBits); + private native int memtablePrefixBloomBits(long handle); + private native void setMemtablePrefixBloomProbes( + long handle, int memtablePrefixBloomProbes); + private native int memtablePrefixBloomProbes(long handle); + private native void setBloomLocality( + long handle, int bloomLocality); + private native int bloomLocality(long handle); + private native void setMaxSuccessiveMerges( + 
long handle, long maxSuccessiveMerges) throws RocksDBException; + private native long maxSuccessiveMerges(long handle); + private native void setMinPartialMergeOperands( + long handle, int minPartialMergeOperands); + private native int minPartialMergeOperands(long handle); + + MemTableConfig memTableConfig_; + TableFormatConfig tableFormatConfig_; + AbstractComparator comparator_; +} diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java new file mode 100644 index 000000000..c1be7f294 --- /dev/null +++ b/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java @@ -0,0 +1,987 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +public interface ColumnFamilyOptionsInterface { + + /** + * Use this if you don't need to keep the data sorted, i.e. you'll never use + * an iterator, only Put() and Get() API calls + * + * @param blockCacheSizeMb Block cache size in MB + * @return the instance of the current Object. + */ + Object optimizeForPointLookup(long blockCacheSizeMb); + + /** + *

Default values for some parameters in ColumnFamilyOptions are not + * optimized for heavy workloads and big datasets, which means you might + * observe write stalls under some conditions. As a starting point for tuning + * RocksDB options, use the following for level style compaction.

+ * + *

Make sure to also call IncreaseParallelism(), which will provide the + * biggest performance gains.

+ *

Note: we might use more memory than memtable_memory_budget during high + * write rate period

+ * + * @return the instance of the current Object. + */ + Object optimizeLevelStyleCompaction(); + + /** + *

Default values for some parameters in ColumnFamilyOptions are not + * optimized for heavy workloads and big datasets, which means you might + * observe write stalls under some conditions. As a starting point for tuning + * RocksDB options, use the following for level style compaction.

+ * + *

Make sure to also call IncreaseParallelism(), which will provide the + * biggest performance gains.

+ *

Note: we might use more memory than memtable_memory_budget during high + * write rate period

+ * + * @param memtableMemoryBudget memory budget in bytes + * @return the instance of the current Object. + */ + Object optimizeLevelStyleCompaction(long memtableMemoryBudget); + + /** + *

Default values for some parameters in ColumnFamilyOptions are not + * optimized for heavy workloads and big datasets, which means you might + * observe write stalls under some conditions. As a starting point for tuning + * RocksDB options, use the following for universal style compaction.

+ * + *

Universal style compaction is focused on reducing Write Amplification + * Factor for big data sets, but increases Space Amplification.

+ * + *

Make sure to also call IncreaseParallelism(), which will provide the + * biggest performance gains.

+ * + *

Note: we might use more memory than memtable_memory_budget during high + * write rate period

+ * + * @return the instance of the current Object. + */ + Object optimizeUniversalStyleCompaction(); + + /** + *

Default values for some parameters in ColumnFamilyOptions are not + * optimized for heavy workloads and big datasets, which means you might + * observe write stalls under some conditions. As a starting point for tuning + * RocksDB options, use the following for universal style compaction.

+ * + *

Universal style compaction is focused on reducing Write Amplification + * Factor for big data sets, but increases Space Amplification.

+ * + *

Make sure to also call IncreaseParallelism(), which will provide the + * biggest performance gains.

+ * + *

Note: we might use more memory than memtable_memory_budget during high + * write rate period

+ * + * @param memtableMemoryBudget memory budget in bytes + * @return the instance of the current Object. + */ + Object optimizeUniversalStyleCompaction(long memtableMemoryBudget); + + /** + * Set {@link BuiltinComparator} to be used with RocksDB. + * + * Note: Comparator can be set once upon database creation. + * + * Default: BytewiseComparator. + * @param builtinComparator a {@link BuiltinComparator} type. + * @return the instance of the current Object. + */ + Object setComparator(BuiltinComparator builtinComparator); + + /** + * Use the specified comparator for key ordering. + * + * Comparator should not be disposed before options instances using this comparator is + * disposed. If dispose() function is not called, then comparator object will be + * GC'd automatically. + * + * Comparator instance can be re-used in multiple options instances. + * + * @param comparator java instance. + * @return the instance of the current Object. + */ + Object setComparator(AbstractComparator comparator); + + /** + *

Set the merge operator to be used for merging two merge operands + * of the same key. The merge function is invoked during + * compaction and at lookup time, if multiple key/value pairs belonging + * to the same key are found in the database.

+ * + * @param name the name of the merge function, as defined by + * the MergeOperators factory (see utilities/MergeOperators.h) + * The merge function is specified by name and must be one of the + * standard merge operators provided by RocksDB. The available + * operators are "put", "uint64add", "stringappend" and "stringappendtest". + * @return the instance of the current Object. + */ + public Object setMergeOperatorName(String name); + + /** + *

Set the merge operator to be used for merging two different key/value + * pairs that share the same key. The merge function is invoked during + * compaction and at lookup time, if multiple key/value pairs belonging + * to the same key are found in the database.

+ * + * @param mergeOperator {@link MergeOperator} instance. + * @return the instance of the current Object. + */ + public Object setMergeOperator(MergeOperator mergeOperator); + + /** + * Amount of data to build up in memory (backed by an unsorted log + * on disk) before converting to a sorted on-disk file. + * + * Larger values increase performance, especially during bulk loads. + * Up to {@code max_write_buffer_number} write buffers may be held in memory + * at the same time, so you may wish to adjust this parameter + * to control memory usage. + * + * Also, a larger write buffer will result in a longer recovery time + * the next time the database is opened. + * + * Default: 4MB + * @param writeBufferSize the size of write buffer. + * @return the instance of the current Object. + * @throws org.rocksdb.RocksDBException thrown on 32-Bit platforms while + * overflowing the underlying platform specific value. + */ + Object setWriteBufferSize(long writeBufferSize) + throws RocksDBException; + + /** + * Return size of write buffer size. + * + * @return size of write buffer. + * @see #setWriteBufferSize(long) + */ + long writeBufferSize(); + + /** + * The maximum number of write buffers that are built up in memory. + * The default is 2, so that when 1 write buffer is being flushed to + * storage, new writes can continue to the other write buffer. + * Default: 2 + * + * @param maxWriteBufferNumber maximum number of write buffers. + * @return the instance of the current Object. + */ + Object setMaxWriteBufferNumber( + int maxWriteBufferNumber); + + /** + * Returns maximum number of write buffers. + * + * @return maximum number of write buffers. + * @see #setMaxWriteBufferNumber(int) + */ + int maxWriteBufferNumber(); + + /** + * The minimum number of write buffers that will be merged together + * before writing to storage. 
If set to 1, then + * all write buffers are flushed to L0 as individual files and this increases + * read amplification because a get request has to check in all of these + * files. Also, an in-memory merge may result in writing lesser + * data to storage if there are duplicate records in each of these + * individual write buffers. Default: 1 + * + * @param minWriteBufferNumberToMerge the minimum number of write buffers + * that will be merged together. + * @return the reference to the current option. + */ + Object setMinWriteBufferNumberToMerge( + int minWriteBufferNumberToMerge); + + /** + * The minimum number of write buffers that will be merged together + * before writing to storage. If set to 1, then + * all write buffers are flushed to L0 as individual files and this increases + * read amplification because a get request has to check in all of these + * files. Also, an in-memory merge may result in writing lesser + * data to storage if there are duplicate records in each of these + * individual write buffers. Default: 1 + * + * @return the minimum number of write buffers that will be merged together. + */ + int minWriteBufferNumberToMerge(); + + /** + * This prefix-extractor uses the first n bytes of a key as its prefix. + * + * In some hash-based memtable representation such as HashLinkedList + * and HashSkipList, prefixes are used to partition the keys into + * several buckets. Prefix extractor is used to specify how to + * extract the prefix given a key. + * + * @param n use the first n bytes of a key as its prefix. + * @return the reference to the current option. + */ + Object useFixedLengthPrefixExtractor(int n); + + /** + * Compress blocks using the specified compression algorithm. This + * parameter can be changed dynamically. + * + * Default: SNAPPY_COMPRESSION, which gives lightweight but fast compression. + * + * @param compressionType Compression Type. + * @return the reference to the current option. 
+ */ + Object setCompressionType(CompressionType compressionType); + + /** + * Compress blocks using the specified compression algorithm. This + * parameter can be changed dynamically. + * + * Default: SNAPPY_COMPRESSION, which gives lightweight but fast compression. + * + * @return Compression type. + */ + CompressionType compressionType(); + + /** + * Set the number of levels for this database + * If level-styled compaction is used, then this number determines + * the total number of levels. + * + * @param numLevels the number of levels. + * @return the reference to the current option. + */ + Object setNumLevels(int numLevels); + + /** + * If level-styled compaction is used, then this number determines + * the total number of levels. + * + * @return the number of levels. + */ + int numLevels(); + + /** + * Number of files to trigger level-0 compaction. A value < 0 means that + * level-0 compaction will not be triggered by number of files at all. + * Default: 4 + * + * @param numFiles the number of files in level-0 to trigger compaction. + * @return the reference to the current option. + */ + Object setLevelZeroFileNumCompactionTrigger( + int numFiles); + + /** + * The number of files in level 0 to trigger compaction from level-0 to + * level-1. A value < 0 means that level-0 compaction will not be + * triggered by number of files at all. + * Default: 4 + * + * @return the number of files in level 0 to trigger compaction. + */ + int levelZeroFileNumCompactionTrigger(); + + /** + * Soft limit on number of level-0 files. We start slowing down writes at this + * point. A value < 0 means that no writing slow down will be triggered by + * number of files in level-0. + * + * @param numFiles soft limit on number of level-0 files. + * @return the reference to the current option. + */ + Object setLevelZeroSlowdownWritesTrigger( + int numFiles); + + /** + * Soft limit on the number of level-0 files. We start slowing down writes + * at this point. 
A value < 0 means that no writing slow down will be + * triggered by number of files in level-0. + * + * @return the soft limit on the number of level-0 files. + */ + int levelZeroSlowdownWritesTrigger(); + + /** + * Maximum number of level-0 files. We stop writes at this point. + * + * @param numFiles the hard limit of the number of level-0 files. + * @return the reference to the current option. + */ + Object setLevelZeroStopWritesTrigger(int numFiles); + + /** + * Maximum number of level-0 files. We stop writes at this point. + * + * @return the hard limit of the number of level-0 file. + */ + int levelZeroStopWritesTrigger(); + + /** + * The highest level to which a new compacted memtable is pushed if it + * does not create overlap. We try to push to level 2 to avoid the + * relatively expensive level 0≥1 compactions and to avoid some + * expensive manifest file operations. We do not push all the way to + * the largest level since that can generate a lot of wasted disk + * space if the same key space is being repeatedly overwritten. + * + * @param maxMemCompactionLevel the highest level to which a new compacted + * mem-table will be pushed. + * @return the reference to the current option. + */ + Object setMaxMemCompactionLevel( + int maxMemCompactionLevel); + + /** + * The highest level to which a new compacted memtable is pushed if it + * does not create overlap. We try to push to level 2 to avoid the + * relatively expensive level 0≥1 compactions and to avoid some + * expensive manifest file operations. We do not push all the way to + * the largest level since that can generate a lot of wasted disk + * space if the same key space is being repeatedly overwritten. + * + * @return the highest level where a new compacted memtable will be pushed. + */ + int maxMemCompactionLevel(); + + /** + * The target file size for compaction. + * This targetFileSizeBase determines a level-1 file size. 
+ * Target file size for level L can be calculated by + * targetFileSizeBase * (targetFileSizeMultiplier ^ (L-1)) + * For example, if targetFileSizeBase is 2MB and + * target_file_size_multiplier is 10, then each file on level-1 will + * be 2MB, and each file on level 2 will be 20MB, + * and each file on level-3 will be 200MB. + * by default targetFileSizeBase is 2MB. + * + * @param targetFileSizeBase the target size of a level-0 file. + * @return the reference to the current option. + * + * @see #setTargetFileSizeMultiplier(int) + */ + Object setTargetFileSizeBase(long targetFileSizeBase); + + /** + * The target file size for compaction. + * This targetFileSizeBase determines a level-1 file size. + * Target file size for level L can be calculated by + * targetFileSizeBase * (targetFileSizeMultiplier ^ (L-1)) + * For example, if targetFileSizeBase is 2MB and + * target_file_size_multiplier is 10, then each file on level-1 will + * be 2MB, and each file on level 2 will be 20MB, + * and each file on level-3 will be 200MB. + * by default targetFileSizeBase is 2MB. + * + * @return the target size of a level-0 file. + * + * @see #targetFileSizeMultiplier() + */ + long targetFileSizeBase(); + + /** + * targetFileSizeMultiplier defines the size ratio between a + * level-L file and level-(L+1) file. + * By default target_file_size_multiplier is 1, meaning + * files in different levels have the same target. + * + * @param multiplier the size ratio between a level-(L+1) file + * and level-L file. + * @return the reference to the current option. + */ + Object setTargetFileSizeMultiplier(int multiplier); + + /** + * targetFileSizeMultiplier defines the size ratio between a + * level-(L+1) file and level-L file. + * By default targetFileSizeMultiplier is 1, meaning + * files in different levels have the same target. + * + * @return the size ratio between a level-(L+1) file and level-L file. 
+ */ + int targetFileSizeMultiplier(); + + /** + * The upper-bound of the total size of level-1 files in bytes. + * Maximum number of bytes for level L can be calculated as + * (maxBytesForLevelBase) * (maxBytesForLevelMultiplier ^ (L-1)) + * For example, if maxBytesForLevelBase is 20MB, and if + * max_bytes_for_level_multiplier is 10, total data size for level-1 + * will be 20MB, total file size for level-2 will be 200MB, + * and total file size for level-3 will be 2GB. + * by default 'maxBytesForLevelBase' is 10MB. + * + * @param maxBytesForLevelBase maximum bytes for level base. + * + * @return the reference to the current option. + * @see #setMaxBytesForLevelMultiplier(int) + */ + Object setMaxBytesForLevelBase( + long maxBytesForLevelBase); + + /** + * The upper-bound of the total size of level-1 files in bytes. + * Maximum number of bytes for level L can be calculated as + * (maxBytesForLevelBase) * (maxBytesForLevelMultiplier ^ (L-1)) + * For example, if maxBytesForLevelBase is 20MB, and if + * max_bytes_for_level_multiplier is 10, total data size for level-1 + * will be 20MB, total file size for level-2 will be 200MB, + * and total file size for level-3 will be 2GB. + * by default 'maxBytesForLevelBase' is 10MB. + * + * @return the upper-bound of the total size of leve-1 files in bytes. + * @see #maxBytesForLevelMultiplier() + */ + long maxBytesForLevelBase(); + + /** + * The ratio between the total size of level-(L+1) files and the total + * size of level-L files for all L. + * DEFAULT: 10 + * + * @param multiplier the ratio between the total size of level-(L+1) + * files and the total size of level-L files for all L. + * @return the reference to the current option. + * @see #setMaxBytesForLevelBase(long) + */ + Object setMaxBytesForLevelMultiplier(int multiplier); + + /** + * The ratio between the total size of level-(L+1) files and the total + * size of level-L files for all L. 
+ * DEFAULT: 10 + * + * @return the ratio between the total size of level-(L+1) files and + * the total size of level-L files for all L. + * @see #maxBytesForLevelBase() + */ + int maxBytesForLevelMultiplier(); + + /** + * Maximum number of bytes in all compacted files. We avoid expanding + * the lower level file set of a compaction if it would make the + * total compaction cover more than + * (expanded_compaction_factor * targetFileSizeLevel()) many bytes. + * + * @param expandedCompactionFactor the maximum number of bytes in all + * compacted files. + * @return the reference to the current option. + * @see #setSourceCompactionFactor(int) + */ + Object setExpandedCompactionFactor(int expandedCompactionFactor); + + /** + * Maximum number of bytes in all compacted files. We avoid expanding + * the lower level file set of a compaction if it would make the + * total compaction cover more than + * (expanded_compaction_factor * targetFileSizeLevel()) many bytes. + * + * @return the maximum number of bytes in all compacted files. + * @see #sourceCompactionFactor() + */ + int expandedCompactionFactor(); + + /** + * Maximum number of bytes in all source files to be compacted in a + * single compaction run. We avoid picking too many files in the + * source level so that we do not exceed the total source bytes + * for compaction to exceed + * (source_compaction_factor * targetFileSizeLevel()) many bytes. + * Default:1, i.e. pick maxfilesize amount of data as the source of + * a compaction. + * + * @param sourceCompactionFactor the maximum number of bytes in all + * source files to be compacted in a single compaction run. + * @return the reference to the current option. + * @see #setExpandedCompactionFactor(int) + */ + Object setSourceCompactionFactor(int sourceCompactionFactor); + + /** + * Maximum number of bytes in all source files to be compacted in a + * single compaction run. 
We avoid picking too many files in the + * source level so that we do not exceed the total source bytes + * for compaction to exceed + * (source_compaction_factor * targetFileSizeLevel()) many bytes. + * Default:1, i.e. pick maxfilesize amount of data as the source of + * a compaction. + * + * @return the maximum number of bytes in all source files to be compactedo. + * @see #expandedCompactionFactor() + */ + int sourceCompactionFactor(); + + /** + * Control maximum bytes of overlaps in grandparent (i.e., level+2) before we + * stop building a single file in a level->level+1 compaction. + * + * @param maxGrandparentOverlapFactor maximum bytes of overlaps in + * "grandparent" level. + * @return the reference to the current option. + */ + Object setMaxGrandparentOverlapFactor( + int maxGrandparentOverlapFactor); + + /** + * Control maximum bytes of overlaps in grandparent (i.e., level+2) before we + * stop building a single file in a level->level+1 compaction. + * + * @return maximum bytes of overlaps in "grandparent" level. + */ + int maxGrandparentOverlapFactor(); + + /** + * Puts are delayed 0-1 ms when any level has a compaction score that exceeds + * soft_rate_limit. This is ignored when == 0.0. + * CONSTRAINT: soft_rate_limit ≤ hard_rate_limit. If this constraint does not + * hold, RocksDB will set soft_rate_limit = hard_rate_limit + * Default: 0 (disabled) + * + * @param softRateLimit the soft-rate-limit of a compaction score + * for put delay. + * @return the reference to the current option. + */ + Object setSoftRateLimit(double softRateLimit); + + /** + * Puts are delayed 0-1 ms when any level has a compaction score that exceeds + * soft_rate_limit. This is ignored when == 0.0. + * CONSTRAINT: soft_rate_limit ≤ hard_rate_limit. If this constraint does not + * hold, RocksDB will set soft_rate_limit = hard_rate_limit + * Default: 0 (disabled) + * + * @return soft-rate-limit for put delay. 
+ */ + double softRateLimit(); + + /** + * Puts are delayed 1ms at a time when any level has a compaction score that + * exceeds hard_rate_limit. This is ignored when ≤ 1.0. + * Default: 0 (disabled) + * + * @param hardRateLimit the hard-rate-limit of a compaction score for put + * delay. + * @return the reference to the current option. + */ + Object setHardRateLimit(double hardRateLimit); + + /** + * Puts are delayed 1ms at a time when any level has a compaction score that + * exceeds hard_rate_limit. This is ignored when ≤ 1.0. + * Default: 0 (disabled) + * + * @return the hard-rate-limit of a compaction score for put delay. + */ + double hardRateLimit(); + + /** + * The maximum time interval a put will be stalled when hard_rate_limit + * is enforced. If 0, then there is no limit. + * Default: 1000 + * + * @param rateLimitDelayMaxMilliseconds the maximum time interval a put + * will be stalled. + * @return the reference to the current option. + */ + Object setRateLimitDelayMaxMilliseconds( + int rateLimitDelayMaxMilliseconds); + + /** + * The maximum time interval a put will be stalled when hard_rate_limit + * is enforced. If 0, then there is no limit. + * Default: 1000 + * + * @return the maximum time interval a put will be stalled when + * hard_rate_limit is enforced. + */ + int rateLimitDelayMaxMilliseconds(); + + /** + * The size of one block in arena memory allocation. + * If ≤ 0, a proper value is automatically calculated (usually 1/10 of + * writer_buffer_size). + * + * There are two additonal restriction of the The specified size: + * (1) size should be in the range of [4096, 2 << 30] and + * (2) be the multiple of the CPU word (which helps with the memory + * alignment). + * + * We'll automatically check and adjust the size number to make sure it + * conforms to the restrictions. + * Default: 0 + * + * @param arenaBlockSize the size of an arena block + * @return the reference to the current option. 
+ * @throws org.rocksdb.RocksDBException thrown on 32-Bit platforms while + * overflowing the underlying platform specific value. + */ + Object setArenaBlockSize(long arenaBlockSize) + throws RocksDBException; + + /** + * The size of one block in arena memory allocation. + * If ≤ 0, a proper value is automatically calculated (usually 1/10 of + * writer_buffer_size). + * + * There are two additonal restriction of the The specified size: + * (1) size should be in the range of [4096, 2 << 30] and + * (2) be the multiple of the CPU word (which helps with the memory + * alignment). + * + * We'll automatically check and adjust the size number to make sure it + * conforms to the restrictions. + * Default: 0 + * + * @return the size of an arena block + */ + long arenaBlockSize(); + + /** + * Disable automatic compactions. Manual compactions can still + * be issued on this column family + * + * @param disableAutoCompactions true if auto-compactions are disabled. + * @return the reference to the current option. + */ + Object setDisableAutoCompactions(boolean disableAutoCompactions); + + /** + * Disable automatic compactions. Manual compactions can still + * be issued on this column family + * + * @return true if auto-compactions are disabled. + */ + boolean disableAutoCompactions(); + + /** + * Purge duplicate/deleted keys when a memtable is flushed to storage. + * Default: true + * + * @param purgeRedundantKvsWhileFlush true if purging keys is disabled. + * @return the reference to the current option. + */ + Object setPurgeRedundantKvsWhileFlush( + boolean purgeRedundantKvsWhileFlush); + + /** + * Purge duplicate/deleted keys when a memtable is flushed to storage. + * Default: true + * + * @return true if purging keys is disabled. + */ + boolean purgeRedundantKvsWhileFlush(); + + /** + * Set compaction style for DB. + * + * Default: LEVEL. + * + * @param compactionStyle Compaction style. + * @return the reference to the current option. 
+ */ + Object setCompactionStyle(CompactionStyle compactionStyle); + + /** + * Compaction style for DB. + * + * @return Compaction style. + */ + CompactionStyle compactionStyle(); + + /** + * If true, compaction will verify checksum on every read that happens + * as part of compaction + * Default: true + * + * @param verifyChecksumsInCompaction true if compaction verifies + * checksum on every read. + * @return the reference to the current option. + */ + Object setVerifyChecksumsInCompaction( + boolean verifyChecksumsInCompaction); + + /** + * If true, compaction will verify checksum on every read that happens + * as part of compaction + * Default: true + * + * @return true if compaction verifies checksum on every read. + */ + boolean verifyChecksumsInCompaction(); + + /** + * Use KeyMayExist API to filter deletes when this is true. + * If KeyMayExist returns false, i.e. the key definitely does not exist, then + * the delete is a noop. KeyMayExist only incurs in-memory look up. + * This optimization avoids writing the delete to storage when appropriate. + * Default: false + * + * @param filterDeletes true if filter-deletes behavior is on. + * @return the reference to the current option. + */ + Object setFilterDeletes(boolean filterDeletes); + + /** + * Use KeyMayExist API to filter deletes when this is true. + * If KeyMayExist returns false, i.e. the key definitely does not exist, then + * the delete is a noop. KeyMayExist only incurs in-memory look up. + * This optimization avoids writing the delete to storage when appropriate. + * Default: false + * + * @return true if filter-deletes behavior is on. + */ + boolean filterDeletes(); + + /** + * An iteration->Next() sequentially skips over keys with the same + * user-key unless this option is set. This number specifies the number + * of keys (with the same userkey) that will be sequentially + * skipped before a reseek is issued. 
+ * Default: 8 + * + * @param maxSequentialSkipInIterations the number of keys could + * be skipped in a iteration. + * @return the reference to the current option. + */ + Object setMaxSequentialSkipInIterations(long maxSequentialSkipInIterations); + + /** + * An iteration->Next() sequentially skips over keys with the same + * user-key unless this option is set. This number specifies the number + * of keys (with the same userkey) that will be sequentially + * skipped before a reseek is issued. + * Default: 8 + * + * @return the number of keys could be skipped in a iteration. + */ + long maxSequentialSkipInIterations(); + + /** + * Set the config for mem-table. + * + * @param config the mem-table config. + * @return the instance of the current Object. + * @throws org.rocksdb.RocksDBException thrown on 32-Bit platforms while + * overflowing the underlying platform specific value. + */ + Object setMemTableConfig(MemTableConfig config) + throws RocksDBException; + + /** + * Returns the name of the current mem table representation. + * Memtable format can be set using setTableFormatConfig. + * + * @return the name of the currently-used memtable factory. + * @see #setTableFormatConfig(org.rocksdb.TableFormatConfig) + */ + String memTableFactoryName(); + + /** + * Set the config for table format. + * + * @param config the table format config. + * @return the reference of the current Options. + */ + Object setTableFormatConfig(TableFormatConfig config); + + /** + * @return the name of the currently used table factory. + */ + String tableFactoryName(); + + /** + * Allows thread-safe inplace updates. + * If inplace_callback function is not set, + * Put(key, new_value) will update inplace the existing_value iff + * * key exists in current memtable + * * new sizeof(new_value) ≤ sizeof(existing_value) + * * existing_value for that key is a put i.e. kTypeValue + * If inplace_callback function is set, check doc for inplace_callback. + * Default: false. 
+ * + * @param inplaceUpdateSupport true if thread-safe inplace updates + * are allowed. + * @return the reference to the current option. + */ + Object setInplaceUpdateSupport(boolean inplaceUpdateSupport); + + /** + * Allows thread-safe inplace updates. + * If inplace_callback function is not set, + * Put(key, new_value) will update inplace the existing_value iff + * * key exists in current memtable + * * new sizeof(new_value) ≤ sizeof(existing_value) + * * existing_value for that key is a put i.e. kTypeValue + * If inplace_callback function is set, check doc for inplace_callback. + * Default: false. + * + * @return true if thread-safe inplace updates are allowed. + */ + boolean inplaceUpdateSupport(); + + /** + * Number of locks used for inplace update + * Default: 10000, if inplace_update_support = true, else 0. + * + * @param inplaceUpdateNumLocks the number of locks used for + * inplace updates. + * @return the reference to the current option. + * @throws org.rocksdb.RocksDBException thrown on 32-Bit platforms while + * overflowing the underlying platform specific value. + */ + Object setInplaceUpdateNumLocks(long inplaceUpdateNumLocks) + throws RocksDBException; + + /** + * Number of locks used for inplace update + * Default: 10000, if inplace_update_support = true, else 0. + * + * @return the number of locks used for inplace update. + */ + long inplaceUpdateNumLocks(); + + /** + * Sets the number of bits used in the prefix bloom filter. + * + * This value will be used only when a prefix-extractor is specified. + * + * @param memtablePrefixBloomBits the number of bits used in the + * prefix bloom filter. + * @return the reference to the current option. + */ + Object setMemtablePrefixBloomBits(int memtablePrefixBloomBits); + + /** + * Returns the number of bits used in the prefix bloom filter. + * + * This value will be used only when a prefix-extractor is specified. + * + * @return the number of bloom-bits. 
+ * @see #useFixedLengthPrefixExtractor(int) + */ + int memtablePrefixBloomBits(); + + /** + * The number of hash probes per key used in the mem-table. + * + * @param memtablePrefixBloomProbes the number of hash probes per key. + * @return the reference to the current option. + */ + Object setMemtablePrefixBloomProbes(int memtablePrefixBloomProbes); + + /** + * The number of hash probes per key used in the mem-table. + * + * @return the number of hash probes per key. + */ + int memtablePrefixBloomProbes(); + + /** + * Control locality of bloom filter probes to improve cache miss rate. + * This option only applies to memtable prefix bloom and plaintable + * prefix bloom. It essentially limits the max number of cache lines each + * bloom filter check can touch. + * This optimization is turned off when set to 0. The number should never + * be greater than number of probes. This option can boost performance + * for in-memory workload but should use with care since it can cause + * higher false positive rate. + * Default: 0 + * + * @param bloomLocality the level of locality of bloom-filter probes. + * @return the reference to the current option. + */ + Object setBloomLocality(int bloomLocality); + + /** + * Control locality of bloom filter probes to improve cache miss rate. + * This option only applies to memtable prefix bloom and plaintable + * prefix bloom. It essentially limits the max number of cache lines each + * bloom filter check can touch. + * This optimization is turned off when set to 0. The number should never + * be greater than number of probes. This option can boost performance + * for in-memory workload but should use with care since it can cause + * higher false positive rate. + * Default: 0 + * + * @return the level of locality of bloom-filter probes. + * @see #setMemtablePrefixBloomProbes(int) + */ + int bloomLocality(); + + /** + * Maximum number of successive merge operations on a key in the memtable. 
+ * + * When a merge operation is added to the memtable and the maximum number of + * successive merges is reached, the value of the key will be calculated and + * inserted into the memtable instead of the merge operation. This will + * ensure that there are never more than max_successive_merges merge + * operations in the memtable. + * + * Default: 0 (disabled) + * + * @param maxSuccessiveMerges the maximum number of successive merges. + * @return the reference to the current option. + * @throws org.rocksdb.RocksDBException thrown on 32-Bit platforms while + * overflowing the underlying platform specific value. + */ + Object setMaxSuccessiveMerges(long maxSuccessiveMerges) + throws RocksDBException; + + /** + * Maximum number of successive merge operations on a key in the memtable. + * + * When a merge operation is added to the memtable and the maximum number of + * successive merges is reached, the value of the key will be calculated and + * inserted into the memtable instead of the merge operation. This will + * ensure that there are never more than max_successive_merges merge + * operations in the memtable. + * + * Default: 0 (disabled) + * + * @return the maximum number of successive merges. + */ + long maxSuccessiveMerges(); + + /** + * The number of partial merge operands to accumulate before partial + * merge will be performed. Partial merge will not be called + * if the list of values to merge is less than min_partial_merge_operands. + * + * If min_partial_merge_operands < 2, then it will be treated as 2. + * + * Default: 2 + * + * @param minPartialMergeOperands min partial merge operands + * @return the reference to the current option. + */ + Object setMinPartialMergeOperands(int minPartialMergeOperands); + + /** + * The number of partial merge operands to accumulate before partial + * merge will be performed. Partial merge will not be called + * if the list of values to merge is less than min_partial_merge_operands. 
+ * + * If min_partial_merge_operands < 2, then it will be treated as 2. + * + * Default: 2 + * + * @return min partial merge operands + */ + int minPartialMergeOperands(); + + /** + * Default memtable memory budget used with the following methods: + * + *
+ * <ol>
+ *   <li>{@link #optimizeLevelStyleCompaction()}</li>
+ *   <li>{@link #optimizeUniversalStyleCompaction()}</li>
+ * </ol>
+ */ + long DEFAULT_COMPACTION_MEMTABLE_MEMORY_BUDGET = 512 * 1024 * 1024; +} diff --git a/java/src/main/java/org/rocksdb/CompactionStyle.java b/java/src/main/java/org/rocksdb/CompactionStyle.java new file mode 100644 index 000000000..76064395c --- /dev/null +++ b/java/src/main/java/org/rocksdb/CompactionStyle.java @@ -0,0 +1,52 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Enum CompactionStyle + * + * RocksDB supports different styles of compaction. Available + * compaction styles can be chosen using this enumeration. + * + *
+ * <ol>
+ *   <li>LEVEL - Level based Compaction style</li>
+ *   <li>UNIVERSAL - Universal Compaction Style is a
+ * compaction style, targeting the use cases requiring lower write
+ * amplification, trading off read amplification and space
+ * amplification.</li>
+ *   <li>FIFO - FIFO compaction style is the simplest
+ * compaction strategy. It is suited for keeping event log data with
+ * very low overhead (query log for example). It periodically deletes
+ * the old data, so it's basically a TTL compaction style.</li>
+ * </ol>
+ * + * @see + * Universal Compaction + * @see + * FIFO Compaction + */ +public enum CompactionStyle { + LEVEL((byte) 0), + UNIVERSAL((byte) 1), + FIFO((byte) 2); + + private final byte value_; + + private CompactionStyle(byte value) { + value_ = value; + } + + /** + * Returns the byte value of the enumerations value + * + * @return byte representation + */ + public byte getValue() { + return value_; + } +} diff --git a/java/src/main/java/org/rocksdb/Comparator.java b/java/src/main/java/org/rocksdb/Comparator.java new file mode 100644 index 000000000..c8e050bca --- /dev/null +++ b/java/src/main/java/org/rocksdb/Comparator.java @@ -0,0 +1,24 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Base class for comparators which will receive + * byte[] based access via org.rocksdb.Slice in their + * compare method implementation. + * + * byte[] based slices perform better when small keys + * are involved. 
When using larger keys consider + * using @see org.rocksdb.DirectComparator + */ +public abstract class Comparator extends AbstractComparator { + public Comparator(final ComparatorOptions copt) { + super(); + createNewComparator0(copt.nativeHandle_); + } + + private native void createNewComparator0(final long comparatorOptionsHandle); +} diff --git a/java/src/main/java/org/rocksdb/ComparatorOptions.java b/java/src/main/java/org/rocksdb/ComparatorOptions.java new file mode 100644 index 000000000..f0ba520a3 --- /dev/null +++ b/java/src/main/java/org/rocksdb/ComparatorOptions.java @@ -0,0 +1,57 @@ +package org.rocksdb; + +/** + * This class controls the behaviour + * of Java implementations of + * AbstractComparator + * + * Note that dispose() must be called before a ComparatorOptions + * instance becomes out-of-scope to release the allocated memory in C++. + */ +public class ComparatorOptions extends RocksObject { + public ComparatorOptions() { + super(); + newComparatorOptions(); + } + + /** + * Use adaptive mutex, which spins in the user space before resorting + * to kernel. This could reduce context switch when the mutex is not + * heavily contended. However, if the mutex is hot, we could end up + * wasting spin time. + * Default: false + * + * @return true if adaptive mutex is used. + */ + public boolean useAdaptiveMutex() { + assert(isInitialized()); + return useAdaptiveMutex(nativeHandle_); + } + + /** + * Use adaptive mutex, which spins in the user space before resorting + * to kernel. This could reduce context switch when the mutex is not + * heavily contended. However, if the mutex is hot, we could end up + * wasting spin time. + * Default: false + * + * @param useAdaptiveMutex true if adaptive mutex is used. + * @return the reference to the current comparator options. 
+ */ + public ComparatorOptions setUseAdaptiveMutex(final boolean useAdaptiveMutex) { + assert (isInitialized()); + setUseAdaptiveMutex(nativeHandle_, useAdaptiveMutex); + return this; + } + + @Override protected void disposeInternal() { + assert(isInitialized()); + disposeInternal(nativeHandle_); + } + + private native void newComparatorOptions(); + private native boolean useAdaptiveMutex(final long handle); + private native void setUseAdaptiveMutex(final long handle, + final boolean useAdaptiveMutex); + private native void disposeInternal(long handle); +} diff --git a/java/src/main/java/org/rocksdb/CompressionType.java b/java/src/main/java/org/rocksdb/CompressionType.java new file mode 100644 index 000000000..9f75b55e6 --- /dev/null +++ b/java/src/main/java/org/rocksdb/CompressionType.java @@ -0,0 +1,74 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Enum CompressionType + * + *
+ * <p>DB contents are stored in a set of blocks, each of which holds a
+ * sequence of key,value pairs. Each block may be compressed before
+ * being stored in a file. The following enum describes which
+ * compression method (if any) is used to compress a block.</p>
+ */ +public enum CompressionType { + + NO_COMPRESSION((byte) 0, null), + SNAPPY_COMPRESSION((byte) 1, "snappy"), + ZLIB_COMPRESSION((byte) 2, "z"), + BZLIB2_COMPRESSION((byte) 3, "bzip2"), + LZ4_COMPRESSION((byte) 4, "lz4"), + LZ4HC_COMPRESSION((byte) 5, "lz4hc"); + + /** + *
+ * <p>Get the CompressionType enumeration value by
+ * passing the library name to this method.</p>
+ *
+ * <p>If the library cannot be found, the enumeration
+ * value {@code NO_COMPRESSION} will be returned.</p>
+ * + * @param libraryName compression library name. + * + * @return CompressionType instance. + */ + public static CompressionType getCompressionType(String libraryName) { + if (libraryName != null) { + for (CompressionType compressionType : CompressionType.values()) { + if (compressionType.getLibraryName() != null && + compressionType.getLibraryName().equals(libraryName)) { + return compressionType; + } + } + } + return CompressionType.NO_COMPRESSION; + } + + /** + *
+ * <p>Returns the byte value of the enumeration's value.</p>
+ * + * @return byte representation + */ + public byte getValue() { + return value_; + } + + /** + *
+ * <p>Returns the library name of the compression type
+ * identified by the enumeration value.</p>
+ * + * @return library name + */ + public String getLibraryName() { + return libraryName_; + } + + private CompressionType(byte value, final String libraryName) { + value_ = value; + libraryName_ = libraryName; + } + + private final byte value_; + private final String libraryName_; +} diff --git a/java/src/main/java/org/rocksdb/DBOptions.java b/java/src/main/java/org/rocksdb/DBOptions.java new file mode 100644 index 000000000..fb8f27bc4 --- /dev/null +++ b/java/src/main/java/org/rocksdb/DBOptions.java @@ -0,0 +1,653 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import java.util.Properties; + +/** + * DBOptions to control the behavior of a database. It will be used + * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()). + * + * If {@link #dispose()} function is not called, then it will be GC'd automatically + * and native resources will be released as part of the process. + */ +public class DBOptions extends RocksObject implements DBOptionsInterface { + static { + RocksDB.loadLibrary(); + } + + /** + * Construct DBOptions. + * + * This constructor will create (by allocating a block of memory) + * an {@code rocksdb::DBOptions} in the c++ side. + */ + public DBOptions() { + super(); + numShardBits_ = DEFAULT_NUM_SHARD_BITS; + newDBOptions(); + } + + /** + *
+ * <p>Method to get an options instance by using pre-configured
+ * property values. If one or many values are undefined in
+ * the context of RocksDB the method will return a null
+ * value.</p>
+ *
+ * <p>Note: Property keys can be derived from
+ * getter methods within the options class. Example: the method
+ * {@code allowMmapReads()} has a property key:
+ * {@code allow_mmap_reads}.</p>
+ * + * @param properties {@link java.util.Properties} instance. + * + * @return {@link org.rocksdb.DBOptions instance} + * or null. + * + * @throws java.lang.IllegalArgumentException if null or empty + * {@link java.util.Properties} instance is passed to the method call. + */ + public static DBOptions getDBOptionsFromProps( + Properties properties) { + if (properties == null || properties.size() == 0) { + throw new IllegalArgumentException( + "Properties value must contain at least one value."); + } + DBOptions dbOptions = null; + StringBuilder stringBuilder = new StringBuilder(); + for (final String name : properties.stringPropertyNames()){ + stringBuilder.append(name); + stringBuilder.append("="); + stringBuilder.append(properties.getProperty(name)); + stringBuilder.append(";"); + } + long handle = getDBOptionsFromProps( + stringBuilder.toString()); + if (handle != 0){ + dbOptions = new DBOptions(handle); + } + return dbOptions; + } + + @Override + public DBOptions setIncreaseParallelism(int totalThreads) { + assert (isInitialized()); + setIncreaseParallelism(nativeHandle_, totalThreads); + return this; + } + + @Override + public DBOptions setCreateIfMissing(boolean flag) { + assert(isInitialized()); + setCreateIfMissing(nativeHandle_, flag); + return this; + } + + @Override + public boolean createIfMissing() { + assert(isInitialized()); + return createIfMissing(nativeHandle_); + } + + @Override + public DBOptions setCreateMissingColumnFamilies(boolean flag) { + assert(isInitialized()); + setCreateMissingColumnFamilies(nativeHandle_, flag); + return this; + } + + @Override + public boolean createMissingColumnFamilies() { + assert(isInitialized()); + return createMissingColumnFamilies(nativeHandle_); + } + + @Override + public DBOptions setErrorIfExists(boolean errorIfExists) { + assert(isInitialized()); + setErrorIfExists(nativeHandle_, errorIfExists); + return this; + } + + @Override + public boolean errorIfExists() { + assert(isInitialized()); + return 
errorIfExists(nativeHandle_); + } + + @Override + public DBOptions setParanoidChecks(boolean paranoidChecks) { + assert(isInitialized()); + setParanoidChecks(nativeHandle_, paranoidChecks); + return this; + } + + @Override + public boolean paranoidChecks() { + assert(isInitialized()); + return paranoidChecks(nativeHandle_); + } + + @Override + public DBOptions setRateLimiterConfig(RateLimiterConfig config) { + assert(isInitialized()); + rateLimiterConfig_ = config; + setRateLimiter(nativeHandle_, config.newRateLimiterHandle()); + return this; + } + + @Override + public DBOptions setInfoLogLevel(InfoLogLevel infoLogLevel) { + assert(isInitialized()); + setInfoLogLevel(nativeHandle_, infoLogLevel.getValue()); + return this; + } + + @Override + public InfoLogLevel infoLogLevel() { + assert(isInitialized()); + return InfoLogLevel.getInfoLogLevel( + infoLogLevel(nativeHandle_)); + } + + @Override + public DBOptions setMaxOpenFiles(int maxOpenFiles) { + assert(isInitialized()); + setMaxOpenFiles(nativeHandle_, maxOpenFiles); + return this; + } + + @Override + public int maxOpenFiles() { + assert(isInitialized()); + return maxOpenFiles(nativeHandle_); + } + + @Override + public DBOptions setMaxTotalWalSize(long maxTotalWalSize) { + assert(isInitialized()); + setMaxTotalWalSize(nativeHandle_, maxTotalWalSize); + return this; + } + + @Override + public long maxTotalWalSize() { + assert(isInitialized()); + return maxTotalWalSize(nativeHandle_); + } + + @Override + public DBOptions createStatistics() { + assert(isInitialized()); + createStatistics(nativeHandle_); + return this; + } + + @Override + public Statistics statisticsPtr() { + assert(isInitialized()); + + long statsPtr = statisticsPtr(nativeHandle_); + if(statsPtr == 0) { + createStatistics(); + statsPtr = statisticsPtr(nativeHandle_); + } + + return new Statistics(statsPtr); + } + + @Override + public DBOptions setDisableDataSync(boolean disableDataSync) { + assert(isInitialized()); + 
setDisableDataSync(nativeHandle_, disableDataSync); + return this; + } + + @Override + public boolean disableDataSync() { + assert(isInitialized()); + return disableDataSync(nativeHandle_); + } + + @Override + public DBOptions setUseFsync(boolean useFsync) { + assert(isInitialized()); + setUseFsync(nativeHandle_, useFsync); + return this; + } + + @Override + public boolean useFsync() { + assert(isInitialized()); + return useFsync(nativeHandle_); + } + + @Override + public DBOptions setDbLogDir(String dbLogDir) { + assert(isInitialized()); + setDbLogDir(nativeHandle_, dbLogDir); + return this; + } + + @Override + public String dbLogDir() { + assert(isInitialized()); + return dbLogDir(nativeHandle_); + } + + @Override + public DBOptions setWalDir(String walDir) { + assert(isInitialized()); + setWalDir(nativeHandle_, walDir); + return this; + } + + @Override + public String walDir() { + assert(isInitialized()); + return walDir(nativeHandle_); + } + + @Override + public DBOptions setDeleteObsoleteFilesPeriodMicros(long micros) { + assert(isInitialized()); + setDeleteObsoleteFilesPeriodMicros(nativeHandle_, micros); + return this; + } + + @Override + public long deleteObsoleteFilesPeriodMicros() { + assert(isInitialized()); + return deleteObsoleteFilesPeriodMicros(nativeHandle_); + } + + @Override + public DBOptions setMaxBackgroundCompactions(int maxBackgroundCompactions) { + assert(isInitialized()); + setMaxBackgroundCompactions(nativeHandle_, maxBackgroundCompactions); + return this; + } + + @Override + public int maxBackgroundCompactions() { + assert(isInitialized()); + return maxBackgroundCompactions(nativeHandle_); + } + + @Override + public DBOptions setMaxBackgroundFlushes(int maxBackgroundFlushes) { + assert(isInitialized()); + setMaxBackgroundFlushes(nativeHandle_, maxBackgroundFlushes); + return this; + } + + @Override + public int maxBackgroundFlushes() { + assert(isInitialized()); + return maxBackgroundFlushes(nativeHandle_); + } + + @Override + public 
DBOptions setMaxLogFileSize(long maxLogFileSize) + throws RocksDBException { + assert(isInitialized()); + setMaxLogFileSize(nativeHandle_, maxLogFileSize); + return this; + } + + @Override + public long maxLogFileSize() { + assert(isInitialized()); + return maxLogFileSize(nativeHandle_); + } + + @Override + public DBOptions setLogFileTimeToRoll(long logFileTimeToRoll) + throws RocksDBException{ + assert(isInitialized()); + setLogFileTimeToRoll(nativeHandle_, logFileTimeToRoll); + return this; + } + + @Override + public long logFileTimeToRoll() { + assert(isInitialized()); + return logFileTimeToRoll(nativeHandle_); + } + + @Override + public DBOptions setKeepLogFileNum(long keepLogFileNum) + throws RocksDBException{ + assert(isInitialized()); + setKeepLogFileNum(nativeHandle_, keepLogFileNum); + return this; + } + + @Override + public long keepLogFileNum() { + assert(isInitialized()); + return keepLogFileNum(nativeHandle_); + } + + @Override + public DBOptions setMaxManifestFileSize(long maxManifestFileSize) { + assert(isInitialized()); + setMaxManifestFileSize(nativeHandle_, maxManifestFileSize); + return this; + } + + @Override + public long maxManifestFileSize() { + assert(isInitialized()); + return maxManifestFileSize(nativeHandle_); + } + + @Override + public DBOptions setTableCacheNumshardbits(int tableCacheNumshardbits) { + assert(isInitialized()); + setTableCacheNumshardbits(nativeHandle_, tableCacheNumshardbits); + return this; + } + + @Override + public int tableCacheNumshardbits() { + assert(isInitialized()); + return tableCacheNumshardbits(nativeHandle_); + } + + @Override + public DBOptions setTableCacheRemoveScanCountLimit(int limit) { + assert(isInitialized()); + setTableCacheRemoveScanCountLimit(nativeHandle_, limit); + return this; + } + + @Override + public int tableCacheRemoveScanCountLimit() { + assert(isInitialized()); + return tableCacheRemoveScanCountLimit(nativeHandle_); + } + + @Override + public DBOptions setWalTtlSeconds(long 
walTtlSeconds) { + assert(isInitialized()); + setWalTtlSeconds(nativeHandle_, walTtlSeconds); + return this; + } + + @Override + public long walTtlSeconds() { + assert(isInitialized()); + return walTtlSeconds(nativeHandle_); + } + + @Override + public DBOptions setWalSizeLimitMB(long sizeLimitMB) { + assert(isInitialized()); + setWalSizeLimitMB(nativeHandle_, sizeLimitMB); + return this; + } + + @Override + public long walSizeLimitMB() { + assert(isInitialized()); + return walSizeLimitMB(nativeHandle_); + } + + @Override + public DBOptions setManifestPreallocationSize(long size) + throws RocksDBException { + assert(isInitialized()); + setManifestPreallocationSize(nativeHandle_, size); + return this; + } + + @Override + public long manifestPreallocationSize() { + assert(isInitialized()); + return manifestPreallocationSize(nativeHandle_); + } + + @Override + public DBOptions setAllowOsBuffer(boolean allowOsBuffer) { + assert(isInitialized()); + setAllowOsBuffer(nativeHandle_, allowOsBuffer); + return this; + } + + @Override + public boolean allowOsBuffer() { + assert(isInitialized()); + return allowOsBuffer(nativeHandle_); + } + + @Override + public DBOptions setAllowMmapReads(boolean allowMmapReads) { + assert(isInitialized()); + setAllowMmapReads(nativeHandle_, allowMmapReads); + return this; + } + + @Override + public boolean allowMmapReads() { + assert(isInitialized()); + return allowMmapReads(nativeHandle_); + } + + @Override + public DBOptions setAllowMmapWrites(boolean allowMmapWrites) { + assert(isInitialized()); + setAllowMmapWrites(nativeHandle_, allowMmapWrites); + return this; + } + + @Override + public boolean allowMmapWrites() { + assert(isInitialized()); + return allowMmapWrites(nativeHandle_); + } + + @Override + public DBOptions setIsFdCloseOnExec(boolean isFdCloseOnExec) { + assert(isInitialized()); + setIsFdCloseOnExec(nativeHandle_, isFdCloseOnExec); + return this; + } + + @Override + public boolean isFdCloseOnExec() { + assert(isInitialized()); + 
return isFdCloseOnExec(nativeHandle_); + } + + @Override + @Deprecated + public DBOptions setSkipLogErrorOnRecovery(boolean skip) { + assert(isInitialized()); + setSkipLogErrorOnRecovery(nativeHandle_, skip); + return this; + } + + @Override + @Deprecated + public boolean skipLogErrorOnRecovery() { + assert(isInitialized()); + return skipLogErrorOnRecovery(nativeHandle_); + } + + @Override + public DBOptions setStatsDumpPeriodSec(int statsDumpPeriodSec) { + assert(isInitialized()); + setStatsDumpPeriodSec(nativeHandle_, statsDumpPeriodSec); + return this; + } + + @Override + public int statsDumpPeriodSec() { + assert(isInitialized()); + return statsDumpPeriodSec(nativeHandle_); + } + + @Override + public DBOptions setAdviseRandomOnOpen(boolean adviseRandomOnOpen) { + assert(isInitialized()); + setAdviseRandomOnOpen(nativeHandle_, adviseRandomOnOpen); + return this; + } + + @Override + public boolean adviseRandomOnOpen() { + return adviseRandomOnOpen(nativeHandle_); + } + + @Override + public DBOptions setUseAdaptiveMutex(boolean useAdaptiveMutex) { + assert(isInitialized()); + setUseAdaptiveMutex(nativeHandle_, useAdaptiveMutex); + return this; + } + + @Override + public boolean useAdaptiveMutex() { + assert(isInitialized()); + return useAdaptiveMutex(nativeHandle_); + } + + @Override + public DBOptions setBytesPerSync(long bytesPerSync) { + assert(isInitialized()); + setBytesPerSync(nativeHandle_, bytesPerSync); + return this; + } + + @Override + public long bytesPerSync() { + return bytesPerSync(nativeHandle_); + } + + /** + * Release the memory allocated for the current instance + * in the c++ side. + */ + @Override protected void disposeInternal() { + assert(isInitialized()); + disposeInternal(nativeHandle_); + } + + static final int DEFAULT_NUM_SHARD_BITS = -1; + + /** + *

Private constructor to be used by + * {@link #getDBOptionsFromProps(java.util.Properties)}

+ * + * @param handle native handle to DBOptions instance. + */ + private DBOptions(long handle) { + super(); + nativeHandle_ = handle; + } + + private static native long getDBOptionsFromProps( + String optString); + + private native void newDBOptions(); + private native void disposeInternal(long handle); + + private native void setIncreaseParallelism(long handle, int totalThreads); + private native void setCreateIfMissing(long handle, boolean flag); + private native boolean createIfMissing(long handle); + private native void setCreateMissingColumnFamilies( + long handle, boolean flag); + private native boolean createMissingColumnFamilies(long handle); + private native void setErrorIfExists(long handle, boolean errorIfExists); + private native boolean errorIfExists(long handle); + private native void setParanoidChecks( + long handle, boolean paranoidChecks); + private native boolean paranoidChecks(long handle); + private native void setRateLimiter(long handle, + long rateLimiterHandle); + private native void setInfoLogLevel(long handle, byte logLevel); + private native byte infoLogLevel(long handle); + private native void setMaxOpenFiles(long handle, int maxOpenFiles); + private native int maxOpenFiles(long handle); + private native void setMaxTotalWalSize(long handle, + long maxTotalWalSize); + private native long maxTotalWalSize(long handle); + private native void createStatistics(long optHandle); + private native long statisticsPtr(long optHandle); + private native void setDisableDataSync(long handle, boolean disableDataSync); + private native boolean disableDataSync(long handle); + private native boolean useFsync(long handle); + private native void setUseFsync(long handle, boolean useFsync); + private native void setDbLogDir(long handle, String dbLogDir); + private native String dbLogDir(long handle); + private native void setWalDir(long handle, String walDir); + private native String walDir(long handle); + private native void 
setDeleteObsoleteFilesPeriodMicros( + long handle, long micros); + private native long deleteObsoleteFilesPeriodMicros(long handle); + private native void setMaxBackgroundCompactions( + long handle, int maxBackgroundCompactions); + private native int maxBackgroundCompactions(long handle); + private native void setMaxBackgroundFlushes( + long handle, int maxBackgroundFlushes); + private native int maxBackgroundFlushes(long handle); + private native void setMaxLogFileSize(long handle, long maxLogFileSize) + throws RocksDBException; + private native long maxLogFileSize(long handle); + private native void setLogFileTimeToRoll( + long handle, long logFileTimeToRoll) throws RocksDBException; + private native long logFileTimeToRoll(long handle); + private native void setKeepLogFileNum(long handle, long keepLogFileNum) + throws RocksDBException; + private native long keepLogFileNum(long handle); + private native void setMaxManifestFileSize( + long handle, long maxManifestFileSize); + private native long maxManifestFileSize(long handle); + private native void setTableCacheNumshardbits( + long handle, int tableCacheNumshardbits); + private native int tableCacheNumshardbits(long handle); + private native void setTableCacheRemoveScanCountLimit( + long handle, int limit); + private native int tableCacheRemoveScanCountLimit(long handle); + private native void setWalTtlSeconds(long handle, long walTtlSeconds); + private native long walTtlSeconds(long handle); + private native void setWalSizeLimitMB(long handle, long sizeLimitMB); + private native long walSizeLimitMB(long handle); + private native void setManifestPreallocationSize( + long handle, long size) throws RocksDBException; + private native long manifestPreallocationSize(long handle); + private native void setAllowOsBuffer( + long handle, boolean allowOsBuffer); + private native boolean allowOsBuffer(long handle); + private native void setAllowMmapReads( + long handle, boolean allowMmapReads); + private native boolean 
allowMmapReads(long handle); + private native void setAllowMmapWrites( + long handle, boolean allowMmapWrites); + private native boolean allowMmapWrites(long handle); + private native void setIsFdCloseOnExec( + long handle, boolean isFdCloseOnExec); + private native boolean isFdCloseOnExec(long handle); + private native void setSkipLogErrorOnRecovery( + long handle, boolean skip); + private native boolean skipLogErrorOnRecovery(long handle); + private native void setStatsDumpPeriodSec( + long handle, int statsDumpPeriodSec); + private native int statsDumpPeriodSec(long handle); + private native void setAdviseRandomOnOpen( + long handle, boolean adviseRandomOnOpen); + private native boolean adviseRandomOnOpen(long handle); + private native void setUseAdaptiveMutex( + long handle, boolean useAdaptiveMutex); + private native boolean useAdaptiveMutex(long handle); + private native void setBytesPerSync( + long handle, long bytesPerSync); + private native long bytesPerSync(long handle); + + int numShardBits_; + RateLimiterConfig rateLimiterConfig_; +} diff --git a/java/src/main/java/org/rocksdb/DBOptionsInterface.java b/java/src/main/java/org/rocksdb/DBOptionsInterface.java new file mode 100644 index 000000000..38c0338e4 --- /dev/null +++ b/java/src/main/java/org/rocksdb/DBOptionsInterface.java @@ -0,0 +1,807 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +public interface DBOptionsInterface { + + /** + *

By default, RocksDB uses only one background thread for flush and + * compaction. Calling this function will set it up such that a total of + * `total_threads` is used.

+ * + *

You almost definitely want to call this function if your system is + * bottlenecked by RocksDB.

+ * + * @param totalThreads The total number of threads to be used by RocksDB. + * A good value is the number of cores. + * + * @return the instance of the current Options + */ + Object setIncreaseParallelism(int totalThreads); + + /** + * If this value is set to true, then the database will be created + * if it is missing during {@code RocksDB.open()}. + * Default: false + * + * @param flag a flag indicating whether to create a database the + * specified database in {@link RocksDB#open(org.rocksdb.Options, String)} operation + * is missing. + * @return the instance of the current Options + * @see RocksDB#open(org.rocksdb.Options, String) + */ + Object setCreateIfMissing(boolean flag); + + /** + * Return true if the create_if_missing flag is set to true. + * If true, the database will be created if it is missing. + * + * @return true if the createIfMissing option is set to true. + * @see #setCreateIfMissing(boolean) + */ + boolean createIfMissing(); + + /** + *

If true, missing column families will be automatically created

+ * + *

Default: false

+ * + * @param flag a flag indicating if missing column families shall be + * created automatically. + * @return true if missing column families shall be created automatically + * on open. + */ + Object setCreateMissingColumnFamilies(boolean flag); + + /** + * Return true if the create_missing_column_families flag is set + * to true. If true column families be created if missing. + * + * @return true if the createMissingColumnFamilies is set to + * true. + * @see #setCreateMissingColumnFamilies(boolean) + */ + boolean createMissingColumnFamilies(); + + /** + * If true, an error will be thrown during RocksDB.open() if the + * database already exists. + * Default: false + * + * @param errorIfExists if true, an exception will be thrown + * during {@code RocksDB.open()} if the database already exists. + * @return the reference to the current option. + * @see RocksDB#open(org.rocksdb.Options, String) + */ + Object setErrorIfExists(boolean errorIfExists); + + /** + * If true, an error will be thrown during RocksDB.open() if the + * database already exists. + * + * @return if true, an error is raised when the specified database + * already exists before open. + */ + boolean errorIfExists(); + + /** + * If true, the implementation will do aggressive checking of the + * data it is processing and will stop early if it detects any + * errors. This may have unforeseen ramifications: for example, a + * corruption of one DB entry may cause a large number of entries to + * become unreadable or for the entire DB to become unopenable. + * If any of the writes to the database fails (Put, Delete, Merge, Write), + * the database will switch to read-only mode and fail all other + * Write operations. + * Default: true + * + * @param paranoidChecks a flag to indicate whether paranoid-check + * is on. + * @return the reference to the current option. 
+ */ + Object setParanoidChecks(boolean paranoidChecks); + + /** + * If true, the implementation will do aggressive checking of the + * data it is processing and will stop early if it detects any + * errors. This may have unforeseen ramifications: for example, a + * corruption of one DB entry may cause a large number of entries to + * become unreadable or for the entire DB to become unopenable. + * If any of the writes to the database fails (Put, Delete, Merge, Write), + * the database will switch to read-only mode and fail all other + * Write operations. + * + * @return a boolean indicating whether paranoid-check is on. + */ + boolean paranoidChecks(); + + /** + * Use to control write rate of flush and compaction. Flush has higher + * priority than compaction. Rate limiting is disabled if nullptr. + * Default: nullptr + * + * @param config rate limiter config. + * @return the instance of the current Object. + */ + Object setRateLimiterConfig(RateLimiterConfig config); + + /** + *

Sets the RocksDB log level. Default level is INFO

+ * + * @param infoLogLevel log level to set. + * @return the instance of the current Object. + */ + Object setInfoLogLevel(InfoLogLevel infoLogLevel); + + /** + *

Returns currently set log level.

+ * @return {@link org.rocksdb.InfoLogLevel} instance. + */ + InfoLogLevel infoLogLevel(); + + /** + * Number of open files that can be used by the DB. You may need to + * increase this if your database has a large working set. Value -1 means + * files opened are always kept open. You can estimate number of files based + * on {@code target_file_size_base} and {@code target_file_size_multiplier} + * for level-based compaction. For universal-style compaction, you can usually + * set it to -1. + * Default: 5000 + * + * @param maxOpenFiles the maximum number of open files. + * @return the instance of the current Object. + */ + Object setMaxOpenFiles(int maxOpenFiles); + + /** + * Number of open files that can be used by the DB. You may need to + * increase this if your database has a large working set. Value -1 means + * files opened are always kept open. You can estimate number of files based + * on {@code target_file_size_base} and {@code target_file_size_multiplier} + * for level-based compaction. For universal-style compaction, you can usually + * set it to -1. + * + * @return the maximum number of open files. + */ + int maxOpenFiles(); + + /** + *

Once write-ahead logs exceed this size, we will start forcing the + * flush of column families whose memtables are backed by the oldest live + * WAL file (i.e. the ones that are causing all the space amplification). + *

+ *

If set to 0 (default), we will dynamically choose the WAL size limit to + * be [sum of all write_buffer_size * max_write_buffer_number] * 2

+ *

Default: 0

+ * + * @param maxTotalWalSize max total wal size. + * @return the instance of the current Object. + */ + Object setMaxTotalWalSize(long maxTotalWalSize); + + /** + *

Returns the max total wal size. Once write-ahead logs exceed this size, + * we will start forcing the flush of column families whose memtables are + * backed by the oldest live WAL file (i.e. the ones that are causing all + * the space amplification).

+ * + *

If set to 0 (default), we will dynamically choose the WAL size limit + * to be [sum of all write_buffer_size * max_write_buffer_number] * 2 + *

+ * + * @return max total wal size + */ + long maxTotalWalSize(); + + /** + *

Creates statistics object which collects metrics about database operations. + * Statistics objects should not be shared between DB instances as + * it does not use any locks to prevent concurrent updates.

+ * + * @return the instance of the current Object. + * @see RocksDB#open(org.rocksdb.Options, String) + */ + Object createStatistics(); + + /** + *

Returns statistics object. Calls {@link #createStatistics()} if + * C++ returns {@code nullptr} for statistics.

+ * + * @return the instance of the statistics object. + * @see #createStatistics() + */ + Statistics statisticsPtr(); + + /** + *

If true, then the contents of manifest and data files are + * not synced to stable storage. Their contents remain in the + * OS buffers till the OS decides to flush them.

+ * + *

This option is good for bulk-loading of data.

+ * + *

Once the bulk-loading is complete, please issue a sync to + * the OS to flush all dirty buffers to stable storage.

+ * + *

Default: false

+ * + * @param disableDataSync a boolean flag to specify whether to + * disable data sync. + * @return the instance of the current Object. + */ + Object setDisableDataSync(boolean disableDataSync); + + /** + * If true, then the contents of data files are not synced + * to stable storage. Their contents remain in the OS buffers till the + * OS decides to flush them. This option is good for bulk-loading + * of data. Once the bulk-loading is complete, please issue a + * sync to the OS to flush all dirty buffers to stable storage. + * + * @return if true, then data-sync is disabled. + */ + boolean disableDataSync(); + + /** + *

If true, then every store to stable storage will issue a fsync.

+ *

If false, then every store to stable storage will issue a fdatasync. + * This parameter should be set to true while storing data to + * filesystem like ext3 that can lose files after a reboot.

+ *

Default: false

+ * + * @param useFsync a boolean flag to specify whether to use fsync + * @return the instance of the current Object. + */ + Object setUseFsync(boolean useFsync); + + /** + *

If true, then every store to stable storage will issue a fsync.

+ *

If false, then every store to stable storage will issue a fdatasync. + * This parameter should be set to true while storing data to + * filesystem like ext3 that can lose files after a reboot.

+ * + * @return boolean value indicating if fsync is used. + */ + boolean useFsync(); + + /** + * This specifies the info LOG dir. + * If it is empty, the log files will be in the same dir as data. + * If it is non empty, the log files will be in the specified dir, + * and the db data dir's absolute path will be used as the log file + * name's prefix. + * + * @param dbLogDir the path to the info log directory + * @return the instance of the current Object. + */ + Object setDbLogDir(String dbLogDir); + + /** + * Returns the directory of info log. + * + * If it is empty, the log files will be in the same dir as data. + * If it is non empty, the log files will be in the specified dir, + * and the db data dir's absolute path will be used as the log file + * name's prefix. + * + * @return the path to the info log directory + */ + String dbLogDir(); + + /** + * This specifies the absolute dir path for write-ahead logs (WAL). + * If it is empty, the log files will be in the same dir as data, + * dbname is used as the data dir by default + * If it is non empty, the log files will be in kept the specified dir. + * When destroying the db, + * all log files in wal_dir and the dir itself is deleted + * + * @param walDir the path to the write-ahead-log directory. + * @return the instance of the current Object. + */ + Object setWalDir(String walDir); + + /** + * Returns the path to the write-ahead-logs (WAL) directory. + * + * If it is empty, the log files will be in the same dir as data, + * dbname is used as the data dir by default + * If it is non empty, the log files will be in kept the specified dir. + * When destroying the db, + * all log files in wal_dir and the dir itself is deleted + * + * @return the path to the write-ahead-logs (WAL) directory. + */ + String walDir(); + + /** + * The periodicity when obsolete files get deleted. The default + * value is 6 hours. 
The files that get out of scope by compaction + * process will still get automatically delete on every compaction, + * regardless of this setting + * + * @param micros the time interval in micros + * @return the instance of the current Object. + */ + Object setDeleteObsoleteFilesPeriodMicros(long micros); + + /** + * The periodicity when obsolete files get deleted. The default + * value is 6 hours. The files that get out of scope by compaction + * process will still get automatically delete on every compaction, + * regardless of this setting + * + * @return the time interval in micros when obsolete files will be deleted. + */ + long deleteObsoleteFilesPeriodMicros(); + + /** + * Specifies the maximum number of concurrent background compaction jobs, + * submitted to the default LOW priority thread pool. + * If you're increasing this, also consider increasing number of threads in + * LOW priority thread pool. For more information, see + * Default: 1 + * + * @param maxBackgroundCompactions the maximum number of background + * compaction jobs. + * @return the instance of the current Object. + * + * @see RocksEnv#setBackgroundThreads(int) + * @see RocksEnv#setBackgroundThreads(int, int) + * @see #maxBackgroundFlushes() + */ + Object setMaxBackgroundCompactions(int maxBackgroundCompactions); + + /** + * Returns the maximum number of concurrent background compaction jobs, + * submitted to the default LOW priority thread pool. + * When increasing this number, we may also want to consider increasing + * number of threads in LOW priority thread pool. + * Default: 1 + * + * @return the maximum number of concurrent background compaction jobs. + * @see RocksEnv#setBackgroundThreads(int) + * @see RocksEnv#setBackgroundThreads(int, int) + */ + int maxBackgroundCompactions(); + + /** + * Specifies the maximum number of concurrent background flush jobs. + * If you're increasing this, also consider increasing number of threads in + * HIGH priority thread pool. 
For more information, see + * Default: 1 + * + * @param maxBackgroundFlushes number of max concurrent flush jobs + * @return the instance of the current Object. + * + * @see RocksEnv#setBackgroundThreads(int) + * @see RocksEnv#setBackgroundThreads(int, int) + * @see #maxBackgroundCompactions() + */ + Object setMaxBackgroundFlushes(int maxBackgroundFlushes); + + /** + * Returns the maximum number of concurrent background flush jobs. + * If you're increasing this, also consider increasing number of threads in + * HIGH priority thread pool. For more information, see + * Default: 1 + * + * @return the maximum number of concurrent background flush jobs. + * @see RocksEnv#setBackgroundThreads(int) + * @see RocksEnv#setBackgroundThreads(int, int) + */ + int maxBackgroundFlushes(); + + /** + * Specifies the maximum size of a info log file. If the current log file + * is larger than `max_log_file_size`, a new info log file will + * be created. + * If 0, all logs will be written to one log file. + * + * @param maxLogFileSize the maximum size of a info log file. + * @return the instance of the current Object. + * @throws org.rocksdb.RocksDBException thrown on 32-Bit platforms while + * overflowing the underlying platform specific value. + */ + Object setMaxLogFileSize(long maxLogFileSize) + throws RocksDBException; + + /** + * Returns the maximum size of a info log file. If the current log file + * is larger than this size, a new info log file will be created. + * If 0, all logs will be written to one log file. + * + * @return the maximum size of the info log file. + */ + long maxLogFileSize(); + + /** + * Specifies the time interval for the info log file to roll (in seconds). + * If specified with non-zero value, log file will be rolled + * if it has been active longer than `log_file_time_to_roll`. + * Default: 0 (disabled) + * + * @param logFileTimeToRoll the time interval in seconds. + * @return the instance of the current Object. 
+ * @throws org.rocksdb.RocksDBException thrown on 32-Bit platforms while + * overflowing the underlying platform specific value. + */ + Object setLogFileTimeToRoll(long logFileTimeToRoll) + throws RocksDBException; + + /** + * Returns the time interval for the info log file to roll (in seconds). + * If specified with non-zero value, log file will be rolled + * if it has been active longer than `log_file_time_to_roll`. + * Default: 0 (disabled) + * + * @return the time interval in seconds. + */ + long logFileTimeToRoll(); + + /** + * Specifies the maximum number of info log files to be kept. + * Default: 1000 + * + * @param keepLogFileNum the maximum number of info log files to be kept. + * @return the instance of the current Object. + * @throws org.rocksdb.RocksDBException thrown on 32-Bit platforms while + * overflowing the underlying platform specific value. + */ + Object setKeepLogFileNum(long keepLogFileNum) + throws RocksDBException; + + /** + * Returns the maximum number of info log files to be kept. + * Default: 1000 + * + * @return the maximum number of info log files to be kept. + */ + long keepLogFileNum(); + + /** + * Manifest file is rolled over on reaching this limit. + * The older manifest file be deleted. + * The default value is MAX_INT so that roll-over does not take place. + * + * @param maxManifestFileSize the size limit of a manifest file. + * @return the instance of the current Object. + */ + Object setMaxManifestFileSize(long maxManifestFileSize); + + /** + * Manifest file is rolled over on reaching this limit. + * The older manifest file be deleted. + * The default value is MAX_INT so that roll-over does not take place. + * + * @return the size limit of a manifest file. + */ + long maxManifestFileSize(); + + /** + * Number of shards used for table cache. + * + * @param tableCacheNumshardbits the number of chards + * @return the instance of the current Object. 
+ */ + Object setTableCacheNumshardbits(int tableCacheNumshardbits); + + /** + * Number of shards used for table cache. + * + * @return the number of shards used for table cache. + */ + int tableCacheNumshardbits(); + + /** + * During data eviction of table's LRU cache, it would be inefficient + * to strictly follow LRU because this piece of memory will not really + * be released unless its refcount falls to zero. Instead, make two + * passes: the first pass will release items with refcount = 1, + * and if not enough space releases after scanning the number of + * elements specified by this parameter, we will remove items in LRU + * order. + * + * @param limit scan count limit + * @return the instance of the current Object. + */ + Object setTableCacheRemoveScanCountLimit(int limit); + + /** + * During data eviction of table's LRU cache, it would be inefficient + * to strictly follow LRU because this piece of memory will not really + * be released unless its refcount falls to zero. Instead, make two + * passes: the first pass will release items with refcount = 1, + * and if not enough space releases after scanning the number of + * elements specified by this parameter, we will remove items in LRU + * order. + * + * @return scan count limit + */ + int tableCacheRemoveScanCountLimit(); + + /** + * {@link #walTtlSeconds()} and {@link #walSizeLimitMB()} affect how archived logs + * will be deleted. + *
    + *
  1. If both set to 0, logs will be deleted asap and will not get into + * the archive.
  2. + *
  3. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, + * WAL files will be checked every 10 min and if total size is greater + * then WAL_size_limit_MB, they will be deleted starting with the + * earliest until size_limit is met. All empty files will be deleted.
  4. + *
  5. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then + * WAL files will be checked every WAL_ttl_seconds / 2 and those that + * are older than WAL_ttl_seconds will be deleted.
  6. + *
  7. If both are not 0, WAL files will be checked every 10 min and both + * checks will be performed with ttl being first.
  8. + *
+ * + * @param walTtlSeconds the ttl seconds + * @return the instance of the current Object. + * @see #setWalSizeLimitMB(long) + */ + Object setWalTtlSeconds(long walTtlSeconds); + + /** + * WalTtlSeconds() and walSizeLimitMB() affect how archived logs + * will be deleted. + *
    + *
  1. If both set to 0, logs will be deleted asap and will not get into + * the archive.
  2. + *
  3. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, + * WAL files will be checked every 10 min and if total size is greater + * than WAL_size_limit_MB, they will be deleted starting with the + * earliest until size_limit is met. All empty files will be deleted.
  4. + *
  5. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then + * WAL files will be checked every WAL_ttl_seconds / 2 and those that + * are older than WAL_ttl_seconds will be deleted.
  6. + *
  7. If both are not 0, WAL files will be checked every 10 min and both + * checks will be performed with ttl being first.
  8. + *
+ * + * @return the wal-ttl seconds + * @see #walSizeLimitMB() + */ + long walTtlSeconds(); + + /** + * WalTtlSeconds() and walSizeLimitMB() affect how archived logs + * will be deleted. + *
    + *
  1. If both set to 0, logs will be deleted asap and will not get into + * the archive.
  2. + *
  3. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, + * WAL files will be checked every 10 min and if total size is greater + * than WAL_size_limit_MB, they will be deleted starting with the + * earliest until size_limit is met. All empty files will be deleted.
  4. + *
  5. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then + * WAL files will be checked every WAL_ttl_seconds / 2 and those that + * are older than WAL_ttl_seconds will be deleted.
  6. + *
  7. If both are not 0, WAL files will be checked every 10 min and both + * checks will be performed with ttl being first.
  8. + *
+ * + * @param sizeLimitMB size limit in mega-bytes. + * @return the instance of the current Object. + * @see #setWalSizeLimitMB(long) + */ + Object setWalSizeLimitMB(long sizeLimitMB); + + /** + * {@link #walTtlSeconds()} and {@code #walSizeLimitMB()} affect how archived logs + * will be deleted. + *
    + *
  1. If both set to 0, logs will be deleted asap and will not get into + * the archive.
  2. + *
  3. If WAL_ttl_seconds is 0 and WAL_size_limit_MB is not 0, + * WAL files will be checked every 10 min and if total size is greater + * than WAL_size_limit_MB, they will be deleted starting with the + * earliest until size_limit is met. All empty files will be deleted.
  4. + *
  5. If WAL_ttl_seconds is not 0 and WAL_size_limit_MB is 0, then + * WAL files will be checked every WAL_ttl_seconds / 2 and those that + * are older than WAL_ttl_seconds will be deleted.
  6. + *
  7. If both are not 0, WAL files will be checked every 10 min and both + * checks will be performed with ttl being first.
  8. + *
+ * @return size limit in mega-bytes. + * @see #walSizeLimitMB() + */ + long walSizeLimitMB(); + + /** + * Number of bytes to preallocate (via fallocate) the manifest + * files. Default is 4mb, which is reasonable to reduce random IO + * as well as prevent overallocation for mounts that preallocate + * large amounts of data (such as xfs's allocsize option). + * + * @param size the size in byte + * @return the instance of the current Object. + * @throws org.rocksdb.RocksDBException thrown on 32-Bit platforms while + * overflowing the underlying platform specific value. + */ + Object setManifestPreallocationSize(long size) + throws RocksDBException; + + /** + * Number of bytes to preallocate (via fallocate) the manifest + * files. Default is 4mb, which is reasonable to reduce random IO + * as well as prevent overallocation for mounts that preallocate + * large amounts of data (such as xfs's allocsize option). + * + * @return size in bytes. + */ + long manifestPreallocationSize(); + + /** + * Data being read from file storage may be buffered in the OS + * Default: true + * + * @param allowOsBuffer if true, then OS buffering is allowed. + * @return the instance of the current Object. + */ + Object setAllowOsBuffer(boolean allowOsBuffer); + + /** + * Data being read from file storage may be buffered in the OS + * Default: true + * + * @return if true, then OS buffering is allowed. + */ + boolean allowOsBuffer(); + + /** + * Allow the OS to mmap file for reading sst tables. + * Default: false + * + * @param allowMmapReads true if mmap reads are allowed. + * @return the instance of the current Object. + */ + Object setAllowMmapReads(boolean allowMmapReads); + + /** + * Allow the OS to mmap file for reading sst tables. + * Default: false + * + * @return true if mmap reads are allowed. + */ + boolean allowMmapReads(); + + /** + * Allow the OS to mmap file for writing. Default: false + * + * @param allowMmapWrites true if mmap writes are allowd. 
+ * @return the instance of the current Object. + */ + Object setAllowMmapWrites(boolean allowMmapWrites); + + /** + * Allow the OS to mmap file for writing. Default: false + * + * @return true if mmap writes are allowed. + */ + boolean allowMmapWrites(); + + /** + * Disable child process inherit open files. Default: true + * + * @param isFdCloseOnExec true if child process inheriting open + * files is disabled. + * @return the instance of the current Object. + */ + Object setIsFdCloseOnExec(boolean isFdCloseOnExec); + + /** + * Disable child process inherit open files. Default: true + * + * @return true if child process inheriting open files is disabled. + */ + boolean isFdCloseOnExec(); + + /** + * Skip log corruption error on recovery (If client is ok with + * losing most recent changes) + * Default: false + * + * @param skip true if log corruption errors are skipped during recovery. + * @return the instance of the current Object. + * + * @deprecated will be removed in RocksDB 3.11.0. Not used anymore. + */ + @Deprecated + Object setSkipLogErrorOnRecovery(boolean skip); + + /** + * Skip log corruption error on recovery (If client is ok with + * losing most recent changes) + * Default: false + * + * @return true if log corruption errors are skipped during recovery. + * + * @deprecated will be removed in RocksDB 3.11.0. Not used anymore. + */ + @Deprecated + boolean skipLogErrorOnRecovery(); + + /** + * if not zero, dump rocksdb.stats to LOG every stats_dump_period_sec + * Default: 3600 (1 hour) + * + * @param statsDumpPeriodSec time interval in seconds. + * @return the instance of the current Object. + */ + Object setStatsDumpPeriodSec(int statsDumpPeriodSec); + + /** + * If not zero, dump rocksdb.stats to LOG every stats_dump_period_sec + * Default: 3600 (1 hour) + * + * @return time interval in seconds. 
+ */ + int statsDumpPeriodSec(); + + /** + * If set true, will hint the underlying file system that the file + * access pattern is random, when a sst file is opened. + * Default: true + * + * @param adviseRandomOnOpen true if hinting random access is on. + * @return the instance of the current Object. + */ + Object setAdviseRandomOnOpen(boolean adviseRandomOnOpen); + + /** + * If set true, will hint the underlying file system that the file + * access pattern is random, when a sst file is opened. + * Default: true + * + * @return true if hinting random access is on. + */ + boolean adviseRandomOnOpen(); + + /** + * Use adaptive mutex, which spins in the user space before resorting + * to kernel. This could reduce context switch when the mutex is not + * heavily contended. However, if the mutex is hot, we could end up + * wasting spin time. + * Default: false + * + * @param useAdaptiveMutex true if adaptive mutex is used. + * @return the instance of the current Object. + */ + Object setUseAdaptiveMutex(boolean useAdaptiveMutex); + + /** + * Use adaptive mutex, which spins in the user space before resorting + * to kernel. This could reduce context switch when the mutex is not + * heavily contended. However, if the mutex is hot, we could end up + * wasting spin time. + * Default: false + * + * @return true if adaptive mutex is used. + */ + boolean useAdaptiveMutex(); + + /** + * Allows OS to incrementally sync files to disk while they are being + * written, asynchronously, in the background. + * Issue one request for every bytes_per_sync written. 0 turns it off. + * Default: 0 + * + * @param bytesPerSync size in bytes + * @return the instance of the current Object. + */ + Object setBytesPerSync(long bytesPerSync); + + /** + * Allows OS to incrementally sync files to disk while they are being + * written, asynchronously, in the background. + * Issue one request for every bytes_per_sync written. 0 turns it off. 
+ * Default: 0 + * + * @return size in bytes + */ + long bytesPerSync(); +} diff --git a/java/src/main/java/org/rocksdb/DirectComparator.java b/java/src/main/java/org/rocksdb/DirectComparator.java new file mode 100644 index 000000000..47f4d7256 --- /dev/null +++ b/java/src/main/java/org/rocksdb/DirectComparator.java @@ -0,0 +1,24 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Base class for comparators which will receive + * ByteBuffer based access via org.rocksdb.DirectSlice + * in their compare method implementation. + * + * ByteBuffer based slices perform better when large keys + * are involved. When using smaller keys consider + * using @see org.rocksdb.Comparator + */ +public abstract class DirectComparator extends AbstractComparator { + public DirectComparator(final ComparatorOptions copt) { + super(); + createNewDirectComparator0(copt.nativeHandle_); + } + + private native void createNewDirectComparator0(final long comparatorOptionsHandle); +} diff --git a/java/src/main/java/org/rocksdb/DirectSlice.java b/java/src/main/java/org/rocksdb/DirectSlice.java new file mode 100644 index 000000000..765b01586 --- /dev/null +++ b/java/src/main/java/org/rocksdb/DirectSlice.java @@ -0,0 +1,118 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import java.nio.ByteBuffer; + +/** + * Base class for slices which will receive direct + * ByteBuffer based access to the underlying data. 
+ * + * ByteBuffer backed slices typically perform better with + * larger keys and values. When using smaller keys and + * values consider using @see org.rocksdb.Slice + */ +public class DirectSlice extends AbstractSlice { + //TODO(AR) only needed by WriteBatchWithIndexTest until JDK8 + public final static DirectSlice NONE = new DirectSlice(); + + /** + * Called from JNI to construct a new Java DirectSlice + * without an underlying C++ object set + * at creation time. + * + * Note: You should be aware that + * {@see org.rocksdb.RocksObject#disOwnNativeHandle()} is intentionally + * called from the default DirectSlice constructor, and that it is marked as + * package-private. This is so that developers cannot construct their own default + * DirectSlice objects (at present). As developers cannot construct their own + * DirectSlice objects through this, they are not creating underlying C++ + * DirectSlice objects, and so there is nothing to free (dispose) from Java. + */ + DirectSlice() { + super(); + disOwnNativeHandle(); + } + + /** + * Constructs a slice + * where the data is taken from + * a String. 
+ * + * @param str The string + */ + public DirectSlice(final String str) { + super(); + createNewSliceFromString(str); + } + + /** + * Constructs a slice where the data is + * read from the provided + * ByteBuffer up to a certain length + * + * @param data The buffer containing the data + * @param length The length of the data to use for the slice + */ + public DirectSlice(final ByteBuffer data, final int length) { + super(); + assert(data.isDirect()); + createNewDirectSlice0(data, length); + } + + /** + * Constructs a slice where the data is + * read from the provided + * ByteBuffer + * + * @param data The bugger containing the data + */ + public DirectSlice(final ByteBuffer data) { + super(); + assert(data.isDirect()); + createNewDirectSlice1(data); + } + + /** + * Retrieves the byte at a specific offset + * from the underlying data + * + * @param offset The (zero-based) offset of the byte to retrieve + * + * @return the requested byte + */ + public byte get(int offset) { + assert (isInitialized()); + return get0(nativeHandle_, offset); + } + + /** + * Clears the backing slice + */ + public void clear() { + assert (isInitialized()); + clear0(nativeHandle_); + } + + /** + * Drops the specified {@code n} + * number of bytes from the start + * of the backing slice + * + * @param n The number of bytes to drop + */ + public void removePrefix(final int n) { + assert (isInitialized()); + removePrefix0(nativeHandle_, n); + } + + private native void createNewDirectSlice0(ByteBuffer data, int length); + private native void createNewDirectSlice1(ByteBuffer data); + @Override protected final native ByteBuffer data0(long handle); + private native byte get0(long handle, int offset); + private native void clear0(long handle); + private native void removePrefix0(long handle, int length); +} diff --git a/java/src/main/java/org/rocksdb/EncodingType.java b/java/src/main/java/org/rocksdb/EncodingType.java new file mode 100644 index 000000000..d639542aa --- /dev/null +++ 
b/java/src/main/java/org/rocksdb/EncodingType.java @@ -0,0 +1,55 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * EncodingType + * + *

The value will determine how to encode keys + * when writing to a new SST file.

+ * + *

This value will be stored + * inside the SST file which will be used when reading from + * the file, which makes it possible for users to choose + * different encoding type when reopening a DB. Files with + * different encoding types can co-exist in the same DB and + * can be read.

+ */ +public enum EncodingType { + /** + * Always write full keys without any special encoding. + */ + kPlain((byte) 0), + /** + *

Find opportunity to write the same prefix once for multiple rows. + * In some cases, when a key follows a previous key with the same prefix, + * instead of writing out the full key, it just writes out the size of the + * shared prefix, as well as other bytes, to save some bytes.

+ * + *

When using this option, the user is required to use the same prefix + * extractor to make sure the same prefix will be extracted from the same key. + * The Name() value of the prefix extractor will be stored in the file. When + * reopening the file, the name of the options.prefix_extractor given will be + * bitwise compared to the prefix extractors stored in the file. An error + * will be returned if the two don't match.

+ */ + kPrefix((byte) 1); + + /** + * Returns the byte value of the enumerations value + * + * @return byte representation + */ + public byte getValue() { + return value_; + } + + private EncodingType(byte value) { + value_ = value; + } + + private final byte value_; +} diff --git a/java/org/rocksdb/Filter.java b/java/src/main/java/org/rocksdb/Filter.java similarity index 100% rename from java/org/rocksdb/Filter.java rename to java/src/main/java/org/rocksdb/Filter.java diff --git a/java/src/main/java/org/rocksdb/FlushOptions.java b/java/src/main/java/org/rocksdb/FlushOptions.java new file mode 100644 index 000000000..e481c7664 --- /dev/null +++ b/java/src/main/java/org/rocksdb/FlushOptions.java @@ -0,0 +1,51 @@ +package org.rocksdb; + +/** + * FlushOptions to be passed to flush operations of + * {@link org.rocksdb.RocksDB}. + */ +public class FlushOptions extends RocksObject { + + /** + * Construct a new instance of FlushOptions. + */ + public FlushOptions(){ + super(); + newFlushOptions(); + } + + /** + * Set if the flush operation shall block until it terminates. + * + * @param waitForFlush boolean value indicating if the flush + * operations waits for termination of the flush process. + * + * @return instance of current FlushOptions. + */ + public FlushOptions setWaitForFlush(boolean waitForFlush) { + assert(isInitialized()); + waitForFlush(nativeHandle_); + return this; + } + + /** + * Wait for flush to finished. + * + * @return boolean value indicating if the flush operation + * waits for termination of the flush process. 
+ */ + public boolean waitForFlush() { + assert(isInitialized()); + return waitForFlush(nativeHandle_); + } + + @Override protected void disposeInternal() { + disposeInternal(nativeHandle_); + } + + private native void newFlushOptions(); + private native void disposeInternal(long handle); + private native void setWaitForFlush(long handle, + boolean wait); + private native boolean waitForFlush(long handle); +} diff --git a/java/src/main/java/org/rocksdb/GenericRateLimiterConfig.java b/java/src/main/java/org/rocksdb/GenericRateLimiterConfig.java new file mode 100644 index 000000000..5023822a6 --- /dev/null +++ b/java/src/main/java/org/rocksdb/GenericRateLimiterConfig.java @@ -0,0 +1,66 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +package org.rocksdb; + +/** + * Config for rate limiter, which is used to control write rate of flush and + * compaction. + * + * @see RateLimiterConfig + */ +public class GenericRateLimiterConfig extends RateLimiterConfig { + private static final long DEFAULT_REFILL_PERIOD_MICROS = (100 * 1000); + private static final int DEFAULT_FAIRNESS = 10; + + /** + * GenericRateLimiterConfig constructor + * + * @param rateBytesPerSecond this is the only parameter you want to set + * most of the time. It controls the total write rate of compaction + * and flush in bytes per second. Currently, RocksDB does not enforce + * rate limit for anything other than flush and compaction, e.g. write to WAL. + * @param refillPeriodMicros this controls how often tokens are refilled. For example, + * when rate_bytes_per_sec is set to 10MB/s and refill_period_us is set to + * 100ms, then 1MB is refilled every 100ms internally. Larger value can lead to + * burstier writes while smaller value introduces more CPU overhead. 
+ * The default should work for most cases. + * @param fairness RateLimiter accepts high-pri requests and low-pri requests. + * A low-pri request is usually blocked in favor of hi-pri request. Currently, + * RocksDB assigns low-pri to request from compaction and high-pri to request + * from flush. Low-pri requests can get blocked if flush requests come in + * continuously. This fairness parameter grants low-pri requests permission by + * fairness chance even though high-pri requests exist to avoid starvation. + * You should be good by leaving it at default 10. + */ + public GenericRateLimiterConfig(long rateBytesPerSecond, + long refillPeriodMicros, int fairness) { + rateBytesPerSecond_ = rateBytesPerSecond; + refillPeriodMicros_ = refillPeriodMicros; + fairness_ = fairness; + } + + /** + * GenericRateLimiterConfig constructor + * + * @param rateBytesPerSecond this is the only parameter you want to set + * most of the time. It controls the total write rate of compaction + * and flush in bytes per second. Currently, RocksDB does not enforce + * rate limit for anything other than flush and compaction, e.g. write to WAL. 
+ */ + public GenericRateLimiterConfig(long rateBytesPerSecond) { + this(rateBytesPerSecond, DEFAULT_REFILL_PERIOD_MICROS, DEFAULT_FAIRNESS); + } + + @Override protected long newRateLimiterHandle() { + return newRateLimiterHandle(rateBytesPerSecond_, refillPeriodMicros_, + fairness_); + } + + private native long newRateLimiterHandle(long rateBytesPerSecond, + long refillPeriodMicros, int fairness); + private final long rateBytesPerSecond_; + private final long refillPeriodMicros_; + private final int fairness_; +} diff --git a/java/src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java b/java/src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java new file mode 100644 index 000000000..78a4e8661 --- /dev/null +++ b/java/src/main/java/org/rocksdb/HashLinkedListMemTableConfig.java @@ -0,0 +1,172 @@ +package org.rocksdb; + +/** + * The config for hash linked list memtable representation + * Such memtable contains a fix-sized array of buckets, where + * each bucket points to a sorted singly-linked + * list (or null if the bucket is empty). + * + * Note that since this mem-table representation relies on the + * key prefix, it is required to invoke one of the usePrefixExtractor + * functions to specify how to extract key prefix given a key. + * If proper prefix-extractor is not set, then RocksDB will + * use the default memtable representation (SkipList) instead + * and post a warning in the LOG. 
+ */ +public class HashLinkedListMemTableConfig extends MemTableConfig { + public static final long DEFAULT_BUCKET_COUNT = 50000; + public static final long DEFAULT_HUGE_PAGE_TLB_SIZE = 0; + public static final int DEFAULT_BUCKET_ENTRIES_LOG_THRES = 4096; + public static final boolean + DEFAULT_IF_LOG_BUCKET_DIST_WHEN_FLUSH = true; + public static final int DEFAUL_THRESHOLD_USE_SKIPLIST = 256; + + /** + * HashLinkedListMemTableConfig constructor + */ + public HashLinkedListMemTableConfig() { + bucketCount_ = DEFAULT_BUCKET_COUNT; + hugePageTlbSize_ = DEFAULT_HUGE_PAGE_TLB_SIZE; + bucketEntriesLoggingThreshold_ = DEFAULT_BUCKET_ENTRIES_LOG_THRES; + ifLogBucketDistWhenFlush_ = DEFAULT_IF_LOG_BUCKET_DIST_WHEN_FLUSH; + thresholdUseSkiplist_ = DEFAUL_THRESHOLD_USE_SKIPLIST; + } + + /** + * Set the number of buckets in the fixed-size array used + * in the hash linked-list mem-table. + * + * @param count the number of hash buckets. + * @return the reference to the current HashLinkedListMemTableConfig. + */ + public HashLinkedListMemTableConfig setBucketCount(long count) { + bucketCount_ = count; + return this; + } + + /** + * Returns the number of buckets that will be used in the memtable + * created based on this config. + * + * @return the number of buckets + */ + public long bucketCount() { + return bucketCount_; + } + + /** + *

Set the size of huge tlb or allocate the hashtable bytes from + * malloc if {@code size <= 0}.

+ * + *

The user needs to reserve huge pages for it to be allocated, + * like: {@code sysctl -w vm.nr_hugepages=20}

+ * + *

See linux documentation/vm/hugetlbpage.txt

+ * + * @param size if set to {@code <= 0} hashtable bytes from malloc + * @return the reference to the current HashLinkedListMemTableConfig. + */ + public HashLinkedListMemTableConfig setHugePageTlbSize(long size) { + hugePageTlbSize_ = size; + return this; + } + + /** + * Returns the size value of hugePageTlbSize. + * + * @return the hugePageTlbSize. + */ + public long hugePageTlbSize() { + return hugePageTlbSize_; + } + + /** + * If number of entries in one bucket exceeds that setting, log + * about it. + * + * @param threshold - number of entries in a single bucket before + * logging starts. + * @return the reference to the current HashLinkedListMemTableConfig. + */ + public HashLinkedListMemTableConfig + setBucketEntriesLoggingThreshold(int threshold) { + bucketEntriesLoggingThreshold_ = threshold; + return this; + } + + /** + * Returns the maximum number of entries in one bucket before + * logging starts. + * + * @return maximum number of entries in one bucket before logging + * starts. + */ + public int bucketEntriesLoggingThreshold() { + return bucketEntriesLoggingThreshold_; + } + + /** + * If true the distrubition of number of entries will be logged. + * + * @param logDistribution - boolean parameter indicating if number + * of entry distribution shall be logged. + * @return the reference to the current HashLinkedListMemTableConfig. + */ + public HashLinkedListMemTableConfig + setIfLogBucketDistWhenFlush(boolean logDistribution) { + ifLogBucketDistWhenFlush_ = logDistribution; + return this; + } + + /** + * Returns information about logging the distribution of + * number of entries on flush. + * + * @return if distrubtion of number of entries shall be logged. + */ + public boolean ifLogBucketDistWhenFlush() { + return ifLogBucketDistWhenFlush_; + } + + /** + * Set maximum number of entries in one bucket. Exceeding this val + * leads to a switch from LinkedList to SkipList. + * + * @param threshold maximum number of entries before SkipList is + * used. 
+ * @return the reference to the current HashLinkedListMemTableConfig. + */ + public HashLinkedListMemTableConfig + setThresholdUseSkiplist(int threshold) { + thresholdUseSkiplist_ = threshold; + return this; + } + + /** + * Returns entries per bucket threshold before LinkedList is + * replaced by SkipList usage for that bucket. + * + * @return entries per bucket threshold before SkipList is used. + */ + public int thresholdUseSkiplist() { + return thresholdUseSkiplist_; + } + + @Override protected long newMemTableFactoryHandle() + throws RocksDBException { + return newMemTableFactoryHandle(bucketCount_, hugePageTlbSize_, + bucketEntriesLoggingThreshold_, ifLogBucketDistWhenFlush_, + thresholdUseSkiplist_); + } + + private native long newMemTableFactoryHandle(long bucketCount, + long hugePageTlbSize, int bucketEntriesLoggingThreshold, + boolean ifLogBucketDistWhenFlush, int thresholdUseSkiplist) + throws RocksDBException; + + private long bucketCount_; + private long hugePageTlbSize_; + private int bucketEntriesLoggingThreshold_; + private boolean ifLogBucketDistWhenFlush_; + private int thresholdUseSkiplist_; +} diff --git a/java/org/rocksdb/HashSkipListMemTableConfig.java b/java/src/main/java/org/rocksdb/HashSkipListMemTableConfig.java similarity index 90% rename from java/org/rocksdb/HashSkipListMemTableConfig.java rename to java/src/main/java/org/rocksdb/HashSkipListMemTableConfig.java index 74fb0dba2..7dc598fc4 100644 --- a/java/org/rocksdb/HashSkipListMemTableConfig.java +++ b/java/src/main/java/org/rocksdb/HashSkipListMemTableConfig.java @@ -18,6 +18,9 @@ public class HashSkipListMemTableConfig extends MemTableConfig { public static final int DEFAULT_BRANCHING_FACTOR = 4; public static final int DEFAULT_HEIGHT = 4; + /** + * HashSkipListMemTableConfig constructor + */ public HashSkipListMemTableConfig() { bucketCount_ = DEFAULT_BUCKET_COUNT; branchingFactor_ = DEFAULT_BRANCHING_FACTOR; @@ -47,6 +50,8 @@ public class HashSkipListMemTableConfig extends 
MemTableConfig { /** * Set the height of the skip list. Default = 4. * + * @param height height to set. + * * @return the reference to the current HashSkipListMemTableConfig. */ public HashSkipListMemTableConfig setHeight(int height) { @@ -83,13 +88,15 @@ public class HashSkipListMemTableConfig extends MemTableConfig { return branchingFactor_; } - @Override protected long newMemTableFactoryHandle() { + @Override protected long newMemTableFactoryHandle() + throws RocksDBException { return newMemTableFactoryHandle( bucketCount_, height_, branchingFactor_); } private native long newMemTableFactoryHandle( - long bucketCount, int height, int branchingFactor); + long bucketCount, int height, int branchingFactor) + throws RocksDBException; private long bucketCount_; private int branchingFactor_; diff --git a/java/org/rocksdb/HistogramData.java b/java/src/main/java/org/rocksdb/HistogramData.java similarity index 100% rename from java/org/rocksdb/HistogramData.java rename to java/src/main/java/org/rocksdb/HistogramData.java diff --git a/java/org/rocksdb/HistogramType.java b/java/src/main/java/org/rocksdb/HistogramType.java similarity index 100% rename from java/org/rocksdb/HistogramType.java rename to java/src/main/java/org/rocksdb/HistogramType.java diff --git a/java/src/main/java/org/rocksdb/IndexType.java b/java/src/main/java/org/rocksdb/IndexType.java new file mode 100644 index 000000000..f3c104566 --- /dev/null +++ b/java/src/main/java/org/rocksdb/IndexType.java @@ -0,0 +1,37 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * IndexType used in conjunction with BlockBasedTable. 
+ */ +public enum IndexType { + /** + * A space efficient index block that is optimized for + * binary-search-based index. + */ + kBinarySearch((byte) 0), + /** + * The hash index, if enabled, will do the hash lookup when + * {@code Options.prefix_extractor} is provided. + */ + kHashSearch((byte) 1); + + /** + * Returns the byte value of the enumerations value + * + * @return byte representation + */ + public byte getValue() { + return value_; + } + + private IndexType(byte value) { + value_ = value; + } + + private final byte value_; +} diff --git a/java/src/main/java/org/rocksdb/InfoLogLevel.java b/java/src/main/java/org/rocksdb/InfoLogLevel.java new file mode 100644 index 000000000..e67063c68 --- /dev/null +++ b/java/src/main/java/org/rocksdb/InfoLogLevel.java @@ -0,0 +1,47 @@ +package org.rocksdb; + +/** + * RocksDB log levels. + */ +public enum InfoLogLevel { + DEBUG_LEVEL((byte)0), + INFO_LEVEL((byte)1), + WARN_LEVEL((byte)2), + ERROR_LEVEL((byte)3), + FATAL_LEVEL((byte)4), + NUM_INFO_LOG_LEVELS((byte)5); + + private final byte value_; + + private InfoLogLevel(byte value) { + value_ = value; + } + + /** + * Returns the byte value of the enumerations value + * + * @return byte representation + */ + public byte getValue() { + return value_; + } + + /** + * Get InfoLogLevel by byte value. + * + * @param value byte representation of InfoLogLevel. + * + * @return {@link org.rocksdb.InfoLogLevel} instance or null. + * @throws java.lang.IllegalArgumentException if an invalid + * value is provided. 
+ */ + public static InfoLogLevel getInfoLogLevel(byte value) { + for (InfoLogLevel infoLogLevel : InfoLogLevel.values()) { + if (infoLogLevel.getValue() == value){ + return infoLogLevel; + } + } + throw new IllegalArgumentException( + "Illegal value provided for InfoLogLevel."); + } +} diff --git a/java/org/rocksdb/MemTableConfig.java b/java/src/main/java/org/rocksdb/MemTableConfig.java similarity index 76% rename from java/org/rocksdb/MemTableConfig.java rename to java/src/main/java/org/rocksdb/MemTableConfig.java index a473c2585..853d29776 100644 --- a/java/org/rocksdb/MemTableConfig.java +++ b/java/src/main/java/org/rocksdb/MemTableConfig.java @@ -21,7 +21,13 @@ public abstract class MemTableConfig { * which will create a c++ shared-pointer to the c++ MemTableRepFactory * that associated with the Java MemTableConfig. * - * @see Options.setMemTableFactory() + * @see Options#setMemTableConfig(MemTableConfig) + * + * @return native handle address to native memory table instance. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. */ - abstract protected long newMemTableFactoryHandle(); + abstract protected long newMemTableFactoryHandle() + throws RocksDBException; } diff --git a/java/src/main/java/org/rocksdb/MergeOperator.java b/java/src/main/java/org/rocksdb/MergeOperator.java new file mode 100644 index 000000000..2655e466f --- /dev/null +++ b/java/src/main/java/org/rocksdb/MergeOperator.java @@ -0,0 +1,15 @@ +// Copyright (c) 2014, Vlad Balan (vlad.gm@gmail.com). All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * MergeOperator holds an operator to be applied when compacting + * two merge operands held under the same key in order to obtain a single + * value. 
+ */ +public interface MergeOperator { + public long newMergeOperatorHandle(); +} diff --git a/java/src/main/java/org/rocksdb/NativeLibraryLoader.java b/java/src/main/java/org/rocksdb/NativeLibraryLoader.java new file mode 100644 index 000000000..06ae773cb --- /dev/null +++ b/java/src/main/java/org/rocksdb/NativeLibraryLoader.java @@ -0,0 +1,110 @@ +package org.rocksdb; + +import java.io.*; +import java.nio.file.Files; +import java.nio.file.StandardCopyOption; + +import org.rocksdb.util.Environment; + +/** + * This class is used to load the RocksDB shared library from within the jar. + * The shared library is extracted to a temp folder and loaded from there. + */ +public class NativeLibraryLoader { + //singleton + private static final NativeLibraryLoader instance = new NativeLibraryLoader(); + private static boolean initialized = false; + + private static final String sharedLibraryName = Environment.getSharedLibraryName("rocksdb"); + private static final String jniLibraryName = Environment.getJniLibraryName("rocksdb"); + private static final String jniLibraryFileName = Environment.getJniLibraryFileName("rocksdb"); + private static final String tempFilePrefix = "librocksdbjni"; + private static final String tempFileSuffix = "." + Environment.getJniLibraryExtension(); + + /** + * Get a reference to the NativeLibraryLoader + * + * @return The NativeLibraryLoader + */ + public static NativeLibraryLoader getInstance() { + return instance; + } + + /** + * Firstly attempts to load the library from java.library.path, + * if that fails then it falls back to extracting + * the library from the classpath + * {@link org.rocksdb.NativeLibraryLoader#loadLibraryFromJar(java.lang.String)} + * + * @param tmpDir A temporary directory to use + * to copy the native library to when loading from the classpath. + * If null, or the empty string, we rely on Java's + * {@link java.io.File#createTempFile(String, String)} + * function to provide a temporary location. 
+ * The temporary file will be registered for deletion + * on exit. + * + * @throws java.io.IOException if a filesystem operation fails. + */ + public synchronized void loadLibrary(final String tmpDir) throws IOException { + try { + System.loadLibrary(sharedLibraryName); + } catch(final UnsatisfiedLinkError ule1) { + try { + System.loadLibrary(jniLibraryName); + } catch(final UnsatisfiedLinkError ule2) { + loadLibraryFromJar(tmpDir); + } + } + } + + /** + * Attempts to extract the native RocksDB library + * from the classpath and load it + * + * @param tmpDir A temporary directory to use + * to copy the native library to. If null, + * or the empty string, we rely on Java's + * {@link java.io.File#createTempFile(String, String)} + * function to provide a temporary location. + * The temporary file will be registered for deletion + * on exit. + * + * @throws java.io.IOException if a filesystem operation fails. + */ + private void loadLibraryFromJar(final String tmpDir) + throws IOException { + if (!initialized) { + final File temp; + if (tmpDir == null || tmpDir.equals("")) { + temp = File.createTempFile(tempFilePrefix, tempFileSuffix); + } else { + temp = new File(tmpDir, jniLibraryFileName); + } + + if (!temp.exists()) { + throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist."); + } else { + temp.deleteOnExit(); + } + + // attempt to copy the library from the Jar file to the temp destination + try (final InputStream is = getClass().getClassLoader(). 
+ getResourceAsStream(jniLibraryFileName)) { + if (is == null) { + throw new RuntimeException(jniLibraryFileName + " was not found inside JAR."); + } else { + Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); + } + } + + System.load(temp.getAbsolutePath()); + initialized = true; + } + } + /** + * Private constructor to disallow instantiation + */ + private NativeLibraryLoader() { + } +} diff --git a/java/src/main/java/org/rocksdb/Options.java b/java/src/main/java/org/rocksdb/Options.java new file mode 100644 index 000000000..56385154d --- /dev/null +++ b/java/src/main/java/org/rocksdb/Options.java @@ -0,0 +1,1257 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Options to control the behavior of a database. It will be used + * during the creation of a {@link org.rocksdb.RocksDB} (i.e., RocksDB.open()). + * + * If {@link #dispose()} function is not called, then it will be GC'd automatically + * and native resources will be released as part of the process. + */ +public class Options extends RocksObject + implements DBOptionsInterface, ColumnFamilyOptionsInterface { + static { + RocksDB.loadLibrary(); + } + /** + * Construct options for opening a RocksDB. + * + * This constructor will create (by allocating a block of memory) + * an {@code rocksdb::Options} in the c++ side. + */ + public Options() { + super(); + newOptions(); + env_ = RocksEnv.getDefault(); + } + + /** + * Construct options for opening a RocksDB. Reusing database options + * and column family options. 
+ * + * @param dbOptions {@link org.rocksdb.DBOptions} instance + * @param columnFamilyOptions {@link org.rocksdb.ColumnFamilyOptions} + * instance + */ + public Options(DBOptions dbOptions, ColumnFamilyOptions columnFamilyOptions) { + super(); + newOptions(dbOptions.nativeHandle_, columnFamilyOptions.nativeHandle_); + env_ = RocksEnv.getDefault(); + } + + @Override + public Options setIncreaseParallelism(int totalThreads) { + assert(isInitialized()); + setIncreaseParallelism(nativeHandle_, totalThreads); + return this; + } + + @Override + public Options setCreateIfMissing(boolean flag) { + assert(isInitialized()); + setCreateIfMissing(nativeHandle_, flag); + return this; + } + + @Override + public Options setCreateMissingColumnFamilies(boolean flag) { + assert(isInitialized()); + setCreateMissingColumnFamilies(nativeHandle_, flag); + return this; + } + + /** + * Use the specified object to interact with the environment, + * e.g. to read/write files, schedule background work, etc. + * Default: {@link RocksEnv#getDefault()} + * + * @param env {@link RocksEnv} instance. + * @return the instance of the current Options. + */ + public Options setEnv(RocksEnv env) { + assert(isInitialized()); + setEnv(nativeHandle_, env.nativeHandle_); + env_ = env; + return this; + } + + /** + * Returns the set RocksEnv instance. + * + * @return {@link RocksEnv} instance set in the Options. + */ + public RocksEnv getEnv() { + return env_; + } + + /** + *

Set appropriate parameters for bulk loading. + * The reason that this is a function that returns "this" instead of a + * constructor is to enable chaining of multiple similar calls in the future. + *

+ * + *

All data will be in level 0 without any automatic compaction. + * It's recommended to manually call CompactRange(NULL, NULL) before reading + * from the database, because otherwise the read can be very slow.

+ * + * @return the instance of the current Options. + */ + public Options prepareForBulkLoad() { + prepareForBulkLoad(nativeHandle_); + return this; + } + + @Override + public boolean createIfMissing() { + assert(isInitialized()); + return createIfMissing(nativeHandle_); + } + + @Override + public boolean createMissingColumnFamilies() { + assert(isInitialized()); + return createMissingColumnFamilies(nativeHandle_); + } + + @Override + public Options optimizeForPointLookup( + long blockCacheSizeMb) { + optimizeForPointLookup(nativeHandle_, + blockCacheSizeMb); + return this; + } + + @Override + public Options optimizeLevelStyleCompaction() { + optimizeLevelStyleCompaction(nativeHandle_, + DEFAULT_COMPACTION_MEMTABLE_MEMORY_BUDGET); + return this; + } + + @Override + public Options optimizeLevelStyleCompaction( + long memtableMemoryBudget) { + optimizeLevelStyleCompaction(nativeHandle_, + memtableMemoryBudget); + return this; + } + + @Override + public Options optimizeUniversalStyleCompaction() { + optimizeUniversalStyleCompaction(nativeHandle_, + DEFAULT_COMPACTION_MEMTABLE_MEMORY_BUDGET); + return this; + } + + @Override + public Options optimizeUniversalStyleCompaction( + long memtableMemoryBudget) { + optimizeUniversalStyleCompaction(nativeHandle_, + memtableMemoryBudget); + return this; + } + + @Override + public Options setComparator(BuiltinComparator builtinComparator) { + assert(isInitialized()); + setComparatorHandle(nativeHandle_, builtinComparator.ordinal()); + return this; + } + + @Override + public Options setComparator(AbstractComparator comparator) { + assert (isInitialized()); + setComparatorHandle(nativeHandle_, comparator.nativeHandle_); + comparator_ = comparator; + return this; + } + + @Override + public Options setMergeOperatorName(String name) { + assert (isInitialized()); + if (name == null) { + throw new IllegalArgumentException( + "Merge operator name must not be null."); + } + setMergeOperatorName(nativeHandle_, name); + return this; + } + 
+ @Override + public Options setMergeOperator(MergeOperator mergeOperator) { + setMergeOperator(nativeHandle_, mergeOperator.newMergeOperatorHandle()); + return this; + } + + @Override + public Options setWriteBufferSize(long writeBufferSize) + throws RocksDBException { + assert(isInitialized()); + setWriteBufferSize(nativeHandle_, writeBufferSize); + return this; + } + + @Override + public long writeBufferSize() { + assert(isInitialized()); + return writeBufferSize(nativeHandle_); + } + + @Override + public Options setMaxWriteBufferNumber(int maxWriteBufferNumber) { + assert(isInitialized()); + setMaxWriteBufferNumber(nativeHandle_, maxWriteBufferNumber); + return this; + } + + @Override + public int maxWriteBufferNumber() { + assert(isInitialized()); + return maxWriteBufferNumber(nativeHandle_); + } + + @Override + public boolean errorIfExists() { + assert(isInitialized()); + return errorIfExists(nativeHandle_); + } + + @Override + public Options setErrorIfExists(boolean errorIfExists) { + assert(isInitialized()); + setErrorIfExists(nativeHandle_, errorIfExists); + return this; + } + + @Override + public boolean paranoidChecks() { + assert(isInitialized()); + return paranoidChecks(nativeHandle_); + } + + @Override + public Options setParanoidChecks(boolean paranoidChecks) { + assert(isInitialized()); + setParanoidChecks(nativeHandle_, paranoidChecks); + return this; + } + + @Override + public int maxOpenFiles() { + assert(isInitialized()); + return maxOpenFiles(nativeHandle_); + } + + @Override + public Options setMaxTotalWalSize(long maxTotalWalSize) { + assert(isInitialized()); + setMaxTotalWalSize(nativeHandle_, maxTotalWalSize); + return this; + } + + @Override + public long maxTotalWalSize() { + assert(isInitialized()); + return maxTotalWalSize(nativeHandle_); + } + + @Override + public Options setMaxOpenFiles(int maxOpenFiles) { + assert(isInitialized()); + setMaxOpenFiles(nativeHandle_, maxOpenFiles); + return this; + } + + @Override + public boolean 
disableDataSync() { + assert(isInitialized()); + return disableDataSync(nativeHandle_); + } + + @Override + public Options setDisableDataSync(boolean disableDataSync) { + assert(isInitialized()); + setDisableDataSync(nativeHandle_, disableDataSync); + return this; + } + + @Override + public boolean useFsync() { + assert(isInitialized()); + return useFsync(nativeHandle_); + } + + @Override + public Options setUseFsync(boolean useFsync) { + assert(isInitialized()); + setUseFsync(nativeHandle_, useFsync); + return this; + } + + @Override + public String dbLogDir() { + assert(isInitialized()); + return dbLogDir(nativeHandle_); + } + + @Override + public Options setDbLogDir(String dbLogDir) { + assert(isInitialized()); + setDbLogDir(nativeHandle_, dbLogDir); + return this; + } + + @Override + public String walDir() { + assert(isInitialized()); + return walDir(nativeHandle_); + } + + @Override + public Options setWalDir(String walDir) { + assert(isInitialized()); + setWalDir(nativeHandle_, walDir); + return this; + } + + @Override + public long deleteObsoleteFilesPeriodMicros() { + assert(isInitialized()); + return deleteObsoleteFilesPeriodMicros(nativeHandle_); + } + + @Override + public Options setDeleteObsoleteFilesPeriodMicros(long micros) { + assert(isInitialized()); + setDeleteObsoleteFilesPeriodMicros(nativeHandle_, micros); + return this; + } + + @Override + public int maxBackgroundCompactions() { + assert(isInitialized()); + return maxBackgroundCompactions(nativeHandle_); + } + + @Override + public Options createStatistics() { + assert(isInitialized()); + createStatistics(nativeHandle_); + return this; + } + + @Override + public Statistics statisticsPtr() { + assert(isInitialized()); + + long statsPtr = statisticsPtr(nativeHandle_); + if(statsPtr == 0) { + createStatistics(); + statsPtr = statisticsPtr(nativeHandle_); + } + + return new Statistics(statsPtr); + } + + @Override + public Options setMaxBackgroundCompactions(int maxBackgroundCompactions) { + 
assert(isInitialized()); + setMaxBackgroundCompactions(nativeHandle_, maxBackgroundCompactions); + return this; + } + + @Override + public int maxBackgroundFlushes() { + assert(isInitialized()); + return maxBackgroundFlushes(nativeHandle_); + } + + @Override + public Options setMaxBackgroundFlushes(int maxBackgroundFlushes) { + assert(isInitialized()); + setMaxBackgroundFlushes(nativeHandle_, maxBackgroundFlushes); + return this; + } + + @Override + public long maxLogFileSize() { + assert(isInitialized()); + return maxLogFileSize(nativeHandle_); + } + + @Override + public Options setMaxLogFileSize(long maxLogFileSize) + throws RocksDBException { + assert(isInitialized()); + setMaxLogFileSize(nativeHandle_, maxLogFileSize); + return this; + } + + @Override + public long logFileTimeToRoll() { + assert(isInitialized()); + return logFileTimeToRoll(nativeHandle_); + } + + @Override + public Options setLogFileTimeToRoll(long logFileTimeToRoll) + throws RocksDBException{ + assert(isInitialized()); + setLogFileTimeToRoll(nativeHandle_, logFileTimeToRoll); + return this; + } + + @Override + public long keepLogFileNum() { + assert(isInitialized()); + return keepLogFileNum(nativeHandle_); + } + + @Override + public Options setKeepLogFileNum(long keepLogFileNum) + throws RocksDBException{ + assert(isInitialized()); + setKeepLogFileNum(nativeHandle_, keepLogFileNum); + return this; + } + + @Override + public long maxManifestFileSize() { + assert(isInitialized()); + return maxManifestFileSize(nativeHandle_); + } + + @Override + public Options setMaxManifestFileSize(long maxManifestFileSize) { + assert(isInitialized()); + setMaxManifestFileSize(nativeHandle_, maxManifestFileSize); + return this; + } + + @Override + public int tableCacheNumshardbits() { + assert(isInitialized()); + return tableCacheNumshardbits(nativeHandle_); + } + + @Override + public Options setTableCacheNumshardbits(int tableCacheNumshardbits) { + assert(isInitialized()); + 
setTableCacheNumshardbits(nativeHandle_, tableCacheNumshardbits); + return this; + } + + @Override + public int tableCacheRemoveScanCountLimit() { + assert(isInitialized()); + return tableCacheRemoveScanCountLimit(nativeHandle_); + } + + @Override + public Options setTableCacheRemoveScanCountLimit(int limit) { + assert(isInitialized()); + setTableCacheRemoveScanCountLimit(nativeHandle_, limit); + return this; + } + + @Override + public long walTtlSeconds() { + assert(isInitialized()); + return walTtlSeconds(nativeHandle_); + } + + @Override + public Options setWalTtlSeconds(long walTtlSeconds) { + assert(isInitialized()); + setWalTtlSeconds(nativeHandle_, walTtlSeconds); + return this; + } + + @Override + public long walSizeLimitMB() { + assert(isInitialized()); + return walSizeLimitMB(nativeHandle_); + } + + @Override + public Options setWalSizeLimitMB(long sizeLimitMB) { + assert(isInitialized()); + setWalSizeLimitMB(nativeHandle_, sizeLimitMB); + return this; + } + + @Override + public long manifestPreallocationSize() { + assert(isInitialized()); + return manifestPreallocationSize(nativeHandle_); + } + + @Override + public Options setManifestPreallocationSize(long size) + throws RocksDBException { + assert(isInitialized()); + setManifestPreallocationSize(nativeHandle_, size); + return this; + } + + @Override + public boolean allowOsBuffer() { + assert(isInitialized()); + return allowOsBuffer(nativeHandle_); + } + + @Override + public Options setAllowOsBuffer(boolean allowOsBuffer) { + assert(isInitialized()); + setAllowOsBuffer(nativeHandle_, allowOsBuffer); + return this; + } + + @Override + public boolean allowMmapReads() { + assert(isInitialized()); + return allowMmapReads(nativeHandle_); + } + + @Override + public Options setAllowMmapReads(boolean allowMmapReads) { + assert(isInitialized()); + setAllowMmapReads(nativeHandle_, allowMmapReads); + return this; + } + + @Override + public boolean allowMmapWrites() { + assert(isInitialized()); + return 
allowMmapWrites(nativeHandle_); + } + + @Override + public Options setAllowMmapWrites(boolean allowMmapWrites) { + assert(isInitialized()); + setAllowMmapWrites(nativeHandle_, allowMmapWrites); + return this; + } + + @Override + public boolean isFdCloseOnExec() { + assert(isInitialized()); + return isFdCloseOnExec(nativeHandle_); + } + + @Override + public Options setIsFdCloseOnExec(boolean isFdCloseOnExec) { + assert(isInitialized()); + setIsFdCloseOnExec(nativeHandle_, isFdCloseOnExec); + return this; + } + + @Override + @Deprecated + public boolean skipLogErrorOnRecovery() { + assert(isInitialized()); + return skipLogErrorOnRecovery(nativeHandle_); + } + + @Override + @Deprecated + public Options setSkipLogErrorOnRecovery(boolean skip) { + assert(isInitialized()); + setSkipLogErrorOnRecovery(nativeHandle_, skip); + return this; + } + + @Override + public int statsDumpPeriodSec() { + assert(isInitialized()); + return statsDumpPeriodSec(nativeHandle_); + } + + @Override + public Options setStatsDumpPeriodSec(int statsDumpPeriodSec) { + assert(isInitialized()); + setStatsDumpPeriodSec(nativeHandle_, statsDumpPeriodSec); + return this; + } + + @Override + public boolean adviseRandomOnOpen() { + return adviseRandomOnOpen(nativeHandle_); + } + + @Override + public Options setAdviseRandomOnOpen(boolean adviseRandomOnOpen) { + assert(isInitialized()); + setAdviseRandomOnOpen(nativeHandle_, adviseRandomOnOpen); + return this; + } + + @Override + public boolean useAdaptiveMutex() { + assert(isInitialized()); + return useAdaptiveMutex(nativeHandle_); + } + + @Override + public Options setUseAdaptiveMutex(boolean useAdaptiveMutex) { + assert(isInitialized()); + setUseAdaptiveMutex(nativeHandle_, useAdaptiveMutex); + return this; + } + + @Override + public long bytesPerSync() { + return bytesPerSync(nativeHandle_); + } + + @Override + public Options setBytesPerSync(long bytesPerSync) { + assert(isInitialized()); + setBytesPerSync(nativeHandle_, bytesPerSync); + return this; 
+ } + + @Override + public Options setMemTableConfig(MemTableConfig config) + throws RocksDBException { + memTableConfig_ = config; + setMemTableFactory(nativeHandle_, config.newMemTableFactoryHandle()); + return this; + } + + @Override + public Options setRateLimiterConfig(RateLimiterConfig config) { + rateLimiterConfig_ = config; + setRateLimiter(nativeHandle_, config.newRateLimiterHandle()); + return this; + } + + @Override + public Options setInfoLogLevel(InfoLogLevel infoLogLevel) { + assert(isInitialized()); + setInfoLogLevel(nativeHandle_, infoLogLevel.getValue()); + return this; + } + + @Override + public InfoLogLevel infoLogLevel() { + assert(isInitialized()); + return InfoLogLevel.getInfoLogLevel( + infoLogLevel(nativeHandle_)); + } + + @Override + public String memTableFactoryName() { + assert(isInitialized()); + return memTableFactoryName(nativeHandle_); + } + + @Override + public Options setTableFormatConfig(TableFormatConfig config) { + tableFormatConfig_ = config; + setTableFactory(nativeHandle_, config.newTableFactoryHandle()); + return this; + } + + @Override + public String tableFactoryName() { + assert(isInitialized()); + return tableFactoryName(nativeHandle_); + } + + @Override + public Options useFixedLengthPrefixExtractor(int n) { + assert(isInitialized()); + useFixedLengthPrefixExtractor(nativeHandle_, n); + return this; + } + + @Override + public CompressionType compressionType() { + return CompressionType.values()[compressionType(nativeHandle_)]; + } + + @Override + public Options setCompressionType(CompressionType compressionType) { + setCompressionType(nativeHandle_, compressionType.getValue()); + return this; + } + + @Override + public CompactionStyle compactionStyle() { + return CompactionStyle.values()[compactionStyle(nativeHandle_)]; + } + + @Override + public Options setCompactionStyle(CompactionStyle compactionStyle) { + setCompactionStyle(nativeHandle_, compactionStyle.getValue()); + return this; + } + + @Override + public int 
numLevels() { + return numLevels(nativeHandle_); + } + + @Override + public Options setNumLevels(int numLevels) { + setNumLevels(nativeHandle_, numLevels); + return this; + } + + @Override + public int levelZeroFileNumCompactionTrigger() { + return levelZeroFileNumCompactionTrigger(nativeHandle_); + } + + @Override + public Options setLevelZeroFileNumCompactionTrigger( + int numFiles) { + setLevelZeroFileNumCompactionTrigger( + nativeHandle_, numFiles); + return this; + } + + @Override + public int levelZeroSlowdownWritesTrigger() { + return levelZeroSlowdownWritesTrigger(nativeHandle_); + } + + @Override + public Options setLevelZeroSlowdownWritesTrigger( + int numFiles) { + setLevelZeroSlowdownWritesTrigger(nativeHandle_, numFiles); + return this; + } + + @Override + public int levelZeroStopWritesTrigger() { + return levelZeroStopWritesTrigger(nativeHandle_); + } + + @Override + public Options setLevelZeroStopWritesTrigger(int numFiles) { + setLevelZeroStopWritesTrigger(nativeHandle_, numFiles); + return this; + } + + @Override + public int maxMemCompactionLevel() { + return maxMemCompactionLevel(nativeHandle_); + } + + @Override + public Options setMaxMemCompactionLevel(int maxMemCompactionLevel) { + setMaxMemCompactionLevel(nativeHandle_, maxMemCompactionLevel); + return this; + } + + @Override + public long targetFileSizeBase() { + return targetFileSizeBase(nativeHandle_); + } + + @Override + public Options setTargetFileSizeBase(long targetFileSizeBase) { + setTargetFileSizeBase(nativeHandle_, targetFileSizeBase); + return this; + } + + @Override + public int targetFileSizeMultiplier() { + return targetFileSizeMultiplier(nativeHandle_); + } + + @Override + public Options setTargetFileSizeMultiplier(int multiplier) { + setTargetFileSizeMultiplier(nativeHandle_, multiplier); + return this; + } + + @Override + public long maxBytesForLevelBase() { + return maxBytesForLevelBase(nativeHandle_); + } + + @Override + public Options setMaxBytesForLevelBase(long 
maxBytesForLevelBase) { + setMaxBytesForLevelBase(nativeHandle_, maxBytesForLevelBase); + return this; + } + + @Override + public int maxBytesForLevelMultiplier() { + return maxBytesForLevelMultiplier(nativeHandle_); + } + + @Override + public Options setMaxBytesForLevelMultiplier(int multiplier) { + setMaxBytesForLevelMultiplier(nativeHandle_, multiplier); + return this; + } + + @Override + public int expandedCompactionFactor() { + return expandedCompactionFactor(nativeHandle_); + } + + @Override + public Options setExpandedCompactionFactor(int expandedCompactionFactor) { + setExpandedCompactionFactor(nativeHandle_, expandedCompactionFactor); + return this; + } + + @Override + public int sourceCompactionFactor() { + return sourceCompactionFactor(nativeHandle_); + } + + @Override + public Options setSourceCompactionFactor(int sourceCompactionFactor) { + setSourceCompactionFactor(nativeHandle_, sourceCompactionFactor); + return this; + } + + @Override + public int maxGrandparentOverlapFactor() { + return maxGrandparentOverlapFactor(nativeHandle_); + } + + @Override + public Options setMaxGrandparentOverlapFactor( + int maxGrandparentOverlapFactor) { + setMaxGrandparentOverlapFactor(nativeHandle_, maxGrandparentOverlapFactor); + return this; + } + + @Override + public double softRateLimit() { + return softRateLimit(nativeHandle_); + } + + @Override + public Options setSoftRateLimit(double softRateLimit) { + setSoftRateLimit(nativeHandle_, softRateLimit); + return this; + } + + @Override + public double hardRateLimit() { + return hardRateLimit(nativeHandle_); + } + + @Override + public Options setHardRateLimit(double hardRateLimit) { + setHardRateLimit(nativeHandle_, hardRateLimit); + return this; + } + + @Override + public int rateLimitDelayMaxMilliseconds() { + return rateLimitDelayMaxMilliseconds(nativeHandle_); + } + + @Override + public Options setRateLimitDelayMaxMilliseconds( + int rateLimitDelayMaxMilliseconds) { + setRateLimitDelayMaxMilliseconds( + 
nativeHandle_, rateLimitDelayMaxMilliseconds); + return this; + } + + @Override + public long arenaBlockSize() { + return arenaBlockSize(nativeHandle_); + } + + @Override + public Options setArenaBlockSize(long arenaBlockSize) + throws RocksDBException { + setArenaBlockSize(nativeHandle_, arenaBlockSize); + return this; + } + + @Override + public boolean disableAutoCompactions() { + return disableAutoCompactions(nativeHandle_); + } + + @Override + public Options setDisableAutoCompactions(boolean disableAutoCompactions) { + setDisableAutoCompactions(nativeHandle_, disableAutoCompactions); + return this; + } + + @Override + public boolean purgeRedundantKvsWhileFlush() { + return purgeRedundantKvsWhileFlush(nativeHandle_); + } + + @Override + public Options setPurgeRedundantKvsWhileFlush( + boolean purgeRedundantKvsWhileFlush) { + setPurgeRedundantKvsWhileFlush( + nativeHandle_, purgeRedundantKvsWhileFlush); + return this; + } + + @Override + public boolean verifyChecksumsInCompaction() { + return verifyChecksumsInCompaction(nativeHandle_); + } + + @Override + public Options setVerifyChecksumsInCompaction( + boolean verifyChecksumsInCompaction) { + setVerifyChecksumsInCompaction( + nativeHandle_, verifyChecksumsInCompaction); + return this; + } + + @Override + public boolean filterDeletes() { + return filterDeletes(nativeHandle_); + } + + @Override + public Options setFilterDeletes(boolean filterDeletes) { + setFilterDeletes(nativeHandle_, filterDeletes); + return this; + } + + @Override + public long maxSequentialSkipInIterations() { + return maxSequentialSkipInIterations(nativeHandle_); + } + + @Override + public Options setMaxSequentialSkipInIterations(long maxSequentialSkipInIterations) { + setMaxSequentialSkipInIterations(nativeHandle_, maxSequentialSkipInIterations); + return this; + } + + @Override + public boolean inplaceUpdateSupport() { + return inplaceUpdateSupport(nativeHandle_); + } + + @Override + public Options setInplaceUpdateSupport(boolean 
inplaceUpdateSupport) { + setInplaceUpdateSupport(nativeHandle_, inplaceUpdateSupport); + return this; + } + + @Override + public long inplaceUpdateNumLocks() { + return inplaceUpdateNumLocks(nativeHandle_); + } + + @Override + public Options setInplaceUpdateNumLocks(long inplaceUpdateNumLocks) + throws RocksDBException { + setInplaceUpdateNumLocks(nativeHandle_, inplaceUpdateNumLocks); + return this; + } + + @Override + public int memtablePrefixBloomBits() { + return memtablePrefixBloomBits(nativeHandle_); + } + + @Override + public Options setMemtablePrefixBloomBits(int memtablePrefixBloomBits) { + setMemtablePrefixBloomBits(nativeHandle_, memtablePrefixBloomBits); + return this; + } + + @Override + public int memtablePrefixBloomProbes() { + return memtablePrefixBloomProbes(nativeHandle_); + } + + @Override + public Options setMemtablePrefixBloomProbes(int memtablePrefixBloomProbes) { + setMemtablePrefixBloomProbes(nativeHandle_, memtablePrefixBloomProbes); + return this; + } + + @Override + public int bloomLocality() { + return bloomLocality(nativeHandle_); + } + + @Override + public Options setBloomLocality(int bloomLocality) { + setBloomLocality(nativeHandle_, bloomLocality); + return this; + } + + @Override + public long maxSuccessiveMerges() { + return maxSuccessiveMerges(nativeHandle_); + } + + @Override + public Options setMaxSuccessiveMerges(long maxSuccessiveMerges) + throws RocksDBException { + setMaxSuccessiveMerges(nativeHandle_, maxSuccessiveMerges); + return this; + } + + @Override + public int minWriteBufferNumberToMerge() { + return minWriteBufferNumberToMerge(nativeHandle_); + } + + @Override + public Options setMinWriteBufferNumberToMerge(int minWriteBufferNumberToMerge) { + setMinWriteBufferNumberToMerge(nativeHandle_, minWriteBufferNumberToMerge); + return this; + } + + @Override + public int minPartialMergeOperands() { + return minPartialMergeOperands(nativeHandle_); + } + + @Override + public Options setMinPartialMergeOperands(int 
minPartialMergeOperands) { + setMinPartialMergeOperands(nativeHandle_, minPartialMergeOperands); + return this; + } + + /** + * Release the memory allocated for the current instance + * in the c++ side. + */ + @Override protected void disposeInternal() { + assert(isInitialized()); + disposeInternal(nativeHandle_); + } + + private native void newOptions(); + private native void newOptions(long dbOptHandle, + long cfOptHandle); + private native void disposeInternal(long handle); + private native void setEnv(long optHandle, long envHandle); + private native void prepareForBulkLoad(long handle); + + // DB native handles + private native void setIncreaseParallelism(long handle, int totalThreads); + private native void setCreateIfMissing(long handle, boolean flag); + private native boolean createIfMissing(long handle); + private native void setCreateMissingColumnFamilies( + long handle, boolean flag); + private native boolean createMissingColumnFamilies(long handle); + private native void setErrorIfExists(long handle, boolean errorIfExists); + private native boolean errorIfExists(long handle); + private native void setParanoidChecks( + long handle, boolean paranoidChecks); + private native boolean paranoidChecks(long handle); + private native void setRateLimiter(long handle, + long rateLimiterHandle); + private native void setInfoLogLevel(long handle, byte logLevel); + private native byte infoLogLevel(long handle); + private native void setMaxOpenFiles(long handle, int maxOpenFiles); + private native int maxOpenFiles(long handle); + private native void setMaxTotalWalSize(long handle, + long maxTotalWalSize); + private native long maxTotalWalSize(long handle); + private native void createStatistics(long optHandle); + private native long statisticsPtr(long optHandle); + private native void setDisableDataSync(long handle, boolean disableDataSync); + private native boolean disableDataSync(long handle); + private native boolean useFsync(long handle); + private native void 
setUseFsync(long handle, boolean useFsync); + private native void setDbLogDir(long handle, String dbLogDir); + private native String dbLogDir(long handle); + private native void setWalDir(long handle, String walDir); + private native String walDir(long handle); + private native void setDeleteObsoleteFilesPeriodMicros( + long handle, long micros); + private native long deleteObsoleteFilesPeriodMicros(long handle); + private native void setMaxBackgroundCompactions( + long handle, int maxBackgroundCompactions); + private native int maxBackgroundCompactions(long handle); + private native void setMaxBackgroundFlushes( + long handle, int maxBackgroundFlushes); + private native int maxBackgroundFlushes(long handle); + private native void setMaxLogFileSize(long handle, long maxLogFileSize) + throws RocksDBException; + private native long maxLogFileSize(long handle); + private native void setLogFileTimeToRoll( + long handle, long logFileTimeToRoll) throws RocksDBException; + private native long logFileTimeToRoll(long handle); + private native void setKeepLogFileNum(long handle, long keepLogFileNum) + throws RocksDBException; + private native long keepLogFileNum(long handle); + private native void setMaxManifestFileSize( + long handle, long maxManifestFileSize); + private native long maxManifestFileSize(long handle); + private native void setTableCacheNumshardbits( + long handle, int tableCacheNumshardbits); + private native int tableCacheNumshardbits(long handle); + private native void setTableCacheRemoveScanCountLimit( + long handle, int limit); + private native int tableCacheRemoveScanCountLimit(long handle); + private native void setWalTtlSeconds(long handle, long walTtlSeconds); + private native long walTtlSeconds(long handle); + private native void setWalSizeLimitMB(long handle, long sizeLimitMB); + private native long walSizeLimitMB(long handle); + private native void setManifestPreallocationSize( + long handle, long size) throws RocksDBException; + private native 
long manifestPreallocationSize(long handle); + private native void setAllowOsBuffer( + long handle, boolean allowOsBuffer); + private native boolean allowOsBuffer(long handle); + private native void setAllowMmapReads( + long handle, boolean allowMmapReads); + private native boolean allowMmapReads(long handle); + private native void setAllowMmapWrites( + long handle, boolean allowMmapWrites); + private native boolean allowMmapWrites(long handle); + private native void setIsFdCloseOnExec( + long handle, boolean isFdCloseOnExec); + private native boolean isFdCloseOnExec(long handle); + private native void setSkipLogErrorOnRecovery( + long handle, boolean skip); + private native boolean skipLogErrorOnRecovery(long handle); + private native void setStatsDumpPeriodSec( + long handle, int statsDumpPeriodSec); + private native int statsDumpPeriodSec(long handle); + private native void setAdviseRandomOnOpen( + long handle, boolean adviseRandomOnOpen); + private native boolean adviseRandomOnOpen(long handle); + private native void setUseAdaptiveMutex( + long handle, boolean useAdaptiveMutex); + private native boolean useAdaptiveMutex(long handle); + private native void setBytesPerSync( + long handle, long bytesPerSync); + private native long bytesPerSync(long handle); + // CF native handles + private native void optimizeForPointLookup(long handle, + long blockCacheSizeMb); + private native void optimizeLevelStyleCompaction(long handle, + long memtableMemoryBudget); + private native void optimizeUniversalStyleCompaction(long handle, + long memtableMemoryBudget); + private native void setComparatorHandle(long handle, int builtinComparator); + private native void setComparatorHandle(long optHandle, long comparatorHandle); + private native void setMergeOperatorName( + long handle, String name); + private native void setMergeOperator( + long handle, long mergeOperatorHandle); + private native void setWriteBufferSize(long handle, long writeBufferSize) + throws RocksDBException; + 
private native long writeBufferSize(long handle); + private native void setMaxWriteBufferNumber( + long handle, int maxWriteBufferNumber); + private native int maxWriteBufferNumber(long handle); + private native void setMinWriteBufferNumberToMerge( + long handle, int minWriteBufferNumberToMerge); + private native int minWriteBufferNumberToMerge(long handle); + private native void setCompressionType(long handle, byte compressionType); + private native byte compressionType(long handle); + private native void useFixedLengthPrefixExtractor( + long handle, int prefixLength); + private native void setNumLevels( + long handle, int numLevels); + private native int numLevels(long handle); + private native void setLevelZeroFileNumCompactionTrigger( + long handle, int numFiles); + private native int levelZeroFileNumCompactionTrigger(long handle); + private native void setLevelZeroSlowdownWritesTrigger( + long handle, int numFiles); + private native int levelZeroSlowdownWritesTrigger(long handle); + private native void setLevelZeroStopWritesTrigger( + long handle, int numFiles); + private native int levelZeroStopWritesTrigger(long handle); + private native void setMaxMemCompactionLevel( + long handle, int maxMemCompactionLevel); + private native int maxMemCompactionLevel(long handle); + private native void setTargetFileSizeBase( + long handle, long targetFileSizeBase); + private native long targetFileSizeBase(long handle); + private native void setTargetFileSizeMultiplier( + long handle, int multiplier); + private native int targetFileSizeMultiplier(long handle); + private native void setMaxBytesForLevelBase( + long handle, long maxBytesForLevelBase); + private native long maxBytesForLevelBase(long handle); + private native void setMaxBytesForLevelMultiplier( + long handle, int multiplier); + private native int maxBytesForLevelMultiplier(long handle); + private native void setExpandedCompactionFactor( + long handle, int expandedCompactionFactor); + private native int 
expandedCompactionFactor(long handle); + private native void setSourceCompactionFactor( + long handle, int sourceCompactionFactor); + private native int sourceCompactionFactor(long handle); + private native void setMaxGrandparentOverlapFactor( + long handle, int maxGrandparentOverlapFactor); + private native int maxGrandparentOverlapFactor(long handle); + private native void setSoftRateLimit( + long handle, double softRateLimit); + private native double softRateLimit(long handle); + private native void setHardRateLimit( + long handle, double hardRateLimit); + private native double hardRateLimit(long handle); + private native void setRateLimitDelayMaxMilliseconds( + long handle, int rateLimitDelayMaxMilliseconds); + private native int rateLimitDelayMaxMilliseconds(long handle); + private native void setArenaBlockSize( + long handle, long arenaBlockSize) throws RocksDBException; + private native long arenaBlockSize(long handle); + private native void setDisableAutoCompactions( + long handle, boolean disableAutoCompactions); + private native boolean disableAutoCompactions(long handle); + private native void setCompactionStyle(long handle, byte compactionStyle); + private native byte compactionStyle(long handle); + private native void setPurgeRedundantKvsWhileFlush( + long handle, boolean purgeRedundantKvsWhileFlush); + private native boolean purgeRedundantKvsWhileFlush(long handle); + private native void setVerifyChecksumsInCompaction( + long handle, boolean verifyChecksumsInCompaction); + private native boolean verifyChecksumsInCompaction(long handle); + private native void setFilterDeletes( + long handle, boolean filterDeletes); + private native boolean filterDeletes(long handle); + private native void setMaxSequentialSkipInIterations( + long handle, long maxSequentialSkipInIterations); + private native long maxSequentialSkipInIterations(long handle); + private native void setMemTableFactory(long handle, long factoryHandle); + private native String 
memTableFactoryName(long handle); + private native void setTableFactory(long handle, long factoryHandle); + private native String tableFactoryName(long handle); + private native void setInplaceUpdateSupport( + long handle, boolean inplaceUpdateSupport); + private native boolean inplaceUpdateSupport(long handle); + private native void setInplaceUpdateNumLocks( + long handle, long inplaceUpdateNumLocks) throws RocksDBException; + private native long inplaceUpdateNumLocks(long handle); + private native void setMemtablePrefixBloomBits( + long handle, int memtablePrefixBloomBits); + private native int memtablePrefixBloomBits(long handle); + private native void setMemtablePrefixBloomProbes( + long handle, int memtablePrefixBloomProbes); + private native int memtablePrefixBloomProbes(long handle); + private native void setBloomLocality( + long handle, int bloomLocality); + private native int bloomLocality(long handle); + private native void setMaxSuccessiveMerges( + long handle, long maxSuccessiveMerges) throws RocksDBException; + private native long maxSuccessiveMerges(long handle); + private native void setMinPartialMergeOperands( + long handle, int minPartialMergeOperands); + private native int minPartialMergeOperands(long handle); + // instance variables + RocksEnv env_; + MemTableConfig memTableConfig_; + TableFormatConfig tableFormatConfig_; + RateLimiterConfig rateLimiterConfig_; + AbstractComparator comparator_; +} diff --git a/java/src/main/java/org/rocksdb/PlainTableConfig.java b/java/src/main/java/org/rocksdb/PlainTableConfig.java new file mode 100644 index 000000000..3a41bea84 --- /dev/null +++ b/java/src/main/java/org/rocksdb/PlainTableConfig.java @@ -0,0 +1,251 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+package org.rocksdb; + +/** + * The config for plain table sst format. + * + *

PlainTable is RocksDB's SST file format optimized for low query + * latency on pure-memory or really low-latency media.

+ * + *

It also supports the prefix hash feature.

+ */ +public class PlainTableConfig extends TableFormatConfig { + public static final int VARIABLE_LENGTH = 0; + public static final int DEFAULT_BLOOM_BITS_PER_KEY = 10; + public static final double DEFAULT_HASH_TABLE_RATIO = 0.75; + public static final int DEFAULT_INDEX_SPARSENESS = 16; + public static final int DEFAULT_HUGE_TLB_SIZE = 0; + public static final EncodingType DEFAULT_ENCODING_TYPE = + EncodingType.kPlain; + public static final boolean DEFAULT_FULL_SCAN_MODE = false; + public static final boolean DEFAULT_STORE_INDEX_IN_FILE + = false; + + public PlainTableConfig() { + keySize_ = VARIABLE_LENGTH; + bloomBitsPerKey_ = DEFAULT_BLOOM_BITS_PER_KEY; + hashTableRatio_ = DEFAULT_HASH_TABLE_RATIO; + indexSparseness_ = DEFAULT_INDEX_SPARSENESS; + hugePageTlbSize_ = DEFAULT_HUGE_TLB_SIZE; + encodingType_ = DEFAULT_ENCODING_TYPE; + fullScanMode_ = DEFAULT_FULL_SCAN_MODE; + storeIndexInFile_ = DEFAULT_STORE_INDEX_IN_FILE; + } + + /** + *

Set the length of the user key. If it is set to be + * VARIABLE_LENGTH, then it indicates the user keys are + * of variable length.

+ * + *

Otherwise, all the keys need to have the same length + * in bytes.

+ * + *

DEFAULT: VARIABLE_LENGTH

+ * + * @param keySize the length of the user key. + * @return the reference to the current config. + */ + public PlainTableConfig setKeySize(int keySize) { + keySize_ = keySize; + return this; + } + + /** + * @return the specified size of the user key. If VARIABLE_LENGTH, + * then it indicates variable-length key. + */ + public int keySize() { + return keySize_; + } + + /** + * Set the number of bits per key used by the internal bloom filter + * in the plain table sst format. + * + * @param bitsPerKey the number of bits per key for bloom filer. + * @return the reference to the current config. + */ + public PlainTableConfig setBloomBitsPerKey(int bitsPerKey) { + bloomBitsPerKey_ = bitsPerKey; + return this; + } + + /** + * @return the number of bits per key used for the bloom filter. + */ + public int bloomBitsPerKey() { + return bloomBitsPerKey_; + } + + /** + * hashTableRatio is the desired utilization of the hash table used + * for prefix hashing. The ideal ratio would be the number of + * prefixes / the number of hash buckets. If this value is set to + * zero, then hash table will not be used. + * + * @param ratio the hash table ratio. + * @return the reference to the current config. + */ + public PlainTableConfig setHashTableRatio(double ratio) { + hashTableRatio_ = ratio; + return this; + } + + /** + * @return the hash table ratio. + */ + public double hashTableRatio() { + return hashTableRatio_; + } + + /** + * Index sparseness determines the index interval for keys inside the + * same prefix. This number is equal to the maximum number of linear + * search required after hash and binary search. If it's set to 0, + * then each key will be indexed. + * + * @param sparseness the index sparseness. + * @return the reference to the current config. + */ + public PlainTableConfig setIndexSparseness(int sparseness) { + indexSparseness_ = sparseness; + return this; + } + + /** + * @return the index sparseness. 
+ */ + public long indexSparseness() { + return indexSparseness_; + } + + /** + *

huge_page_tlb_size: if ≤0, allocate hash indexes and blooms + * from malloc; otherwise from huge page TLB.

+ * + *

The user needs to reserve huge pages for it to be allocated, + * like: {@code sysctl -w vm.nr_hugepages=20}

+ * + *

See linux doc Documentation/vm/hugetlbpage.txt

+ * + * @param hugePageTlbSize huge page tlb size + * @return the reference to the current config. + */ + public PlainTableConfig setHugePageTlbSize(int hugePageTlbSize) { + this.hugePageTlbSize_ = hugePageTlbSize; + return this; + } + + /** + * Returns the value for huge page tlb size + * + * @return hugePageTlbSize + */ + public int hugePageTlbSize() { + return hugePageTlbSize_; + } + + /** + * Sets the encoding type. + * + *

This setting determines how to encode + * the keys. See enum {@link EncodingType} for + * the choices.

+ * + *

The value will determine how to encode keys + * when writing to a new SST file. This value will be stored + * inside the SST file which will be used when reading from + * the file, which makes it possible for users to choose + * different encoding type when reopening a DB. Files with + * different encoding types can co-exist in the same DB and + * can be read.

+ * + * @param encodingType {@link org.rocksdb.EncodingType} value. + * @return the reference to the current config. + */ + public PlainTableConfig setEncodingType(EncodingType encodingType) { + this.encodingType_ = encodingType; + return this; + } + + /** + * Returns the active EncodingType + * + * @return currently set encoding type + */ + public EncodingType encodingType() { + return encodingType_; + } + + /** + * Set full scan mode, if true the whole file will be read + * one record by one without using the index. + * + * @param fullScanMode boolean value indicating if full + * scan mode shall be enabled. + * @return the reference to the current config. + */ + public PlainTableConfig setFullScanMode(boolean fullScanMode) { + this.fullScanMode_ = fullScanMode; + return this; + } + + /** + * Return if full scan mode is active + * @return boolean value indicating if the full scan mode is + * enabled. + */ + public boolean fullScanMode() { + return fullScanMode_; + } + + /** + *

If set to true: compute plain table index and bloom + * filter during file building and store it in file. + * When reading file, index will be mmaped instead + * of doing recomputation.

+ * + * @param storeIndexInFile value indicating if index shall + * be stored in a file + * @return the reference to the current config. + */ + public PlainTableConfig setStoreIndexInFile(boolean storeIndexInFile) { + this.storeIndexInFile_ = storeIndexInFile; + return this; + } + + /** + * Return a boolean value indicating if index shall be stored + * in a file. + * + * @return currently set value for store index in file. + */ + public boolean storeIndexInFile() { + return storeIndexInFile_; + } + + @Override protected long newTableFactoryHandle() { + return newTableFactoryHandle(keySize_, bloomBitsPerKey_, + hashTableRatio_, indexSparseness_, hugePageTlbSize_, + encodingType_.getValue(), fullScanMode_, + storeIndexInFile_); + } + + private native long newTableFactoryHandle( + int keySize, int bloomBitsPerKey, + double hashTableRatio, int indexSparseness, + int hugePageTlbSize, byte encodingType, + boolean fullScanMode, boolean storeIndexInFile); + + private int keySize_; + private int bloomBitsPerKey_; + private double hashTableRatio_; + private int indexSparseness_; + private int hugePageTlbSize_; + private EncodingType encodingType_; + private boolean fullScanMode_; + private boolean storeIndexInFile_; +} diff --git a/java/src/main/java/org/rocksdb/RateLimiterConfig.java b/java/src/main/java/org/rocksdb/RateLimiterConfig.java new file mode 100644 index 000000000..09d1c7a04 --- /dev/null +++ b/java/src/main/java/org/rocksdb/RateLimiterConfig.java @@ -0,0 +1,23 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +package org.rocksdb; + +/** + * Config for rate limiter, which is used to control write rate of flush and + * compaction. 
+ */ +public abstract class RateLimiterConfig { + /** + * This function should only be called by + * {@link org.rocksdb.DBOptions#setRateLimiter(long, long)}, which will + * create a c++ shared-pointer to the c++ {@code RateLimiter} that is associated + * with a Java RateLimiterConfig. + * + * @see org.rocksdb.DBOptions#setRateLimiter(long, long) + * + * @return native handle address to rate limiter instance. + */ + abstract protected long newRateLimiterHandle(); +} diff --git a/java/org/rocksdb/ReadOptions.java b/java/src/main/java/org/rocksdb/ReadOptions.java similarity index 75% rename from java/org/rocksdb/ReadOptions.java rename to java/src/main/java/org/rocksdb/ReadOptions.java index 97c47c7d6..4a64f288b 100644 --- a/java/org/rocksdb/ReadOptions.java +++ b/java/src/main/java/org/rocksdb/ReadOptions.java @@ -64,7 +64,7 @@ public class ReadOptions extends RocksObject { private native boolean fillCache(long handle); /** - * Fill the cache when loading the block-based sst formated db. + * Fill the cache when loading the block-based sst formatted db. * Callers may wish to set this field to false for bulk scans. * Default: true * @@ -80,13 +80,51 @@ public class ReadOptions extends RocksObject { private native void setFillCache( long handle, boolean fillCache); + /** + *

If "snapshot" is non-nullptr, read as of the supplied snapshot + * (which must belong to the DB that is being read and which must + * not have been released). If "snapshot" is nullptr, use an implicit + * snapshot of the state at the beginning of this read operation.

+ *

Default: null

+ * + * @param snapshot {@link Snapshot} instance + * @return the reference to the current ReadOptions. + */ + public ReadOptions setSnapshot(Snapshot snapshot) { + assert(isInitialized()); + if (snapshot != null) { + setSnapshot(nativeHandle_, snapshot.nativeHandle_); + } else { + setSnapshot(nativeHandle_, 0l); + } + return this; + } + private native void setSnapshot(long handle, long snapshotHandle); + + /** + * Returns the currently assigned Snapshot instance. + * + * @return the Snapshot assigned to this instance. If no Snapshot + * is assigned null. + */ + public Snapshot snapshot() { + assert(isInitialized()); + long snapshotHandle = snapshot(nativeHandle_); + if (snapshotHandle != 0) { + return new Snapshot(snapshotHandle); + } + return null; + } + private native long snapshot(long handle); + /** * Specify to create a tailing iterator -- a special iterator that has a * view of the complete database (i.e. it can also be used to read newly * added data) and is optimized for sequential reads. It will return records * that were inserted into the database after the creation of the iterator. * Default: false - * Not supported in ROCKSDB_LITE mode! + * + * Not supported in {@code ROCKSDB_LITE} mode! * * @return true if tailing iterator is enabled. */ @@ -117,7 +155,6 @@ public class ReadOptions extends RocksObject { @Override protected void disposeInternal() { - assert(isInitialized()); disposeInternal(nativeHandle_); } private native void disposeInternal(long handle); diff --git a/java/src/main/java/org/rocksdb/RestoreBackupableDB.java b/java/src/main/java/org/rocksdb/RestoreBackupableDB.java new file mode 100644 index 000000000..e29628815 --- /dev/null +++ b/java/src/main/java/org/rocksdb/RestoreBackupableDB.java @@ -0,0 +1,162 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import java.util.List; + +/** + *

This class is used to access information about backups and + * restore from them.

+ * + *

Note: {@code dispose()} must be called before this instance + * become out-of-scope to release the allocated + * memory in c++.

+ * + */ +public class RestoreBackupableDB extends RocksObject { + /** + *

Construct new RestoreBackupableDB instance.

+ * + * @param options {@link org.rocksdb.BackupableDBOptions} instance + */ + public RestoreBackupableDB(BackupableDBOptions options) { + super(); + nativeHandle_ = newRestoreBackupableDB(options.nativeHandle_); + } + + /** + *

Restore from backup with backup_id.

+ * + *

Important: If options_.share_table_files == true + * and you restore DB from some backup that is not the latest, and you + * start creating new backups from the new DB, they will probably + * fail.

+ * + *

Example: Let's say you have backups 1, 2, 3, 4, 5 + * and you restore 3. If you add new data to the DB and try creating a new + * backup now, the database will diverge from backups 4 and 5 and the new + * backup will fail. If you want to create new backup, you will first have + * to delete backups 4 and 5.

+ * + * @param backupId id pointing to backup + * @param dbDir database directory to restore to + * @param walDir directory where wal files are located + * @param restoreOptions {@link org.rocksdb.RestoreOptions} instance. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void restoreDBFromBackup(long backupId, String dbDir, String walDir, + RestoreOptions restoreOptions) throws RocksDBException { + assert(isInitialized()); + restoreDBFromBackup0(nativeHandle_, backupId, dbDir, walDir, + restoreOptions.nativeHandle_); + } + + /** + *

Restore from the latest backup.

+ * + * @param dbDir database directory to restore to + * @param walDir directory where wal files are located + * @param restoreOptions {@link org.rocksdb.RestoreOptions} instance + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void restoreDBFromLatestBackup(String dbDir, String walDir, + RestoreOptions restoreOptions) throws RocksDBException { + assert(isInitialized()); + restoreDBFromLatestBackup0(nativeHandle_, dbDir, walDir, + restoreOptions.nativeHandle_); + } + + /** + *

Deletes old backups, keeping latest numBackupsToKeep alive.

+ * + * @param numBackupsToKeep of latest backups to keep + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void purgeOldBackups(int numBackupsToKeep) throws RocksDBException { + assert(isInitialized()); + purgeOldBackups0(nativeHandle_, numBackupsToKeep); + } + + /** + *

Deletes a specific backup.

+ * + * @param backupId of backup to delete. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void deleteBackup(int backupId) throws RocksDBException { + assert(isInitialized()); + deleteBackup0(nativeHandle_, backupId); + } + + /** + *

Returns a list of {@link BackupInfo} instances, which describe + * already made backups.

+ * + * @return List of {@link BackupInfo} instances. + */ + public List getBackupInfos() { + assert(isInitialized()); + return getBackupInfo(nativeHandle_); + } + + /** + *

Returns a list of corrupted backup ids. If there + * is no corrupted backup the method will return an + * empty list.

+ * + * @return array of backup ids as int ids. + */ + public int[] getCorruptedBackups() { + assert(isInitialized()); + return getCorruptedBackups(nativeHandle_); + } + + /** + *

Will delete all the files we don't need anymore. It will + * do the full scan of the files/ directory and delete all the + * files that are not referenced.

+ * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void garbageCollect() throws RocksDBException { + assert(isInitialized()); + garbageCollect(nativeHandle_); + } + + /** + *

Release the memory allocated for the current instance + * in the c++ side.

+ */ + @Override public synchronized void disposeInternal() { + dispose(nativeHandle_); + } + + private native long newRestoreBackupableDB(long options); + private native void restoreDBFromBackup0(long nativeHandle, long backupId, + String dbDir, String walDir, long restoreOptions) + throws RocksDBException; + private native void restoreDBFromLatestBackup0(long nativeHandle, + String dbDir, String walDir, long restoreOptions) + throws RocksDBException; + private native void purgeOldBackups0(long nativeHandle, int numBackupsToKeep) + throws RocksDBException; + private native void deleteBackup0(long nativeHandle, int backupId) + throws RocksDBException; + private native List getBackupInfo(long handle); + private native int[] getCorruptedBackups(long handle); + private native void garbageCollect(long handle) + throws RocksDBException; + private native void dispose(long nativeHandle); +} diff --git a/java/org/rocksdb/RestoreOptions.java b/java/src/main/java/org/rocksdb/RestoreOptions.java similarity index 73% rename from java/org/rocksdb/RestoreOptions.java rename to java/src/main/java/org/rocksdb/RestoreOptions.java index 77a2b99bc..2325c8f6c 100644 --- a/java/org/rocksdb/RestoreOptions.java +++ b/java/src/main/java/org/rocksdb/RestoreOptions.java @@ -11,13 +11,17 @@ package org.rocksdb; * Note that dispose() must be called before this instance become out-of-scope * to release the allocated memory in c++. * - * @param If true, restore won't overwrite the existing log files in wal_dir. It - * will also move all log files from archive directory to wal_dir. Use this - * option in combination with BackupableDBOptions::backup_log_files = false - * for persisting in-memory databases. - * Default: false */ public class RestoreOptions extends RocksObject { + /** + * Constructor + * + * @param keepLogFiles If true, restore won't overwrite the existing log files in wal_dir. It + * will also move all log files from archive directory to wal_dir. 
Use this + * option in combination with BackupableDBOptions::backup_log_files = false + * for persisting in-memory databases. + * Default: false + */ public RestoreOptions(boolean keepLogFiles) { super(); nativeHandle_ = newRestoreOptions(keepLogFiles); diff --git a/java/src/main/java/org/rocksdb/RocksDB.java b/java/src/main/java/org/rocksdb/RocksDB.java new file mode 100644 index 000000000..ed8b05b93 --- /dev/null +++ b/java/src/main/java/org/rocksdb/RocksDB.java @@ -0,0 +1,1812 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import java.util.*; +import java.io.IOException; +import org.rocksdb.util.Environment; + +/** + * A RocksDB is a persistent ordered map from keys to values. It is safe for + * concurrent access from multiple threads without any external synchronization. + * All methods of this class could potentially throw RocksDBException, which + * indicates sth wrong at the RocksDB library side and the call failed. + */ +public class RocksDB extends RocksObject { + public static final byte[] DEFAULT_COLUMN_FAMILY = "default".getBytes(); + public static final int NOT_FOUND = -1; + + static { + RocksDB.loadLibrary(); + } + + /** + * Loads the necessary library files. + * Calling this method twice will have no effect. + * By default the method extracts the shared library for loading at + * java.io.tmpdir, however, you can override this temporary location by + * setting the environment variable ROCKSDB_SHAREDLIB_DIR. + */ + public static synchronized void loadLibrary() { + String tmpDir = System.getenv("ROCKSDB_SHAREDLIB_DIR"); + // loading possibly necessary libraries. 
+ for (CompressionType compressionType : CompressionType.values()) { + try { + if (compressionType.getLibraryName() != null) { + System.loadLibrary(compressionType.getLibraryName()); + } + } catch (UnsatisfiedLinkError e) { + // since it may be optional, we ignore its loading failure here. + } + } + try + { + NativeLibraryLoader.getInstance().loadLibrary(tmpDir); + } + catch (IOException e) + { + throw new RuntimeException("Unable to load the RocksDB shared library" + e); + } + } + + /** + * Tries to load the necessary library files from the given list of + * directories. + * + * @param paths a list of strings where each describes a directory + * of a library. + */ + public static synchronized void loadLibrary(List paths) { + for (CompressionType compressionType : CompressionType.values()) { + if (compressionType.equals(CompressionType.NO_COMPRESSION)) { + continue; + } + for (String path : paths) { + try { + System.load(path + "/" + Environment.getSharedLibraryFileName( + compressionType.getLibraryName())); + break; + } catch (UnsatisfiedLinkError e) { + // since they are optional, we ignore loading fails. + } + } + } + boolean success = false; + UnsatisfiedLinkError err = null; + for (String path : paths) { + try { + System.load(path + "/" + Environment.getJniLibraryFileName("rocksdbjni")); + success = true; + break; + } catch (UnsatisfiedLinkError e) { + err = e; + } + } + if (!success) { + throw err; + } + } + + /** + * The factory constructor of RocksDB that opens a RocksDB instance given + * the path to the database using the default options w/ createIfMissing + * set to true. + * + * @param path the path to the rocksdb. + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. 
+ * @see Options#setCreateIfMissing(boolean) + */ + public static RocksDB open(String path) throws RocksDBException { + // This allows to use the rocksjni default Options instead of + // the c++ one. + Options options = new Options(); + options.setCreateIfMissing(true); + return open(options, path); + } + + /** + * The factory constructor of RocksDB that opens a RocksDB instance given + * the path to the database using the specified options and db path and a list + * of column family names. + *

+ * If opened in read write mode every existing column family name must be passed + * within the list to this method.

+ *

+ * If opened in read-only mode only a subset of existing column families must + * be passed to this method.

+ *

+ * Options instance *should* not be disposed before all DBs using this options + * instance have been closed. If user doesn't call options dispose explicitly, + * then this options instance will be GC'd automatically

+ *

+ * ColumnFamily handles are disposed when the RocksDB instance is disposed. + *

+ * + * @param path the path to the rocksdb. + * @param columnFamilyDescriptors list of column family descriptors + * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances + * on open. + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + * @see DBOptions#setCreateIfMissing(boolean) + */ + public static RocksDB open(String path, + List columnFamilyDescriptors, + List columnFamilyHandles) throws RocksDBException { + // This allows to use the rocksjni default Options instead of + // the c++ one. + DBOptions options = new DBOptions(); + return open(options, path, columnFamilyDescriptors, columnFamilyHandles); + } + + /** + * The factory constructor of RocksDB that opens a RocksDB instance given + * the path to the database using the specified options and db path. + * + *

+ * Options instance *should* not be disposed before all DBs using this options + * instance have been closed. If user doesn't call options dispose explicitly, + * then this options instance will be GC'd automatically.

+ *

+ * Options instance can be re-used to open multiple DBs if DB statistics is + * not used. If DB statistics are required, then its recommended to open DB + * with new Options instance as underlying native statistics instance does not + * use any locks to prevent concurrent updates.

+ * + * @param options {@link org.rocksdb.Options} instance. + * @param path the path to the rocksdb. + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + * + * @see Options#setCreateIfMissing(boolean) + */ + public static RocksDB open(Options options, String path) + throws RocksDBException { + // when non-default Options is used, keeping an Options reference + // in RocksDB can prevent Java to GC during the life-time of + // the currently-created RocksDB. + RocksDB db = new RocksDB(); + db.open(options.nativeHandle_, path); + + db.storeOptionsInstance(options); + return db; + } + + /** + * The factory constructor of RocksDB that opens a RocksDB instance given + * the path to the database using the specified options and db path and a list + * of column family names. + *

+ * If opened in read write mode every existing column family name must be passed + * within the list to this method.

+ *

+ * If opened in read-only mode only a subset of existing column families must + * be passed to this method.

+ *

+ * Options instance *should* not be disposed before all DBs using this options + * instance have been closed. If user doesn't call options dispose explicitly, + * then this options instance will be GC'd automatically.

+ *

+ * Options instance can be re-used to open multiple DBs if DB statistics is + * not used. If DB statistics are required, then its recommended to open DB + * with new Options instance as underlying native statistics instance does not + * use any locks to prevent concurrent updates.

+ *

+ * ColumnFamily handles are disposed when the RocksDB instance is disposed.

+ * + * @param options {@link org.rocksdb.DBOptions} instance. + * @param path the path to the rocksdb. + * @param columnFamilyDescriptors list of column family descriptors + * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances + * on open. + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + * + * @see DBOptions#setCreateIfMissing(boolean) + */ + public static RocksDB open(DBOptions options, String path, + List columnFamilyDescriptors, + List columnFamilyHandles) + throws RocksDBException { + RocksDB db = new RocksDB(); + List cfReferences = db.open(options.nativeHandle_, path, + columnFamilyDescriptors, columnFamilyDescriptors.size()); + for (int i = 0; i < columnFamilyDescriptors.size(); i++) { + columnFamilyHandles.add(new ColumnFamilyHandle(db, cfReferences.get(i))); + } + db.storeOptionsInstance(options); + return db; + } + + /** + * The factory constructor of RocksDB that opens a RocksDB instance in + * Read-Only mode given the path to the database using the default + * options. + * + * @param path the path to the RocksDB. + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public static RocksDB openReadOnly(String path) + throws RocksDBException { + // This allows to use the rocksjni default Options instead of + // the c++ one. + Options options = new Options(); + return openReadOnly(options, path); + } + + /** + * The factory constructor of RocksDB that opens a RocksDB instance in + * Read-Only mode given the path to the database using the default + * options. + * + * @param path the path to the RocksDB. 
+ * @param columnFamilyDescriptors list of column family descriptors + * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances + * on open. + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public static RocksDB openReadOnly(String path, + List columnFamilyDescriptors, + List columnFamilyHandles) throws RocksDBException { + // This allows to use the rocksjni default Options instead of + // the c++ one. + DBOptions options = new DBOptions(); + return openReadOnly(options, path, columnFamilyDescriptors, + columnFamilyHandles); + } + + /** + * The factory constructor of RocksDB that opens a RocksDB instance in + * Read-Only mode given the path to the database using the specified + * options and db path. + * + * Options instance *should* not be disposed before all DBs using this options + * instance have been closed. If user doesn't call options dispose explicitly, + * then this options instance will be GC'd automatically. + * + * @param options {@link Options} instance. + * @param path the path to the RocksDB. + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public static RocksDB openReadOnly(Options options, String path) + throws RocksDBException { + // when non-default Options is used, keeping an Options reference + // in RocksDB can prevent Java to GC during the life-time of + // the currently-created RocksDB. + RocksDB db = new RocksDB(); + db.openROnly(options.nativeHandle_, path); + + db.storeOptionsInstance(options); + return db; + } + + /** + * The factory constructor of RocksDB that opens a RocksDB instance in + * Read-Only mode given the path to the database using the specified + * options and db path. + * + *

This open method allows to open RocksDB using a subset of available + * column families

+ *

Options instance *should* not be disposed before all DBs using this + * options instance have been closed. If user doesn't call options dispose + * explicitly,then this options instance will be GC'd automatically.

+ * + * @param options {@link DBOptions} instance. + * @param path the path to the RocksDB. + * @param columnFamilyDescriptors list of column family descriptors + * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances + * on open. + * @return a {@link RocksDB} instance on success, null if the specified + * {@link RocksDB} can not be opened. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public static RocksDB openReadOnly(DBOptions options, String path, + List columnFamilyDescriptors, + List columnFamilyHandles) + throws RocksDBException { + // when non-default Options is used, keeping an Options reference + // in RocksDB can prevent Java to GC during the life-time of + // the currently-created RocksDB. + RocksDB db = new RocksDB(); + List cfReferences = db.openROnly(options.nativeHandle_, path, + columnFamilyDescriptors, columnFamilyDescriptors.size()); + for (int i=0; i listColumnFamilies(Options options, String path) + throws RocksDBException { + return RocksDB.listColumnFamilies(options.nativeHandle_, path); + } + + private void storeOptionsInstance(DBOptionsInterface options) { + options_ = options; + } + + @Override protected void disposeInternal() { + synchronized (this) { + assert (isInitialized()); + disposeInternal(nativeHandle_); + } + } + + /** + * Close the RocksDB instance. + * This function is equivalent to dispose(). + */ + public void close() { + dispose(); + } + + /** + * Set the database entry for "key" to "value". + * + * @param key the specified key to be inserted. + * @param value the value associated with the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void put(byte[] key, byte[] value) throws RocksDBException { + put(nativeHandle_, key, key.length, value, value.length); + } + + /** + * Set the database entry for "key" to "value" in the specified + * column family. 
+ * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the specified key to be inserted. + * @param value the value associated with the specified key. + * + * throws IllegalArgumentException if column family is not present + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void put(ColumnFamilyHandle columnFamilyHandle, byte[] key, + byte[] value) throws RocksDBException { + put(nativeHandle_, key, key.length, value, value.length, + columnFamilyHandle.nativeHandle_); + } + + /** + * Set the database entry for "key" to "value". + * + * @param writeOpts {@link org.rocksdb.WriteOptions} instance. + * @param key the specified key to be inserted. + * @param value the value associated with the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void put(WriteOptions writeOpts, byte[] key, byte[] value) + throws RocksDBException { + put(nativeHandle_, writeOpts.nativeHandle_, + key, key.length, value, value.length); + } + + /** + * Set the database entry for "key" to "value" for the specified + * column family. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param writeOpts {@link org.rocksdb.WriteOptions} instance. + * @param key the specified key to be inserted. + * @param value the value associated with the specified key. + * + * throws IllegalArgumentException if column family is not present + * + * @throws RocksDBException thrown if error happens in underlying + * native library. 
+ * @see IllegalArgumentException + */ + public void put(ColumnFamilyHandle columnFamilyHandle, WriteOptions writeOpts, + byte[] key, byte[] value) throws RocksDBException { + put(nativeHandle_, writeOpts.nativeHandle_, key, key.length, value, value.length, + columnFamilyHandle.nativeHandle_); + } + + /** + * If the key definitely does not exist in the database, then this method + * returns false, else true. + * + * This check is potentially lighter-weight than invoking DB::Get(). One way + * to make this lighter weight is to avoid doing any IOs. + * + * @param key byte array of a key to search for + * @param value StringBuffer instance which is a out parameter if a value is + * found in block-cache. + * @return boolean value indicating if key does not exist or might exist. + */ + public boolean keyMayExist(byte[] key, StringBuffer value){ + return keyMayExist(key, key.length, value); + } + + /** + * If the key definitely does not exist in the database, then this method + * returns false, else true. + * + * This check is potentially lighter-weight than invoking DB::Get(). One way + * to make this lighter weight is to avoid doing any IOs. + * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key byte array of a key to search for + * @param value StringBuffer instance which is a out parameter if a value is + * found in block-cache. + * @return boolean value indicating if key does not exist or might exist. + */ + public boolean keyMayExist(ColumnFamilyHandle columnFamilyHandle, + byte[] key, StringBuffer value){ + return keyMayExist(key, key.length, columnFamilyHandle.nativeHandle_, + value); + } + + /** + * If the key definitely does not exist in the database, then this method + * returns false, else true. + * + * This check is potentially lighter-weight than invoking DB::Get(). One way + * to make this lighter weight is to avoid doing any IOs. 
+ * + * @param readOptions {@link ReadOptions} instance + * @param key byte array of a key to search for + * @param value StringBuffer instance which is a out parameter if a value is + * found in block-cache. + * @return boolean value indicating if key does not exist or might exist. + */ + public boolean keyMayExist(ReadOptions readOptions, + byte[] key, StringBuffer value){ + return keyMayExist(readOptions.nativeHandle_, + key, key.length, value); + } + + /** + * If the key definitely does not exist in the database, then this method + * returns false, else true. + * + * This check is potentially lighter-weight than invoking DB::Get(). One way + * to make this lighter weight is to avoid doing any IOs. + * + * @param readOptions {@link ReadOptions} instance + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key byte array of a key to search for + * @param value StringBuffer instance which is a out parameter if a value is + * found in block-cache. + * @return boolean value indicating if key does not exist or might exist. + */ + public boolean keyMayExist(ReadOptions readOptions, + ColumnFamilyHandle columnFamilyHandle, byte[] key, StringBuffer value){ + return keyMayExist(readOptions.nativeHandle_, + key, key.length, columnFamilyHandle.nativeHandle_, + value); + } + + /** + * Apply the specified updates to the database. + * + * @param writeOpts WriteOptions instance + * @param updates WriteBatch instance + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void write(WriteOptions writeOpts, WriteBatch updates) + throws RocksDBException { + write0(writeOpts.nativeHandle_, updates.nativeHandle_); + } + + /** + * Apply the specified updates to the database. + * + * @param writeOpts WriteOptions instance + * @param updates WriteBatchWithIndex instance + * + * @throws RocksDBException thrown if error happens in underlying + * native library. 
+ */ + public void write(WriteOptions writeOpts, WriteBatchWithIndex updates) + throws RocksDBException { + write1(writeOpts.nativeHandle_, updates.nativeHandle_); + } + + /** + * Add merge operand for key/value pair. + * + * @param key the specified key to be merged. + * @param value the value to be merged with the current value for + * the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void merge(byte[] key, byte[] value) throws RocksDBException { + merge(nativeHandle_, key, key.length, value, value.length); + } + + /** + * Add merge operand for key/value pair in a ColumnFamily. + * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key the specified key to be merged. + * @param value the value to be merged with the current value for + * the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void merge(ColumnFamilyHandle columnFamilyHandle, byte[] key, + byte[] value) throws RocksDBException { + merge(nativeHandle_, key, key.length, value, value.length, + columnFamilyHandle.nativeHandle_); + } + + /** + * Add merge operand for key/value pair. + * + * @param writeOpts {@link WriteOptions} for this write. + * @param key the specified key to be merged. + * @param value the value to be merged with the current value for + * the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void merge(WriteOptions writeOpts, byte[] key, byte[] value) + throws RocksDBException { + merge(nativeHandle_, writeOpts.nativeHandle_, + key, key.length, value, value.length); + } + + /** + * Add merge operand for key/value pair. + * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param writeOpts {@link WriteOptions} for this write. + * @param key the specified key to be merged. 
+ * @param value the value to be merged with the current value for + * the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void merge(ColumnFamilyHandle columnFamilyHandle, + WriteOptions writeOpts, byte[] key, byte[] value) + throws RocksDBException { + merge(nativeHandle_, writeOpts.nativeHandle_, + key, key.length, value, value.length, + columnFamilyHandle.nativeHandle_); + } + + /** + * Get the value associated with the specified key within column family* + * @param key the key to retrieve the value. + * @param value the out-value to receive the retrieved value. + * @return The size of the actual value that matches the specified + * {@code key} in byte. If the return value is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and partial result will + * be returned. RocksDB.NOT_FOUND will be returned if the value not + * found. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public int get(byte[] key, byte[] value) throws RocksDBException { + return get(nativeHandle_, key, key.length, value, value.length); + } + + /** + * Get the value associated with the specified key within column family. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the key to retrieve the value. + * @param value the out-value to receive the retrieved value. + * @return The size of the actual value that matches the specified + * {@code key} in byte. If the return value is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and partial result will + * be returned. RocksDB.NOT_FOUND will be returned if the value not + * found. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. 
+ */ + public int get(ColumnFamilyHandle columnFamilyHandle, byte[] key, byte[] value) + throws RocksDBException, IllegalArgumentException { + return get(nativeHandle_, key, key.length, value, value.length, + columnFamilyHandle.nativeHandle_); + } + + /** + * Get the value associated with the specified key. + * + * @param opt {@link org.rocksdb.ReadOptions} instance. + * @param key the key to retrieve the value. + * @param value the out-value to receive the retrieved value. + * @return The size of the actual value that matches the specified + * {@code key} in byte. If the return value is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and partial result will + * be returned. RocksDB.NOT_FOUND will be returned if the value not + * found. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public int get(ReadOptions opt, byte[] key, byte[] value) + throws RocksDBException { + return get(nativeHandle_, opt.nativeHandle_, + key, key.length, value, value.length); + } + /** + * Get the value associated with the specified key within column family. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param opt {@link org.rocksdb.ReadOptions} instance. + * @param key the key to retrieve the value. + * @param value the out-value to receive the retrieved value. + * @return The size of the actual value that matches the specified + * {@code key} in byte. If the return value is greater than the + * length of {@code value}, then it indicates that the size of the + * input buffer {@code value} is insufficient and partial result will + * be returned. RocksDB.NOT_FOUND will be returned if the value not + * found. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. 
+ */ + public int get(ColumnFamilyHandle columnFamilyHandle, ReadOptions opt, byte[] key, + byte[] value) throws RocksDBException { + return get(nativeHandle_, opt.nativeHandle_, key, key.length, value, + value.length, columnFamilyHandle.nativeHandle_); + } + + /** + * The simplified version of get which returns a new byte array storing + * the value associated with the specified input key if any. null will be + * returned if the specified key is not found. + * + * @param key the key retrieve the value. + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public byte[] get(byte[] key) throws RocksDBException { + return get(nativeHandle_, key, key.length); + } + + /** + * The simplified version of get which returns a new byte array storing + * the value associated with the specified input key if any. null will be + * returned if the specified key is not found. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the key retrieve the value. + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public byte[] get(ColumnFamilyHandle columnFamilyHandle, byte[] key) + throws RocksDBException { + return get(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); + } + + /** + * The simplified version of get which returns a new byte array storing + * the value associated with the specified input key if any. null will be + * returned if the specified key is not found. + * + * @param key the key retrieve the value. + * @param opt Read options. + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. 
+ * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public byte[] get(ReadOptions opt, byte[] key) throws RocksDBException { + return get(nativeHandle_, opt.nativeHandle_, key, key.length); + } + + /** + * The simplified version of get which returns a new byte array storing + * the value associated with the specified input key if any. null will be + * returned if the specified key is not found. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the key retrieve the value. + * @param opt Read options. + * @return a byte array storing the value associated with the input key if + * any. null if it does not find the specified key. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public byte[] get(ColumnFamilyHandle columnFamilyHandle, ReadOptions opt, + byte[] key) throws RocksDBException { + return get(nativeHandle_, opt.nativeHandle_, key, key.length, + columnFamilyHandle.nativeHandle_); + } + + /** + * Returns a map of keys for which values were found in DB. + * + * @param keys List of keys for which values need to be retrieved. + * @return Map where key of map is the key passed by user and value for map + * entry is the corresponding value in DB. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public Map multiGet(List keys) + throws RocksDBException { + assert(keys.size() != 0); + + List values = multiGet( + nativeHandle_, keys, keys.size()); + + Map keyValueMap = new HashMap<>(); + for(int i = 0; i < values.size(); i++) { + if(values.get(i) == null) { + continue; + } + + keyValueMap.put(keys.get(i), values.get(i)); + } + + return keyValueMap; + } + + /** + * Returns a map of keys for which values were found in DB. + *

+ * Note: Every key needs to have a related column family name in + * {@code columnFamilyHandleList}. + *

+ * + * @param columnFamilyHandleList {@link java.util.List} containing + * {@link org.rocksdb.ColumnFamilyHandle} instances. + * @param keys List of keys for which values need to be retrieved. + * @return Map where key of map is the key passed by user and value for map + * entry is the corresponding value in DB. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + * @throws IllegalArgumentException thrown if the size of passed keys is not + * equal to the amount of passed column family handles. + */ + public Map multiGet(List columnFamilyHandleList, + List keys) throws RocksDBException, IllegalArgumentException { + assert(keys.size() != 0); + // Check if key size equals cfList size. If not a exception must be + // thrown. If not a Segmentation fault happens. + if (keys.size()!=columnFamilyHandleList.size()) { + throw new IllegalArgumentException( + "For each key there must be a ColumnFamilyHandle."); + } + List values = multiGet(nativeHandle_, keys, keys.size(), + columnFamilyHandleList); + + Map keyValueMap = new HashMap<>(); + for(int i = 0; i < values.size(); i++) { + if (values.get(i) == null) { + continue; + } + keyValueMap.put(keys.get(i), values.get(i)); + } + return keyValueMap; + } + + /** + * Returns a map of keys for which values were found in DB. + * + * @param opt Read options. + * @param keys of keys for which values need to be retrieved. + * @return Map where key of map is the key passed by user and value for map + * entry is the corresponding value in DB. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. 
+ */ + public Map multiGet(ReadOptions opt, List keys) + throws RocksDBException { + assert(keys.size() != 0); + + List values = multiGet( + nativeHandle_, opt.nativeHandle_, keys, keys.size()); + + Map keyValueMap = new HashMap<>(); + for(int i = 0; i < values.size(); i++) { + if(values.get(i) == null) { + continue; + } + + keyValueMap.put(keys.get(i), values.get(i)); + } + + return keyValueMap; + } + + /** + * Returns a map of keys for which values were found in DB. + *

+ * Note: Every key needs to have a related column family name in + * {@code columnFamilyHandleList}. + *

+ * + * @param opt Read options. + * @param columnFamilyHandleList {@link java.util.List} containing + * {@link org.rocksdb.ColumnFamilyHandle} instances. + * @param keys of keys for which values need to be retrieved. + * @return Map where key of map is the key passed by user and value for map + * entry is the corresponding value in DB. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + * @throws IllegalArgumentException thrown if the size of passed keys is not + * equal to the amount of passed column family handles. + */ + public Map multiGet(ReadOptions opt, + List columnFamilyHandleList, List keys) + throws RocksDBException { + assert(keys.size() != 0); + // Check if key size equals cfList size. If not a exception must be + // thrown. If not a Segmentation fault happens. + if (keys.size()!=columnFamilyHandleList.size()){ + throw new IllegalArgumentException( + "For each key there must be a ColumnFamilyHandle."); + } + + List values = multiGet(nativeHandle_, opt.nativeHandle_, + keys, keys.size(), columnFamilyHandleList); + + Map keyValueMap = new HashMap<>(); + for(int i = 0; i < values.size(); i++) { + if(values.get(i) == null) { + continue; + } + keyValueMap.put(keys.get(i), values.get(i)); + } + + return keyValueMap; + } + + /** + * Remove the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. + * + * @param key Key to delete within database + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void remove(byte[] key) throws RocksDBException { + remove(nativeHandle_, key, key.length); + } + + /** + * Remove the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. 
+ * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key Key to delete within database + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key) + throws RocksDBException { + remove(nativeHandle_, key, key.length, columnFamilyHandle.nativeHandle_); + } + + /** + * Remove the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. + * + * @param writeOpt WriteOptions to be used with delete operation + * @param key Key to delete within database + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void remove(WriteOptions writeOpt, byte[] key) + throws RocksDBException { + remove(nativeHandle_, writeOpt.nativeHandle_, key, key.length); + } + + /** + * Remove the database entry (if any) for "key". Returns OK on + * success, and a non-OK status on error. It is not an error if "key" + * did not exist in the database. + * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param writeOpt WriteOptions to be used with delete operation + * @param key Key to delete within database + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void remove(ColumnFamilyHandle columnFamilyHandle, WriteOptions writeOpt, + byte[] key) throws RocksDBException { + remove(nativeHandle_, writeOpt.nativeHandle_, key, key.length, + columnFamilyHandle.nativeHandle_); + } + + /** + * DB implements can export properties about their state + * via this method on a per column family level. + * + *

If {@code property} is a valid property understood by this DB + * implementation, fills {@code value} with its current value and + * returns true. Otherwise returns false.

+ * + *

Valid property names include: + *

    + *
  • "rocksdb.num-files-at-level<N>" - return the number of files at level <N>, + * where <N> is an ASCII representation of a level number (e.g. "0").
  • + *
  • "rocksdb.stats" - returns a multi-line string that describes statistics + * about the internal operation of the DB.
  • + *
  • "rocksdb.sstables" - returns a multi-line string that describes all + * of the sstables that make up the db contents.
  • + *
+ * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param property to be fetched. See above for examples + * @return property value + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public String getProperty(ColumnFamilyHandle columnFamilyHandle, String property) + throws RocksDBException { + return getProperty0(nativeHandle_, columnFamilyHandle.nativeHandle_, property, + property.length()); + } + + /** + * DB implementations can export properties about their state + * via this method. If "property" is a valid property understood by this + * DB implementation, fills "*value" with its current value and returns + * true. Otherwise returns false. + * + *

Valid property names include: + *

    + *
  • "rocksdb.num-files-at-level<N>" - return the number of files at level <N>, + * where <N> is an ASCII representation of a level number (e.g. "0").
  • + *
  • "rocksdb.stats" - returns a multi-line string that describes statistics + * about the internal operation of the DB.
  • + *
  • "rocksdb.sstables" - returns a multi-line string that describes all + * of the sstables that make up the db contents.
  • + *
+ * + * @param property to be fetched. See above for examples + * @return property value + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public String getProperty(String property) throws RocksDBException { + return getProperty0(nativeHandle_, property, property.length()); + } + + /** + *

Similar to GetProperty(), but only works for a subset of properties whose + * return value is a numerical value. Return the value as long.

+ * + *

Note: As the returned property is of type + * {@code uint64_t} on C++ side the returning value can be negative + * because Java supports in Java 7 only signed long values.

+ * + *

Java 7: To mitigate the problem of the non + * existent unsigned long tpye, values should be encapsulated using + * {@link java.math.BigInteger} to reflect the correct value. The correct + * behavior is guaranteed if {@code 2^64} is added to negative values.

+ * + *

Java 8: In Java 8 the value should be treated as + * unsigned long using provided methods of type {@link Long}.

+ * + * @param property to be fetched. + * + * @return numerical property value. + * + * @throws RocksDBException if an error happens in the underlying native code. + */ + public long getLongProperty(String property) throws RocksDBException { + return getLongProperty(nativeHandle_, property, property.length()); + } + + /** + *

Similar to GetProperty(), but only works for a subset of properties whose + * return value is a numerical value. Return the value as long.

+ * + *

Note: As the returned property is of type + * {@code uint64_t} on C++ side the returning value can be negative + * because Java supports in Java 7 only signed long values.

+ * + *

Java 7: To mitigate the problem of the non + * existent unsigned long tpye, values should be encapsulated using + * {@link java.math.BigInteger} to reflect the correct value. The correct + * behavior is guaranteed if {@code 2^64} is added to negative values.

+ * + *

Java 8: In Java 8 the value should be treated as + * unsigned long using provided methods of type {@link Long}.

+ * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param property to be fetched. + * + * @return numerical property value + * + * @throws RocksDBException if an error happens in the underlying native code. + */ + public long getLongProperty(ColumnFamilyHandle columnFamilyHandle, String property) + throws RocksDBException { + return getLongProperty(nativeHandle_, columnFamilyHandle.nativeHandle_, property, + property.length()); + } + + /** + *

Return a heap-allocated iterator over the contents of the + * database. The result of newIterator() is initially invalid + * (caller must call one of the Seek methods on the iterator + * before using it).

+ * + *

Caller should close the iterator when it is no longer needed. + * The returned iterator should be closed before this db is closed. + *

+ * + * @return instance of iterator object. + */ + public RocksIterator newIterator() { + return new RocksIterator(this, iterator(nativeHandle_)); + } + + /** + *

Return a heap-allocated iterator over the contents of the + * database. The result of newIterator() is initially invalid + * (caller must call one of the Seek methods on the iterator + * before using it).

+ * + *

Caller should close the iterator when it is no longer needed. + * The returned iterator should be closed before this db is closed. + *

+ * + * @param readOptions {@link ReadOptions} instance. + * @return instance of iterator object. + */ + public RocksIterator newIterator(ReadOptions readOptions) { + return new RocksIterator(this, iterator(nativeHandle_, + readOptions.nativeHandle_)); + } + + /** + *

Return a handle to the current DB state. Iterators created with + * this handle will all observe a stable snapshot of the current DB + * state. The caller must call ReleaseSnapshot(result) when the + * snapshot is no longer needed.

+ * + *

nullptr will be returned if the DB fails to take a snapshot or does + * not support snapshot.

+ * + * @return Snapshot {@link Snapshot} instance + */ + public Snapshot getSnapshot() { + long snapshotHandle = getSnapshot(nativeHandle_); + if (snapshotHandle != 0) { + return new Snapshot(snapshotHandle); + } + return null; + } + + /** + * Release a previously acquired snapshot. The caller must not + * use "snapshot" after this call. + * + * @param snapshot {@link Snapshot} instance + */ + public void releaseSnapshot(final Snapshot snapshot) { + if (snapshot != null) { + releaseSnapshot(nativeHandle_, snapshot.nativeHandle_); + } + } + + /** + *

Return a heap-allocated iterator over the contents of the + * database. The result of newIterator() is initially invalid + * (caller must call one of the Seek methods on the iterator + * before using it).

+ * + *

Caller should close the iterator when it is no longer needed. + * The returned iterator should be closed before this db is closed. + *

+ * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @return instance of iterator object. + */ + public RocksIterator newIterator(ColumnFamilyHandle columnFamilyHandle) { + return new RocksIterator(this, iteratorCF(nativeHandle_, + columnFamilyHandle.nativeHandle_)); + } + + /** + *

Return a heap-allocated iterator over the contents of the + * database. The result of newIterator() is initially invalid + * (caller must call one of the Seek methods on the iterator + * before using it).

+ * + *

Caller should close the iterator when it is no longer needed. + * The returned iterator should be closed before this db is closed. + *

+ * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param readOptions {@link ReadOptions} instance. + * @return instance of iterator object. + */ + public RocksIterator newIterator(ColumnFamilyHandle columnFamilyHandle, + ReadOptions readOptions) { + return new RocksIterator(this, iteratorCF(nativeHandle_, + columnFamilyHandle.nativeHandle_, readOptions.nativeHandle_)); + } + + /** + * Returns iterators from a consistent database state across multiple + * column families. Iterators are heap allocated and need to be deleted + * before the db is deleted + * + * @param columnFamilyHandleList {@link java.util.List} containing + * {@link org.rocksdb.ColumnFamilyHandle} instances. + * @return {@link java.util.List} containing {@link org.rocksdb.RocksIterator} + * instances + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public List newIterators( + List columnFamilyHandleList) throws RocksDBException { + return newIterators(columnFamilyHandleList, new ReadOptions()); + } + + /** + * Returns iterators from a consistent database state across multiple + * column families. Iterators are heap allocated and need to be deleted + * before the db is deleted + * + * @param columnFamilyHandleList {@link java.util.List} containing + * {@link org.rocksdb.ColumnFamilyHandle} instances. + * @param readOptions {@link ReadOptions} instance. + * @return {@link java.util.List} containing {@link org.rocksdb.RocksIterator} + * instances + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public List newIterators( + List columnFamilyHandleList, + ReadOptions readOptions) throws RocksDBException { + List iterators = + new ArrayList<>(columnFamilyHandleList.size()); + + long[] iteratorRefs = iterators(nativeHandle_, columnFamilyHandleList, + readOptions.nativeHandle_); + for (int i=0; iFlush all memory table data.

+ * + *

Note: it must be ensured that the FlushOptions instance + * is not GC'ed before this method finishes. If the wait parameter is + * set to false, flush processing is asynchronous.

+ * + * @param flushOptions {@link org.rocksdb.FlushOptions} instance. + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public void flush(FlushOptions flushOptions) + throws RocksDBException { + flush(nativeHandle_, flushOptions.nativeHandle_); + } + + /** + *

Flush all memory table data.

+ * + *

Note: it must be ensured that the FlushOptions instance + * is not GC'ed before this method finishes. If the wait parameter is + * set to false, flush processing is asynchronous.

+ * + * @param flushOptions {@link org.rocksdb.FlushOptions} instance. + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} instance. + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public void flush(FlushOptions flushOptions, + ColumnFamilyHandle columnFamilyHandle) throws RocksDBException { + flush(nativeHandle_, flushOptions.nativeHandle_, + columnFamilyHandle.nativeHandle_); + } + + /** + *

Range compaction of database.

+ *

Note: After the database has been compacted, + * all data will have been pushed down to the last level containing + * any data.

+ * + *

See also

+ *
    + *
  • {@link #compactRange(boolean, int, int)}
  • + *
  • {@link #compactRange(byte[], byte[])}
  • + *
  • {@link #compactRange(byte[], byte[], boolean, int, int)}
  • + *
+ * + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public void compactRange() throws RocksDBException { + compactRange0(nativeHandle_, false, -1, 0); + } + + /** + *

Range compaction of database.

+ *

Note: After the database has been compacted, + * all data will have been pushed down to the last level containing + * any data.

+ * + *

See also

+ *
    + *
  • {@link #compactRange()}
  • + *
  • {@link #compactRange(boolean, int, int)}
  • + *
  • {@link #compactRange(byte[], byte[], boolean, int, int)}
  • + *
+ * + * @param begin start of key range (included in range) + * @param end end of key range (excluded from range) + * + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public void compactRange(byte[] begin, byte[] end) + throws RocksDBException { + compactRange0(nativeHandle_, begin, begin.length, end, + end.length, false, -1, 0); + } + + /** + *

Range compaction of database.

+ *

Note: After the database has been compacted, + * all data will have been pushed down to the last level containing + * any data.

+ * + *

Compaction outputs should be placed in options.db_paths + * [target_path_id]. Behavior is undefined if target_path_id is + * out of range.

+ * + *

See also

+ *
    + *
  • {@link #compactRange()}
  • + *
  • {@link #compactRange(byte[], byte[])}
  • + *
  • {@link #compactRange(byte[], byte[], boolean, int, int)}
  • + *
+ * + * @param reduce_level reduce level after compaction + * @param target_level target level to compact to + * @param target_path_id the target path id of output path + * + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public void compactRange(boolean reduce_level, int target_level, + int target_path_id) throws RocksDBException { + compactRange0(nativeHandle_, reduce_level, + target_level, target_path_id); + } + + + /** + *

Range compaction of database.

+ *

Note: After the database has been compacted, + * all data will have been pushed down to the last level containing + * any data.

+ * + *

Compaction outputs should be placed in options.db_paths + * [target_path_id]. Behavior is undefined if target_path_id is + * out of range.

+ * + *

See also

+ *
    + *
  • {@link #compactRange()}
  • + *
  • {@link #compactRange(boolean, int, int)}
  • + *
  • {@link #compactRange(byte[], byte[])}
  • + *
+ * + * @param begin start of key range (included in range) + * @param end end of key range (excluded from range) + * @param reduce_level reduce level after compaction + * @param target_level target level to compact to + * @param target_path_id the target path id of output path + * + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public void compactRange(byte[] begin, byte[] end, + boolean reduce_level, int target_level, int target_path_id) + throws RocksDBException { + compactRange0(nativeHandle_, begin, begin.length, end, end.length, + reduce_level, target_level, target_path_id); + } + + /** + *

Range compaction of column family.

+ *

Note: After the database has been compacted, + * all data will have been pushed down to the last level containing + * any data.

+ * + *

See also

+ *
    + *
  • + * {@link #compactRange(ColumnFamilyHandle, boolean, int, int)} + *
  • + *
  • + * {@link #compactRange(ColumnFamilyHandle, byte[], byte[])} + *
  • + *
  • + * {@link #compactRange(ColumnFamilyHandle, byte[], byte[], + * boolean, int, int)} + *
  • + *
+ * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance. + * + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public void compactRange(ColumnFamilyHandle columnFamilyHandle) + throws RocksDBException { + compactRange(nativeHandle_, false, -1, 0, + columnFamilyHandle.nativeHandle_); + } + + /** + *

Range compaction of column family.

+ *

Note: After the database has been compacted, + * all data will have been pushed down to the last level containing + * any data.

+ * + *

See also

+ *
    + *
  • {@link #compactRange(ColumnFamilyHandle)}
  • + *
  • + * {@link #compactRange(ColumnFamilyHandle, boolean, int, int)} + *
  • + *
  • + * {@link #compactRange(ColumnFamilyHandle, byte[], byte[], + * boolean, int, int)} + *
  • + *
+ * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance. + * @param begin start of key range (included in range) + * @param end end of key range (excluded from range) + * + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public void compactRange(ColumnFamilyHandle columnFamilyHandle, + byte[] begin, byte[] end) throws RocksDBException { + compactRange(nativeHandle_, begin, begin.length, end, end.length, + false, -1, 0, columnFamilyHandle.nativeHandle_); + } + + /** + *

Range compaction of column family.

+ *

Note: After the database has been compacted, + * all data will have been pushed down to the last level containing + * any data.

+ * + *

Compaction outputs should be placed in options.db_paths + * [target_path_id]. Behavior is undefined if target_path_id is + * out of range.

+ * + *

See also

+ *
    + *
  • {@link #compactRange(ColumnFamilyHandle)}
  • + *
  • + * {@link #compactRange(ColumnFamilyHandle, byte[], byte[])} + *
  • + *
  • + * {@link #compactRange(ColumnFamilyHandle, byte[], byte[], + * boolean, int, int)} + *
  • + *
+ * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance. + * @param reduce_level reduce level after compaction + * @param target_level target level to compact to + * @param target_path_id the target path id of output path + * + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public void compactRange(ColumnFamilyHandle columnFamilyHandle, + boolean reduce_level, int target_level, int target_path_id) + throws RocksDBException { + compactRange(nativeHandle_, reduce_level, target_level, + target_path_id, columnFamilyHandle.nativeHandle_); + } + + /** + *

Range compaction of column family.

+ *

Note: After the database has been compacted, + * all data will have been pushed down to the last level containing + * any data.

+ * + *

Compaction outputs should be placed in options.db_paths + * [target_path_id]. Behavior is undefined if target_path_id is + * out of range.

+ * + *

See also

+ *
    + *
  • {@link #compactRange(ColumnFamilyHandle)}
  • + *
  • + * {@link #compactRange(ColumnFamilyHandle, boolean, int, int)} + *
  • + *
  • + * {@link #compactRange(ColumnFamilyHandle, byte[], byte[])} + *
  • + *
+ * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance. + * @param begin start of key range (included in range) + * @param end end of key range (excluded from range) + * @param reduce_level reduce level after compaction + * @param target_level target level to compact to + * @param target_path_id the target path id of output path + * + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public void compactRange(ColumnFamilyHandle columnFamilyHandle, + byte[] begin, byte[] end, boolean reduce_level, int target_level, + int target_path_id) throws RocksDBException { + compactRange(nativeHandle_, begin, begin.length, end, end.length, + reduce_level, target_level, target_path_id, + columnFamilyHandle.nativeHandle_); + } + + /** + *

The sequence number of the most recent transaction.

+ * + * @return sequence number of the most + * recent transaction. + */ + public long getLatestSequenceNumber() { + return getLatestSequenceNumber(nativeHandle_); + } + + /** + *

Prevent file deletions. Compactions will continue to occur, + * but no obsolete files will be deleted. Calling this multiple + * times have the same effect as calling it once.

+ * + * @throws RocksDBException thrown if operation was not performed + * successfully. + */ + public void disableFileDeletions() throws RocksDBException { + disableFileDeletions(nativeHandle_); + } + + /** + *

Allow compactions to delete obsolete files. + * If force == true, the call to EnableFileDeletions() + * will guarantee that file deletions are enabled after + * the call, even if DisableFileDeletions() was called + * multiple times before.

+ * + *

If force == false, EnableFileDeletions will only + * enable file deletion after it's been called at least + * as many times as DisableFileDeletions(), enabling + * the two methods to be called by two threads + * concurrently without synchronization + * -- i.e., file deletions will be enabled only after both + * threads call EnableFileDeletions()

+ * + * @param force boolean value described above. + * + * @throws RocksDBException thrown if operation was not performed + * successfully. + */ + public void enableFileDeletions(boolean force) + throws RocksDBException { + enableFileDeletions(nativeHandle_, force); + } + + /** + *

Returns an iterator that is positioned at a write-batch containing + * seq_number. If the sequence number is non existent, it returns an iterator + * at the first available seq_no after the requested seq_no.

+ * + *

Must set WAL_ttl_seconds or WAL_size_limit_MB to large values to + * use this api, else the WAL files will get + * cleared aggressively and the iterator might keep getting invalid before + * an update is read.

+ * + * @param sequenceNumber sequence number offset + * + * @return {@link org.rocksdb.TransactionLogIterator} instance. + * + * @throws org.rocksdb.RocksDBException if iterator cannot be retrieved + * from native-side. + */ + public TransactionLogIterator getUpdatesSince(long sequenceNumber) + throws RocksDBException { + return new TransactionLogIterator( + getUpdatesSince(nativeHandle_, sequenceNumber)); + } + + /** + * Private constructor. + */ + protected RocksDB() { + super(); + } + + // native methods + protected native void open( + long optionsHandle, String path) throws RocksDBException; + protected native List open(long optionsHandle, String path, + List columnFamilyDescriptors, + int columnFamilyDescriptorsLength) + throws RocksDBException; + protected native static List listColumnFamilies( + long optionsHandle, String path) throws RocksDBException; + protected native void openROnly( + long optionsHandle, String path) throws RocksDBException; + protected native List openROnly( + long optionsHandle, String path, + List columnFamilyDescriptors, + int columnFamilyDescriptorsLength) throws RocksDBException; + protected native void put( + long handle, byte[] key, int keyLen, + byte[] value, int valueLen) throws RocksDBException; + protected native void put( + long handle, byte[] key, int keyLen, + byte[] value, int valueLen, long cfHandle) throws RocksDBException; + protected native void put( + long handle, long writeOptHandle, + byte[] key, int keyLen, + byte[] value, int valueLen) throws RocksDBException; + protected native void put( + long handle, long writeOptHandle, + byte[] key, int keyLen, + byte[] value, int valueLen, long cfHandle) throws RocksDBException; + protected native void write0( + long writeOptHandle, long wbHandle) throws RocksDBException; + protected native void write1( + long writeOptHandle, long wbwiHandle) throws RocksDBException; + protected native boolean keyMayExist(byte[] key, int keyLen, + StringBuffer stringBuffer); + protected 
native boolean keyMayExist(byte[] key, int keyLen, + long cfHandle, StringBuffer stringBuffer); + protected native boolean keyMayExist(long optionsHandle, byte[] key, int keyLen, + StringBuffer stringBuffer); + protected native boolean keyMayExist(long optionsHandle, byte[] key, int keyLen, + long cfHandle, StringBuffer stringBuffer); + protected native void merge( + long handle, byte[] key, int keyLen, + byte[] value, int valueLen) throws RocksDBException; + protected native void merge( + long handle, byte[] key, int keyLen, + byte[] value, int valueLen, long cfHandle) throws RocksDBException; + protected native void merge( + long handle, long writeOptHandle, + byte[] key, int keyLen, + byte[] value, int valueLen) throws RocksDBException; + protected native void merge( + long handle, long writeOptHandle, + byte[] key, int keyLen, + byte[] value, int valueLen, long cfHandle) throws RocksDBException; + protected native int get( + long handle, byte[] key, int keyLen, + byte[] value, int valueLen) throws RocksDBException; + protected native int get( + long handle, byte[] key, int keyLen, + byte[] value, int valueLen, long cfHandle) throws RocksDBException; + protected native int get( + long handle, long readOptHandle, byte[] key, int keyLen, + byte[] value, int valueLen) throws RocksDBException; + protected native int get( + long handle, long readOptHandle, byte[] key, int keyLen, + byte[] value, int valueLen, long cfHandle) throws RocksDBException; + protected native List multiGet( + long dbHandle, List keys, int keysCount); + protected native List multiGet( + long dbHandle, List keys, int keysCount, List + cfHandles); + protected native List multiGet( + long dbHandle, long rOptHandle, List keys, int keysCount); + protected native List multiGet( + long dbHandle, long rOptHandle, List keys, int keysCount, + List cfHandles); + protected native byte[] get( + long handle, byte[] key, int keyLen) throws RocksDBException; + protected native byte[] get( + long handle, 
byte[] key, int keyLen, long cfHandle) throws RocksDBException; + protected native byte[] get( + long handle, long readOptHandle, + byte[] key, int keyLen) throws RocksDBException; + protected native byte[] get( + long handle, long readOptHandle, + byte[] key, int keyLen, long cfHandle) throws RocksDBException; + protected native void remove( + long handle, byte[] key, int keyLen) throws RocksDBException; + protected native void remove( + long handle, byte[] key, int keyLen, long cfHandle) throws RocksDBException; + protected native void remove( + long handle, long writeOptHandle, + byte[] key, int keyLen) throws RocksDBException; + protected native void remove( + long handle, long writeOptHandle, + byte[] key, int keyLen, long cfHandle) throws RocksDBException; + protected native String getProperty0(long nativeHandle, + String property, int propertyLength) throws RocksDBException; + protected native String getProperty0(long nativeHandle, long cfHandle, + String property, int propertyLength) throws RocksDBException; + protected native long getLongProperty(long nativeHandle, + String property, int propertyLength) throws RocksDBException; + protected native long getLongProperty(long nativeHandle, long cfHandle, + String property, int propertyLength) throws RocksDBException; + protected native long iterator(long handle); + protected native long iterator(long handle, long readOptHandle); + protected native long iteratorCF(long handle, long cfHandle); + protected native long iteratorCF(long handle, long cfHandle, + long readOptHandle); + protected native long[] iterators(long handle, + List columnFamilyNames, long readOptHandle) + throws RocksDBException; + protected native long getSnapshot(long nativeHandle); + protected native void releaseSnapshot( + long nativeHandle, long snapshotHandle); + private native void disposeInternal(long handle); + private native long getDefaultColumnFamily(long handle); + private native long createColumnFamily(long handle, + 
ColumnFamilyDescriptor columnFamilyDescriptor) throws RocksDBException; + private native void dropColumnFamily(long handle, long cfHandle) throws RocksDBException; + private native void flush(long handle, long flushOptHandle) + throws RocksDBException; + private native void flush(long handle, long flushOptHandle, + long cfHandle) throws RocksDBException; + private native void compactRange0(long handle, boolean reduce_level, int target_level, + int target_path_id) throws RocksDBException; + private native void compactRange0(long handle, byte[] begin, int beginLen, byte[] end, + int endLen, boolean reduce_level, int target_level, int target_path_id) + throws RocksDBException; + private native void compactRange(long handle, boolean reduce_level, int target_level, + int target_path_id, long cfHandle) throws RocksDBException; + private native void compactRange(long handle, byte[] begin, int beginLen, byte[] end, + int endLen, boolean reduce_level, int target_level, int target_path_id, + long cfHandle) throws RocksDBException; + private native long getLatestSequenceNumber(long handle); + private native void disableFileDeletions(long handle) + throws RocksDBException; + private native void enableFileDeletions(long handle, + boolean force) throws RocksDBException; + private native long getUpdatesSince(long handle, long sequenceNumber) + throws RocksDBException; + + protected DBOptionsInterface options_; +} diff --git a/java/org/rocksdb/RocksDBException.java b/java/src/main/java/org/rocksdb/RocksDBException.java similarity index 97% rename from java/org/rocksdb/RocksDBException.java rename to java/src/main/java/org/rocksdb/RocksDBException.java index acc93669e..c4fe72bdd 100644 --- a/java/org/rocksdb/RocksDBException.java +++ b/java/src/main/java/org/rocksdb/RocksDBException.java @@ -5,8 +5,6 @@ package org.rocksdb; -import java.util.*; - /** * A RocksDBException encapsulates the error of an operation. 
This exception * type is used to describe an internal error from the c++ rocksdb library. diff --git a/java/org/rocksdb/RocksEnv.java b/java/src/main/java/org/rocksdb/RocksEnv.java similarity index 52% rename from java/org/rocksdb/RocksEnv.java rename to java/src/main/java/org/rocksdb/RocksEnv.java index ce73ba654..bb19eb732 100644 --- a/java/org/rocksdb/RocksEnv.java +++ b/java/src/main/java/org/rocksdb/RocksEnv.java @@ -6,11 +6,11 @@ package org.rocksdb; /** - * A RocksEnv is an interface used by the rocksdb implementation to access - * operating system functionality like the filesystem etc. + *

A RocksEnv is an interface used by the rocksdb implementation to access + * operating system functionality like the filesystem etc.

* - * All Env implementations are safe for concurrent access from - * multiple threads without any external synchronization. + *

All Env implementations are safe for concurrent access from + * multiple threads without any external synchronization.

*/ public class RocksEnv extends RocksObject { public static final int FLUSH_POOL = 0; @@ -18,39 +18,48 @@ public class RocksEnv extends RocksObject { static { default_env_ = new RocksEnv(getDefaultEnvInternal()); + } private static native long getDefaultEnvInternal(); /** - * Returns the default environment suitable for the current operating - * system. + *

Returns the default environment suitable for the current operating + * system.

* - * The result of getDefault() is a singleton whose ownership belongs - * to rocksdb c++. As a result, the returned RocksEnv will not + *

The result of {@code getDefault()} is a singleton whose ownership + * belongs to rocksdb c++. As a result, the returned RocksEnv will not * have the ownership of its c++ resource, and calling its dispose() - * will be no-op. + * will be no-op.

+ * + * @return the default {@link org.rocksdb.RocksEnv} instance. */ public static RocksEnv getDefault() { return default_env_; } /** - * Sets the number of background worker threads of the flush pool - * for this environment. - * default number: 1 + *

Sets the number of background worker threads of the flush pool + * for this environment.

+ *

Default number: 1

+ * + * @param num the number of threads + * + * @return current {@link org.rocksdb.RocksEnv} instance. */ public RocksEnv setBackgroundThreads(int num) { return setBackgroundThreads(num, FLUSH_POOL); } /** - * Sets the number of background worker threads of the specified thread - * pool for this environment. + *

Sets the number of background worker threads of the specified thread + * pool for this environment.

* * @param num the number of threads * @param poolID the id to specified a thread pool. Should be either * FLUSH_POOL or COMPACTION_POOL. - * Default number: 1 + * + *

Default number: 1

+ * @return current {@link org.rocksdb.RocksEnv} instance. */ public RocksEnv setBackgroundThreads(int num, int poolID) { setBackgroundThreads(nativeHandle_, num, poolID); @@ -60,11 +69,13 @@ public class RocksEnv extends RocksObject { long handle, int num, int priority); /** - * Returns the length of the queue associated with the specified - * thread pool. + *

Returns the length of the queue associated with the specified + * thread pool.

* * @param poolID the id to specified a thread pool. Should be either * FLUSH_POOL or COMPACTION_POOL. + * + * @return the thread pool queue length. */ public int getThreadPoolQueueLen(int poolID) { return getThreadPoolQueueLen(nativeHandle_, poolID); @@ -72,11 +83,13 @@ public class RocksEnv extends RocksObject { private native int getThreadPoolQueueLen(long handle, int poolID); /** - * Package-private constructor that uses the specified native handle - * to construct a RocksEnv. Note that the ownership of the input handle + *

Package-private constructor that uses the specified native handle + * to construct a RocksEnv.

+ * + *

Note that the ownership of the input handle * belongs to the caller, and the newly created RocksEnv will not take - * the ownership of the input handle. As a result, calling dispose() - * of the created RocksEnv will be no-op. + * the ownership of the input handle. As a result, calling + * {@code dispose()} of the created RocksEnv will be no-op.

*/ RocksEnv(long handle) { super(); @@ -85,18 +98,19 @@ public class RocksEnv extends RocksObject { } /** - * The helper function of dispose() which all subclasses of RocksObject - * must implement to release their associated C++ resource. + * The helper function of {@link #dispose()} which all subclasses of + * {@link RocksObject} must implement to release their associated C++ + * resource. */ - protected void disposeInternal() { + @Override protected void disposeInternal() { disposeInternal(nativeHandle_); } private native void disposeInternal(long handle); /** - * The static default RocksEnv. The ownership of its native handle + *

The static default RocksEnv. The ownership of its native handle * belongs to rocksdb c++ and is not able to be released on the Java - * side. + * side.

*/ static RocksEnv default_env_; } diff --git a/java/src/main/java/org/rocksdb/RocksIterator.java b/java/src/main/java/org/rocksdb/RocksIterator.java new file mode 100644 index 000000000..bb9a6e697 --- /dev/null +++ b/java/src/main/java/org/rocksdb/RocksIterator.java @@ -0,0 +1,64 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + *

An iterator that yields a sequence of key/value pairs from a source. + * Multiple implementations are provided by this library. + * In particular, iterators are provided + * to access the contents of a Table or a DB.

+ * + *

Multiple threads can invoke const methods on an RocksIterator without + * external synchronization, but if any of the threads may call a + * non-const method, all threads accessing the same RocksIterator must use + * external synchronization.

+ * + * @see org.rocksdb.RocksObject + */ +public class RocksIterator extends AbstractRocksIterator { + protected RocksIterator(RocksDB rocksDB, long nativeHandle) { + super(rocksDB, nativeHandle); + } + + /** + *

Return the key for the current entry. The underlying storage for + * the returned slice is valid only until the next modification of + * the iterator.

+ * + *

REQUIRES: {@link #isValid()}

+ * + * @return key for the current entry. + */ + public byte[] key() { + assert(isInitialized()); + return key0(nativeHandle_); + } + + /** + *

Return the value for the current entry. The underlying storage for + * the returned slice is valid only until the next modification of + * the iterator.

+ * + *

REQUIRES: !AtEnd() && !AtStart()

+ * @return value for the current entry. + */ + public byte[] value() { + assert(isInitialized()); + return value0(nativeHandle_); + } + + @Override final native void disposeInternal(long handle); + @Override final native boolean isValid0(long handle); + @Override final native void seekToFirst0(long handle); + @Override final native void seekToLast0(long handle); + @Override final native void next0(long handle); + @Override final native void prev0(long handle); + @Override final native void seek0(long handle, byte[] target, int targetLen); + @Override final native void status0(long handle) throws RocksDBException; + + private native byte[] key0(long handle); + private native byte[] value0(long handle); +} diff --git a/java/src/main/java/org/rocksdb/RocksIteratorInterface.java b/java/src/main/java/org/rocksdb/RocksIteratorInterface.java new file mode 100644 index 000000000..b5cc90afb --- /dev/null +++ b/java/src/main/java/org/rocksdb/RocksIteratorInterface.java @@ -0,0 +1,80 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + *

Defines the interface for an Iterator which provides + * access to data one entry at a time. Multiple implementations + * are provided by this library. In particular, iterators are provided + * to access the contents of a DB and Write Batch.

+ * + *

Multiple threads can invoke const methods on an RocksIterator without + * external synchronization, but if any of the threads may call a + * non-const method, all threads accessing the same RocksIterator must use + * external synchronization.

+ * + * @see org.rocksdb.RocksObject + */ +public interface RocksIteratorInterface { + + /** + *

An iterator is either positioned at an entry, or + * not valid. This method returns true if the iterator is valid.

+ * + * @return true if iterator is valid. + */ + public boolean isValid(); + + /** + *

Position at the first entry in the source. The iterator is Valid() + * after this call if the source is not empty.

+ */ + public void seekToFirst(); + + /** + *

Position at the last entry in the source. The iterator is + * valid after this call if the source is not empty.

+ */ + public void seekToLast(); + + /** + *

Position at the first entry in the source whose key is that or + * past target.

+ * + *

The iterator is valid after this call if the source contains + * a key that comes at or past target.

+ * + * @param target byte array describing a key or a + * key prefix to seek for. + */ + public void seek(byte[] target); + + /** + *

Moves to the next entry in the source. After this call, Valid() is + * true if the iterator was not positioned at the last entry in the source.

+ * + *

REQUIRES: {@link #isValid()}

+ */ + public void next(); + + /** + *

Moves to the previous entry in the source. After this call, Valid() is + * true if the iterator was not positioned at the first entry in source.

+ * + *

REQUIRES: {@link #isValid()}

+ */ + public void prev(); + + /** + *

If an error has occurred, return it. Else return an ok status. + * If non-blocking IO is requested and this operation cannot be + * satisfied without doing some IO, then this returns Status::Incomplete().

+ * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public void status() throws RocksDBException; +} diff --git a/java/org/rocksdb/RocksObject.java b/java/src/main/java/org/rocksdb/RocksObject.java similarity index 52% rename from java/org/rocksdb/RocksObject.java rename to java/src/main/java/org/rocksdb/RocksObject.java index 353918d2e..6e24a1385 100644 --- a/java/org/rocksdb/RocksObject.java +++ b/java/src/main/java/org/rocksdb/RocksObject.java @@ -7,16 +7,20 @@ package org.rocksdb; /** * RocksObject is the base-class of all RocksDB classes that has a pointer to - * some c++ rocksdb object. + * some c++ {@code rocksdb} object. * - * RocksObject has dispose() function, which releases its associated c++ resource. + *

+ * RocksObject has {@code dispose()} function, which releases its associated c++ + * resource.

+ *

* This function can be either called manually, or being called automatically - * during the regular Java GC process. However, since Java may wrongly assume a + * during the regular Java GC process. However, since Java may wrongly assume a * RocksObject only contains a long member variable and think it is small in size, - * Java may give RocksObject low priority in the GC process. For this, it is - * suggested to call dispose() manually. However, it is safe to let RocksObject go - * out-of-scope without manually calling dispose() as dispose() will be called - * in the finalizer during the regular GC process. + * Java may give {@code RocksObject} low priority in the GC process. For this, it is + * suggested to call {@code dispose()} manually. However, it is safe to let + * {@code RocksObject} go out-of-scope without manually calling {@code dispose()} + * as {@code dispose()} will be called in the finalizer during the + * regular GC process.

*/ public abstract class RocksObject { protected RocksObject() { @@ -26,16 +30,18 @@ public abstract class RocksObject { /** * Release the c++ object manually pointed by the native handle. - * - * Note that dispose() will also be called during the GC process - * if it was not called before its RocksObject went out-of-scope. + *

+ * Note that {@code dispose()} will also be called during the GC process + * if it was not called before its {@code RocksObject} went out-of-scope. * However, since Java may wrongly wrongly assume those objects are * small in that they seems to only hold a long variable. As a result, * they might have low priority in the GC process. To prevent this, - * it is suggested to call dispose() manually. - * - * Note that once an instance of RocksObject has been disposed, + * it is suggested to call {@code dispose()} manually. + *

+ *

+ * Note that once an instance of {@code RocksObject} has been disposed, * calling its function will lead undefined behavior. + *

*/ public final synchronized void dispose() { if (isOwningNativeHandle() && isInitialized()) { @@ -46,40 +52,41 @@ public abstract class RocksObject { } /** - * The helper function of dispose() which all subclasses of RocksObject - * must implement to release their associated C++ resource. + * The helper function of {@code dispose()} which all subclasses of + * {@code RocksObject} must implement to release their associated + * C++ resource. */ protected abstract void disposeInternal(); /** * Revoke ownership of the native object. - * + *

* This will prevent the object from attempting to delete the underlying * native object in its finalizer. This must be used when another object * takes over ownership of the native object or both will attempt to delete * the underlying object when garbage collected. - * - * When disOwnNativeHandle() is called, dispose() will simply set nativeHandle_ - * to 0 without releasing its associated C++ resource. As a result, - * incorrectly use this function may cause memory leak, and this function call - * will not affect the return value of isInitialized(). - * - * @see dispose() - * @see isInitialized() + *

+ * When {@code disOwnNativeHandle()} is called, {@code dispose()} will simply set + * {@code nativeHandle_} to 0 without releasing its associated C++ resource. + * As a result, incorrectly use this function may cause memory leak, and this + * function call will not affect the return value of {@code isInitialized()}. + *

+ * @see #dispose() + * @see #isInitialized() */ protected void disOwnNativeHandle() { owningHandle_ = false; } /** - * Returns true if the current RocksObject is responsable to release its - * native handle. + * Returns true if the current {@code RocksObject} is responsible to release + * its native handle. * - * @return true if the current RocksObject is responsible to release its - * native handle. + * @return true if the current {@code RocksObject} is responsible to release + * its native handle. * - * @see disOwnNativeHandle() - * @see dispose() + * @see #disOwnNativeHandle() + * @see #dispose() */ protected boolean isOwningNativeHandle() { return owningHandle_; @@ -90,18 +97,19 @@ public abstract class RocksObject { * * @return true if the associated native handle has been initialized. * - * @see dispose() + * @see #dispose() */ protected boolean isInitialized() { return (nativeHandle_ != 0); } /** - * Simply calls dispose() and release its c++ resource if it has not + * Simply calls {@code dispose()} and release its c++ resource if it has not * yet released. */ - @Override protected void finalize() { + @Override protected void finalize() throws Throwable { dispose(); + super.finalize(); } /** @@ -110,8 +118,8 @@ public abstract class RocksObject { protected long nativeHandle_; /** - * A flag indicating whether the current RocksObject is responsible to - * release the c++ object stored in its nativeHandle_. + * A flag indicating whether the current {@code RocksObject} is responsible to + * release the c++ object stored in its {@code nativeHandle_}. */ private boolean owningHandle_; } diff --git a/java/src/main/java/org/rocksdb/SkipListMemTableConfig.java b/java/src/main/java/org/rocksdb/SkipListMemTableConfig.java new file mode 100644 index 000000000..d26fd9d32 --- /dev/null +++ b/java/src/main/java/org/rocksdb/SkipListMemTableConfig.java @@ -0,0 +1,49 @@ +package org.rocksdb; + +/** + * The config for skip-list memtable representation. 
+ */ +public class SkipListMemTableConfig extends MemTableConfig { + + public static final long DEFAULT_LOOKAHEAD = 0; + + /** + * SkipListMemTableConfig constructor + */ + public SkipListMemTableConfig() { + lookahead_ = DEFAULT_LOOKAHEAD; + } + + /** + * Sets lookahead for SkipList + * + * @param lookahead If non-zero, each iterator's seek operation + * will start the search from the previously visited record + * (doing at most 'lookahead' steps). This is an + * optimization for the access pattern including many + * seeks with consecutive keys. + * @return the current instance of SkipListMemTableConfig + */ + public SkipListMemTableConfig setLookahead(long lookahead) { + lookahead_ = lookahead; + return this; + } + + /** + * Returns the currently set lookahead value. + * + * @return lookahead value + */ + public long lookahead() { + return lookahead_; + } + + + @Override protected long newMemTableFactoryHandle() { + return newMemTableFactoryHandle0(lookahead_); + } + + private native long newMemTableFactoryHandle0(long lookahead); + + private long lookahead_; +} diff --git a/java/src/main/java/org/rocksdb/Slice.java b/java/src/main/java/org/rocksdb/Slice.java new file mode 100644 index 000000000..d26490e5f --- /dev/null +++ b/java/src/main/java/org/rocksdb/Slice.java @@ -0,0 +1,88 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + *

Base class for slices which will receive + * byte[] based access to the underlying data.

+ * + *

byte[] backed slices typically perform better with + * small keys and values. When using larger keys and + * values consider using {@link org.rocksdb.DirectSlice}

+ */ +public class Slice extends AbstractSlice { + /** + *

Called from JNI to construct a new Java Slice + * without an underlying C++ object set + * at creation time.

+ * + *

Note: You should be aware that + * {@see org.rocksdb.RocksObject#disOwnNativeHandle()} is intentionally + * called from the default Slice constructor, and that it is marked as + * private. This is so that developers cannot construct their own default + * Slice objects (at present). As developers cannot construct their own + * Slice objects through this, they are not creating underlying C++ Slice + * objects, and so there is nothing to free (dispose) from Java.

+ */ + private Slice() { + super(); + disOwnNativeHandle(); + } + + /** + *

Constructs a slice where the data is taken from + * a String.

+ * + * @param str String value. + */ + public Slice(final String str) { + super(); + createNewSliceFromString(str); + } + + /** + *

Constructs a slice where the data is a copy of + * the byte array from a specific offset.

+ * + * @param data byte array. + * @param offset offset within the byte array. + */ + public Slice(final byte[] data, final int offset) { + super(); + createNewSlice0(data, offset); + } + + /** + *

Constructs a slice where the data is a copy of + * the byte array.

+ * + * @param data byte array. + */ + public Slice(final byte[] data) { + super(); + createNewSlice1(data); + } + + /** + *

Deletes underlying C++ slice pointer + * and any buffered data.

+ * + *

+ * Note that this function should be called only after all + * RocksDB instances referencing the slice are closed. + * Otherwise an undefined behavior will occur.

+ */ + @Override + protected void disposeInternal() { + disposeInternalBuf(nativeHandle_); + super.disposeInternal(); + } + + @Override protected final native byte[] data0(long handle); + private native void createNewSlice0(byte[] data, int length); + private native void createNewSlice1(byte[] data); + private native void disposeInternalBuf(long handle); +} diff --git a/java/src/main/java/org/rocksdb/Snapshot.java b/java/src/main/java/org/rocksdb/Snapshot.java new file mode 100644 index 000000000..1842dddd3 --- /dev/null +++ b/java/src/main/java/org/rocksdb/Snapshot.java @@ -0,0 +1,37 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Snapshot of database + */ +public class Snapshot extends RocksObject { + Snapshot(long nativeHandle) { + super(); + nativeHandle_ = nativeHandle; + } + + /** + * Return the associated sequence number; + * + * @return the associated sequence number of + * this snapshot. + */ + public long getSequenceNumber() { + assert(isInitialized()); + return getSequenceNumber(nativeHandle_); + } + + /** + * Dont release C++ Snapshot pointer. The pointer + * to the snapshot is released by the database + * instance. 
+ */ + @Override protected void disposeInternal() { + } + + private native long getSequenceNumber(long handle); +} diff --git a/java/org/rocksdb/Statistics.java b/java/src/main/java/org/rocksdb/Statistics.java similarity index 94% rename from java/org/rocksdb/Statistics.java rename to java/src/main/java/org/rocksdb/Statistics.java index bed2b8810..066f3a5b5 100644 --- a/java/org/rocksdb/Statistics.java +++ b/java/src/main/java/org/rocksdb/Statistics.java @@ -24,9 +24,8 @@ public class Statistics { public HistogramData geHistogramData(HistogramType histogramType) { assert(isInitialized()); - HistogramData hist = geHistogramData0( + return geHistogramData0( histogramType.getValue(), statsHandle_); - return hist; } private boolean isInitialized() { diff --git a/java/org/rocksdb/StatisticsCollector.java b/java/src/main/java/org/rocksdb/StatisticsCollector.java similarity index 89% rename from java/org/rocksdb/StatisticsCollector.java rename to java/src/main/java/org/rocksdb/StatisticsCollector.java index 29815c46d..be8f26a14 100644 --- a/java/org/rocksdb/StatisticsCollector.java +++ b/java/src/main/java/org/rocksdb/StatisticsCollector.java @@ -6,20 +6,18 @@ package org.rocksdb; import java.util.List; -import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.Executors; import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicBoolean; /** - * Helper class to collect DB statistics periodically at a period specified in + *

Helper class to collect DB statistics periodically at a period specified in * constructor. Callback function (provided in constructor) is called with - * every statistics collection. + * every statistics collection.

* - * Caller should call start() to start statistics collection. Shutdown() should + *

Caller should call start() to start statistics collection. Shutdown() should * be called to stop stats collection and should be called before statistics ( - * provided in constructor) reference has been disposed. + * provided in constructor) reference has been disposed.

*/ public class StatisticsCollector { private final List _statsCollectorInputList; @@ -29,9 +27,9 @@ public class StatisticsCollector { /** * Constructor for statistics collector. - * + * * @param statsCollectorInputList List of statistics collector input. - * @param statsCollectionIntervalInMilliSeconds Statistics collection time + * @param statsCollectionIntervalInMilliSeconds Statistics collection time * period (specified in milliseconds). */ public StatisticsCollector(List statsCollectorInputList, @@ -48,9 +46,10 @@ public class StatisticsCollector { /** * Shuts down statistics collector. - * + * * @param shutdownTimeout Time in milli-seconds to wait for shutdown before * killing the collection process. + * @throws java.lang.InterruptedException thrown if Threads are interrupted. */ public void shutDown(int shutdownTimeout) throws InterruptedException { _isRunning = false; @@ -70,13 +69,13 @@ public class StatisticsCollector { try { if(Thread.currentThread().isInterrupted()) { break; - } + } for(StatsCollectorInput statsCollectorInput : _statsCollectorInputList) { Statistics statistics = statsCollectorInput.getStatistics(); StatisticsCollectorCallback statsCallback = statsCollectorInput.getCallback(); - + // Collect ticker data for(TickerType ticker : TickerType.values()) { long tickerValue = statistics.getTickerCount(ticker); diff --git a/java/org/rocksdb/StatisticsCollectorCallback.java b/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java similarity index 94% rename from java/org/rocksdb/StatisticsCollectorCallback.java rename to java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java index a955ec216..2ce92c5ee 100644 --- a/java/org/rocksdb/StatisticsCollectorCallback.java +++ b/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java @@ -7,16 +7,14 @@ package org.rocksdb; /** * Callback interface provided to StatisticsCollector. - * + * * Thread safety: - * StatisticsCollector doesn't make any guarantees about thread safety. 
+ * StatisticsCollector doesn't make any guarantees about thread safety. * If the same reference of StatisticsCollectorCallback is passed to multiple - * StatisticsCollector references, then its the responsibility of the + * StatisticsCollector references, then its the responsibility of the * user to make StatisticsCollectorCallback's implementation thread-safe. - * - * @param tickerType - * @param tickerCount -*/ + * + */ public interface StatisticsCollectorCallback { /** * Callback function to get ticker values. diff --git a/java/org/rocksdb/StatsCollectorInput.java b/java/src/main/java/org/rocksdb/StatsCollectorInput.java similarity index 98% rename from java/org/rocksdb/StatsCollectorInput.java rename to java/src/main/java/org/rocksdb/StatsCollectorInput.java index a1aa928d3..890977cdf 100644 --- a/java/org/rocksdb/StatsCollectorInput.java +++ b/java/src/main/java/org/rocksdb/StatsCollectorInput.java @@ -12,10 +12,10 @@ package org.rocksdb; public class StatsCollectorInput { private final Statistics _statistics; private final StatisticsCollectorCallback _statsCallback; - + /** * Constructor for StatsCollectorInput. - * + * * @param statistics Reference of DB statistics. * @param statsCallback Reference of statistics callback interface. */ @@ -24,11 +24,11 @@ public class StatsCollectorInput { _statistics = statistics; _statsCallback = statsCallback; } - + public Statistics getStatistics() { return _statistics; } - + public StatisticsCollectorCallback getCallback() { return _statsCallback; } diff --git a/java/src/main/java/org/rocksdb/StringAppendOperator.java b/java/src/main/java/org/rocksdb/StringAppendOperator.java new file mode 100644 index 000000000..52cd43e79 --- /dev/null +++ b/java/src/main/java/org/rocksdb/StringAppendOperator.java @@ -0,0 +1,17 @@ +// Copyright (c) 2014, Vlad Balan (vlad.gm@gmail.com). All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * StringAppendOperator is a merge operator that concatenates + * two strings. + */ +public class StringAppendOperator implements MergeOperator { + @Override public long newMergeOperatorHandle() { + return newMergeOperatorHandleImpl(); + } + private native long newMergeOperatorHandleImpl(); +} diff --git a/java/org/rocksdb/TableFormatConfig.java b/java/src/main/java/org/rocksdb/TableFormatConfig.java similarity index 79% rename from java/org/rocksdb/TableFormatConfig.java rename to java/src/main/java/org/rocksdb/TableFormatConfig.java index e5c63411f..58a533b22 100644 --- a/java/org/rocksdb/TableFormatConfig.java +++ b/java/src/main/java/org/rocksdb/TableFormatConfig.java @@ -12,9 +12,11 @@ package org.rocksdb; */ public abstract class TableFormatConfig { /** - * This function should only be called by Options.setTableFormatConfig(), + *

This function should only be called by Options.setTableFormatConfig(), * which will create a c++ shared-pointer to the c++ TableFactory - * that associated with the Java TableFormatConfig. + * that associated with the Java TableFormatConfig.

+ * + * @return native handle address to native table instance. */ abstract protected long newTableFactoryHandle(); } diff --git a/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java similarity index 100% rename from java/org/rocksdb/TickerType.java rename to java/src/main/java/org/rocksdb/TickerType.java diff --git a/java/src/main/java/org/rocksdb/TransactionLogIterator.java b/java/src/main/java/org/rocksdb/TransactionLogIterator.java new file mode 100644 index 000000000..d82cde3ea --- /dev/null +++ b/java/src/main/java/org/rocksdb/TransactionLogIterator.java @@ -0,0 +1,115 @@ +package org.rocksdb; + +/** + *

A TransactionLogIterator is used to iterate over the transactions in a db. + * One run of the iterator is continuous, i.e. the iterator will stop at the + * beginning of any gap in sequences.

+ */ +public class TransactionLogIterator extends RocksObject { + + /** + *

An iterator is either positioned at a WriteBatch + * or not valid. This method returns true if the iterator + * is valid. Can read data from a valid iterator.

+ * + * @return true if iterator position is valid. + */ + public boolean isValid() { + return isValid(nativeHandle_); + } + + /** + *

Moves the iterator to the next WriteBatch. + * REQUIRES: Valid() to be true.

+ */ + public void next() { + next(nativeHandle_); + } + + /** + *

Throws RocksDBException if something went wrong.

+ * + * @throws org.rocksdb.RocksDBException if something went + * wrong in the underlying C++ code. + */ + public void status() throws RocksDBException { + status(nativeHandle_); + } + + /** + *

If iterator position is valid, return the current + * write_batch and the sequence number of the earliest + * transaction contained in the batch.

+ * + *

ONLY use if Valid() is true and status() is OK.

+ * + * @return {@link org.rocksdb.TransactionLogIterator.BatchResult} + * instance. + */ + public BatchResult getBatch() { + assert(isValid()); + return getBatch(nativeHandle_); + } + + /** + *

TransactionLogIterator constructor.

+ * + * @param nativeHandle address to native address. + */ + TransactionLogIterator(long nativeHandle) { + super(); + nativeHandle_ = nativeHandle; + } + + @Override protected void disposeInternal() { + disposeInternal(nativeHandle_); + } + + /** + *

BatchResult represents a data structure returned + * by a TransactionLogIterator containing a sequence + * number and a {@link WriteBatch} instance.

+ */ + public class BatchResult { + /** + *

Constructor of BatchResult class.

+ * + * @param sequenceNumber related to this BatchResult instance. + * @param nativeHandle to {@link org.rocksdb.WriteBatch} + * native instance. + */ + public BatchResult(long sequenceNumber, long nativeHandle) { + sequenceNumber_ = sequenceNumber; + writeBatch_ = new WriteBatch(nativeHandle); + } + + /** + *

Return sequence number related to this BatchResult.

+ * + * @return Sequence number. + */ + public long sequenceNumber() { + return sequenceNumber_; + } + + /** + *

Return contained {@link org.rocksdb.WriteBatch} + * instance

+ * + * @return {@link org.rocksdb.WriteBatch} instance. + */ + public WriteBatch writeBatch() { + return writeBatch_; + } + + private final long sequenceNumber_; + private final WriteBatch writeBatch_; + } + + private native void disposeInternal(long handle); + private native boolean isValid(long handle); + private native void next(long handle); + private native void status(long handle) + throws RocksDBException; + private native BatchResult getBatch(long handle); +} diff --git a/java/src/main/java/org/rocksdb/TtlDB.java b/java/src/main/java/org/rocksdb/TtlDB.java new file mode 100644 index 000000000..a78bb9435 --- /dev/null +++ b/java/src/main/java/org/rocksdb/TtlDB.java @@ -0,0 +1,196 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import java.util.List; + +/** + * Database with TTL support. + * + *

Use case

+ *

This API should be used to open the db when key-values inserted are + * meant to be removed from the db in a non-strict 'ttl' amount of time + * Therefore, this guarantees that key-values inserted will remain in the + * db for >= ttl amount of time and the db will make efforts to remove the + * key-values as soon as possible after ttl seconds of their insertion. + *

+ * + *

Behaviour

+ *

TTL is accepted in seconds + * (int32_t)Timestamp(creation) is suffixed to values in Put internally + * Expired TTL values deleted in compaction only:(Timestamp+ttl<time_now) + * Get/Iterator may return expired entries(compaction not run on them yet) + * Different TTL may be used during different Opens + *

+ * + *

Example

+ *
    + *
  • Open1 at t=0 with ttl=4 and insert k1,k2, close at t=2
  • + *
  • Open2 at t=3 with ttl=5. Now k1,k2 should be deleted at t>=5
  • + *
+ * + *

+ * read_only=true opens in the usual read-only mode. Compactions will not be + * triggered(neither manual nor automatic), so no expired entries removed + *

+ * + *

Constraints

+ *

Not specifying/passing or non-positive TTL behaves + * like TTL = infinity

+ * + *

!!!WARNING!!!

+ *

Calling DB::Open directly to re-open a db created by this API will get + * corrupt values(timestamp suffixed) and no ttl effect will be there + * during the second Open, so use this API consistently to open the db + * Be careful when passing ttl with a small positive value because the + * whole database may be deleted in a small amount of time.

+ */ +public class TtlDB extends RocksDB { + + /** + *

Opens a TtlDB.

+ * + *

Database is opened in read-write mode without default TTL.

+ * + * @param options {@link org.rocksdb.Options} instance. + * @param db_path path to database. + * + * @return TtlDB instance. + * + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public static TtlDB open(Options options, String db_path) + throws RocksDBException { + return open(options, db_path, 0, false); + } + + /** + *

Opens a TtlDB.

+ * + * @param options {@link org.rocksdb.Options} instance. + * @param db_path path to database. + * @param ttl time to live for new entries. + * @param readOnly boolean value indicating if database if db is + * opened read-only. + * + * @return TtlDB instance. + * + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + public static TtlDB open(Options options, String db_path, int ttl, + boolean readOnly) throws RocksDBException { + TtlDB ttldb = new TtlDB(); + ttldb.open(options.nativeHandle_, db_path, ttl, readOnly); + return ttldb; + } + + /** + *

Opens a TtlDB.

+ * + * @param options {@link org.rocksdb.Options} instance. + * @param db_path path to database. + * @param columnFamilyDescriptors list of column family descriptors + * @param columnFamilyHandles will be filled with ColumnFamilyHandle instances + * on open. + * @param ttlValues time to live values per column family handle + * @param readOnly boolean value indicating if database if db is + * opened read-only. + * + * @return TtlDB instance. + * + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + * @throws java.lang.IllegalArgumentException when there is not a ttl value + * per given column family handle. + */ + public static TtlDB open(DBOptions options, String db_path, + List columnFamilyDescriptors, + List columnFamilyHandles, + List ttlValues, boolean readOnly) throws RocksDBException { + if (columnFamilyDescriptors.size() != ttlValues.size()) { + throw new IllegalArgumentException("There must be a ttl value per column" + + "family handle."); + } + TtlDB ttlDB = new TtlDB(); + List cfReferences = ttlDB.openCF(options.nativeHandle_, db_path, + columnFamilyDescriptors, columnFamilyDescriptors.size(), + ttlValues, readOnly); + for (int i=0; iCreates a new ttl based column family with a name defined + * in given ColumnFamilyDescriptor and allocates a + * ColumnFamilyHandle within an internal structure.

+ * + *

The ColumnFamilyHandle is automatically disposed with DB + * disposal.

+ * + * @param columnFamilyDescriptor column family to be created. + * @param ttl TTL to set for this column family. + * + * @return {@link org.rocksdb.ColumnFamilyHandle} instance. + * + * @throws RocksDBException thrown if error happens in underlying + * native library. + */ + public ColumnFamilyHandle createColumnFamilyWithTtl( + ColumnFamilyDescriptor columnFamilyDescriptor, int ttl) + throws RocksDBException { + assert(isInitialized()); + return new ColumnFamilyHandle(this, + createColumnFamilyWithTtl(nativeHandle_, + columnFamilyDescriptor, ttl)); + } + + /** + *

Close the TtlDB instance and release resource.

+ * + *

Internally, TtlDB owns the {@code rocksdb::DB} pointer + * to its associated {@link org.rocksdb.RocksDB}. The release + * of that RocksDB pointer is handled in the destructor of the + * c++ {@code rocksdb::TtlDB} and should be transparent to + * Java developers.

+ */ + @Override public synchronized void close() { + if (isInitialized()) { + super.close(); + } + } + + /** + *

A protected constructor that will be used in the static + * factory method + * {@link #open(Options, String, int, boolean)} + * and + * {@link #open(DBOptions, String, java.util.List, java.util.List, + * java.util.List, boolean)}. + *

+ */ + protected TtlDB() { + super(); + } + + @Override protected void finalize() throws Throwable { + close(); + super.finalize(); + } + + private native void open(long optionsHandle, String db_path, int ttl, + boolean readOnly) throws RocksDBException; + private native List openCF(long optionsHandle, String db_path, + List columnFamilyDescriptors, + int columnFamilyDescriptorsLength, List ttlValues, + boolean readOnly) throws RocksDBException; + private native long createColumnFamilyWithTtl(long handle, + ColumnFamilyDescriptor columnFamilyDescriptor, int ttl) + throws RocksDBException; +} diff --git a/java/org/rocksdb/VectorMemTableConfig.java b/java/src/main/java/org/rocksdb/VectorMemTableConfig.java similarity index 95% rename from java/org/rocksdb/VectorMemTableConfig.java rename to java/src/main/java/org/rocksdb/VectorMemTableConfig.java index b7a413f19..ba1be3e77 100644 --- a/java/org/rocksdb/VectorMemTableConfig.java +++ b/java/src/main/java/org/rocksdb/VectorMemTableConfig.java @@ -5,6 +5,10 @@ package org.rocksdb; */ public class VectorMemTableConfig extends MemTableConfig { public static final int DEFAULT_RESERVED_SIZE = 0; + + /** + * VectorMemTableConfig constructor + */ public VectorMemTableConfig() { reservedSize_ = DEFAULT_RESERVED_SIZE; } diff --git a/java/src/main/java/org/rocksdb/WBWIRocksIterator.java b/java/src/main/java/org/rocksdb/WBWIRocksIterator.java new file mode 100644 index 000000000..3171cc4ee --- /dev/null +++ b/java/src/main/java/org/rocksdb/WBWIRocksIterator.java @@ -0,0 +1,137 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +package org.rocksdb; + +public class WBWIRocksIterator extends AbstractRocksIterator { + private final WriteEntry entry = new WriteEntry(); + + protected WBWIRocksIterator(WriteBatchWithIndex wbwi, long nativeHandle) { + super(wbwi, nativeHandle); + } + + /** + * Get the current entry + * + * The WriteEntry is only valid + * until the iterator is repositioned. + * If you want to keep the WriteEntry across iterator + * movements, you must make a copy of its data! + * + * @return The WriteEntry of the current entry + */ + public WriteEntry entry() { + assert(isInitialized()); + assert(entry != null); + entry1(nativeHandle_, entry); + return entry; + } + + @Override final native void disposeInternal(long handle); + @Override final native boolean isValid0(long handle); + @Override final native void seekToFirst0(long handle); + @Override final native void seekToLast0(long handle); + @Override final native void next0(long handle); + @Override final native void prev0(long handle); + @Override final native void seek0(long handle, byte[] target, int targetLen); + @Override final native void status0(long handle) throws RocksDBException; + + private native void entry1(long handle, WriteEntry entry); + + /** + * Enumeration of the Write operation + * that created the record in the Write Batch + */ + public enum WriteType { + PUT, + MERGE, + DELETE, + LOG + } + + /** + * Represents an entry returned by + * {@link org.rocksdb.WBWIRocksIterator#entry()} + * + * It is worth noting that a WriteEntry with + * the type {@link org.rocksdb.WBWIRocksIterator.WriteType#DELETE} + * or {@link org.rocksdb.WBWIRocksIterator.WriteType#LOG} + * will not have a value. 
+ */ + public static class WriteEntry { + WriteType type = null; + final DirectSlice key; + final DirectSlice value; + + /** + * Intentionally private as this + * should only be instantiated in + * this manner by the outer WBWIRocksIterator + * class; The class members are then modified + * by calling {@link org.rocksdb.WBWIRocksIterator#entry()} + */ + private WriteEntry() { + key = new DirectSlice(); + value = new DirectSlice(); + } + + public WriteEntry(WriteType type, DirectSlice key, DirectSlice value) { + this.type = type; + this.key = key; + this.value = value; + } + + /** + * Returns the type of the Write Entry + * + * @return the WriteType of the WriteEntry + */ + public WriteType getType() { + return type; + } + + /** + * Returns the key of the Write Entry + * + * @return The slice containing the key + * of the WriteEntry + */ + public DirectSlice getKey() { + return key; + } + + /** + * Returns the value of the Write Entry + * + * @return The slice containing the value of + * the WriteEntry or null if the WriteEntry has + * no value + */ + public DirectSlice getValue() { + if(!value.isInitialized()) { + return null; //TODO(AR) migrate to JDK8 java.util.Optional#empty() + } else { + return value; + } + } + + @Override + public boolean equals(Object other) { + if(other == null) { + return false; + } else if (this == other) { + return true; + } else if(other instanceof WriteEntry) { + final WriteEntry otherWriteEntry = (WriteEntry)other; + return type.equals(otherWriteEntry.type) + && key.equals(otherWriteEntry.key) + && (value.isInitialized() ? value.equals(otherWriteEntry.value) + : !otherWriteEntry.value.isInitialized()); + } else { + return false; + } + } + } +} diff --git a/java/src/main/java/org/rocksdb/WriteBatch.java b/java/src/main/java/org/rocksdb/WriteBatch.java new file mode 100644 index 000000000..fd6d9386c --- /dev/null +++ b/java/src/main/java/org/rocksdb/WriteBatch.java @@ -0,0 +1,126 @@ +// Copyright (c) 2014, Facebook, Inc. 
All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * WriteBatch holds a collection of updates to apply atomically to a DB. + * + * The updates are applied in the order in which they are added + * to the WriteBatch. For example, the value of "key" will be "v3" + * after the following batch is written: + * + * batch.put("key", "v1"); + * batch.remove("key"); + * batch.put("key", "v2"); + * batch.put("key", "v3"); + * + * Multiple threads can invoke const methods on a WriteBatch without + * external synchronization, but if any of the threads may call a + * non-const method, all threads accessing the same WriteBatch must use + * external synchronization. + */ +public class WriteBatch extends AbstractWriteBatch { + /** + * Constructs a WriteBatch instance. + */ + public WriteBatch() { + super(); + newWriteBatch(0); + } + + /** + * Constructs a WriteBatch instance with a given size. + * + * @param reserved_bytes reserved size for WriteBatch + */ + public WriteBatch(int reserved_bytes) { + nativeHandle_ = 0; + newWriteBatch(reserved_bytes); + } + + /** + * Support for iterating over the contents of a batch. + * + * @param handler A handler that is called back for each + * update present in the batch + * + * @throws RocksDBException If we cannot iterate over the batch + */ + public void iterate(Handler handler) throws RocksDBException { + iterate(handler.nativeHandle_); + } + + /** + *

Private WriteBatch constructor which is used to construct + * WriteBatch instances from C++ side. As the reference to this + * object is also managed from C++ side the handle will be disowned.

+ * + * @param nativeHandle address of native instance. + */ + WriteBatch(long nativeHandle) { + super(); + disOwnNativeHandle(); + nativeHandle_ = nativeHandle; + } + + @Override final native void disposeInternal(long handle); + @Override final native int count0(); + @Override final native void put(byte[] key, int keyLen, byte[] value, int valueLen); + @Override final native void put(byte[] key, int keyLen, byte[] value, int valueLen, + long cfHandle); + @Override final native void merge(byte[] key, int keyLen, byte[] value, int valueLen); + @Override final native void merge(byte[] key, int keyLen, byte[] value, int valueLen, + long cfHandle); + @Override final native void remove(byte[] key, int keyLen); + @Override final native void remove(byte[] key, int keyLen, long cfHandle); + @Override final native void putLogData(byte[] blob, int blobLen); + @Override final native void clear0(); + + private native void newWriteBatch(int reserved_bytes); + private native void iterate(long handlerHandle) throws RocksDBException; + + + /** + * Handler callback for iterating over the contents of a batch. + */ + public static abstract class Handler extends RocksObject { + public Handler() { + super(); + createNewHandler0(); + } + + public abstract void put(byte[] key, byte[] value); + public abstract void merge(byte[] key, byte[] value); + public abstract void delete(byte[] key); + public abstract void logData(byte[] blob); + + /** + * shouldContinue is called by the underlying iterator + * WriteBatch::Iterate. If it returns false, + * iteration is halted. Otherwise, it continues + * iterating. The default implementation always + * returns true. + * + * @return boolean value indicating if the + * iteration is halted. + */ + public boolean shouldContinue() { + return true; + } + + /** + * Deletes underlying C++ handler pointer. 
+ */ + @Override + protected void disposeInternal() { + assert(isInitialized()); + disposeInternal(nativeHandle_); + } + + private native void createNewHandler0(); + private native void disposeInternal(long handle); + } +} diff --git a/java/src/main/java/org/rocksdb/WriteBatchInterface.java b/java/src/main/java/org/rocksdb/WriteBatchInterface.java new file mode 100644 index 000000000..4eaf1ad9d --- /dev/null +++ b/java/src/main/java/org/rocksdb/WriteBatchInterface.java @@ -0,0 +1,98 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + *

Defines the interface for a Write Batch which + * holds a collection of updates to apply atomically to a DB.

+ */ +public interface WriteBatchInterface { + + /** + * Returns the number of updates in the batch. + * + * @return number of items in WriteBatch + */ + public int count(); + + /** + *

Store the mapping "key->value" in the database.

+ * + * @param key the specified key to be inserted. + * @param value the value associated with the specified key. + */ + public void put(byte[] key, byte[] value); + + /** + *

Store the mapping "key->value" within given column + * family.

+ * + * @param columnFamilyHandle {@link org.rocksdb.ColumnFamilyHandle} + * instance + * @param key the specified key to be inserted. + * @param value the value associated with the specified key. + */ + public void put(ColumnFamilyHandle columnFamilyHandle, + byte[] key, byte[] value); + + /** + *

Merge "value" with the existing value of "key" in the database. + * "key->merge(existing, value)"

+ * + * @param key the specified key to be merged. + * @param value the value to be merged with the current value for + * the specified key. + */ + public void merge(byte[] key, byte[] value); + + /** + *

Merge "value" with the existing value of "key" in given column family. + * "key->merge(existing, value)"

+ * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key the specified key to be merged. + * @param value the value to be merged with the current value for + * the specified key. + */ + public void merge(ColumnFamilyHandle columnFamilyHandle, + byte[] key, byte[] value); + + /** + *

If the database contains a mapping for "key", erase it. Else do nothing.

+ * + * @param key Key to delete within database + */ + public void remove(byte[] key); + + /** + *

If column family contains a mapping for "key", erase it. Else do nothing.

+ * + * @param columnFamilyHandle {@link ColumnFamilyHandle} instance + * @param key Key to delete within database + */ + public void remove(ColumnFamilyHandle columnFamilyHandle, byte[] key); + + /** + * Append a blob of arbitrary size to the records in this batch. The blob will + * be stored in the transaction log but not in any other file. In particular, + * it will not be persisted to the SST files. When iterating over this + * WriteBatch, WriteBatch::Handler::LogData will be called with the contents + * of the blob as it is encountered. Blobs, puts, deletes, and merges will be + * encountered in the same order in thich they were inserted. The blob will + * NOT consume sequence number(s) and will NOT increase the count of the batch + * + * Example application: add timestamps to the transaction log for use in + * replication. + * + * @param blob binary object to be inserted + */ + public void putLogData(byte[] blob); + + /** + * Clear all updates buffered in this batch + */ + public void clear(); +} diff --git a/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java b/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java new file mode 100644 index 000000000..5204146c4 --- /dev/null +++ b/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java @@ -0,0 +1,149 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Similar to {@link org.rocksdb.WriteBatch} but with a binary searchable + * index built for all the keys inserted. + * + * Calling put, merge, remove or putLogData calls the same function + * as with {@link org.rocksdb.WriteBatch} whilst also building an index. 
+ * + * A user can call {@link org.rocksdb.WriteBatchWithIndex#newIterator() }to create an iterator + * over the write batch or + * {@link org.rocksdb.WriteBatchWithIndex#newIteratorWithBase(org.rocksdb.RocksIterator)} to + * get an iterator for the database with Read-Your-Own-Writes like capability + */ +public class WriteBatchWithIndex extends AbstractWriteBatch { + /** + * Creates a WriteBatchWithIndex where no bytes + * are reserved up-front, bytewise comparison is + * used for fallback key comparisons, + * and duplicate keys operations are retained + */ + public WriteBatchWithIndex() { + super(); + newWriteBatchWithIndex(); + } + + + /** + * Creates a WriteBatchWithIndex where no bytes + * are reserved up-front, bytewise comparison is + * used for fallback key comparisons, and duplicate key + * assignment is determined by the constructor argument + * + * @param overwriteKey if true, overwrite the key in the index when + * inserting a duplicate key, in this way an iterator will never + * show two entries with the same key. + */ + public WriteBatchWithIndex(boolean overwriteKey) { + super(); + newWriteBatchWithIndex(overwriteKey); + } + + /** + * Creates a WriteBatchWithIndex + * + * @param fallbackIndexComparator We fallback to this comparator + * to compare keys within a column family if we cannot determine + * the column family and so look up it's comparator. + * + * @param reservedBytes reserved bytes in underlying WriteBatch + * + * @param overwriteKey if true, overwrite the key in the index when + * inserting a duplicate key, in this way an iterator will never + * show two entries with the same key. + */ + public WriteBatchWithIndex(AbstractComparator fallbackIndexComparator, int reservedBytes, + boolean overwriteKey) { + super(); + newWriteBatchWithIndex(fallbackIndexComparator.nativeHandle_, reservedBytes, overwriteKey); + } + + /** + * Create an iterator of a column family. 
User can call + * {@link org.rocksdb.RocksIteratorInterface#seek(byte[])} to + * search to the next entry of or after a key. Keys will be iterated in the + * order given by index_comparator. For multiple updates on the same key, + * each update will be returned as a separate entry, in the order of update + * time. + * + * @param columnFamilyHandle The column family to iterate over + * @return An iterator for the Write Batch contents, restricted to the column family + */ + public WBWIRocksIterator newIterator(ColumnFamilyHandle columnFamilyHandle) { + return new WBWIRocksIterator(this, iterator1(columnFamilyHandle.nativeHandle_)); + } + + /** + * Create an iterator of the default column family. User can call + * {@link org.rocksdb.RocksIteratorInterface#seek(byte[])} to + * search to the next entry of or after a key. Keys will be iterated in the + * order given by index_comparator. For multiple updates on the same key, + * each update will be returned as a separate entry, in the order of update + * time. + * + * @return An iterator for the Write Batch contents + */ + public WBWIRocksIterator newIterator() { + return new WBWIRocksIterator(this, iterator0()); + } + + /** + * Provides Read-Your-Own-Writes like functionality by + * creating a new Iterator that will use {@link org.rocksdb.WBWIRocksIterator} + * as a delta and baseIterator as a base + * + * @param columnFamilyHandle The column family to iterate over + * @param baseIterator The base iterator, e.g. {@link org.rocksdb.RocksDB#newIterator()} + * @return An iterator which shows a view comprised of both the database point-in-time + * from baseIterator and modifications made in this write batch. 
+ */ + public RocksIterator newIteratorWithBase(ColumnFamilyHandle columnFamilyHandle, + RocksIterator baseIterator) { + RocksIterator iterator = new RocksIterator( + baseIterator.parent_, + iteratorWithBase(columnFamilyHandle.nativeHandle_, baseIterator.nativeHandle_)); + //when the iterator is deleted it will also delete the baseIterator + baseIterator.disOwnNativeHandle(); + return iterator; + } + + /** + * Provides Read-Your-Own-Writes like functionality by + * creating a new Iterator that will use {@link org.rocksdb.WBWIRocksIterator} + * as a delta and baseIterator as a base. Operates on the default column family. + * + * @param baseIterator The base iterator, e.g. {@link org.rocksdb.RocksDB#newIterator()} + * @return An iterator which shows a view comprised of both the database point-in-time + * from baseIterator and modifications made in this write batch. + */ + public RocksIterator newIteratorWithBase(RocksIterator baseIterator) { + return newIteratorWithBase(baseIterator.parent_.getDefaultColumnFamily(), baseIterator); + } + + @Override final native void disposeInternal(long handle); + @Override final native int count0(); + @Override final native void put(byte[] key, int keyLen, byte[] value, int valueLen); + @Override final native void put(byte[] key, int keyLen, byte[] value, int valueLen, + long cfHandle); + @Override final native void merge(byte[] key, int keyLen, byte[] value, int valueLen); + @Override final native void merge(byte[] key, int keyLen, byte[] value, int valueLen, + long cfHandle); + @Override final native void remove(byte[] key, int keyLen); + @Override final native void remove(byte[] key, int keyLen, long cfHandle); + @Override final native void putLogData(byte[] blob, int blobLen); + @Override final native void clear0(); + + private native void newWriteBatchWithIndex(); + private native void newWriteBatchWithIndex(boolean overwriteKey); + private native void newWriteBatchWithIndex(long fallbackIndexComparatorHandle, int 
reservedBytes, + boolean overwriteKey); + private native long iterator0(); + private native long iterator1(long cfHandle); + private native long iteratorWithBase(long baseIteratorHandle, long cfHandle); +} diff --git a/java/org/rocksdb/WriteOptions.java b/java/src/main/java/org/rocksdb/WriteOptions.java similarity index 95% rename from java/org/rocksdb/WriteOptions.java rename to java/src/main/java/org/rocksdb/WriteOptions.java index d26dbb918..6d7ea4c30 100644 --- a/java/org/rocksdb/WriteOptions.java +++ b/java/src/main/java/org/rocksdb/WriteOptions.java @@ -12,6 +12,9 @@ package org.rocksdb; * c++ side memory before a WriteOptions instance runs out of scope. */ public class WriteOptions extends RocksObject { + /** + * Construct WriteOptions instance. + */ public WriteOptions() { super(); newWriteOptions(); @@ -64,6 +67,8 @@ public class WriteOptions extends RocksObject { * crash semantics as the "write()" system call. A DB write * with sync==true has similar crash semantics to a "write()" * system call followed by "fdatasync()". + * + * @return boolean value indicating if sync is active. */ public boolean sync() { return sync(nativeHandle_); @@ -85,6 +90,8 @@ public class WriteOptions extends RocksObject { /** * If true, writes will not first go to the write ahead log, * and the write may got lost after a crash. + * + * @return boolean value indicating if WAL is disabled. 
*/ public boolean disableWAL() { return disableWAL(nativeHandle_); diff --git a/java/src/main/java/org/rocksdb/util/Environment.java b/java/src/main/java/org/rocksdb/util/Environment.java new file mode 100644 index 000000000..6b5a9f2c8 --- /dev/null +++ b/java/src/main/java/org/rocksdb/util/Environment.java @@ -0,0 +1,59 @@ +package org.rocksdb.util; + +public class Environment { + private static String OS = System.getProperty("os.name").toLowerCase(); + private static String ARCH = System.getProperty("os.arch").toLowerCase(); + + public static boolean isWindows() { + return (OS.contains("win")); + } + + public static boolean isMac() { + return (OS.contains("mac")); + } + + public static boolean isUnix() { + return (OS.contains("nix") || + OS.contains("nux") || + OS.contains("aix")); + } + + public static boolean is64Bit() { + return (ARCH.indexOf("64") > 0); + } + + public static String getSharedLibraryName(String name) { + return name + "jni"; + } + + public static String getSharedLibraryFileName(String name) { + return appendLibOsSuffix("lib" + getSharedLibraryName(name), true); + } + + public static String getJniLibraryName(final String name) { + if (isUnix()) { + final String arch = (is64Bit()) ? "64" : "32"; + return String.format("%sjni-linux%s", name, arch); + } else if (isMac()) { + return String.format("%sjni-osx", name); + } + throw new UnsupportedOperationException(); + } + + public static String getJniLibraryFileName(final String name) { + return appendLibOsSuffix("lib" + getJniLibraryName(name), false); + } + + private static String appendLibOsSuffix(final String libraryFileName, final boolean shared) { + if (isUnix()) { + return libraryFileName + ".so"; + } else if (isMac()) { + return libraryFileName + (shared ? ".dylib" : ".jnilib"); + } + throw new UnsupportedOperationException(); + } + + public static String getJniLibraryExtension() { + return (isMac()) ? 
".jnilib" : ".so"; + } +} diff --git a/java/org/rocksdb/util/SizeUnit.java b/java/src/main/java/org/rocksdb/util/SizeUnit.java similarity index 100% rename from java/org/rocksdb/util/SizeUnit.java rename to java/src/main/java/org/rocksdb/util/SizeUnit.java diff --git a/java/src/test/java/org/rocksdb/AbstractComparatorTest.java b/java/src/test/java/org/rocksdb/AbstractComparatorTest.java new file mode 100644 index 000000000..97afb48d1 --- /dev/null +++ b/java/src/test/java/org/rocksdb/AbstractComparatorTest.java @@ -0,0 +1,214 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import java.io.IOException; +import java.nio.file.*; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.rocksdb.Types.byteToInt; +import static org.rocksdb.Types.intToByte; + +/** + * Abstract tests for both Comparator and DirectComparator + */ +public abstract class AbstractComparatorTest { + + /** + * Get a comparator which will expect Integer keys + * and determine an ascending order + * + * @return An integer ascending order key comparator + */ + public abstract AbstractComparator getAscendingIntKeyComparator(); + + /** + * Test which stores random keys into the database + * using an @see getAscendingIntKeyComparator + * it then checks that these keys are read back in + * ascending order + * + * @param db_path A path where we can store database + * files temporarily + * + * @throws java.io.IOException if IO error happens. 
+ */ + public void testRoundtrip(final Path db_path) throws IOException, RocksDBException { + + Options opt = null; + RocksDB db = null; + + try { + opt = new Options(); + opt.setCreateIfMissing(true); + opt.setComparator(getAscendingIntKeyComparator()); + + // store 10,000 random integer keys + final int ITERATIONS = 10000; + + db = RocksDB.open(opt, db_path.toString()); + final Random random = new Random(); + for (int i = 0; i < ITERATIONS; i++) { + final byte key[] = intToByte(random.nextInt()); + if (i > 0 && db.get(key) != null) { // does key already exist (avoid duplicates) + i--; // generate a different key + } else { + db.put(key, "value".getBytes()); + } + } + db.close(); + + // re-open db and read from start to end + // integer keys should be in ascending + // order as defined by SimpleIntComparator + db = RocksDB.open(opt, db_path.toString()); + final RocksIterator it = db.newIterator(); + it.seekToFirst(); + int lastKey = Integer.MIN_VALUE; + int count = 0; + for (it.seekToFirst(); it.isValid(); it.next()) { + final int thisKey = byteToInt(it.key()); + assertThat(thisKey).isGreaterThan(lastKey); + lastKey = thisKey; + count++; + } + db.close(); + + assertThat(count).isEqualTo(ITERATIONS); + + } finally { + if (db != null) { + db.close(); + } + + if (opt != null) { + opt.dispose(); + } + } + } + + /** + * Test which stores random keys into a column family + * in the database + * using an @see getAscendingIntKeyComparator + * it then checks that these keys are read back in + * ascending order + * + * @param db_path A path where we can store database + * files temporarily + * + * @throws java.io.IOException if IO error happens. 
+ */ + public void testRoundtripCf(final Path db_path) throws IOException, + RocksDBException { + + DBOptions opt = null; + RocksDB db = null; + List cfDescriptors = + new ArrayList<>(); + cfDescriptors.add(new ColumnFamilyDescriptor( + RocksDB.DEFAULT_COLUMN_FAMILY)); + cfDescriptors.add(new ColumnFamilyDescriptor("new_cf".getBytes(), + new ColumnFamilyOptions().setComparator( + getAscendingIntKeyComparator()))); + List cfHandles = new ArrayList<>(); + try { + opt = new DBOptions(). + setCreateIfMissing(true). + setCreateMissingColumnFamilies(true); + + // store 10,000 random integer keys + final int ITERATIONS = 10000; + + db = RocksDB.open(opt, db_path.toString(), cfDescriptors, cfHandles); + assertThat(cfDescriptors.size()).isEqualTo(2); + assertThat(cfHandles.size()).isEqualTo(2); + + final Random random = new Random(); + for (int i = 0; i < ITERATIONS; i++) { + final byte key[] = intToByte(random.nextInt()); + if (i > 0 && db.get(cfHandles.get(1), key) != null) { + // does key already exist (avoid duplicates) + i--; // generate a different key + } else { + db.put(cfHandles.get(1), key, "value".getBytes()); + } + } + for (ColumnFamilyHandle handle : cfHandles) { + handle.dispose(); + } + cfHandles.clear(); + db.close(); + + // re-open db and read from start to end + // integer keys should be in ascending + // order as defined by SimpleIntComparator + db = RocksDB.open(opt, db_path.toString(), cfDescriptors, cfHandles); + assertThat(cfDescriptors.size()).isEqualTo(2); + assertThat(cfHandles.size()).isEqualTo(2); + final RocksIterator it = db.newIterator(cfHandles.get(1)); + it.seekToFirst(); + int lastKey = Integer.MIN_VALUE; + int count = 0; + for (it.seekToFirst(); it.isValid(); it.next()) { + final int thisKey = byteToInt(it.key()); + assertThat(thisKey).isGreaterThan(lastKey); + lastKey = thisKey; + count++; + } + for (ColumnFamilyHandle handle : cfHandles) { + handle.dispose(); + } + cfHandles.clear(); + db.close(); + 
assertThat(count).isEqualTo(ITERATIONS); + + } finally { + for (ColumnFamilyHandle handle : cfHandles) { + handle.dispose(); + } + + if (db != null) { + db.close(); + } + + if (opt != null) { + opt.dispose(); + } + } + } + + /** + * Compares integer keys + * so that they are in ascending order + * + * @param a 4-bytes representing an integer key + * @param b 4-bytes representing an integer key + * + * @return negative if a < b, 0 if a == b, positive otherwise + */ + protected final int compareIntKeys(final byte[] a, final byte[] b) { + + final int iA = byteToInt(a); + final int iB = byteToInt(b); + + // protect against int key calculation overflow + final double diff = (double)iA - iB; + final int result; + if (diff < Integer.MIN_VALUE) { + result = Integer.MIN_VALUE; + } else if(diff > Integer.MAX_VALUE) { + result = Integer.MAX_VALUE; + } else { + result = (int)diff; + } + + return result; + } +} diff --git a/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java b/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java new file mode 100644 index 000000000..6fe3bd2f0 --- /dev/null +++ b/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java @@ -0,0 +1,283 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +import java.util.Random; + +import static org.assertj.core.api.Assertions.assertThat; + +public class BackupableDBOptionsTest { + + private final static String ARBITRARY_PATH = "/tmp"; + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public ExpectedException exception = ExpectedException.none(); + + public static final Random rand = PlatformRandomHelper. + getPlatformSpecificRandomFactory(); + + @Test + public void backupDir() { + BackupableDBOptions backupableDBOptions = null; + try { + backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); + assertThat(backupableDBOptions.backupDir()). + isEqualTo(ARBITRARY_PATH); + } finally { + if (backupableDBOptions != null) { + backupableDBOptions.dispose(); + } + } + } + + @Test + public void shareTableFiles() { + BackupableDBOptions backupableDBOptions = null; + try { + backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); + boolean value = rand.nextBoolean(); + backupableDBOptions.setShareTableFiles(value); + assertThat(backupableDBOptions.shareTableFiles()). 
+ isEqualTo(value); + } finally { + if (backupableDBOptions != null) { + backupableDBOptions.dispose(); + } + } + } + + @Test + public void sync() { + BackupableDBOptions backupableDBOptions = null; + try { + backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); + boolean value = rand.nextBoolean(); + backupableDBOptions.setSync(value); + assertThat(backupableDBOptions.sync()).isEqualTo(value); + } finally { + if (backupableDBOptions != null) { + backupableDBOptions.dispose(); + } + } + } + + @Test + public void destroyOldData() { + BackupableDBOptions backupableDBOptions = null; + try { + backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); + boolean value = rand.nextBoolean(); + backupableDBOptions.setDestroyOldData(value); + assertThat(backupableDBOptions.destroyOldData()). + isEqualTo(value); + } finally { + if (backupableDBOptions != null) { + backupableDBOptions.dispose(); + } + } + } + + @Test + public void backupLogFiles() { + BackupableDBOptions backupableDBOptions = null; + try { + backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); + boolean value = rand.nextBoolean(); + backupableDBOptions.setBackupLogFiles(value); + assertThat(backupableDBOptions.backupLogFiles()). + isEqualTo(value); + } finally { + if (backupableDBOptions != null) { + backupableDBOptions.dispose(); + } + } + } + + @Test + public void backupRateLimit() { + BackupableDBOptions backupableDBOptions = null; + try { + backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); + long value = Math.abs(rand.nextLong()); + backupableDBOptions.setBackupRateLimit(value); + assertThat(backupableDBOptions.backupRateLimit()). + isEqualTo(value); + // negative will be mapped to 0 + backupableDBOptions.setBackupRateLimit(-1); + assertThat(backupableDBOptions.backupRateLimit()). 
+ isEqualTo(0); + } finally { + if (backupableDBOptions != null) { + backupableDBOptions.dispose(); + } + } + } + + @Test + public void restoreRateLimit() { + BackupableDBOptions backupableDBOptions = null; + try { + backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); + long value = Math.abs(rand.nextLong()); + backupableDBOptions.setRestoreRateLimit(value); + assertThat(backupableDBOptions.restoreRateLimit()). + isEqualTo(value); + // negative will be mapped to 0 + backupableDBOptions.setRestoreRateLimit(-1); + assertThat(backupableDBOptions.restoreRateLimit()). + isEqualTo(0); + } finally { + if (backupableDBOptions != null) { + backupableDBOptions.dispose(); + } + } + } + + @Test + public void shareFilesWithChecksum() { + BackupableDBOptions backupableDBOptions = null; + try { + backupableDBOptions = new BackupableDBOptions(ARBITRARY_PATH); + boolean value = rand.nextBoolean(); + backupableDBOptions.setShareFilesWithChecksum(value); + assertThat(backupableDBOptions.shareFilesWithChecksum()). 
+ isEqualTo(value); + } finally { + if (backupableDBOptions != null) { + backupableDBOptions.dispose(); + } + } + } + + @Test + public void failBackupDirIsNull() { + exception.expect(IllegalArgumentException.class); + new BackupableDBOptions(null); + } + + @Test + public void failBackupDirIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.backupDir(); + } + + @Test + public void failSetShareTableFilesIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.setShareTableFiles(true); + } + + @Test + public void failShareTableFilesIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.shareTableFiles(); + } + + @Test + public void failSetSyncIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.setSync(true); + } + + @Test + public void failSyncIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.sync(); + } + + @Test + public void failSetDestroyOldDataIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.setDestroyOldData(true); + } + + @Test + public void failDestroyOldDataIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.destroyOldData(); + } + + @Test + public void failSetBackupLogFilesIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.setBackupLogFiles(true); + } + + @Test + public void failBackupLogFilesIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.backupLogFiles(); + } + + @Test + public void failSetBackupRateLimitIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.setBackupRateLimit(1); + } + + @Test + 
public void failBackupRateLimitIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.backupRateLimit(); + } + + @Test + public void failSetRestoreRateLimitIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.setRestoreRateLimit(1); + } + + @Test + public void failRestoreRateLimitIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.restoreRateLimit(); + } + + @Test + public void failSetShareFilesWithChecksumIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.setShareFilesWithChecksum(true); + } + + @Test + public void failShareFilesWithChecksumIfDisposed(){ + BackupableDBOptions options = setupUninitializedBackupableDBOptions( + exception); + options.shareFilesWithChecksum(); + } + + private BackupableDBOptions setupUninitializedBackupableDBOptions( + ExpectedException exception) { + BackupableDBOptions backupableDBOptions = + new BackupableDBOptions(ARBITRARY_PATH); + backupableDBOptions.dispose(); + exception.expect(AssertionError.class); + return backupableDBOptions; + } +} diff --git a/java/src/test/java/org/rocksdb/BackupableDBTest.java b/java/src/test/java/org/rocksdb/BackupableDBTest.java new file mode 100644 index 000000000..3f358bdb7 --- /dev/null +++ b/java/src/test/java/org/rocksdb/BackupableDBTest.java @@ -0,0 +1,425 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +public class BackupableDBTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Rule + public TemporaryFolder backupFolder = new TemporaryFolder(); + + @Test + public void backupDb() throws RocksDBException { + Options opt = null; + BackupableDBOptions bopt = null; + BackupableDB bdb = null; + try { + opt = new Options().setCreateIfMissing(true); + bopt = new BackupableDBOptions( + backupFolder.getRoot().getAbsolutePath()); + assertThat(bopt.backupDir()).isEqualTo( + backupFolder.getRoot().getAbsolutePath()); + // Open empty database. + bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); + // Fill database with some test values + prepareDatabase(bdb); + // Create two backups + bdb.createNewBackup(false); + bdb.createNewBackup(true); + verifyNumberOfValidBackups(bdb, 2); + } finally { + if (bdb != null) { + bdb.close(); + } + if (bopt != null) { + bopt.dispose(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void deleteBackup() throws RocksDBException { + Options opt = null; + BackupableDBOptions bopt = null; + BackupableDB bdb = null; + try { + opt = new Options().setCreateIfMissing(true); + bopt = new BackupableDBOptions( + backupFolder.getRoot().getAbsolutePath()); + assertThat(bopt.backupDir()).isEqualTo( + backupFolder.getRoot().getAbsolutePath()); + // Open empty database. 
+ bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); + // Fill database with some test values + prepareDatabase(bdb); + // Create two backups + bdb.createNewBackup(false); + bdb.createNewBackup(true); + List backupInfo = + verifyNumberOfValidBackups(bdb, 2); + // Delete the first backup + bdb.deleteBackup(backupInfo.get(0).backupId()); + List newBackupInfo = + verifyNumberOfValidBackups(bdb, 1); + // The second backup must remain. + assertThat(newBackupInfo.get(0).backupId()). + isEqualTo(backupInfo.get(1).backupId()); + } finally { + if (bdb != null) { + bdb.close(); + } + if (bopt != null) { + bopt.dispose(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void deleteBackupWithRestoreBackupableDB() + throws RocksDBException { + Options opt = null; + BackupableDBOptions bopt = null; + BackupableDB bdb = null; + RestoreBackupableDB rdb = null; + try { + opt = new Options().setCreateIfMissing(true); + bopt = new BackupableDBOptions( + backupFolder.getRoot().getAbsolutePath()); + assertThat(bopt.backupDir()).isEqualTo( + backupFolder.getRoot().getAbsolutePath()); + // Open empty database. + bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); + // Fill database with some test values + prepareDatabase(bdb); + // Create two backups + bdb.createNewBackup(false); + bdb.createNewBackup(true); + List backupInfo = + verifyNumberOfValidBackups(bdb, 2); + // init RestoreBackupableDB + rdb = new RestoreBackupableDB(bopt); + // Delete the first backup + rdb.deleteBackup(backupInfo.get(0).backupId()); + // Fetch backup info using RestoreBackupableDB + List newBackupInfo = verifyNumberOfValidBackups(rdb, 1); + // The second backup must remain. + assertThat(newBackupInfo.get(0).backupId()). 
+ isEqualTo(backupInfo.get(1).backupId()); + } finally { + if (bdb != null) { + bdb.close(); + } + if (rdb != null) { + rdb.dispose(); + } + if (bopt != null) { + bopt.dispose(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void purgeOldBackups() throws RocksDBException { + Options opt = null; + BackupableDBOptions bopt = null; + BackupableDB bdb = null; + try { + opt = new Options().setCreateIfMissing(true); + bopt = new BackupableDBOptions( + backupFolder.getRoot().getAbsolutePath()); + assertThat(bopt.backupDir()).isEqualTo( + backupFolder.getRoot().getAbsolutePath()); + // Open empty database. + bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); + // Fill database with some test values + prepareDatabase(bdb); + // Create two backups + bdb.createNewBackup(false); + bdb.createNewBackup(true); + bdb.createNewBackup(true); + bdb.createNewBackup(true); + List backupInfo = + verifyNumberOfValidBackups(bdb, 4); + // Delete everything except the latest backup + bdb.purgeOldBackups(1); + List newBackupInfo = + verifyNumberOfValidBackups(bdb, 1); + // The latest backup must remain. + assertThat(newBackupInfo.get(0).backupId()). + isEqualTo(backupInfo.get(3).backupId()); + } finally { + if (bdb != null) { + bdb.close(); + } + if (bopt != null) { + bopt.dispose(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void purgeOldBackupsWithRestoreBackupableDb() + throws RocksDBException { + Options opt = null; + BackupableDBOptions bopt = null; + BackupableDB bdb = null; + RestoreBackupableDB rdb = null; + try { + opt = new Options().setCreateIfMissing(true); + bopt = new BackupableDBOptions( + backupFolder.getRoot().getAbsolutePath()); + assertThat(bopt.backupDir()).isEqualTo( + backupFolder.getRoot().getAbsolutePath()); + // Open empty database. 
+ bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); + // Fill database with some test values + prepareDatabase(bdb); + // Create two backups + bdb.createNewBackup(false); + bdb.createNewBackup(true); + bdb.createNewBackup(true); + bdb.createNewBackup(true); + List infos = verifyNumberOfValidBackups(bdb, 4); + assertThat(infos.get(1).size()). + isEqualTo(infos.get(2).size()); + assertThat(infos.get(1).numberFiles()). + isEqualTo(infos.get(2).numberFiles()); + long maxTimeBeforePurge = Long.MIN_VALUE; + for (BackupInfo backupInfo : infos) { + if (maxTimeBeforePurge < backupInfo.timestamp()) { + maxTimeBeforePurge = backupInfo.timestamp(); + } + } + // init RestoreBackupableDB + rdb = new RestoreBackupableDB(bopt); + // the same number of backups must + // exist using RestoreBackupableDB. + verifyNumberOfValidBackups(rdb, 4); + rdb.purgeOldBackups(1); + infos = verifyNumberOfValidBackups(rdb, 1); + assertThat(infos.get(0).timestamp()). + isEqualTo(maxTimeBeforePurge); + } finally { + if (bdb != null) { + bdb.close(); + } + if (rdb != null) { + rdb.dispose(); + } + if (bopt != null) { + bopt.dispose(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void restoreLatestBackup() + throws RocksDBException { + Options opt = null; + BackupableDBOptions bopt = null; + BackupableDB bdb = null; + RestoreBackupableDB rdb = null; + try { + opt = new Options().setCreateIfMissing(true); + bopt = new BackupableDBOptions( + backupFolder.getRoot().getAbsolutePath()); + assertThat(bopt.backupDir()).isEqualTo( + backupFolder.getRoot().getAbsolutePath()); + // Open empty database. 
+ bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); + // Fill database with some test values + prepareDatabase(bdb); + bdb.createNewBackup(true); + verifyNumberOfValidBackups(bdb, 1); + bdb.put("key1".getBytes(), "valueV2".getBytes()); + bdb.put("key2".getBytes(), "valueV2".getBytes()); + bdb.createNewBackup(true); + verifyNumberOfValidBackups(bdb, 2); + bdb.put("key1".getBytes(), "valueV3".getBytes()); + bdb.put("key2".getBytes(), "valueV3".getBytes()); + assertThat(new String(bdb.get("key1".getBytes()))).endsWith("V3"); + assertThat(new String(bdb.get("key2".getBytes()))).endsWith("V3"); + bdb.close(); + + // init RestoreBackupableDB + rdb = new RestoreBackupableDB(bopt); + verifyNumberOfValidBackups(rdb, 2); + // restore db from latest backup + rdb.restoreDBFromLatestBackup(dbFolder.getRoot().getAbsolutePath(), + dbFolder.getRoot().getAbsolutePath(), + new RestoreOptions(false)); + // Open database again. + bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); + // Values must have suffix V2 because of restoring latest backup. + assertThat(new String(bdb.get("key1".getBytes()))).endsWith("V2"); + assertThat(new String(bdb.get("key2".getBytes()))).endsWith("V2"); + } finally { + if (bdb != null) { + bdb.close(); + } + if (rdb != null) { + rdb.dispose(); + } + if (bopt != null) { + bopt.dispose(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void restoreFromBackup() + throws RocksDBException { + Options opt = null; + BackupableDBOptions bopt = null; + BackupableDB bdb = null; + RestoreBackupableDB rdb = null; + try { + opt = new Options().setCreateIfMissing(true); + bopt = new BackupableDBOptions( + backupFolder.getRoot().getAbsolutePath()); + assertThat(bopt.backupDir()).isEqualTo( + backupFolder.getRoot().getAbsolutePath()); + // Open empty database. 
+ bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); + // Fill database with some test values + prepareDatabase(bdb); + bdb.createNewBackup(true); + verifyNumberOfValidBackups(bdb, 1); + bdb.put("key1".getBytes(), "valueV2".getBytes()); + bdb.put("key2".getBytes(), "valueV2".getBytes()); + bdb.createNewBackup(true); + verifyNumberOfValidBackups(bdb, 2); + bdb.put("key1".getBytes(), "valueV3".getBytes()); + bdb.put("key2".getBytes(), "valueV3".getBytes()); + assertThat(new String(bdb.get("key1".getBytes()))).endsWith("V3"); + assertThat(new String(bdb.get("key2".getBytes()))).endsWith("V3"); + bdb.close(); + + // init RestoreBackupableDB + rdb = new RestoreBackupableDB(bopt); + List backupInfo = verifyNumberOfValidBackups(rdb, 2); + // restore db from first backup + rdb.restoreDBFromBackup(backupInfo.get(0).backupId(), + dbFolder.getRoot().getAbsolutePath(), + dbFolder.getRoot().getAbsolutePath(), + new RestoreOptions(false)); + // Open database again. + bdb = BackupableDB.open(opt, bopt, + dbFolder.getRoot().getAbsolutePath()); + // Values must have suffix V2 because of restoring latest backup. + assertThat(new String(bdb.get("key1".getBytes()))).endsWith("V1"); + assertThat(new String(bdb.get("key2".getBytes()))).endsWith("V1"); + } finally { + if (bdb != null) { + bdb.close(); + } + if (rdb != null) { + rdb.dispose(); + } + if (bopt != null) { + bopt.dispose(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + /** + * Verify backups. + * + * @param bdb {@link BackupableDB} instance. + * @param expectedNumberOfBackups numerical value + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + private List verifyNumberOfValidBackups(BackupableDB bdb, + int expectedNumberOfBackups) throws RocksDBException { + // Verify that backups exist + assertThat(bdb.getCorruptedBackups().length). 
+ isEqualTo(0); + bdb.garbageCollect(); + List backupInfo = bdb.getBackupInfos(); + assertThat(backupInfo.size()). + isEqualTo(expectedNumberOfBackups); + return backupInfo; + } + + /** + * Verify backups. + * + * @param rdb {@link RestoreBackupableDB} instance. + * @param expectedNumberOfBackups numerical value + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + private List verifyNumberOfValidBackups( + RestoreBackupableDB rdb, int expectedNumberOfBackups) + throws RocksDBException { + // Verify that backups exist + assertThat(rdb.getCorruptedBackups().length). + isEqualTo(0); + rdb.garbageCollect(); + List backupInfo = rdb.getBackupInfos(); + assertThat(backupInfo.size()). + isEqualTo(expectedNumberOfBackups); + return backupInfo; + } + + /** + * Fill database with some test values. + * + * @param db {@link RocksDB} instance. + * @throws RocksDBException thrown if an error occurs within the native + * part of the library. + */ + private void prepareDatabase(RocksDB db) + throws RocksDBException { + db.put("key1".getBytes(), "valueV1".getBytes()); + db.put("key2".getBytes(), "valueV1".getBytes()); + } +} diff --git a/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java new file mode 100644 index 000000000..aacf44054 --- /dev/null +++ b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java @@ -0,0 +1,185 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class BlockBasedTableConfigTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void noBlockCache() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setNoBlockCache(true); + assertThat(blockBasedTableConfig.noBlockCache()).isTrue(); + } + + @Test + public void blockCacheSize() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setBlockCacheSize(8 * 1024); + assertThat(blockBasedTableConfig.blockCacheSize()). + isEqualTo(8 * 1024); + } + + @Test + public void blockSizeDeviation() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setBlockSizeDeviation(12); + assertThat(blockBasedTableConfig.blockSizeDeviation()). + isEqualTo(12); + } + + @Test + public void blockRestartInterval() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setBlockRestartInterval(15); + assertThat(blockBasedTableConfig.blockRestartInterval()). + isEqualTo(15); + } + + @Test + public void wholeKeyFiltering() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setWholeKeyFiltering(false); + assertThat(blockBasedTableConfig.wholeKeyFiltering()). + isFalse(); + } + + @Test + public void cacheIndexAndFilterBlocks() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setCacheIndexAndFilterBlocks(true); + assertThat(blockBasedTableConfig.cacheIndexAndFilterBlocks()). 
+ isTrue(); + + } + + @Test + public void hashIndexAllowCollision() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setHashIndexAllowCollision(false); + assertThat(blockBasedTableConfig.hashIndexAllowCollision()). + isFalse(); + } + + @Test + public void blockCacheCompressedSize() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setBlockCacheCompressedSize(40); + assertThat(blockBasedTableConfig.blockCacheCompressedSize()). + isEqualTo(40); + } + + @Test + public void checksumType() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + assertThat(ChecksumType.values().length).isEqualTo(3); + assertThat(ChecksumType.valueOf("kxxHash")). + isEqualTo(ChecksumType.kxxHash); + blockBasedTableConfig.setChecksumType(ChecksumType.kNoChecksum); + blockBasedTableConfig.setChecksumType(ChecksumType.kxxHash); + assertThat(blockBasedTableConfig.checksumType().equals( + ChecksumType.kxxHash)); + } + + @Test + public void indexType() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + assertThat(IndexType.values().length).isEqualTo(2); + blockBasedTableConfig.setIndexType(IndexType.kHashSearch); + assertThat(blockBasedTableConfig.indexType().equals( + IndexType.kHashSearch)); + assertThat(IndexType.valueOf("kBinarySearch")).isNotNull(); + blockBasedTableConfig.setIndexType(IndexType.valueOf("kBinarySearch")); + assertThat(blockBasedTableConfig.indexType().equals( + IndexType.kBinarySearch)); + } + + @Test + public void blockCacheCompressedNumShardBits() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setBlockCacheCompressedNumShardBits(4); + assertThat(blockBasedTableConfig.blockCacheCompressedNumShardBits()). 
+ isEqualTo(4); + } + + @Test + public void cacheNumShardBits() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setCacheNumShardBits(5); + assertThat(blockBasedTableConfig.cacheNumShardBits()). + isEqualTo(5); + } + + @Test + public void blockSize() { + BlockBasedTableConfig blockBasedTableConfig = new BlockBasedTableConfig(); + blockBasedTableConfig.setBlockSize(10); + assertThat(blockBasedTableConfig.blockSize()).isEqualTo(10); + } + + + @Test + public void blockBasedTableWithFilter() { + Options options = null; + try { + options = new Options(); + options.setTableFormatConfig( + new BlockBasedTableConfig().setFilter( + new BloomFilter(10))); + assertThat(options.tableFactoryName()). + isEqualTo("BlockBasedTable"); + } finally { + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void blockBasedTableWithoutFilter() { + Options options = null; + try { + options = new Options(); + options.setTableFormatConfig( + new BlockBasedTableConfig().setFilter(null)); + assertThat(options.tableFactoryName()). 
+ isEqualTo("BlockBasedTable"); + } finally { + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void blockBasedTableFormatVersion() { + BlockBasedTableConfig config = new BlockBasedTableConfig(); + for (int version=0; version<=2; version++) { + config.setFormatVersion(version); + assertThat(config.formatVersion()).isEqualTo(version); + } + } + + @Test(expected = AssertionError.class) + public void blockBasedTableFormatVersionFailNegative() { + BlockBasedTableConfig config = new BlockBasedTableConfig(); + config.setFormatVersion(-1); + } + + @Test(expected = AssertionError.class) + public void blockBasedTableFormatVersionFailIllegalVersion() { + BlockBasedTableConfig config = new BlockBasedTableConfig(); + config.setFormatVersion(3); + } +} diff --git a/java/src/test/java/org/rocksdb/CheckPointTest.java b/java/src/test/java/org/rocksdb/CheckPointTest.java new file mode 100644 index 000000000..3081e585a --- /dev/null +++ b/java/src/test/java/org/rocksdb/CheckPointTest.java @@ -0,0 +1,97 @@ +package org.rocksdb; + + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import static org.assertj.core.api.Assertions.assertThat; + +public class CheckPointTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Rule + public TemporaryFolder checkpointFolder = new TemporaryFolder(); + + @Test + public void checkPoint() throws RocksDBException { + RocksDB db = null; + Options options = null; + Checkpoint checkpoint = null; + try { + options = new Options(). + setCreateIfMissing(true); + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.put("key".getBytes(), "value".getBytes()); + checkpoint = Checkpoint.create(db); + checkpoint.createCheckpoint(checkpointFolder. 
+ getRoot().getAbsolutePath() + "/snapshot1"); + db.put("key2".getBytes(), "value2".getBytes()); + checkpoint.createCheckpoint(checkpointFolder. + getRoot().getAbsolutePath() + "/snapshot2"); + db.close(); + db = RocksDB.open(options, + checkpointFolder.getRoot().getAbsolutePath() + + "/snapshot1"); + assertThat(new String(db.get("key".getBytes()))). + isEqualTo("value"); + assertThat(db.get("key2".getBytes())).isNull(); + db.close(); + db = RocksDB.open(options, + checkpointFolder.getRoot().getAbsolutePath() + + "/snapshot2"); + assertThat(new String(db.get("key".getBytes()))). + isEqualTo("value"); + assertThat(new String(db.get("key2".getBytes()))). + isEqualTo("value2"); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + if (checkpoint != null) { + checkpoint.dispose(); + } + } + } + + @Test(expected = IllegalArgumentException.class) + public void failIfDbIsNull() { + Checkpoint.create(null); + } + + @Test(expected = IllegalStateException.class) + public void failIfDbNotInitialized() throws RocksDBException { + RocksDB db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + db.dispose(); + Checkpoint.create(db); + } + + @Test(expected = RocksDBException.class) + public void failWithIllegalPath() throws RocksDBException { + RocksDB db = null; + Checkpoint checkpoint = null; + try { + db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + checkpoint = Checkpoint.create(db); + checkpoint.createCheckpoint("/Z:///:\\C:\\TZ/-"); + } finally { + if (db != null) { + db.close(); + } + if (checkpoint != null) { + checkpoint.dispose(); + } + } + } +} diff --git a/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java b/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java new file mode 100644 index 000000000..4082c602d --- /dev/null +++ b/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java @@ -0,0 +1,639 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Test; + +import java.util.Properties; +import java.util.Random; + +import static org.assertj.core.api.Assertions.assertThat; + +public class ColumnFamilyOptionsTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + public static final Random rand = PlatformRandomHelper. + getPlatformSpecificRandomFactory(); + + @Test + public void getColumnFamilyOptionsFromProps() { + ColumnFamilyOptions opt = null; + try { + // setup sample properties + Properties properties = new Properties(); + properties.put("write_buffer_size", "112"); + properties.put("max_write_buffer_number", "13"); + opt = ColumnFamilyOptions. + getColumnFamilyOptionsFromProps(properties); + assertThat(opt).isNotNull(); + assertThat(String.valueOf(opt.writeBufferSize())). + isEqualTo(properties.get("write_buffer_size")); + assertThat(String.valueOf(opt.maxWriteBufferNumber())). + isEqualTo(properties.get("max_write_buffer_number")); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void failColumnFamilyOptionsFromPropsWithIllegalValue() { + ColumnFamilyOptions opt = null; + try { + // setup sample properties + Properties properties = new Properties(); + properties.put("tomato", "1024"); + properties.put("burger", "2"); + opt = ColumnFamilyOptions. 
+ getColumnFamilyOptionsFromProps(properties); + assertThat(opt).isNull(); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test(expected = IllegalArgumentException.class) + public void failColumnFamilyOptionsFromPropsWithNullValue() { + ColumnFamilyOptions.getColumnFamilyOptionsFromProps(null); + } + + @Test(expected = IllegalArgumentException.class) + public void failColumnFamilyOptionsFromPropsWithEmptyProps() { + ColumnFamilyOptions.getColumnFamilyOptionsFromProps( + new Properties()); + } + + @Test + public void writeBufferSize() throws RocksDBException { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + long longValue = rand.nextLong(); + opt.setWriteBufferSize(longValue); + assertThat(opt.writeBufferSize()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxWriteBufferNumber() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + int intValue = rand.nextInt(); + opt.setMaxWriteBufferNumber(intValue); + assertThat(opt.maxWriteBufferNumber()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void minWriteBufferNumberToMerge() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + int intValue = rand.nextInt(); + opt.setMinWriteBufferNumberToMerge(intValue); + assertThat(opt.minWriteBufferNumberToMerge()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void numLevels() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + int intValue = rand.nextInt(); + opt.setNumLevels(intValue); + assertThat(opt.numLevels()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void levelZeroFileNumCompactionTrigger() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + int intValue = rand.nextInt(); + 
opt.setLevelZeroFileNumCompactionTrigger(intValue); + assertThat(opt.levelZeroFileNumCompactionTrigger()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void levelZeroSlowdownWritesTrigger() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + int intValue = rand.nextInt(); + opt.setLevelZeroSlowdownWritesTrigger(intValue); + assertThat(opt.levelZeroSlowdownWritesTrigger()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void levelZeroStopWritesTrigger() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + int intValue = rand.nextInt(); + opt.setLevelZeroStopWritesTrigger(intValue); + assertThat(opt.levelZeroStopWritesTrigger()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxMemCompactionLevel() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + int intValue = rand.nextInt(); + opt.setMaxMemCompactionLevel(intValue); + assertThat(opt.maxMemCompactionLevel()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void targetFileSizeBase() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + long longValue = rand.nextLong(); + opt.setTargetFileSizeBase(longValue); + assertThat(opt.targetFileSizeBase()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void targetFileSizeMultiplier() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + int intValue = rand.nextInt(); + opt.setTargetFileSizeMultiplier(intValue); + assertThat(opt.targetFileSizeMultiplier()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxBytesForLevelBase() { + ColumnFamilyOptions opt = null; + try { + opt = new 
ColumnFamilyOptions(); + long longValue = rand.nextLong(); + opt.setMaxBytesForLevelBase(longValue); + assertThat(opt.maxBytesForLevelBase()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxBytesForLevelMultiplier() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + int intValue = rand.nextInt(); + opt.setMaxBytesForLevelMultiplier(intValue); + assertThat(opt.maxBytesForLevelMultiplier()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void expandedCompactionFactor() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + int intValue = rand.nextInt(); + opt.setExpandedCompactionFactor(intValue); + assertThat(opt.expandedCompactionFactor()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void sourceCompactionFactor() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + int intValue = rand.nextInt(); + opt.setSourceCompactionFactor(intValue); + assertThat(opt.sourceCompactionFactor()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxGrandparentOverlapFactor() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + int intValue = rand.nextInt(); + opt.setMaxGrandparentOverlapFactor(intValue); + assertThat(opt.maxGrandparentOverlapFactor()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void softRateLimit() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + double doubleValue = rand.nextDouble(); + opt.setSoftRateLimit(doubleValue); + assertThat(opt.softRateLimit()).isEqualTo(doubleValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void hardRateLimit() { + ColumnFamilyOptions opt = null; + try { + opt 
= new ColumnFamilyOptions(); + double doubleValue = rand.nextDouble(); + opt.setHardRateLimit(doubleValue); + assertThat(opt.hardRateLimit()).isEqualTo(doubleValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void rateLimitDelayMaxMilliseconds() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + int intValue = rand.nextInt(); + opt.setRateLimitDelayMaxMilliseconds(intValue); + assertThat(opt.rateLimitDelayMaxMilliseconds()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void arenaBlockSize() throws RocksDBException { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + long longValue = rand.nextLong(); + opt.setArenaBlockSize(longValue); + assertThat(opt.arenaBlockSize()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void disableAutoCompactions() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + boolean boolValue = rand.nextBoolean(); + opt.setDisableAutoCompactions(boolValue); + assertThat(opt.disableAutoCompactions()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void purgeRedundantKvsWhileFlush() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + boolean boolValue = rand.nextBoolean(); + opt.setPurgeRedundantKvsWhileFlush(boolValue); + assertThat(opt.purgeRedundantKvsWhileFlush()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void verifyChecksumsInCompaction() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + boolean boolValue = rand.nextBoolean(); + opt.setVerifyChecksumsInCompaction(boolValue); + assertThat(opt.verifyChecksumsInCompaction()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public 
void filterDeletes() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + boolean boolValue = rand.nextBoolean(); + opt.setFilterDeletes(boolValue); + assertThat(opt.filterDeletes()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxSequentialSkipInIterations() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + long longValue = rand.nextLong(); + opt.setMaxSequentialSkipInIterations(longValue); + assertThat(opt.maxSequentialSkipInIterations()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void inplaceUpdateSupport() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + boolean boolValue = rand.nextBoolean(); + opt.setInplaceUpdateSupport(boolValue); + assertThat(opt.inplaceUpdateSupport()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void inplaceUpdateNumLocks() throws RocksDBException { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + long longValue = rand.nextLong(); + opt.setInplaceUpdateNumLocks(longValue); + assertThat(opt.inplaceUpdateNumLocks()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void memtablePrefixBloomBits() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + int intValue = rand.nextInt(); + opt.setMemtablePrefixBloomBits(intValue); + assertThat(opt.memtablePrefixBloomBits()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void memtablePrefixBloomProbes() { + ColumnFamilyOptions opt = null; + try { + int intValue = rand.nextInt(); + opt = new ColumnFamilyOptions(); + opt.setMemtablePrefixBloomProbes(intValue); + assertThat(opt.memtablePrefixBloomProbes()).isEqualTo(intValue); + } finally { + if (opt != null) { + 
opt.dispose(); + } + } + } + + @Test + public void bloomLocality() { + ColumnFamilyOptions opt = null; + try { + int intValue = rand.nextInt(); + opt = new ColumnFamilyOptions(); + opt.setBloomLocality(intValue); + assertThat(opt.bloomLocality()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxSuccessiveMerges() throws RocksDBException { + ColumnFamilyOptions opt = null; + try { + long longValue = rand.nextLong(); + opt = new ColumnFamilyOptions(); + opt.setMaxSuccessiveMerges(longValue); + assertThat(opt.maxSuccessiveMerges()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void minPartialMergeOperands() { + ColumnFamilyOptions opt = null; + try { + int intValue = rand.nextInt(); + opt = new ColumnFamilyOptions(); + opt.setMinPartialMergeOperands(intValue); + assertThat(opt.minPartialMergeOperands()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void memTable() throws RocksDBException { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + opt.setMemTableConfig(new HashLinkedListMemTableConfig()); + assertThat(opt.memTableFactoryName()). 
+ isEqualTo("HashLinkedListRepFactory"); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void comparator() throws RocksDBException { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + opt.setComparator(BuiltinComparator.BYTEWISE_COMPARATOR); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void linkageOfPrepMethods() { + ColumnFamilyOptions options = null; + try { + options = new ColumnFamilyOptions(); + options.optimizeUniversalStyleCompaction(); + options.optimizeUniversalStyleCompaction(4000); + options.optimizeLevelStyleCompaction(); + options.optimizeLevelStyleCompaction(3000); + options.optimizeForPointLookup(10); + } finally { + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void shouldSetTestPrefixExtractor() { + ColumnFamilyOptions options = null; + try { + options = new ColumnFamilyOptions(); + options.useFixedLengthPrefixExtractor(100); + options.useFixedLengthPrefixExtractor(10); + } finally { + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void compressionTypes() { + ColumnFamilyOptions ColumnFamilyOptions = null; + try { + ColumnFamilyOptions = new ColumnFamilyOptions(); + for (CompressionType compressionType : + CompressionType.values()) { + ColumnFamilyOptions.setCompressionType(compressionType); + assertThat(ColumnFamilyOptions.compressionType()). + isEqualTo(compressionType); + assertThat(CompressionType.valueOf("NO_COMPRESSION")). 
+ isEqualTo(CompressionType.NO_COMPRESSION); + } + } finally { + if (ColumnFamilyOptions != null) { + ColumnFamilyOptions.dispose(); + } + } + } + + @Test + public void compactionStyles() { + ColumnFamilyOptions ColumnFamilyOptions = null; + try { + ColumnFamilyOptions = new ColumnFamilyOptions(); + for (CompactionStyle compactionStyle : + CompactionStyle.values()) { + ColumnFamilyOptions.setCompactionStyle(compactionStyle); + assertThat(ColumnFamilyOptions.compactionStyle()). + isEqualTo(compactionStyle); + assertThat(CompactionStyle.valueOf("FIFO")). + isEqualTo(CompactionStyle.FIFO); + } + } finally { + if (ColumnFamilyOptions != null) { + ColumnFamilyOptions.dispose(); + } + } + } +} diff --git a/java/src/test/java/org/rocksdb/ColumnFamilyTest.java b/java/src/test/java/org/rocksdb/ColumnFamilyTest.java new file mode 100644 index 000000000..9a860ebe8 --- /dev/null +++ b/java/src/test/java/org/rocksdb/ColumnFamilyTest.java @@ -0,0 +1,607 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +package org.rocksdb; + +import java.util.HashMap; +import java.util.List; +import java.util.ArrayList; +import java.util.Map; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import static org.assertj.core.api.Assertions.assertThat; + +public class ColumnFamilyTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void listColumnFamilies() throws RocksDBException { + RocksDB db = null; + Options options = null; + try { + options = new Options(); + options.setCreateIfMissing(true); + + DBOptions dbOptions = new DBOptions(); + dbOptions.setCreateIfMissing(true); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + // Test listColumnFamilies + List columnFamilyNames; + columnFamilyNames = RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); + assertThat(columnFamilyNames).isNotNull(); + assertThat(columnFamilyNames.size()).isGreaterThan(0); + assertThat(columnFamilyNames.size()).isEqualTo(1); + assertThat(new String(columnFamilyNames.get(0))).isEqualTo("default"); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void defaultColumnFamily() throws RocksDBException { + RocksDB db = null; + Options options = null; + try { + options = new Options(); + options.setCreateIfMissing(true); + + DBOptions dbOptions = new DBOptions(); + dbOptions.setCreateIfMissing(true); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + ColumnFamilyHandle cfh = db.getDefaultColumnFamily(); + assertThat(cfh).isNotNull(); + + final byte[] key = "key".getBytes(); + final byte[] value = "value".getBytes(); + + db.put(cfh, key, value); + + final byte[] actualValue = db.get(cfh, key); + + assertThat(cfh).isNotNull(); + 
assertThat(actualValue).isEqualTo(value); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void createColumnFamily() throws RocksDBException { + RocksDB db = null; + Options options = null; + try { + options = new Options(); + options.setCreateIfMissing(true); + + DBOptions dbOptions = new DBOptions(); + dbOptions.setCreateIfMissing(true); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + db.createColumnFamily(new ColumnFamilyDescriptor("new_cf".getBytes(), + new ColumnFamilyOptions())); + db.close(); + List columnFamilyNames; + columnFamilyNames = RocksDB.listColumnFamilies(options, dbFolder.getRoot().getAbsolutePath()); + assertThat(columnFamilyNames).isNotNull(); + assertThat(columnFamilyNames.size()).isGreaterThan(0); + assertThat(columnFamilyNames.size()).isEqualTo(2); + assertThat(new String(columnFamilyNames.get(0))).isEqualTo("default"); + assertThat(new String(columnFamilyNames.get(1))).isEqualTo("new_cf"); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void openWithColumnFamilies() throws RocksDBException { + RocksDB db = null; + DBOptions options = null; + try { + options = new DBOptions(); + options.setCreateIfMissing(true); + options.setCreateMissingColumnFamilies(true); + // Test open database with column family names + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList); + assertThat(columnFamilyHandleList.size()).isEqualTo(2); + db.put("dfkey1".getBytes(), "dfvalue".getBytes()); + db.put(columnFamilyHandleList.get(0), "dfkey2".getBytes(), + "dfvalue".getBytes()); + 
db.put(columnFamilyHandleList.get(1), "newcfkey1".getBytes(), + "newcfvalue".getBytes()); + + String retVal = new String(db.get(columnFamilyHandleList.get(1), + "newcfkey1".getBytes())); + assertThat(retVal).isEqualTo("newcfvalue"); + assertThat((db.get(columnFamilyHandleList.get(1), + "dfkey1".getBytes()))).isNull(); + db.remove(columnFamilyHandleList.get(1), "newcfkey1".getBytes()); + assertThat((db.get(columnFamilyHandleList.get(1), + "newcfkey1".getBytes()))).isNull(); + db.remove(columnFamilyHandleList.get(0), new WriteOptions(), + "dfkey2".getBytes()); + assertThat(db.get(columnFamilyHandleList.get(0), new ReadOptions(), + "dfkey2".getBytes())).isNull(); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void getWithOutValueAndCf() throws RocksDBException { + RocksDB db = null; + DBOptions options = null; + try { + options = new DBOptions(); + options.setCreateIfMissing(true); + options.setCreateMissingColumnFamilies(true); + // Test open database with column family names + List cfDescriptors = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, columnFamilyHandleList); + db.put(columnFamilyHandleList.get(0), new WriteOptions(), + "key1".getBytes(), "value".getBytes()); + db.put("key2".getBytes(), "12345678".getBytes()); + byte[] outValue = new byte[5]; + // not found value + int getResult = db.get("keyNotFound".getBytes(), outValue); + assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND); + // found value which fits in outValue + getResult = db.get(columnFamilyHandleList.get(0), "key1".getBytes(), outValue); + assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(outValue).isEqualTo("value".getBytes()); + // found value which fits partially + getResult = 
db.get(columnFamilyHandleList.get(0), new ReadOptions(), + "key2".getBytes(), outValue); + assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(outValue).isEqualTo("12345".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void createWriteDropColumnFamily() throws RocksDBException { + RocksDB db = null; + DBOptions opt = null; + ColumnFamilyHandle tmpColumnFamilyHandle = null; + try { + opt = new DBOptions(); + opt.setCreateIfMissing(true); + opt.setCreateMissingColumnFamilies(true); + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); + + db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList); + tmpColumnFamilyHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("tmpCF".getBytes(), new ColumnFamilyOptions())); + db.put(tmpColumnFamilyHandle, "key".getBytes(), "value".getBytes()); + db.dropColumnFamily(tmpColumnFamilyHandle); + tmpColumnFamilyHandle.dispose(); + } finally { + if (tmpColumnFamilyHandle != null) { + tmpColumnFamilyHandle.dispose(); + } + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void writeBatch() throws RocksDBException { + RocksDB db = null; + DBOptions opt = null; + try { + opt = new DBOptions(); + opt.setCreateIfMissing(true); + opt.setCreateMissingColumnFamilies(true); + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions().setMergeOperator(new StringAppendOperator()))); + cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); + + db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath(), + cfNames, 
columnFamilyHandleList); + + WriteBatch writeBatch = new WriteBatch(); + WriteOptions writeOpt = new WriteOptions(); + writeBatch.put("key".getBytes(), "value".getBytes()); + writeBatch.put(db.getDefaultColumnFamily(), + "mergeKey".getBytes(), "merge".getBytes()); + writeBatch.merge(db.getDefaultColumnFamily(), "mergeKey".getBytes(), + "merge".getBytes()); + writeBatch.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), + "value".getBytes()); + writeBatch.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), + "value2".getBytes()); + writeBatch.remove("xyz".getBytes()); + writeBatch.remove(columnFamilyHandleList.get(1), "xyz".getBytes()); + db.write(writeOpt, writeBatch); + writeBatch.dispose(); + assertThat(db.get(columnFamilyHandleList.get(1), + "xyz".getBytes()) == null); + assertThat(new String(db.get(columnFamilyHandleList.get(1), + "newcfkey".getBytes()))).isEqualTo("value"); + assertThat(new String(db.get(columnFamilyHandleList.get(1), + "newcfkey2".getBytes()))).isEqualTo("value2"); + assertThat(new String(db.get("key".getBytes()))).isEqualTo("value"); + // check if key is merged + assertThat(new String(db.get(db.getDefaultColumnFamily(), + "mergeKey".getBytes()))).isEqualTo("merge,merge"); + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void iteratorOnColumnFamily() throws RocksDBException { + RocksDB db = null; + DBOptions options = null; + RocksIterator rocksIterator = null; + try { + options = new DBOptions(); + options.setCreateIfMissing(true); + options.setCreateMissingColumnFamilies(true); + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList); + db.put(columnFamilyHandleList.get(1), 
"newcfkey".getBytes(), + "value".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey2".getBytes(), + "value2".getBytes()); + rocksIterator = db.newIterator( + columnFamilyHandleList.get(1)); + rocksIterator.seekToFirst(); + Map refMap = new HashMap<>(); + refMap.put("newcfkey", "value"); + refMap.put("newcfkey2", "value2"); + int i = 0; + while (rocksIterator.isValid()) { + i++; + assertThat(refMap.get(new String(rocksIterator.key()))). + isEqualTo(new String(rocksIterator.value())); + rocksIterator.next(); + } + assertThat(i).isEqualTo(2); + rocksIterator.dispose(); + } finally { + if (rocksIterator != null) { + rocksIterator.dispose(); + } + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void multiGet() throws RocksDBException { + RocksDB db = null; + DBOptions options = null; + try { + options = new DBOptions(); + options.setCreateIfMissing(true); + options.setCreateMissingColumnFamilies(true); + List cfDescriptors = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfDescriptors.add(new ColumnFamilyDescriptor("new_cf".getBytes())); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, columnFamilyHandleList); + db.put(columnFamilyHandleList.get(0), "key".getBytes(), "value".getBytes()); + db.put(columnFamilyHandleList.get(1), "newcfkey".getBytes(), "value".getBytes()); + + List keys = new ArrayList<>(); + keys.add("key".getBytes()); + keys.add("newcfkey".getBytes()); + Map retValues = db.multiGet(columnFamilyHandleList, keys); + assertThat(retValues.size()).isEqualTo(2); + assertThat(new String(retValues.get(keys.get(0)))) + .isEqualTo("value"); + assertThat(new String(retValues.get(keys.get(1)))) + .isEqualTo("value"); + retValues = db.multiGet(new ReadOptions(), columnFamilyHandleList, keys); + assertThat(retValues.size()).isEqualTo(2); + 
assertThat(new String(retValues.get(keys.get(0)))) + .isEqualTo("value"); + assertThat(new String(retValues.get(keys.get(1)))) + .isEqualTo("value"); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void properties() throws RocksDBException { + RocksDB db = null; + DBOptions options = null; + try { + options = new DBOptions(); + options.setCreateIfMissing(true); + options.setCreateMissingColumnFamilies(true); + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList); + assertThat(db.getProperty("rocksdb.estimate-num-keys")). + isNotNull(); + assertThat(db.getLongProperty(columnFamilyHandleList.get(0), + "rocksdb.estimate-num-keys")).isGreaterThanOrEqualTo(0); + assertThat(db.getProperty("rocksdb.stats")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(0), + "rocksdb.sstables")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(1), + "rocksdb.estimate-num-keys")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(1), + "rocksdb.stats")).isNotNull(); + assertThat(db.getProperty(columnFamilyHandleList.get(1), + "rocksdb.sstables")).isNotNull(); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + + @Test + public void iterators() throws RocksDBException { + RocksDB db = null; + DBOptions options = null; + try { + options = new DBOptions(); + options.setCreateIfMissing(true); + options.setCreateMissingColumnFamilies(true); + + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new 
ColumnFamilyDescriptor("new_cf".getBytes())); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList); + List iterators = + db.newIterators(columnFamilyHandleList); + assertThat(iterators.size()).isEqualTo(2); + RocksIterator iter = iterators.get(0); + iter.seekToFirst(); + Map defRefMap = new HashMap<>(); + defRefMap.put("dfkey1", "dfvalue"); + defRefMap.put("key", "value"); + while (iter.isValid()) { + assertThat(defRefMap.get(new String(iter.key()))). + isEqualTo(new String(iter.value())); + iter.next(); + } + // iterate over new_cf key/value pairs + Map cfRefMap = new HashMap<>(); + cfRefMap.put("newcfkey", "value"); + cfRefMap.put("newcfkey2", "value2"); + iter = iterators.get(1); + iter.seekToFirst(); + while (iter.isValid()) { + assertThat(cfRefMap.get(new String(iter.key()))). + isEqualTo(new String(iter.value())); + iter.next(); + } + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test(expected = RocksDBException.class) + public void failPutDisposedCF() throws RocksDBException { + RocksDB db = null; + DBOptions options = null; + try { + options = new DBOptions(); + options.setCreateIfMissing(true); + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList); + db.dropColumnFamily(columnFamilyHandleList.get(1)); + db.put(columnFamilyHandleList.get(1), "key".getBytes(), "value".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test(expected = RocksDBException.class) + public void failRemoveDisposedCF() throws RocksDBException { + RocksDB db = null; + DBOptions options = null; + try { + options = new 
DBOptions(); + options.setCreateIfMissing(true); + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList); + db.dropColumnFamily(columnFamilyHandleList.get(1)); + db.remove(columnFamilyHandleList.get(1), "key".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test(expected = RocksDBException.class) + public void failGetDisposedCF() throws RocksDBException { + RocksDB db = null; + DBOptions options = null; + try { + options = new DBOptions(); + options.setCreateIfMissing(true); + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList); + db.dropColumnFamily(columnFamilyHandleList.get(1)); + db.get(columnFamilyHandleList.get(1), "key".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test(expected = RocksDBException.class) + public void failMultiGetWithoutCorrectNumberOfCF() throws RocksDBException { + RocksDB db = null; + DBOptions options = null; + try { + options = new DBOptions(); + options.setCreateIfMissing(true); + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList); + List keys = new 
ArrayList<>(); + keys.add("key".getBytes()); + keys.add("newcfkey".getBytes()); + List cfCustomList = new ArrayList<>(); + db.multiGet(cfCustomList, keys); + + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + +} diff --git a/java/src/test/java/org/rocksdb/ComparatorOptionsTest.java b/java/src/test/java/org/rocksdb/ComparatorOptionsTest.java new file mode 100644 index 000000000..4f8a7d1a6 --- /dev/null +++ b/java/src/test/java/org/rocksdb/ComparatorOptionsTest.java @@ -0,0 +1,35 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class ComparatorOptionsTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void comparatorOptions() { + final ComparatorOptions copt = new ComparatorOptions(); + + assertThat(copt).isNotNull(); + + { // UseAdaptiveMutex test + copt.setUseAdaptiveMutex(true); + assertThat(copt.useAdaptiveMutex()).isTrue(); + + copt.setUseAdaptiveMutex(false); + assertThat(copt.useAdaptiveMutex()).isFalse(); + } + + copt.dispose(); + } +} diff --git a/java/src/test/java/org/rocksdb/ComparatorTest.java b/java/src/test/java/org/rocksdb/ComparatorTest.java new file mode 100644 index 000000000..e689a9cf5 --- /dev/null +++ b/java/src/test/java/org/rocksdb/ComparatorTest.java @@ -0,0 +1,227 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.io.IOException; +import java.nio.file.FileSystems; + +import static org.assertj.core.api.Assertions.assertThat; + +public class ComparatorTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void javaComparator() throws IOException, RocksDBException { + + final AbstractComparatorTest comparatorTest = new AbstractComparatorTest() { + @Override + public AbstractComparator getAscendingIntKeyComparator() { + return new Comparator(new ComparatorOptions()) { + + @Override + public String name() { + return "test.AscendingIntKeyComparator"; + } + + @Override + public int compare(final Slice a, final Slice b) { + return compareIntKeys(a.data(), b.data()); + } + }; + } + }; + + // test the round-tripability of keys written and read with the Comparator + comparatorTest.testRoundtrip(FileSystems.getDefault().getPath( + dbFolder.getRoot().getAbsolutePath())); + } + + @Test + public void javaComparatorCf() throws IOException, RocksDBException { + + final AbstractComparatorTest comparatorTest = new AbstractComparatorTest() { + @Override + public AbstractComparator getAscendingIntKeyComparator() { + return new Comparator(new ComparatorOptions()) { + + @Override + public String name() { + return "test.AscendingIntKeyComparator"; + } + + @Override + public int compare(final Slice a, final Slice b) { + return compareIntKeys(a.data(), b.data()); + } + }; + } + }; + + // test the round-tripability of keys written and read with the Comparator + comparatorTest.testRoundtripCf(FileSystems.getDefault().getPath( + dbFolder.getRoot().getAbsolutePath())); + } + + @Test + public void 
builtinForwardComparator() + throws RocksDBException { + Options options = null; + RocksDB rocksDB = null; + RocksIterator rocksIterator = null; + try { + options = new Options(); + options.setCreateIfMissing(true); + options.setComparator(BuiltinComparator.BYTEWISE_COMPARATOR); + rocksDB = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + + rocksDB.put("abc1".getBytes(), "abc1".getBytes()); + rocksDB.put("abc2".getBytes(), "abc2".getBytes()); + rocksDB.put("abc3".getBytes(), "abc3".getBytes()); + + rocksIterator = rocksDB.newIterator(); + // Iterate over keys using a iterator + rocksIterator.seekToFirst(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc1".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc1".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc2".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc2".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc3".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc3".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isFalse(); + // Get last one + rocksIterator.seekToLast(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc3".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc3".getBytes()); + // Seek for abc + rocksIterator.seek("abc".getBytes()); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc1".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc1".getBytes()); + + } finally { + if (rocksIterator != null) { + rocksIterator.dispose(); + } + if (rocksDB != null) { + rocksDB.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void 
builtinReverseComparator() + throws RocksDBException { + Options options = null; + RocksDB rocksDB = null; + RocksIterator rocksIterator = null; + try { + options = new Options(); + options.setCreateIfMissing(true); + options.setComparator( + BuiltinComparator.REVERSE_BYTEWISE_COMPARATOR); + rocksDB = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + + rocksDB.put("abc1".getBytes(), "abc1".getBytes()); + rocksDB.put("abc2".getBytes(), "abc2".getBytes()); + rocksDB.put("abc3".getBytes(), "abc3".getBytes()); + + rocksIterator = rocksDB.newIterator(); + // Iterate over keys using a iterator + rocksIterator.seekToFirst(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc3".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc3".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc2".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc2".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc1".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc1".getBytes()); + rocksIterator.next(); + assertThat(rocksIterator.isValid()).isFalse(); + // Get last one + rocksIterator.seekToLast(); + assertThat(rocksIterator.isValid()).isTrue(); + assertThat(rocksIterator.key()).isEqualTo( + "abc1".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc1".getBytes()); + // Will be invalid because abc is after abc1 + rocksIterator.seek("abc".getBytes()); + assertThat(rocksIterator.isValid()).isFalse(); + // Will be abc3 because the next one after abc999 + // is abc3 + rocksIterator.seek("abc999".getBytes()); + assertThat(rocksIterator.key()).isEqualTo( + "abc3".getBytes()); + assertThat(rocksIterator.value()).isEqualTo( + "abc3".getBytes()); + } finally { + if (rocksIterator != null) { + rocksIterator.dispose(); + 
} + if (rocksDB != null) { + rocksDB.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void builtinComparatorEnum(){ + assertThat(BuiltinComparator.BYTEWISE_COMPARATOR.ordinal()) + .isEqualTo(0); + assertThat( + BuiltinComparator.REVERSE_BYTEWISE_COMPARATOR.ordinal()) + .isEqualTo(1); + assertThat(BuiltinComparator.values().length).isEqualTo(2); + assertThat(BuiltinComparator.valueOf("BYTEWISE_COMPARATOR")). + isEqualTo(BuiltinComparator.BYTEWISE_COMPARATOR); + } +} diff --git a/java/src/test/java/org/rocksdb/CompressionOptionsTest.java b/java/src/test/java/org/rocksdb/CompressionOptionsTest.java new file mode 100644 index 000000000..bff4d5f6c --- /dev/null +++ b/java/src/test/java/org/rocksdb/CompressionOptionsTest.java @@ -0,0 +1,21 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import org.junit.Test; + + +public class CompressionOptionsTest +{ + @Test + public void getCompressionType() { + for (CompressionType compressionType : CompressionType.values()) { + String libraryName = compressionType.getLibraryName(); + compressionType.equals(CompressionType.getCompressionType( + libraryName)); + } + } +} diff --git a/java/src/test/java/org/rocksdb/DBOptionsTest.java b/java/src/test/java/org/rocksdb/DBOptionsTest.java new file mode 100644 index 000000000..9dab55955 --- /dev/null +++ b/java/src/test/java/org/rocksdb/DBOptionsTest.java @@ -0,0 +1,601 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Test; + +import java.util.Properties; +import java.util.Random; + +import static org.assertj.core.api.Assertions.assertThat; + +public class DBOptionsTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + public static final Random rand = PlatformRandomHelper. + getPlatformSpecificRandomFactory(); + + @Test + public void getDBOptionsFromProps() { + DBOptions opt = null; + try { + // setup sample properties + Properties properties = new Properties(); + properties.put("allow_mmap_reads", "true"); + properties.put("bytes_per_sync", "13"); + opt = DBOptions.getDBOptionsFromProps(properties); + assertThat(opt).isNotNull(); + assertThat(String.valueOf(opt.allowMmapReads())). + isEqualTo(properties.get("allow_mmap_reads")); + assertThat(String.valueOf(opt.bytesPerSync())). + isEqualTo(properties.get("bytes_per_sync")); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void failDBOptionsFromPropsWithIllegalValue() { + DBOptions opt = null; + try { + // setup sample properties + Properties properties = new Properties(); + properties.put("tomato", "1024"); + properties.put("burger", "2"); + opt = DBOptions. 
+ getDBOptionsFromProps(properties); + assertThat(opt).isNull(); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test(expected = IllegalArgumentException.class) + public void failDBOptionsFromPropsWithNullValue() { + DBOptions.getDBOptionsFromProps(null); + } + + @Test(expected = IllegalArgumentException.class) + public void failDBOptionsFromPropsWithEmptyProps() { + DBOptions.getDBOptionsFromProps( + new Properties()); + } + + @Test + public void setIncreaseParallelism() { + DBOptions opt = null; + try { + opt = new DBOptions(); + final int threads = Runtime.getRuntime().availableProcessors() * 2; + opt.setIncreaseParallelism(threads); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void createIfMissing() { + DBOptions opt = null; + try { + opt = new DBOptions(); + boolean boolValue = rand.nextBoolean(); + opt.setCreateIfMissing(boolValue); + assertThat(opt.createIfMissing()). + isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void createMissingColumnFamilies() { + DBOptions opt = null; + try { + opt = new DBOptions(); + boolean boolValue = rand.nextBoolean(); + opt.setCreateMissingColumnFamilies(boolValue); + assertThat(opt.createMissingColumnFamilies()). + isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void errorIfExists() { + DBOptions opt = null; + try { + opt = new DBOptions(); + boolean boolValue = rand.nextBoolean(); + opt.setErrorIfExists(boolValue); + assertThat(opt.errorIfExists()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void paranoidChecks() { + DBOptions opt = null; + try { + opt = new DBOptions(); + boolean boolValue = rand.nextBoolean(); + opt.setParanoidChecks(boolValue); + assertThat(opt.paranoidChecks()). 
+ isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxTotalWalSize() { + DBOptions opt = null; + try { + opt = new DBOptions(); + long longValue = rand.nextLong(); + opt.setMaxTotalWalSize(longValue); + assertThat(opt.maxTotalWalSize()). + isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxOpenFiles() { + DBOptions opt = null; + try { + opt = new DBOptions(); + int intValue = rand.nextInt(); + opt.setMaxOpenFiles(intValue); + assertThat(opt.maxOpenFiles()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void disableDataSync() { + DBOptions opt = null; + try { + opt = new DBOptions(); + boolean boolValue = rand.nextBoolean(); + opt.setDisableDataSync(boolValue); + assertThat(opt.disableDataSync()). + isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void useFsync() { + DBOptions opt = null; + try { + opt = new DBOptions(); + boolean boolValue = rand.nextBoolean(); + opt.setUseFsync(boolValue); + assertThat(opt.useFsync()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void dbLogDir() { + DBOptions opt = null; + try { + opt = new DBOptions(); + String str = "path/to/DbLogDir"; + opt.setDbLogDir(str); + assertThat(opt.dbLogDir()).isEqualTo(str); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void walDir() { + DBOptions opt = null; + try { + opt = new DBOptions(); + String str = "path/to/WalDir"; + opt.setWalDir(str); + assertThat(opt.walDir()).isEqualTo(str); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void deleteObsoleteFilesPeriodMicros() { + DBOptions opt = null; + try { + opt = new DBOptions(); + long longValue = rand.nextLong(); + opt.setDeleteObsoleteFilesPeriodMicros(longValue); + 
assertThat(opt.deleteObsoleteFilesPeriodMicros()). + isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxBackgroundCompactions() { + DBOptions opt = null; + try { + opt = new DBOptions(); + int intValue = rand.nextInt(); + opt.setMaxBackgroundCompactions(intValue); + assertThat(opt.maxBackgroundCompactions()). + isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxBackgroundFlushes() { + DBOptions opt = null; + try { + opt = new DBOptions(); + int intValue = rand.nextInt(); + opt.setMaxBackgroundFlushes(intValue); + assertThat(opt.maxBackgroundFlushes()). + isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxLogFileSize() throws RocksDBException { + DBOptions opt = null; + try { + opt = new DBOptions(); + long longValue = rand.nextLong(); + opt.setMaxLogFileSize(longValue); + assertThat(opt.maxLogFileSize()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void logFileTimeToRoll() throws RocksDBException { + DBOptions opt = null; + try { + opt = new DBOptions(); + long longValue = rand.nextLong(); + opt.setLogFileTimeToRoll(longValue); + assertThat(opt.logFileTimeToRoll()). + isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void keepLogFileNum() throws RocksDBException { + DBOptions opt = null; + try { + opt = new DBOptions(); + long longValue = rand.nextLong(); + opt.setKeepLogFileNum(longValue); + assertThat(opt.keepLogFileNum()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxManifestFileSize() { + DBOptions opt = null; + try { + opt = new DBOptions(); + long longValue = rand.nextLong(); + opt.setMaxManifestFileSize(longValue); + assertThat(opt.maxManifestFileSize()). 
+ isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void tableCacheNumshardbits() { + DBOptions opt = null; + try { + opt = new DBOptions(); + int intValue = rand.nextInt(); + opt.setTableCacheNumshardbits(intValue); + assertThat(opt.tableCacheNumshardbits()). + isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void tableCacheRemoveScanCountLimit() { + DBOptions opt = null; + try { + opt = new DBOptions(); + int intValue = rand.nextInt(); + opt.setTableCacheRemoveScanCountLimit(intValue); + assertThat(opt.tableCacheRemoveScanCountLimit()). + isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void walSizeLimitMB() { + DBOptions opt = null; + try { + opt = new DBOptions(); + long longValue = rand.nextLong(); + opt.setWalSizeLimitMB(longValue); + assertThat(opt.walSizeLimitMB()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void walTtlSeconds() { + DBOptions opt = null; + try { + opt = new DBOptions(); + long longValue = rand.nextLong(); + opt.setWalTtlSeconds(longValue); + assertThat(opt.walTtlSeconds()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void manifestPreallocationSize() throws RocksDBException { + DBOptions opt = null; + try { + opt = new DBOptions(); + long longValue = rand.nextLong(); + opt.setManifestPreallocationSize(longValue); + assertThat(opt.manifestPreallocationSize()). 
+ isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void allowOsBuffer() { + DBOptions opt = null; + try { + opt = new DBOptions(); + boolean boolValue = rand.nextBoolean(); + opt.setAllowOsBuffer(boolValue); + assertThat(opt.allowOsBuffer()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void allowMmapReads() { + DBOptions opt = null; + try { + opt = new DBOptions(); + boolean boolValue = rand.nextBoolean(); + opt.setAllowMmapReads(boolValue); + assertThat(opt.allowMmapReads()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void allowMmapWrites() { + DBOptions opt = null; + try { + opt = new DBOptions(); + boolean boolValue = rand.nextBoolean(); + opt.setAllowMmapWrites(boolValue); + assertThat(opt.allowMmapWrites()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void isFdCloseOnExec() { + DBOptions opt = null; + try { + opt = new DBOptions(); + boolean boolValue = rand.nextBoolean(); + opt.setIsFdCloseOnExec(boolValue); + assertThat(opt.isFdCloseOnExec()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void skipLogErrorOnRecovery() { + DBOptions opt = null; + try { + opt = new DBOptions(); + boolean boolValue = rand.nextBoolean(); + opt.setSkipLogErrorOnRecovery(boolValue); + assertThat(opt.skipLogErrorOnRecovery()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void statsDumpPeriodSec() { + DBOptions opt = null; + try { + opt = new DBOptions(); + int intValue = rand.nextInt(); + opt.setStatsDumpPeriodSec(intValue); + assertThat(opt.statsDumpPeriodSec()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void adviseRandomOnOpen() { + DBOptions opt = null; + try { + opt = 
new DBOptions(); + boolean boolValue = rand.nextBoolean(); + opt.setAdviseRandomOnOpen(boolValue); + assertThat(opt.adviseRandomOnOpen()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void useAdaptiveMutex() { + DBOptions opt = null; + try { + opt = new DBOptions(); + boolean boolValue = rand.nextBoolean(); + opt.setUseAdaptiveMutex(boolValue); + assertThat(opt.useAdaptiveMutex()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void bytesPerSync() { + DBOptions opt = null; + try { + opt = new DBOptions(); + long longValue = rand.nextLong(); + opt.setBytesPerSync(longValue); + assertThat(opt.bytesPerSync()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void rateLimiterConfig() { + DBOptions options = null; + DBOptions anotherOptions = null; + try { + options = new DBOptions(); + RateLimiterConfig rateLimiterConfig = + new GenericRateLimiterConfig(1000, 0, 1); + options.setRateLimiterConfig(rateLimiterConfig); + // Test with parameter initialization + anotherOptions = new DBOptions(); + anotherOptions.setRateLimiterConfig( + new GenericRateLimiterConfig(1000)); + } finally { + if (options != null) { + options.dispose(); + } + if (anotherOptions != null) { + anotherOptions.dispose(); + } + } + } + + @Test + public void statistics() { + DBOptions options = new DBOptions(); + Statistics statistics = options.createStatistics(). 
+ statisticsPtr(); + assertThat(statistics).isNotNull(); + + DBOptions anotherOptions = new DBOptions(); + statistics = anotherOptions.statisticsPtr(); + assertThat(statistics).isNotNull(); + } +} diff --git a/java/src/test/java/org/rocksdb/DirectComparatorTest.java b/java/src/test/java/org/rocksdb/DirectComparatorTest.java new file mode 100644 index 000000000..be84d6647 --- /dev/null +++ b/java/src/test/java/org/rocksdb/DirectComparatorTest.java @@ -0,0 +1,52 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.io.IOException; +import java.nio.file.FileSystems; + +public class DirectComparatorTest { + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void directComparator() throws IOException, RocksDBException { + + final AbstractComparatorTest comparatorTest = new AbstractComparatorTest() { + @Override + public AbstractComparator getAscendingIntKeyComparator() { + return new DirectComparator(new ComparatorOptions()) { + + @Override + public String name() { + return "test.AscendingIntKeyDirectComparator"; + } + + @Override + public int compare(final DirectSlice a, final DirectSlice b) { + final byte ax[] = new byte[4], bx[] = new byte[4]; + a.data().get(ax); + b.data().get(bx); + return compareIntKeys(ax, bx); + } + }; + } + }; + + // test the round-tripability of keys written and read with the DirectComparator + comparatorTest.testRoundtrip(FileSystems.getDefault().getPath( + dbFolder.getRoot().getAbsolutePath())); + } +} diff 
--git a/java/src/test/java/org/rocksdb/DirectSliceTest.java b/java/src/test/java/org/rocksdb/DirectSliceTest.java new file mode 100644 index 000000000..123eed2e7 --- /dev/null +++ b/java/src/test/java/org/rocksdb/DirectSliceTest.java @@ -0,0 +1,106 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Test; + +import java.nio.ByteBuffer; + +import static org.assertj.core.api.Assertions.assertThat; + +public class DirectSliceTest { + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void directSlice() { + DirectSlice directSlice = null; + DirectSlice otherSlice = null; + try { + directSlice = new DirectSlice("abc"); + otherSlice = new DirectSlice("abc"); + assertThat(directSlice.toString()).isEqualTo("abc"); + // clear first slice + directSlice.clear(); + assertThat(directSlice.toString()).isEmpty(); + // get first char in otherslice + assertThat(otherSlice.get(0)).isEqualTo("a".getBytes()[0]); + // remove prefix + otherSlice.removePrefix(1); + assertThat(otherSlice.toString()).isEqualTo("bc"); + } finally { + if (directSlice != null) { + directSlice.dispose(); + } + if (otherSlice != null) { + otherSlice.dispose(); + } + } + } + + @Test + public void directSliceWithByteBuffer() { + DirectSlice directSlice = null; + try { + byte[] data = "Some text".getBytes(); + ByteBuffer buffer = ByteBuffer.allocateDirect(data.length + 1); + buffer.put(data); + buffer.put(data.length, (byte)0); + + directSlice = new DirectSlice(buffer); + assertThat(directSlice.toString()).isEqualTo("Some text"); + } finally { + if (directSlice != null) { + directSlice.dispose(); + } + } + } + + @Test + public void 
directSliceWithByteBufferAndLength() { + DirectSlice directSlice = null; + try { + byte[] data = "Some text".getBytes(); + ByteBuffer buffer = ByteBuffer.allocateDirect(data.length); + buffer.put(data); + directSlice = new DirectSlice(buffer, 4); + assertThat(directSlice.toString()).isEqualTo("Some"); + } finally { + if (directSlice != null) { + directSlice.dispose(); + } + } + } + + @Test(expected = AssertionError.class) + public void directSliceInitWithoutDirectAllocation() { + DirectSlice directSlice = null; + try { + byte[] data = "Some text".getBytes(); + ByteBuffer buffer = ByteBuffer.wrap(data); + directSlice = new DirectSlice(buffer); + } finally { + if (directSlice != null) { + directSlice.dispose(); + } + } + } + + @Test(expected = AssertionError.class) + public void directSlicePrefixInitWithoutDirectAllocation() { + DirectSlice directSlice = null; + try { + byte[] data = "Some text".getBytes(); + ByteBuffer buffer = ByteBuffer.wrap(data); + directSlice = new DirectSlice(buffer, 4); + } finally { + if (directSlice != null) { + directSlice.dispose(); + } + } + } +} diff --git a/java/src/test/java/org/rocksdb/FilterTest.java b/java/src/test/java/org/rocksdb/FilterTest.java new file mode 100644 index 000000000..36ce37970 --- /dev/null +++ b/java/src/test/java/org/rocksdb/FilterTest.java @@ -0,0 +1,47 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Test; + +public class FilterTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void filter() { + Options options = null; + try { + options = new Options(); + // test table config + options.setTableFormatConfig(new BlockBasedTableConfig(). + setFilter(new BloomFilter())); + options.dispose(); + System.gc(); + System.runFinalization(); + // new Bloom filter + options = new Options(); + BlockBasedTableConfig blockConfig = new BlockBasedTableConfig(); + blockConfig.setFilter(new BloomFilter()); + options.setTableFormatConfig(blockConfig); + BloomFilter bloomFilter = new BloomFilter(10); + blockConfig.setFilter(bloomFilter); + options.setTableFormatConfig(blockConfig); + System.gc(); + System.runFinalization(); + blockConfig.setFilter(new BloomFilter(10, false)); + options.setTableFormatConfig(blockConfig); + + } finally { + if (options != null) { + options.dispose(); + } + } + } +} diff --git a/java/src/test/java/org/rocksdb/FlushTest.java b/java/src/test/java/org/rocksdb/FlushTest.java new file mode 100644 index 000000000..94a32d383 --- /dev/null +++ b/java/src/test/java/org/rocksdb/FlushTest.java @@ -0,0 +1,65 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import static org.assertj.core.api.Assertions.assertThat; + +public class FlushTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void flush() throws RocksDBException { + RocksDB db = null; + Options options = null; + WriteOptions wOpt = null; + FlushOptions flushOptions = null; + try { + options = new Options(); + // Setup options + options.setCreateIfMissing(true); + options.setMaxWriteBufferNumber(10); + options.setMinWriteBufferNumberToMerge(10); + wOpt = new WriteOptions(); + flushOptions = new FlushOptions(); + flushOptions.setWaitForFlush(true); + assertThat(flushOptions.waitForFlush()).isTrue(); + wOpt.setDisableWAL(true); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + db.put(wOpt, "key1".getBytes(), "value1".getBytes()); + db.put(wOpt, "key2".getBytes(), "value2".getBytes()); + db.put(wOpt, "key3".getBytes(), "value3".getBytes()); + db.put(wOpt, "key4".getBytes(), "value4".getBytes()); + assertThat(db.getProperty("rocksdb.num-entries-active-mem-table")).isEqualTo("4"); + db.flush(flushOptions); + assertThat(db.getProperty("rocksdb.num-entries-active-mem-table")). 
+ isEqualTo("0"); + } finally { + if (flushOptions != null) { + flushOptions.dispose(); + } + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + if (wOpt != null) { + wOpt.dispose(); + } + + } + } +} diff --git a/java/src/test/java/org/rocksdb/InfoLogLevelTest.java b/java/src/test/java/org/rocksdb/InfoLogLevelTest.java new file mode 100644 index 000000000..39d1ddd1d --- /dev/null +++ b/java/src/test/java/org/rocksdb/InfoLogLevelTest.java @@ -0,0 +1,119 @@ +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.io.IOException; + +import static java.nio.file.Files.readAllBytes; +import static java.nio.file.Paths.get; +import static org.assertj.core.api.Assertions.assertThat; + +public class InfoLogLevelTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void testInfoLogLevel() throws RocksDBException, + IOException { + RocksDB db = null; + try { + db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + db.put("key".getBytes(), "value".getBytes()); + assertThat(getLogContents()).isNotEmpty(); + } finally { + if (db != null) { + db.close(); + } + } + } + + @Test + public void testFatalLogLevel() throws RocksDBException, + IOException { + RocksDB db = null; + Options options = null; + try { + options = new Options(). + setCreateIfMissing(true). + setInfoLogLevel(InfoLogLevel.FATAL_LEVEL); + assertThat(options.infoLogLevel()). + isEqualTo(InfoLogLevel.FATAL_LEVEL); + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.put("key".getBytes(), "value".getBytes()); + // As InfoLogLevel is set to FATAL_LEVEL, here we expect the log + // content to be empty. 
+ assertThat(getLogContents()).isEmpty(); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void testFatalLogLevelWithDBOptions() + throws RocksDBException, IOException { + RocksDB db = null; + Options options = null; + DBOptions dbOptions = null; + try { + dbOptions = new DBOptions(). + setInfoLogLevel(InfoLogLevel.FATAL_LEVEL); + options = new Options(dbOptions, + new ColumnFamilyOptions()). + setCreateIfMissing(true); + assertThat(dbOptions.infoLogLevel()). + isEqualTo(InfoLogLevel.FATAL_LEVEL); + assertThat(options.infoLogLevel()). + isEqualTo(InfoLogLevel.FATAL_LEVEL); + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.put("key".getBytes(), "value".getBytes()); + assertThat(getLogContents()).isEmpty(); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + if (dbOptions != null) { + dbOptions.dispose(); + } + } + } + + @Test(expected = IllegalArgumentException.class) + public void failIfIllegalByteValueProvided() { + InfoLogLevel.getInfoLogLevel((byte)-1); + } + + @Test + public void valueOf() { + assertThat(InfoLogLevel.valueOf("DEBUG_LEVEL")). + isEqualTo(InfoLogLevel.DEBUG_LEVEL); + } + + /** + * Read LOG file contents into String. + * + * @return LOG file contents as String. + * @throws IOException if file is not found. + */ + private String getLogContents() throws IOException { + return new String(readAllBytes(get( + dbFolder.getRoot().getAbsolutePath()+ "/LOG"))); + } +} diff --git a/java/src/test/java/org/rocksdb/KeyMayExistTest.java b/java/src/test/java/org/rocksdb/KeyMayExistTest.java new file mode 100644 index 000000000..f29c2f872 --- /dev/null +++ b/java/src/test/java/org/rocksdb/KeyMayExistTest.java @@ -0,0 +1,91 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.util.ArrayList; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +public class KeyMayExistTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void keyMayExist() throws RocksDBException { + RocksDB db = null; + DBOptions options = null; + try { + options = new DBOptions(); + options.setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + // open database using cf names + List cfDescriptors = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfDescriptors.add(new ColumnFamilyDescriptor("new_cf".getBytes())); + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath(), + cfDescriptors, columnFamilyHandleList); + assertThat(columnFamilyHandleList.size()). + isEqualTo(2); + db.put("key".getBytes(), "value".getBytes()); + // Test without column family + StringBuffer retValue = new StringBuffer(); + boolean exists = db.keyMayExist("key".getBytes(), retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString()). + isEqualTo("value"); + + // Test without column family but with readOptions + retValue = new StringBuffer(); + exists = db.keyMayExist(new ReadOptions(), "key".getBytes(), + retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString()). 
+ isEqualTo("value"); + + // Test with column family + retValue = new StringBuffer(); + exists = db.keyMayExist(columnFamilyHandleList.get(0), "key".getBytes(), + retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString()). + isEqualTo("value"); + + // Test with column family and readOptions + retValue = new StringBuffer(); + exists = db.keyMayExist(new ReadOptions(), + columnFamilyHandleList.get(0), "key".getBytes(), + retValue); + assertThat(exists).isTrue(); + assertThat(retValue.toString()). + isEqualTo("value"); + + // KeyMayExist in CF1 must return false + assertThat(db.keyMayExist(columnFamilyHandleList.get(1), + "key".getBytes(), retValue)).isFalse(); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } +} diff --git a/java/src/test/java/org/rocksdb/MemTableTest.java b/java/src/test/java/org/rocksdb/MemTableTest.java new file mode 100644 index 000000000..bfc898c42 --- /dev/null +++ b/java/src/test/java/org/rocksdb/MemTableTest.java @@ -0,0 +1,137 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class MemTableTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void hashSkipListMemTable() throws RocksDBException { + Options options = null; + try { + options = new Options(); + // Test HashSkipListMemTableConfig + HashSkipListMemTableConfig memTableConfig = + new HashSkipListMemTableConfig(); + assertThat(memTableConfig.bucketCount()). 
+ isEqualTo(1000000); + memTableConfig.setBucketCount(2000000); + assertThat(memTableConfig.bucketCount()). + isEqualTo(2000000); + assertThat(memTableConfig.height()). + isEqualTo(4); + memTableConfig.setHeight(5); + assertThat(memTableConfig.height()). + isEqualTo(5); + assertThat(memTableConfig.branchingFactor()). + isEqualTo(4); + memTableConfig.setBranchingFactor(6); + assertThat(memTableConfig.branchingFactor()). + isEqualTo(6); + options.setMemTableConfig(memTableConfig); + } finally { + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void skipListMemTable() throws RocksDBException { + Options options = null; + try { + options = new Options(); + SkipListMemTableConfig skipMemTableConfig = + new SkipListMemTableConfig(); + assertThat(skipMemTableConfig.lookahead()). + isEqualTo(0); + skipMemTableConfig.setLookahead(20); + assertThat(skipMemTableConfig.lookahead()). + isEqualTo(20); + options.setMemTableConfig(skipMemTableConfig); + options.dispose(); + } finally { + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void hashLinkedListMemTable() throws RocksDBException { + Options options = null; + try { + options = new Options(); + HashLinkedListMemTableConfig hashLinkedListMemTableConfig = + new HashLinkedListMemTableConfig(); + assertThat(hashLinkedListMemTableConfig.bucketCount()). + isEqualTo(50000); + hashLinkedListMemTableConfig.setBucketCount(100000); + assertThat(hashLinkedListMemTableConfig.bucketCount()). + isEqualTo(100000); + assertThat(hashLinkedListMemTableConfig.hugePageTlbSize()). + isEqualTo(0); + hashLinkedListMemTableConfig.setHugePageTlbSize(1); + assertThat(hashLinkedListMemTableConfig.hugePageTlbSize()). + isEqualTo(1); + assertThat(hashLinkedListMemTableConfig. + bucketEntriesLoggingThreshold()). + isEqualTo(4096); + hashLinkedListMemTableConfig. + setBucketEntriesLoggingThreshold(200); + assertThat(hashLinkedListMemTableConfig. + bucketEntriesLoggingThreshold()). 
+ isEqualTo(200); + assertThat(hashLinkedListMemTableConfig. + ifLogBucketDistWhenFlush()).isTrue(); + hashLinkedListMemTableConfig. + setIfLogBucketDistWhenFlush(false); + assertThat(hashLinkedListMemTableConfig. + ifLogBucketDistWhenFlush()).isFalse(); + assertThat(hashLinkedListMemTableConfig. + thresholdUseSkiplist()). + isEqualTo(256); + hashLinkedListMemTableConfig.setThresholdUseSkiplist(29); + assertThat(hashLinkedListMemTableConfig. + thresholdUseSkiplist()). + isEqualTo(29); + options.setMemTableConfig(hashLinkedListMemTableConfig); + } finally { + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void vectorMemTable() throws RocksDBException { + Options options = null; + try { + options = new Options(); + VectorMemTableConfig vectorMemTableConfig = + new VectorMemTableConfig(); + assertThat(vectorMemTableConfig.reservedSize()). + isEqualTo(0); + vectorMemTableConfig.setReservedSize(123); + assertThat(vectorMemTableConfig.reservedSize()). + isEqualTo(123); + options.setMemTableConfig(vectorMemTableConfig); + options.dispose(); + } finally { + if (options != null) { + options.dispose(); + } + } + } +} diff --git a/java/src/test/java/org/rocksdb/MergeTest.java b/java/src/test/java/org/rocksdb/MergeTest.java new file mode 100644 index 000000000..55e8a20cd --- /dev/null +++ b/java/src/test/java/org/rocksdb/MergeTest.java @@ -0,0 +1,300 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+
+package org.rocksdb;
+
+import java.util.List;
+import java.util.ArrayList;
+
+import org.junit.ClassRule;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+public class MergeTest {
+
+  @ClassRule
+  public static final RocksMemoryResource rocksMemoryResource =
+      new RocksMemoryResource();
+
+  @Rule
+  public TemporaryFolder dbFolder = new TemporaryFolder();
+
+  @Test
+  public void stringOption()
+      throws InterruptedException, RocksDBException {
+    RocksDB db = null;
+    Options opt = null;
+    try {
+      String db_path_string =
+          dbFolder.getRoot().getAbsolutePath();
+      opt = new Options();
+      opt.setCreateIfMissing(true);
+      opt.setMergeOperatorName("stringappend");
+
+      db = RocksDB.open(opt, db_path_string);
+      // writing aa under key
+      db.put("key".getBytes(), "aa".getBytes());
+      // merge bb under key
+      db.merge("key".getBytes(), "bb".getBytes());
+
+      byte[] value = db.get("key".getBytes());
+      String strValue = new String(value);
+      assertThat(strValue).isEqualTo("aa,bb");
+    } finally {
+      if (db != null) {
+        db.close();
+      }
+      if (opt != null) {
+        opt.dispose();
+      }
+    }
+  }
+
+  @Test
+  public void cFStringOption()
+      throws InterruptedException, RocksDBException {
+    RocksDB db = null;
+    DBOptions opt = null;
+    List<ColumnFamilyHandle> columnFamilyHandleList =
+        new ArrayList<>();
+    try {
+      String db_path_string =
+          dbFolder.getRoot().getAbsolutePath();
+      opt = new DBOptions();
+      opt.setCreateIfMissing(true);
+      opt.setCreateMissingColumnFamilies(true);
+
+      List<ColumnFamilyDescriptor> cfDescriptors =
+          new ArrayList<>();
+      cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY,
+          new ColumnFamilyOptions().setMergeOperatorName(
+              "stringappend")));
+      cfDescriptors.add(new ColumnFamilyDescriptor("new_cf".getBytes(),
+          new ColumnFamilyOptions().setMergeOperatorName(
+              "stringappend")));
+      db = RocksDB.open(opt, db_path_string,
+          cfDescriptors, columnFamilyHandleList);
+
+      // writing aa under
key + db.put(columnFamilyHandleList.get(1), + "cfkey".getBytes(), "aa".getBytes()); + // merge bb under key + db.merge(columnFamilyHandleList.get(1), + "cfkey".getBytes(), "bb".getBytes()); + + byte[] value = db.get(columnFamilyHandleList.get(1), "cfkey".getBytes()); + String strValue = new String(value); + assertThat(strValue).isEqualTo("aa,bb"); + } finally { + for (ColumnFamilyHandle handle : columnFamilyHandleList) { + handle.dispose(); + } + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void operatorOption() + throws InterruptedException, RocksDBException { + RocksDB db = null; + Options opt = null; + try { + String db_path_string = + dbFolder.getRoot().getAbsolutePath(); + opt = new Options(); + opt.setCreateIfMissing(true); + + StringAppendOperator stringAppendOperator = new StringAppendOperator(); + opt.setMergeOperator(stringAppendOperator); + + db = RocksDB.open(opt, db_path_string); + // Writing aa under key + db.put("key".getBytes(), "aa".getBytes()); + + // Writing bb under key + db.merge("key".getBytes(), "bb".getBytes()); + + byte[] value = db.get("key".getBytes()); + String strValue = new String(value); + + assertThat(strValue).isEqualTo("aa,bb"); + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void cFOperatorOption() + throws InterruptedException, RocksDBException { + RocksDB db = null; + DBOptions opt = null; + ColumnFamilyHandle columnFamilyHandle = null; + try { + String db_path_string = + dbFolder.getRoot().getAbsolutePath(); + opt = new DBOptions(); + opt.setCreateIfMissing(true); + opt.setCreateMissingColumnFamilies(true); + StringAppendOperator stringAppendOperator = new StringAppendOperator(); + + List cfDescriptors = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfDescriptors.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new 
ColumnFamilyOptions().setMergeOperator( + stringAppendOperator))); + cfDescriptors.add(new ColumnFamilyDescriptor("new_cf".getBytes(), + new ColumnFamilyOptions().setMergeOperator( + stringAppendOperator))); + db = RocksDB.open(opt, db_path_string, + cfDescriptors, columnFamilyHandleList); + + // writing aa under key + db.put(columnFamilyHandleList.get(1), + "cfkey".getBytes(), "aa".getBytes()); + // merge bb under key + db.merge(columnFamilyHandleList.get(1), + "cfkey".getBytes(), "bb".getBytes()); + byte[] value = db.get(columnFamilyHandleList.get(1), "cfkey".getBytes()); + String strValue = new String(value); + + // Test also with createColumnFamily + columnFamilyHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf2".getBytes(), + new ColumnFamilyOptions().setMergeOperator(stringAppendOperator))); + // writing xx under cfkey2 + db.put(columnFamilyHandle, "cfkey2".getBytes(), "xx".getBytes()); + // merge yy under cfkey2 + db.merge(columnFamilyHandle, new WriteOptions(), "cfkey2".getBytes(), "yy".getBytes()); + value = db.get(columnFamilyHandle, "cfkey2".getBytes()); + String strValueTmpCf = new String(value); + + columnFamilyHandle.dispose(); + assertThat(strValue).isEqualTo("aa,bb"); + assertThat(strValueTmpCf).isEqualTo("xx,yy"); + } finally { + if (columnFamilyHandle != null) { + columnFamilyHandle.dispose(); + } + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void operatorGcBehaviour() + throws RocksDBException { + Options opt = null; + RocksDB db = null; + try { + String db_path_string = + dbFolder.getRoot().getAbsolutePath(); + opt = new Options(); + opt.setCreateIfMissing(true); + StringAppendOperator stringAppendOperator = new StringAppendOperator(); + opt.setMergeOperator(stringAppendOperator); + db = RocksDB.open(opt, db_path_string); + db.close(); + opt.dispose(); + System.gc(); + System.runFinalization(); + // test reuse + opt = new Options(); + 
opt.setMergeOperator(stringAppendOperator); + db = RocksDB.open(opt, db_path_string); + db.close(); + opt.dispose(); + System.gc(); + System.runFinalization(); + // test param init + opt = new Options(); + opt.setMergeOperator(new StringAppendOperator()); + db = RocksDB.open(opt, db_path_string); + db.close(); + opt.dispose(); + System.gc(); + System.runFinalization(); + // test replace one with another merge operator instance + opt = new Options(); + opt.setMergeOperator(stringAppendOperator); + StringAppendOperator newStringAppendOperator = new StringAppendOperator(); + opt.setMergeOperator(newStringAppendOperator); + db = RocksDB.open(opt, db_path_string); + db.close(); + opt.dispose(); + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void emptyStringInSetMergeOperatorByName() { + Options opt = null; + ColumnFamilyOptions cOpt = null; + try { + opt = new Options(); + cOpt = new ColumnFamilyOptions(); + opt.setMergeOperatorName(""); + cOpt.setMergeOperatorName(""); + } finally { + if (opt != null) { + opt.dispose(); + } + if (cOpt != null) { + cOpt.dispose(); + } + } + } + + @Test(expected = IllegalArgumentException.class) + public void nullStringInSetMergeOperatorByNameOptions() { + Options opt = null; + try { + opt = new Options(); + opt.setMergeOperatorName(null); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test(expected = IllegalArgumentException.class) + public void + nullStringInSetMergeOperatorByNameColumnFamilyOptions() { + ColumnFamilyOptions opt = null; + try { + opt = new ColumnFamilyOptions(); + opt.setMergeOperatorName(null); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } +} diff --git a/java/src/test/java/org/rocksdb/MixedOptionsTest.java b/java/src/test/java/org/rocksdb/MixedOptionsTest.java new file mode 100644 index 000000000..f095e99d8 --- /dev/null +++ b/java/src/test/java/org/rocksdb/MixedOptionsTest.java @@ -0,0 +1,56 @@ +// 
Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class MixedOptionsTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void mixedOptionsTest(){ + // Set a table factory and check the names + ColumnFamilyOptions cfOptions = new ColumnFamilyOptions(); + cfOptions.setTableFormatConfig(new BlockBasedTableConfig(). + setFilter(new BloomFilter())); + assertThat(cfOptions.tableFactoryName()).isEqualTo( + "BlockBasedTable"); + cfOptions.setTableFormatConfig(new PlainTableConfig()); + assertThat(cfOptions.tableFactoryName()).isEqualTo("PlainTable"); + // Initialize a dbOptions object from cf options and + // db options + DBOptions dbOptions = new DBOptions(); + Options options = new Options(dbOptions, cfOptions); + assertThat(options.tableFactoryName()).isEqualTo("PlainTable"); + // Free instances + options.dispose(); + options = null; + cfOptions.dispose(); + cfOptions = null; + dbOptions.dispose(); + dbOptions = null; + System.gc(); + System.runFinalization(); + // Test Optimize for statements + cfOptions = new ColumnFamilyOptions(); + cfOptions.optimizeUniversalStyleCompaction(); + cfOptions.optimizeLevelStyleCompaction(); + cfOptions.optimizeForPointLookup(1024); + options = new Options(); + options.optimizeLevelStyleCompaction(); + options.optimizeLevelStyleCompaction(400); + options.optimizeUniversalStyleCompaction(); + options.optimizeUniversalStyleCompaction(400); + options.optimizeForPointLookup(1024); + options.prepareForBulkLoad(); + } +} diff --git a/java/src/test/java/org/rocksdb/OptionsTest.java 
b/java/src/test/java/org/rocksdb/OptionsTest.java new file mode 100644 index 000000000..5b84d2510 --- /dev/null +++ b/java/src/test/java/org/rocksdb/OptionsTest.java @@ -0,0 +1,1132 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import java.util.Random; +import org.junit.ClassRule; +import org.junit.Test; + +import static org.assertj.core.api.Assertions.assertThat; + + +public class OptionsTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + public static final Random rand = PlatformRandomHelper. + getPlatformSpecificRandomFactory(); + + @Test + public void setIncreaseParallelism() { + Options opt = null; + try { + opt = new Options(); + final int threads = Runtime.getRuntime().availableProcessors() * 2; + opt.setIncreaseParallelism(threads); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void writeBufferSize() throws RocksDBException { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setWriteBufferSize(longValue); + assertThat(opt.writeBufferSize()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxWriteBufferNumber() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setMaxWriteBufferNumber(intValue); + assertThat(opt.maxWriteBufferNumber()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void minWriteBufferNumberToMerge() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setMinWriteBufferNumberToMerge(intValue); + 
assertThat(opt.minWriteBufferNumberToMerge()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void numLevels() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setNumLevels(intValue); + assertThat(opt.numLevels()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void levelZeroFileNumCompactionTrigger() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setLevelZeroFileNumCompactionTrigger(intValue); + assertThat(opt.levelZeroFileNumCompactionTrigger()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void levelZeroSlowdownWritesTrigger() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setLevelZeroSlowdownWritesTrigger(intValue); + assertThat(opt.levelZeroSlowdownWritesTrigger()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void levelZeroStopWritesTrigger() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setLevelZeroStopWritesTrigger(intValue); + assertThat(opt.levelZeroStopWritesTrigger()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxMemCompactionLevel() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setMaxMemCompactionLevel(intValue); + assertThat(opt.maxMemCompactionLevel()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void targetFileSizeBase() { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setTargetFileSizeBase(longValue); + assertThat(opt.targetFileSizeBase()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } 
+ + @Test + public void targetFileSizeMultiplier() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setTargetFileSizeMultiplier(intValue); + assertThat(opt.targetFileSizeMultiplier()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxBytesForLevelBase() { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setMaxBytesForLevelBase(longValue); + assertThat(opt.maxBytesForLevelBase()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxBytesForLevelMultiplier() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setMaxBytesForLevelMultiplier(intValue); + assertThat(opt.maxBytesForLevelMultiplier()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void expandedCompactionFactor() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setExpandedCompactionFactor(intValue); + assertThat(opt.expandedCompactionFactor()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void sourceCompactionFactor() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setSourceCompactionFactor(intValue); + assertThat(opt.sourceCompactionFactor()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxGrandparentOverlapFactor() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setMaxGrandparentOverlapFactor(intValue); + assertThat(opt.maxGrandparentOverlapFactor()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void softRateLimit() { + Options opt = null; + try { + opt = new Options(); + double 
doubleValue = rand.nextDouble(); + opt.setSoftRateLimit(doubleValue); + assertThat(opt.softRateLimit()).isEqualTo(doubleValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void hardRateLimit() { + Options opt = null; + try { + opt = new Options(); + double doubleValue = rand.nextDouble(); + opt.setHardRateLimit(doubleValue); + assertThat(opt.hardRateLimit()).isEqualTo(doubleValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void rateLimitDelayMaxMilliseconds() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setRateLimitDelayMaxMilliseconds(intValue); + assertThat(opt.rateLimitDelayMaxMilliseconds()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void arenaBlockSize() throws RocksDBException { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setArenaBlockSize(longValue); + assertThat(opt.arenaBlockSize()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void disableAutoCompactions() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setDisableAutoCompactions(boolValue); + assertThat(opt.disableAutoCompactions()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void purgeRedundantKvsWhileFlush() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setPurgeRedundantKvsWhileFlush(boolValue); + assertThat(opt.purgeRedundantKvsWhileFlush()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void verifyChecksumsInCompaction() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setVerifyChecksumsInCompaction(boolValue); + 
assertThat(opt.verifyChecksumsInCompaction()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void filterDeletes() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setFilterDeletes(boolValue); + assertThat(opt.filterDeletes()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxSequentialSkipInIterations() { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setMaxSequentialSkipInIterations(longValue); + assertThat(opt.maxSequentialSkipInIterations()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void inplaceUpdateSupport() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setInplaceUpdateSupport(boolValue); + assertThat(opt.inplaceUpdateSupport()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void inplaceUpdateNumLocks() throws RocksDBException { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setInplaceUpdateNumLocks(longValue); + assertThat(opt.inplaceUpdateNumLocks()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void memtablePrefixBloomBits() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setMemtablePrefixBloomBits(intValue); + assertThat(opt.memtablePrefixBloomBits()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void memtablePrefixBloomProbes() { + Options opt = null; + try { + int intValue = rand.nextInt(); + opt = new Options(); + opt.setMemtablePrefixBloomProbes(intValue); + assertThat(opt.memtablePrefixBloomProbes()).isEqualTo(intValue); + } finally { + if (opt != 
null) { + opt.dispose(); + } + } + } + + @Test + public void bloomLocality() { + Options opt = null; + try { + int intValue = rand.nextInt(); + opt = new Options(); + opt.setBloomLocality(intValue); + assertThat(opt.bloomLocality()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxSuccessiveMerges() throws RocksDBException { + Options opt = null; + try { + long longValue = rand.nextLong(); + opt = new Options(); + opt.setMaxSuccessiveMerges(longValue); + assertThat(opt.maxSuccessiveMerges()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void minPartialMergeOperands() { + Options opt = null; + try { + int intValue = rand.nextInt(); + opt = new Options(); + opt.setMinPartialMergeOperands(intValue); + assertThat(opt.minPartialMergeOperands()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void createIfMissing() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setCreateIfMissing(boolValue); + assertThat(opt.createIfMissing()). + isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void createMissingColumnFamilies() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setCreateMissingColumnFamilies(boolValue); + assertThat(opt.createMissingColumnFamilies()). 
+ isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void errorIfExists() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setErrorIfExists(boolValue); + assertThat(opt.errorIfExists()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void paranoidChecks() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setParanoidChecks(boolValue); + assertThat(opt.paranoidChecks()). + isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxTotalWalSize() { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setMaxTotalWalSize(longValue); + assertThat(opt.maxTotalWalSize()). + isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxOpenFiles() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setMaxOpenFiles(intValue); + assertThat(opt.maxOpenFiles()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void disableDataSync() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setDisableDataSync(boolValue); + assertThat(opt.disableDataSync()). 
+ isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void useFsync() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setUseFsync(boolValue); + assertThat(opt.useFsync()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void dbLogDir() { + Options opt = null; + try { + opt = new Options(); + String str = "path/to/DbLogDir"; + opt.setDbLogDir(str); + assertThat(opt.dbLogDir()).isEqualTo(str); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void walDir() { + Options opt = null; + try { + opt = new Options(); + String str = "path/to/WalDir"; + opt.setWalDir(str); + assertThat(opt.walDir()).isEqualTo(str); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void deleteObsoleteFilesPeriodMicros() { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setDeleteObsoleteFilesPeriodMicros(longValue); + assertThat(opt.deleteObsoleteFilesPeriodMicros()). + isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxBackgroundCompactions() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setMaxBackgroundCompactions(intValue); + assertThat(opt.maxBackgroundCompactions()). + isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxBackgroundFlushes() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setMaxBackgroundFlushes(intValue); + assertThat(opt.maxBackgroundFlushes()). 
+ isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxLogFileSize() throws RocksDBException { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setMaxLogFileSize(longValue); + assertThat(opt.maxLogFileSize()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void logFileTimeToRoll() throws RocksDBException { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setLogFileTimeToRoll(longValue); + assertThat(opt.logFileTimeToRoll()). + isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void keepLogFileNum() throws RocksDBException { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setKeepLogFileNum(longValue); + assertThat(opt.keepLogFileNum()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void maxManifestFileSize() { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setMaxManifestFileSize(longValue); + assertThat(opt.maxManifestFileSize()). + isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void tableCacheNumshardbits() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setTableCacheNumshardbits(intValue); + assertThat(opt.tableCacheNumshardbits()). + isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void tableCacheRemoveScanCountLimit() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setTableCacheRemoveScanCountLimit(intValue); + assertThat(opt.tableCacheRemoveScanCountLimit()). 
+ isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void walSizeLimitMB() { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setWalSizeLimitMB(longValue); + assertThat(opt.walSizeLimitMB()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void walTtlSeconds() { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setWalTtlSeconds(longValue); + assertThat(opt.walTtlSeconds()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void manifestPreallocationSize() throws RocksDBException { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setManifestPreallocationSize(longValue); + assertThat(opt.manifestPreallocationSize()). + isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void allowOsBuffer() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setAllowOsBuffer(boolValue); + assertThat(opt.allowOsBuffer()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void allowMmapReads() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setAllowMmapReads(boolValue); + assertThat(opt.allowMmapReads()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void allowMmapWrites() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setAllowMmapWrites(boolValue); + assertThat(opt.allowMmapWrites()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void isFdCloseOnExec() { + Options opt = null; + try { + opt = new Options(); + 
boolean boolValue = rand.nextBoolean(); + opt.setIsFdCloseOnExec(boolValue); + assertThat(opt.isFdCloseOnExec()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void skipLogErrorOnRecovery() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setSkipLogErrorOnRecovery(boolValue); + assertThat(opt.skipLogErrorOnRecovery()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void statsDumpPeriodSec() { + Options opt = null; + try { + opt = new Options(); + int intValue = rand.nextInt(); + opt.setStatsDumpPeriodSec(intValue); + assertThat(opt.statsDumpPeriodSec()).isEqualTo(intValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void adviseRandomOnOpen() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setAdviseRandomOnOpen(boolValue); + assertThat(opt.adviseRandomOnOpen()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void useAdaptiveMutex() { + Options opt = null; + try { + opt = new Options(); + boolean boolValue = rand.nextBoolean(); + opt.setUseAdaptiveMutex(boolValue); + assertThat(opt.useAdaptiveMutex()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void bytesPerSync() { + Options opt = null; + try { + opt = new Options(); + long longValue = rand.nextLong(); + opt.setBytesPerSync(longValue); + assertThat(opt.bytesPerSync()).isEqualTo(longValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void rocksEnv() { + Options options = null; + try { + options = new Options(); + RocksEnv rocksEnv = RocksEnv.getDefault(); + options.setEnv(rocksEnv); + assertThat(options.getEnv()).isSameAs(rocksEnv); + } finally { + if (options != null) { + options.dispose(); + } + } + } 
+ + @Test + public void linkageOfPrepMethods() { + Options options = null; + try { + options = new Options(); + options.optimizeUniversalStyleCompaction(); + options.optimizeUniversalStyleCompaction(4000); + options.optimizeLevelStyleCompaction(); + options.optimizeLevelStyleCompaction(3000); + options.optimizeForPointLookup(10); + options.prepareForBulkLoad(); + } finally { + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void compressionTypes() { + Options options = null; + try { + options = new Options(); + for (CompressionType compressionType : + CompressionType.values()) { + options.setCompressionType(compressionType); + assertThat(options.compressionType()). + isEqualTo(compressionType); + assertThat(CompressionType.valueOf("NO_COMPRESSION")). + isEqualTo(CompressionType.NO_COMPRESSION); + } + } finally { + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void compactionStyles() { + Options options = null; + try { + options = new Options(); + for (CompactionStyle compactionStyle : + CompactionStyle.values()) { + options.setCompactionStyle(compactionStyle); + assertThat(options.compactionStyle()). + isEqualTo(compactionStyle); + assertThat(CompactionStyle.valueOf("FIFO")). 
+ isEqualTo(CompactionStyle.FIFO); + } + } finally { + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void rateLimiterConfig() { + Options options = null; + Options anotherOptions = null; + RateLimiterConfig rateLimiterConfig; + try { + options = new Options(); + rateLimiterConfig = new GenericRateLimiterConfig(1000, 0, 1); + options.setRateLimiterConfig(rateLimiterConfig); + // Test with parameter initialization + anotherOptions = new Options(); + anotherOptions.setRateLimiterConfig( + new GenericRateLimiterConfig(1000)); + } finally { + if (options != null) { + options.dispose(); + } + if (anotherOptions != null) { + anotherOptions.dispose(); + } + } + } + + @Test + public void shouldSetTestPrefixExtractor() { + Options options = null; + try { + options = new Options(); + options.useFixedLengthPrefixExtractor(100); + options.useFixedLengthPrefixExtractor(10); + } finally { + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void shouldTestMemTableFactoryName() + throws RocksDBException { + Options options = null; + try { + options = new Options(); + options.setMemTableConfig(new VectorMemTableConfig()); + assertThat(options.memTableFactoryName()). + isEqualTo("VectorRepFactory"); + options.setMemTableConfig( + new HashLinkedListMemTableConfig()); + assertThat(options.memTableFactoryName()). + isEqualTo("HashLinkedListRepFactory"); + } finally { + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void statistics() { + Options options = null; + Options anotherOptions = null; + try { + options = new Options(); + Statistics statistics = options.createStatistics(). 
+ statisticsPtr(); + assertThat(statistics).isNotNull(); + anotherOptions = new Options(); + statistics = anotherOptions.statisticsPtr(); + assertThat(statistics).isNotNull(); + } finally { + if (options != null) { + options.dispose(); + } + if (anotherOptions != null) { + anotherOptions.dispose(); + } + } + } +} diff --git a/java/src/test/java/org/rocksdb/PlainTableConfigTest.java b/java/src/test/java/org/rocksdb/PlainTableConfigTest.java new file mode 100644 index 000000000..850b050a0 --- /dev/null +++ b/java/src/test/java/org/rocksdb/PlainTableConfigTest.java @@ -0,0 +1,95 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class PlainTableConfigTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void keySize() { + PlainTableConfig plainTableConfig = new PlainTableConfig(); + plainTableConfig.setKeySize(5); + assertThat(plainTableConfig.keySize()). + isEqualTo(5); + } + + @Test + public void bloomBitsPerKey() { + PlainTableConfig plainTableConfig = new PlainTableConfig(); + plainTableConfig.setBloomBitsPerKey(11); + assertThat(plainTableConfig.bloomBitsPerKey()). + isEqualTo(11); + } + + @Test + public void hashTableRatio() { + PlainTableConfig plainTableConfig = new PlainTableConfig(); + plainTableConfig.setHashTableRatio(0.95); + assertThat(plainTableConfig.hashTableRatio()). + isEqualTo(0.95); + } + + @Test + public void indexSparseness() { + PlainTableConfig plainTableConfig = new PlainTableConfig(); + plainTableConfig.setIndexSparseness(18); + assertThat(plainTableConfig.indexSparseness()). 
+ isEqualTo(18); + } + + @Test + public void hugePageTlbSize() { + PlainTableConfig plainTableConfig = new PlainTableConfig(); + plainTableConfig.setHugePageTlbSize(1); + assertThat(plainTableConfig.hugePageTlbSize()). + isEqualTo(1); + } + + @Test + public void encodingType() { + PlainTableConfig plainTableConfig = new PlainTableConfig(); + plainTableConfig.setEncodingType(EncodingType.kPrefix); + assertThat(plainTableConfig.encodingType()).isEqualTo( + EncodingType.kPrefix); + } + + @Test + public void fullScanMode() { + PlainTableConfig plainTableConfig = new PlainTableConfig(); + plainTableConfig.setFullScanMode(true); + assertThat(plainTableConfig.fullScanMode()).isTrue(); } + + @Test + public void storeIndexInFile() { + PlainTableConfig plainTableConfig = new PlainTableConfig(); + plainTableConfig.setStoreIndexInFile(true); + assertThat(plainTableConfig.storeIndexInFile()). + isTrue(); + } + + @Test + public void plainTableConfig() { + Options opt = null; + try { + opt = new Options(); + PlainTableConfig plainTableConfig = new PlainTableConfig(); + opt.setTableFormatConfig(plainTableConfig); + assertThat(opt.tableFactoryName()).isEqualTo("PlainTable"); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } +} diff --git a/java/src/test/java/org/rocksdb/PlatformRandomHelper.java b/java/src/test/java/org/rocksdb/PlatformRandomHelper.java new file mode 100644 index 000000000..0155ce263 --- /dev/null +++ b/java/src/test/java/org/rocksdb/PlatformRandomHelper.java @@ -0,0 +1,58 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +package org.rocksdb; + +import java.util.Random; + +/** + * Helper class to get the appropriate Random class instance dependent + * on the current platform architecture (32bit vs 64bit) + */ +public class PlatformRandomHelper { + /** + * Determine if OS is 32-Bit/64-Bit + * + * @return boolean value indicating if operating system is 64 Bit. + */ + public static boolean isOs64Bit(){ + boolean is64Bit; + if (System.getProperty("os.name").contains("Windows")) { + is64Bit = (System.getenv("ProgramFiles(x86)") != null); + } else { + is64Bit = (System.getProperty("os.arch").contains("64")); + } + return is64Bit; + } + + /** + * Factory to get a platform specific Random instance + * + * @return {@link java.util.Random} instance. + */ + public static Random getPlatformSpecificRandomFactory(){ + if (isOs64Bit()) { + return new Random(); + } + return new Random32Bit(); + } + + /** + * Random32Bit is a class which overrides {@code nextLong} to + * provide random numbers which fit in size_t. This workaround + * is necessary because there is no unsigned_int < Java 8 + */ + private static class Random32Bit extends Random { + @Override + public long nextLong(){ + return this.nextInt(Integer.MAX_VALUE); + } + } + + /** + * Utility class constructor + */ + private PlatformRandomHelper() { } +} diff --git a/java/src/test/java/org/rocksdb/ReadOnlyTest.java b/java/src/test/java/org/rocksdb/ReadOnlyTest.java new file mode 100644 index 000000000..a254481e5 --- /dev/null +++ b/java/src/test/java/org/rocksdb/ReadOnlyTest.java @@ -0,0 +1,332 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.util.ArrayList; +import java.util.List; + +import static org.assertj.core.api.Assertions.assertThat; + +public class ReadOnlyTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void readOnlyOpen() throws RocksDBException { + RocksDB db = null; + RocksDB db2 = null; + RocksDB db3 = null; + Options options = null; + List columnFamilyHandleList = + new ArrayList<>(); + List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + List readOnlyColumnFamilyHandleList2 = + new ArrayList<>(); + try { + options = new Options(); + options.setCreateIfMissing(true); + + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.put("key".getBytes(), "value".getBytes()); + db2 = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath()); + assertThat("value"). 
+ isEqualTo(new String(db2.get("key".getBytes()))); + db.close(); + db2.close(); + + List cfDescriptors = new ArrayList<>(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + + db = RocksDB.open( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, columnFamilyHandleList); + columnFamilyHandleList.add(db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf".getBytes(), new ColumnFamilyOptions()))); + columnFamilyHandleList.add(db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf2".getBytes(), new ColumnFamilyOptions()))); + db.put(columnFamilyHandleList.get(2), "key2".getBytes(), + "value2".getBytes()); + + db2 = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList); + assertThat(db2.get("key2".getBytes())).isNull(); + assertThat(db2.get(readOnlyColumnFamilyHandleList.get(0), "key2".getBytes())). + isNull(); + cfDescriptors.clear(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + cfDescriptors.add( + new ColumnFamilyDescriptor("new_cf2".getBytes(), new ColumnFamilyOptions())); + db3 = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, readOnlyColumnFamilyHandleList2); + assertThat(new String(db3.get(readOnlyColumnFamilyHandleList2.get(1), + "key2".getBytes()))).isEqualTo("value2"); + } finally { + if (db != null) { + db.close(); + } + if (db2 != null) { + db2.close(); + } + if (db3 != null) { + db3.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test(expected = RocksDBException.class) + public void failToWriteInReadOnly() throws RocksDBException { + RocksDB db = null; + RocksDB rDb = null; + Options options = null; + try { + List cfDescriptors = new ArrayList<>(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + List readOnlyColumnFamilyHandleList = + 
new ArrayList<>(); + options = new Options(); + options.setCreateIfMissing(true); + + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.close(); + rDb = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList); + + // test that put fails in readonly mode + rDb.put("key".getBytes(), "value".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (rDb != null) { + rDb.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test(expected = RocksDBException.class) + public void failToCFWriteInReadOnly() throws RocksDBException { + RocksDB db = null; + RocksDB rDb = null; + Options options = null; + try { + List cfDescriptors = new ArrayList<>(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + options = new Options(); + options.setCreateIfMissing(true); + + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.close(); + rDb = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList); + + rDb.put(readOnlyColumnFamilyHandleList.get(0), + "key".getBytes(), "value".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (rDb != null) { + rDb.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test(expected = RocksDBException.class) + public void failToRemoveInReadOnly() throws RocksDBException { + RocksDB db = null; + RocksDB rDb = null; + Options options = null; + try { + List cfDescriptors = new ArrayList<>(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + + options = new Options(); + options.setCreateIfMissing(true); + + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.close(); + 
rDb = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList); + + rDb.remove("key".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (rDb != null) { + rDb.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test(expected = RocksDBException.class) + public void failToCFRemoveInReadOnly() throws RocksDBException { + RocksDB db = null; + RocksDB rDb = null; + Options options = null; + try { + List cfDescriptors = new ArrayList<>(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + + options = new Options(); + options.setCreateIfMissing(true); + + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.close(); + + rDb = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList); + + rDb.remove(readOnlyColumnFamilyHandleList.get(0), + "key".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (rDb != null) { + rDb.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test(expected = RocksDBException.class) + public void failToWriteBatchReadOnly() throws RocksDBException { + RocksDB db = null; + RocksDB rDb = null; + Options options = null; + try { + List cfDescriptors = new ArrayList<>(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + + options = new Options(); + options.setCreateIfMissing(true); + + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.close(); + + rDb = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList); + + WriteBatch wb = new WriteBatch(); + wb.put("key".getBytes(), "value".getBytes()); + rDb.write(new WriteOptions(), 
wb); + } finally { + if (db != null) { + db.close(); + } + if (rDb != null) { + rDb.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test(expected = RocksDBException.class) + public void failToCFWriteBatchReadOnly() throws RocksDBException { + RocksDB db = null; + RocksDB rDb = null; + Options options = null; + WriteBatch wb = null; + try { + List cfDescriptors = new ArrayList<>(); + cfDescriptors.add( + new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY, + new ColumnFamilyOptions())); + List readOnlyColumnFamilyHandleList = + new ArrayList<>(); + + options = new Options(); + options.setCreateIfMissing(true); + + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.close(); + + rDb = RocksDB.openReadOnly( + dbFolder.getRoot().getAbsolutePath(), cfDescriptors, + readOnlyColumnFamilyHandleList); + + wb = new WriteBatch(); + wb.put(readOnlyColumnFamilyHandleList.get(0), + "key".getBytes(), "value".getBytes()); + rDb.write(new WriteOptions(), wb); + } finally { + if (db != null) { + db.close(); + } + if (rDb != null) { + rDb.close(); + } + if (options != null) { + options.dispose(); + } + if (wb != null) { + wb.dispose(); + } + } + } +} diff --git a/java/src/test/java/org/rocksdb/ReadOptionsTest.java b/java/src/test/java/org/rocksdb/ReadOptionsTest.java new file mode 100644 index 000000000..af88ce351 --- /dev/null +++ b/java/src/test/java/org/rocksdb/ReadOptionsTest.java @@ -0,0 +1,151 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +package org.rocksdb; + +import java.util.Random; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +import static org.assertj.core.api.Assertions.assertThat; + +public class ReadOptionsTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public ExpectedException exception = ExpectedException.none(); + + @Test + public void verifyChecksum(){ + ReadOptions opt = null; + try { + opt = new ReadOptions(); + Random rand = new Random(); + boolean boolValue = rand.nextBoolean(); + opt.setVerifyChecksums(boolValue); + assertThat(opt.verifyChecksums()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void fillCache(){ + ReadOptions opt = null; + try { + opt = new ReadOptions(); + Random rand = new Random(); + boolean boolValue = rand.nextBoolean(); + opt.setFillCache(boolValue); + assertThat(opt.fillCache()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void tailing(){ + ReadOptions opt = null; + try { + opt = new ReadOptions(); + Random rand = new Random(); + boolean boolValue = rand.nextBoolean(); + opt.setTailing(boolValue); + assertThat(opt.tailing()).isEqualTo(boolValue); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void snapshot(){ + ReadOptions opt = null; + try { + opt = new ReadOptions(); + opt.setSnapshot(null); + assertThat(opt.snapshot()).isNull(); + } finally { + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void failSetVerifyChecksumUninitialized(){ + ReadOptions readOptions = setupUninitializedReadOptions( + exception); + readOptions.setVerifyChecksums(true); + } + + @Test + public void failVerifyChecksumUninitialized(){ + ReadOptions readOptions = setupUninitializedReadOptions( + exception); + readOptions.verifyChecksums(); + } + + @Test + 
public void failSetFillCacheUninitialized(){ + ReadOptions readOptions = setupUninitializedReadOptions( + exception); + readOptions.setFillCache(true); + } + + @Test + public void failFillCacheUninitialized(){ + ReadOptions readOptions = setupUninitializedReadOptions( + exception); + readOptions.fillCache(); + } + + @Test + public void failSetTailingUninitialized(){ + ReadOptions readOptions = setupUninitializedReadOptions( + exception); + readOptions.setTailing(true); + } + + @Test + public void failTailingUninitialized(){ + ReadOptions readOptions = setupUninitializedReadOptions( + exception); + readOptions.tailing(); + } + + @Test + public void failSetSnapshotUninitialized(){ + ReadOptions readOptions = setupUninitializedReadOptions( + exception); + readOptions.setSnapshot(null); + } + + @Test + public void failSnapshotUninitialized(){ + ReadOptions readOptions = setupUninitializedReadOptions( + exception); + readOptions.snapshot(); + } + + private ReadOptions setupUninitializedReadOptions( + ExpectedException exception) { + ReadOptions readOptions = new ReadOptions(); + readOptions.dispose(); + exception.expect(AssertionError.class); + return readOptions; + } +} diff --git a/java/src/test/java/org/rocksdb/RocksDBTest.java b/java/src/test/java/org/rocksdb/RocksDBTest.java new file mode 100644 index 000000000..100db529d --- /dev/null +++ b/java/src/test/java/org/rocksdb/RocksDBTest.java @@ -0,0 +1,761 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Random; + +import static org.assertj.core.api.Assertions.assertThat; + +public class RocksDBTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + public static final Random rand = PlatformRandomHelper. + getPlatformSpecificRandomFactory(); + + @Test + public void open() throws RocksDBException { + RocksDB db = null; + Options opt = null; + try { + db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + db.close(); + opt = new Options(); + opt.setCreateIfMissing(true); + db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath()); + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void put() throws RocksDBException { + RocksDB db = null; + WriteOptions opt = null; + try { + db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + db.put("key1".getBytes(), "value".getBytes()); + opt = new WriteOptions(); + db.put(opt, "key2".getBytes(), "12345678".getBytes()); + assertThat(db.get("key1".getBytes())).isEqualTo( + "value".getBytes()); + assertThat(db.get("key2".getBytes())).isEqualTo( + "12345678".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void write() throws RocksDBException { + RocksDB db = null; + Options options = null; + WriteBatch wb1 = null; + WriteBatch wb2 = null; + WriteOptions opts = null; + try { + options = new Options(). + setMergeOperator(new StringAppendOperator()). 
+ setCreateIfMissing(true); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + opts = new WriteOptions(); + wb1 = new WriteBatch(); + wb1.put("key1".getBytes(), "aa".getBytes()); + wb1.merge("key1".getBytes(), "bb".getBytes()); + wb2 = new WriteBatch(); + wb2.put("key2".getBytes(), "xx".getBytes()); + wb2.merge("key2".getBytes(), "yy".getBytes()); + db.write(opts, wb1); + db.write(opts, wb2); + assertThat(db.get("key1".getBytes())).isEqualTo( + "aa,bb".getBytes()); + assertThat(db.get("key2".getBytes())).isEqualTo( + "xx,yy".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (wb1 != null) { + wb1.dispose(); + } + if (wb2 != null) { + wb2.dispose(); + } + if (options != null) { + options.dispose(); + } + if (opts != null) { + opts.dispose(); + } + } + } + + @Test + public void getWithOutValue() throws RocksDBException { + RocksDB db = null; + try { + db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + db.put("key1".getBytes(), "value".getBytes()); + db.put("key2".getBytes(), "12345678".getBytes()); + byte[] outValue = new byte[5]; + // not found value + int getResult = db.get("keyNotFound".getBytes(), outValue); + assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND); + // found value which fits in outValue + getResult = db.get("key1".getBytes(), outValue); + assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(outValue).isEqualTo("value".getBytes()); + // found value which fits partially + getResult = db.get("key2".getBytes(), outValue); + assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(outValue).isEqualTo("12345".getBytes()); + } finally { + if (db != null) { + db.close(); + } + } + } + + @Test + public void getWithOutValueReadOptions() throws RocksDBException { + RocksDB db = null; + ReadOptions rOpt = null; + try { + db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + rOpt = new ReadOptions(); + db.put("key1".getBytes(), "value".getBytes()); + db.put("key2".getBytes(), 
"12345678".getBytes()); + byte[] outValue = new byte[5]; + // not found value + int getResult = db.get(rOpt, "keyNotFound".getBytes(), + outValue); + assertThat(getResult).isEqualTo(RocksDB.NOT_FOUND); + // found value which fits in outValue + getResult = db.get(rOpt, "key1".getBytes(), outValue); + assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(outValue).isEqualTo("value".getBytes()); + // found value which fits partially + getResult = db.get(rOpt, "key2".getBytes(), outValue); + assertThat(getResult).isNotEqualTo(RocksDB.NOT_FOUND); + assertThat(outValue).isEqualTo("12345".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (rOpt != null) { + rOpt.dispose(); + } + } + } + + @Test + public void multiGet() throws RocksDBException { + RocksDB db = null; + ReadOptions rOpt = null; + try { + db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + rOpt = new ReadOptions(); + db.put("key1".getBytes(), "value".getBytes()); + db.put("key2".getBytes(), "12345678".getBytes()); + List lookupKeys = new ArrayList() {{ + add("key1".getBytes()); + add("key2".getBytes()); + }}; + Map results = db.multiGet(lookupKeys); + assertThat(results).isNotNull(); + assertThat(results.values()).isNotNull(); + assertThat(results.values()). + contains("value".getBytes(), "12345678".getBytes()); + // test same method with ReadOptions + results = db.multiGet(rOpt, lookupKeys); + assertThat(results).isNotNull(); + assertThat(results.values()).isNotNull(); + assertThat(results.values()). + contains("value".getBytes(), "12345678".getBytes()); + + // remove existing key + lookupKeys.remove("key2".getBytes()); + // add non existing key + lookupKeys.add("key3".getBytes()); + results = db.multiGet(lookupKeys); + assertThat(results).isNotNull(); + assertThat(results.values()).isNotNull(); + assertThat(results.values()). 
+ contains("value".getBytes()); + // test same call with readOptions + results = db.multiGet(rOpt, lookupKeys); + assertThat(results).isNotNull(); + assertThat(results.values()).isNotNull(); + assertThat(results.values()). + contains("value".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (rOpt != null) { + rOpt.dispose(); + } + } + } + + @Test + public void merge() throws RocksDBException { + RocksDB db = null; + Options opt = null; + WriteOptions wOpt; + try { + opt = new Options(). + setCreateIfMissing(true). + setMergeOperator(new StringAppendOperator()); + wOpt = new WriteOptions(); + db = RocksDB.open(opt, dbFolder.getRoot().getAbsolutePath()); + db.put("key1".getBytes(), "value".getBytes()); + assertThat(db.get("key1".getBytes())).isEqualTo( + "value".getBytes()); + // merge key1 with another value portion + db.merge("key1".getBytes(), "value2".getBytes()); + assertThat(db.get("key1".getBytes())).isEqualTo( + "value,value2".getBytes()); + // merge key1 with another value portion + db.merge(wOpt, "key1".getBytes(), "value3".getBytes()); + assertThat(db.get("key1".getBytes())).isEqualTo( + "value,value2,value3".getBytes()); + // merge on non existent key shall insert the value + db.merge(wOpt, "key2".getBytes(), "xxxx".getBytes()); + assertThat(db.get("key2".getBytes())).isEqualTo( + "xxxx".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void remove() throws RocksDBException { + RocksDB db = null; + WriteOptions wOpt; + try { + wOpt = new WriteOptions(); + db = RocksDB.open(dbFolder.getRoot().getAbsolutePath()); + db.put("key1".getBytes(), "value".getBytes()); + db.put("key2".getBytes(), "12345678".getBytes()); + assertThat(db.get("key1".getBytes())).isEqualTo( + "value".getBytes()); + assertThat(db.get("key2".getBytes())).isEqualTo( + "12345678".getBytes()); + db.remove("key1".getBytes()); + db.remove(wOpt, "key2".getBytes()); + 
assertThat(db.get("key1".getBytes())).isNull(); + assertThat(db.get("key2".getBytes())).isNull(); + } finally { + if (db != null) { + db.close(); + } + } + } + + @Test + public void getIntProperty() throws RocksDBException { + RocksDB db = null; + Options options = null; + WriteOptions wOpt = null; + try { + options = new Options(); + wOpt = new WriteOptions(); + // Setup options + options.setCreateIfMissing(true); + options.setMaxWriteBufferNumber(10); + options.setMinWriteBufferNumberToMerge(10); + wOpt.setDisableWAL(true); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + db.put(wOpt, "key1".getBytes(), "value1".getBytes()); + db.put(wOpt, "key2".getBytes(), "value2".getBytes()); + db.put(wOpt, "key3".getBytes(), "value3".getBytes()); + db.put(wOpt, "key4".getBytes(), "value4".getBytes()); + assertThat(db.getLongProperty("rocksdb.num-entries-active-mem-table")).isGreaterThan(0); + assertThat(db.getLongProperty("rocksdb.cur-size-active-mem-table")).isGreaterThan(0); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + if (wOpt != null) { + wOpt.dispose(); + } + } + } + + @Test + public void fullCompactRange() throws RocksDBException { + RocksDB db = null; + Options opt = null; + try { + opt = new Options(). + setCreateIfMissing(true). + setDisableAutoCompactions(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100<<10). + setLevelZeroFileNumCompactionTrigger(3). + setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). 
+ setDisableAutoCompactions(false); + // open database + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath()); + // fill database with key/value pairs + byte[] b = new byte[10000]; + for (int i = 0; i < 200; i++) { + rand.nextBytes(b); + db.put((String.valueOf(i)).getBytes(), b); + } + db.compactRange(); + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void fullCompactRangeColumnFamily() + throws RocksDBException { + RocksDB db = null; + DBOptions opt = null; + List columnFamilyHandles = + new ArrayList<>(); + try { + opt = new DBOptions(). + setCreateIfMissing(true). + setCreateMissingColumnFamilies(true); + List columnFamilyDescriptors = + new ArrayList<>(); + columnFamilyDescriptors.add(new ColumnFamilyDescriptor( + RocksDB.DEFAULT_COLUMN_FAMILY)); + columnFamilyDescriptors.add(new ColumnFamilyDescriptor( + "new_cf".getBytes(), + new ColumnFamilyOptions(). + setDisableAutoCompactions(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100 << 10). + setLevelZeroFileNumCompactionTrigger(3). + setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). 
+ setDisableAutoCompactions(false))); + // open database + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, + columnFamilyHandles); + // fill database with key/value pairs + byte[] b = new byte[10000]; + for (int i = 0; i < 200; i++) { + rand.nextBytes(b); + db.put(columnFamilyHandles.get(1), + String.valueOf(i).getBytes(), b); + } + db.compactRange(columnFamilyHandles.get(1)); + } finally { + for (ColumnFamilyHandle handle : columnFamilyHandles) { + handle.dispose(); + } + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void compactRangeWithKeys() + throws RocksDBException { + RocksDB db = null; + Options opt = null; + try { + opt = new Options(). + setCreateIfMissing(true). + setDisableAutoCompactions(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100<<10). + setLevelZeroFileNumCompactionTrigger(3). + setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). + setDisableAutoCompactions(false); + // open database + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath()); + // fill database with key/value pairs + byte[] b = new byte[10000]; + for (int i = 0; i < 200; i++) { + rand.nextBytes(b); + db.put((String.valueOf(i)).getBytes(), b); + } + db.compactRange("0".getBytes(), "201".getBytes()); + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void compactRangeWithKeysReduce() + throws RocksDBException { + RocksDB db = null; + Options opt = null; + try { + opt = new Options(). + setCreateIfMissing(true). + setDisableAutoCompactions(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100<<10). + setLevelZeroFileNumCompactionTrigger(3). + setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). 
+ setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). + setDisableAutoCompactions(false); + // open database + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath()); + // fill database with key/value pairs + byte[] b = new byte[10000]; + for (int i = 0; i < 200; i++) { + rand.nextBytes(b); + db.put((String.valueOf(i)).getBytes(), b); + } + db.compactRange("0".getBytes(), "201".getBytes(), + true, 0, 0); + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void compactRangeWithKeysColumnFamily() + throws RocksDBException { + RocksDB db = null; + DBOptions opt = null; + List columnFamilyHandles = + new ArrayList<>(); + try { + opt = new DBOptions(). + setCreateIfMissing(true). + setCreateMissingColumnFamilies(true); + List columnFamilyDescriptors = + new ArrayList<>(); + columnFamilyDescriptors.add(new ColumnFamilyDescriptor( + RocksDB.DEFAULT_COLUMN_FAMILY)); + columnFamilyDescriptors.add(new ColumnFamilyDescriptor( + "new_cf".getBytes(), + new ColumnFamilyOptions(). + setDisableAutoCompactions(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100<<10). + setLevelZeroFileNumCompactionTrigger(3). + setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). 
+ setDisableAutoCompactions(false))); + // open database + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, + columnFamilyHandles); + // fill database with key/value pairs + byte[] b = new byte[10000]; + for (int i = 0; i < 200; i++) { + rand.nextBytes(b); + db.put(columnFamilyHandles.get(1), + String.valueOf(i).getBytes(), b); + } + db.compactRange(columnFamilyHandles.get(1), + "0".getBytes(), "201".getBytes()); + } finally { + for (ColumnFamilyHandle handle : columnFamilyHandles) { + handle.dispose(); + } + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void compactRangeWithKeysReduceColumnFamily() + throws RocksDBException { + RocksDB db = null; + DBOptions opt = null; + List columnFamilyHandles = + new ArrayList<>(); + try { + opt = new DBOptions(). + setCreateIfMissing(true). + setCreateMissingColumnFamilies(true); + List columnFamilyDescriptors = + new ArrayList<>(); + columnFamilyDescriptors.add(new ColumnFamilyDescriptor( + RocksDB.DEFAULT_COLUMN_FAMILY)); + columnFamilyDescriptors.add(new ColumnFamilyDescriptor( + "new_cf".getBytes(), + new ColumnFamilyOptions(). + setDisableAutoCompactions(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100<<10). + setLevelZeroFileNumCompactionTrigger(3). + setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). 
+ setDisableAutoCompactions(false))); + // open database + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, + columnFamilyHandles); + // fill database with key/value pairs + byte[] b = new byte[10000]; + for (int i = 0; i < 200; i++) { + rand.nextBytes(b); + db.put(columnFamilyHandles.get(1), + String.valueOf(i).getBytes(), b); + } + db.compactRange(columnFamilyHandles.get(1), "0".getBytes(), + "201".getBytes(), true, 0, 0); + } finally { + for (ColumnFamilyHandle handle : columnFamilyHandles) { + handle.dispose(); + } + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void compactRangeToLevel() + throws RocksDBException, InterruptedException { + RocksDB db = null; + Options opt = null; + try { + opt = new Options(). + setCreateIfMissing(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100<<10). + setLevelZeroFileNumCompactionTrigger(3). + setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). + setDisableAutoCompactions(false); + // open database + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath()); + // fill database with key/value pairs + byte[] b = new byte[10000]; + for (int i = 0; i < 200; i++) { + rand.nextBytes(b); + db.put((String.valueOf(i)).getBytes(), b); + } + db.flush(new FlushOptions().setWaitForFlush(true)); + db.close(); + opt.setTargetFileSizeBase(Long.MAX_VALUE). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(Long.MAX_VALUE). + setMaxBytesForLevelMultiplier(1). + setDisableAutoCompactions(true); + + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath()); + + db.compactRange(true, 0, 0); + for (int i = 0; i < 4; i++) { + if (i == 0) { + assertThat( + db.getProperty("rocksdb.num-files-at-level" + i)). 
+ isEqualTo("1"); + } else { + assertThat( + db.getProperty("rocksdb.num-files-at-level" + i)). + isEqualTo("0"); + } + } + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void compactRangeToLevelColumnFamily() + throws RocksDBException { + RocksDB db = null; + DBOptions opt = null; + List columnFamilyHandles = + new ArrayList<>(); + try { + opt = new DBOptions(). + setCreateIfMissing(true). + setCreateMissingColumnFamilies(true); + List columnFamilyDescriptors = + new ArrayList<>(); + columnFamilyDescriptors.add(new ColumnFamilyDescriptor( + RocksDB.DEFAULT_COLUMN_FAMILY)); + columnFamilyDescriptors.add(new ColumnFamilyDescriptor( + "new_cf".getBytes(), + new ColumnFamilyOptions(). + setDisableAutoCompactions(true). + setCompactionStyle(CompactionStyle.LEVEL). + setNumLevels(4). + setWriteBufferSize(100 << 10). + setLevelZeroFileNumCompactionTrigger(3). + setTargetFileSizeBase(200 << 10). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(500 << 10). + setMaxBytesForLevelMultiplier(1). + setDisableAutoCompactions(false))); + // open database + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, + columnFamilyHandles); + // fill database with key/value pairs + byte[] b = new byte[10000]; + for (int i = 0; i < 200; i++) { + rand.nextBytes(b); + db.put(columnFamilyHandles.get(1), + String.valueOf(i).getBytes(), b); + } + db.flush(new FlushOptions().setWaitForFlush(true), + columnFamilyHandles.get(1)); + // free column families + for (ColumnFamilyHandle handle : columnFamilyHandles) { + handle.dispose(); + } + // clear column family handles for reopen + columnFamilyHandles.clear(); + db.close(); + columnFamilyDescriptors.get(1). + columnFamilyOptions(). + setTargetFileSizeBase(Long.MAX_VALUE). + setTargetFileSizeMultiplier(1). + setMaxBytesForLevelBase(Long.MAX_VALUE). + setMaxBytesForLevelMultiplier(1). 
+ setDisableAutoCompactions(true); + // reopen database + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath(), + columnFamilyDescriptors, + columnFamilyHandles); + // compact new column family + db.compactRange(columnFamilyHandles.get(1), true, 0, 0); + // check if new column family is compacted to level zero + for (int i = 0; i < 4; i++) { + if (i == 0) { + assertThat(db.getProperty(columnFamilyHandles.get(1), + "rocksdb.num-files-at-level" + i)). + isEqualTo("1"); + } else { + assertThat(db.getProperty(columnFamilyHandles.get(1), + "rocksdb.num-files-at-level" + i)). + isEqualTo("0"); + } + } + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } + + @Test + public void enableDisableFileDeletions() throws RocksDBException { + RocksDB db = null; + Options options = null; + try { + options = new Options().setCreateIfMissing(true); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + db.disableFileDeletions(); + db.enableFileDeletions(false); + db.disableFileDeletions(); + db.enableFileDeletions(true); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } +} diff --git a/java/src/test/java/org/rocksdb/RocksEnvTest.java b/java/src/test/java/org/rocksdb/RocksEnvTest.java new file mode 100644 index 000000000..6b0b9becc --- /dev/null +++ b/java/src/test/java/org/rocksdb/RocksEnvTest.java @@ -0,0 +1,38 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class RocksEnvTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void rocksEnv(){ + RocksEnv rocksEnv = RocksEnv.getDefault(); + rocksEnv.setBackgroundThreads(5); + // default rocksenv will always return zero for flush pool + // no matter what was set via setBackgroundThreads + assertThat(rocksEnv.getThreadPoolQueueLen(RocksEnv.FLUSH_POOL)). + isEqualTo(0); + rocksEnv.setBackgroundThreads(5, RocksEnv.FLUSH_POOL); + // default rocksenv will always return zero for flush pool + // no matter what was set via setBackgroundThreads + assertThat(rocksEnv.getThreadPoolQueueLen(RocksEnv.FLUSH_POOL)). + isEqualTo(0); + rocksEnv.setBackgroundThreads(5, RocksEnv.COMPACTION_POOL); + // default rocksenv will always return zero for compaction pool + // no matter what was set via setBackgroundThreads + assertThat(rocksEnv.getThreadPoolQueueLen(RocksEnv.COMPACTION_POOL)). + isEqualTo(0); + } +} diff --git a/java/src/test/java/org/rocksdb/RocksIteratorTest.java b/java/src/test/java/org/rocksdb/RocksIteratorTest.java new file mode 100644 index 000000000..c5918d8ac --- /dev/null +++ b/java/src/test/java/org/rocksdb/RocksIteratorTest.java @@ -0,0 +1,104 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import static org.assertj.core.api.Assertions.assertThat; + +public class RocksIteratorTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void rocksIterator() throws RocksDBException { + RocksDB db = null; + Options options = null; + RocksIterator iterator = null; + try { + options = new Options(); + options.setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.put("key1".getBytes(), "value1".getBytes()); + db.put("key2".getBytes(), "value2".getBytes()); + + iterator = db.newIterator(); + + iterator.seekToFirst(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key1".getBytes()); + assertThat(iterator.value()).isEqualTo("value1".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key2".getBytes()); + assertThat(iterator.value()).isEqualTo("value2".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isFalse(); + iterator.seekToLast(); + iterator.prev(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key1".getBytes()); + assertThat(iterator.value()).isEqualTo("value1".getBytes()); + iterator.seekToFirst(); + iterator.seekToLast(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key2".getBytes()); + assertThat(iterator.value()).isEqualTo("value2".getBytes()); + iterator.status(); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + if (iterator != null) { + iterator.dispose(); + } + } + } + + @Test + public void rocksIteratorGc() + throws RocksDBException { + 
RocksDB db = null; + Options options = null; + try { + options = new Options(); + options.setCreateIfMissing(true) + .setCreateMissingColumnFamilies(true); + db = RocksDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + db.put("key".getBytes(), "value".getBytes()); + db.newIterator(); + db.newIterator(); + RocksIterator iter3 = db.newIterator(); + db.close(); + db = null; + System.gc(); + System.runFinalization(); + iter3.dispose(); + System.gc(); + System.runFinalization(); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } +} diff --git a/java/src/test/java/org/rocksdb/RocksMemoryResource.java b/java/src/test/java/org/rocksdb/RocksMemoryResource.java new file mode 100644 index 000000000..de9ba0d6b --- /dev/null +++ b/java/src/test/java/org/rocksdb/RocksMemoryResource.java @@ -0,0 +1,20 @@ +package org.rocksdb; + +import org.junit.rules.ExternalResource; + +/** + * Resource to trigger garbage collection after each test + * run. + */ +public class RocksMemoryResource extends ExternalResource { + + static { + RocksDB.loadLibrary(); + } + + @Override + protected void after() { + System.gc(); + System.runFinalization(); + } +} diff --git a/java/src/test/java/org/rocksdb/SliceTest.java b/java/src/test/java/org/rocksdb/SliceTest.java new file mode 100644 index 000000000..16221ef65 --- /dev/null +++ b/java/src/test/java/org/rocksdb/SliceTest.java @@ -0,0 +1,104 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class SliceTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void slice() { + Slice slice = null; + Slice otherSlice = null; + Slice thirdSlice = null; + try { + slice = new Slice("testSlice"); + assertThat(slice.empty()).isFalse(); + assertThat(slice.size()).isEqualTo(9); + assertThat(slice.data()).isEqualTo("testSlice".getBytes()); + + otherSlice = new Slice("otherSlice".getBytes()); + assertThat(otherSlice.data()).isEqualTo("otherSlice".getBytes()); + + thirdSlice = new Slice("otherSlice".getBytes(), 5); + assertThat(thirdSlice.data()).isEqualTo("Slice".getBytes()); + } finally { + if (slice != null) { + slice.dispose(); + } + if (otherSlice != null) { + otherSlice.dispose(); + } + if (thirdSlice != null) { + thirdSlice.dispose(); + } + } + } + + @Test + public void sliceEquals() { + Slice slice = null; + Slice slice2 = null; + try { + slice = new Slice("abc"); + slice2 = new Slice("abc"); + assertThat(slice.equals(slice2)).isTrue(); + } finally { + if (slice != null) { + slice.dispose(); + } + if (slice2 != null) { + slice2.dispose(); + } + } + } + + + @Test + public void sliceStartWith() { + Slice slice = null; + Slice match = null; + Slice noMatch = null; + try { + slice = new Slice("matchpoint"); + match = new Slice("mat"); + noMatch = new Slice("nomatch"); + + //assertThat(slice.startsWith(match)).isTrue(); + assertThat(slice.startsWith(noMatch)).isFalse(); + } finally { + if (slice != null) { + slice.dispose(); + } + if (match != null) { + match.dispose(); + } + if (noMatch != null) { + noMatch.dispose(); + } + } + } + + @Test + public void sliceToString() { + Slice slice = null; + try { + slice = new Slice("stringTest"); + assertThat(slice.toString()).isEqualTo("stringTest"); + assertThat(slice.toString(true)).isNotEqualTo(""); 
+ } finally { + if (slice != null) { + slice.dispose(); + } + } + } +} diff --git a/java/src/test/java/org/rocksdb/SnapshotTest.java b/java/src/test/java/org/rocksdb/SnapshotTest.java new file mode 100644 index 000000000..87ccdbcb5 --- /dev/null +++ b/java/src/test/java/org/rocksdb/SnapshotTest.java @@ -0,0 +1,217 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import static org.assertj.core.api.Assertions.assertThat; + +public class SnapshotTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void snapshots() throws RocksDBException { + RocksDB db = null; + Options options = null; + ReadOptions readOptions = null; + try { + + options = new Options(); + options.setCreateIfMissing(true); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + db.put("key".getBytes(), "value".getBytes()); + // Get new Snapshot of database + Snapshot snapshot = db.getSnapshot(); + assertThat(snapshot.getSequenceNumber()).isGreaterThan(0); + assertThat(snapshot.getSequenceNumber()).isEqualTo(1); + readOptions = new ReadOptions(); + // set snapshot in ReadOptions + readOptions.setSnapshot(snapshot); + // retrieve key value pair + assertThat(new String(db.get("key".getBytes()))). 
+ isEqualTo("value"); + // retrieve key value pair created before + // the snapshot was made + assertThat(new String(db.get(readOptions, + "key".getBytes()))).isEqualTo("value"); + // add new key/value pair + db.put("newkey".getBytes(), "newvalue".getBytes()); + // using no snapshot the latest db entries + // will be taken into account + assertThat(new String(db.get("newkey".getBytes()))). + isEqualTo("newvalue"); + // snapshopot was created before newkey + assertThat(db.get(readOptions, "newkey".getBytes())). + isNull(); + // Retrieve snapshot from read options + Snapshot sameSnapshot = readOptions.snapshot(); + readOptions.setSnapshot(sameSnapshot); + // results must be the same with new Snapshot + // instance using the same native pointer + assertThat(new String(db.get(readOptions, + "key".getBytes()))).isEqualTo("value"); + // update key value pair to newvalue + db.put("key".getBytes(), "newvalue".getBytes()); + // read with previously created snapshot will + // read previous version of key value pair + assertThat(new String(db.get(readOptions, + "key".getBytes()))).isEqualTo("value"); + // read for newkey using the snapshot must be + // null + assertThat(db.get(readOptions, "newkey".getBytes())). + isNull(); + // setting null to snapshot in ReadOptions leads + // to no Snapshot being used. 
+ readOptions.setSnapshot(null); + assertThat(new String(db.get(readOptions, + "newkey".getBytes()))).isEqualTo("newvalue"); + // release Snapshot + db.releaseSnapshot(snapshot); + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + if (readOptions != null) { + readOptions.dispose(); + } + } + } + + @Test + public void iteratorWithSnapshot() throws RocksDBException { + RocksDB db = null; + Options options = null; + ReadOptions readOptions = null; + RocksIterator iterator = null; + RocksIterator snapshotIterator = null; + try { + + options = new Options(); + options.setCreateIfMissing(true); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + db.put("key".getBytes(), "value".getBytes()); + // Get new Snapshot of database + Snapshot snapshot = db.getSnapshot(); + readOptions = new ReadOptions(); + // set snapshot in ReadOptions + readOptions.setSnapshot(snapshot); + db.put("key2".getBytes(), "value2".getBytes()); + + // iterate over current state of db + iterator = db.newIterator(); + iterator.seekToFirst(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key2".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isFalse(); + + // iterate using a snapshot + snapshotIterator = db.newIterator(readOptions); + snapshotIterator.seekToFirst(); + assertThat(snapshotIterator.isValid()).isTrue(); + assertThat(snapshotIterator.key()).isEqualTo("key".getBytes()); + snapshotIterator.next(); + assertThat(snapshotIterator.isValid()).isFalse(); + + // release Snapshot + db.releaseSnapshot(snapshot); + } finally { + if (iterator != null) { + iterator.dispose(); + } + if (snapshotIterator != null) { + snapshotIterator.dispose(); + } + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + if (readOptions != null) { + 
readOptions.dispose(); + } + } + } + + @Test + public void iteratorWithSnapshotOnColumnFamily() throws RocksDBException { + RocksDB db = null; + Options options = null; + ReadOptions readOptions = null; + RocksIterator iterator = null; + RocksIterator snapshotIterator = null; + try { + + options = new Options(); + options.setCreateIfMissing(true); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + db.put("key".getBytes(), "value".getBytes()); + // Get new Snapshot of database + Snapshot snapshot = db.getSnapshot(); + readOptions = new ReadOptions(); + // set snapshot in ReadOptions + readOptions.setSnapshot(snapshot); + db.put("key2".getBytes(), "value2".getBytes()); + + // iterate over current state of column family + iterator = db.newIterator(db.getDefaultColumnFamily()); + iterator.seekToFirst(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isTrue(); + assertThat(iterator.key()).isEqualTo("key2".getBytes()); + iterator.next(); + assertThat(iterator.isValid()).isFalse(); + + // iterate using a snapshot on default column family + snapshotIterator = db.newIterator(db.getDefaultColumnFamily(), + readOptions); + snapshotIterator.seekToFirst(); + assertThat(snapshotIterator.isValid()).isTrue(); + assertThat(snapshotIterator.key()).isEqualTo("key".getBytes()); + snapshotIterator.next(); + assertThat(snapshotIterator.isValid()).isFalse(); + + // release Snapshot + db.releaseSnapshot(snapshot); + } finally { + if (iterator != null) { + iterator.dispose(); + } + if (snapshotIterator != null) { + snapshotIterator.dispose(); + } + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + if (readOptions != null) { + readOptions.dispose(); + } + } + } +} diff --git a/java/src/test/java/org/rocksdb/StatisticsCollectorTest.java b/java/src/test/java/org/rocksdb/StatisticsCollectorTest.java new file mode 100644 index 
000000000..927826d71 --- /dev/null +++ b/java/src/test/java/org/rocksdb/StatisticsCollectorTest.java @@ -0,0 +1,60 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import java.util.Collections; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import static org.assertj.core.api.Assertions.assertThat; + +public class StatisticsCollectorTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void statisticsCollector() + throws InterruptedException, RocksDBException { + Options opt = null; + RocksDB db = null; + try { + opt = new Options().createStatistics().setCreateIfMissing(true); + Statistics stats = opt.statisticsPtr(); + + db = RocksDB.open(opt, + dbFolder.getRoot().getAbsolutePath()); + + StatsCallbackMock callback = new StatsCallbackMock(); + StatsCollectorInput statsInput = new StatsCollectorInput(stats, callback); + + StatisticsCollector statsCollector = new StatisticsCollector( + Collections.singletonList(statsInput), 100); + statsCollector.start(); + + Thread.sleep(1000); + + assertThat(callback.tickerCallbackCount).isGreaterThan(0); + assertThat(callback.histCallbackCount).isGreaterThan(0); + + statsCollector.shutDown(1000); + } finally { + if (db != null) { + db.close(); + } + if (opt != null) { + opt.dispose(); + } + } + } +} diff --git a/java/org/rocksdb/test/StatsCallbackMock.java b/java/src/test/java/org/rocksdb/StatsCallbackMock.java similarity index 93% rename from java/org/rocksdb/test/StatsCallbackMock.java rename to 
java/src/test/java/org/rocksdb/StatsCallbackMock.java index 4ad2fb7b7..3c5800e42 100644 --- a/java/org/rocksdb/test/StatsCallbackMock.java +++ b/java/src/test/java/org/rocksdb/StatsCallbackMock.java @@ -3,9 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -package org.rocksdb.test; - -import org.rocksdb.*; +package org.rocksdb; public class StatsCallbackMock implements StatisticsCollectorCallback { public int tickerCallbackCount = 0; diff --git a/java/src/test/java/org/rocksdb/TransactionLogIteratorTest.java b/java/src/test/java/org/rocksdb/TransactionLogIteratorTest.java new file mode 100644 index 000000000..1de2efdea --- /dev/null +++ b/java/src/test/java/org/rocksdb/TransactionLogIteratorTest.java @@ -0,0 +1,182 @@ +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import static org.assertj.core.api.Assertions.assertThat; + +public class TransactionLogIteratorTest { + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void transactionLogIterator() throws RocksDBException { + RocksDB db = null; + Options options = null; + TransactionLogIterator transactionLogIterator = null; + try { + options = new Options(). 
+ setCreateIfMissing(true); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + transactionLogIterator = db.getUpdatesSince(0); + } finally { + if (transactionLogIterator != null) { + transactionLogIterator.dispose(); + } + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void getBatch() throws RocksDBException { + final int numberOfPuts = 5; + RocksDB db = null; + Options options = null; + ColumnFamilyHandle cfHandle = null; + TransactionLogIterator transactionLogIterator = null; + try { + options = new Options(). + setCreateIfMissing(true). + setWalTtlSeconds(1000). + setWalSizeLimitMB(10); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + + for (int i = 0; i < numberOfPuts; i++){ + db.put(String.valueOf(i).getBytes(), + String.valueOf(i).getBytes()); + } + db.flush(new FlushOptions().setWaitForFlush(true)); + + // the latest sequence number is 5 because 5 puts + // were written beforehand + assertThat(db.getLatestSequenceNumber()). + isEqualTo(numberOfPuts); + + // insert 5 writes into a cf + cfHandle = db.createColumnFamily( + new ColumnFamilyDescriptor("new_cf".getBytes())); + + for (int i = 0; i < numberOfPuts; i++){ + db.put(cfHandle, String.valueOf(i).getBytes(), + String.valueOf(i).getBytes()); + } + // the latest sequence number is 10 because + // (5 + 5) puts were written beforehand + assertThat(db.getLatestSequenceNumber()). 
+ isEqualTo(numberOfPuts + numberOfPuts); + + // Get updates since the beginning + transactionLogIterator = db.getUpdatesSince(0); + assertThat(transactionLogIterator.isValid()).isTrue(); + transactionLogIterator.status(); + + // The first sequence number is 1 + TransactionLogIterator.BatchResult batchResult = + transactionLogIterator.getBatch(); + assertThat(batchResult.sequenceNumber()).isEqualTo(1); + } finally { + if (transactionLogIterator != null) { + transactionLogIterator.dispose(); + } + if (cfHandle != null) { + cfHandle.dispose(); + } + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void transactionLogIteratorStallAtLastRecord() throws RocksDBException { + RocksDB db = null; + Options options = null; + TransactionLogIterator transactionLogIterator = null; + try { + options = new Options(). + setCreateIfMissing(true). + setWalTtlSeconds(1000). + setWalSizeLimitMB(10); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + db.put("key1".getBytes(), "value1".getBytes()); + // Get updates since the beginning + transactionLogIterator = db.getUpdatesSince(0); + transactionLogIterator.status(); + assertThat(transactionLogIterator.isValid()).isTrue(); + transactionLogIterator.next(); + assertThat(transactionLogIterator.isValid()).isFalse(); + transactionLogIterator.status(); + db.put("key2".getBytes(), "value2".getBytes()); + transactionLogIterator.next(); + transactionLogIterator.status(); + assertThat(transactionLogIterator.isValid()).isTrue(); + + } finally { + if (transactionLogIterator != null) { + transactionLogIterator.dispose(); + } + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void transactionLogIteratorCheckAfterRestart() throws RocksDBException { + final int numberOfKeys = 2; + RocksDB db = null; + Options options = null; + TransactionLogIterator transactionLogIterator = null; + try { + options = new 
Options(). + setCreateIfMissing(true). + setWalTtlSeconds(1000). + setWalSizeLimitMB(10); + + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + db.put("key1".getBytes(), "value1".getBytes()); + db.put("key2".getBytes(), "value2".getBytes()); + db.flush(new FlushOptions().setWaitForFlush(true)); + // reopen + db.close(); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + assertThat(db.getLatestSequenceNumber()).isEqualTo(numberOfKeys); + + transactionLogIterator = db.getUpdatesSince(0); + for (int i = 0; i < numberOfKeys; i++) { + transactionLogIterator.status(); + assertThat(transactionLogIterator.isValid()).isTrue(); + transactionLogIterator.next(); + } + } finally { + if (transactionLogIterator != null) { + transactionLogIterator.dispose(); + } + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } +} diff --git a/java/src/test/java/org/rocksdb/TtlDBTest.java b/java/src/test/java/org/rocksdb/TtlDBTest.java new file mode 100644 index 000000000..0b816d66a --- /dev/null +++ b/java/src/test/java/org/rocksdb/TtlDBTest.java @@ -0,0 +1,168 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.TimeUnit; + +import static org.assertj.core.api.Assertions.assertThat; + +public class TtlDBTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void ttlDBOpen() throws RocksDBException, + InterruptedException { + Options options = null; + TtlDB ttlDB = null; + try { + options = new Options(). + setCreateIfMissing(true). + setMaxGrandparentOverlapFactor(0). + setMaxMemCompactionLevel(0); + ttlDB = TtlDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + ttlDB.put("key".getBytes(), "value".getBytes()); + assertThat(ttlDB.get("key".getBytes())). + isEqualTo("value".getBytes()); + assertThat(ttlDB.get("key".getBytes())).isNotNull(); + } finally { + if (ttlDB != null) { + ttlDB.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void ttlDBOpenWithTtl() throws RocksDBException, + InterruptedException { + Options options = null; + TtlDB ttlDB = null; + try { + options = new Options(). + setCreateIfMissing(true). + setMaxGrandparentOverlapFactor(0). + setMaxMemCompactionLevel(0); + ttlDB = TtlDB.open(options, dbFolder.getRoot().getAbsolutePath(), + 1, false); + ttlDB.put("key".getBytes(), "value".getBytes()); + assertThat(ttlDB.get("key".getBytes())). 
+ isEqualTo("value".getBytes()); + TimeUnit.SECONDS.sleep(2); + + ttlDB.compactRange(); + assertThat(ttlDB.get("key".getBytes())).isNull(); + } finally { + if (ttlDB != null) { + ttlDB.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void ttlDbOpenWithColumnFamilies() throws RocksDBException, InterruptedException { + DBOptions dbOptions = null; + TtlDB ttlDB = null; + List cfNames = + new ArrayList<>(); + List columnFamilyHandleList = + new ArrayList<>(); + cfNames.add(new ColumnFamilyDescriptor(RocksDB.DEFAULT_COLUMN_FAMILY)); + cfNames.add(new ColumnFamilyDescriptor("new_cf".getBytes())); + List ttlValues = new ArrayList<>(); + // Default column family with infinite lifetime + ttlValues.add(0); + // new column family with 1 second ttl + ttlValues.add(1); + + try { + dbOptions = new DBOptions(). + setCreateMissingColumnFamilies(true). + setCreateIfMissing(true); + ttlDB = TtlDB.open(dbOptions, dbFolder.getRoot().getAbsolutePath(), + cfNames, columnFamilyHandleList, ttlValues, false); + + ttlDB.put("key".getBytes(), "value".getBytes()); + assertThat(ttlDB.get("key".getBytes())). 
+ isEqualTo("value".getBytes()); + ttlDB.put(columnFamilyHandleList.get(1), "key".getBytes(), + "value".getBytes()); + assertThat(ttlDB.get(columnFamilyHandleList.get(1), + "key".getBytes())).isEqualTo("value".getBytes()); + TimeUnit.SECONDS.sleep(2); + + ttlDB.compactRange(); + ttlDB.compactRange(columnFamilyHandleList.get(1)); + + assertThat(ttlDB.get("key".getBytes())).isNotNull(); + assertThat(ttlDB.get(columnFamilyHandleList.get(1), + "key".getBytes())).isNull(); + + + } finally { + for (ColumnFamilyHandle columnFamilyHandle : + columnFamilyHandleList) { + columnFamilyHandle.dispose(); + } + if (ttlDB != null) { + ttlDB.close(); + } + if (dbOptions != null) { + dbOptions.dispose(); + } + } + } + + @Test + public void createTtlColumnFamily() throws RocksDBException, + InterruptedException { + Options options = null; + TtlDB ttlDB = null; + ColumnFamilyHandle columnFamilyHandle = null; + try { + options = new Options().setCreateIfMissing(true); + ttlDB = TtlDB.open(options, + dbFolder.getRoot().getAbsolutePath()); + columnFamilyHandle = ttlDB.createColumnFamilyWithTtl( + new ColumnFamilyDescriptor("new_cf".getBytes()), 1); + ttlDB.put(columnFamilyHandle, "key".getBytes(), + "value".getBytes()); + assertThat(ttlDB.get(columnFamilyHandle, "key".getBytes())). + isEqualTo("value".getBytes()); + TimeUnit.SECONDS.sleep(2); + ttlDB.compactRange(columnFamilyHandle); + assertThat(ttlDB.get(columnFamilyHandle, "key".getBytes())).isNull(); + } finally { + if (columnFamilyHandle != null) { + columnFamilyHandle.dispose(); + } + if (ttlDB != null) { + ttlDB.close(); + } + if (options != null) { + options.dispose(); + } + } + } +} diff --git a/java/src/test/java/org/rocksdb/Types.java b/java/src/test/java/org/rocksdb/Types.java new file mode 100644 index 000000000..5ad35f463 --- /dev/null +++ b/java/src/test/java/org/rocksdb/Types.java @@ -0,0 +1,43 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +/** + * Simple type conversion methods + * for use in tests + */ +public class Types { + + /** + * Convert first 4 bytes of a byte array to an int + * + * @param data The byte array + * + * @return An integer + */ + public static int byteToInt(final byte data[]) { + return (data[0] & 0xff) | + ((data[1] & 0xff) << 8) | + ((data[2] & 0xff) << 16) | + ((data[3] & 0xff) << 24); + } + + /** + * Convert an int to 4 bytes + * + * @param v The int + * + * @return A byte array containing 4 bytes + */ + public static byte[] intToByte(final int v) { + return new byte[] { + (byte)((v >>> 0) & 0xff), + (byte)((v >>> 8) & 0xff), + (byte)((v >>> 16) & 0xff), + (byte)((v >>> 24) & 0xff) + }; + } +} diff --git a/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java b/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java new file mode 100644 index 000000000..b09cc9259 --- /dev/null +++ b/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java @@ -0,0 +1,170 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +package org.rocksdb; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.junit.ClassRule; +import org.junit.Test; + +import static org.assertj.core.api.Assertions.assertThat; + + +public class WriteBatchHandlerTest { + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void writeBatchHandler() throws IOException, RocksDBException { + WriteBatch batch = null; + CapturingWriteBatchHandler handler = null; + try { + // setup test data + final List>> testEvents = new ArrayList<>(); + testEvents.add(new Tuple<>(Action.DELETE, + new Tuple("k0".getBytes(), null))); + testEvents.add(new Tuple<>(Action.PUT, + new Tuple<>("k1".getBytes(), "v1".getBytes()))); + testEvents.add(new Tuple<>(Action.PUT, + new Tuple<>("k2".getBytes(), "v2".getBytes()))); + testEvents.add(new Tuple<>(Action.PUT, + new Tuple<>("k3".getBytes(), "v3".getBytes()))); + testEvents.add(new Tuple<>(Action.LOG, + new Tuple(null, "log1".getBytes()))); + testEvents.add(new Tuple<>(Action.MERGE, + new Tuple<>("k2".getBytes(), "v22".getBytes()))); + testEvents.add(new Tuple<>(Action.DELETE, + new Tuple("k3".getBytes(), null))); + + // load test data to the write batch + batch = new WriteBatch(); + for (final Tuple> testEvent : testEvents) { + final Tuple data = testEvent.value; + switch (testEvent.key) { + + case PUT: + batch.put(data.key, data.value); + break; + + case MERGE: + batch.merge(data.key, data.value); + break; + + case DELETE: + batch.remove(data.key); + break; + + case LOG: + batch.putLogData(data.value); + break; + } + } + + // attempt to read test data back from the WriteBatch by iterating with a handler + handler = new CapturingWriteBatchHandler(); + batch.iterate(handler); + + // compare the results to the test data + final List>> actualEvents = handler.getEvents(); + assertThat(testEvents.size()).isSameAs(actualEvents.size()); + + for (int i = 0; i < 
testEvents.size(); i++) { + assertThat(equals(testEvents.get(i), actualEvents.get(i))).isTrue(); + } + } finally { + if (handler != null) { + handler.dispose(); + } + if (batch != null) { + batch.dispose(); + } + } + } + + private static boolean equals(final Tuple> expected, + final Tuple> actual) { + if (!expected.key.equals(actual.key)) { + return false; + } + + final Tuple expectedData = expected.value; + final Tuple actualData = actual.value; + + return equals(expectedData.key, actualData.key) + && equals(expectedData.value, actualData.value); + } + + private static boolean equals(byte[] expected, byte[] actual) { + if (expected != null) { + return Arrays.equals(expected, actual); + } else { + return actual == null; + } + } + + private static class Tuple { + public final K key; + public final V value; + + public Tuple(final K key, final V value) { + this.key = key; + this.value = value; + } + } + + /** + * Enumeration of Write Batch + * event actions + */ + private enum Action { + PUT, + MERGE, + DELETE, + LOG + } + + /** + * A simple WriteBatch Handler which adds a record + * of each event that it receives to a list + */ + private static class CapturingWriteBatchHandler extends WriteBatch.Handler { + + private final List>> events = new ArrayList<>(); + + /** + * Returns a copy of the current events list + * + * @return a list of the events which have happened upto now + */ + public List>> getEvents() { + return new ArrayList<>(events); + } + + @Override + public void put(final byte[] key, final byte[] value) { + events.add(new Tuple<>(Action.PUT, new Tuple<>(key, value))); + } + + @Override + public void merge(final byte[] key, final byte[] value) { + events.add(new Tuple<>(Action.MERGE, new Tuple<>(key, value))); + } + + @Override + public void delete(final byte[] key) { + events.add(new Tuple<>(Action.DELETE, new Tuple(key, null))); + } + + @Override + public void logData(final byte[] blob) { + events.add(new Tuple<>(Action.LOG, new Tuple(null, blob))); + } 
+ } +} diff --git a/java/src/test/java/org/rocksdb/WriteBatchTest.java b/java/src/test/java/org/rocksdb/WriteBatchTest.java new file mode 100644 index 000000000..89a9d5405 --- /dev/null +++ b/java/src/test/java/org/rocksdb/WriteBatchTest.java @@ -0,0 +1,123 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.io.UnsupportedEncodingException; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * This class mimics the db/write_batch_test.cc + * in the c++ rocksdb library. + * + * Not ported yet: + * + * Continue(); + * PutGatherSlices(); + */ +public class WriteBatchTest { + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void emptyWriteBatch() { + WriteBatch batch = new WriteBatch(); + assertThat(batch.count()).isEqualTo(0); + } + + @Test + public void multipleBatchOperations() + throws UnsupportedEncodingException { + WriteBatch batch = new WriteBatch(); + batch.put("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII")); + batch.remove("box".getBytes("US-ASCII")); + batch.put("baz".getBytes("US-ASCII"), "boo".getBytes("US-ASCII")); + WriteBatchTestInternalHelper.setSequence(batch, 100); + assertThat(WriteBatchTestInternalHelper.sequence(batch)). + isNotNull(). 
+ isEqualTo(100); + assertThat(batch.count()).isEqualTo(3); + assertThat(new String(getContents(batch), "US-ASCII")). + isEqualTo("Put(baz, boo)@102" + + "Delete(box)@101" + + "Put(foo, bar)@100"); + } + + @Test + public void testAppendOperation() + throws UnsupportedEncodingException { + WriteBatch b1 = new WriteBatch(); + WriteBatch b2 = new WriteBatch(); + WriteBatchTestInternalHelper.setSequence(b1, 200); + WriteBatchTestInternalHelper.setSequence(b2, 300); + WriteBatchTestInternalHelper.append(b1, b2); + assertThat(getContents(b1).length).isEqualTo(0); + assertThat(b1.count()).isEqualTo(0); + b2.put("a".getBytes("US-ASCII"), "va".getBytes("US-ASCII")); + WriteBatchTestInternalHelper.append(b1, b2); + assertThat("Put(a, va)@200".equals(new String(getContents(b1), "US-ASCII"))); + assertThat(b1.count()).isEqualTo(1); + b2.clear(); + b2.put("b".getBytes("US-ASCII"), "vb".getBytes("US-ASCII")); + WriteBatchTestInternalHelper.append(b1, b2); + assertThat(("Put(a, va)@200" + + "Put(b, vb)@201") + .equals(new String(getContents(b1), "US-ASCII"))); + assertThat(b1.count()).isEqualTo(2); + b2.remove("foo".getBytes("US-ASCII")); + WriteBatchTestInternalHelper.append(b1, b2); + assertThat(("Put(a, va)@200" + + "Put(b, vb)@202" + + "Put(b, vb)@201" + + "Delete(foo)@203") + .equals(new String(getContents(b1), "US-ASCII"))); + assertThat(b1.count()).isEqualTo(4); + } + + @Test + public void blobOperation() + throws UnsupportedEncodingException { + WriteBatch batch = new WriteBatch(); + batch.put("k1".getBytes("US-ASCII"), "v1".getBytes("US-ASCII")); + batch.put("k2".getBytes("US-ASCII"), "v2".getBytes("US-ASCII")); + batch.put("k3".getBytes("US-ASCII"), "v3".getBytes("US-ASCII")); + batch.putLogData("blob1".getBytes("US-ASCII")); + batch.remove("k2".getBytes("US-ASCII")); + batch.putLogData("blob2".getBytes("US-ASCII")); + batch.merge("foo".getBytes("US-ASCII"), "bar".getBytes("US-ASCII")); + assertThat(batch.count()).isEqualTo(5); + assertThat(("Merge(foo, bar)@4" + + 
"Put(k1, v1)@0" + + "Delete(k2)@3" + + "Put(k2, v2)@1" + + "Put(k3, v3)@2") + .equals(new String(getContents(batch), "US-ASCII"))); + } + + static native byte[] getContents(WriteBatch batch); +} + +/** + * Package-private class which provides java api to access + * c++ WriteBatchInternal. + */ +class WriteBatchTestInternalHelper { + static native void setSequence(WriteBatch batch, long sn); + static native long sequence(WriteBatch batch); + static native void append(WriteBatch b1, WriteBatch b2); +} diff --git a/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java b/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java new file mode 100644 index 000000000..f7eed556a --- /dev/null +++ b/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java @@ -0,0 +1,239 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.nio.ByteBuffer; +import java.util.ArrayDeque; +import java.util.Deque; + +import static org.assertj.core.api.Assertions.assertThat; + + +public class WriteBatchWithIndexTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + @Test + public void readYourOwnWrites() throws RocksDBException { + RocksDB db = null; + Options options = null; + try { + options = new Options(); + // Setup options + options.setCreateIfMissing(true); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + + final byte[] k1 = "key1".getBytes(); + final byte[] v1 = "value1".getBytes(); + final byte[] k2 = "key2".getBytes(); + final byte[] v2 = "value2".getBytes(); + + db.put(k1, v1); + db.put(k2, v2); + + final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true); + + RocksIterator base = null; + RocksIterator it = null; + try { + base = db.newIterator(); + it = wbwi.newIteratorWithBase(base); + + it.seek(k1); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k1); + assertThat(it.value()).isEqualTo(v1); + + it.seek(k2); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k2); + assertThat(it.value()).isEqualTo(v2); + + //put data to the write batch and make sure we can read it. 
+ final byte[] k3 = "key3".getBytes(); + final byte[] v3 = "value3".getBytes(); + wbwi.put(k3, v3); + it.seek(k3); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k3); + assertThat(it.value()).isEqualTo(v3); + + //update k2 in the write batch and check the value + final byte[] v2Other = "otherValue2".getBytes(); + wbwi.put(k2, v2Other); + it.seek(k2); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k2); + assertThat(it.value()).isEqualTo(v2Other); + + //remove k1 and make sure we can read back the write + wbwi.remove(k1); + it.seek(k1); + assertThat(it.key()).isNotEqualTo(k1); + + //reinsert k1 and make sure we see the new value + final byte[] v1Other = "otherValue1".getBytes(); + wbwi.put(k1, v1Other); + it.seek(k1); + assertThat(it.isValid()).isTrue(); + assertThat(it.key()).isEqualTo(k1); + assertThat(it.value()).isEqualTo(v1Other); + } finally { + if (it != null) { + it.dispose(); + } + if (base != null) { + base.dispose(); + } + } + + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void write_writeBatchWithIndex() throws RocksDBException { + RocksDB db = null; + Options options = null; + try { + options = new Options(); + // Setup options + options.setCreateIfMissing(true); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + + final byte[] k1 = "key1".getBytes(); + final byte[] v1 = "value1".getBytes(); + final byte[] k2 = "key2".getBytes(); + final byte[] v2 = "value2".getBytes(); + + WriteBatchWithIndex wbwi = null; + + try { + wbwi = new WriteBatchWithIndex(); + + + wbwi.put(k1, v1); + wbwi.put(k2, v2); + + db.write(new WriteOptions(), wbwi); + } finally { + if(wbwi != null) { + wbwi.dispose(); + } + } + + assertThat(db.get(k1)).isEqualTo(v1); + assertThat(db.get(k2)).isEqualTo(v2); + + } finally { + if (db != null) { + db.close(); + } + if (options != null) { + options.dispose(); + } + } + } + + @Test + public void 
iterator() throws RocksDBException { + final WriteBatchWithIndex wbwi = new WriteBatchWithIndex(true); + + final String k1 = "key1"; + final String v1 = "value1"; + final String k2 = "key2"; + final String v2 = "value2"; + final String k3 = "key3"; + final String v3 = "value3"; + final byte[] k1b = k1.getBytes(); + final byte[] v1b = v1.getBytes(); + final byte[] k2b = k2.getBytes(); + final byte[] v2b = v2.getBytes(); + final byte[] k3b = k3.getBytes(); + final byte[] v3b = v3.getBytes(); + + //add put records + wbwi.put(k1b, v1b); + wbwi.put(k2b, v2b); + wbwi.put(k3b, v3b); + + //add a deletion record + final String k4 = "key4"; + final byte[] k4b = k4.getBytes(); + wbwi.remove(k4b); + + WBWIRocksIterator.WriteEntry[] expected = { + new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT, + new DirectSlice(k1), new DirectSlice(v1)), + new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT, + new DirectSlice(k2), new DirectSlice(v2)), + new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.PUT, + new DirectSlice(k3), new DirectSlice(v3)), + new WBWIRocksIterator.WriteEntry(WBWIRocksIterator.WriteType.DELETE, + new DirectSlice(k4), DirectSlice.NONE) + }; + + WBWIRocksIterator it = null; + try { + it = wbwi.newIterator(); + + //direct access - seek to key offsets + final int[] testOffsets = {2, 0, 1, 3}; + + for(int i = 0; i < testOffsets.length; i++) { + final int testOffset = testOffsets[i]; + final byte[] key = toArray(expected[testOffset].getKey().data()); + + it.seek(key); + assertThat(it.isValid()).isTrue(); + assertThat(it.entry().equals(expected[testOffset])).isTrue(); + } + + //forward iterative access + int i = 0; + for(it.seekToFirst(); it.isValid(); it.next()) { + assertThat(it.entry().equals(expected[i++])).isTrue(); + } + + //reverse iterative access + i = expected.length - 1; + for(it.seekToLast(); it.isValid(); it.prev()) { + assertThat(it.entry().equals(expected[i--])).isTrue(); + } + + } finally { + if(it != null) { + 
it.dispose(); + } + } + } + + private byte[] toArray(final ByteBuffer buf) { + final byte[] ary = new byte[buf.remaining()]; + buf.get(ary); + return ary; + } +} diff --git a/java/src/test/java/org/rocksdb/WriteOptionsTest.java b/java/src/test/java/org/rocksdb/WriteOptionsTest.java new file mode 100644 index 000000000..4d8e6d97e --- /dev/null +++ b/java/src/test/java/org/rocksdb/WriteOptionsTest.java @@ -0,0 +1,31 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +package org.rocksdb; + +import org.junit.ClassRule; +import org.junit.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class WriteOptionsTest { + + @ClassRule + public static final RocksMemoryResource rocksMemoryResource = + new RocksMemoryResource(); + + @Test + public void writeOptions(){ + WriteOptions writeOptions = new WriteOptions(); + writeOptions.setDisableWAL(true); + assertThat(writeOptions.disableWAL()).isTrue(); + writeOptions.setDisableWAL(false); + assertThat(writeOptions.disableWAL()).isFalse(); + writeOptions.setSync(true); + assertThat(writeOptions.sync()).isTrue(); + writeOptions.setSync(false); + assertThat(writeOptions.sync()).isFalse(); + } +} diff --git a/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java b/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java new file mode 100644 index 000000000..61655f33c --- /dev/null +++ b/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java @@ -0,0 +1,65 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+package org.rocksdb.test; + +import org.junit.internal.JUnitSystem; +import org.junit.internal.RealSystem; +import org.junit.internal.TextListener; +import org.junit.runner.Description; +import org.junit.runner.JUnitCore; + +import java.util.ArrayList; +import java.util.List; + +/** + * Custom Junit Runner to print also Test classes + * and executed methods to command prompt. + */ +public class RocksJunitRunner { + + /** + * Listener which overrides default functionality + * to print class and method to system out. + */ + static class RocksJunitListener extends TextListener { + + /** + * RocksJunitListener constructor + * + * @param system JUnitSystem + */ + public RocksJunitListener(JUnitSystem system) { + super(system); + } + + @Override + public void testStarted(Description description) { + System.out.format("Run: %s testing now -> %s \n", + description.getClassName(), + description.getMethodName()); + } + } + + /** + * Main method to execute tests + * + * @param args Test classes as String names + */ + public static void main(String[] args){ + JUnitCore runner = new JUnitCore(); + final JUnitSystem system = new RealSystem(); + runner.addListener(new RocksJunitListener(system)); + try { + List> classes = new ArrayList<>(); + for (String arg : args) { + classes.add(Class.forName(arg)); + } + runner.run(classes.toArray(new Class[1])); + + } catch (ClassNotFoundException e) { + e.printStackTrace(); + } + } +} diff --git a/java/src/test/java/org/rocksdb/util/EnvironmentTest.java b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java new file mode 100644 index 000000000..741effebb --- /dev/null +++ b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java @@ -0,0 +1,142 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+package org.rocksdb.util; + +import org.junit.Test; + +import java.lang.reflect.Field; +import java.lang.reflect.Modifier; + +import static org.assertj.core.api.Assertions.assertThat; + +public class EnvironmentTest { + + // Init static context + private static Environment environment = + new Environment(); + + @Test + public void mac32() { + setEnvironmentClassFields("mac", "32"); + assertThat(Environment.isWindows()).isFalse(); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".jnilib"); + assertThat(Environment.getJniLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni-osx.jnilib"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni.dylib"); + } + + @Test + public void mac64() { + setEnvironmentClassFields("mac", "64"); + assertThat(Environment.isWindows()).isFalse(); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".jnilib"); + assertThat(Environment.getJniLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni-osx.jnilib"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni.dylib"); + } + + @Test + public void nix32() { + // Linux + setEnvironmentClassFields("Linux", "32"); + assertThat(Environment.isWindows()).isFalse(); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".so"); + assertThat(Environment.getJniLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni-linux32.so"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni.so"); + // UNIX + setEnvironmentClassFields("Unix", "32"); + assertThat(Environment.isWindows()).isFalse(); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".so"); + assertThat(Environment.getJniLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni-linux32.so"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")). 
+ isEqualTo("librocksdbjni.so"); + // AIX + setEnvironmentClassFields("aix", "32"); + assertThat(Environment.isWindows()).isFalse(); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".so"); + assertThat(Environment.getJniLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni-linux32.so"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni.so"); + } + + @Test + public void nix64() { + setEnvironmentClassFields("Linux", "x64"); + assertThat(Environment.isWindows()).isFalse(); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".so"); + assertThat(Environment.getJniLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni-linux64.so"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni.so"); + // UNIX + setEnvironmentClassFields("Unix", "x64"); + assertThat(Environment.isWindows()).isFalse(); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".so"); + assertThat(Environment.getJniLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni-linux64.so"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni.so"); + // AIX + setEnvironmentClassFields("aix", "x64"); + assertThat(Environment.isWindows()).isFalse(); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".so"); + assertThat(Environment.getJniLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni-linux64.so"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")). 
+ isEqualTo("librocksdbjni.so"); + } + + @Test + public void detectWindows(){ + setEnvironmentClassFields("win", "x64"); + assertThat(Environment.isWindows()).isTrue(); + } + + @Test(expected = UnsupportedOperationException.class) + public void failWinJniLibraryName(){ + setEnvironmentClassFields("win", "x64"); + Environment.getJniLibraryFileName("rocksdb"); + } + + @Test(expected = UnsupportedOperationException.class) + public void failWinSharedLibrary(){ + setEnvironmentClassFields("win", "x64"); + Environment.getSharedLibraryFileName("rocksdb"); + } + + private void setEnvironmentClassFields(String osName, + String osArch) { + setEnvironmentClassField("OS", osName); + setEnvironmentClassField("ARCH", osArch); + } + + private void setEnvironmentClassField(String fieldName, String value) { + final Field field; + try { + field = Environment.class.getDeclaredField(fieldName); + field.setAccessible(true); + final Field modifiersField = Field.class.getDeclaredField("modifiers"); + modifiersField.setAccessible(true); + modifiersField.setInt(field, field.getModifiers() & ~Modifier.FINAL); + field.set(null, value); + } catch (NoSuchFieldException | IllegalAccessException e) { + throw new RuntimeException(e); + } + } +} diff --git a/java/src/test/java/org/rocksdb/util/SizeUnitTest.java b/java/src/test/java/org/rocksdb/util/SizeUnitTest.java new file mode 100644 index 000000000..517e1b2b5 --- /dev/null +++ b/java/src/test/java/org/rocksdb/util/SizeUnitTest.java @@ -0,0 +1,27 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+package org.rocksdb.util; + +import org.junit.Test; + +import static org.assertj.core.api.Assertions.assertThat; + +public class SizeUnitTest { + + public static final long COMPUTATION_UNIT = 1024L; + + @Test + public void sizeUnit() { + assertThat(SizeUnit.KB).isEqualTo(COMPUTATION_UNIT); + assertThat(SizeUnit.MB).isEqualTo( + SizeUnit.KB * COMPUTATION_UNIT); + assertThat(SizeUnit.GB).isEqualTo( + SizeUnit.MB * COMPUTATION_UNIT); + assertThat(SizeUnit.TB).isEqualTo( + SizeUnit.GB * COMPUTATION_UNIT); + assertThat(SizeUnit.PB).isEqualTo( + SizeUnit.TB * COMPUTATION_UNIT); + } +} diff --git a/linters/cpp_linter/FbcodeCppLinter.php b/linters/cpp_linter/FbcodeCppLinter.php index e62d3bbe1..c7b4935e7 100644 --- a/linters/cpp_linter/FbcodeCppLinter.php +++ b/linters/cpp_linter/FbcodeCppLinter.php @@ -31,7 +31,7 @@ class FbcodeCppLinter extends ArcanistLinter { $this->getEngine()->getFilePathOnDisk($p)); } else { $futures[$p] = new ExecFuture("%s %s 2>&1", - self::CPPLINT, $this->getEngine()->getFilePathOnDisk($p)); + $CPP_LINT, $this->getEngine()->getFilePathOnDisk($p)); } } @@ -68,6 +68,9 @@ class FbcodeCppLinter extends ArcanistLinter { } private function getCppLintOutput($path) { + if (!array_key_exists($path, $this->rawLintOutput)) { + return array(); + } list($output) = $this->rawLintOutput[$path]; $msgs = array(); diff --git a/linters/cpp_linter/cpplint.py b/linters/cpp_linter/cpplint.py index d264b00da..d6201945a 100755 --- a/linters/cpp_linter/cpplint.py +++ b/linters/cpp_linter/cpplint.py @@ -213,7 +213,7 @@ _ERROR_CATEGORIES = [ # flag. By default all errors are on, so only add here categories that should be # off by default (i.e., categories that must be enabled by the --filter= flags). # All entries here should start with a '-' or '+', as in the --filter= flag. 
-_DEFAULT_FILTERS = ['-build/include_alpha'] +_DEFAULT_FILTERS = [] # We used to check for high-bit characters, but after much discussion we # decided those were OK, as long as they were in UTF-8 and didn't represent diff --git a/linters/lint_engine/FacebookFbcodeLintEngine.php b/linters/lint_engine/FacebookFbcodeLintEngine.php index cb9cf9bdb..131b34efb 100644 --- a/linters/lint_engine/FacebookFbcodeLintEngine.php +++ b/linters/lint_engine/FacebookFbcodeLintEngine.php @@ -36,25 +36,14 @@ class FacebookFbcodeLintEngine extends ArcanistLintEngine { )); $linters[] = $java_text_linter; - $pep8_options = $this->getPEP8WithTextOptions().',E302'; - $python_linter = new ArcanistPEP8Linter(); - $python_linter->setConfig(array('options' => $pep8_options)); $linters[] = $python_linter; - $python_2space_linter = new ArcanistPEP8Linter(); - $python_2space_linter->setConfig(array('options' => $pep8_options.',E111')); - $linters[] = $python_2space_linter; - // Currently we can't run cpplint in commit hook mode, because it // depends on having access to the working directory. 
if (!$this->getCommitHookMode()) { $cpp_linters = array(); $google_linter = new ArcanistCpplintLinter(); - $google_linter->setConfig(array( - 'lint.cpplint.prefix' => '', - 'lint.cpplint.bin' => 'cpplint', - )); $cpp_linters[] = $linters[] = $google_linter; $cpp_linters[] = $linters[] = new FbcodeCppLinter(); $cpp_linters[] = $linters[] = new PfffCppLinter(); @@ -119,11 +108,7 @@ class FacebookFbcodeLintEngine extends ArcanistLintEngine { $dir = dirname($dir); } while ($dir != '/' && $dir != '.'); - if ($space_count == 4) { - $cur_path_linter = $python_linter; - } else { - $cur_path_linter = $python_2space_linter; - } + $cur_path_linter = $python_linter; $cur_path_linter->addPath($path); $cur_path_linter->addData($path, $this->loadData($path)); diff --git a/port/atomic_pointer.h b/port/atomic_pointer.h deleted file mode 100644 index db3580bde..000000000 --- a/port/atomic_pointer.h +++ /dev/null @@ -1,157 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -// AtomicPointer provides storage for a lock-free pointer. -// Platform-dependent implementation of AtomicPointer: -// - If the platform provides a cheap barrier, we use it with raw pointers -// - If cstdatomic is present (on newer versions of gcc, it is), we use -// a cstdatomic-based AtomicPointer. However we prefer the memory -// barrier based version, because at least on a gcc 4.4 32-bit build -// on linux, we have encountered a buggy -// implementation. 
Also, some implementations are much -// slower than a memory-barrier based implementation (~16ns for -// based acquire-load vs. ~1ns for a barrier based -// acquire-load). -// This code is based on atomicops-internals-* in Google's perftools: -// http://code.google.com/p/google-perftools/source/browse/#svn%2Ftrunk%2Fsrc%2Fbase - -#ifndef PORT_ATOMIC_POINTER_H_ -#define PORT_ATOMIC_POINTER_H_ - -#include -#ifdef ROCKSDB_ATOMIC_PRESENT -#include -#endif -#ifdef OS_WIN -#include -#endif -#ifdef OS_MACOSX -#include -#endif - -#if defined(_M_X64) || defined(__x86_64__) -#define ARCH_CPU_X86_FAMILY 1 -#elif defined(_M_IX86) || defined(__i386__) || defined(__i386) -#define ARCH_CPU_X86_FAMILY 1 -#elif defined(__ARMEL__) -#define ARCH_CPU_ARM_FAMILY 1 -#endif - -namespace rocksdb { -namespace port { - -// Define MemoryBarrier() if available -// Windows on x86 -#if defined(OS_WIN) && defined(COMPILER_MSVC) && defined(ARCH_CPU_X86_FAMILY) -// windows.h already provides a MemoryBarrier(void) macro -// http://msdn.microsoft.com/en-us/library/ms684208(v=vs.85).aspx -#define ROCKSDB_HAVE_MEMORY_BARRIER - -// Gcc on x86 -#elif defined(ARCH_CPU_X86_FAMILY) && defined(__GNUC__) -inline void MemoryBarrier() { - // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on - // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering. - __asm__ __volatile__("" : : : "memory"); -} -#define ROCKSDB_HAVE_MEMORY_BARRIER - -// Sun Studio -#elif defined(ARCH_CPU_X86_FAMILY) && defined(__SUNPRO_CC) -inline void MemoryBarrier() { - // See http://gcc.gnu.org/ml/gcc/2003-04/msg01180.html for a discussion on - // this idiom. Also see http://en.wikipedia.org/wiki/Memory_ordering. 
- asm volatile("" : : : "memory"); -} -#define ROCKSDB_HAVE_MEMORY_BARRIER - -// Mac OS -#elif defined(OS_MACOSX) -inline void MemoryBarrier() { - OSMemoryBarrier(); -} -#define ROCKSDB_HAVE_MEMORY_BARRIER - -// ARM Linux -#elif defined(ARCH_CPU_ARM_FAMILY) && defined(__linux__) -typedef void (*LinuxKernelMemoryBarrierFunc)(void); -// The Linux ARM kernel provides a highly optimized device-specific memory -// barrier function at a fixed memory address that is mapped in every -// user-level process. -// -// This beats using CPU-specific instructions which are, on single-core -// devices, un-necessary and very costly (e.g. ARMv7-A "dmb" takes more -// than 180ns on a Cortex-A8 like the one on a Nexus One). Benchmarking -// shows that the extra function call cost is completely negligible on -// multi-core devices. -// -inline void MemoryBarrier() { - (*(LinuxKernelMemoryBarrierFunc)0xffff0fa0)(); -} -#define ROCKSDB_HAVE_MEMORY_BARRIER - -#endif - -// AtomicPointer built using platform-specific MemoryBarrier() -#if defined(ROCKSDB_HAVE_MEMORY_BARRIER) -class AtomicPointer { - private: - void* rep_; - public: - AtomicPointer() { } - explicit AtomicPointer(void* p) : rep_(p) {} - inline void* NoBarrier_Load() const { return rep_; } - inline void NoBarrier_Store(void* v) { rep_ = v; } - inline void* Acquire_Load() const { - void* result = rep_; - MemoryBarrier(); - return result; - } - inline void Release_Store(void* v) { - MemoryBarrier(); - rep_ = v; - } -}; - -// AtomicPointer based on -#elif defined(ROCKSDB_ATOMIC_PRESENT) -class AtomicPointer { - private: - std::atomic rep_; - public: - AtomicPointer() { } - explicit AtomicPointer(void* v) : rep_(v) { } - inline void* Acquire_Load() const { - return rep_.load(std::memory_order_acquire); - } - inline void Release_Store(void* v) { - rep_.store(v, std::memory_order_release); - } - inline void* NoBarrier_Load() const { - return rep_.load(std::memory_order_relaxed); - } - inline void NoBarrier_Store(void* v) { - 
rep_.store(v, std::memory_order_relaxed); - } -}; - -// We have neither MemoryBarrier(), nor -#else -#error Please implement AtomicPointer for this platform. - -#endif - -#undef ROCKSDB_HAVE_MEMORY_BARRIER -#undef ARCH_CPU_X86_FAMILY -#undef ARCH_CPU_ARM_FAMILY - -} // namespace port -} // namespace rocksdb - -#endif // PORT_ATOMIC_POINTER_H_ diff --git a/port/port.h b/port/port.h index 2dc9a0fa6..153dc5b5b 100644 --- a/port/port.h +++ b/port/port.h @@ -10,13 +10,13 @@ #ifndef STORAGE_LEVELDB_PORT_PORT_H_ #define STORAGE_LEVELDB_PORT_PORT_H_ -#include +#include // Include the appropriate platform specific file below. If you are // porting to a new platform, see "port_example.h" for documentation // of what the new port_.h file must provide. #if defined(ROCKSDB_PLATFORM_POSIX) -# include "port/port_posix.h" +#include "port/port_posix.h" #endif #endif // STORAGE_LEVELDB_PORT_PORT_H_ diff --git a/port/port_example.h b/port/port_example.h index f124abb06..ba14618fa 100644 --- a/port/port_example.h +++ b/port/port_example.h @@ -75,35 +75,6 @@ typedef intptr_t OnceType; #define LEVELDB_ONCE_INIT 0 extern void InitOnce(port::OnceType*, void (*initializer)()); -// A type that holds a pointer that can be read or written atomically -// (i.e., without word-tearing.) -class AtomicPointer { - private: - intptr_t rep_; - public: - // Initialize to arbitrary value - AtomicPointer(); - - // Initialize to hold v - explicit AtomicPointer(void* v) : rep_(v) { } - - // Read and return the stored pointer with the guarantee that no - // later memory access (read or write) by this thread can be - // reordered ahead of this read. - void* Acquire_Load() const; - - // Set v as the stored pointer with the guarantee that no earlier - // memory access (read or write) by this thread can be reordered - // after this store. - void Release_Store(void* v); - - // Read the stored pointer with no ordering guarantees. 
- void* NoBarrier_Load() const; - - // Set va as the stored pointer with no ordering guarantees. - void NoBarrier_Store(void* v); -}; - // ------------------ Compression ------------------- // Store the snappy compression of "input[0,input_length-1]" in *output. diff --git a/port/port_posix.cc b/port/port_posix.cc index c5ea439eb..a8cffcc7e 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -88,8 +88,8 @@ void CondVar::Wait() { bool CondVar::TimedWait(uint64_t abs_time_us) { struct timespec ts; - ts.tv_sec = abs_time_us / 1000000; - ts.tv_nsec = (abs_time_us % 1000000) * 1000; + ts.tv_sec = static_cast(abs_time_us / 1000000); + ts.tv_nsec = static_cast((abs_time_us % 1000000) * 1000); #ifndef NDEBUG mu_->locked_ = false; diff --git a/port/port_posix.h b/port/port_posix.h index 2e3c868b3..f730c483b 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -34,28 +34,10 @@ #include #endif #include -#ifdef SNAPPY -#include -#endif - -#ifdef ZLIB -#include -#endif - -#ifdef BZIP2 -#include -#endif - -#if defined(LZ4) -#include -#include -#endif #include #include #include -#include "rocksdb/options.h" -#include "port/atomic_pointer.h" #ifndef PLATFORM_IS_LITTLE_ENDIAN #define PLATFORM_IS_LITTLE_ENDIAN (__BYTE_ORDER == __LITTLE_ENDIAN) @@ -150,339 +132,6 @@ typedef pthread_once_t OnceType; #define LEVELDB_ONCE_INIT PTHREAD_ONCE_INIT extern void InitOnce(OnceType* once, void (*initializer)()); -inline bool Snappy_Compress(const CompressionOptions& opts, const char* input, - size_t length, ::std::string* output) { -#ifdef SNAPPY - output->resize(snappy::MaxCompressedLength(length)); - size_t outlen; - snappy::RawCompress(input, length, &(*output)[0], &outlen); - output->resize(outlen); - return true; -#endif - - return false; -} - -inline bool Snappy_GetUncompressedLength(const char* input, size_t length, - size_t* result) { -#ifdef SNAPPY - return snappy::GetUncompressedLength(input, length, result); -#else - return false; -#endif -} - -inline bool 
Snappy_Uncompress(const char* input, size_t length, - char* output) { -#ifdef SNAPPY - return snappy::RawUncompress(input, length, output); -#else - return false; -#endif -} - -inline bool Zlib_Compress(const CompressionOptions& opts, const char* input, - size_t length, ::std::string* output) { -#ifdef ZLIB - // The memLevel parameter specifies how much memory should be allocated for - // the internal compression state. - // memLevel=1 uses minimum memory but is slow and reduces compression ratio. - // memLevel=9 uses maximum memory for optimal speed. - // The default value is 8. See zconf.h for more details. - static const int memLevel = 8; - z_stream _stream; - memset(&_stream, 0, sizeof(z_stream)); - int st = deflateInit2(&_stream, opts.level, Z_DEFLATED, opts.window_bits, - memLevel, opts.strategy); - if (st != Z_OK) { - return false; - } - - // Resize output to be the plain data length. - // This may not be big enough if the compression actually expands data. - output->resize(length); - - // Compress the input, and put compressed data in output. - _stream.next_in = (Bytef *)input; - _stream.avail_in = length; - - // Initialize the output size. - _stream.avail_out = length; - _stream.next_out = (Bytef *)&(*output)[0]; - - int old_sz =0, new_sz =0, new_sz_delta =0; - bool done = false; - while (!done) { - int st = deflate(&_stream, Z_FINISH); - switch (st) { - case Z_STREAM_END: - done = true; - break; - case Z_OK: - // No output space. Increase the output space by 20%. - // (Should we fail the compression since it expands the size?) - old_sz = output->size(); - new_sz_delta = (int)(output->size() * 0.2); - new_sz = output->size() + (new_sz_delta < 10 ? 10 : new_sz_delta); - output->resize(new_sz); - // Set more output. 
- _stream.next_out = (Bytef *)&(*output)[old_sz]; - _stream.avail_out = new_sz - old_sz; - break; - case Z_BUF_ERROR: - default: - deflateEnd(&_stream); - return false; - } - } - - output->resize(output->size() - _stream.avail_out); - deflateEnd(&_stream); - return true; -#endif - return false; -} - -inline char* Zlib_Uncompress(const char* input_data, size_t input_length, - int* decompress_size, int windowBits = -14) { -#ifdef ZLIB - z_stream _stream; - memset(&_stream, 0, sizeof(z_stream)); - - // For raw inflate, the windowBits should be -8..-15. - // If windowBits is bigger than zero, it will use either zlib - // header or gzip header. Adding 32 to it will do automatic detection. - int st = inflateInit2(&_stream, - windowBits > 0 ? windowBits + 32 : windowBits); - if (st != Z_OK) { - return nullptr; - } - - _stream.next_in = (Bytef *)input_data; - _stream.avail_in = input_length; - - // Assume the decompressed data size will 5x of compressed size. - int output_len = input_length * 5; - char* output = new char[output_len]; - int old_sz = output_len; - - _stream.next_out = (Bytef *)output; - _stream.avail_out = output_len; - - char* tmp = nullptr; - int output_len_delta; - bool done = false; - - //while(_stream.next_in != nullptr && _stream.avail_in != 0) { - while (!done) { - int st = inflate(&_stream, Z_SYNC_FLUSH); - switch (st) { - case Z_STREAM_END: - done = true; - break; - case Z_OK: - // No output space. Increase the output space by 20%. - old_sz = output_len; - output_len_delta = (int)(output_len * 0.2); - output_len += output_len_delta < 10 ? 10 : output_len_delta; - tmp = new char[output_len]; - memcpy(tmp, output, old_sz); - delete[] output; - output = tmp; - - // Set more output. 
- _stream.next_out = (Bytef *)(output + old_sz); - _stream.avail_out = output_len - old_sz; - break; - case Z_BUF_ERROR: - default: - delete[] output; - inflateEnd(&_stream); - return nullptr; - } - } - - *decompress_size = output_len - _stream.avail_out; - inflateEnd(&_stream); - return output; -#endif - - return nullptr; -} - -inline bool BZip2_Compress(const CompressionOptions& opts, const char* input, - size_t length, ::std::string* output) { -#ifdef BZIP2 - bz_stream _stream; - memset(&_stream, 0, sizeof(bz_stream)); - - // Block size 1 is 100K. - // 0 is for silent. - // 30 is the default workFactor - int st = BZ2_bzCompressInit(&_stream, 1, 0, 30); - if (st != BZ_OK) { - return false; - } - - // Resize output to be the plain data length. - // This may not be big enough if the compression actually expands data. - output->resize(length); - - // Compress the input, and put compressed data in output. - _stream.next_in = (char *)input; - _stream.avail_in = length; - - // Initialize the output size. - _stream.next_out = (char *)&(*output)[0]; - _stream.avail_out = length; - - int old_sz =0, new_sz =0; - while(_stream.next_in != nullptr && _stream.avail_in != 0) { - int st = BZ2_bzCompress(&_stream, BZ_FINISH); - switch (st) { - case BZ_STREAM_END: - break; - case BZ_FINISH_OK: - // No output space. Increase the output space by 20%. - // (Should we fail the compression since it expands the size?) - old_sz = output->size(); - new_sz = (int)(output->size() * 1.2); - output->resize(new_sz); - // Set more output. 
- _stream.next_out = (char *)&(*output)[old_sz]; - _stream.avail_out = new_sz - old_sz; - break; - case BZ_SEQUENCE_ERROR: - default: - BZ2_bzCompressEnd(&_stream); - return false; - } - } - - output->resize(output->size() - _stream.avail_out); - BZ2_bzCompressEnd(&_stream); - return true; -#endif - return false; -} - -inline char* BZip2_Uncompress(const char* input_data, size_t input_length, - int* decompress_size) { -#ifdef BZIP2 - bz_stream _stream; - memset(&_stream, 0, sizeof(bz_stream)); - - int st = BZ2_bzDecompressInit(&_stream, 0, 0); - if (st != BZ_OK) { - return nullptr; - } - - _stream.next_in = (char *)input_data; - _stream.avail_in = input_length; - - // Assume the decompressed data size will be 5x of compressed size. - int output_len = input_length * 5; - char* output = new char[output_len]; - int old_sz = output_len; - - _stream.next_out = (char *)output; - _stream.avail_out = output_len; - - char* tmp = nullptr; - - while(_stream.next_in != nullptr && _stream.avail_in != 0) { - int st = BZ2_bzDecompress(&_stream); - switch (st) { - case BZ_STREAM_END: - break; - case BZ_OK: - // No output space. Increase the output space by 20%. - old_sz = output_len; - output_len = (int)(output_len * 1.2); - tmp = new char[output_len]; - memcpy(tmp, output, old_sz); - delete[] output; - output = tmp; - - // Set more output. 
- _stream.next_out = (char *)(output + old_sz); - _stream.avail_out = output_len - old_sz; - break; - default: - delete[] output; - BZ2_bzDecompressEnd(&_stream); - return nullptr; - } - } - - *decompress_size = output_len - _stream.avail_out; - BZ2_bzDecompressEnd(&_stream); - return output; -#endif - return nullptr; -} - -inline bool LZ4_Compress(const CompressionOptions &opts, const char *input, - size_t length, ::std::string* output) { -#ifdef LZ4 - int compressBound = LZ4_compressBound(length); - output->resize(8 + compressBound); - char *p = const_cast(output->c_str()); - memcpy(p, &length, sizeof(length)); - size_t outlen; - outlen = LZ4_compress_limitedOutput(input, p + 8, length, compressBound); - if (outlen == 0) { - return false; - } - output->resize(8 + outlen); - return true; -#endif - return false; -} - -inline char* LZ4_Uncompress(const char* input_data, size_t input_length, - int* decompress_size) { -#ifdef LZ4 - if (input_length < 8) { - return nullptr; - } - int output_len; - memcpy(&output_len, input_data, sizeof(output_len)); - char *output = new char[output_len]; - *decompress_size = LZ4_decompress_safe_partial( - input_data + 8, output, input_length - 8, output_len, output_len); - if (*decompress_size < 0) { - delete[] output; - return nullptr; - } - return output; -#endif - return nullptr; -} - -inline bool LZ4HC_Compress(const CompressionOptions &opts, const char* input, - size_t length, ::std::string* output) { -#ifdef LZ4 - int compressBound = LZ4_compressBound(length); - output->resize(8 + compressBound); - char *p = const_cast(output->c_str()); - memcpy(p, &length, sizeof(length)); - size_t outlen; -#ifdef LZ4_VERSION_MAJOR // they only started defining this since r113 - outlen = LZ4_compressHC2_limitedOutput(input, p + 8, length, compressBound, - opts.level); -#else - outlen = LZ4_compressHC_limitedOutput(input, p + 8, length, compressBound); -#endif - if (outlen == 0) { - return false; - } - output->resize(8 + outlen); - return true; 
-#endif - return false; -} - #define CACHE_LINE_SIZE 64U #define PREFETCH(addr, rw, locality) __builtin_prefetch(addr, rw, locality) diff --git a/port/stack_trace.cc b/port/stack_trace.cc index 76866e63c..b2075b9a9 100644 --- a/port/stack_trace.cc +++ b/port/stack_trace.cc @@ -5,15 +5,16 @@ // #include "port/stack_trace.h" -namespace rocksdb { -namespace port { - #if defined(ROCKSDB_LITE) || !(defined(OS_LINUX) || defined(OS_MACOSX)) // noop +namespace rocksdb { +namespace port { void InstallStackTraceHandler() {} void PrintStack(int first_frames_to_skip) {} +} // namespace port +} // namespace rocksdb #else @@ -25,6 +26,9 @@ void PrintStack(int first_frames_to_skip) {} #include #include +namespace rocksdb { +namespace port { + namespace { #ifdef OS_LINUX @@ -33,7 +37,7 @@ const char* GetExecutableName() { char link[1024]; snprintf(link, sizeof(link), "/proc/%d/exe", getpid()); - auto read = readlink(link, name, sizeof(name)); + auto read = readlink(link, name, sizeof(name) - 1); if (-1 == read) { return nullptr; } else { @@ -126,7 +130,7 @@ void InstallStackTraceHandler() { signal(SIGABRT, StackTraceHandler); } -#endif - } // namespace port } // namespace rocksdb + +#endif diff --git a/table/adaptive_table_factory.cc b/table/adaptive_table_factory.cc index a259e79d8..c693064af 100644 --- a/table/adaptive_table_factory.cc +++ b/table/adaptive_table_factory.cc @@ -39,7 +39,7 @@ extern const uint64_t kLegacyBlockBasedTableMagicNumber; extern const uint64_t kCuckooTableMagicNumber; Status AdaptiveTableFactory::NewTableReader( - const Options& options, const EnvOptions& soptions, + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& icomp, unique_ptr&& file, uint64_t file_size, unique_ptr* table) const { Footer footer; @@ -50,24 +50,26 @@ Status AdaptiveTableFactory::NewTableReader( if (footer.table_magic_number() == kPlainTableMagicNumber || footer.table_magic_number() == kLegacyPlainTableMagicNumber) { return 
plain_table_factory_->NewTableReader( - options, soptions, icomp, std::move(file), file_size, table); + ioptions, env_options, icomp, std::move(file), file_size, table); } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber || footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) { return block_based_table_factory_->NewTableReader( - options, soptions, icomp, std::move(file), file_size, table); + ioptions, env_options, icomp, std::move(file), file_size, table); } else if (footer.table_magic_number() == kCuckooTableMagicNumber) { return cuckoo_table_factory_->NewTableReader( - options, soptions, icomp, std::move(file), file_size, table); + ioptions, env_options, icomp, std::move(file), file_size, table); } else { return Status::NotSupported("Unidentified table format"); } } TableBuilder* AdaptiveTableFactory::NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const { - return table_factory_to_write_->NewTableBuilder(options, internal_comparator, - file, compression_type); + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts) const { + return table_factory_to_write_->NewTableBuilder( + ioptions, internal_comparator, file, compression_type, compression_opts); } std::string AdaptiveTableFactory::GetPrintableTableOptions() const { diff --git a/table/adaptive_table_factory.h b/table/adaptive_table_factory.h index 571e07498..3c6455f90 100644 --- a/table/adaptive_table_factory.h +++ b/table/adaptive_table_factory.h @@ -12,7 +12,6 @@ namespace rocksdb { -struct Options; struct EnvOptions; using std::unique_ptr; @@ -31,20 +30,26 @@ class AdaptiveTableFactory : public TableFactory { std::shared_ptr block_based_table_factory, std::shared_ptr plain_table_factory, std::shared_ptr cuckoo_table_factory); + const 
char* Name() const override { return "AdaptiveTableFactory"; } - Status NewTableReader(const Options& options, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table) const override; - TableBuilder* NewTableBuilder(const Options& options, - const InternalKeyComparator& icomparator, - WritableFile* file, - CompressionType compression_type) const - override; + + Status NewTableReader( + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table) const override; + + TableBuilder* NewTableBuilder( + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& icomparator, + WritableFile* file, + const CompressionType compression_type, + const CompressionOptions& compression_opts) const override; // Sanitizes the specified DB Options. - Status SanitizeDBOptions(DBOptions* db_opts) const override { - if (db_opts->allow_mmap_reads == false) { + Status SanitizeOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override { + if (db_opts.allow_mmap_reads == false) { return Status::NotSupported( "AdaptiveTable with allow_mmap_reads == false is not supported."); } diff --git a/table/block.cc b/table/block.cc index 0db23a1bd..6a5ede600 100644 --- a/table/block.cc +++ b/table/block.cc @@ -297,16 +297,15 @@ uint32_t Block::NumRestarts() const { return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); } -Block::Block(const BlockContents& contents) - : data_(contents.data.data()), - size_(contents.data.size()), - owned_(contents.heap_allocated), - cachable_(contents.cachable), - compression_type_(contents.compression_type) { +Block::Block(BlockContents&& contents) + : contents_(std::move(contents)), + data_(contents_.data.data()), + size_(contents_.data.size()) { if (size_ < sizeof(uint32_t)) { size_ = 0; // Error marker } else { - restart_offset_ 
= size_ - (1 + NumRestarts()) * sizeof(uint32_t); + restart_offset_ = + static_cast(size_) - (1 + NumRestarts()) * sizeof(uint32_t); if (restart_offset_ > size_ - sizeof(uint32_t)) { // The size is too small for NumRestarts() and therefore // restart_offset_ wrapped around. @@ -315,12 +314,6 @@ Block::Block(const BlockContents& contents) } } -Block::~Block() { - if (owned_) { - delete[] data_; - } -} - Iterator* Block::NewIterator( const Comparator* cmp, BlockIter* iter, bool total_order_seek) { if (size_ < 2*sizeof(uint32_t)) { diff --git a/table/block.h b/table/block.h index 49bcf12cf..0187489bb 100644 --- a/table/block.h +++ b/table/block.h @@ -14,6 +14,10 @@ #include "rocksdb/iterator.h" #include "rocksdb/options.h" #include "db/dbformat.h" +#include "table/block_prefix_index.h" +#include "table/block_hash_index.h" + +#include "format.h" namespace rocksdb { @@ -26,15 +30,17 @@ class BlockPrefixIndex; class Block { public: // Initialize the block with the specified contents. - explicit Block(const BlockContents& contents); + explicit Block(BlockContents&& contents); - ~Block(); + ~Block() = default; size_t size() const { return size_; } const char* data() const { return data_; } - bool cachable() const { return cachable_; } + bool cachable() const { return contents_.cachable; } uint32_t NumRestarts() const; - CompressionType compression_type() const { return compression_type_; } + CompressionType compression_type() const { + return contents_.compression_type; + } // If hash index lookup is enabled and `use_hash_index` is true. This block // will do hash lookup for the key prefix. 
@@ -58,12 +64,10 @@ class Block { size_t ApproximateMemoryUsage() const; private: - const char* data_; - size_t size_; + BlockContents contents_; + const char* data_; // contents_.data.data() + size_t size_; // contents_.data.size() uint32_t restart_offset_; // Offset in data_ of restart array - bool owned_; // Block owns data_[] - bool cachable_; - CompressionType compression_type_; std::unique_ptr hash_index_; std::unique_ptr prefix_index_; @@ -155,7 +159,8 @@ class BlockIter : public Iterator { // Return the offset in data_ just past the end of the current entry. inline uint32_t NextEntryOffset() const { - return (value_.data() + value_.size()) - data_; + // NOTE: We don't support files bigger than 2GB + return static_cast((value_.data() + value_.size()) - data_); } uint32_t GetRestartPoint(uint32_t index) { diff --git a/table/block_based_filter_block.cc b/table/block_based_filter_block.cc new file mode 100644 index 000000000..7037d85bc --- /dev/null +++ b/table/block_based_filter_block.cc @@ -0,0 +1,253 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include "table/block_based_filter_block.h" + +#include "db/dbformat.h" +#include "rocksdb/filter_policy.h" +#include "util/coding.h" + +namespace rocksdb { + +namespace { +bool SamePrefix(const SliceTransform* prefix_extractor, + const Slice& key1, const Slice& key2) { + if (!prefix_extractor->InDomain(key1) && + !prefix_extractor->InDomain(key2)) { + return true; + } else if (!prefix_extractor->InDomain(key1) || + !prefix_extractor->InDomain(key2)) { + return false; + } else { + return (prefix_extractor->Transform(key1) == + prefix_extractor->Transform(key2)); + } +} + +void AppendItem(std::string* props, const std::string& key, + const std::string& value) { + char cspace = ' '; + std::string value_str(""); + size_t i = 0; + const size_t dataLength = 64; + const size_t tabLength = 2; + const size_t offLength = 16; + + value_str.append(&value[i], std::min(size_t(dataLength), value.size())); + i += dataLength; + while (i < value.size()) { + value_str.append("\n"); + value_str.append(offLength, cspace); + value_str.append(&value[i], std::min(size_t(dataLength), value.size() - i)); + i += dataLength; + } + + std::string result(""); + if (key.size() < (offLength - tabLength)) + result.append(size_t((offLength - tabLength)) - key.size(), cspace); + result.append(key); + + props->append(result + ": " + value_str + "\n"); +} + +template +void AppendItem(std::string* props, const TKey& key, const std::string& value) { + std::string key_str = std::to_string(key); + AppendItem(props, key_str, value); +} +} // namespace + + +// See doc/table_format.txt for an explanation of the filter block format. 
+ +// Generate new filter every 2KB of data +static const size_t kFilterBaseLg = 11; +static const size_t kFilterBase = 1 << kFilterBaseLg; + +BlockBasedFilterBlockBuilder::BlockBasedFilterBlockBuilder( + const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt) + : policy_(table_opt.filter_policy.get()), + prefix_extractor_(prefix_extractor), + whole_key_filtering_(table_opt.whole_key_filtering) { + assert(policy_); +} + +void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) { + uint64_t filter_index = (block_offset / kFilterBase); + assert(filter_index >= filter_offsets_.size()); + while (filter_index > filter_offsets_.size()) { + GenerateFilter(); + } +} + +void BlockBasedFilterBlockBuilder::Add(const Slice& key) { + added_to_start_ = 0; + if (whole_key_filtering_) { + AddKey(key); + added_to_start_ = 1; + } + if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { + AddPrefix(key); + } +} + +// Add key to filter if needed +inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) { + start_.push_back(entries_.size()); + entries_.append(key.data(), key.size()); +} + +// Add prefix to filter if needed +inline void BlockBasedFilterBlockBuilder::AddPrefix(const Slice& key) { + // get slice for most recently added entry + Slice prev; + if (start_.size() > added_to_start_) { + size_t prev_start = start_[start_.size() - 1 - added_to_start_]; + const char* base = entries_.data() + prev_start; + size_t length = entries_.size() - prev_start; + prev = Slice(base, length); + } + + // this assumes prefix(prefix(key)) == prefix(key), as the last + // entry in entries_ may be either a key or prefix, and we use + // prefix(last entry) to get the prefix of the last key. 
+ if (prev.size() == 0 || !SamePrefix(prefix_extractor_, key, prev)) { + Slice prefix = prefix_extractor_->Transform(key); + start_.push_back(entries_.size()); + entries_.append(prefix.data(), prefix.size()); + } +} + +Slice BlockBasedFilterBlockBuilder::Finish() { + if (!start_.empty()) { + GenerateFilter(); + } + + // Append array of per-filter offsets + const uint32_t array_offset = static_cast(result_.size()); + for (size_t i = 0; i < filter_offsets_.size(); i++) { + PutFixed32(&result_, filter_offsets_[i]); + } + + PutFixed32(&result_, array_offset); + result_.push_back(kFilterBaseLg); // Save encoding parameter in result + return Slice(result_); +} + +void BlockBasedFilterBlockBuilder::GenerateFilter() { + const size_t num_entries = start_.size(); + if (num_entries == 0) { + // Fast path if there are no keys for this filter + filter_offsets_.push_back(static_cast(result_.size())); + return; + } + + // Make list of keys from flattened key structure + start_.push_back(entries_.size()); // Simplify length computation + tmp_entries_.resize(num_entries); + for (size_t i = 0; i < num_entries; i++) { + const char* base = entries_.data() + start_[i]; + size_t length = start_[i + 1] - start_[i]; + tmp_entries_[i] = Slice(base, length); + } + + // Generate filter for current set of keys and append to result_. 
+ filter_offsets_.push_back(static_cast(result_.size())); + policy_->CreateFilter(&tmp_entries_[0], static_cast(num_entries), + &result_); + + tmp_entries_.clear(); + entries_.clear(); + start_.clear(); +} + +BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( + const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, BlockContents&& contents) + : policy_(table_opt.filter_policy.get()), + prefix_extractor_(prefix_extractor), + whole_key_filtering_(table_opt.whole_key_filtering), + data_(nullptr), + offset_(nullptr), + num_(0), + base_lg_(0), + contents_(std::move(contents)) { + assert(policy_); + size_t n = contents_.data.size(); + if (n < 5) return; // 1 byte for base_lg_ and 4 for start of offset array + base_lg_ = contents_.data[n - 1]; + uint32_t last_word = DecodeFixed32(contents_.data.data() + n - 5); + if (last_word > n - 5) return; + data_ = contents_.data.data(); + offset_ = data_ + last_word; + num_ = (n - 5 - last_word) / 4; +} + +bool BlockBasedFilterBlockReader::KeyMayMatch(const Slice& key, + uint64_t block_offset) { + assert(block_offset != kNotValid); + if (!whole_key_filtering_) { + return true; + } + return MayMatch(key, block_offset); +} + +bool BlockBasedFilterBlockReader::PrefixMayMatch(const Slice& prefix, + uint64_t block_offset) { + assert(block_offset != kNotValid); + if (!prefix_extractor_) { + return true; + } + return MayMatch(prefix, block_offset); +} + +bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry, + uint64_t block_offset) { + uint64_t index = block_offset >> base_lg_; + if (index < num_) { + uint32_t start = DecodeFixed32(offset_ + index * 4); + uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4); + if (start <= limit && limit <= (uint32_t)(offset_ - data_)) { + Slice filter = Slice(data_ + start, limit - start); + return policy_->KeyMayMatch(entry, filter); + } else if (start == limit) { + // Empty filters do not match any entries + return false; + } + } + return true; // Errors 
are treated as potential matches +} + +size_t BlockBasedFilterBlockReader::ApproximateMemoryUsage() const { + return num_ * 4 + 5 + (offset_ - data_); +} + +std::string BlockBasedFilterBlockReader::ToString() const { + std::string result, filter_meta; + result.reserve(1024); + + std::string s_bo("Block offset"), s_hd("Hex dump"), s_fb("# filter blocks"); + AppendItem(&result, s_fb, std::to_string(num_)); + AppendItem(&result, s_bo, s_hd); + + for (size_t index = 0; index < num_; index++) { + uint32_t start = DecodeFixed32(offset_ + index * 4); + uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4); + + if (start != limit) { + result.append(" filter block # " + std::to_string(index + 1) + "\n"); + Slice filter = Slice(data_ + start, limit - start); + AppendItem(&result, start, filter.ToString(true)); + } + } + return result; +} +} // namespace rocksdb diff --git a/table/block_based_filter_block.h b/table/block_based_filter_block.h new file mode 100644 index 000000000..cf8c1b47c --- /dev/null +++ b/table/block_based_filter_block.h @@ -0,0 +1,104 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A filter block is stored near the end of a Table file. It contains +// filters (e.g., bloom filters) for all data blocks in the table combined +// into a single filter block. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "table/filter_block.h" +#include "util/hash.h" + +namespace rocksdb { + + +// A BlockBasedFilterBlockBuilder is used to construct all of the filters for a +// particular Table. It generates a single string which is stored as +// a special block in the Table. +// +// The sequence of calls to BlockBasedFilterBlockBuilder must match the regexp: +// (StartBlock Add*)* Finish +class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { + public: + BlockBasedFilterBlockBuilder(const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt); + + virtual bool IsBlockBased() override { return true; } + virtual void StartBlock(uint64_t block_offset) override; + virtual void Add(const Slice& key) override; + virtual Slice Finish() override; + + private: + void AddKey(const Slice& key); + void AddPrefix(const Slice& key); + void GenerateFilter(); + + // important: all of these might point to invalid addresses + // at the time of destruction of this filter block. destructor + // should NOT dereference them. + const FilterPolicy* policy_; + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + + std::string entries_; // Flattened entry contents + std::vector start_; // Starting index in entries_ of each entry + uint32_t added_to_start_; // To indicate if key is added + std::string result_; // Filter data computed so far + std::vector tmp_entries_; // policy_->CreateFilter() argument + std::vector filter_offsets_; + + // No copying allowed + BlockBasedFilterBlockBuilder(const BlockBasedFilterBlockBuilder&); + void operator=(const BlockBasedFilterBlockBuilder&); +}; + +// A FilterBlockReader is used to parse filter from SST table. 
+// KeyMayMatch and PrefixMayMatch would trigger filter checking +class BlockBasedFilterBlockReader : public FilterBlockReader { + public: + // REQUIRES: "contents" and *policy must stay live while *this is live. + BlockBasedFilterBlockReader(const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, + BlockContents&& contents); + virtual bool IsBlockBased() override { return true; } + virtual bool KeyMayMatch(const Slice& key, + uint64_t block_offset = kNotValid) override; + virtual bool PrefixMayMatch(const Slice& prefix, + uint64_t block_offset = kNotValid) override; + virtual size_t ApproximateMemoryUsage() const override; + + // convert this object to a human readable form + std::string ToString() const override; + + private: + const FilterPolicy* policy_; + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + const char* data_; // Pointer to filter data (at block-start) + const char* offset_; // Pointer to beginning of offset array (at block-end) + size_t num_; // Number of entries in offset array + size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file) + BlockContents contents_; + + bool MayMatch(const Slice& entry, uint64_t block_offset); + + // No copying allowed + BlockBasedFilterBlockReader(const BlockBasedFilterBlockReader&); + void operator=(const BlockBasedFilterBlockReader&); +}; +} // namespace rocksdb diff --git a/table/block_based_filter_block_test.cc b/table/block_based_filter_block_test.cc new file mode 100644 index 000000000..28eea16ce --- /dev/null +++ b/table/block_based_filter_block_test.cc @@ -0,0 +1,242 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "table/block_based_filter_block.h" + +#include "rocksdb/filter_policy.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +// For testing: emit an array with one hash value per key +class TestHashFilter : public FilterPolicy { + public: + virtual const char* Name() const { + return "TestHashFilter"; + } + + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + for (int i = 0; i < n; i++) { + uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); + PutFixed32(dst, h); + } + } + + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { + uint32_t h = Hash(key.data(), key.size(), 1); + for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { + if (h == DecodeFixed32(filter.data() + i)) { + return true; + } + } + return false; + } +}; + +class FilterBlockTest { + public: + TestHashFilter policy_; + BlockBasedTableOptions table_options_; + + FilterBlockTest() { + table_options_.filter_policy.reset(new TestHashFilter()); + } +}; + +TEST(FilterBlockTest, EmptyBuilder) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + BlockContents block(builder.Finish(), false, kNoCompression); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); + BlockBasedFilterBlockReader reader(nullptr, table_options_, std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo", 0)); + ASSERT_TRUE(reader.KeyMayMatch("foo", 100000)); +} + +TEST(FilterBlockTest, SingleChunk) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + builder.StartBlock(100); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.StartBlock(200); + builder.Add("box"); + builder.StartBlock(300); + builder.Add("hello"); + BlockContents 
block(builder.Finish(), false, kNoCompression); + BlockBasedFilterBlockReader reader(nullptr, table_options_, std::move(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo", 100)); + ASSERT_TRUE(reader.KeyMayMatch("bar", 100)); + ASSERT_TRUE(reader.KeyMayMatch("box", 100)); + ASSERT_TRUE(reader.KeyMayMatch("hello", 100)); + ASSERT_TRUE(reader.KeyMayMatch("foo", 100)); + ASSERT_TRUE(!reader.KeyMayMatch("missing", 100)); + ASSERT_TRUE(!reader.KeyMayMatch("other", 100)); +} + +TEST(FilterBlockTest, MultiChunk) { + BlockBasedFilterBlockBuilder builder(nullptr, table_options_); + + // First filter + builder.StartBlock(0); + builder.Add("foo"); + builder.StartBlock(2000); + builder.Add("bar"); + + // Second filter + builder.StartBlock(3100); + builder.Add("box"); + + // Third filter is empty + + // Last filter + builder.StartBlock(9000); + builder.Add("box"); + builder.Add("hello"); + + BlockContents block(builder.Finish(), false, kNoCompression); + BlockBasedFilterBlockReader reader(nullptr, table_options_, std::move(block)); + + // Check first filter + ASSERT_TRUE(reader.KeyMayMatch("foo", 0)); + ASSERT_TRUE(reader.KeyMayMatch("bar", 2000)); + ASSERT_TRUE(!reader.KeyMayMatch("box", 0)); + ASSERT_TRUE(!reader.KeyMayMatch("hello", 0)); + + // Check second filter + ASSERT_TRUE(reader.KeyMayMatch("box", 3100)); + ASSERT_TRUE(!reader.KeyMayMatch("foo", 3100)); + ASSERT_TRUE(!reader.KeyMayMatch("bar", 3100)); + ASSERT_TRUE(!reader.KeyMayMatch("hello", 3100)); + + // Check third filter (empty) + ASSERT_TRUE(!reader.KeyMayMatch("foo", 4100)); + ASSERT_TRUE(!reader.KeyMayMatch("bar", 4100)); + ASSERT_TRUE(!reader.KeyMayMatch("box", 4100)); + ASSERT_TRUE(!reader.KeyMayMatch("hello", 4100)); + + // Check last filter + ASSERT_TRUE(reader.KeyMayMatch("box", 9000)); + ASSERT_TRUE(reader.KeyMayMatch("hello", 9000)); + ASSERT_TRUE(!reader.KeyMayMatch("foo", 9000)); + ASSERT_TRUE(!reader.KeyMayMatch("bar", 9000)); +} + +// Test for block based filter block +// use new interface in 
FilterPolicy to create filter builder/reader +class BlockBasedFilterBlockTest { + public: + BlockBasedTableOptions table_options_; + + BlockBasedFilterBlockTest() { + table_options_.filter_policy.reset(NewBloomFilterPolicy(10)); + } + + ~BlockBasedFilterBlockTest() {} +}; + +TEST(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) { + FilterBlockBuilder* builder = new BlockBasedFilterBlockBuilder( + nullptr, table_options_); + BlockContents block(builder->Finish(), false, kNoCompression); + ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data)); + FilterBlockReader* reader = new BlockBasedFilterBlockReader( + nullptr, table_options_, std::move(block)); + ASSERT_TRUE(reader->KeyMayMatch("foo", 0)); + ASSERT_TRUE(reader->KeyMayMatch("foo", 100000)); + + delete builder; + delete reader; +} + +TEST(BlockBasedFilterBlockTest, BlockBasedSingleChunk) { + FilterBlockBuilder* builder = new BlockBasedFilterBlockBuilder( + nullptr, table_options_); + builder->StartBlock(100); + builder->Add("foo"); + builder->Add("bar"); + builder->Add("box"); + builder->StartBlock(200); + builder->Add("box"); + builder->StartBlock(300); + builder->Add("hello"); + BlockContents block(builder->Finish(), false, kNoCompression); + FilterBlockReader* reader = new BlockBasedFilterBlockReader( + nullptr, table_options_, std::move(block)); + ASSERT_TRUE(reader->KeyMayMatch("foo", 100)); + ASSERT_TRUE(reader->KeyMayMatch("bar", 100)); + ASSERT_TRUE(reader->KeyMayMatch("box", 100)); + ASSERT_TRUE(reader->KeyMayMatch("hello", 100)); + ASSERT_TRUE(reader->KeyMayMatch("foo", 100)); + ASSERT_TRUE(!reader->KeyMayMatch("missing", 100)); + ASSERT_TRUE(!reader->KeyMayMatch("other", 100)); + + delete builder; + delete reader; +} + +TEST(BlockBasedFilterBlockTest, BlockBasedMultiChunk) { + FilterBlockBuilder* builder = new BlockBasedFilterBlockBuilder( + nullptr, table_options_); + + // First filter + builder->StartBlock(0); + builder->Add("foo"); + builder->StartBlock(2000); + builder->Add("bar"); + + 
// Second filter + builder->StartBlock(3100); + builder->Add("box"); + + // Third filter is empty + + // Last filter + builder->StartBlock(9000); + builder->Add("box"); + builder->Add("hello"); + + BlockContents block(builder->Finish(), false, kNoCompression); + FilterBlockReader* reader = new BlockBasedFilterBlockReader( + nullptr, table_options_, std::move(block)); + + // Check first filter + ASSERT_TRUE(reader->KeyMayMatch("foo", 0)); + ASSERT_TRUE(reader->KeyMayMatch("bar", 2000)); + ASSERT_TRUE(!reader->KeyMayMatch("box", 0)); + ASSERT_TRUE(!reader->KeyMayMatch("hello", 0)); + + // Check second filter + ASSERT_TRUE(reader->KeyMayMatch("box", 3100)); + ASSERT_TRUE(!reader->KeyMayMatch("foo", 3100)); + ASSERT_TRUE(!reader->KeyMayMatch("bar", 3100)); + ASSERT_TRUE(!reader->KeyMayMatch("hello", 3100)); + + // Check third filter (empty) + ASSERT_TRUE(!reader->KeyMayMatch("foo", 4100)); + ASSERT_TRUE(!reader->KeyMayMatch("bar", 4100)); + ASSERT_TRUE(!reader->KeyMayMatch("box", 4100)); + ASSERT_TRUE(!reader->KeyMayMatch("hello", 4100)); + + // Check last filter + ASSERT_TRUE(reader->KeyMayMatch("box", 9000)); + ASSERT_TRUE(reader->KeyMayMatch("hello", 9000)); + ASSERT_TRUE(!reader->KeyMayMatch("foo", 9000)); + ASSERT_TRUE(!reader->KeyMayMatch("bar", 9000)); + + delete builder; + delete reader; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 0e5ea0a69..813f8a125 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -17,6 +17,7 @@ #include #include #include +#include #include "db/dbformat.h" @@ -25,18 +26,20 @@ #include "rocksdb/env.h" #include "rocksdb/filter_policy.h" #include "rocksdb/flush_block_policy.h" -#include "rocksdb/options.h" #include "rocksdb/table.h" #include "table/block.h" #include "table/block_based_table_reader.h" #include "table/block_builder.h" #include 
"table/filter_block.h" +#include "table/block_based_filter_block.h" +#include "table/full_filter_block.h" #include "table/format.h" #include "table/meta_blocks.h" #include "table/table_builder.h" #include "util/coding.h" +#include "util/compression.h" #include "util/crc32c.h" #include "util/stop_watch.h" #include "util/xxhash.h" @@ -116,7 +119,7 @@ class ShortenedIndexBuilder : public IndexBuilder { public: explicit ShortenedIndexBuilder(const Comparator* comparator) : IndexBuilder(comparator), - index_block_builder_(1 /* block_restart_interval == 1 */, comparator) {} + index_block_builder_(1 /* block_restart_interval == 1 */) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, const Slice* first_key_in_next_block, @@ -133,12 +136,12 @@ class ShortenedIndexBuilder : public IndexBuilder { index_block_builder_.Add(*last_key_in_current_block, handle_encoding); } - virtual Status Finish(IndexBlocks* index_blocks) { + virtual Status Finish(IndexBlocks* index_blocks) override { index_blocks->index_block_contents = index_block_builder_.Finish(); return Status::OK(); } - virtual size_t EstimatedSize() const { + virtual size_t EstimatedSize() const override { return index_block_builder_.CurrentSizeEstimate(); } @@ -175,14 +178,14 @@ class HashIndexBuilder : public IndexBuilder { explicit HashIndexBuilder(const Comparator* comparator, const SliceTransform* hash_key_extractor) : IndexBuilder(comparator), - primary_index_builder(comparator), + primary_index_builder_(comparator), hash_key_extractor_(hash_key_extractor) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, const Slice* first_key_in_next_block, const BlockHandle& block_handle) override { ++current_restart_index_; - primary_index_builder.AddIndexEntry(last_key_in_current_block, + primary_index_builder_.AddIndexEntry(last_key_in_current_block, first_key_in_next_block, block_handle); } @@ -201,7 +204,7 @@ class HashIndexBuilder : public IndexBuilder { // copy. 
pending_entry_prefix_ = key_prefix.ToString(); pending_block_num_ = 1; - pending_entry_index_ = current_restart_index_; + pending_entry_index_ = static_cast(current_restart_index_); } else { // entry number increments when keys share the prefix reside in // differnt data blocks. @@ -213,9 +216,9 @@ class HashIndexBuilder : public IndexBuilder { } } - virtual Status Finish(IndexBlocks* index_blocks) { + virtual Status Finish(IndexBlocks* index_blocks) override { FlushPendingPrefix(); - primary_index_builder.Finish(index_blocks); + primary_index_builder_.Finish(index_blocks); index_blocks->meta_blocks.insert( {kHashIndexPrefixesBlock.c_str(), prefix_block_}); index_blocks->meta_blocks.insert( @@ -223,8 +226,8 @@ class HashIndexBuilder : public IndexBuilder { return Status::OK(); } - virtual size_t EstimatedSize() const { - return primary_index_builder.EstimatedSize() + prefix_block_.size() + + virtual size_t EstimatedSize() const override { + return primary_index_builder_.EstimatedSize() + prefix_block_.size() + prefix_meta_block_.size(); } @@ -232,12 +235,13 @@ class HashIndexBuilder : public IndexBuilder { void FlushPendingPrefix() { prefix_block_.append(pending_entry_prefix_.data(), pending_entry_prefix_.size()); - PutVarint32(&prefix_meta_block_, pending_entry_prefix_.size()); + PutVarint32(&prefix_meta_block_, + static_cast(pending_entry_prefix_.size())); PutVarint32(&prefix_meta_block_, pending_entry_index_); PutVarint32(&prefix_meta_block_, pending_block_num_); } - ShortenedIndexBuilder primary_index_builder; + ShortenedIndexBuilder primary_index_builder_; const SliceTransform* hash_key_extractor_; // stores a sequence of prefixes @@ -255,6 +259,9 @@ class HashIndexBuilder : public IndexBuilder { uint64_t current_restart_index_ = 0; }; +// Without anonymous namespace here, we fail the warning -Wmissing-prototypes +namespace { + // Create a index builder based on its type. 
IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator, const SliceTransform* prefix_extractor) { @@ -275,14 +282,31 @@ IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator, return nullptr; } +// Create a index builder based on its type. +FilterBlockBuilder* CreateFilterBlockBuilder(const ImmutableCFOptions& opt, + const BlockBasedTableOptions& table_opt) { + if (table_opt.filter_policy == nullptr) return nullptr; + + FilterBitsBuilder* filter_bits_builder = + table_opt.filter_policy->GetFilterBitsBuilder(); + if (filter_bits_builder == nullptr) { + return new BlockBasedFilterBlockBuilder(opt.prefix_extractor, table_opt); + } else { + return new FullFilterBlockBuilder(opt.prefix_extractor, table_opt, + filter_bits_builder); + } +} + bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) { // Check to see if compressed less than 12.5% return compressed_size < raw_size - (raw_size / 8u); } +// format_version is the block format as defined in include/rocksdb/table.h Slice CompressBlock(const Slice& raw, const CompressionOptions& compression_options, - CompressionType* type, std::string* compressed_output) { + CompressionType* type, uint32_t format_version, + std::string* compressed_output) { if (*type == kNoCompression) { return raw; } @@ -291,36 +315,44 @@ Slice CompressBlock(const Slice& raw, // supported in this platform and (2) the compression rate is "good enough". switch (*type) { case kSnappyCompression: - if (port::Snappy_Compress(compression_options, raw.data(), raw.size(), - compressed_output) && + if (Snappy_Compress(compression_options, raw.data(), raw.size(), + compressed_output) && GoodCompressionRatio(compressed_output->size(), raw.size())) { return *compressed_output; } break; // fall back to no compression. 
case kZlibCompression: - if (port::Zlib_Compress(compression_options, raw.data(), raw.size(), - compressed_output) && + if (Zlib_Compress( + compression_options, + GetCompressFormatForVersion(kZlibCompression, format_version), + raw.data(), raw.size(), compressed_output) && GoodCompressionRatio(compressed_output->size(), raw.size())) { return *compressed_output; } break; // fall back to no compression. case kBZip2Compression: - if (port::BZip2_Compress(compression_options, raw.data(), raw.size(), - compressed_output) && + if (BZip2_Compress( + compression_options, + GetCompressFormatForVersion(kBZip2Compression, format_version), + raw.data(), raw.size(), compressed_output) && GoodCompressionRatio(compressed_output->size(), raw.size())) { return *compressed_output; } break; // fall back to no compression. case kLZ4Compression: - if (port::LZ4_Compress(compression_options, raw.data(), raw.size(), - compressed_output) && + if (LZ4_Compress( + compression_options, + GetCompressFormatForVersion(kLZ4Compression, format_version), + raw.data(), raw.size(), compressed_output) && GoodCompressionRatio(compressed_output->size(), raw.size())) { return *compressed_output; } break; // fall back to no compression. case kLZ4HCCompression: - if (port::LZ4HC_Compress(compression_options, raw.data(), raw.size(), - compressed_output) && + if (LZ4HC_Compress( + compression_options, + GetCompressFormatForVersion(kLZ4HCCompression, format_version), + raw.data(), raw.size(), compressed_output) && GoodCompressionRatio(compressed_output->size(), raw.size())) { return *compressed_output; } @@ -334,6 +366,8 @@ Slice CompressBlock(const Slice& raw, return raw; } +} // namespace + // kBlockBasedTableMagicNumber was picked by running // echo rocksdb.table.block_based | sha1sum // and taking the leading 64 bits. 
@@ -366,7 +400,6 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector std::string val; PutFixed32(&val, static_cast(index_type_)); properties->insert({BlockBasedTablePropertyNames::kIndexType, val}); - return Status::OK(); } @@ -385,7 +418,7 @@ class BlockBasedTableBuilder::BlockBasedTablePropertiesCollector }; struct BlockBasedTableBuilder::Rep { - const Options options; + const ImmutableCFOptions ioptions; const BlockBasedTableOptions table_options; const InternalKeyComparator& internal_comparator; WritableFile* file; @@ -397,11 +430,12 @@ struct BlockBasedTableBuilder::Rep { std::unique_ptr index_builder; std::string last_key; - CompressionType compression_type; + const CompressionType compression_type; + const CompressionOptions compression_opts; TableProperties props; bool closed = false; // Either Finish() or Abandon() has been called. - FilterBlockBuilder* filter_block; + std::unique_ptr filter_block; char compressed_cache_key_prefix[BlockBasedTable::kMaxCacheKeyPrefixSize]; size_t compressed_cache_key_prefix_size; @@ -413,27 +447,28 @@ struct BlockBasedTableBuilder::Rep { std::vector> table_properties_collectors; - Rep(const Options& opt, const BlockBasedTableOptions& table_opt, - const InternalKeyComparator& icomparator, - WritableFile* f, CompressionType compression_type) - : options(opt), + Rep(const ImmutableCFOptions& _ioptions, + const BlockBasedTableOptions& table_opt, + const InternalKeyComparator& icomparator, WritableFile* f, + const CompressionType _compression_type, + const CompressionOptions& _compression_opts) + : ioptions(_ioptions), table_options(table_opt), internal_comparator(icomparator), file(f), - data_block(table_options.block_restart_interval, &internal_comparator), - internal_prefix_transform(options.prefix_extractor.get()), - index_builder(CreateIndexBuilder( - table_options.index_type, &internal_comparator, - &this->internal_prefix_transform)), - compression_type(compression_type), - 
filter_block(table_options.filter_policy == nullptr ? - nullptr : - new FilterBlockBuilder(opt, table_options, &internal_comparator)), + data_block(table_options.block_restart_interval), + internal_prefix_transform(_ioptions.prefix_extractor), + index_builder(CreateIndexBuilder(table_options.index_type, + &internal_comparator, + &this->internal_prefix_transform)), + compression_type(_compression_type), + compression_opts(_compression_opts), + filter_block(CreateFilterBlockBuilder(_ioptions, table_options)), flush_block_policy( table_options.flush_block_policy_factory->NewFlushBlockPolicy( - table_options, data_block)) { + table_options, data_block)) { for (auto& collector_factories : - options.table_properties_collector_factories) { + ioptions.table_properties_collector_factories) { table_properties_collectors.emplace_back( collector_factories->CreateTablePropertiesCollector()); } @@ -443,11 +478,24 @@ struct BlockBasedTableBuilder::Rep { }; BlockBasedTableBuilder::BlockBasedTableBuilder( - const Options& options, const BlockBasedTableOptions& table_options, + const ImmutableCFOptions& ioptions, + const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, WritableFile* file, - CompressionType compression_type) - : rep_(new Rep(options, table_options, internal_comparator, - file, compression_type)) { + const CompressionType compression_type, + const CompressionOptions& compression_opts) { + BlockBasedTableOptions sanitized_table_options(table_options); + if (sanitized_table_options.format_version == 0 && + sanitized_table_options.checksum != kCRC32c) { + Log(InfoLogLevel::WARN_LEVEL, ioptions.info_log, + "Silently converting format_version to 1 because checksum is " + "non-default"); + // silently convert format_version to 1 to keep consistent with current + // behavior + sanitized_table_options.format_version = 1; + } + + rep_ = new Rep(ioptions, sanitized_table_options, internal_comparator, file, + compression_type, 
compression_opts); if (rep_->filter_block != nullptr) { rep_->filter_block->StartBlock(0); } @@ -461,7 +509,6 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( BlockBasedTableBuilder::~BlockBasedTableBuilder() { assert(rep_->closed); // Catch errors where caller forgot to call Finish() - delete rep_->filter_block; delete rep_; } @@ -492,7 +539,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { } if (r->filter_block != nullptr) { - r->filter_block->AddKey(ExtractUserKey(key)); + r->filter_block->Add(ExtractUserKey(key)); } r->last_key.assign(key.data(), key.size()); @@ -503,7 +550,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) { r->index_builder->OnKeyAdded(key); NotifyCollectTableCollectorsOnAdd(key, value, r->table_properties_collectors, - r->options.info_log.get()); + r->ioptions.info_log); } void BlockBasedTableBuilder::Flush() { @@ -541,10 +588,10 @@ void BlockBasedTableBuilder::WriteBlock(const Slice& raw_block_contents, Slice block_contents; if (raw_block_contents.size() < kCompressionSizeLimit) { block_contents = - CompressBlock(raw_block_contents, r->options.compression_opts, &type, - &r->compressed_output); + CompressBlock(raw_block_contents, r->compression_opts, &type, + r->table_options.format_version, &r->compressed_output); } else { - RecordTick(r->options.statistics.get(), NUMBER_BLOCK_NOT_COMPRESSED); + RecordTick(r->ioptions.statistics, NUMBER_BLOCK_NOT_COMPRESSED); type = kNoCompression; block_contents = raw_block_contents; } @@ -556,8 +603,7 @@ void BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, CompressionType type, BlockHandle* handle) { Rep* r = rep_; - StopWatch sw(r->options.env, r->options.statistics.get(), - WRITE_RAW_BLOCK_MICROS); + StopWatch sw(r->ioptions.env, r->ioptions.statistics, WRITE_RAW_BLOCK_MICROS); handle->set_offset(r->offset); handle->set_size(block_contents.size()); r->status = r->file->Append(block_contents); @@ -578,7 +624,8 @@ void 
BlockBasedTableBuilder::WriteRawBlock(const Slice& block_contents, } case kxxHash: { void* xxh = XXH32_init(0); - XXH32_update(xxh, block_contents.data(), block_contents.size()); + XXH32_update(xxh, block_contents.data(), + static_cast(block_contents.size())); XXH32_update(xxh, trailer, 1); // Extend to cover block type EncodeFixed32(trailer_without_type, XXH32_digest(xxh)); break; @@ -618,18 +665,13 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, Cache::Handle* cache_handle = nullptr; size_t size = block_contents.size(); - char* ubuf = new char[size + 1]; // make a new copy - memcpy(ubuf, block_contents.data(), size); + std::unique_ptr ubuf(new char[size + 1]); + memcpy(ubuf.get(), block_contents.data(), size); ubuf[size] = type; - BlockContents results; - Slice sl(ubuf, size); - results.data = sl; - results.cachable = true; // XXX - results.heap_allocated = true; - results.compression_type = type; + BlockContents results(std::move(ubuf), size, true, type); - Block* block = new Block(results); + Block* block = new Block(std::move(results)); // make cache key by appending the file offset to the cache prefix id char* end = EncodeVarint64( @@ -645,7 +687,7 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, block_cache_compressed->Release(cache_handle); // Invalidate OS cache. 
- r->file->InvalidateCache(r->offset, size); + r->file->InvalidateCache(static_cast(r->offset), size); } return Status::OK(); } @@ -657,10 +699,7 @@ Status BlockBasedTableBuilder::Finish() { assert(!r->closed); r->closed = true; - BlockHandle filter_block_handle, - metaindex_block_handle, - index_block_handle; - + BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle; // Write filter block if (ok() && r->filter_block != nullptr) { auto filter_contents = r->filter_block->Finish(); @@ -699,7 +738,12 @@ Status BlockBasedTableBuilder::Finish() { if (r->filter_block != nullptr) { // Add mapping from ".Name" to location // of filter data. - std::string key = BlockBasedTable::kFilterBlockPrefix; + std::string key; + if (r->filter_block->IsBlockBased()) { + key = BlockBasedTable::kFilterBlockPrefix; + } else { + key = BlockBasedTable::kFullFilterBlockPrefix; + } key.append(r->table_options.filter_policy->Name()); meta_index_builder.Add(key, filter_block_handle); } @@ -707,7 +751,6 @@ Status BlockBasedTableBuilder::Finish() { // Write properties block. { PropertyBlockBuilder property_block_builder; - std::vector failed_user_prop_collectors; r->props.filter_policy_name = r->table_options.filter_policy != nullptr ? 
r->table_options.filter_policy->Name() : ""; r->props.index_size = @@ -718,7 +761,7 @@ Status BlockBasedTableBuilder::Finish() { // Add use collected properties NotifyCollectTableCollectorsOnFinish(r->table_properties_collectors, - r->options.info_log.get(), + r->ioptions.info_log, &property_block_builder); BlockHandle properties_block_handle; @@ -749,9 +792,13 @@ Status BlockBasedTableBuilder::Finish() { // TODO(icanadi) at some point in the future, when we're absolutely sure // nobody will roll back to RocksDB 2.x versions, retire the legacy magic // number and always write new table files with new magic number - bool legacy = (r->table_options.checksum == kCRC32c); + bool legacy = (r->table_options.format_version == 0); + // this is guaranteed by BlockBasedTableBuilder's constructor + assert(r->table_options.checksum == kCRC32c || + r->table_options.format_version != 0); Footer footer(legacy ? kLegacyBlockBasedTableMagicNumber - : kBlockBasedTableMagicNumber); + : kBlockBasedTableMagicNumber, + r->table_options.format_version); footer.set_metaindex_handle(metaindex_block_handle); footer.set_index_handle(index_block_handle); footer.set_checksum(r->table_options.checksum); @@ -777,14 +824,12 @@ Status BlockBasedTableBuilder::Finish() { } } - Log( - r->options.info_log, + Log(InfoLogLevel::INFO_LEVEL, r->ioptions.info_log, "Table was constructed:\n" " [basic properties]: %s\n" " [user collected properties]: %s", r->props.ToString().c_str(), - user_collected.c_str() - ); + user_collected.c_str()); } return r->status; @@ -805,5 +850,6 @@ uint64_t BlockBasedTableBuilder::FileSize() const { } const std::string BlockBasedTable::kFilterBlockPrefix = "filter."; +const std::string BlockBasedTable::kFullFilterBlockPrefix = "fullfilter."; } // namespace rocksdb diff --git a/table/block_based_table_builder.h b/table/block_based_table_builder.h index 72a2f207a..6fde32919 100644 --- a/table/block_based_table_builder.h +++ b/table/block_based_table_builder.h @@ -28,10 +28,12 @@ 
class BlockBasedTableBuilder : public TableBuilder { // Create a builder that will store the contents of the table it is // building in *file. Does not close the file. It is up to the // caller to close the file after calling Finish(). - BlockBasedTableBuilder(const Options& options, + BlockBasedTableBuilder(const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type); + WritableFile* file, + const CompressionType compression_type, + const CompressionOptions& compression_opts); // REQUIRES: Either Finish() or Abandon() has been called. ~BlockBasedTableBuilder(); diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index de30fb383..17ee0b8cb 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -14,11 +14,12 @@ #include #include +#include "port/port.h" #include "rocksdb/flush_block_policy.h" #include "rocksdb/cache.h" #include "table/block_based_table_builder.h" #include "table/block_based_table_reader.h" -#include "port/port.h" +#include "table/format.h" namespace rocksdb { @@ -41,25 +42,49 @@ BlockBasedTableFactory::BlockBasedTableFactory( } Status BlockBasedTableFactory::NewTableReader( - const Options& options, const EnvOptions& soptions, + const ImmutableCFOptions& ioptions, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const { - return BlockBasedTable::Open(options, soptions, table_options_, + return BlockBasedTable::Open(ioptions, soptions, table_options_, internal_comparator, std::move(file), file_size, table_reader); } TableBuilder* BlockBasedTableFactory::NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const { + const ImmutableCFOptions& ioptions, + const 
InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts) const { auto table_builder = new BlockBasedTableBuilder( - options, table_options_, internal_comparator, file, compression_type); + ioptions, table_options_, internal_comparator, file, + compression_type, compression_opts); return table_builder; } +Status BlockBasedTableFactory::SanitizeOptions( + const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const { + if (table_options_.index_type == BlockBasedTableOptions::kHashSearch && + cf_opts.prefix_extractor == nullptr) { + return Status::InvalidArgument("Hash index is specified for block-based " + "table, but prefix_extractor is not given"); + } + if (table_options_.cache_index_and_filter_blocks && + table_options_.no_block_cache) { + return Status::InvalidArgument("Enable cache_index_and_filter_blocks, " + ", but block cache is disabled"); + } + if (!BlockBasedTableSupportedVersion(table_options_.format_version)) { + return Status::InvalidArgument( + "Unsupported BlockBasedTable format_version. 
Please check " + "include/rocksdb/table.h for more info"); + } + return Status::OK(); +} + std::string BlockBasedTableFactory::GetPrintableTableOptions() const { std::string ret; ret.reserve(20000); @@ -116,10 +141,16 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const { ret.append(buffer); snprintf(buffer, kBufferSize, " whole_key_filtering: %d\n", table_options_.whole_key_filtering); + snprintf(buffer, kBufferSize, " format_version: %d\n", + table_options_.format_version); ret.append(buffer); return ret; } +const BlockBasedTableOptions& BlockBasedTableFactory::GetTableOptions() const { + return table_options_; +} + TableFactory* NewBlockBasedTableFactory( const BlockBasedTableOptions& table_options) { return new BlockBasedTableFactory(table_options); diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index 90282bf9d..674289779 100644 --- a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -14,13 +14,11 @@ #include #include "rocksdb/flush_block_policy.h" -#include "rocksdb/options.h" #include "rocksdb/table.h" #include "db/dbformat.h" namespace rocksdb { -struct Options; struct EnvOptions; using std::unique_ptr; @@ -35,22 +33,26 @@ class BlockBasedTableFactory : public TableFactory { const char* Name() const override { return "BlockBasedTable"; } - Status NewTableReader(const Options& options, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader) const override; + Status NewTableReader( + const ImmutableCFOptions& ioptions, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const override; TableBuilder* NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const override; + const ImmutableCFOptions& 
ioptions, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType compression_type, + const CompressionOptions& compression_opts) const override; // Sanitizes the specified DB Options. - Status SanitizeDBOptions(DBOptions* db_opts) const override { - return Status::OK(); - } + Status SanitizeOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override; std::string GetPrintableTableOptions() const override; + const BlockBasedTableOptions& GetTableOptions() const; + private: BlockBasedTableOptions table_options_; }; diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 0be38a1dc..f03ab2b4b 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -26,15 +26,19 @@ #include "table/block.h" #include "table/filter_block.h" +#include "table/block_based_filter_block.h" +#include "table/full_filter_block.h" #include "table/block_hash_index.h" #include "table/block_prefix_index.h" #include "table/format.h" #include "table/meta_blocks.h" #include "table/two_level_iterator.h" +#include "table/get_context.h" #include "util/coding.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" +#include "util/string_util.h" namespace rocksdb { @@ -46,7 +50,6 @@ using std::unique_ptr; typedef BlockBasedTable::IndexReader IndexReader; namespace { - // The longest the prefix of the cache key used to identify blocks can be. // We are using the fact that we know for Posix files the unique ID is three // varints. 
@@ -65,7 +68,7 @@ Status ReadBlockFromFile(RandomAccessFile* file, const Footer& footer, Status s = ReadBlockContents(file, footer, options, handle, &contents, env, do_uncompress); if (s.ok()) { - *result = new Block(contents); + *result = new Block(std::move(contents)); } return s; @@ -251,9 +254,6 @@ class HashIndexReader : public IndexReader { &prefixes_meta_contents, env, true /* do decompression */); if (!s.ok()) { - if (prefixes_contents.heap_allocated) { - delete[] prefixes_contents.data.data(); - } // TODO: log error return Status::OK(); } @@ -268,7 +268,7 @@ class HashIndexReader : public IndexReader { // TODO: log error if (s.ok()) { new_index_reader->index_block_->SetBlockHashIndex(hash_index); - new_index_reader->OwnPrefixesContents(prefixes_contents); + new_index_reader->OwnPrefixesContents(std::move(prefixes_contents)); } } else { BlockPrefixIndex* prefix_index = nullptr; @@ -282,18 +282,6 @@ class HashIndexReader : public IndexReader { } } - // Always release prefix meta block - if (prefixes_meta_contents.heap_allocated) { - delete[] prefixes_meta_contents.data.data(); - } - - // Release prefix content block if we don't own it. 
- if (!new_index_reader->own_prefixes_contents_) { - if (prefixes_contents.heap_allocated) { - delete[] prefixes_contents.data.data(); - } - } - return Status::OK(); } @@ -312,39 +300,34 @@ class HashIndexReader : public IndexReader { private: HashIndexReader(const Comparator* comparator, Block* index_block) - : IndexReader(comparator), - index_block_(index_block), - own_prefixes_contents_(false) { + : IndexReader(comparator), index_block_(index_block) { assert(index_block_ != nullptr); } ~HashIndexReader() { - if (own_prefixes_contents_ && prefixes_contents_.heap_allocated) { - delete[] prefixes_contents_.data.data(); - } } - void OwnPrefixesContents(const BlockContents& prefixes_contents) { - prefixes_contents_ = prefixes_contents; - own_prefixes_contents_ = true; + void OwnPrefixesContents(BlockContents&& prefixes_contents) { + prefixes_contents_ = std::move(prefixes_contents); } std::unique_ptr index_block_; - bool own_prefixes_contents_; BlockContents prefixes_contents_; }; struct BlockBasedTable::Rep { - Rep(const EnvOptions& storage_options, - const BlockBasedTableOptions& table_opt, - const InternalKeyComparator& internal_comparator) - : soptions(storage_options), table_options(table_opt), - filter_policy(table_opt.filter_policy.get()), - internal_comparator(internal_comparator) {} - - Options options; - const EnvOptions& soptions; + Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options, + const BlockBasedTableOptions& _table_opt, + const InternalKeyComparator& _internal_comparator) + : ioptions(_ioptions), + env_options(_env_options), + table_options(_table_opt), + filter_policy(_table_opt.filter_policy.get()), + internal_comparator(_internal_comparator) {} + + const ImmutableCFOptions& ioptions; + const EnvOptions& env_options; const BlockBasedTableOptions& table_options; const FilterPolicy* const filter_policy; const InternalKeyComparator& internal_comparator; @@ -383,11 +366,9 @@ BlockBasedTable::~BlockBasedTable() { // was not read 
from cache, `cache_handle` will be nullptr. template struct BlockBasedTable::CachableEntry { - CachableEntry(TValue* value, Cache::Handle* cache_handle) - : value(value) - , cache_handle(cache_handle) { - } - CachableEntry(): CachableEntry(nullptr, nullptr) { } + CachableEntry(TValue* _value, Cache::Handle* _cache_handle) + : value(_value), cache_handle(_cache_handle) {} + CachableEntry() : CachableEntry(nullptr, nullptr) {} void Release(Cache* cache) { if (cache_handle) { cache->Release(cache_handle); @@ -446,7 +427,8 @@ void BlockBasedTable::GenerateCachePrefix(Cache* cc, } } -Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, +Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, @@ -454,15 +436,22 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, unique_ptr* table_reader) { table_reader->reset(); - Footer footer(kBlockBasedTableMagicNumber); - auto s = ReadFooterFromFile(file.get(), file_size, &footer); - if (!s.ok()) return s; + Footer footer; + auto s = ReadFooterFromFile(file.get(), file_size, &footer, + kBlockBasedTableMagicNumber); + if (!s.ok()) { + return s; + } + if (!BlockBasedTableSupportedVersion(footer.version())) { + return Status::Corruption( + "Unknown Footer version. Maybe this file was created with newer " + "version of RocksDB?"); + } // We've successfully read the footer and the index block: we're // ready to serve requests. 
Rep* rep = new BlockBasedTable::Rep( - soptions, table_options, internal_comparator); - rep->options = options; + ioptions, env_options, table_options, internal_comparator); rep->file = std::move(file); rep->footer = footer; rep->index_type = table_options.index_type; @@ -474,36 +463,42 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, std::unique_ptr meta; std::unique_ptr meta_iter; s = ReadMetaBlock(rep, &meta, &meta_iter); + if (!s.ok()) { + return s; + } // Read the properties bool found_properties_block = true; s = SeekToPropertiesBlock(meta_iter.get(), &found_properties_block); - if (found_properties_block) { + if (!s.ok()) { + Log(InfoLogLevel::WARN_LEVEL, rep->ioptions.info_log, + "Cannot seek to properties block from file: %s", + s.ToString().c_str()); + } else if (found_properties_block) { s = meta_iter->status(); TableProperties* table_properties = nullptr; if (s.ok()) { s = ReadProperties(meta_iter->value(), rep->file.get(), rep->footer, - rep->options.env, rep->options.info_log.get(), + rep->ioptions.env, rep->ioptions.info_log, &table_properties); } if (!s.ok()) { - auto err_msg = - "[Warning] Encountered error while reading data from properties " - "block " + s.ToString(); - Log(rep->options.info_log, "%s", err_msg.c_str()); + Log(InfoLogLevel::WARN_LEVEL, rep->ioptions.info_log, + "Encountered error while reading data from properties " + "block %s", s.ToString().c_str()); } else { rep->table_properties.reset(table_properties); } } else { - Log(WARN_LEVEL, rep->options.info_log, + Log(InfoLogLevel::ERROR_LEVEL, rep->ioptions.info_log, "Cannot find Properties block from file."); } // Will use block cache for index/filter blocks access? 
- if (table_options.block_cache && - table_options.cache_index_and_filter_blocks) { + if (table_options.cache_index_and_filter_blocks) { + assert(table_options.block_cache != nullptr); // Hack: Call NewIndexIterator() to implicitly add index to the block_cache unique_ptr iter(new_table->NewIndexIterator(ReadOptions())); s = iter->status(); @@ -518,7 +513,6 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, // pre-load these blocks, which will kept in member variables in Rep // and with a same life-time as this table object. IndexReader* index_reader = nullptr; - // TODO: we never really verify check sum for index block s = new_table->CreateIndexReader(&index_reader, meta_iter.get()); if (s.ok()) { @@ -526,12 +520,7 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, // Set filter block if (rep->filter_policy) { - std::string key = kFilterBlockPrefix; - key.append(rep->filter_policy->Name()); - BlockHandle handle; - if (FindMetaBlock(meta_iter.get(), key, &handle).ok()) { - rep->filter.reset(ReadFilter(handle, rep)); - } + rep->filter.reset(ReadFilter(rep, meta_iter.get(), nullptr)); } } else { delete index_reader; @@ -546,7 +535,7 @@ Status BlockBasedTable::Open(const Options& options, const EnvOptions& soptions, } void BlockBasedTable::SetupForCompaction() { - switch (rep_->options.access_hint_on_compaction_start) { + switch (rep_->ioptions.access_hint_on_compaction_start) { case Options::NONE: break; case Options::NORMAL: @@ -596,15 +585,12 @@ Status BlockBasedTable::ReadMetaBlock( ReadOptions(), rep->footer.metaindex_handle(), &meta, - rep->options.env); + rep->ioptions.env); - if (!s.ok()) { - auto err_msg = - "[Warning] Encountered error while reading data from properties" - "block " + s.ToString(); - Log(rep->options.info_log, "%s", err_msg.c_str()); - } if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, rep->ioptions.info_log, + "Encountered error while reading data from properties" + " block %s", 
s.ToString().c_str()); delete meta; return s; } @@ -619,7 +605,7 @@ Status BlockBasedTable::GetDataBlockFromCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, Statistics* statistics, const ReadOptions& read_options, - BlockBasedTable::CachableEntry* block) { + BlockBasedTable::CachableEntry* block, uint32_t format_version) { Status s; Block* compressed_block = nullptr; Cache::Handle* block_cache_compressed_handle = nullptr; @@ -662,11 +648,12 @@ Status BlockBasedTable::GetDataBlockFromCache( // Retrieve the uncompressed contents into a new buffer BlockContents contents; s = UncompressBlockContents(compressed_block->data(), - compressed_block->size(), &contents); + compressed_block->size(), &contents, + format_version); // Insert uncompressed block into block cache if (s.ok()) { - block->value = new Block(contents); // uncompressed block + block->value = new Block(std::move(contents)); // uncompressed block assert(block->value->compression_type() == kNoCompression); if (block_cache != nullptr && block->value->cachable() && read_options.fill_cache) { @@ -687,7 +674,7 @@ Status BlockBasedTable::PutDataBlockToCache( const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, const ReadOptions& read_options, Statistics* statistics, - CachableEntry* block, Block* raw_block) { + CachableEntry* block, Block* raw_block, uint32_t format_version) { assert(raw_block->compression_type() == kNoCompression || block_cache_compressed != nullptr); @@ -695,8 +682,8 @@ Status BlockBasedTable::PutDataBlockToCache( // Retrieve the uncompressed contents into a new buffer BlockContents contents; if (raw_block->compression_type() != kNoCompression) { - s = UncompressBlockContents(raw_block->data(), raw_block->size(), - &contents); + s = UncompressBlockContents(raw_block->data(), raw_block->size(), &contents, + format_version); } if (!s.ok()) { delete 
raw_block; @@ -704,7 +691,7 @@ Status BlockBasedTable::PutDataBlockToCache( } if (raw_block->compression_type() != kNoCompression) { - block->value = new Block(contents); // uncompressed block + block->value = new Block(std::move(contents)); // uncompressed block } else { block->value = raw_block; raw_block = nullptr; @@ -738,30 +725,56 @@ Status BlockBasedTable::PutDataBlockToCache( return s; } -FilterBlockReader* BlockBasedTable::ReadFilter(const BlockHandle& filter_handle, - BlockBasedTable::Rep* rep, - size_t* filter_size) { +FilterBlockReader* BlockBasedTable::ReadFilter( + Rep* rep, Iterator* meta_index_iter, size_t* filter_size) { // TODO: We might want to unify with ReadBlockFromFile() if we start // requiring checksum verification in Table::Open. - ReadOptions opt; - BlockContents block; - if (!ReadBlockContents(rep->file.get(), rep->footer, opt, filter_handle, - &block, rep->options.env, false).ok()) { - return nullptr; - } + for (auto prefix : {kFullFilterBlockPrefix, kFilterBlockPrefix}) { + std::string filter_block_key = prefix; + filter_block_key.append(rep->filter_policy->Name()); + BlockHandle handle; + if (FindMetaBlock(meta_index_iter, filter_block_key, &handle).ok()) { + BlockContents block; + if (!ReadBlockContents(rep->file.get(), rep->footer, ReadOptions(), + handle, &block, rep->ioptions.env, false).ok()) { + // Error reading the block + return nullptr; + } - if (filter_size) { - *filter_size = block.data.size(); - } + if (filter_size) { + *filter_size = block.data.size(); + } - return new FilterBlockReader( - rep->options, rep->table_options, block.data, block.heap_allocated); + assert(rep->filter_policy); + if (kFilterBlockPrefix == prefix) { + return new BlockBasedFilterBlockReader( + rep->ioptions.prefix_extractor, rep->table_options, + std::move(block)); + } else if (kFullFilterBlockPrefix == prefix) { + auto filter_bits_reader = rep->filter_policy-> + GetFilterBitsReader(block.data); + if (filter_bits_reader != nullptr) { + return new 
FullFilterBlockReader(rep->ioptions.prefix_extractor, + rep->table_options, + std::move(block), + filter_bits_reader); + } + } else { + assert(false); + return nullptr; + } + } + } + return nullptr; } BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( - bool no_io) const { - // filter pre-populated - if (rep_->filter != nullptr) { + bool no_io) const { + // If cache_index_and_filter_blocks is false, filter should be pre-populated. + // We will return rep_->filter anyway. rep_->filter can be nullptr if filter + // read fails at Open() time. We don't want to reload again since it will + // most probably fail again. + if (!rep_->table_options.cache_index_and_filter_blocks) { return {rep_->filter.get(), nullptr /* cache handle */}; } @@ -773,22 +786,19 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( // Fetching from the cache char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = GetCacheKey( - rep_->cache_key_prefix, - rep_->cache_key_prefix_size, - rep_->footer.metaindex_handle(), - cache_key - ); - - Statistics* statistics = rep_->options.statistics.get(); + auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + rep_->footer.metaindex_handle(), + cache_key); + + Statistics* statistics = rep_->ioptions.statistics; auto cache_handle = GetEntryFromCache(block_cache, key, BLOCK_CACHE_FILTER_MISS, BLOCK_CACHE_FILTER_HIT, statistics); FilterBlockReader* filter = nullptr; if (cache_handle != nullptr) { - filter = reinterpret_cast( - block_cache->Value(cache_handle)); + filter = reinterpret_cast( + block_cache->Value(cache_handle)); } else if (no_io) { // Do not invoke any io. 
return CachableEntry(); @@ -799,14 +809,9 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( auto s = ReadMetaBlock(rep_, &meta, &iter); if (s.ok()) { - std::string filter_block_key = kFilterBlockPrefix; - filter_block_key.append(rep_->filter_policy->Name()); - BlockHandle handle; - if (FindMetaBlock(iter.get(), filter_block_key, &handle).ok()) { - filter = ReadFilter(handle, rep_, &filter_size); - assert(filter); + filter = ReadFilter(rep_, iter.get(), &filter_size); + if (filter != nullptr) { assert(filter_size > 0); - cache_handle = block_cache->Insert( key, filter, filter_size, &DeleteCachedEntry); RecordTick(statistics, BLOCK_CACHE_ADD); @@ -830,7 +835,7 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options, char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, rep_->footer.index_handle(), cache_key); - Statistics* statistics = rep_->options.statistics.get(); + Statistics* statistics = rep_->ioptions.statistics; auto cache_handle = GetEntryFromCache(block_cache, key, BLOCK_CACHE_INDEX_MISS, BLOCK_CACHE_INDEX_HIT, statistics); @@ -906,7 +911,7 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, // If either block cache is enabled, we'll try to read from it. 
if (block_cache != nullptr || block_cache_compressed != nullptr) { - Statistics* statistics = rep->options.statistics.get(); + Statistics* statistics = rep->ioptions.statistics; char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; char compressed_cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; Slice key, /* key to the block cache */ @@ -914,8 +919,8 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, // create key for block cache if (block_cache != nullptr) { - key = GetCacheKey(rep->cache_key_prefix, - rep->cache_key_prefix_size, handle, cache_key); + key = GetCacheKey(rep->cache_key_prefix, rep->cache_key_prefix_size, + handle, cache_key); } if (block_cache_compressed != nullptr) { @@ -925,20 +930,22 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, } s = GetDataBlockFromCache(key, ckey, block_cache, block_cache_compressed, - statistics, ro, &block); + statistics, ro, &block, + rep->table_options.format_version); if (block.value == nullptr && !no_io && ro.fill_cache) { Block* raw_block = nullptr; { - StopWatch sw(rep->options.env, statistics, READ_BLOCK_GET_MICROS); + StopWatch sw(rep->ioptions.env, statistics, READ_BLOCK_GET_MICROS); s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle, - &raw_block, rep->options.env, + &raw_block, rep->ioptions.env, block_cache_compressed == nullptr); } if (s.ok()) { s = PutDataBlockToCache(key, ckey, block_cache, block_cache_compressed, - ro, statistics, &block, raw_block); + ro, statistics, &block, raw_block, + rep->table_options.format_version); } } } @@ -955,7 +962,7 @@ Iterator* BlockBasedTable::NewDataBlockIterator(Rep* rep, } } s = ReadBlockFromFile(rep->file.get(), rep->footer, ro, handle, - &block.value, rep->options.env); + &block.value, rep->ioptions.env); } Iterator* iter; @@ -982,7 +989,8 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState { public: BlockEntryIteratorState(BlockBasedTable* table, const ReadOptions& read_options) - : 
TwoLevelIteratorState(table->rep_->options.prefix_extractor != nullptr), + : TwoLevelIteratorState( + table->rep_->ioptions.prefix_extractor != nullptr), table_(table), read_options_(read_options) {} @@ -1020,8 +1028,8 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { return true; } - assert(rep_->options.prefix_extractor != nullptr); - auto prefix = rep_->options.prefix_extractor->Transform( + assert(rep_->ioptions.prefix_extractor != nullptr); + auto prefix = rep_->ioptions.prefix_extractor->Transform( ExtractUserKey(internal_key)); InternalKey internal_key_prefix(prefix, 0, kTypeValue); auto internal_prefix = internal_key_prefix.Encode(); @@ -1034,50 +1042,59 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { // loaded to memory. ReadOptions no_io_read_options; no_io_read_options.read_tier = kBlockCacheTier; - unique_ptr iiter(NewIndexIterator(no_io_read_options)); - iiter->Seek(internal_prefix); - - if (!iiter->Valid()) { - // we're past end of file - // if it's incomplete, it means that we avoided I/O - // and we're not really sure that we're past the end - // of the file - may_match = iiter->status().IsIncomplete(); - } else if (ExtractUserKey(iiter->key()).starts_with( - ExtractUserKey(internal_prefix))) { - // we need to check for this subtle case because our only - // guarantee is that "the key is a string >= last key in that data - // block" according to the doc/table_format.txt spec. - // - // Suppose iiter->key() starts with the desired prefix; it is not - // necessarily the case that the corresponding data block will - // contain the prefix, since iiter->key() need not be in the - // block. However, the next data block may contain the prefix, so - // we return true to play it safe. - may_match = true; - } else { - // iiter->key() does NOT start with the desired prefix. Because - // Seek() finds the first key that is >= the seek target, this - // means that iiter->key() > prefix. 
Thus, any data blocks coming - // after the data block corresponding to iiter->key() cannot - // possibly contain the key. Thus, the corresponding data block - // is the only one which could potentially contain the prefix. - Slice handle_value = iiter->value(); - BlockHandle handle; - s = handle.DecodeFrom(&handle_value); - assert(s.ok()); - auto filter_entry = GetFilter(true /* no io */); - may_match = filter_entry.value == nullptr || - filter_entry.value->PrefixMayMatch(handle.offset(), prefix); - filter_entry.Release(rep_->table_options.block_cache.get()); + + // First, try check with full filter + auto filter_entry = GetFilter(true /* no io */); + FilterBlockReader* filter = filter_entry.value; + if (filter != nullptr && !filter->IsBlockBased()) { + may_match = filter->PrefixMayMatch(prefix); + } + + // Then, try find it within each block + if (may_match) { + unique_ptr iiter(NewIndexIterator(no_io_read_options)); + iiter->Seek(internal_prefix); + + if (!iiter->Valid()) { + // we're past end of file + // if it's incomplete, it means that we avoided I/O + // and we're not really sure that we're past the end + // of the file + may_match = iiter->status().IsIncomplete(); + } else if (ExtractUserKey(iiter->key()).starts_with( + ExtractUserKey(internal_prefix))) { + // we need to check for this subtle case because our only + // guarantee is that "the key is a string >= last key in that data + // block" according to the doc/table_format.txt spec. + // + // Suppose iiter->key() starts with the desired prefix; it is not + // necessarily the case that the corresponding data block will + // contain the prefix, since iiter->key() need not be in the + // block. However, the next data block may contain the prefix, so + // we return true to play it safe. + may_match = true; + } else if (filter != nullptr && filter->IsBlockBased()) { + // iiter->key() does NOT start with the desired prefix. 
Because + // Seek() finds the first key that is >= the seek target, this + // means that iiter->key() > prefix. Thus, any data blocks coming + // after the data block corresponding to iiter->key() cannot + // possibly contain the key. Thus, the corresponding data block + // is the only on could potentially contain the prefix. + Slice handle_value = iiter->value(); + BlockHandle handle; + s = handle.DecodeFrom(&handle_value); + assert(s.ok()); + may_match = filter->PrefixMayMatch(prefix, handle.offset()); + } } - Statistics* statistics = rep_->options.statistics.get(); + Statistics* statistics = rep_->ioptions.statistics; RecordTick(statistics, BLOOM_FILTER_PREFIX_CHECKED); if (!may_match) { RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); } + filter_entry.Release(rep_->table_options.block_cache.get()); return may_match; } @@ -1087,70 +1104,91 @@ Iterator* BlockBasedTable::NewIterator(const ReadOptions& read_options, NewIndexIterator(read_options), arena); } +bool BlockBasedTable::FullFilterKeyMayMatch(FilterBlockReader* filter, + const Slice& internal_key) const { + if (filter == nullptr || filter->IsBlockBased()) { + return true; + } + Slice user_key = ExtractUserKey(internal_key); + if (!filter->KeyMayMatch(user_key)) { + return false; + } + if (rep_->ioptions.prefix_extractor && + !filter->PrefixMayMatch( + rep_->ioptions.prefix_extractor->Transform(user_key))) { + return false; + } + return true; +} + Status BlockBasedTable::Get( - const ReadOptions& read_options, const Slice& key, void* handle_context, - bool (*result_handler)(void* handle_context, const ParsedInternalKey& k, - const Slice& v), - void (*mark_key_may_exist_handler)(void* handle_context)) { + const ReadOptions& read_options, const Slice& key, + GetContext* get_context) { Status s; - BlockIter iiter; - NewIndexIterator(read_options, &iiter); - auto filter_entry = GetFilter(read_options.read_tier == kBlockCacheTier); FilterBlockReader* filter = filter_entry.value; - bool done = false; - for 
(iiter.Seek(key); iiter.Valid() && !done; iiter.Next()) { - Slice handle_value = iiter.value(); - BlockHandle handle; - bool may_not_exist_in_filter = - filter != nullptr && handle.DecodeFrom(&handle_value).ok() && - !filter->KeyMayMatch(handle.offset(), ExtractUserKey(key)); - - if (may_not_exist_in_filter) { - // Not found - // TODO: think about interaction with Merge. If a user key cannot - // cross one data block, we should be fine. - RecordTick(rep_->options.statistics.get(), BLOOM_FILTER_USEFUL); - break; - } else { - BlockIter biter; - NewDataBlockIterator(rep_, read_options, iiter.value(), &biter); - - if (read_options.read_tier && biter.status().IsIncomplete()) { - // couldn't get block from block_cache - // Update Saver.state to Found because we are only looking for whether - // we can guarantee the key is not there when "no_io" is set - (*mark_key_may_exist_handler)(handle_context); - break; - } - if (!biter.status().ok()) { - s = biter.status(); - break; - } + // First check the full filter + // If full filter not useful, Then go into each block + if (!FullFilterKeyMayMatch(filter, key)) { + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + } else { + BlockIter iiter; + NewIndexIterator(read_options, &iiter); - // Call the *saver function on each entry/block until it returns false - for (biter.Seek(key); biter.Valid(); biter.Next()) { - ParsedInternalKey parsed_key; - if (!ParseInternalKey(biter.key(), &parsed_key)) { - s = Status::Corruption(Slice()); - } + bool done = false; + for (iiter.Seek(key); iiter.Valid() && !done; iiter.Next()) { + Slice handle_value = iiter.value(); - if (!(*result_handler)(handle_context, parsed_key, - biter.value())) { - done = true; + BlockHandle handle; + bool not_exist_in_filter = + filter != nullptr && filter->IsBlockBased() == true && + handle.DecodeFrom(&handle_value).ok() && + !filter->KeyMayMatch(ExtractUserKey(key), handle.offset()); + + if (not_exist_in_filter) { + // Not found + // TODO: think about 
interaction with Merge. If a user key cannot + // cross one data block, we should be fine. + RecordTick(rep_->ioptions.statistics, BLOOM_FILTER_USEFUL); + break; + } else { + BlockIter biter; + NewDataBlockIterator(rep_, read_options, iiter.value(), &biter); + + if (read_options.read_tier && biter.status().IsIncomplete()) { + // couldn't get block from block_cache + // Update Saver.state to Found because we are only looking for whether + // we can guarantee the key is not there when "no_io" is set + get_context->MarkKeyMayExist(); break; } + if (!biter.status().ok()) { + s = biter.status(); + break; + } + + // Call the *saver function on each entry/block until it returns false + for (biter.Seek(key); biter.Valid(); biter.Next()) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(biter.key(), &parsed_key)) { + s = Status::Corruption(Slice()); + } + + if (!get_context->SaveValue(parsed_key, biter.value())) { + done = true; + break; + } + } + s = biter.status(); } - s = biter.status(); + } + if (s.ok()) { + s = iiter.status(); } } filter_entry.Release(rep_->table_options.block_cache.get()); - if (s.ok()) { - s = iiter.status(); - } - return s; } @@ -1170,12 +1208,13 @@ bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, char cache_key_storage[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; Slice cache_key = - GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, handle, - cache_key_storage); + GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + handle, cache_key_storage); Slice ckey; s = GetDataBlockFromCache(cache_key, ckey, block_cache, nullptr, nullptr, - options, &block); + options, &block, + rep_->table_options.format_version); assert(s.ok()); bool in_cache = block.value != nullptr; if (in_cache) { @@ -1205,13 +1244,13 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, } auto file = rep_->file.get(); - auto env = rep_->options.env; + auto env = rep_->ioptions.env; auto comparator = 
&rep_->internal_comparator; const Footer& footer = rep_->footer; if (index_type_on_file == BlockBasedTableOptions::kHashSearch && - rep_->options.prefix_extractor == nullptr) { - Log(rep_->options.info_log, + rep_->ioptions.prefix_extractor == nullptr) { + Log(InfoLogLevel::WARN_LEVEL, rep_->ioptions.info_log, "BlockBasedTableOptions::kHashSearch requires " "options.prefix_extractor to be set." " Fall back to binary seach index."); @@ -1232,7 +1271,7 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, if (!s.ok()) { // we simply fall back to binary search in case there is any // problem with prefix hash index loading. - Log(rep_->options.info_log, + Log(InfoLogLevel::WARN_LEVEL, rep_->ioptions.info_log, "Unable to read the metaindex block." " Fall back to binary seach index."); return BinarySearchIndexReader::Create( @@ -1244,7 +1283,7 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, // We need to wrap data with internal_prefix_transform to make sure it can // handle prefix correctly. 
rep_->internal_prefix_transform.reset( - new InternalKeySliceTransform(rep_->options.prefix_extractor.get())); + new InternalKeySliceTransform(rep_->ioptions.prefix_extractor)); return HashIndexReader::Create( rep_->internal_prefix_transform.get(), footer, file, env, comparator, footer.index_handle(), meta_index_iter, index_reader, @@ -1252,7 +1291,7 @@ Status BlockBasedTable::CreateIndexReader(IndexReader** index_reader, } default: { std::string error_message = - "Unrecognized index type: " + std::to_string(rep_->index_type); + "Unrecognized index type: " + ToString(rep_->index_type); return Status::InvalidArgument(error_message.c_str()); } } @@ -1299,4 +1338,217 @@ bool BlockBasedTable::TEST_index_reader_preloaded() const { return rep_->index_reader != nullptr; } +Status BlockBasedTable::DumpTable(WritableFile* out_file) { + // Output Footer + out_file->Append( + "Footer Details:\n" + "--------------------------------------\n" + " "); + out_file->Append(rep_->footer.ToString().c_str()); + out_file->Append("\n"); + + // Output MetaIndex + out_file->Append( + "Metaindex Details:\n" + "--------------------------------------\n"); + std::unique_ptr meta; + std::unique_ptr meta_iter; + Status s = ReadMetaBlock(rep_, &meta, &meta_iter); + if (s.ok()) { + for (meta_iter->SeekToFirst(); meta_iter->Valid(); meta_iter->Next()) { + s = meta_iter->status(); + if (!s.ok()) { + return s; + } + if (meta_iter->key() == rocksdb::kPropertiesBlock) { + out_file->Append(" Properties block handle: "); + out_file->Append(meta_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } else if (strstr(meta_iter->key().ToString().c_str(), + "filter.rocksdb.") != nullptr) { + out_file->Append(" Filter block handle: "); + out_file->Append(meta_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + } + } + out_file->Append("\n"); + } else { + return s; + } + + // Output TableProperties + const rocksdb::TableProperties* table_properties; + table_properties = 
rep_->table_properties.get(); + + if (table_properties != nullptr) { + out_file->Append( + "Table Properties:\n" + "--------------------------------------\n" + " "); + out_file->Append(table_properties->ToString("\n ", ": ").c_str()); + out_file->Append("\n"); + } + + // Output Filter blocks + if (!rep_->filter && !table_properties->filter_policy_name.empty()) { + // Support only BloomFilter as off now + rocksdb::BlockBasedTableOptions table_options; + table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(1)); + if (table_properties->filter_policy_name.compare( + table_options.filter_policy->Name()) == 0) { + std::string filter_block_key = kFilterBlockPrefix; + filter_block_key.append(table_properties->filter_policy_name); + BlockHandle handle; + if (FindMetaBlock(meta_iter.get(), filter_block_key, &handle).ok()) { + BlockContents block; + if (ReadBlockContents(rep_->file.get(), rep_->footer, ReadOptions(), + handle, &block, rep_->ioptions.env, false).ok()) { + rep_->filter.reset( + new BlockBasedFilterBlockReader(rep_->ioptions.prefix_extractor, + table_options, std::move(block))); + } + } + } + } + if (rep_->filter) { + out_file->Append( + "Filter Details:\n" + "--------------------------------------\n" + " "); + out_file->Append(rep_->filter->ToString().c_str()); + out_file->Append("\n"); + } + + // Output Index block + s = DumpIndexBlock(out_file); + if (!s.ok()) { + return s; + } + // Output Data blocks + s = DumpDataBlocks(out_file); + + return s; +} + +Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { + out_file->Append( + "Index Details:\n" + "--------------------------------------\n"); + + std::unique_ptr blockhandles_iter(NewIndexIterator(ReadOptions())); + Status s = blockhandles_iter->status(); + if (!s.ok()) { + out_file->Append("Can not read Index Block \n\n"); + return s; + } + + out_file->Append(" Block key hex dump: Data block handle\n"); + out_file->Append(" Block key ascii\n\n"); + for (blockhandles_iter->SeekToFirst(); 
blockhandles_iter->Valid(); + blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + if (!s.ok()) { + break; + } + Slice key = blockhandles_iter->key(); + InternalKey ikey; + ikey.DecodeFrom(key); + + out_file->Append(" HEX "); + out_file->Append(ikey.user_key().ToString(true).c_str()); + out_file->Append(": "); + out_file->Append(blockhandles_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + + std::string str_key = ikey.user_key().ToString(); + std::string res_key(""); + char cspace = ' '; + for (size_t i = 0; i < str_key.size(); i++) { + res_key.append(&str_key[i], 1); + res_key.append(1, cspace); + } + out_file->Append(" ASCII "); + out_file->Append(res_key.c_str()); + out_file->Append("\n ------\n"); + } + out_file->Append("\n"); + return Status::OK(); +} + +Status BlockBasedTable::DumpDataBlocks(WritableFile* out_file) { + std::unique_ptr blockhandles_iter(NewIndexIterator(ReadOptions())); + Status s = blockhandles_iter->status(); + if (!s.ok()) { + out_file->Append("Can not read Index Block \n\n"); + return s; + } + + size_t block_id = 1; + for (blockhandles_iter->SeekToFirst(); blockhandles_iter->Valid(); + block_id++, blockhandles_iter->Next()) { + s = blockhandles_iter->status(); + if (!s.ok()) { + break; + } + + out_file->Append("Data Block # "); + out_file->Append(std::to_string(block_id)); + out_file->Append(" @ "); + out_file->Append(blockhandles_iter->value().ToString(true).c_str()); + out_file->Append("\n"); + out_file->Append("--------------------------------------\n"); + + std::unique_ptr datablock_iter; + datablock_iter.reset( + NewDataBlockIterator(rep_, ReadOptions(), blockhandles_iter->value())); + s = datablock_iter->status(); + + if (!s.ok()) { + out_file->Append("Error reading the block - Skipped \n\n"); + continue; + } + + for (datablock_iter->SeekToFirst(); datablock_iter->Valid(); + datablock_iter->Next()) { + s = datablock_iter->status(); + if (!s.ok()) { + out_file->Append("Error reading the block - Skipped 
\n"); + break; + } + Slice key = datablock_iter->key(); + Slice value = datablock_iter->value(); + InternalKey ikey, iValue; + ikey.DecodeFrom(key); + iValue.DecodeFrom(value); + + out_file->Append(" HEX "); + out_file->Append(ikey.user_key().ToString(true).c_str()); + out_file->Append(": "); + out_file->Append(iValue.user_key().ToString(true).c_str()); + out_file->Append("\n"); + + std::string str_key = ikey.user_key().ToString(); + std::string str_value = iValue.user_key().ToString(); + std::string res_key(""), res_value(""); + char cspace = ' '; + for (size_t i = 0; i < str_key.size(); i++) { + res_key.append(&str_key[i], 1); + res_key.append(1, cspace); + } + for (size_t i = 0; i < str_value.size(); i++) { + res_value.append(&str_value[i], 1); + res_value.append(1, cspace); + } + + out_file->Append(" ASCII "); + out_file->Append(res_key.c_str()); + out_file->Append(": "); + out_file->Append(res_value.c_str()); + out_file->Append("\n ------\n"); + } + out_file->Append("\n"); + } + return Status::OK(); +} + } // namespace rocksdb diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 3ff97dda6..e3594cf7c 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -14,10 +14,12 @@ #include #include +#include "rocksdb/options.h" #include "rocksdb/statistics.h" #include "rocksdb/status.h" #include "rocksdb/table.h" #include "table/table_reader.h" +#include "table/table_properties_internal.h" #include "util/coding.h" namespace rocksdb { @@ -27,6 +29,8 @@ class BlockIter; class BlockHandle; class Cache; class FilterBlockReader; +class BlockBasedFilterBlockReader; +class FullFilterBlockReader; class Footer; class InternalKeyComparator; class Iterator; @@ -36,8 +40,8 @@ class TableReader; class WritableFile; struct BlockBasedTableOptions; struct EnvOptions; -struct Options; struct ReadOptions; +class GetContext; using std::unique_ptr; @@ -47,6 +51,7 @@ using std::unique_ptr; class BlockBasedTable : public 
TableReader { public: static const std::string kFilterBlockPrefix; + static const std::string kFullFilterBlockPrefix; // Attempt to open the table that is stored in bytes [0..file_size) // of "file", and read the metadata entries necessary to allow @@ -58,7 +63,8 @@ class BlockBasedTable : public TableReader { // to nullptr and returns a non-ok status. // // *file must remain live while this Table is in use. - static Status Open(const Options& db_options, const EnvOptions& env_options, + static Status Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_key_comparator, unique_ptr&& file, uint64_t file_size, @@ -72,11 +78,7 @@ class BlockBasedTable : public TableReader { Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) override; Status Get(const ReadOptions& readOptions, const Slice& key, - void* handle_context, - bool (*result_handler)(void* handle_context, - const ParsedInternalKey& k, const Slice& v), - void (*mark_key_may_exist_handler)(void* handle_context) = - nullptr) override; + GetContext* get_context) override; // Given a key, return an approximate byte offset in the file where // the data for that key begins (or would begin if the key were @@ -98,6 +100,9 @@ class BlockBasedTable : public TableReader { size_t ApproximateMemoryUsage() const override; + // convert SST file to a human readable form + Status DumpTable(WritableFile* out_file) override; + ~BlockBasedTable(); bool TEST_filter_block_preloaded() const; @@ -145,7 +150,7 @@ class BlockBasedTable : public TableReader { const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, Statistics* statistics, const ReadOptions& read_options, - BlockBasedTable::CachableEntry* block); + BlockBasedTable::CachableEntry* block, uint32_t format_version); // Put a raw block (maybe compressed) to the corresponding block caches. 
// This method will perform decompression against raw_block if needed and then // populate the block caches. @@ -158,7 +163,7 @@ class BlockBasedTable : public TableReader { const Slice& block_cache_key, const Slice& compressed_block_cache_key, Cache* block_cache, Cache* block_cache_compressed, const ReadOptions& read_options, Statistics* statistics, - CachableEntry* block, Block* raw_block); + CachableEntry* block, Block* raw_block, uint32_t format_version); // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found // after a call to Seek(key), until handle_result returns false. @@ -175,6 +180,9 @@ class BlockBasedTable : public TableReader { Status CreateIndexReader(IndexReader** index_reader, Iterator* preloaded_meta_index_iter = nullptr); + bool FullFilterKeyMayMatch(FilterBlockReader* filter, + const Slice& user_key) const; + // Read the meta block from sst. static Status ReadMetaBlock( Rep* rep, @@ -182,8 +190,10 @@ class BlockBasedTable : public TableReader { std::unique_ptr* iter); // Create the filter from the filter block. - static FilterBlockReader* ReadFilter(const BlockHandle& filter_handle, - Rep* rep, size_t* filter_size = nullptr); + static FilterBlockReader* ReadFilter( + Rep* rep, + Iterator* meta_index_iter, + size_t* filter_size = nullptr); static void SetupCacheKeyPrefix(Rep* rep); @@ -200,6 +210,10 @@ class BlockBasedTable : public TableReader { // For Posix files the unique ID is three varints. 
static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length*3+1; + // Helper functions for DumpTable() + Status DumpIndexBlock(WritableFile* out_file); + Status DumpDataBlocks(WritableFile* out_file); + // No copying allowed explicit BlockBasedTable(const TableReader&) = delete; void operator=(const TableReader&) = delete; diff --git a/table/block_builder.cc b/table/block_builder.cc index 5bac54ae7..1eee96d46 100644 --- a/table/block_builder.cc +++ b/table/block_builder.cc @@ -41,10 +41,8 @@ namespace rocksdb { -BlockBuilder::BlockBuilder(int block_restart_interval, - const Comparator* comparator) +BlockBuilder::BlockBuilder(int block_restart_interval) : block_restart_interval_(block_restart_interval), - comparator_(comparator), restarts_(), counter_(0), finished_(false) { @@ -87,7 +85,7 @@ Slice BlockBuilder::Finish() { for (size_t i = 0; i < restarts_.size(); i++) { PutFixed32(&buffer_, restarts_[i]); } - PutFixed32(&buffer_, restarts_.size()); + PutFixed32(&buffer_, static_cast(restarts_.size())); finished_ = true; return Slice(buffer_); } @@ -96,8 +94,6 @@ void BlockBuilder::Add(const Slice& key, const Slice& value) { Slice last_key_piece(last_key_); assert(!finished_); assert(counter_ <= block_restart_interval_); - assert(buffer_.empty() // No values yet? 
- || comparator_->Compare(key, last_key_piece) > 0); size_t shared = 0; if (counter_ < block_restart_interval_) { // See how much sharing to do with previous string @@ -107,15 +103,15 @@ void BlockBuilder::Add(const Slice& key, const Slice& value) { } } else { // Restart compression - restarts_.push_back(buffer_.size()); + restarts_.push_back(static_cast(buffer_.size())); counter_ = 0; } const size_t non_shared = key.size() - shared; // Add "" to buffer_ - PutVarint32(&buffer_, shared); - PutVarint32(&buffer_, non_shared); - PutVarint32(&buffer_, value.size()); + PutVarint32(&buffer_, static_cast(shared)); + PutVarint32(&buffer_, static_cast(non_shared)); + PutVarint32(&buffer_, static_cast(value.size())); // Add string delta to buffer_ followed by value buffer_.append(key.data() + shared, non_shared); diff --git a/table/block_builder.h b/table/block_builder.h index eb7c49f7d..c01a23bea 100644 --- a/table/block_builder.h +++ b/table/block_builder.h @@ -15,15 +15,13 @@ namespace rocksdb { -class Comparator; - class BlockBuilder { public: BlockBuilder(const BlockBuilder&) = delete; void operator=(const BlockBuilder&) = delete; - - BlockBuilder(int block_restart_interval, const Comparator* comparator); - + + explicit BlockBuilder(int block_restart_interval); + // Reset the contents as if the BlockBuilder was just constructed. 
void Reset(); @@ -50,7 +48,6 @@ class BlockBuilder { private: const int block_restart_interval_; - const Comparator* comparator_; std::string buffer_; // Destination buffer std::vector restarts_; // Restart points diff --git a/table/block_hash_index.cc b/table/block_hash_index.cc index 7a6e219a0..a8c965864 100644 --- a/table/block_hash_index.cc +++ b/table/block_hash_index.cc @@ -59,7 +59,7 @@ BlockHashIndex* CreateBlockHashIndexOnTheFly( auto hash_index = new BlockHashIndex( hash_key_extractor, true /* hash_index will copy prefix when Add() is called */); - uint64_t current_restart_index = 0; + uint32_t current_restart_index = 0; std::string pending_entry_prefix; // pending_block_num == 0 also implies there is no entry inserted at all. diff --git a/table/block_hash_index.h b/table/block_hash_index.h index d5603d366..582910796 100644 --- a/table/block_hash_index.h +++ b/table/block_hash_index.h @@ -25,8 +25,8 @@ class BlockHashIndex { public: // Represents a restart index in the index block's restart array. struct RestartIndex { - explicit RestartIndex(uint32_t first_index, uint32_t num_blocks = 1) - : first_index(first_index), num_blocks(num_blocks) {} + explicit RestartIndex(uint32_t _first_index, uint32_t _num_blocks = 1) + : first_index(_first_index), num_blocks(_num_blocks) {} // For a given prefix, what is the restart index for the first data block // that contains it. 
diff --git a/table/block_hash_index_test.cc b/table/block_hash_index_test.cc index 6f7bcb2b7..8a6d1b093 100644 --- a/table/block_hash_index_test.cc +++ b/table/block_hash_index_test.cc @@ -82,8 +82,8 @@ TEST(BlockTest, BasicTest) { auto prefix_extractor = NewFixedPrefixTransform(prefix_size); std::unique_ptr block_hash_index(CreateBlockHashIndexOnTheFly( - &index_iter, &data_iter, index_entries.size(), BytewiseComparator(), - prefix_extractor)); + &index_iter, &data_iter, static_cast(index_entries.size()), + BytewiseComparator(), prefix_extractor)); std::map expected = { {"01xx", BlockHashIndex::RestartIndex(0, 1)}, diff --git a/table/block_prefix_index.cc b/table/block_prefix_index.cc index f06dcd9fe..147bcf56e 100644 --- a/table/block_prefix_index.cc +++ b/table/block_prefix_index.cc @@ -87,7 +87,7 @@ class BlockPrefixIndex::Builder { BlockPrefixIndex* Finish() { // For now, use roughly 1:1 prefix to bucket ratio. - uint32_t num_buckets = prefixes_.size() + 1; + uint32_t num_buckets = static_cast(prefixes_.size()) + 1; // Collect prefix records that hash to the same bucket, into a single // linklist. 
@@ -143,8 +143,8 @@ class BlockPrefixIndex::Builder { auto current = prefixes_per_bucket[i]; // populate block ids from largest to smallest while (current != nullptr) { - for (uint32_t i = 0; i < current->num_blocks; i++) { - *last_block = current->end_block - i; + for (uint32_t iter = 0; iter < current->num_blocks; iter++) { + *last_block = current->end_block - iter; last_block--; } current = current->next; @@ -210,8 +210,8 @@ Status BlockPrefixIndex::Create(const SliceTransform* internal_prefix_extractor, return s; } -const uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, - uint32_t** blocks) { +uint32_t BlockPrefixIndex::GetBlocks(const Slice& key, + uint32_t** blocks) { Slice prefix = internal_prefix_extractor_->Transform(key); uint32_t bucket = PrefixToBucket(prefix, num_buckets_); diff --git a/table/block_prefix_index.h b/table/block_prefix_index.h index 2afecadd2..662bc09aa 100644 --- a/table/block_prefix_index.h +++ b/table/block_prefix_index.h @@ -23,7 +23,7 @@ class BlockPrefixIndex { // the key, based on the prefix. // Returns the total number of relevant blocks, 0 means the key does // not exist. 
- const uint32_t GetBlocks(const Slice& key, uint32_t** blocks); + uint32_t GetBlocks(const Slice& key, uint32_t** blocks); size_t ApproximateMemoryUsage() const { return sizeof(BlockPrefixIndex) + diff --git a/table/block_test.cc b/table/block_test.cc index da01d6def..fa263bcbd 100644 --- a/table/block_test.cc +++ b/table/block_test.cc @@ -76,7 +76,7 @@ TEST(BlockTest, SimpleTest) { std::vector keys; std::vector values; - BlockBuilder builder(16, ic.get()); + BlockBuilder builder(16); int num_records = 100000; GenerateRandomKVs(&keys, &values, 0, num_records); @@ -92,8 +92,7 @@ TEST(BlockTest, SimpleTest) { BlockContents contents; contents.data = rawblock; contents.cachable = false; - contents.heap_allocated = false; - Block reader(contents); + Block reader(std::move(contents)); // read contents of block sequentially int count = 0; @@ -132,8 +131,7 @@ BlockContents GetBlockContents(std::unique_ptr *builder, const std::vector &keys, const std::vector &values, const int prefix_group_size = 1) { - builder->reset( - new BlockBuilder(1 /* restart interval */, BytewiseComparator())); + builder->reset(new BlockBuilder(1 /* restart interval */)); // Add only half of the keys for (size_t i = 0; i < keys.size(); ++i) { @@ -144,7 +142,6 @@ BlockContents GetBlockContents(std::unique_ptr *builder, BlockContents contents; contents.data = rawblock; contents.cachable = false; - contents.heap_allocated = false; return contents; } @@ -154,8 +151,10 @@ void CheckBlockContents(BlockContents contents, const int max_key, const std::vector &values) { const size_t prefix_size = 6; // create block reader - Block reader1(contents); - Block reader2(contents); + BlockContents contents_ref(contents.data, contents.cachable, + contents.compression_type); + Block reader1(std::move(contents)); + Block reader2(std::move(contents_ref)); std::unique_ptr prefix_extractor( NewFixedPrefixTransform(prefix_size)); @@ -164,7 +163,7 @@ void CheckBlockContents(BlockContents contents, const int max_key, auto 
iter1 = reader1.NewIterator(nullptr); auto iter2 = reader1.NewIterator(nullptr); reader1.SetBlockHashIndex(CreateBlockHashIndexOnTheFly( - iter1, iter2, keys.size(), BytewiseComparator(), + iter1, iter2, static_cast(keys.size()), BytewiseComparator(), prefix_extractor.get())); delete iter1; @@ -213,7 +212,7 @@ TEST(BlockTest, SimpleIndexHash) { std::unique_ptr builder; auto contents = GetBlockContents(&builder, keys, values); - CheckBlockContents(contents, kMaxKey, keys, values); + CheckBlockContents(std::move(contents), kMaxKey, keys, values); } TEST(BlockTest, IndexHashWithSharedPrefix) { @@ -232,7 +231,7 @@ TEST(BlockTest, IndexHashWithSharedPrefix) { std::unique_ptr builder; auto contents = GetBlockContents(&builder, keys, values, kPrefixGroup); - CheckBlockContents(contents, kMaxKey, keys, values); + CheckBlockContents(std::move(contents), kMaxKey, keys, values); } } // namespace rocksdb diff --git a/table/bloom_block.cc b/table/bloom_block.cc index c44ab66ca..cfea8a2c5 100644 --- a/table/bloom_block.cc +++ b/table/bloom_block.cc @@ -11,7 +11,7 @@ namespace rocksdb { -void BloomBlockBuilder::AddKeysHashes(const std::vector keys_hashes) { +void BloomBlockBuilder::AddKeysHashes(const std::vector& keys_hashes) { for (auto hash : keys_hashes) { bloom_.AddHash(hash); } diff --git a/table/bloom_block.h b/table/bloom_block.h index d55453eda..5b60d2bca 100644 --- a/table/bloom_block.h +++ b/table/bloom_block.h @@ -18,15 +18,16 @@ class BloomBlockBuilder { explicit BloomBlockBuilder(uint32_t num_probes = 6) : bloom_(num_probes, nullptr) {} - void SetTotalBits(Arena* arena, uint32_t total_bits, uint32_t locality, - size_t huge_page_tlb_size, Logger* logger) { - bloom_.SetTotalBits(arena, total_bits, locality, huge_page_tlb_size, + void SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, size_t huge_page_tlb_size, + Logger* logger) { + bloom_.SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size, logger); } uint32_t GetNumBlocks() 
const { return bloom_.GetNumBlocks(); } - void AddKeysHashes(const std::vector keys_hashes); + void AddKeysHashes(const std::vector& keys_hashes); Slice Finish(); diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index 6326d3787..1aa1e0707 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -21,6 +21,7 @@ #include "table/meta_blocks.h" #include "util/autovector.h" #include "util/random.h" +#include "util/string_util.h" namespace rocksdb { const std::string CuckooTablePropertyNames::kEmptyKey = @@ -35,6 +36,12 @@ const std::string CuckooTablePropertyNames::kIsLastLevel = "rocksdb.cuckoo.file.islastlevel"; const std::string CuckooTablePropertyNames::kCuckooBlockSize = "rocksdb.cuckoo.hash.cuckooblocksize"; +const std::string CuckooTablePropertyNames::kIdentityAsFirstHash = + "rocksdb.cuckoo.hash.identityfirst"; +const std::string CuckooTablePropertyNames::kUseModuleHash = + "rocksdb.cuckoo.hash.usemodule"; +const std::string CuckooTablePropertyNames::kUserKeyLength = + "rocksdb.cuckoo.hash.userkeylength"; // Obtained by running echo rocksdb.table.cuckoo | sha1sum extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull; @@ -43,6 +50,7 @@ CuckooTableBuilder::CuckooTableBuilder( WritableFile* file, double max_hash_table_ratio, uint32_t max_num_hash_table, uint32_t max_search_depth, const Comparator* user_comparator, uint32_t cuckoo_block_size, + bool use_module_hash, bool identity_as_first_hash, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) : num_hash_func_(2), file_(file), @@ -50,13 +58,19 @@ CuckooTableBuilder::CuckooTableBuilder( max_num_hash_func_(max_num_hash_table), max_search_depth_(max_search_depth), cuckoo_block_size_(std::max(1U, cuckoo_block_size)), - hash_table_size_(2), + hash_table_size_(use_module_hash ? 
0 : 2), is_last_level_file_(false), has_seen_first_key_(false), + has_seen_first_value_(false), + key_size_(0), + value_size_(0), + num_entries_(0), + num_values_(0), ucomp_(user_comparator), + use_module_hash_(use_module_hash), + identity_as_first_hash_(identity_as_first_hash), get_slice_hash_(get_slice_hash), closed_(false) { - properties_.num_entries = 0; // Data is in a huge block. properties_.num_data_blocks = 1; properties_.index_size = 0; @@ -64,7 +78,7 @@ CuckooTableBuilder::CuckooTableBuilder( } void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { - if (properties_.num_entries >= kMaxVectorIdx - 1) { + if (num_entries_ >= kMaxVectorIdx - 1) { status_ = Status::NotSupported("Number of keys in a file must be < 2^32-1"); return; } @@ -73,6 +87,12 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { status_ = Status::Corruption("Unable to parse key into inernal key."); return; } + if (ikey.type != kTypeDeletion && ikey.type != kTypeValue) { + status_ = Status::NotSupported("Unsupported key type " + + ToString(ikey.type)); + return; + } + // Determine if we can ignore the sequence number and value type from // internal keys by looking at sequence number from first key. We assume // that if first key has a zero sequence number, then all the remaining @@ -82,15 +102,40 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { has_seen_first_key_ = true; smallest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); + key_size_ = is_last_level_file_ ? ikey.user_key.size() : key.size(); + } + if (key_size_ != (is_last_level_file_ ? ikey.user_key.size() : key.size())) { + status_ = Status::NotSupported("all keys have to be the same size"); + return; } // Even if one sequence number is non-zero, then it is not last level. 
assert(!is_last_level_file_ || ikey.sequence == 0); - if (is_last_level_file_) { - kvs_.emplace_back(std::make_pair( - ikey.user_key.ToString(), value.ToString())); + + if (ikey.type == kTypeValue) { + if (!has_seen_first_value_) { + has_seen_first_value_ = true; + value_size_ = value.size(); + } + if (value_size_ != value.size()) { + status_ = Status::NotSupported("all values have to be the same size"); + return; + } + + if (is_last_level_file_) { + kvs_.append(ikey.user_key.data(), ikey.user_key.size()); + } else { + kvs_.append(key.data(), key.size()); + } + kvs_.append(value.data(), value.size()); + ++num_values_; } else { - kvs_.emplace_back(std::make_pair(key.ToString(), value.ToString())); + if (is_last_level_file_) { + deleted_keys_.append(ikey.user_key.data(), ikey.user_key.size()); + } else { + deleted_keys_.append(key.data(), key.size()); + } } + ++num_entries_; // In order to fill the empty buckets in the hash table, we identify a // key which is not used so far (unused_user_key). We determine this by @@ -102,25 +147,52 @@ void CuckooTableBuilder::Add(const Slice& key, const Slice& value) { } else if (ikey.user_key.compare(largest_user_key_) > 0) { largest_user_key_.assign(ikey.user_key.data(), ikey.user_key.size()); } - if (hash_table_size_ < kvs_.size() / max_hash_table_ratio_) { - hash_table_size_ *= 2; + if (!use_module_hash_) { + if (hash_table_size_ < num_entries_ / max_hash_table_ratio_) { + hash_table_size_ *= 2; + } } } +bool CuckooTableBuilder::IsDeletedKey(uint64_t idx) const { + assert(closed_); + return idx >= num_values_; +} + +Slice CuckooTableBuilder::GetKey(uint64_t idx) const { + assert(closed_); + if (IsDeletedKey(idx)) { + return Slice(&deleted_keys_[(idx - num_values_) * key_size_], key_size_); + } + return Slice(&kvs_[idx * (key_size_ + value_size_)], key_size_); +} + +Slice CuckooTableBuilder::GetUserKey(uint64_t idx) const { + assert(closed_); + return is_last_level_file_ ? 
GetKey(idx) : ExtractUserKey(GetKey(idx)); +} + +Slice CuckooTableBuilder::GetValue(uint64_t idx) const { + assert(closed_); + if (IsDeletedKey(idx)) { + static std::string empty_value(value_size_, 'a'); + return Slice(empty_value); + } + return Slice(&kvs_[idx * (key_size_ + value_size_) + key_size_], value_size_); +} + Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { - uint64_t hash_table_size_minus_one = hash_table_size_ - 1; - buckets->resize(hash_table_size_minus_one + cuckoo_block_size_); - uint64_t make_space_for_key_call_id = 0; - for (uint32_t vector_idx = 0; vector_idx < kvs_.size(); vector_idx++) { + buckets->resize(hash_table_size_ + cuckoo_block_size_ - 1); + uint32_t make_space_for_key_call_id = 0; + for (uint32_t vector_idx = 0; vector_idx < num_entries_; vector_idx++) { uint64_t bucket_id; bool bucket_found = false; autovector hash_vals; - Slice user_key = is_last_level_file_ ? kvs_[vector_idx].first : - ExtractUserKey(kvs_[vector_idx].first); + Slice user_key = GetUserKey(vector_idx); for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !bucket_found; ++hash_cnt) { - uint64_t hash_val = CuckooHash(user_key, hash_cnt, - hash_table_size_minus_one, get_slice_hash_); + uint64_t hash_val = CuckooHash(user_key, hash_cnt, use_module_hash_, + hash_table_size_, identity_as_first_hash_, get_slice_hash_); // If there is a collision, check next cuckoo_block_size_ locations for // empty locations. While checking, if we reach end of the hash table, // stop searching and proceed for next hash function. @@ -131,10 +203,8 @@ Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { bucket_found = true; break; } else { - if (ucomp_->Compare(user_key, is_last_level_file_ - ? 
Slice(kvs_[(*buckets)[hash_val].vector_idx].first) - : ExtractUserKey( - kvs_[(*buckets)[hash_val].vector_idx].first)) == 0) { + if (ucomp_->Compare(user_key, + GetUserKey((*buckets)[hash_val].vector_idx)) == 0) { return Status::NotSupported("Same key is being inserted again."); } hash_vals.push_back(hash_val); @@ -149,8 +219,8 @@ Status CuckooTableBuilder::MakeHashTable(std::vector* buckets) { } // We don't really need to rehash the entire table because old hashes are // still valid and we only increased the number of hash functions. - uint64_t hash_val = CuckooHash(user_key, num_hash_func_, - hash_table_size_minus_one, get_slice_hash_); + uint64_t hash_val = CuckooHash(user_key, num_hash_func_, use_module_hash_, + hash_table_size_, identity_as_first_hash_, get_slice_hash_); ++num_hash_func_; for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; ++block_idx, ++hash_val) { @@ -174,14 +244,18 @@ Status CuckooTableBuilder::Finish() { std::vector buckets; Status s; std::string unused_bucket; - if (!kvs_.empty()) { + if (num_entries_ > 0) { + // Calculate the real hash size if module hash is enabled. + if (use_module_hash_) { + hash_table_size_ = num_entries_ / max_hash_table_ratio_; + } s = MakeHashTable(&buckets); if (!s.ok()) { return s; } // Determine unused_user_key to fill empty buckets. std::string unused_user_key = smallest_user_key_; - int curr_pos = unused_user_key.size() - 1; + int curr_pos = static_cast(unused_user_key.size()) - 1; while (curr_pos >= 0) { --unused_user_key[curr_pos]; if (Slice(unused_user_key).compare(smallest_user_key_) < 0) { @@ -192,7 +266,7 @@ Status CuckooTableBuilder::Finish() { if (curr_pos < 0) { // Try using the largest key to identify an unused key. 
unused_user_key = largest_user_key_; - curr_pos = unused_user_key.size() - 1; + curr_pos = static_cast(unused_user_key.size()) - 1; while (curr_pos >= 0) { ++unused_user_key[curr_pos]; if (Slice(unused_user_key).compare(largest_user_key_) > 0) { @@ -211,14 +285,13 @@ Status CuckooTableBuilder::Finish() { AppendInternalKey(&unused_bucket, ikey); } } - properties_.num_entries = kvs_.size(); - properties_.fixed_key_len = unused_bucket.size(); - uint32_t value_length = kvs_.empty() ? 0 : kvs_[0].second.size(); - uint32_t bucket_size = value_length + properties_.fixed_key_len; + properties_.num_entries = num_entries_; + properties_.fixed_key_len = key_size_; properties_.user_collected_properties[ CuckooTablePropertyNames::kValueLength].assign( - reinterpret_cast(&value_length), sizeof(value_length)); + reinterpret_cast(&value_size_), sizeof(value_size_)); + uint64_t bucket_size = key_size_ + value_size_; unused_bucket.resize(bucket_size, 'a'); // Write the table. uint32_t num_added = 0; @@ -227,9 +300,11 @@ Status CuckooTableBuilder::Finish() { s = file_->Append(Slice(unused_bucket)); } else { ++num_added; - s = file_->Append(kvs_[bucket.vector_idx].first); + s = file_->Append(GetKey(bucket.vector_idx)); if (s.ok()) { - s = file_->Append(kvs_[bucket.vector_idx].second); + if (value_size_ > 0) { + s = file_->Append(GetValue(bucket.vector_idx)); + } } } if (!s.ok()) { @@ -238,7 +313,7 @@ Status CuckooTableBuilder::Finish() { } assert(num_added == NumEntries()); properties_.raw_key_size = num_added * properties_.fixed_key_len; - properties_.raw_value_size = num_added * value_length; + properties_.raw_value_size = num_added * value_size_; uint64_t offset = buckets.size() * bucket_size; properties_.data_size = offset; @@ -249,11 +324,10 @@ Status CuckooTableBuilder::Finish() { CuckooTablePropertyNames::kNumHashFunc].assign( reinterpret_cast(&num_hash_func_), sizeof(num_hash_func_)); - uint64_t hash_table_size = buckets.size() - cuckoo_block_size_ + 1; 
properties_.user_collected_properties[ CuckooTablePropertyNames::kHashTableSize].assign( - reinterpret_cast(&hash_table_size), - sizeof(hash_table_size)); + reinterpret_cast(&hash_table_size_), + sizeof(hash_table_size_)); properties_.user_collected_properties[ CuckooTablePropertyNames::kIsLastLevel].assign( reinterpret_cast(&is_last_level_file_), @@ -262,6 +336,19 @@ Status CuckooTableBuilder::Finish() { CuckooTablePropertyNames::kCuckooBlockSize].assign( reinterpret_cast(&cuckoo_block_size_), sizeof(cuckoo_block_size_)); + properties_.user_collected_properties[ + CuckooTablePropertyNames::kIdentityAsFirstHash].assign( + reinterpret_cast(&identity_as_first_hash_), + sizeof(identity_as_first_hash_)); + properties_.user_collected_properties[ + CuckooTablePropertyNames::kUseModuleHash].assign( + reinterpret_cast(&use_module_hash_), + sizeof(use_module_hash_)); + uint32_t user_key_len = static_cast(smallest_user_key_.size()); + properties_.user_collected_properties[ + CuckooTablePropertyNames::kUserKeyLength].assign( + reinterpret_cast(&user_key_len), + sizeof(user_key_len)); // Write meta blocks. MetaIndexBuilder meta_index_builder; @@ -290,7 +377,7 @@ Status CuckooTableBuilder::Finish() { return s; } - Footer footer(kCuckooTableMagicNumber); + Footer footer(kCuckooTableMagicNumber, 1); footer.set_metaindex_handle(meta_index_block_handle); footer.set_index_handle(BlockHandle::NullBlockHandle()); std::string footer_encoding; @@ -305,26 +392,30 @@ void CuckooTableBuilder::Abandon() { } uint64_t CuckooTableBuilder::NumEntries() const { - return kvs_.size(); + return num_entries_; } uint64_t CuckooTableBuilder::FileSize() const { if (closed_) { return file_->GetFileSize(); - } else if (properties_.num_entries == 0) { + } else if (num_entries_ == 0) { return 0; } - // Account for buckets being a power of two. - // As elements are added, file size remains constant for a while and doubles - // its size. 
Since compaction algorithm stops adding elements only after it - // exceeds the file limit, we account for the extra element being added here. - uint64_t expected_hash_table_size = hash_table_size_; - if (expected_hash_table_size < (kvs_.size() + 1) / max_hash_table_ratio_) { - expected_hash_table_size *= 2; + if (use_module_hash_) { + return (key_size_ + value_size_) * num_entries_ / max_hash_table_ratio_; + } else { + // Account for buckets being a power of two. + // As elements are added, file size remains constant for a while and + // doubles its size. Since compaction algorithm stops adding elements + // only after it exceeds the file limit, we account for the extra element + // being added here. + uint64_t expected_hash_table_size = hash_table_size_; + if (expected_hash_table_size < (num_entries_ + 1) / max_hash_table_ratio_) { + expected_hash_table_size *= 2; + } + return (key_size_ + value_size_) * expected_hash_table_size - 1; } - return (kvs_[0].first.size() + kvs_[0].second.size()) * - expected_hash_table_size; } // This method is invoked when there is no place to insert the target key. @@ -339,15 +430,14 @@ uint64_t CuckooTableBuilder::FileSize() const { // If tree depth exceedes max depth, we return false indicating failure. bool CuckooTableBuilder::MakeSpaceForKey( const autovector& hash_vals, - const uint64_t make_space_for_key_call_id, - std::vector* buckets, - uint64_t* bucket_id) { + const uint32_t make_space_for_key_call_id, + std::vector* buckets, uint64_t* bucket_id) { struct CuckooNode { uint64_t bucket_id; uint32_t depth; uint32_t parent_pos; - CuckooNode(uint64_t bucket_id, uint32_t depth, int parent_pos) - : bucket_id(bucket_id), depth(depth), parent_pos(parent_pos) {} + CuckooNode(uint64_t _bucket_id, uint32_t _depth, int _parent_pos) + : bucket_id(_bucket_id), depth(_depth), parent_pos(_parent_pos) {} }; // This is BFS search tree that is stored simply as a vector. // Each node stores the index of parent node in the vector. 
@@ -359,14 +449,12 @@ bool CuckooTableBuilder::MakeSpaceForKey( // of the method. We store this number into the nodes that we explore in // current method call. // It is unlikely for the increment operation to overflow because the maximum - // no. of times this will be called is <= max_num_hash_func_ + kvs_.size(). + // no. of times this will be called is <= max_num_hash_func_ + num_entries_. for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { - uint64_t bucket_id = hash_vals[hash_cnt]; - (*buckets)[bucket_id].make_space_for_key_call_id = - make_space_for_key_call_id; - tree.push_back(CuckooNode(bucket_id, 0, 0)); + uint64_t bid = hash_vals[hash_cnt]; + (*buckets)[bid].make_space_for_key_call_id = make_space_for_key_call_id; + tree.push_back(CuckooNode(bid, 0, 0)); } - uint64_t hash_table_size_minus_one = hash_table_size_ - 1; bool null_found = false; uint32_t curr_pos = 0; while (!null_found && curr_pos < tree.size()) { @@ -378,10 +466,9 @@ bool CuckooTableBuilder::MakeSpaceForKey( CuckooBucket& curr_bucket = (*buckets)[curr_node.bucket_id]; for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_ && !null_found; ++hash_cnt) { - uint64_t child_bucket_id = CuckooHash( - (is_last_level_file_ ? kvs_[curr_bucket.vector_idx].first : - ExtractUserKey(Slice(kvs_[curr_bucket.vector_idx].first))), - hash_cnt, hash_table_size_minus_one, get_slice_hash_); + uint64_t child_bucket_id = CuckooHash(GetUserKey(curr_bucket.vector_idx), + hash_cnt, use_module_hash_, hash_table_size_, identity_as_first_hash_, + get_slice_hash_); // Iterate inside Cuckoo Block. for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; ++block_idx, ++child_bucket_id) { @@ -408,7 +495,7 @@ bool CuckooTableBuilder::MakeSpaceForKey( // child with the parent. Stop when first level is reached in the tree // (happens when 0 <= bucket_to_replace_pos < num_hash_func_) and return // this location in first level for target key to be inserted. 
- uint32_t bucket_to_replace_pos = tree.size()-1; + uint32_t bucket_to_replace_pos = static_cast(tree.size()) - 1; while (bucket_to_replace_pos >= num_hash_func_) { CuckooNode& curr_node = tree[bucket_to_replace_pos]; (*buckets)[curr_node.bucket_id] = diff --git a/table/cuckoo_table_builder.h b/table/cuckoo_table_builder.h index 2bf206102..26c94e1bc 100644 --- a/table/cuckoo_table_builder.h +++ b/table/cuckoo_table_builder.h @@ -24,6 +24,7 @@ class CuckooTableBuilder: public TableBuilder { WritableFile* file, double max_hash_table_ratio, uint32_t max_num_hash_func, uint32_t max_search_depth, const Comparator* user_comparator, uint32_t cuckoo_block_size, + bool use_module_hash, bool identity_as_first_hash, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)); // REQUIRES: Either Finish() or Abandon() has been called. @@ -67,13 +68,16 @@ class CuckooTableBuilder: public TableBuilder { }; static const uint32_t kMaxVectorIdx = std::numeric_limits::max(); - bool MakeSpaceForKey( - const autovector& hash_vals, - const uint64_t call_id, - std::vector* buckets, - uint64_t* bucket_id); + bool MakeSpaceForKey(const autovector& hash_vals, + const uint32_t call_id, + std::vector* buckets, uint64_t* bucket_id); Status MakeHashTable(std::vector* buckets); + inline bool IsDeletedKey(uint64_t idx) const; + inline Slice GetKey(uint64_t idx) const; + inline Slice GetUserKey(uint64_t idx) const; + inline Slice GetValue(uint64_t idx) const; + uint32_t num_hash_func_; WritableFile* file_; const double max_hash_table_ratio_; @@ -82,11 +86,24 @@ class CuckooTableBuilder: public TableBuilder { const uint32_t cuckoo_block_size_; uint64_t hash_table_size_; bool is_last_level_file_; + bool has_seen_first_key_; + bool has_seen_first_value_; + uint64_t key_size_; + uint64_t value_size_; + // A list of fixed-size key-value pairs concatenating into a string. 
+ // Use GetKey(), GetUserKey(), and GetValue() to retrieve a specific + // key / value given an index + std::string kvs_; + std::string deleted_keys_; + // Number of key-value pairs stored in kvs_ + number of deleted keys + uint64_t num_entries_; + // Number of keys that contain value (non-deletion op) + uint64_t num_values_; Status status_; - std::vector> kvs_; TableProperties properties_; - bool has_seen_first_key_; const Comparator* ucomp_; + bool use_module_hash_; + bool identity_as_first_hash_; uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index, uint64_t max_num_buckets); std::string largest_user_key_ = ""; diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo_table_builder_test.cc index 69647d410..ecd23aff5 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo_table_builder_test.cc @@ -50,12 +50,6 @@ class CuckooBuilderTest { TableProperties* props = nullptr; ASSERT_OK(ReadTableProperties(read_file.get(), read_file_size, kCuckooTableMagicNumber, env_, nullptr, &props)); - ASSERT_EQ(props->num_entries, keys.size()); - ASSERT_EQ(props->fixed_key_len, keys.empty() ? 0 : keys[0].size()); - ASSERT_EQ(props->data_size, expected_unused_bucket.size() * - (expected_table_size + expected_cuckoo_block_size - 1)); - ASSERT_EQ(props->raw_key_size, keys.size()*props->fixed_key_len); - // Check unused bucket. std::string unused_key = props->user_collected_properties[ CuckooTablePropertyNames::kEmptyKey]; @@ -83,17 +77,24 @@ class CuckooBuilderTest { *reinterpret_cast(props->user_collected_properties[ CuckooTablePropertyNames::kIsLastLevel].data()); ASSERT_EQ(expected_is_last_level, is_last_level_found); + + ASSERT_EQ(props->num_entries, keys.size()); + ASSERT_EQ(props->fixed_key_len, keys.empty() ? 
0 : keys[0].size()); + ASSERT_EQ(props->data_size, expected_unused_bucket.size() * + (expected_table_size + expected_cuckoo_block_size - 1)); + ASSERT_EQ(props->raw_key_size, keys.size()*props->fixed_key_len); delete props; // Check contents of the bucket. std::vector keys_found(keys.size(), false); - uint32_t bucket_size = expected_unused_bucket.size(); + size_t bucket_size = expected_unused_bucket.size(); for (uint32_t i = 0; i < table_size + cuckoo_block_size - 1; ++i) { Slice read_slice; ASSERT_OK(read_file->Read(i*bucket_size, bucket_size, &read_slice, nullptr)); - uint32_t key_idx = std::find(expected_locations.begin(), - expected_locations.end(), i) - expected_locations.begin(); + size_t key_idx = + std::find(expected_locations.begin(), expected_locations.end(), i) - + expected_locations.begin(); if (key_idx == keys.size()) { // i is not one of the expected locaitons. Empty bucket. ASSERT_EQ(read_slice.compare(expected_unused_bucket), 0); @@ -133,11 +134,12 @@ TEST(CuckooBuilderTest, SuccessWithEmptyFile) { fname = test::TmpDir() + "/EmptyFile"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - 4, 100, BytewiseComparator(), 1, GetSliceHash); + 4, 100, BytewiseComparator(), 1, false, false, GetSliceHash); ASSERT_OK(builder.status()); + ASSERT_EQ(0UL, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); - CheckFileContents({}, {}, {}, "", 0, 2, false); + CheckFileContents({}, {}, {}, "", 2, 2, false); } TEST(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { @@ -155,22 +157,25 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } + uint64_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/NoCollisionFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); 
CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + size_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, @@ -192,22 +197,25 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } + uint64_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + size_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size 
= NextPowOf2(keys.size() / kHashTableRatio); std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, @@ -229,23 +237,27 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } + uint64_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; uint32_t cuckoo_block_size = 2; fname = test::TmpDir() + "/WithCollisionFullKey2"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), cuckoo_block_size, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), cuckoo_block_size, + false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + size_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, @@ -272,22 +284,25 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKey) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } + uint64_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); 
CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + size_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, @@ -311,22 +326,25 @@ TEST(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { for (auto& user_key : user_keys) { keys.push_back(GetInternalKey(user_key, false)); } + uint64_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathFullKeyAndCuckooBlock"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 2, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 2, false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + size_t bucket_size = keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - 
uint32_t expected_table_size = NextPowOf2(keys.size() / kHashTableRatio); std::string expected_unused_bucket = GetInternalKey("key00", true); expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(keys, values, expected_locations, @@ -344,22 +362,25 @@ TEST(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { {user_keys[3], {3, 4, 5, 6}} }; std::vector expected_locations = {0, 1, 2, 3}; + uint64_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/NoCollisionUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + size_t bucket_size = user_keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); std::string expected_unused_bucket = "key00"; expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(user_keys, values, expected_locations, @@ -377,22 +398,25 @@ TEST(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { {user_keys[3], {0, 1, 2, 3}}, }; std::vector expected_locations = {0, 1, 2, 3}; + uint64_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder 
builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + size_t bucket_size = user_keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); std::string expected_unused_bucket = "key00"; expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(user_keys, values, expected_locations, @@ -412,22 +436,25 @@ TEST(CuckooBuilderTest, WithCollisionPathUserKey) { {user_keys[4], {0, 2}}, }; std::vector expected_locations = {0, 1, 3, 4, 2}; + uint64_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 2, BytewiseComparator(), 1, GetSliceHash); + num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); ASSERT_EQ(builder.NumEntries(), i + 1); ASSERT_OK(builder.status()); } + size_t bucket_size = user_keys[0].size() + values[0].size(); + ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); ASSERT_OK(writable_file->Close()); + ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); - 
uint32_t expected_table_size = NextPowOf2(user_keys.size() / kHashTableRatio); std::string expected_unused_bucket = "key00"; expected_unused_bucket += std::string(values[0].size(), 'a'); CheckFileContents(user_keys, values, expected_locations, @@ -453,7 +480,7 @@ TEST(CuckooBuilderTest, FailWhenCollisionPathTooLong) { fname = test::TmpDir() + "/WithCollisionPathUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 2, BytewiseComparator(), 1, GetSliceHash); + num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], false)), Slice("value")); @@ -473,7 +500,7 @@ TEST(CuckooBuilderTest, FailWhenSameKeyInserted) { fname = test::TmpDir() + "/FailWhenSameKeyInserted"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, GetSliceHash); + num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); ASSERT_OK(builder.status()); builder.Add(Slice(GetInternalKey(user_key, false)), Slice("value1")); diff --git a/table/cuckoo_table_factory.cc b/table/cuckoo_table_factory.cc index e2cc6fd89..4afc9fc2e 100644 --- a/table/cuckoo_table_factory.cc +++ b/table/cuckoo_table_factory.cc @@ -11,11 +11,12 @@ #include "table/cuckoo_table_reader.h" namespace rocksdb { -Status CuckooTableFactory::NewTableReader(const Options& options, - const EnvOptions& soptions, const InternalKeyComparator& icomp, + +Status CuckooTableFactory::NewTableReader(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& icomp, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table) const { - std::unique_ptr new_reader(new CuckooTableReader(options, + std::unique_ptr 
new_reader(new CuckooTableReader(ioptions, std::move(file), file_size, icomp.user_comparator(), nullptr)); Status s = new_reader->status(); if (s.ok()) { @@ -25,10 +26,15 @@ Status CuckooTableFactory::NewTableReader(const Options& options, } TableBuilder* CuckooTableFactory::NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const { - return new CuckooTableBuilder(file, hash_table_ratio_, 64, max_search_depth_, - internal_comparator.user_comparator(), cuckoo_block_size_, nullptr); + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType, + const CompressionOptions&) const { + // TODO: change builder to take the option struct + return new CuckooTableBuilder(file, table_options_.hash_table_ratio, 64, + table_options_.max_search_depth, internal_comparator.user_comparator(), + table_options_.cuckoo_block_size, table_options_.use_module_hash, + table_options_.identity_as_first_hash, nullptr); } std::string CuckooTableFactory::GetPrintableTableOptions() const { @@ -38,21 +44,22 @@ std::string CuckooTableFactory::GetPrintableTableOptions() const { char buffer[kBufferSize]; snprintf(buffer, kBufferSize, " hash_table_ratio: %lf\n", - hash_table_ratio_); + table_options_.hash_table_ratio); ret.append(buffer); snprintf(buffer, kBufferSize, " max_search_depth: %u\n", - max_search_depth_); + table_options_.max_search_depth); ret.append(buffer); snprintf(buffer, kBufferSize, " cuckoo_block_size: %u\n", - cuckoo_block_size_); + table_options_.cuckoo_block_size); + ret.append(buffer); + snprintf(buffer, kBufferSize, " identity_as_first_hash: %d\n", + table_options_.identity_as_first_hash); ret.append(buffer); return ret; } -TableFactory* NewCuckooTableFactory(double hash_table_ratio, - uint32_t max_search_depth, uint32_t cuckoo_block_size) { - return new CuckooTableFactory( - hash_table_ratio, 
max_search_depth, cuckoo_block_size); +TableFactory* NewCuckooTableFactory(const CuckooTableOptions& table_options) { + return new CuckooTableFactory(table_options); } } // namespace rocksdb diff --git a/table/cuckoo_table_factory.h b/table/cuckoo_table_factory.h index 06f657d22..625fd9995 100644 --- a/table/cuckoo_table_factory.h +++ b/table/cuckoo_table_factory.h @@ -9,21 +9,33 @@ #include #include "rocksdb/table.h" #include "util/murmurhash.h" +#include "rocksdb/options.h" namespace rocksdb { const uint32_t kCuckooMurmurSeedMultiplier = 816922183; static inline uint64_t CuckooHash( - const Slice& user_key, uint32_t hash_cnt, uint64_t table_size_minus_one, + const Slice& user_key, uint32_t hash_cnt, bool use_module_hash, + uint64_t table_size_, bool identity_as_first_hash, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) { #ifndef NDEBUG // This part is used only in unit tests. if (get_slice_hash != nullptr) { - return get_slice_hash(user_key, hash_cnt, table_size_minus_one + 1); + return get_slice_hash(user_key, hash_cnt, table_size_); } #endif - return MurmurHash(user_key.data(), user_key.size(), - kCuckooMurmurSeedMultiplier * hash_cnt) & table_size_minus_one; + uint64_t value = 0; + if (hash_cnt == 0 && identity_as_first_hash) { + value = (*reinterpret_cast(user_key.data())); + } else { + value = MurmurHash(user_key.data(), static_cast(user_key.size()), + kCuckooMurmurSeedMultiplier * hash_cnt); + } + if (use_module_hash) { + return value % table_size_; + } else { + return value & (table_size_ - 1); + } } // Cuckoo Table is designed for applications that require fast point lookups @@ -35,36 +47,32 @@ static inline uint64_t CuckooHash( // - Does not support Merge operations. 
class CuckooTableFactory : public TableFactory { public: - CuckooTableFactory(double hash_table_ratio, uint32_t max_search_depth, - uint32_t cuckoo_block_size) - : hash_table_ratio_(hash_table_ratio), - max_search_depth_(max_search_depth), - cuckoo_block_size_(cuckoo_block_size) {} + explicit CuckooTableFactory(const CuckooTableOptions& table_options) + : table_options_(table_options) {} ~CuckooTableFactory() {} const char* Name() const override { return "CuckooTable"; } Status NewTableReader( - const Options& options, const EnvOptions& soptions, + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table) const override; - TableBuilder* NewTableBuilder(const Options& options, + TableBuilder* NewTableBuilder(const ImmutableCFOptions& options, const InternalKeyComparator& icomparator, WritableFile* file, - CompressionType compression_type) const override; + const CompressionType, const CompressionOptions&) const override; // Sanitizes the specified DB Options. 
- Status SanitizeDBOptions(DBOptions* db_opts) const override { + Status SanitizeOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override { return Status::OK(); } std::string GetPrintableTableOptions() const override; private: - const double hash_table_ratio_; - const uint32_t max_search_depth_; - const uint32_t cuckoo_block_size_; + const CuckooTableOptions table_options_; }; } // namespace rocksdb diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index f1dcbc3bb..7f017ec7c 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -16,20 +16,23 @@ #include #include #include "rocksdb/iterator.h" +#include "rocksdb/table.h" #include "table/meta_blocks.h" #include "table/cuckoo_table_factory.h" +#include "table/get_context.h" #include "util/arena.h" #include "util/coding.h" namespace rocksdb { namespace { - static const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1); +const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1); +const uint32_t kInvalidIndex = std::numeric_limits::max(); } extern const uint64_t kCuckooTableMagicNumber; CuckooTableReader::CuckooTableReader( - const Options& options, + const ImmutableCFOptions& ioptions, std::unique_ptr&& file, uint64_t file_size, const Comparator* comparator, @@ -37,12 +40,12 @@ CuckooTableReader::CuckooTableReader( : file_(std::move(file)), ucomp_(comparator), get_slice_hash_(get_slice_hash) { - if (!options.allow_mmap_reads) { + if (!ioptions.allow_mmap_reads) { status_ = Status::InvalidArgument("File is not mmaped"); } TableProperties* props = nullptr; status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber, - options.env, options.info_log.get(), &props); + ioptions.env, ioptions.info_log, &props); if (!status_.ok()) { return; } @@ -50,21 +53,29 @@ CuckooTableReader::CuckooTableReader( auto& user_props = props->user_collected_properties; auto hash_funs = 
user_props.find(CuckooTablePropertyNames::kNumHashFunc); if (hash_funs == user_props.end()) { - status_ = Status::InvalidArgument("Number of hash functions not found"); + status_ = Status::Corruption("Number of hash functions not found"); return; } num_hash_func_ = *reinterpret_cast(hash_funs->second.data()); auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey); if (unused_key == user_props.end()) { - status_ = Status::InvalidArgument("Empty bucket value not found"); + status_ = Status::Corruption("Empty bucket value not found"); return; } unused_key_ = unused_key->second; - key_length_ = props->fixed_key_len; + key_length_ = static_cast(props->fixed_key_len); + auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength); + if (user_key_len == user_props.end()) { + status_ = Status::Corruption("User key length not found"); + return; + } + user_key_length_ = *reinterpret_cast( + user_key_len->second.data()); + auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength); if (value_length == user_props.end()) { - status_ = Status::InvalidArgument("Value length not found"); + status_ = Status::Corruption("Value length not found"); return; } value_length_ = *reinterpret_cast( @@ -74,21 +85,40 @@ CuckooTableReader::CuckooTableReader( auto hash_table_size = user_props.find( CuckooTablePropertyNames::kHashTableSize); if (hash_table_size == user_props.end()) { - status_ = Status::InvalidArgument("Hash table size not found"); + status_ = Status::Corruption("Hash table size not found"); return; } - table_size_minus_one_ = *reinterpret_cast( - hash_table_size->second.data()) - 1; + table_size_ = *reinterpret_cast( + hash_table_size->second.data()); + auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel); if (is_last_level == user_props.end()) { - status_ = Status::InvalidArgument("Is last level not found"); + status_ = Status::Corruption("Is last level not found"); return; } is_last_level_ = 
*reinterpret_cast(is_last_level->second.data()); + + auto identity_as_first_hash = user_props.find( + CuckooTablePropertyNames::kIdentityAsFirstHash); + if (identity_as_first_hash == user_props.end()) { + status_ = Status::Corruption("identity as first hash not found"); + return; + } + identity_as_first_hash_ = *reinterpret_cast( + identity_as_first_hash->second.data()); + + auto use_module_hash = user_props.find( + CuckooTablePropertyNames::kUseModuleHash); + if (use_module_hash == user_props.end()) { + status_ = Status::Corruption("hash type is not found"); + return; + } + use_module_hash_ = *reinterpret_cast( + use_module_hash->second.data()); auto cuckoo_block_size = user_props.find( CuckooTablePropertyNames::kCuckooBlockSize); if (cuckoo_block_size == user_props.end()) { - status_ = Status::InvalidArgument("Cuckoo block size not found"); + status_ = Status::Corruption("Cuckoo block size not found"); return; } cuckoo_block_size_ = *reinterpret_cast( @@ -97,36 +127,32 @@ CuckooTableReader::CuckooTableReader( status_ = file_->Read(0, file_size, &file_data_, nullptr); } -Status CuckooTableReader::Get( - const ReadOptions& readOptions, const Slice& key, void* handle_context, - bool (*result_handler)(void* arg, const ParsedInternalKey& k, - const Slice& v), - void (*mark_key_may_exist_handler)(void* handle_context)) { +Status CuckooTableReader::Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context) { assert(key.size() == key_length_ + (is_last_level_ ? 
8 : 0)); Slice user_key = ExtractUserKey(key); for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { uint64_t offset = bucket_length_ * CuckooHash( - user_key, hash_cnt, table_size_minus_one_, get_slice_hash_); + user_key, hash_cnt, use_module_hash_, table_size_, + identity_as_first_hash_, get_slice_hash_); const char* bucket = &file_data_.data()[offset]; for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; - ++block_idx, bucket += bucket_length_) { + ++block_idx, bucket += bucket_length_) { if (ucomp_->Compare(Slice(unused_key_.data(), user_key.size()), - Slice(bucket, user_key.size())) == 0) { + Slice(bucket, user_key.size())) == 0) { return Status::OK(); } // Here, we compare only the user key part as we support only one entry // per user key and we don't support sanpshot. if (ucomp_->Compare(user_key, Slice(bucket, user_key.size())) == 0) { - Slice value = Slice(&bucket[key_length_], value_length_); + Slice value(bucket + key_length_, value_length_); if (is_last_level_) { - ParsedInternalKey found_ikey( - Slice(bucket, key_length_), 0, kTypeValue); - result_handler(handle_context, found_ikey, value); + get_context->SaveValue(value); } else { Slice full_key(bucket, key_length_); ParsedInternalKey found_ikey; ParseInternalKey(full_key, &found_ikey); - result_handler(handle_context, found_ikey, value); + get_context->SaveValue(found_ikey, value); } // We don't support merge operations. So, we return here. return Status::OK(); @@ -140,7 +166,8 @@ void CuckooTableReader::Prepare(const Slice& key) { // Prefetch the first Cuckoo Block. 
Slice user_key = ExtractUserKey(key); uint64_t addr = reinterpret_cast(file_data_.data()) + - bucket_length_ * CuckooHash(user_key, 0, table_size_minus_one_, nullptr); + bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_, + identity_as_first_hash_, nullptr); uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_; for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) { PREFETCH(reinterpret_cast(addr), 0, 3); @@ -160,33 +187,43 @@ class CuckooTableIterator : public Iterator { Slice key() const override; Slice value() const override; Status status() const override { return status_; } - void LoadKeysFromReader(); + void InitIfNeeded(); private: - struct CompareKeys { - CompareKeys(const Comparator* ucomp, const bool last_level) - : ucomp_(ucomp), - is_last_level_(last_level) {} - bool operator()(const std::pair& first, - const std::pair& second) const { - if (is_last_level_) { - return ucomp_->Compare(first.first, second.first) < 0; - } else { - return ucomp_->Compare(ExtractUserKey(first.first), - ExtractUserKey(second.first)) < 0; - } + struct BucketComparator { + BucketComparator(const Slice& file_data, const Comparator* ucomp, + uint32_t bucket_len, uint32_t user_key_len, + const Slice& target = Slice()) + : file_data_(file_data), + ucomp_(ucomp), + bucket_len_(bucket_len), + user_key_len_(user_key_len), + target_(target) {} + bool operator()(const uint32_t first, const uint32_t second) const { + const char* first_bucket = + (first == kInvalidIndex) ? target_.data() : + &file_data_.data()[first * bucket_len_]; + const char* second_bucket = + (second == kInvalidIndex) ? 
target_.data() : + &file_data_.data()[second * bucket_len_]; + return ucomp_->Compare(Slice(first_bucket, user_key_len_), + Slice(second_bucket, user_key_len_)) < 0; } - private: + const Slice file_data_; const Comparator* ucomp_; - const bool is_last_level_; + const uint32_t bucket_len_; + const uint32_t user_key_len_; + const Slice target_; }; - const CompareKeys comparator_; + + const BucketComparator bucket_comparator_; void PrepareKVAtCurrIdx(); CuckooTableReader* reader_; + bool initialized_; Status status_; // Contains a map of keys to bucket_id sorted in key order. - std::vector> key_to_bucket_id_; + std::vector sorted_bucket_ids_; // We assume that the number of items can be stored in uint32 (4 Billion). uint32_t curr_key_idx_; Slice curr_value_; @@ -197,57 +234,67 @@ class CuckooTableIterator : public Iterator { }; CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader) - : comparator_(reader->ucomp_, reader->is_last_level_), + : bucket_comparator_(reader->file_data_, reader->ucomp_, + reader->bucket_length_, reader->user_key_length_), reader_(reader), - curr_key_idx_(std::numeric_limits::max()) { - key_to_bucket_id_.clear(); + initialized_(false), + curr_key_idx_(kInvalidIndex) { + sorted_bucket_ids_.clear(); curr_value_.clear(); curr_key_.Clear(); } -void CuckooTableIterator::LoadKeysFromReader() { - key_to_bucket_id_.reserve(reader_->GetTableProperties()->num_entries); - uint64_t num_buckets = reader_->table_size_minus_one_ + - reader_->cuckoo_block_size_; - for (uint32_t bucket_id = 0; bucket_id < num_buckets; bucket_id++) { - Slice read_key; - status_ = reader_->file_->Read(bucket_id * reader_->bucket_length_, - reader_->key_length_, &read_key, nullptr); - if (read_key != Slice(reader_->unused_key_)) { - key_to_bucket_id_.push_back(std::make_pair(read_key, bucket_id)); +void CuckooTableIterator::InitIfNeeded() { + if (initialized_) { + return; + } + sorted_bucket_ids_.reserve(reader_->GetTableProperties()->num_entries); + uint64_t 
num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1; + assert(num_buckets < kInvalidIndex); + const char* bucket = reader_->file_data_.data(); + for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) { + if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) { + sorted_bucket_ids_.push_back(bucket_id); } + bucket += reader_->bucket_length_; } - assert(key_to_bucket_id_.size() == + assert(sorted_bucket_ids_.size() == reader_->GetTableProperties()->num_entries); - std::sort(key_to_bucket_id_.begin(), key_to_bucket_id_.end(), comparator_); - curr_key_idx_ = key_to_bucket_id_.size(); + std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(), + bucket_comparator_); + curr_key_idx_ = kInvalidIndex; + initialized_ = true; } void CuckooTableIterator::SeekToFirst() { + InitIfNeeded(); curr_key_idx_ = 0; PrepareKVAtCurrIdx(); } void CuckooTableIterator::SeekToLast() { - curr_key_idx_ = key_to_bucket_id_.size() - 1; + InitIfNeeded(); + curr_key_idx_ = static_cast(sorted_bucket_ids_.size()) - 1; PrepareKVAtCurrIdx(); } void CuckooTableIterator::Seek(const Slice& target) { - // We assume that the target is an internal key. If this is last level file, - // we need to take only the user key part to seek. - Slice target_to_search = reader_->is_last_level_ ? 
- ExtractUserKey(target) : target; - auto seek_it = std::lower_bound(key_to_bucket_id_.begin(), - key_to_bucket_id_.end(), - std::make_pair(target_to_search, 0), - comparator_); - curr_key_idx_ = std::distance(key_to_bucket_id_.begin(), seek_it); + InitIfNeeded(); + const BucketComparator seek_comparator( + reader_->file_data_, reader_->ucomp_, + reader_->bucket_length_, reader_->user_key_length_, + ExtractUserKey(target)); + auto seek_it = std::lower_bound(sorted_bucket_ids_.begin(), + sorted_bucket_ids_.end(), + kInvalidIndex, + seek_comparator); + curr_key_idx_ = + static_cast(std::distance(sorted_bucket_ids_.begin(), seek_it)); PrepareKVAtCurrIdx(); } bool CuckooTableIterator::Valid() const { - return curr_key_idx_ < key_to_bucket_id_.size(); + return curr_key_idx_ < sorted_bucket_ids_.size(); } void CuckooTableIterator::PrepareKVAtCurrIdx() { @@ -256,15 +303,17 @@ void CuckooTableIterator::PrepareKVAtCurrIdx() { curr_key_.Clear(); return; } - uint64_t offset = ((uint64_t) key_to_bucket_id_[curr_key_idx_].second - * reader_->bucket_length_) + reader_->key_length_; - status_ = reader_->file_->Read(offset, reader_->value_length_, - &curr_value_, nullptr); + uint32_t id = sorted_bucket_ids_[curr_key_idx_]; + const char* offset = reader_->file_data_.data() + + id * reader_->bucket_length_; if (reader_->is_last_level_) { // Always return internal key. 
- curr_key_.SetInternalKey( - key_to_bucket_id_[curr_key_idx_].first, 0, kTypeValue); + curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_), + 0, kTypeValue); + } else { + curr_key_.SetKey(Slice(offset, reader_->key_length_)); } + curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_); } void CuckooTableIterator::Next() { @@ -279,7 +328,7 @@ void CuckooTableIterator::Next() { void CuckooTableIterator::Prev() { if (curr_key_idx_ == 0) { - curr_key_idx_ = key_to_bucket_id_.size(); + curr_key_idx_ = static_cast(sorted_bucket_ids_.size()); } if (!Valid()) { curr_value_.clear(); @@ -292,11 +341,7 @@ void CuckooTableIterator::Prev() { Slice CuckooTableIterator::key() const { assert(Valid()); - if (reader_->is_last_level_) { - return curr_key_.GetKey(); - } else { - return key_to_bucket_id_[curr_key_idx_].first; - } + return curr_key_.GetKey(); } Slice CuckooTableIterator::value() const { @@ -323,9 +368,6 @@ Iterator* CuckooTableReader::NewIterator( auto iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator)); iter = new (iter_mem) CuckooTableIterator(this); } - if (iter->status().ok()) { - iter->LoadKeysFromReader(); - } return iter; } diff --git a/table/cuckoo_table_reader.h b/table/cuckoo_table_reader.h index 05d5c3397..4f00a9e41 100644 --- a/table/cuckoo_table_reader.h +++ b/table/cuckoo_table_reader.h @@ -16,6 +16,7 @@ #include "db/dbformat.h" #include "rocksdb/env.h" +#include "rocksdb/options.h" #include "table/table_reader.h" namespace rocksdb { @@ -26,7 +27,7 @@ class TableReader; class CuckooTableReader: public TableReader { public: CuckooTableReader( - const Options& options, + const ImmutableCFOptions& ioptions, std::unique_ptr&& file, uint64_t file_size, const Comparator* user_comparator, @@ -39,12 +40,8 @@ class CuckooTableReader: public TableReader { Status status() const { return status_; } - Status Get( - const ReadOptions& readOptions, const Slice& key, void* handle_context, - bool (*result_handler)(void* arg, 
const ParsedInternalKey& k, - const Slice& v), - void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) - override; + Status Get(const ReadOptions& read_options, const Slice& key, + GetContext* get_context) override; Iterator* NewIterator(const ReadOptions&, Arena* arena = nullptr) override; void Prepare(const Slice& target) override; @@ -63,16 +60,19 @@ class CuckooTableReader: public TableReader { std::unique_ptr file_; Slice file_data_; bool is_last_level_; + bool identity_as_first_hash_; + bool use_module_hash_; std::shared_ptr table_props_; Status status_; uint32_t num_hash_func_; std::string unused_key_; uint32_t key_length_; + uint32_t user_key_length_; uint32_t value_length_; uint32_t bucket_length_; uint32_t cuckoo_block_size_; uint32_t cuckoo_block_bytes_minus_one_; - uint64_t table_size_minus_one_; + uint64_t table_size_; const Comparator* ucomp_; uint64_t (*get_slice_hash_)(const Slice& s, uint32_t index, uint64_t max_num_buckets); diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index 63fe0ae5b..aaeb3956c 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -11,7 +11,10 @@ int main() { } #else +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include @@ -22,6 +25,7 @@ int main() { #include "table/cuckoo_table_builder.h" #include "table/cuckoo_table_reader.h" #include "table/cuckoo_table_factory.h" +#include "table/get_context.h" #include "util/arena.h" #include "util/random.h" #include "util/testharness.h" @@ -35,6 +39,7 @@ DEFINE_string(file_dir, "", "Directory where the files will be created" DEFINE_bool(enable_perf, false, "Run Benchmark Tests too."); DEFINE_bool(write, false, "Should write new values to file in performance tests?"); +DEFINE_bool(identity_as_first_hash, true, "use identity as first hash"); namespace rocksdb { @@ -57,25 +62,6 @@ uint64_t GetSliceHash(const Slice& s, uint32_t index, return 
hash_map[s.ToString()][index]; } -// Methods, variables for checking key and values read. -struct ValuesToAssert { - ValuesToAssert(const std::string& key, const Slice& value) - : expected_user_key(key), - expected_value(value), - call_count(0) {} - std::string expected_user_key; - Slice expected_value; - int call_count; -}; - -bool AssertValues(void* assert_obj, - const ParsedInternalKey& k, const Slice& v) { - ValuesToAssert *ptr = reinterpret_cast(assert_obj); - ASSERT_EQ(ptr->expected_value.ToString(), v.ToString()); - ASSERT_EQ(ptr->expected_user_key, k.user_key.ToString()); - ++ptr->call_count; - return false; -} } // namespace class CuckooReaderTest { @@ -86,8 +72,8 @@ class CuckooReaderTest { env_options = EnvOptions(options); } - void SetUp(int num_items) { - this->num_items = num_items; + void SetUp(int num) { + num_items = num; hash_map.clear(); keys.clear(); keys.resize(num_items); @@ -106,7 +92,8 @@ class CuckooReaderTest { std::unique_ptr writable_file; ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); CuckooTableBuilder builder( - writable_file.get(), 0.9, kNumHashFunc, 100, ucomp, 2, GetSliceHash); + writable_file.get(), 0.9, kNumHashFunc, 100, ucomp, 2, + false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t key_idx = 0; key_idx < num_items; ++key_idx) { builder.Add(Slice(keys[key_idx]), Slice(values[key_idx])); @@ -121,18 +108,22 @@ class CuckooReaderTest { // Check reader now. 
std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - options, + ioptions, std::move(read_file), file_size, ucomp, GetSliceHash); ASSERT_OK(reader.status()); + // Assume no merge/deletion for (uint32_t i = 0; i < num_items; ++i) { - ValuesToAssert v(user_keys[i], values[i]); - ASSERT_OK(reader.Get( - ReadOptions(), Slice(keys[i]), &v, AssertValues, nullptr)); - ASSERT_EQ(1, v.call_count); + std::string value; + GetContext get_context(ucomp, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(user_keys[i]), &value, + nullptr, nullptr); + ASSERT_OK(reader.Get(ReadOptions(), Slice(keys[i]), &get_context)); + ASSERT_EQ(values[i], value); } } void UpdateKeys(bool with_zero_seqno) { @@ -147,8 +138,9 @@ class CuckooReaderTest { void CheckIterator(const Comparator* ucomp = BytewiseComparator()) { std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - options, + ioptions, std::move(read_file), file_size, ucomp, @@ -169,7 +161,7 @@ class CuckooReaderTest { ASSERT_EQ(static_cast(cnt), num_items); it->SeekToLast(); - cnt = num_items - 1; + cnt = static_cast(num_items) - 1; ASSERT_TRUE(it->Valid()); while (it->Valid()) { ASSERT_OK(it->status()); @@ -180,7 +172,7 @@ class CuckooReaderTest { } ASSERT_EQ(cnt, -1); - cnt = num_items / 2; + cnt = static_cast(num_items) / 2; it->Seek(keys[cnt]); while (it->Valid()) { ASSERT_OK(it->status()); @@ -322,14 +314,16 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { // Make all hash values collide. 
AddHashLookups(user_keys[i], 0, kNumHashFunc); } + auto* ucmp = BytewiseComparator(); CreateCuckooFileAndCheckReader(); std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - options, + ioptions, std::move(read_file), file_size, - BytewiseComparator(), + ucmp, GetSliceHash); ASSERT_OK(reader.status()); // Search for a key with colliding hash values. @@ -338,10 +332,11 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { AddHashLookups(not_found_user_key, 0, kNumHashFunc); ParsedInternalKey ikey(not_found_user_key, 1000, kTypeValue); AppendInternalKey(¬_found_key, ikey); - ValuesToAssert v("", ""); - ASSERT_OK(reader.Get( - ReadOptions(), Slice(not_found_key), &v, AssertValues, nullptr)); - ASSERT_EQ(0, v.call_count); + std::string value; + GetContext get_context(ucmp, nullptr, nullptr, nullptr, GetContext::kNotFound, + Slice(not_found_key), &value, nullptr, nullptr); + ASSERT_OK(reader.Get(ReadOptions(), Slice(not_found_key), &get_context)); + ASSERT_TRUE(value.empty()); ASSERT_OK(reader.status()); // Search for a key with an independent hash value. std::string not_found_user_key2 = "key" + NumToStr(num_items + 1); @@ -349,9 +344,11 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { ParsedInternalKey ikey2(not_found_user_key2, 1000, kTypeValue); std::string not_found_key2; AppendInternalKey(¬_found_key2, ikey2); - ASSERT_OK(reader.Get( - ReadOptions(), Slice(not_found_key2), &v, AssertValues, nullptr)); - ASSERT_EQ(0, v.call_count); + GetContext get_context2(ucmp, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(not_found_key2), &value, + nullptr, nullptr); + ASSERT_OK(reader.Get(ReadOptions(), Slice(not_found_key2), &get_context2)); + ASSERT_TRUE(value.empty()); ASSERT_OK(reader.status()); // Test read when key is unused key. @@ -361,34 +358,25 @@ TEST(CuckooReaderTest, WhenKeyNotFound) { // Add hash values that map to empty buckets. 
AddHashLookups(ExtractUserKey(unused_key).ToString(), kNumHashFunc, kNumHashFunc); - ASSERT_OK(reader.Get( - ReadOptions(), Slice(unused_key), &v, AssertValues, nullptr)); - ASSERT_EQ(0, v.call_count); + GetContext get_context3(ucmp, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(unused_key), &value, + nullptr, nullptr); + ASSERT_OK(reader.Get(ReadOptions(), Slice(unused_key), &get_context3)); + ASSERT_TRUE(value.empty()); ASSERT_OK(reader.status()); } // Performance tests namespace { -bool DoNothing(void* arg, const ParsedInternalKey& k, const Slice& v) { - // Deliberately empty. - return false; -} - -bool CheckValue(void* cnt_ptr, const ParsedInternalKey& k, const Slice& v) { - ++*reinterpret_cast(cnt_ptr); - std::string expected_value; - AppendInternalKey(&expected_value, k); - ASSERT_EQ(0, v.compare(Slice(&expected_value[0], v.size()))); - return false; -} - void GetKeys(uint64_t num, std::vector* keys) { + keys->clear(); IterKey k; k.SetInternalKey("", 0, kTypeValue); std::string internal_key_suffix = k.GetKey().ToString(); ASSERT_EQ(static_cast(8), internal_key_suffix.size()); for (uint64_t key_idx = 0; key_idx < num; ++key_idx) { - std::string new_key(reinterpret_cast(&key_idx), sizeof(key_idx)); + uint64_t value = 2 * key_idx; + std::string new_key(reinterpret_cast(&value), sizeof(value)); new_key += internal_key_suffix; keys->push_back(new_key); } @@ -399,7 +387,7 @@ std::string GetFileName(uint64_t num) { FLAGS_file_dir = test::TmpDir(); } return FLAGS_file_dir + "/cuckoo_read_benchmark" + - std::to_string(num/1000000) + "Mkeys"; + ToString(num/1000000) + "Mkeys"; } // Create last level file as we are interested in measuring performance of @@ -416,7 +404,8 @@ void WriteFile(const std::vector& keys, ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); CuckooTableBuilder builder( writable_file.get(), hash_ratio, - 64, 1000, test::Uint64Comparator(), 5, nullptr); + 64, 1000, test::Uint64Comparator(), 5, + false, 
FLAGS_identity_as_first_hash, nullptr); ASSERT_OK(builder.status()); for (uint64_t key_idx = 0; key_idx < num; ++key_idx) { // Value is just a part of key. @@ -433,18 +422,21 @@ void WriteFile(const std::vector& keys, std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - options, std::move(read_file), file_size, + ioptions, std::move(read_file), file_size, test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); ReadOptions r_options; + std::string value; + // Assume only the fast path is triggered + GetContext get_context(nullptr, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(), &value, + nullptr, nullptr); for (uint64_t i = 0; i < num; ++i) { - int cnt = 0; - ASSERT_OK(reader.Get(r_options, Slice(keys[i]), &cnt, CheckValue, nullptr)); - if (cnt != 1) { - fprintf(stderr, "%" PRIu64 " not found.\n", i); - ASSERT_EQ(1, cnt); - } + value.clear(); + ASSERT_OK(reader.Get(r_options, Slice(keys[i]), &get_context)); + ASSERT_TRUE(Slice(keys[i]) == Slice(&keys[i][0], 4)); } } @@ -460,8 +452,9 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + const ImmutableCFOptions ioptions(options); CuckooTableReader reader( - options, std::move(read_file), file_size, test::Uint64Comparator(), + ioptions, std::move(read_file), file_size, test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); const UserCollectedProperties user_props = @@ -474,21 +467,33 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { " hash functions: %u.\n", num, num * 100.0 / (table_size), num_hash_fun); ReadOptions r_options; + std::vector keys; + keys.reserve(num); + for (uint64_t i = 0; i < num; ++i) { + keys.push_back(2 * i); + } + std::random_shuffle(keys.begin(), keys.end()); + + std::string value; + // Assume only the fast path is triggered + GetContext 
get_context(nullptr, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(), &value, + nullptr, nullptr); uint64_t start_time = env->NowMicros(); if (batch_size > 0) { for (uint64_t i = 0; i < num; i += batch_size) { for (uint64_t j = i; j < i+batch_size && j < num; ++j) { - reader.Prepare(Slice(reinterpret_cast(&j), 16)); + reader.Prepare(Slice(reinterpret_cast(&keys[j]), 16)); } for (uint64_t j = i; j < i+batch_size && j < num; ++j) { - reader.Get(r_options, Slice(reinterpret_cast(&j), 16), - nullptr, DoNothing, nullptr); + reader.Get(r_options, Slice(reinterpret_cast(&keys[j]), 16), + &get_context); } } } else { for (uint64_t i = 0; i < num; i++) { - reader.Get(r_options, Slice(reinterpret_cast(&i), 16), nullptr, - DoNothing, nullptr); + reader.Get(r_options, Slice(reinterpret_cast(&keys[i]), 16), + &get_context); } } float time_per_op = (env->NowMicros() - start_time) * 1.0 / num; @@ -506,17 +511,17 @@ TEST(CuckooReaderTest, TestReadPerformance) { // These numbers are chosen to have a hash utilizaiton % close to // 0.9, 0.75, 0.6 and 0.5 respectively. // They all create 128 M buckets. - std::vector nums = {120*1000*1000, 100*1000*1000, 80*1000*1000, - 70*1000*1000}; + std::vector nums = {120*1024*1024, 100*1024*1024, 80*1024*1024, + 70*1024*1024}; #ifndef NDEBUG fprintf(stdout, "WARNING: Not compiled with DNDEBUG. Performance tests may be slow.\n"); #endif - std::vector keys; - GetKeys(*std::max_element(nums.begin(), nums.end()), &keys); for (uint64_t num : nums) { if (FLAGS_write || !Env::Default()->FileExists(GetFileName(num))) { - WriteFile(keys, num, hash_ratio); + std::vector all_keys; + GetKeys(num, &all_keys); + WriteFile(all_keys, num, hash_ratio); } ReadKeys(num, 0); ReadKeys(num, 10); diff --git a/table/filter_block.cc b/table/filter_block.cc deleted file mode 100644 index 6b4ff1c10..000000000 --- a/table/filter_block.cc +++ /dev/null @@ -1,186 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
-// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2012 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "table/filter_block.h" - -#include "db/dbformat.h" -#include "rocksdb/filter_policy.h" -#include "util/coding.h" - -namespace rocksdb { - -// See doc/table_format.txt for an explanation of the filter block format. - -// Generate new filter every 2KB of data -static const size_t kFilterBaseLg = 11; -static const size_t kFilterBase = 1 << kFilterBaseLg; - -FilterBlockBuilder::FilterBlockBuilder(const Options& opt, - const BlockBasedTableOptions& table_opt, - const Comparator* internal_comparator) - : policy_(table_opt.filter_policy.get()), - prefix_extractor_(opt.prefix_extractor.get()), - whole_key_filtering_(table_opt.whole_key_filtering), - comparator_(internal_comparator) {} - -void FilterBlockBuilder::StartBlock(uint64_t block_offset) { - uint64_t filter_index = (block_offset / kFilterBase); - assert(filter_index >= filter_offsets_.size()); - while (filter_index > filter_offsets_.size()) { - GenerateFilter(); - } -} - -bool FilterBlockBuilder::SamePrefix(const Slice &key1, - const Slice &key2) const { - if (!prefix_extractor_->InDomain(key1) && - !prefix_extractor_->InDomain(key2)) { - return true; - } else if (!prefix_extractor_->InDomain(key1) || - !prefix_extractor_->InDomain(key2)) { - return false; - } else { - return (prefix_extractor_->Transform(key1) == - prefix_extractor_->Transform(key2)); - } -} - -void FilterBlockBuilder::AddKey(const Slice& key) { - // get slice for most recently added entry - Slice prev; - size_t added_to_start = 0; - - // add key to filter if needed - if 
(whole_key_filtering_) { - start_.push_back(entries_.size()); - ++added_to_start; - entries_.append(key.data(), key.size()); - } - - if (start_.size() > added_to_start) { - size_t prev_start = start_[start_.size() - 1 - added_to_start]; - const char* base = entries_.data() + prev_start; - size_t length = entries_.size() - prev_start; - prev = Slice(base, length); - } - - // add prefix to filter if needed - if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { - // this assumes prefix(prefix(key)) == prefix(key), as the last - // entry in entries_ may be either a key or prefix, and we use - // prefix(last entry) to get the prefix of the last key. - if (prev.size() == 0 || !SamePrefix(key, prev)) { - Slice prefix = prefix_extractor_->Transform(key); - start_.push_back(entries_.size()); - entries_.append(prefix.data(), prefix.size()); - } - } -} - -Slice FilterBlockBuilder::Finish() { - if (!start_.empty()) { - GenerateFilter(); - } - - // Append array of per-filter offsets - const uint32_t array_offset = result_.size(); - for (size_t i = 0; i < filter_offsets_.size(); i++) { - PutFixed32(&result_, filter_offsets_[i]); - } - - PutFixed32(&result_, array_offset); - result_.push_back(kFilterBaseLg); // Save encoding parameter in result - return Slice(result_); -} - -void FilterBlockBuilder::GenerateFilter() { - const size_t num_entries = start_.size(); - if (num_entries == 0) { - // Fast path if there are no keys for this filter - filter_offsets_.push_back(result_.size()); - return; - } - - // Make list of keys from flattened key structure - start_.push_back(entries_.size()); // Simplify length computation - tmp_entries_.resize(num_entries); - for (size_t i = 0; i < num_entries; i++) { - const char* base = entries_.data() + start_[i]; - size_t length = start_[i+1] - start_[i]; - tmp_entries_[i] = Slice(base, length); - } - - // Generate filter for current set of keys and append to result_. 
- filter_offsets_.push_back(result_.size()); - policy_->CreateFilter(&tmp_entries_[0], num_entries, &result_); - - tmp_entries_.clear(); - entries_.clear(); - start_.clear(); -} - -FilterBlockReader::FilterBlockReader( - const Options& opt, const BlockBasedTableOptions& table_opt, - const Slice& contents, bool delete_contents_after_use) - : policy_(table_opt.filter_policy.get()), - prefix_extractor_(opt.prefix_extractor.get()), - whole_key_filtering_(table_opt.whole_key_filtering), - data_(nullptr), - offset_(nullptr), - num_(0), - base_lg_(0) { - size_t n = contents.size(); - if (n < 5) return; // 1 byte for base_lg_ and 4 for start of offset array - base_lg_ = contents[n-1]; - uint32_t last_word = DecodeFixed32(contents.data() + n - 5); - if (last_word > n - 5) return; - data_ = contents.data(); - offset_ = data_ + last_word; - num_ = (n - 5 - last_word) / 4; - if (delete_contents_after_use) { - filter_data.reset(contents.data()); - } -} - -bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, - const Slice& key) { - if (!whole_key_filtering_) { - return true; - } - return MayMatch(block_offset, key); -} - -bool FilterBlockReader::PrefixMayMatch(uint64_t block_offset, - const Slice& prefix) { - if (!prefix_extractor_) { - return true; - } - return MayMatch(block_offset, prefix); -} - -bool FilterBlockReader::MayMatch(uint64_t block_offset, const Slice& entry) { - uint64_t index = block_offset >> base_lg_; - if (index < num_) { - uint32_t start = DecodeFixed32(offset_ + index*4); - uint32_t limit = DecodeFixed32(offset_ + index*4 + 4); - if (start <= limit && limit <= (uint32_t)(offset_ - data_)) { - Slice filter = Slice(data_ + start, limit - start); - return policy_->KeyMayMatch(entry, filter); - } else if (start == limit) { - // Empty filters do not match any entries - return false; - } - } - return true; // Errors are treated as potential matches -} - -size_t FilterBlockReader::ApproximateMemoryUsage() const { - return num_ * 4 + 5 + (offset_ - data_); -} 
-} diff --git a/table/filter_block.h b/table/filter_block.h index 5041393f6..855a23169 100644 --- a/table/filter_block.h +++ b/table/filter_block.h @@ -10,6 +10,11 @@ // A filter block is stored near the end of a Table file. It contains // filters (e.g., bloom filters) for all data blocks in the table combined // into a single filter block. +// +// It is a base class for BlockBasedFilter and FullFilter. +// These two are both used in BlockBasedTable. The first one contain filter +// For a part of keys in sst file, the second contain filter for all keys +// in sst file. #pragma once @@ -23,9 +28,11 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/table.h" #include "util/hash.h" +#include "format.h" namespace rocksdb { +const uint64_t kNotValid = ULLONG_MAX; class FilterPolicy; // A FilterBlockBuilder is used to construct all of the filters for a @@ -33,64 +40,51 @@ class FilterPolicy; // a special block in the Table. // // The sequence of calls to FilterBlockBuilder must match the regexp: -// (StartBlock AddKey*)* Finish +// (StartBlock Add*)* Finish +// +// BlockBased/Full FilterBlock would be called in the same way. class FilterBlockBuilder { public: - explicit FilterBlockBuilder(const Options& opt, - const BlockBasedTableOptions& table_opt, - const Comparator* internal_comparator); + explicit FilterBlockBuilder() {} + virtual ~FilterBlockBuilder() {} - void StartBlock(uint64_t block_offset); - void AddKey(const Slice& key); - Slice Finish(); + virtual bool IsBlockBased() = 0; // If is blockbased filter + virtual void StartBlock(uint64_t block_offset) = 0; // Start new block filter + virtual void Add(const Slice& key) = 0; // Add a key to current filter + virtual Slice Finish() = 0; // Generate Filter private: - bool SamePrefix(const Slice &key1, const Slice &key2) const; - void GenerateFilter(); - - // important: all of these might point to invalid addresses - // at the time of destruction of this filter block. 
destructor - // should NOT dereference them. - const FilterPolicy* policy_; - const SliceTransform* prefix_extractor_; - bool whole_key_filtering_; - const Comparator* comparator_; - - std::string entries_; // Flattened entry contents - std::vector start_; // Starting index in entries_ of each entry - std::string result_; // Filter data computed so far - std::vector tmp_entries_; // policy_->CreateFilter() argument - std::vector filter_offsets_; - // No copying allowed FilterBlockBuilder(const FilterBlockBuilder&); void operator=(const FilterBlockBuilder&); }; +// A FilterBlockReader is used to parse filter from SST table. +// KeyMayMatch and PrefixMayMatch would trigger filter checking +// +// BlockBased/Full FilterBlock would be called in the same way. class FilterBlockReader { public: - // REQUIRES: "contents" and *policy must stay live while *this is live. - FilterBlockReader( - const Options& opt, - const BlockBasedTableOptions& table_opt, - const Slice& contents, - bool delete_contents_after_use = false); - bool KeyMayMatch(uint64_t block_offset, const Slice& key); - bool PrefixMayMatch(uint64_t block_offset, const Slice& prefix); - size_t ApproximateMemoryUsage() const; + explicit FilterBlockReader() {} + virtual ~FilterBlockReader() {} - private: - const FilterPolicy* policy_; - const SliceTransform* prefix_extractor_; - bool whole_key_filtering_; - const char* data_; // Pointer to filter data (at block-start) - const char* offset_; // Pointer to beginning of offset array (at block-end) - size_t num_; // Number of entries in offset array - size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file) - std::unique_ptr filter_data; + virtual bool IsBlockBased() = 0; // If is blockbased filter + virtual bool KeyMayMatch(const Slice& key, + uint64_t block_offset = kNotValid) = 0; + virtual bool PrefixMayMatch(const Slice& prefix, + uint64_t block_offset = kNotValid) = 0; + virtual size_t ApproximateMemoryUsage() const = 0; + // convert this object to 
a human readable form + virtual std::string ToString() const { + std::string error_msg("Unsupported filter \n"); + return error_msg; + } - bool MayMatch(uint64_t block_offset, const Slice& entry); + private: + // No copying allowed + FilterBlockReader(const FilterBlockReader&); + void operator=(const FilterBlockReader&); }; -} +} // namespace rocksdb diff --git a/table/filter_block_test.cc b/table/filter_block_test.cc deleted file mode 100644 index 95496a82c..000000000 --- a/table/filter_block_test.cc +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2012 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#include "table/filter_block.h" - -#include "rocksdb/filter_policy.h" -#include "util/coding.h" -#include "util/hash.h" -#include "util/logging.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace rocksdb { - -// For testing: emit an array with one hash value per key -class TestHashFilter : public FilterPolicy { - public: - virtual const char* Name() const { - return "TestHashFilter"; - } - - virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { - for (int i = 0; i < n; i++) { - uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); - PutFixed32(dst, h); - } - } - - virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { - uint32_t h = Hash(key.data(), key.size(), 1); - for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { - if (h == DecodeFixed32(filter.data() + i)) { - return true; - } - } - return false; - } -}; - -class FilterBlockTest { - public: - Options options_; - BlockBasedTableOptions table_options_; - - FilterBlockTest() { - options_ = Options(); - table_options_.filter_policy.reset(new TestHashFilter()); - } -}; - -TEST(FilterBlockTest, EmptyBuilder) { - FilterBlockBuilder builder(options_, table_options_, options_.comparator); - Slice block = builder.Finish(); - ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block)); - FilterBlockReader reader(options_, table_options_, block); - ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); - ASSERT_TRUE(reader.KeyMayMatch(100000, "foo")); -} - -TEST(FilterBlockTest, SingleChunk) { - FilterBlockBuilder builder(options_, table_options_, options_.comparator); - builder.StartBlock(100); - builder.AddKey("foo"); - builder.AddKey("bar"); - builder.AddKey("box"); - builder.StartBlock(200); - builder.AddKey("box"); - builder.StartBlock(300); - builder.AddKey("hello"); - Slice block = builder.Finish(); - FilterBlockReader reader(options_, table_options_, block); - ASSERT_TRUE(reader.KeyMayMatch(100, "foo")); - ASSERT_TRUE(reader.KeyMayMatch(100, "bar")); - 
ASSERT_TRUE(reader.KeyMayMatch(100, "box")); - ASSERT_TRUE(reader.KeyMayMatch(100, "hello")); - ASSERT_TRUE(reader.KeyMayMatch(100, "foo")); - ASSERT_TRUE(! reader.KeyMayMatch(100, "missing")); - ASSERT_TRUE(! reader.KeyMayMatch(100, "other")); -} - -TEST(FilterBlockTest, MultiChunk) { - FilterBlockBuilder builder(options_, table_options_, options_.comparator); - - // First filter - builder.StartBlock(0); - builder.AddKey("foo"); - builder.StartBlock(2000); - builder.AddKey("bar"); - - // Second filter - builder.StartBlock(3100); - builder.AddKey("box"); - - // Third filter is empty - - // Last filter - builder.StartBlock(9000); - builder.AddKey("box"); - builder.AddKey("hello"); - - Slice block = builder.Finish(); - FilterBlockReader reader(options_, table_options_, block); - - // Check first filter - ASSERT_TRUE(reader.KeyMayMatch(0, "foo")); - ASSERT_TRUE(reader.KeyMayMatch(2000, "bar")); - ASSERT_TRUE(! reader.KeyMayMatch(0, "box")); - ASSERT_TRUE(! reader.KeyMayMatch(0, "hello")); - - // Check second filter - ASSERT_TRUE(reader.KeyMayMatch(3100, "box")); - ASSERT_TRUE(! reader.KeyMayMatch(3100, "foo")); - ASSERT_TRUE(! reader.KeyMayMatch(3100, "bar")); - ASSERT_TRUE(! reader.KeyMayMatch(3100, "hello")); - - // Check third filter (empty) - ASSERT_TRUE(! reader.KeyMayMatch(4100, "foo")); - ASSERT_TRUE(! reader.KeyMayMatch(4100, "bar")); - ASSERT_TRUE(! reader.KeyMayMatch(4100, "box")); - ASSERT_TRUE(! reader.KeyMayMatch(4100, "hello")); - - // Check last filter - ASSERT_TRUE(reader.KeyMayMatch(9000, "box")); - ASSERT_TRUE(reader.KeyMayMatch(9000, "hello")); - ASSERT_TRUE(! reader.KeyMayMatch(9000, "foo")); - ASSERT_TRUE(! 
reader.KeyMayMatch(9000, "bar")); -} - -} // namespace rocksdb - -int main(int argc, char** argv) { - return rocksdb::test::RunAllTests(); -} diff --git a/table/format.cc b/table/format.cc index a642965d5..6b3d4eead 100644 --- a/table/format.cc +++ b/table/format.cc @@ -12,10 +12,10 @@ #include #include -#include "port/port.h" #include "rocksdb/env.h" #include "table/block.h" #include "util/coding.h" +#include "util/compression.h" #include "util/crc32c.h" #include "util/perf_context_imp.h" #include "util/xxhash.h" @@ -51,8 +51,44 @@ Status BlockHandle::DecodeFrom(Slice* input) { return Status::Corruption("bad block handle"); } } + +// Return a string that contains the copy of handle. +std::string BlockHandle::ToString(bool hex) const { + std::string handle_str; + EncodeTo(&handle_str); + if (hex) { + std::string result; + char buf[10]; + for (size_t i = 0; i < handle_str.size(); i++) { + snprintf(buf, sizeof(buf), "%02X", + static_cast(handle_str[i])); + result += buf; + } + return result; + } else { + return handle_str; + } +} + const BlockHandle BlockHandle::kNullBlockHandle(0, 0); +namespace { +inline bool IsLegacyFooterFormat(uint64_t magic_number) { + return magic_number == kLegacyBlockBasedTableMagicNumber || + magic_number == kLegacyPlainTableMagicNumber; +} +inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) { + if (magic_number == kLegacyBlockBasedTableMagicNumber) { + return kBlockBasedTableMagicNumber; + } + if (magic_number == kLegacyPlainTableMagicNumber) { + return kPlainTableMagicNumber; + } + assert(false); + return 0; +} +} // namespace + // legacy footer format: // metaindex handle (varint64 offset, varint64 size) // index handle (varint64 offset, varint64 size) @@ -66,7 +102,8 @@ const BlockHandle BlockHandle::kNullBlockHandle(0, 0); // footer version (4 bytes) // table_magic_number (8 bytes) void Footer::EncodeTo(std::string* dst) const { - if (version() == kLegacyFooter) { + assert(HasInitializedTableMagicNumber()); + if 
(IsLegacyFooterFormat(table_magic_number())) { // has to be default checksum with legacy footer assert(checksum_ == kCRC32c); const size_t original_size = dst->size(); @@ -81,39 +118,24 @@ void Footer::EncodeTo(std::string* dst) const { dst->push_back(static_cast(checksum_)); metaindex_handle_.EncodeTo(dst); index_handle_.EncodeTo(dst); - dst->resize(original_size + kVersion1EncodedLength - 12); // Padding - PutFixed32(dst, kFooterVersion); + dst->resize(original_size + kNewVersionsEncodedLength - 12); // Padding + PutFixed32(dst, version()); PutFixed32(dst, static_cast(table_magic_number() & 0xffffffffu)); PutFixed32(dst, static_cast(table_magic_number() >> 32)); - assert(dst->size() == original_size + kVersion1EncodedLength); + assert(dst->size() == original_size + kNewVersionsEncodedLength); } } -namespace { -inline bool IsLegacyFooterFormat(uint64_t magic_number) { - return magic_number == kLegacyBlockBasedTableMagicNumber || - magic_number == kLegacyPlainTableMagicNumber; -} - -inline uint64_t UpconvertLegacyFooterFormat(uint64_t magic_number) { - if (magic_number == kLegacyBlockBasedTableMagicNumber) { - return kBlockBasedTableMagicNumber; - } - if (magic_number == kLegacyPlainTableMagicNumber) { - return kPlainTableMagicNumber; - } - assert(false); - return 0; -} -} // namespace - -Footer::Footer(uint64_t table_magic_number) - : version_(IsLegacyFooterFormat(table_magic_number) ? 
kLegacyFooter - : kFooterVersion), +Footer::Footer(uint64_t _table_magic_number, uint32_t _version) + : version_(_version), checksum_(kCRC32c), - table_magic_number_(table_magic_number) {} + table_magic_number_(_table_magic_number) { + // This should be guaranteed by constructor callers + assert(!IsLegacyFooterFormat(_table_magic_number) || version_ == 0); +} Status Footer::DecodeFrom(Slice* input) { + assert(!HasInitializedTableMagicNumber()); assert(input != nullptr); assert(input->size() >= kMinEncodedLength); @@ -129,42 +151,29 @@ Status Footer::DecodeFrom(Slice* input) { if (legacy) { magic = UpconvertLegacyFooterFormat(magic); } - if (HasInitializedTableMagicNumber()) { - if (magic != table_magic_number()) { - char buffer[80]; - snprintf(buffer, sizeof(buffer) - 1, - "not an sstable (bad magic number --- %lx)", - (long)magic); - return Status::InvalidArgument(buffer); - } - } else { - set_table_magic_number(magic); - } + set_table_magic_number(magic); if (legacy) { // The size is already asserted to be at least kMinEncodedLength // at the beginning of the function input->remove_prefix(input->size() - kVersion0EncodedLength); - version_ = kLegacyFooter; + version_ = 0 /* legacy */; checksum_ = kCRC32c; } else { version_ = DecodeFixed32(magic_ptr - 4); - if (version_ != kFooterVersion) { - return Status::Corruption("bad footer version"); - } - // Footer version 1 will always occupy exactly this many bytes. + // Footer version 1 and higher will always occupy exactly this many bytes. 
// It consists of the checksum type, two block handles, padding, // a version number, and a magic number - if (input->size() < kVersion1EncodedLength) { - return Status::InvalidArgument("input is too short to be an sstable"); + if (input->size() < kNewVersionsEncodedLength) { + return Status::Corruption("input is too short to be an sstable"); } else { - input->remove_prefix(input->size() - kVersion1EncodedLength); + input->remove_prefix(input->size() - kNewVersionsEncodedLength); } - uint32_t checksum; - if (!GetVarint32(input, &checksum)) { + uint32_t chksum; + if (!GetVarint32(input, &chksum)) { return Status::Corruption("bad checksum type"); } - checksum_ = static_cast(checksum); + checksum_ = static_cast(chksum); } Status result = metaindex_handle_.DecodeFrom(input); @@ -179,18 +188,39 @@ Status Footer::DecodeFrom(Slice* input) { return result; } -Status ReadFooterFromFile(RandomAccessFile* file, - uint64_t file_size, - Footer* footer) { +std::string Footer::ToString() const { + std::string result, handle_; + result.reserve(1024); + + bool legacy = IsLegacyFooterFormat(table_magic_number_); + if (legacy) { + result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); + result.append("index handle: " + index_handle_.ToString() + "\n "); + result.append("table_magic_number: " + std::to_string(table_magic_number_) + + "\n "); + } else { + result.append("checksum: " + std::to_string(checksum_) + "\n "); + result.append("metaindex handle: " + metaindex_handle_.ToString() + "\n "); + result.append("index handle: " + index_handle_.ToString() + "\n "); + result.append("footer version: " + std::to_string(version_) + "\n "); + result.append("table_magic_number: " + std::to_string(table_magic_number_) + + "\n "); + } + return result; +} + +Status ReadFooterFromFile(RandomAccessFile* file, uint64_t file_size, + Footer* footer, uint64_t enforce_table_magic_number) { if (file_size < Footer::kMinEncodedLength) { - return Status::InvalidArgument("file is too 
short to be an sstable"); + return Status::Corruption("file is too short to be an sstable"); } char footer_space[Footer::kMaxEncodedLength]; Slice footer_input; - size_t read_offset = (file_size > Footer::kMaxEncodedLength) - ? (file_size - Footer::kMaxEncodedLength) - : 0; + size_t read_offset = + (file_size > Footer::kMaxEncodedLength) + ? static_cast(file_size - Footer::kMaxEncodedLength) + : 0; Status s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input, footer_space); if (!s.ok()) return s; @@ -198,12 +228,23 @@ Status ReadFooterFromFile(RandomAccessFile* file, // Check that we actually read the whole footer from the file. It may be // that size isn't correct. if (footer_input.size() < Footer::kMinEncodedLength) { - return Status::InvalidArgument("file is too short to be an sstable"); + return Status::Corruption("file is too short to be an sstable"); } - return footer->DecodeFrom(&footer_input); + s = footer->DecodeFrom(&footer_input); + if (!s.ok()) { + return s; + } + if (enforce_table_magic_number != 0 && + enforce_table_magic_number != footer->table_magic_number()) { + return Status::Corruption("Bad table magic number"); + } + return Status::OK(); } +// Without anonymous namespace here, we fail the warning -Wmissing-prototypes +namespace { + // Read a block and check its CRC // contents is the result of reading. 
// According to the implementation of file->Read, contents may not point to buf @@ -211,10 +252,13 @@ Status ReadBlock(RandomAccessFile* file, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, Slice* contents, /* result of reading */ char* buf) { size_t n = static_cast(handle.size()); + Status s; + + { + PERF_TIMER_GUARD(block_read_time); + s = file->Read(handle.offset(), n + kBlockTrailerSize, contents, buf); + } - PERF_TIMER_AUTO(block_read_time); - Status s = file->Read(handle.offset(), n + kBlockTrailerSize, contents, buf); - PERF_TIMER_MEASURE(block_read_time); PERF_COUNTER_ADD(block_read_count, 1); PERF_COUNTER_ADD(block_read_byte, n + kBlockTrailerSize); @@ -228,6 +272,7 @@ Status ReadBlock(RandomAccessFile* file, const Footer& footer, // Check the crc of the type and the block contents const char* data = contents->data(); // Pointer to where Read put the data if (options.verify_checksums) { + PERF_TIMER_GUARD(block_checksum_time); uint32_t value = DecodeFixed32(data + n + 1); uint32_t actual = 0; switch (footer.checksum()) { @@ -236,7 +281,7 @@ Status ReadBlock(RandomAccessFile* file, const Footer& footer, actual = crc32c::Value(data, n + 1); break; case kxxHash: - actual = XXH32(data, n + 1, 0); + actual = XXH32(data, static_cast(n) + 1, 0); break; default: s = Status::Corruption("unknown checksum type"); @@ -247,118 +292,60 @@ Status ReadBlock(RandomAccessFile* file, const Footer& footer, if (!s.ok()) { return s; } - PERF_TIMER_STOP(block_checksum_time); } return s; } -// Decompress a block according to params -// May need to malloc a space for cache usage -Status DecompressBlock(BlockContents* result, size_t block_size, - bool do_uncompress, const char* buf, - const Slice& contents, bool use_stack_buf) { - Status s; - size_t n = block_size; - const char* data = contents.data(); - - result->data = Slice(); - result->cachable = false; - result->heap_allocated = false; - - PERF_TIMER_AUTO(block_decompress_time); - 
rocksdb::CompressionType compression_type = - static_cast(data[n]); - // If the caller has requested that the block not be uncompressed - if (!do_uncompress || compression_type == kNoCompression) { - if (data != buf) { - // File implementation gave us pointer to some other data. - // Use it directly under the assumption that it will be live - // while the file is open. - result->data = Slice(data, n); - result->heap_allocated = false; - result->cachable = false; // Do not double-cache - } else { - if (use_stack_buf) { - // Need to allocate space in heap for cache usage - char* new_buf = new char[n]; - memcpy(new_buf, buf, n); - result->data = Slice(new_buf, n); - } else { - result->data = Slice(buf, n); - } +} // namespace - result->heap_allocated = true; - result->cachable = true; - } - result->compression_type = compression_type; - s = Status::OK(); +Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, + const ReadOptions& options, const BlockHandle& handle, + BlockContents* contents, Env* env, + bool decompression_requested) { + Status status; + Slice slice; + size_t n = static_cast(handle.size()); + std::unique_ptr heap_buf; + char stack_buf[DefaultStackBufferSize]; + char* used_buf = nullptr; + rocksdb::CompressionType compression_type; + + if (decompression_requested && + n + kBlockTrailerSize < DefaultStackBufferSize) { + // If we've got a small enough hunk of data, read it in to the + // trivially allocated stack buffer instead of needing a full malloc() + used_buf = &stack_buf[0]; } else { - s = UncompressBlockContents(data, n, result); + heap_buf = std::unique_ptr(new char[n + kBlockTrailerSize]); + used_buf = heap_buf.get(); } - PERF_TIMER_STOP(block_decompress_time); - return s; -} -// Read and Decompress block -// Use buf in stack as temp reading buffer -Status ReadAndDecompressFast(RandomAccessFile* file, const Footer& footer, - const ReadOptions& options, - const BlockHandle& handle, BlockContents* result, - Env* env, bool 
do_uncompress) { - Status s; - Slice contents; - size_t n = static_cast(handle.size()); - char buf[DefaultStackBufferSize]; + status = ReadBlock(file, footer, options, handle, &slice, used_buf); - s = ReadBlock(file, footer, options, handle, &contents, buf); - if (!s.ok()) { - return s; + if (!status.ok()) { + return status; } - s = DecompressBlock(result, n, do_uncompress, buf, contents, true); - if (!s.ok()) { - return s; - } - return s; -} -// Read and Decompress block -// Use buf in heap as temp reading buffer -Status ReadAndDecompress(RandomAccessFile* file, const Footer& footer, - const ReadOptions& options, const BlockHandle& handle, - BlockContents* result, Env* env, bool do_uncompress) { - Status s; - Slice contents; - size_t n = static_cast(handle.size()); - char* buf = new char[n + kBlockTrailerSize]; + PERF_TIMER_GUARD(block_decompress_time); - s = ReadBlock(file, footer, options, handle, &contents, buf); - if (!s.ok()) { - delete[] buf; - return s; - } - s = DecompressBlock(result, n, do_uncompress, buf, contents, false); - if (!s.ok()) { - delete[] buf; - return s; + compression_type = static_cast(slice.data()[n]); + + if (decompression_requested && compression_type != kNoCompression) { + return UncompressBlockContents(slice.data(), n, contents, footer.version()); } - if (result->data.data() != buf) { - delete[] buf; + if (slice.data() != used_buf) { + *contents = BlockContents(Slice(slice.data(), n), false, compression_type); + return status; } - return s; -} -Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, - const ReadOptions& options, const BlockHandle& handle, - BlockContents* result, Env* env, bool do_uncompress) { - size_t n = static_cast(handle.size()); - if (do_uncompress && n + kBlockTrailerSize < DefaultStackBufferSize) { - return ReadAndDecompressFast(file, footer, options, handle, result, env, - do_uncompress); - } else { - return ReadAndDecompress(file, footer, options, handle, result, env, - do_uncompress); + if 
(used_buf == &stack_buf[0]) { + heap_buf = std::unique_ptr(new char[n]); + memcpy(heap_buf.get(), stack_buf, n); } + + *contents = BlockContents(std::move(heap_buf), n, true, compression_type); + return status; } // @@ -367,9 +354,11 @@ Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, // contents are uncompresed into this buffer. This // buffer is returned via 'result' and it is upto the caller to // free this buffer. +// format_version is the block format as defined in include/rocksdb/table.h Status UncompressBlockContents(const char* data, size_t n, - BlockContents* result) { - char* ubuf = nullptr; + BlockContents* contents, + uint32_t format_version) { + std::unique_ptr ubuf; int decompress_size = 0; assert(data[n] != kNoCompression); switch (data[n]) { @@ -377,67 +366,67 @@ Status UncompressBlockContents(const char* data, size_t n, size_t ulength = 0; static char snappy_corrupt_msg[] = "Snappy not supported or corrupted Snappy compressed block contents"; - if (!port::Snappy_GetUncompressedLength(data, n, &ulength)) { + if (!Snappy_GetUncompressedLength(data, n, &ulength)) { return Status::Corruption(snappy_corrupt_msg); } - ubuf = new char[ulength]; - if (!port::Snappy_Uncompress(data, n, ubuf)) { - delete[] ubuf; + ubuf = std::unique_ptr(new char[ulength]); + if (!Snappy_Uncompress(data, n, ubuf.get())) { return Status::Corruption(snappy_corrupt_msg); } - result->data = Slice(ubuf, ulength); - result->heap_allocated = true; - result->cachable = true; + *contents = BlockContents(std::move(ubuf), ulength, true, kNoCompression); break; } case kZlibCompression: - ubuf = port::Zlib_Uncompress(data, n, &decompress_size); - static char zlib_corrupt_msg[] = - "Zlib not supported or corrupted Zlib compressed block contents"; + ubuf = std::unique_ptr(Zlib_Uncompress( + data, n, &decompress_size, + GetCompressFormatForVersion(kZlibCompression, format_version))); if (!ubuf) { + static char zlib_corrupt_msg[] = + "Zlib not supported or corrupted Zlib 
compressed block contents"; return Status::Corruption(zlib_corrupt_msg); } - result->data = Slice(ubuf, decompress_size); - result->heap_allocated = true; - result->cachable = true; + *contents = + BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; case kBZip2Compression: - ubuf = port::BZip2_Uncompress(data, n, &decompress_size); - static char bzip2_corrupt_msg[] = - "Bzip2 not supported or corrupted Bzip2 compressed block contents"; + ubuf = std::unique_ptr(BZip2_Uncompress( + data, n, &decompress_size, + GetCompressFormatForVersion(kBZip2Compression, format_version))); if (!ubuf) { + static char bzip2_corrupt_msg[] = + "Bzip2 not supported or corrupted Bzip2 compressed block contents"; return Status::Corruption(bzip2_corrupt_msg); } - result->data = Slice(ubuf, decompress_size); - result->heap_allocated = true; - result->cachable = true; + *contents = + BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; case kLZ4Compression: - ubuf = port::LZ4_Uncompress(data, n, &decompress_size); - static char lz4_corrupt_msg[] = - "LZ4 not supported or corrupted LZ4 compressed block contents"; + ubuf = std::unique_ptr(LZ4_Uncompress( + data, n, &decompress_size, + GetCompressFormatForVersion(kLZ4Compression, format_version))); if (!ubuf) { + static char lz4_corrupt_msg[] = + "LZ4 not supported or corrupted LZ4 compressed block contents"; return Status::Corruption(lz4_corrupt_msg); } - result->data = Slice(ubuf, decompress_size); - result->heap_allocated = true; - result->cachable = true; + *contents = + BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; case kLZ4HCCompression: - ubuf = port::LZ4_Uncompress(data, n, &decompress_size); - static char lz4hc_corrupt_msg[] = - "LZ4HC not supported or corrupted LZ4HC compressed block contents"; + ubuf = std::unique_ptr(LZ4_Uncompress( + data, n, &decompress_size, + GetCompressFormatForVersion(kLZ4HCCompression, format_version))); if (!ubuf) { + static 
char lz4hc_corrupt_msg[] = + "LZ4HC not supported or corrupted LZ4HC compressed block contents"; return Status::Corruption(lz4hc_corrupt_msg); } - result->data = Slice(ubuf, decompress_size); - result->heap_allocated = true; - result->cachable = true; + *contents = + BlockContents(std::move(ubuf), decompress_size, true, kNoCompression); break; default: return Status::Corruption("bad block type"); } - result->compression_type = kNoCompression; // not compressed any more return Status::OK(); } diff --git a/table/format.h b/table/format.h index a971c1a67..900a07148 100644 --- a/table/format.h +++ b/table/format.h @@ -33,15 +33,18 @@ class BlockHandle { // The offset of the block in the file. uint64_t offset() const { return offset_; } - void set_offset(uint64_t offset) { offset_ = offset; } + void set_offset(uint64_t _offset) { offset_ = _offset; } // The size of the stored block uint64_t size() const { return size_; } - void set_size(uint64_t size) { size_ = size; } + void set_size(uint64_t _size) { size_ = _size; } void EncodeTo(std::string* dst) const; Status DecodeFrom(Slice* input); + // Return a string that contains the copy of handle. + std::string ToString(bool hex = true) const; + // if the block handle's offset and size are both "0", we will view it // as a null block handle that points to no where. bool IsNull() const { @@ -62,6 +65,21 @@ class BlockHandle { static const BlockHandle kNullBlockHandle; }; +inline uint32_t GetCompressFormatForVersion(CompressionType compression_type, + uint32_t version) { + // snappy is not versioned + assert(compression_type != kSnappyCompression && + compression_type != kNoCompression); + // As of version 2, we encode compressed block with + // compress_format_version == 2. Before that, the version is 1. + // DO NOT CHANGE THIS FUNCTION, it affects disk format + return version >= 2 ? 
2 : 1; +} + +inline bool BlockBasedTableSupportedVersion(uint32_t version) { + return version <= 2; +} + // Footer encapsulates the fixed information stored at the tail // end of every table file. class Footer { @@ -69,12 +87,13 @@ class Footer { // Constructs a footer without specifying its table magic number. // In such case, the table magic number of such footer should be // initialized via @ReadFooterFromFile(). - Footer() : Footer(kInvalidTableMagicNumber) {} + // Use this when you plan to load Footer with DecodeFrom(). Never use this + // when you plan to EncodeTo. + Footer() : Footer(kInvalidTableMagicNumber, 0) {} - // @table_magic_number serves two purposes: - // 1. Identify different types of the tables. - // 2. Help us to identify if a given file is a valid sst. - explicit Footer(uint64_t table_magic_number); + // Use this constructor when you plan to write out the footer using + // EncodeTo(). Never use this constructor with DecodeFrom(). + Footer(uint64_t table_magic_number, uint32_t version); // The version of the footer in this file uint32_t version() const { return version_; } @@ -94,20 +113,13 @@ class Footer { uint64_t table_magic_number() const { return table_magic_number_; } - // The version of Footer we encode - enum { - kLegacyFooter = 0, - kFooterVersion = 1, - }; - void EncodeTo(std::string* dst) const; - // Set the current footer based on the input slice. If table_magic_number_ - // is not set (i.e., HasInitializedTableMagicNumber() is true), then this - // function will also initialize table_magic_number_. Otherwise, this - // function will verify whether the magic number specified in the input - // slice matches table_magic_number_ and update the current footer only - // when the test passes. + // Set the current footer based on the input slice. + // + // REQUIRES: table_magic_number_ is not set (i.e., + // HasInitializedTableMagicNumber() is true). 
The function will initialize the + // magic number Status DecodeFrom(Slice* input); // Encoded length of a Footer. Note that the serialization of a Footer will @@ -118,17 +130,19 @@ class Footer { // Footer version 0 (legacy) will always occupy exactly this many bytes. // It consists of two block handles, padding, and a magic number. kVersion0EncodedLength = 2 * BlockHandle::kMaxEncodedLength + 8, - // Footer version 1 will always occupy exactly this many bytes. - // It consists of the checksum type, two block handles, padding, - // a version number, and a magic number - kVersion1EncodedLength = 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + 8, - + // Footer of versions 1 and higher will always occupy exactly this many + // bytes. It consists of the checksum type, two block handles, padding, + // a version number (bigger than 1), and a magic number + kNewVersionsEncodedLength = 1 + 2 * BlockHandle::kMaxEncodedLength + 4 + 8, kMinEncodedLength = kVersion0EncodedLength, - kMaxEncodedLength = kVersion1EncodedLength + kMaxEncodedLength = kNewVersionsEncodedLength, }; static const uint64_t kInvalidTableMagicNumber = 0; + // convert this object to a human readable form + std::string ToString() const; + private: // REQUIRES: magic number wasn't initialized. 
void set_table_magic_number(uint64_t magic_number) { @@ -150,9 +164,11 @@ class Footer { }; // Read the footer from file -Status ReadFooterFromFile(RandomAccessFile* file, - uint64_t file_size, - Footer* footer); +// If enforce_table_magic_number != 0, ReadFooterFromFile() will return +// corruption if table_magic number is not equal to enforce_table_magic_number +Status ReadFooterFromFile(RandomAccessFile* file, uint64_t file_size, + Footer* footer, + uint64_t enforce_table_magic_number = 0); // 1-byte type + 32-bit crc static const size_t kBlockTrailerSize = 5; @@ -160,18 +176,29 @@ static const size_t kBlockTrailerSize = 5; struct BlockContents { Slice data; // Actual contents of data bool cachable; // True iff data can be cached - bool heap_allocated; // True iff caller should delete[] data.data() CompressionType compression_type; + std::unique_ptr allocation; + + BlockContents() : cachable(false), compression_type(kNoCompression) {} + + BlockContents(const Slice& _data, bool _cachable, + CompressionType _compression_type) + : data(_data), cachable(_cachable), compression_type(_compression_type) {} + + BlockContents(std::unique_ptr&& _data, size_t _size, bool _cachable, + CompressionType _compression_type) + : data(_data.get(), _size), + cachable(_cachable), + compression_type(_compression_type), + allocation(std::move(_data)) {} }; // Read the block identified by "handle" from "file". On failure // return non-OK. On success fill *result and return OK. -extern Status ReadBlockContents(RandomAccessFile* file, - const Footer& footer, +extern Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, - BlockContents* result, - Env* env, + BlockContents* contents, Env* env, bool do_uncompress); // The 'data' points to the raw block contents read in from file. @@ -179,9 +206,11 @@ extern Status ReadBlockContents(RandomAccessFile* file, // contents are uncompresed into this buffer. 
This buffer is // returned via 'result' and it is upto the caller to // free this buffer. -extern Status UncompressBlockContents(const char* data, - size_t n, - BlockContents* result); +// For description of compress_format_version and possible values, see +// util/compression.h +extern Status UncompressBlockContents(const char* data, size_t n, + BlockContents* contents, + uint32_t compress_format_version); // Implementation details follow. Clients should ignore, @@ -190,9 +219,7 @@ inline BlockHandle::BlockHandle() ~static_cast(0)) { } -inline BlockHandle::BlockHandle(uint64_t offset, uint64_t size) - : offset_(offset), - size_(size) { -} +inline BlockHandle::BlockHandle(uint64_t _offset, uint64_t _size) + : offset_(_offset), size_(_size) {} } // namespace rocksdb diff --git a/table/full_filter_block.cc b/table/full_filter_block.cc new file mode 100644 index 000000000..4113ec57a --- /dev/null +++ b/table/full_filter_block.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "table/full_filter_block.h" + +#include "rocksdb/filter_policy.h" +#include "port/port.h" +#include "util/coding.h" + +namespace rocksdb { + +FullFilterBlockBuilder::FullFilterBlockBuilder( + const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, + FilterBitsBuilder* filter_bits_builder) + : prefix_extractor_(prefix_extractor), + whole_key_filtering_(table_opt.whole_key_filtering), + num_added_(0) { + assert(filter_bits_builder != nullptr); + filter_bits_builder_.reset(filter_bits_builder); +} + +void FullFilterBlockBuilder::Add(const Slice& key) { + if (whole_key_filtering_) { + AddKey(key); + } + if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { + AddPrefix(key); + } +} + +// Add key to filter if needed +inline void FullFilterBlockBuilder::AddKey(const Slice& key) { + filter_bits_builder_->AddKey(key); + num_added_++; +} + +// Add prefix to filter if needed +inline void FullFilterBlockBuilder::AddPrefix(const Slice& key) { + Slice prefix = prefix_extractor_->Transform(key); + filter_bits_builder_->AddKey(prefix); + num_added_++; +} + +Slice FullFilterBlockBuilder::Finish() { + if (num_added_ != 0) { + num_added_ = 0; + return filter_bits_builder_->Finish(&filter_data_); + } + return Slice(); +} + +FullFilterBlockReader::FullFilterBlockReader( + const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, const Slice& contents, + FilterBitsReader* filter_bits_reader) + : prefix_extractor_(prefix_extractor), + whole_key_filtering_(table_opt.whole_key_filtering), + contents_(contents) { + assert(filter_bits_reader != nullptr); + filter_bits_reader_.reset(filter_bits_reader); +} + +FullFilterBlockReader::FullFilterBlockReader( + const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, BlockContents&& contents, + FilterBitsReader* filter_bits_reader) + : FullFilterBlockReader(prefix_extractor, table_opt, contents.data, + filter_bits_reader) { + block_contents_ = 
std::move(contents); +} + +bool FullFilterBlockReader::KeyMayMatch(const Slice& key, + uint64_t block_offset) { + assert(block_offset == kNotValid); + if (!whole_key_filtering_) { + return true; + } + return MayMatch(key); +} + +bool FullFilterBlockReader::PrefixMayMatch(const Slice& prefix, + uint64_t block_offset) { + assert(block_offset == kNotValid); + if (!prefix_extractor_) { + return true; + } + return MayMatch(prefix); +} + +bool FullFilterBlockReader::MayMatch(const Slice& entry) { + if (contents_.size() != 0) { + return filter_bits_reader_->MayMatch(entry); + } + return true; // remain the same with block_based filter +} + +size_t FullFilterBlockReader::ApproximateMemoryUsage() const { + return contents_.size(); +} +} // namespace rocksdb diff --git a/table/full_filter_block.h b/table/full_filter_block.h new file mode 100644 index 000000000..6d6294cf2 --- /dev/null +++ b/table/full_filter_block.h @@ -0,0 +1,111 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include +#include +#include +#include +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/slice_transform.h" +#include "db/dbformat.h" +#include "util/hash.h" +#include "table/filter_block.h" + +namespace rocksdb { + +class FilterPolicy; +class FilterBitsBuilder; +class FilterBitsReader; + +// A FullFilterBlockBuilder is used to construct a full filter for a +// particular Table. It generates a single string which is stored as +// a special block in the Table. 
+// The format of full filter block is: +// +----------------------------------------------------------------+ +// | full filter for all keys in sst file | +// +----------------------------------------------------------------+ +// The full filter can be very large. At the end of it, we put +// num_probes: how many hash functions are used in bloom filter +// +class FullFilterBlockBuilder : public FilterBlockBuilder { + public: + explicit FullFilterBlockBuilder(const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, + FilterBitsBuilder* filter_bits_builder); + // bits_builder is created in filter_policy, it should be passed in here + // directly. and be deleted here + ~FullFilterBlockBuilder() {} + + virtual bool IsBlockBased() override { return false; } + virtual void StartBlock(uint64_t block_offset) override {} + virtual void Add(const Slice& key) override; + virtual Slice Finish() override; + + private: + // important: all of these might point to invalid addresses + // at the time of destruction of this filter block. destructor + // should NOT dereference them. + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + + uint32_t num_added_; + std::unique_ptr filter_bits_builder_; + std::unique_ptr filter_data_; + + void AddKey(const Slice& key); + void AddPrefix(const Slice& key); + + // No copying allowed + FullFilterBlockBuilder(const FullFilterBlockBuilder&); + void operator=(const FullFilterBlockBuilder&); +}; + +// A FilterBlockReader is used to parse filter from SST table. +// KeyMayMatch and PrefixMayMatch would trigger filter checking +class FullFilterBlockReader : public FilterBlockReader { + public: + // REQUIRES: "contents" and filter_bits_reader must stay live + // while *this is live. 
+ explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, + const Slice& contents, + FilterBitsReader* filter_bits_reader); + explicit FullFilterBlockReader(const SliceTransform* prefix_extractor, + const BlockBasedTableOptions& table_opt, + BlockContents&& contents, + FilterBitsReader* filter_bits_reader); + + // bits_reader is created in filter_policy, it should be passed in here + // directly. and be deleted here + ~FullFilterBlockReader() {} + + virtual bool IsBlockBased() override { return false; } + virtual bool KeyMayMatch(const Slice& key, + uint64_t block_offset = kNotValid) override; + virtual bool PrefixMayMatch(const Slice& prefix, + uint64_t block_offset = kNotValid) override; + virtual size_t ApproximateMemoryUsage() const override; + + private: + const SliceTransform* prefix_extractor_; + bool whole_key_filtering_; + + std::unique_ptr filter_bits_reader_; + Slice contents_; + BlockContents block_contents_; + std::unique_ptr filter_data_; + + bool MayMatch(const Slice& entry); + + // No copying allowed + FullFilterBlockReader(const FullFilterBlockReader&); + void operator=(const FullFilterBlockReader&); +}; + +} // namespace rocksdb diff --git a/table/full_filter_block_test.cc b/table/full_filter_block_test.cc new file mode 100644 index 000000000..7adb5f08d --- /dev/null +++ b/table/full_filter_block_test.cc @@ -0,0 +1,182 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "table/full_filter_block.h" + +#include "rocksdb/filter_policy.h" +#include "util/coding.h" +#include "util/hash.h" +#include "util/logging.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class TestFilterBitsBuilder : public FilterBitsBuilder { + public: + explicit TestFilterBitsBuilder() {} + + // Add Key to filter + virtual void AddKey(const Slice& key) override { + hash_entries_.push_back(Hash(key.data(), key.size(), 1)); + } + + // Generate the filter using the keys that are added + virtual Slice Finish(std::unique_ptr* buf) override { + uint32_t len = static_cast(hash_entries_.size()) * 4; + char* data = new char[len]; + for (size_t i = 0; i < hash_entries_.size(); i++) { + EncodeFixed32(data + i * 4, hash_entries_[i]); + } + const char* const_data = data; + buf->reset(const_data); + return Slice(data, len); + } + + private: + std::vector hash_entries_; +}; + +class TestFilterBitsReader : public FilterBitsReader { + public: + explicit TestFilterBitsReader(const Slice& contents) + : data_(contents.data()), len_(static_cast(contents.size())) {} + + virtual bool MayMatch(const Slice& entry) override { + uint32_t h = Hash(entry.data(), entry.size(), 1); + for (size_t i = 0; i + 4 <= len_; i += 4) { + if (h == DecodeFixed32(data_ + i)) { + return true; + } + } + return false; + } + + private: + const char* data_; + uint32_t len_; +}; + + +class TestHashFilter : public FilterPolicy { + public: + virtual const char* Name() const { + return "TestHashFilter"; + } + + virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + for (int i = 0; i < n; i++) { + uint32_t h = Hash(keys[i].data(), keys[i].size(), 1); + PutFixed32(dst, h); + } + } + + virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const { + uint32_t h = Hash(key.data(), key.size(), 1); + for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) { + if (h == DecodeFixed32(filter.data() + i)) { + return true; + } + } + return 
false; + } + + virtual FilterBitsBuilder* GetFilterBitsBuilder() const override { + return new TestFilterBitsBuilder(); + } + + virtual FilterBitsReader* GetFilterBitsReader(const Slice& contents) + const override { + return new TestFilterBitsReader(contents); + } +}; + +class PluginFullFilterBlockTest { + public: + BlockBasedTableOptions table_options_; + + PluginFullFilterBlockTest() { + table_options_.filter_policy.reset(new TestHashFilter()); + } +}; + +TEST(PluginFullFilterBlockTest, PluginEmptyBuilder) { + FullFilterBlockBuilder builder(nullptr, table_options_, + table_options_.filter_policy->GetFilterBitsBuilder()); + Slice block = builder.Finish(); + ASSERT_EQ("", EscapeString(block)); + + FullFilterBlockReader reader(nullptr, table_options_, block, + table_options_.filter_policy->GetFilterBitsReader(block)); + // Remain same symantic with blockbased filter + ASSERT_TRUE(reader.KeyMayMatch("foo")); +} + +TEST(PluginFullFilterBlockTest, PluginSingleChunk) { + FullFilterBlockBuilder builder(nullptr, table_options_, + table_options_.filter_policy->GetFilterBitsBuilder()); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.Add("box"); + builder.Add("hello"); + Slice block = builder.Finish(); + FullFilterBlockReader reader(nullptr, table_options_, block, + table_options_.filter_policy->GetFilterBitsReader(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo")); + ASSERT_TRUE(reader.KeyMayMatch("bar")); + ASSERT_TRUE(reader.KeyMayMatch("box")); + ASSERT_TRUE(reader.KeyMayMatch("hello")); + ASSERT_TRUE(reader.KeyMayMatch("foo")); + ASSERT_TRUE(!reader.KeyMayMatch("missing")); + ASSERT_TRUE(!reader.KeyMayMatch("other")); +} + +class FullFilterBlockTest { + public: + BlockBasedTableOptions table_options_; + + FullFilterBlockTest() { + table_options_.filter_policy.reset(NewBloomFilterPolicy(10, false)); + } + + ~FullFilterBlockTest() {} +}; + +TEST(FullFilterBlockTest, EmptyBuilder) { + FullFilterBlockBuilder builder(nullptr, table_options_, + 
table_options_.filter_policy->GetFilterBitsBuilder()); + Slice block = builder.Finish(); + ASSERT_EQ("", EscapeString(block)); + + FullFilterBlockReader reader(nullptr, table_options_, block, + table_options_.filter_policy->GetFilterBitsReader(block)); + // Remain same symantic with blockbased filter + ASSERT_TRUE(reader.KeyMayMatch("foo")); +} + +TEST(FullFilterBlockTest, SingleChunk) { + FullFilterBlockBuilder builder(nullptr, table_options_, + table_options_.filter_policy->GetFilterBitsBuilder()); + builder.Add("foo"); + builder.Add("bar"); + builder.Add("box"); + builder.Add("box"); + builder.Add("hello"); + Slice block = builder.Finish(); + FullFilterBlockReader reader(nullptr, table_options_, block, + table_options_.filter_policy->GetFilterBitsReader(block)); + ASSERT_TRUE(reader.KeyMayMatch("foo")); + ASSERT_TRUE(reader.KeyMayMatch("bar")); + ASSERT_TRUE(reader.KeyMayMatch("box")); + ASSERT_TRUE(reader.KeyMayMatch("hello")); + ASSERT_TRUE(reader.KeyMayMatch("foo")); + ASSERT_TRUE(!reader.KeyMayMatch("missing")); + ASSERT_TRUE(!reader.KeyMayMatch("other")); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/table/get_context.cc b/table/get_context.cc new file mode 100644 index 000000000..59dfa41e6 --- /dev/null +++ b/table/get_context.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "table/get_context.h" +#include "rocksdb/merge_operator.h" +#include "rocksdb/statistics.h" +#include "util/statistics.h" + +namespace rocksdb { + +GetContext::GetContext(const Comparator* ucmp, + const MergeOperator* merge_operator, + Logger* logger, Statistics* statistics, + GetState init_state, const Slice& user_key, std::string* ret_value, + bool* value_found, MergeContext* merge_context) + : ucmp_(ucmp), + merge_operator_(merge_operator), + logger_(logger), + statistics_(statistics), + state_(init_state), + user_key_(user_key), + value_(ret_value), + value_found_(value_found), + merge_context_(merge_context) { +} + +// Called from TableCache::Get and Table::Get when file/block in which +// key may exist are not there in TableCache/BlockCache respectively. In this +// case we can't guarantee that key does not exist and are not permitted to do +// IO to be certain.Set the status=kFound and value_found=false to let the +// caller know that key may exist but is not there in memory +void GetContext::MarkKeyMayExist() { + state_ = kFound; + if (value_found_ != nullptr) { + *value_found_ = false; + } +} + +void GetContext::SaveValue(const Slice& value) { + state_ = kFound; + value_->assign(value.data(), value.size()); +} + +bool GetContext::SaveValue(const ParsedInternalKey& parsed_key, + const Slice& value) { + assert((state_ != kMerge && parsed_key.type != kTypeMerge) || + merge_context_ != nullptr); + if (ucmp_->Compare(parsed_key.user_key, user_key_) == 0) { + // Key matches. 
Process it + switch (parsed_key.type) { + case kTypeValue: + assert(state_ == kNotFound || state_ == kMerge); + if (kNotFound == state_) { + state_ = kFound; + value_->assign(value.data(), value.size()); + } else if (kMerge == state_) { + assert(merge_operator_ != nullptr); + state_ = kFound; + if (!merge_operator_->FullMerge(user_key_, &value, + merge_context_->GetOperands(), + value_, logger_)) { + RecordTick(statistics_, NUMBER_MERGE_FAILURES); + state_ = kCorrupt; + } + } + return false; + + case kTypeDeletion: + assert(state_ == kNotFound || state_ == kMerge); + if (kNotFound == state_) { + state_ = kDeleted; + } else if (kMerge == state_) { + state_ = kFound; + if (!merge_operator_->FullMerge(user_key_, nullptr, + merge_context_->GetOperands(), + value_, logger_)) { + RecordTick(statistics_, NUMBER_MERGE_FAILURES); + state_ = kCorrupt; + } + } + return false; + + case kTypeMerge: + assert(state_ == kNotFound || state_ == kMerge); + state_ = kMerge; + merge_context_->PushOperand(value); + return true; + + default: + assert(false); + break; + } + } + + // state_ could be Corrupt, merge or notfound + return false; +} + +} // namespace rocksdb diff --git a/table/get_context.h b/table/get_context.h new file mode 100644 index 000000000..a38f3c533 --- /dev/null +++ b/table/get_context.h @@ -0,0 +1,47 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#pragma once +#include +#include "db/merge_context.h" + +namespace rocksdb { +class MergeContext; + +class GetContext { + public: + enum GetState { + kNotFound, + kFound, + kDeleted, + kCorrupt, + kMerge // saver contains the current merge result (the operands) + }; + + GetContext(const Comparator* ucmp, const MergeOperator* merge_operator, + Logger* logger, Statistics* statistics, + GetState init_state, const Slice& user_key, std::string* ret_value, + bool* value_found, MergeContext* merge_context); + + void MarkKeyMayExist(); + void SaveValue(const Slice& value); + bool SaveValue(const ParsedInternalKey& parsed_key, const Slice& value); + GetState State() const { return state_; } + + private: + const Comparator* ucmp_; + const MergeOperator* merge_operator_; + // the merge operations encountered; + Logger* logger_; + Statistics* statistics_; + + GetState state_; + Slice user_key_; + std::string* value_; + bool* value_found_; // Is value set correctly? Used by KeyMayExist + MergeContext* merge_context_; +}; + +} // namespace rocksdb diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index 502cacb3e..d64047bea 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -20,17 +20,15 @@ namespace rocksdb { class IteratorWrapper { public: IteratorWrapper(): iter_(nullptr), valid_(false) { } - explicit IteratorWrapper(Iterator* iter): iter_(nullptr) { - Set(iter); - } + explicit IteratorWrapper(Iterator* _iter) : iter_(nullptr) { Set(_iter); } ~IteratorWrapper() {} Iterator* iter() const { return iter_; } // Takes ownership of "iter" and will delete it when destroyed, or // when Set() is invoked again. 
- void Set(Iterator* iter) { + void Set(Iterator* _iter) { delete iter_; - iter_ = iter; + iter_ = _iter; if (iter_ == nullptr) { valid_ = false; } else { diff --git a/table/merger.cc b/table/merger.cc index 611480cec..26a90097c 100644 --- a/table/merger.cc +++ b/table/merger.cc @@ -23,27 +23,24 @@ #include "util/autovector.h" namespace rocksdb { -namespace merger { -typedef std::priority_queue< - IteratorWrapper*, - std::vector, - MaxIteratorComparator> MaxIterHeap; +// Without anonymous namespace here, we fail the warning -Wmissing-prototypes +namespace { +typedef std::priority_queue, + MaxIteratorComparator> MergerMaxIterHeap; -typedef std::priority_queue< - IteratorWrapper*, - std::vector, - MinIteratorComparator> MinIterHeap; +typedef std::priority_queue, + MinIteratorComparator> MergerMinIterHeap; // Return's a new MaxHeap of IteratorWrapper's using the provided Comparator. -MaxIterHeap NewMaxIterHeap(const Comparator* comparator) { - return MaxIterHeap(MaxIteratorComparator(comparator)); +MergerMaxIterHeap NewMergerMaxIterHeap(const Comparator* comparator) { + return MergerMaxIterHeap(MaxIteratorComparator(comparator)); } // Return's a new MinHeap of IteratorWrapper's using the provided Comparator. 
-MinIterHeap NewMinIterHeap(const Comparator* comparator) { - return MinIterHeap(MinIteratorComparator(comparator)); +MergerMinIterHeap NewMergerMinIterHeap(const Comparator* comparator) { + return MergerMinIterHeap(MinIteratorComparator(comparator)); } -} // namespace merger +} // namespace const size_t kNumIterReserve = 4; @@ -56,8 +53,8 @@ class MergingIterator : public Iterator { current_(nullptr), use_heap_(true), direction_(kForward), - maxHeap_(merger::NewMaxIterHeap(comparator_)), - minHeap_(merger::NewMinIterHeap(comparator_)) { + maxHeap_(NewMergerMaxIterHeap(comparator_)), + minHeap_(NewMergerMinIterHeap(comparator_)) { children_.resize(n); for (int i = 0; i < n; i++) { children_[i].Set(children[i]); @@ -116,12 +113,12 @@ class MergingIterator : public Iterator { // Invalidate the heap. use_heap_ = false; IteratorWrapper* first_child = nullptr; - PERF_TIMER_DECLARE(); for (auto& child : children_) { - PERF_TIMER_START(seek_child_seek_time); - child.Seek(target); - PERF_TIMER_STOP(seek_child_seek_time); + { + PERF_TIMER_GUARD(seek_child_seek_time); + child.Seek(target); + } PERF_COUNTER_ADD(seek_child_seek_count, 1); if (child.Valid()) { @@ -134,24 +131,21 @@ class MergingIterator : public Iterator { } else { // We have more than one children with valid keys. Initialize // the heap and put the first child into the heap. - PERF_TIMER_START(seek_min_heap_time); + PERF_TIMER_GUARD(seek_min_heap_time); ClearHeaps(); minHeap_.push(first_child); - PERF_TIMER_STOP(seek_min_heap_time); } } if (use_heap_) { - PERF_TIMER_START(seek_min_heap_time); + PERF_TIMER_GUARD(seek_min_heap_time); minHeap_.push(&child); - PERF_TIMER_STOP(seek_min_heap_time); } } } if (use_heap_) { // If heap is valid, need to put the smallest key to curent_. 
- PERF_TIMER_START(seek_min_heap_time); + PERF_TIMER_GUARD(seek_min_heap_time); FindSmallest(); - PERF_TIMER_STOP(seek_min_heap_time); } else { // The heap is not valid, then the current_ iterator is the first // one, or null if there is no first child. @@ -243,14 +237,14 @@ class MergingIterator : public Iterator { } virtual Status status() const { - Status status; + Status s; for (auto& child : children_) { - status = child.status(); - if (!status.ok()) { + s = child.status(); + if (!s.ok()) { break; } } - return status; + return s; } private: @@ -274,8 +268,8 @@ class MergingIterator : public Iterator { kReverse }; Direction direction_; - merger::MaxIterHeap maxHeap_; - merger::MinIterHeap minHeap_; + MergerMaxIterHeap maxHeap_; + MergerMinIterHeap minHeap_; }; void MergingIterator::FindSmallest() { @@ -302,8 +296,8 @@ void MergingIterator::FindLargest() { void MergingIterator::ClearHeaps() { use_heap_ = true; - maxHeap_ = merger::NewMaxIterHeap(comparator_); - minHeap_ = merger::NewMinIterHeap(comparator_); + maxHeap_ = NewMergerMaxIterHeap(comparator_); + minHeap_ = NewMergerMinIterHeap(comparator_); } Iterator* NewMergingIterator(const Comparator* cmp, Iterator** list, int n, diff --git a/table/merger_test.cc b/table/merger_test.cc new file mode 100644 index 000000000..56ea361be --- /dev/null +++ b/table/merger_test.cc @@ -0,0 +1,198 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include +#include +#include + +#include "rocksdb/iterator.h" +#include "table/merger.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class VectorIterator : public Iterator { + public: + explicit VectorIterator(const std::vector& keys) + : keys_(keys), current_(keys.size()) { + std::sort(keys_.begin(), keys_.end()); + } + + virtual bool Valid() const { return current_ < keys_.size(); } + + virtual void SeekToFirst() { current_ = 0; } + virtual void SeekToLast() { current_ = keys_.size() - 1; } + + virtual void Seek(const Slice& target) { + current_ = std::lower_bound(keys_.begin(), keys_.end(), target.ToString()) - + keys_.begin(); + } + + virtual void Next() { current_++; } + virtual void Prev() { current_--; } + + virtual Slice key() const { return Slice(keys_[current_]); } + virtual Slice value() const { return Slice(); } + + virtual Status status() const { return Status::OK(); } + + private: + std::vector keys_; + size_t current_; +}; + +class MergerTest { + public: + MergerTest() + : rnd_(3), merging_iterator_(nullptr), single_iterator_(nullptr) {} + ~MergerTest() = default; + std::vector GenerateStrings(size_t len, int string_len) { + std::vector ret; + for (size_t i = 0; i < len; ++i) { + ret.push_back(test::RandomHumanReadableString(&rnd_, string_len)); + } + return ret; + } + + void AssertEquivalence() { + auto a = merging_iterator_.get(); + auto b = single_iterator_.get(); + if (!a->Valid()) { + ASSERT_TRUE(!b->Valid()); + } else { + ASSERT_TRUE(b->Valid()); + ASSERT_EQ(b->key().ToString(), a->key().ToString()); + ASSERT_EQ(b->value().ToString(), a->value().ToString()); + } + } + + void SeekToRandom() { Seek(test::RandomHumanReadableString(&rnd_, 5)); } + + void Seek(std::string target) { + merging_iterator_->Seek(target); + single_iterator_->Seek(target); + } + + void SeekToFirst() { + merging_iterator_->SeekToFirst(); + single_iterator_->SeekToFirst(); + } + + void SeekToLast() { + 
merging_iterator_->SeekToLast(); + single_iterator_->SeekToLast(); + } + + void Next(int times) { + for (int i = 0; i < times && merging_iterator_->Valid(); ++i) { + AssertEquivalence(); + merging_iterator_->Next(); + single_iterator_->Next(); + } + AssertEquivalence(); + } + + void Prev(int times) { + for (int i = 0; i < times && merging_iterator_->Valid(); ++i) { + AssertEquivalence(); + merging_iterator_->Prev(); + single_iterator_->Prev(); + } + AssertEquivalence(); + } + + void NextAndPrev(int times) { + for (int i = 0; i < times && merging_iterator_->Valid(); ++i) { + AssertEquivalence(); + if (rnd_.OneIn(2)) { + merging_iterator_->Prev(); + single_iterator_->Prev(); + } else { + merging_iterator_->Next(); + single_iterator_->Next(); + } + } + AssertEquivalence(); + } + + void Generate(size_t num_iterators, size_t strings_per_iterator, + int letters_per_string) { + std::vector small_iterators; + for (size_t i = 0; i < num_iterators; ++i) { + auto strings = GenerateStrings(strings_per_iterator, letters_per_string); + small_iterators.push_back(new VectorIterator(strings)); + all_keys_.insert(all_keys_.end(), strings.begin(), strings.end()); + } + + merging_iterator_.reset( + NewMergingIterator(BytewiseComparator(), &small_iterators[0], + static_cast(small_iterators.size()))); + single_iterator_.reset(new VectorIterator(all_keys_)); + } + + Random rnd_; + std::unique_ptr merging_iterator_; + std::unique_ptr single_iterator_; + std::vector all_keys_; +}; + +TEST(MergerTest, SeekToRandomNextTest) { + Generate(1000, 50, 50); + for (int i = 0; i < 10; ++i) { + SeekToRandom(); + AssertEquivalence(); + Next(50000); + } +} + +TEST(MergerTest, SeekToRandomNextSmallStringsTest) { + Generate(1000, 50, 2); + for (int i = 0; i < 10; ++i) { + SeekToRandom(); + AssertEquivalence(); + Next(50000); + } +} + +TEST(MergerTest, SeekToRandomPrevTest) { + Generate(1000, 50, 50); + for (int i = 0; i < 10; ++i) { + SeekToRandom(); + AssertEquivalence(); + Prev(50000); + } +} + 
+TEST(MergerTest, SeekToRandomRandomTest) { + Generate(200, 50, 50); + for (int i = 0; i < 3; ++i) { + SeekToRandom(); + AssertEquivalence(); + NextAndPrev(5000); + } +} + +TEST(MergerTest, SeekToFirstTest) { + Generate(1000, 50, 50); + for (int i = 0; i < 10; ++i) { + SeekToFirst(); + AssertEquivalence(); + Next(50000); + } +} + +TEST(MergerTest, SeekToLastTest) { + Generate(1000, 50, 50); + for (int i = 0; i < 10; ++i) { + SeekToLast(); + AssertEquivalence(); + Prev(50000); + } +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index a95f4c119..6f83f42d4 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -11,14 +11,13 @@ #include "rocksdb/table_properties.h" #include "table/block.h" #include "table/format.h" +#include "table/table_properties_internal.h" #include "util/coding.h" namespace rocksdb { MetaIndexBuilder::MetaIndexBuilder() - : meta_index_block_( - new BlockBuilder(1 /* restart interval */, BytewiseComparator())) { -} + : meta_index_block_(new BlockBuilder(1 /* restart interval */)) {} void MetaIndexBuilder::Add(const std::string& key, const BlockHandle& handle) { @@ -35,9 +34,7 @@ Slice MetaIndexBuilder::Finish() { } PropertyBlockBuilder::PropertyBlockBuilder() - : properties_block_( - new BlockBuilder(1 /* restart interval */, BytewiseComparator())) { -} + : properties_block_(new BlockBuilder(1 /* restart interval */)) {} void PropertyBlockBuilder::Add(const std::string& name, const std::string& val) { @@ -90,9 +87,9 @@ void LogPropertiesCollectionError( assert(method == "Add" || method == "Finish"); std::string msg = - "[Warning] encountered error when calling TablePropertiesCollector::" + + "Encountered error when calling TablePropertiesCollector::" + method + "() with collector name: " + name; - Log(info_log, "%s", msg.c_str()); + Log(InfoLogLevel::ERROR_LEVEL, info_log, "%s", msg.c_str()); } bool 
NotifyCollectTableCollectorsOnAdd( @@ -145,14 +142,15 @@ Status ReadProperties(const Slice &handle_value, RandomAccessFile *file, BlockContents block_contents; ReadOptions read_options; read_options.verify_checksums = false; - Status s = ReadBlockContents(file, footer, read_options, handle, - &block_contents, env, false); + Status s; + s = ReadBlockContents(file, footer, read_options, handle, &block_contents, + env, false); if (!s.ok()) { return s; } - Block properties_block(block_contents); + Block properties_block(std::move(block_contents)); std::unique_ptr iter( properties_block.NewIterator(BytewiseComparator())); @@ -195,9 +193,9 @@ Status ReadProperties(const Slice &handle_value, RandomAccessFile *file, if (!GetVarint64(&raw_val, &val)) { // skip malformed value auto error_msg = - "[Warning] detect malformed value in properties meta-block:" + "Detect malformed value in properties meta-block:" "\tkey: " + key + "\tval: " + raw_val.ToString(); - Log(logger, "%s", error_msg.c_str()); + Log(InfoLogLevel::ERROR_LEVEL, logger, "%s", error_msg.c_str()); continue; } *(pos->second) = val; @@ -222,8 +220,8 @@ Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size, uint64_t table_magic_number, Env* env, Logger* info_log, TableProperties** properties) { // -- Read metaindex block - Footer footer(table_magic_number); - auto s = ReadFooterFromFile(file, file_size, &footer); + Footer footer; + auto s = ReadFooterFromFile(file, file_size, &footer, table_magic_number); if (!s.ok()) { return s; } @@ -237,7 +235,7 @@ Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size, if (!s.ok()) { return s; } - Block metaindex_block(metaindex_contents); + Block metaindex_block(std::move(metaindex_contents)); std::unique_ptr meta_iter( metaindex_block.NewIterator(BytewiseComparator())); @@ -276,8 +274,8 @@ Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size, uint64_t table_magic_number, Env* env, const std::string& meta_block_name, BlockHandle* 
block_handle) { - Footer footer(table_magic_number); - auto s = ReadFooterFromFile(file, file_size, &footer); + Footer footer; + auto s = ReadFooterFromFile(file, file_size, &footer, table_magic_number); if (!s.ok()) { return s; } @@ -291,7 +289,7 @@ Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size, if (!s.ok()) { return s; } - Block metaindex_block(metaindex_contents); + Block metaindex_block(std::move(metaindex_contents)); std::unique_ptr meta_iter; meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator())); @@ -303,10 +301,11 @@ Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size, uint64_t table_magic_number, Env* env, const std::string& meta_block_name, BlockContents* contents) { - Footer footer(table_magic_number); - auto s = ReadFooterFromFile(file, file_size, &footer); - if (!s.ok()) { - return s; + Status status; + Footer footer; + status = ReadFooterFromFile(file, file_size, &footer, table_magic_number); + if (!status.ok()) { + return status; } // Reading metaindex block @@ -314,30 +313,28 @@ Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size, BlockContents metaindex_contents; ReadOptions read_options; read_options.verify_checksums = false; - s = ReadBlockContents(file, footer, read_options, metaindex_handle, - &metaindex_contents, env, false); - if (!s.ok()) { - return s; + status = ReadBlockContents(file, footer, read_options, metaindex_handle, + &metaindex_contents, env, false); + if (!status.ok()) { + return status; } // Finding metablock - Block metaindex_block(metaindex_contents); + Block metaindex_block(std::move(metaindex_contents)); std::unique_ptr meta_iter; meta_iter.reset(metaindex_block.NewIterator(BytewiseComparator())); BlockHandle block_handle; - s = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle); + status = FindMetaBlock(meta_iter.get(), meta_block_name, &block_handle); - if (!s.ok()) { - return s; + if (!status.ok()) { + return status; } // Reading metablock - s = 
ReadBlockContents(file, footer, read_options, block_handle, contents, env, - false); - - return s; + return ReadBlockContents(file, footer, read_options, block_handle, contents, + env, false); } } // namespace rocksdb diff --git a/table/meta_blocks.h b/table/meta_blocks.h index 798a18af0..283f7a0be 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -119,10 +119,6 @@ Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size, uint64_t table_magic_number, Env* env, Logger* info_log, TableProperties** properties); -// Seek to the properties block. -// If it successfully seeks to the properties block, "is_found" will be -// set to true. -extern Status SeekToPropertiesBlock(Iterator* meta_iter, bool* is_found); // Find the meta block from the meta index block. Status FindMetaBlock(Iterator* meta_index_iter, diff --git a/table/mock_table.cc b/table/mock_table.cc new file mode 100644 index 000000000..70adf2da6 --- /dev/null +++ b/table/mock_table.cc @@ -0,0 +1,116 @@ +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "rocksdb/table_properties.h" +#include "table/mock_table.h" +#include "table/get_context.h" +#include "db/dbformat.h" +#include "port/port.h" +#include "util/coding.h" + +namespace rocksdb { +namespace mock { + +Iterator* MockTableReader::NewIterator(const ReadOptions&, Arena* arena) { + return new MockTableIterator(table_); +} + +Status MockTableReader::Get(const ReadOptions&, const Slice& key, + GetContext* get_context) { + std::unique_ptr iter(new MockTableIterator(table_)); + for (iter->Seek(key); iter->Valid(); iter->Next()) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(iter->key(), &parsed_key)) { + return Status::Corruption(Slice()); + } + + if (!get_context->SaveValue(parsed_key, iter->value())) { + break; + } + } + return Status::OK(); +} + +std::shared_ptr MockTableReader::GetTableProperties() + const { + return std::shared_ptr(new TableProperties()); +} + +MockTableFactory::MockTableFactory() : next_id_(1) {} + +Status MockTableFactory::NewTableReader( + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const InternalKeyComparator& internal_key, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const { + uint32_t id = GetIDFromFile(file.get()); + + MutexLock lock_guard(&file_system_.mutex); + + auto it = file_system_.files.find(id); + if (it == file_system_.files.end()) { + return Status::IOError("Mock file not found"); + } + + table_reader->reset(new MockTableReader(it->second)); + + return Status::OK(); +} + +TableBuilder* MockTableFactory::NewTableBuilder( + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_key, WritableFile* file, + const CompressionType compression_type, + const CompressionOptions& compression_opts) const { + uint32_t id = GetAndWriteNextID(file); + + return new MockTableBuilder(id, &file_system_); +} + +Status MockTableFactory::CreateMockTable(Env* env, const std::string& fname, + MockFileContents file_contents) { + std::unique_ptr file; + auto 
s = env->NewWritableFile(fname, &file, EnvOptions()); + if (!s.ok()) { + return s; + } + + uint32_t id = GetAndWriteNextID(file.get()); + file_system_.files.insert({id, std::move(file_contents)}); + return Status::OK(); +} + +uint32_t MockTableFactory::GetAndWriteNextID(WritableFile* file) const { + uint32_t next_id = next_id_.fetch_add(1); + char buf[4]; + EncodeFixed32(buf, next_id); + file->Append(Slice(buf, 4)); + return next_id; +} + +uint32_t MockTableFactory::GetIDFromFile(RandomAccessFile* file) const { + char buf[4]; + Slice result; + file->Read(0, 4, &result, buf); + assert(result.size() == 4); + return DecodeFixed32(buf); +} + +void MockTableFactory::AssertSingleFile(const MockFileContents& file_contents) { + ASSERT_EQ(file_system_.files.size(), 1U); + ASSERT_TRUE(file_contents == file_system_.files.begin()->second); +} + +void MockTableFactory::AssertLatestFile(const MockFileContents& file_contents) { + ASSERT_GE(file_system_.files.size(), 1U); + auto latest = file_system_.files.end(); + --latest; + ASSERT_TRUE(file_contents == latest->second); +} + +} // namespace mock +} // namespace rocksdb diff --git a/table/mock_table.h b/table/mock_table.h new file mode 100644 index 000000000..57481a4bc --- /dev/null +++ b/table/mock_table.h @@ -0,0 +1,179 @@ +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+#pragma once +#include +#include +#include +#include +#include +#include + +#include "rocksdb/table.h" +#include "table/table_reader.h" +#include "table/table_builder.h" +#include "port/port.h" +#include "util/mutexlock.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { +namespace mock { + +typedef std::map MockFileContents; +// NOTE this currently only supports bitwise comparator + +struct MockTableFileSystem { + port::Mutex mutex; + std::map files; +}; + +class MockTableReader : public TableReader { + public: + explicit MockTableReader(const MockFileContents& table) : table_(table) {} + + Iterator* NewIterator(const ReadOptions&, Arena* arena) override; + + Status Get(const ReadOptions&, const Slice& key, + GetContext* get_context) override; + + uint64_t ApproximateOffsetOf(const Slice& key) override { return 0; } + + virtual size_t ApproximateMemoryUsage() const override { return 0; } + + void SetupForCompaction() override {} + + std::shared_ptr GetTableProperties() const override; + + ~MockTableReader() {} + + private: + const MockFileContents& table_; +}; + +class MockTableIterator : public Iterator { + public: + explicit MockTableIterator(const MockFileContents& table) : table_(table) { + itr_ = table_.end(); + } + + bool Valid() const { return itr_ != table_.end(); } + + void SeekToFirst() { itr_ = table_.begin(); } + + void SeekToLast() { + itr_ = table_.end(); + --itr_; + } + + void Seek(const Slice& target) { + std::string str_target(target.data(), target.size()); + itr_ = table_.lower_bound(str_target); + } + + void Next() { ++itr_; } + + void Prev() { + if (itr_ == table_.begin()) { + itr_ = table_.end(); + } else { + --itr_; + } + } + + Slice key() const { return Slice(itr_->first); } + + Slice value() const { return Slice(itr_->second); } + + Status status() const { return Status::OK(); } + + private: + const MockFileContents& table_; + MockFileContents::const_iterator itr_; +}; + +class MockTableBuilder : public 
TableBuilder { + public: + MockTableBuilder(uint32_t id, MockTableFileSystem* file_system) + : id_(id), file_system_(file_system) {} + + // REQUIRES: Either Finish() or Abandon() has been called. + ~MockTableBuilder() {} + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override { + table_.insert({key.ToString(), value.ToString()}); + } + + // Return non-ok iff some error has been detected. + Status status() const override { return Status::OK(); } + + Status Finish() override { + MutexLock lock_guard(&file_system_->mutex); + file_system_->files.insert({id_, table_}); + return Status::OK(); + } + + void Abandon() override {} + + uint64_t NumEntries() const override { return table_.size(); } + + uint64_t FileSize() const override { return table_.size(); } + + private: + uint32_t id_; + MockTableFileSystem* file_system_; + MockFileContents table_; +}; + +class MockTableFactory : public TableFactory { + public: + MockTableFactory(); + const char* Name() const override { return "MockTable"; } + Status NewTableReader(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, + const InternalKeyComparator& internal_key, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table_reader) const; + + TableBuilder* NewTableBuilder( + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_key, WritableFile* file, + const CompressionType compression_type, + const CompressionOptions& compression_opts) const; + + // This function will directly create mock table instead of going through + // MockTableBuilder. MockFileContents has to have a format of . 
Those key-value pairs will then be inserted into the mock table + Status CreateMockTable(Env* env, const std::string& fname, + MockFileContents file_contents); + + virtual Status SanitizeOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const { + return Status::OK(); + } + + virtual std::string GetPrintableTableOptions() const override { + return std::string(); + } + + // This function will assert that only a single file exists and that the + // contents are equal to file_contents + void AssertSingleFile(const MockFileContents& file_contents); + void AssertLatestFile(const MockFileContents& file_contents); + + private: + uint32_t GetAndWriteNextID(WritableFile* file) const; + uint32_t GetIDFromFile(RandomAccessFile* file) const; + + mutable MockTableFileSystem file_system_; + mutable std::atomic next_id_; +}; + +} // namespace mock +} // namespace rocksdb diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc index 4f3b62ad4..0f89dd1f5 100644 --- a/table/plain_table_builder.cc +++ b/table/plain_table_builder.cc @@ -6,8 +6,10 @@ #ifndef ROCKSDB_LITE #include "table/plain_table_builder.h" -#include #include + +#include +#include #include #include "rocksdb/comparator.h" @@ -20,7 +22,6 @@ #include "table/block_builder.h" #include "table/bloom_block.h" #include "table/plain_table_index.h" -#include "table/filter_block.h" #include "table/format.h" #include "table/meta_blocks.h" #include "util/coding.h" @@ -58,24 +59,24 @@ extern const uint64_t kPlainTableMagicNumber = 0x8242229663bf9564ull; extern const uint64_t kLegacyPlainTableMagicNumber = 0x4f3418eb7a8f13b8ull; PlainTableBuilder::PlainTableBuilder( - const Options& options, WritableFile* file, uint32_t user_key_len, - EncodingType encoding_type, size_t index_sparseness, + const ImmutableCFOptions& ioptions, WritableFile* file, + uint32_t user_key_len, EncodingType encoding_type, size_t index_sparseness, uint32_t bloom_bits_per_key, uint32_t num_probes, size_t 
huge_page_tlb_size, double hash_table_ratio, bool store_index_in_file) - : options_(options), + : ioptions_(ioptions), bloom_block_(num_probes), file_(file), bloom_bits_per_key_(bloom_bits_per_key), huge_page_tlb_size_(huge_page_tlb_size), - encoder_(encoding_type, user_key_len, options.prefix_extractor.get(), + encoder_(encoding_type, user_key_len, ioptions.prefix_extractor, index_sparseness), store_index_in_file_(store_index_in_file), - prefix_extractor_(options.prefix_extractor.get()) { + prefix_extractor_(ioptions.prefix_extractor) { // Build index block and save it in the file if hash_table_ratio > 0 if (store_index_in_file_) { assert(hash_table_ratio > 0 || IsTotalOrderMode()); index_builder_.reset( - new PlainTableIndexBuilder(&arena_, options, index_sparseness, + new PlainTableIndexBuilder(&arena_, ioptions, index_sparseness, hash_table_ratio, huge_page_tlb_size_)); assert(bloom_bits_per_key_ > 0); properties_.user_collected_properties @@ -93,10 +94,10 @@ PlainTableBuilder::PlainTableBuilder( // plain encoding. properties_.format_version = (encoding_type == kPlain) ? 
0 : 1; - if (options_.prefix_extractor) { + if (ioptions_.prefix_extractor) { properties_.user_collected_properties [PlainTablePropertyNames::kPrefixExtractorName] = - options_.prefix_extractor->Name(); + ioptions_.prefix_extractor->Name(); } std::string val; @@ -105,7 +106,7 @@ PlainTableBuilder::PlainTableBuilder( [PlainTablePropertyNames::kEncodingType] = val; for (auto& collector_factories : - options.table_properties_collector_factories) { + ioptions.table_properties_collector_factories) { table_properties_collectors_.emplace_back( collector_factories->CreateTablePropertiesCollector()); } @@ -124,17 +125,18 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { // Store key hash if (store_index_in_file_) { - if (options_.prefix_extractor.get() == nullptr) { + if (ioptions_.prefix_extractor == nullptr) { keys_or_prefixes_hashes_.push_back(GetSliceHash(internal_key.user_key)); } else { Slice prefix = - options_.prefix_extractor->Transform(internal_key.user_key); + ioptions_.prefix_extractor->Transform(internal_key.user_key); keys_or_prefixes_hashes_.push_back(GetSliceHash(prefix)); } } // Write value - auto prev_offset = offset_; + assert(offset_ <= std::numeric_limits::max()); + auto prev_offset = static_cast(offset_); // Write out the key encoder_.AppendKey(key, file_, &offset_, meta_bytes_buf, &meta_bytes_buf_size); @@ -143,7 +145,7 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { } // Write value length - int value_size = value.size(); + uint32_t value_size = static_cast(value.size()); char* end_ptr = EncodeVarint32(meta_bytes_buf + meta_bytes_buf_size, value_size); assert(end_ptr <= meta_bytes_buf + sizeof(meta_bytes_buf)); @@ -160,7 +162,7 @@ void PlainTableBuilder::Add(const Slice& key, const Slice& value) { // notify property collectors NotifyCollectTableCollectorsOnAdd(key, value, table_properties_collectors_, - options_.info_log.get()); + ioptions_.info_log); } Status PlainTableBuilder::status() const { return 
status_; } @@ -181,9 +183,11 @@ Status PlainTableBuilder::Finish() { MetaIndexBuilder meta_index_builer; if (store_index_in_file_ && (properties_.num_entries > 0)) { + assert(properties_.num_entries <= std::numeric_limits::max()); bloom_block_.SetTotalBits( - &arena_, properties_.num_entries * bloom_bits_per_key_, - options_.bloom_locality, huge_page_tlb_size_, options_.info_log.get()); + &arena_, + static_cast(properties_.num_entries) * bloom_bits_per_key_, + ioptions_.bloom_locality, huge_page_tlb_size_, ioptions_.info_log); PutVarint32(&properties_.user_collected_properties [PlainTablePropertyNames::kNumBloomBlocks], @@ -224,7 +228,7 @@ Status PlainTableBuilder::Finish() { // -- Add user collected properties NotifyCollectTableCollectorsOnFinish(table_properties_collectors_, - options_.info_log.get(), + ioptions_.info_log, &property_block_builder); // -- Write property block @@ -254,7 +258,7 @@ Status PlainTableBuilder::Finish() { // Write Footer // no need to write out new footer if we're using default checksum - Footer footer(kLegacyPlainTableMagicNumber); + Footer footer(kLegacyPlainTableMagicNumber, 0); footer.set_metaindex_handle(metaindex_block_handle); footer.set_index_handle(BlockHandle::NullBlockHandle()); std::string footer_encoding; diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h index 2871d887e..8fc4f1fe4 100644 --- a/table/plain_table_builder.h +++ b/table/plain_table_builder.h @@ -30,7 +30,7 @@ class PlainTableBuilder: public TableBuilder { // caller to close the file after calling Finish(). The output file // will be part of level specified by 'level'. A value of -1 means // that the caller does not know which level the output file will reside. 
- PlainTableBuilder(const Options& options, WritableFile* file, + PlainTableBuilder(const ImmutableCFOptions& ioptions, WritableFile* file, uint32_t user_key_size, EncodingType encoding_type, size_t index_sparseness, uint32_t bloom_bits_per_key, uint32_t num_probes = 6, size_t huge_page_tlb_size = 0, @@ -71,7 +71,7 @@ class PlainTableBuilder: public TableBuilder { private: Arena arena_; - Options options_; + const ImmutableCFOptions& ioptions_; std::vector> table_properties_collectors_; @@ -81,7 +81,7 @@ class PlainTableBuilder: public TableBuilder { WritableFile* file_; uint64_t offset_ = 0; uint32_t bloom_bits_per_key_; - uint32_t huge_page_tlb_size_; + size_t huge_page_tlb_size_; Status status_; TableProperties properties_; PlainTableKeyEncoder encoder_; diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc index 145179bae..fae0d8018 100644 --- a/table/plain_table_factory.cc +++ b/table/plain_table_factory.cc @@ -14,22 +14,24 @@ namespace rocksdb { -Status PlainTableFactory::NewTableReader(const Options& options, - const EnvOptions& soptions, +Status PlainTableFactory::NewTableReader(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& icomp, unique_ptr&& file, uint64_t file_size, unique_ptr* table) const { - return PlainTableReader::Open(options, soptions, icomp, std::move(file), + return PlainTableReader::Open(ioptions, env_options, icomp, std::move(file), file_size, table, bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, huge_page_tlb_size_, full_scan_mode_); } TableBuilder* PlainTableFactory::NewTableBuilder( - const Options& options, const InternalKeyComparator& internal_comparator, - WritableFile* file, CompressionType compression_type) const { - return new PlainTableBuilder(options, file, user_key_len_, encoding_type_, + const ImmutableCFOptions& ioptions, + const InternalKeyComparator& internal_comparator, + WritableFile* file, const CompressionType, + const CompressionOptions&) 
const { + return new PlainTableBuilder(ioptions, file, user_key_len_, encoding_type_, index_sparseness_, bloom_bits_per_key_, 6, huge_page_tlb_size_, hash_table_ratio_, store_index_in_file_); @@ -50,10 +52,10 @@ std::string PlainTableFactory::GetPrintableTableOptions() const { snprintf(buffer, kBufferSize, " hash_table_ratio: %lf\n", hash_table_ratio_); ret.append(buffer); - snprintf(buffer, kBufferSize, " index_sparseness: %zd\n", + snprintf(buffer, kBufferSize, " index_sparseness: %zu\n", index_sparseness_); ret.append(buffer); - snprintf(buffer, kBufferSize, " huge_page_tlb_size: %zd\n", + snprintf(buffer, kBufferSize, " huge_page_tlb_size: %zu\n", huge_page_tlb_size_); ret.append(buffer); snprintf(buffer, kBufferSize, " encoding_type: %d\n", diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index 31e20b016..23b54f092 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -14,7 +14,6 @@ namespace rocksdb { -struct Options; struct EnvOptions; using std::unique_ptr; @@ -128,7 +127,7 @@ class TableBuilder; class PlainTableFactory : public TableFactory { public: ~PlainTableFactory() {} - // user_key_size is the length of the user key. If it is set to be + // user_key_len is the length of the user key. If it is set to be // kPlainTableVariableLength, then it means variable length. Otherwise, all // the keys need to have the fix length of this value. bloom_bits_per_key is // number of bits used for bloom filer per key. 
hash_table_ratio is @@ -154,23 +153,26 @@ class PlainTableFactory : public TableFactory { full_scan_mode_(options.full_scan_mode), store_index_in_file_(options.store_index_in_file) {} const char* Name() const override { return "PlainTable"; } - Status NewTableReader(const Options& options, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table) const override; - TableBuilder* NewTableBuilder(const Options& options, - const InternalKeyComparator& icomparator, - WritableFile* file, - CompressionType compression_type) const - override; + Status NewTableReader( + const ImmutableCFOptions& options, const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table) const override; + TableBuilder* NewTableBuilder( + const ImmutableCFOptions& options, + const InternalKeyComparator& icomparator, + WritableFile* file, + const CompressionType, + const CompressionOptions&) const override; std::string GetPrintableTableOptions() const override; static const char kValueTypeSeqId0 = 0xFF; // Sanitizes the specified DB Options. - Status SanitizeDBOptions(DBOptions* db_opts) const override { - if (db_opts->allow_mmap_reads == false) { + Status SanitizeOptions(const DBOptions& db_opts, + const ColumnFamilyOptions& cf_opts) const override { + if (db_opts.allow_mmap_reads == false) { return Status::NotSupported( "PlainTable with allow_mmap_reads == false is not supported."); } diff --git a/table/plain_table_index.cc b/table/plain_table_index.cc index efba9b71d..7ca451eb3 100644 --- a/table/plain_table_index.cc +++ b/table/plain_table_index.cc @@ -3,6 +3,14 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
+#ifndef ROCKSDB_LITE + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include + #include "table/plain_table_index.h" #include "util/coding.h" #include "util/hash.h" @@ -24,7 +32,8 @@ Status PlainTableIndex::InitFromRawData(Slice data) { if (!GetVarint32(&data, &num_prefixes_)) { return Status::Corruption("Couldn't read the index size!"); } - sub_index_size_ = data.size() - index_size_ * kOffsetLen; + sub_index_size_ = + static_cast(data.size()) - index_size_ * kOffsetLen; char* index_data_begin = const_cast(data.data()); index_ = reinterpret_cast(index_data_begin); @@ -48,7 +57,7 @@ PlainTableIndex::IndexSearchResult PlainTableIndex::GetOffset( } } -void PlainTableIndexBuilder::IndexRecordList::AddRecord(murmur_t hash, +void PlainTableIndexBuilder::IndexRecordList::AddRecord(uint32_t hash, uint32_t offset) { if (num_records_in_current_group_ == kNumRecordsPerGroup) { current_group_ = AllocateNewGroup(); @@ -61,7 +70,7 @@ void PlainTableIndexBuilder::IndexRecordList::AddRecord(murmur_t hash, } void PlainTableIndexBuilder::AddKeyPrefix(Slice key_prefix_slice, - uint64_t key_offset) { + uint32_t key_offset) { if (is_first_record_ || prev_key_prefix_ != key_prefix_slice.ToString()) { ++num_prefixes_; if (!is_first_record_) { @@ -93,7 +102,8 @@ Slice PlainTableIndexBuilder::Finish() { BucketizeIndexes(&hash_to_offsets, &entries_per_bucket); keys_per_prefix_hist_.Add(num_keys_per_prefix_); - Log(options_.info_log, "Number of Keys per prefix Histogram: %s", + Log(InfoLogLevel::INFO_LEVEL, ioptions_.info_log, + "Number of Keys per prefix Histogram: %s", keys_per_prefix_hist_.ToString().c_str()); // From the temp data structure, populate indexes. 
@@ -147,18 +157,19 @@ void PlainTableIndexBuilder::BucketizeIndexes( Slice PlainTableIndexBuilder::FillIndexes( const std::vector& hash_to_offsets, const std::vector& entries_per_bucket) { - Log(options_.info_log, "Reserving %zu bytes for plain table's sub_index", + Log(InfoLogLevel::DEBUG_LEVEL, ioptions_.info_log, + "Reserving %" PRIu32 " bytes for plain table's sub_index", sub_index_size_); auto total_allocate_size = GetTotalSize(); char* allocated = arena_->AllocateAligned( - total_allocate_size, huge_page_tlb_size_, options_.info_log.get()); + total_allocate_size, huge_page_tlb_size_, ioptions_.info_log); auto temp_ptr = EncodeVarint32(allocated, index_size_); uint32_t* index = reinterpret_cast(EncodeVarint32(temp_ptr, num_prefixes_)); char* sub_index = reinterpret_cast(index + index_size_); - size_t sub_index_offset = 0; + uint32_t sub_index_offset = 0; for (uint32_t i = 0; i < index_size_; i++) { uint32_t num_keys_for_bucket = entries_per_bucket[i]; switch (num_keys_for_bucket) { @@ -191,7 +202,8 @@ Slice PlainTableIndexBuilder::FillIndexes( } assert(sub_index_offset == sub_index_size_); - Log(options_.info_log, "hash table size: %d, suffix_map length %zu", + Log(InfoLogLevel::DEBUG_LEVEL, ioptions_.info_log, + "hash table size: %d, suffix_map length %zu", index_size_, sub_index_size_); return Slice(allocated, GetTotalSize()); } @@ -199,3 +211,5 @@ Slice PlainTableIndexBuilder::FillIndexes( const std::string PlainTableIndexBuilder::kPlainTableIndexBlock = "PlainTableIndexBlock"; }; // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/table/plain_table_index.h b/table/plain_table_index.h index f63bbd0d5..be8ad1639 100644 --- a/table/plain_table_index.h +++ b/table/plain_table_index.h @@ -5,6 +5,8 @@ #pragma once +#ifndef ROCKSDB_LITE + #include #include @@ -92,7 +94,7 @@ class PlainTableIndex { private: uint32_t index_size_; - size_t sub_index_size_; + uint32_t sub_index_size_; uint32_t num_prefixes_; uint32_t* index_; @@ -108,11 +110,11 @@ class 
PlainTableIndex { // #wiki-in-memory-index-format class PlainTableIndexBuilder { public: - PlainTableIndexBuilder(Arena* arena, const Options& options, - uint32_t index_sparseness, double hash_table_ratio, - double huge_page_tlb_size) + PlainTableIndexBuilder(Arena* arena, const ImmutableCFOptions& ioptions, + size_t index_sparseness, double hash_table_ratio, + size_t huge_page_tlb_size) : arena_(arena), - options_(options), + ioptions_(ioptions), record_list_(kRecordsPerGroup), is_first_record_(true), due_index_(false), @@ -120,11 +122,11 @@ class PlainTableIndexBuilder { num_keys_per_prefix_(0), prev_key_prefix_hash_(0), index_sparseness_(index_sparseness), - prefix_extractor_(options.prefix_extractor.get()), + prefix_extractor_(ioptions.prefix_extractor), hash_table_ratio_(hash_table_ratio), huge_page_tlb_size_(huge_page_tlb_size) {} - void AddKeyPrefix(Slice key_prefix_slice, uint64_t key_offset); + void AddKeyPrefix(Slice key_prefix_slice, uint32_t key_offset); Slice Finish(); @@ -156,7 +158,7 @@ class PlainTableIndexBuilder { } } - void AddRecord(murmur_t hash, uint32_t offset); + void AddRecord(uint32_t hash, uint32_t offset); size_t GetNumRecords() const { return (groups_.size() - 1) * kNumRecordsPerGroup + @@ -196,7 +198,7 @@ class PlainTableIndexBuilder { const std::vector& entries_per_bucket); Arena* arena_; - Options options_; + const ImmutableCFOptions ioptions_; HistogramImpl keys_per_prefix_hist_; IndexRecordList record_list_; bool is_first_record_; @@ -205,13 +207,13 @@ class PlainTableIndexBuilder { uint32_t num_keys_per_prefix_; uint32_t prev_key_prefix_hash_; - uint32_t index_sparseness_; + size_t index_sparseness_; uint32_t index_size_; - size_t sub_index_size_; + uint32_t sub_index_size_; const SliceTransform* prefix_extractor_; double hash_table_ratio_; - double huge_page_tlb_size_; + size_t huge_page_tlb_size_; std::string prev_key_prefix_; @@ -219,3 +221,5 @@ class PlainTableIndexBuilder { }; }; // namespace rocksdb + +#endif // ROCKSDB_LITE 
diff --git a/table/plain_table_key_coding.cc b/table/plain_table_key_coding.cc index eedf58aea..08d16f191 100644 --- a/table/plain_table_key_coding.cc +++ b/table/plain_table_key_coding.cc @@ -30,7 +30,7 @@ const unsigned char kSizeInlineLimit = 0x3F; size_t EncodeSize(EntryType type, uint32_t key_size, char* out_buffer) { out_buffer[0] = type << 6; - if (key_size < 0x3F) { + if (key_size < static_cast(kSizeInlineLimit)) { // size inlined out_buffer[0] |= static_cast(key_size); return 1; @@ -43,7 +43,7 @@ size_t EncodeSize(EntryType type, uint32_t key_size, char* out_buffer) { // Return position after the size byte(s). nullptr means error const char* DecodeSize(const char* offset, const char* limit, - EntryType* entry_type, size_t* key_size) { + EntryType* entry_type, uint32_t* key_size) { assert(offset < limit); *entry_type = static_cast( (static_cast(offset[0]) & ~kSizeInlineLimit) >> 6); @@ -73,10 +73,10 @@ Status PlainTableKeyEncoder::AppendKey(const Slice& key, WritableFile* file, Slice key_to_write = key; // Portion of internal key to write out. 
- size_t user_key_size = fixed_user_key_len_; + uint32_t user_key_size = fixed_user_key_len_; if (encoding_type_ == kPlain) { if (fixed_user_key_len_ == kPlainTableVariableLength) { - user_key_size = key.size() - 8; + user_key_size = static_cast(key.size() - 8); // Write key length char key_size_buf[5]; // tmp buffer for key size as varint32 char* ptr = EncodeVarint32(key_size_buf, user_key_size); @@ -93,13 +93,13 @@ Status PlainTableKeyEncoder::AppendKey(const Slice& key, WritableFile* file, char size_bytes[12]; size_t size_bytes_pos = 0; - user_key_size = key.size() - 8; + user_key_size = static_cast(key.size() - 8); Slice prefix = prefix_extractor_->Transform(Slice(key.data(), user_key_size)); - if (key_count_for_prefix == 0 || prefix != pre_prefix_.GetKey() || - key_count_for_prefix % index_sparseness_ == 0) { - key_count_for_prefix = 1; + if (key_count_for_prefix_ == 0 || prefix != pre_prefix_.GetKey() || + key_count_for_prefix_ % index_sparseness_ == 0) { + key_count_for_prefix_ = 1; pre_prefix_.SetKey(prefix); size_bytes_pos += EncodeSize(kFullKey, user_key_size, size_bytes); Status s = file->Append(Slice(size_bytes, size_bytes_pos)); @@ -108,14 +108,15 @@ Status PlainTableKeyEncoder::AppendKey(const Slice& key, WritableFile* file, } *offset += size_bytes_pos; } else { - key_count_for_prefix++; - if (key_count_for_prefix == 2) { + key_count_for_prefix_++; + if (key_count_for_prefix_ == 2) { // For second key within a prefix, need to encode prefix length size_bytes_pos += - EncodeSize(kPrefixFromPreviousKey, pre_prefix_.GetKey().size(), + EncodeSize(kPrefixFromPreviousKey, + static_cast(pre_prefix_.GetKey().size()), size_bytes + size_bytes_pos); } - size_t prefix_len = pre_prefix_.GetKey().size(); + uint32_t prefix_len = static_cast(pre_prefix_.GetKey().size()); size_bytes_pos += EncodeSize(kKeySuffix, user_key_size - prefix_len, size_bytes + size_bytes_pos); Status s = file->Append(Slice(size_bytes, size_bytes_pos)); @@ -184,7 +185,7 @@ Status 
PlainTableKeyDecoder::NextPlainEncodingKey( const char* start, const char* limit, ParsedInternalKey* parsed_key, Slice* internal_key, size_t* bytes_read, bool* seekable) { const char* key_ptr = start; - size_t user_key_size = 0; + uint32_t user_key_size = 0; if (fixed_user_key_len_ != kPlainTableVariableLength) { user_key_size = fixed_user_key_len_; key_ptr = start; @@ -195,7 +196,7 @@ Status PlainTableKeyDecoder::NextPlainEncodingKey( return Status::Corruption( "Unexpected EOF when reading the next key's size"); } - user_key_size = static_cast(tmp_size); + user_key_size = tmp_size; *bytes_read = key_ptr - start; } // dummy initial value to avoid compiler complain @@ -227,7 +228,7 @@ Status PlainTableKeyDecoder::NextPrefixEncodingKey( bool expect_suffix = false; do { - size_t size = 0; + uint32_t size = 0; // dummy initial value to avoid compiler complain bool decoded_internal_key_valid = true; const char* pos = DecodeSize(key_ptr, limit, &entry_type, &size); diff --git a/table/plain_table_key_coding.h b/table/plain_table_key_coding.h index ba66c2645..9047087ae 100644 --- a/table/plain_table_key_coding.h +++ b/table/plain_table_key_coding.h @@ -26,7 +26,7 @@ class PlainTableKeyEncoder { fixed_user_key_len_(user_key_len), prefix_extractor_(prefix_extractor), index_sparseness_((index_sparseness > 1) ? index_sparseness : 1), - key_count_for_prefix(0) {} + key_count_for_prefix_(0) {} // key: the key to write out, in the format of internal key. // file: the output file to write out // offset: offset in the file. 
Needs to be updated after appending bytes @@ -45,7 +45,7 @@ class PlainTableKeyEncoder { uint32_t fixed_user_key_len_; const SliceTransform* prefix_extractor_; const size_t index_sparseness_; - size_t key_count_for_prefix; + size_t key_count_for_prefix_; IterKey pre_prefix_; }; diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index b5eccd310..0cd73ac32 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -26,6 +26,7 @@ #include "table/two_level_iterator.h" #include "table/plain_table_factory.h" #include "table/plain_table_key_coding.h" +#include "table/get_context.h" #include "util/arena.h" #include "util/coding.h" @@ -35,6 +36,7 @@ #include "util/murmurhash.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" +#include "util/string_util.h" namespace rocksdb { @@ -87,7 +89,7 @@ class PlainTableIterator : public Iterator { }; extern const uint64_t kPlainTableMagicNumber; -PlainTableReader::PlainTableReader(const Options& options, +PlainTableReader::PlainTableReader(const ImmutableCFOptions& ioptions, unique_ptr&& file, const EnvOptions& storage_options, const InternalKeyComparator& icomparator, @@ -97,12 +99,12 @@ PlainTableReader::PlainTableReader(const Options& options, : internal_comparator_(icomparator), encoding_type_(encoding_type), full_scan_mode_(false), - data_end_offset_(table_properties->data_size), - user_key_len_(table_properties->fixed_key_len), - prefix_extractor_(options.prefix_extractor.get()), + data_end_offset_(static_cast(table_properties->data_size)), + user_key_len_(static_cast(table_properties->fixed_key_len)), + prefix_extractor_(ioptions.prefix_extractor), enable_bloom_(false), bloom_(6, nullptr), - options_(options), + ioptions_(ioptions), file_(std::move(file)), file_size_(file_size), table_properties_(nullptr) {} @@ -110,8 +112,8 @@ PlainTableReader::PlainTableReader(const Options& options, PlainTableReader::~PlainTableReader() { } -Status PlainTableReader::Open(const Options& 
options, - const EnvOptions& soptions, +Status PlainTableReader::Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, @@ -119,14 +121,14 @@ Status PlainTableReader::Open(const Options& options, const int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size, bool full_scan_mode) { - assert(options.allow_mmap_reads); + assert(ioptions.allow_mmap_reads); if (file_size > PlainTableIndex::kMaxFileSize) { return Status::NotSupported("File is too large for PlainTableReader!"); } TableProperties* props = nullptr; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - options.env, options.info_log.get(), &props); + ioptions.env, ioptions.info_log, &props); if (!s.ok()) { return s; } @@ -137,12 +139,12 @@ Status PlainTableReader::Open(const Options& options, user_props.find(PlainTablePropertyNames::kPrefixExtractorName); if (!full_scan_mode && prefix_extractor_in_file != user_props.end()) { - if (!options.prefix_extractor) { + if (!ioptions.prefix_extractor) { return Status::InvalidArgument( "Prefix extractor is missing when opening a PlainTable built " "using a prefix extractor"); } else if (prefix_extractor_in_file->second.compare( - options.prefix_extractor->Name()) != 0) { + ioptions.prefix_extractor->Name()) != 0) { return Status::InvalidArgument( "Prefix extractor given doesn't match the one used to build " "PlainTable"); @@ -158,8 +160,8 @@ Status PlainTableReader::Open(const Options& options, } std::unique_ptr new_reader(new PlainTableReader( - options, std::move(file), soptions, internal_comparator, encoding_type, - file_size, props)); + ioptions, std::move(file), env_options, internal_comparator, + encoding_type, file_size, props)); s = new_reader->MmapDataFile(); if (!s.ok()) { @@ -207,7 +209,7 @@ Status PlainTableReader::PopulateIndexRecordList( bool is_first_record = true; Slice 
key_prefix_slice; PlainTableKeyDecoder decoder(encoding_type_, user_key_len_, - options_.prefix_extractor.get()); + ioptions_.prefix_extractor); while (pos < data_end_offset_) { uint32_t key_offset = pos; ParsedInternalKey key; @@ -252,8 +254,8 @@ void PlainTableReader::AllocateAndFillBloom(int bloom_bits_per_key, uint32_t bloom_total_bits = num_prefixes * bloom_bits_per_key; if (bloom_total_bits > 0) { enable_bloom_ = true; - bloom_.SetTotalBits(&arena_, bloom_total_bits, options_.bloom_locality, - huge_page_tlb_size, options_.info_log.get()); + bloom_.SetTotalBits(&arena_, bloom_total_bits, ioptions_.bloom_locality, + huge_page_tlb_size, ioptions_.info_log); FillBloom(prefix_hashes); } } @@ -281,14 +283,14 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, BlockContents bloom_block_contents; auto s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber, - options_.env, BloomBlockBuilder::kBloomBlock, + ioptions_.env, BloomBlockBuilder::kBloomBlock, &bloom_block_contents); bool index_in_file = s.ok(); BlockContents index_block_contents; s = ReadMetaBlock(file_.get(), file_size_, kPlainTableMagicNumber, - options_.env, PlainTableIndexBuilder::kPlainTableIndexBlock, - &index_block_contents); + ioptions_.env, PlainTableIndexBuilder::kPlainTableIndexBlock, + &index_block_contents); index_in_file &= s.ok(); @@ -310,8 +312,9 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, index_block = nullptr; } - if ((options_.prefix_extractor.get() == nullptr) && (hash_table_ratio != 0)) { - // options.prefix_extractor is requried for a hash-based look-up. + if ((ioptions_.prefix_extractor == nullptr) && + (hash_table_ratio != 0)) { + // ioptions.prefix_extractor is requried for a hash-based look-up. return Status::NotSupported( "PlainTable requires a prefix extractor enable prefix hash mode."); } @@ -325,11 +328,12 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, // Allocate bloom filter here for total order mode. 
if (IsTotalOrderMode()) { uint32_t num_bloom_bits = - table_properties_->num_entries * bloom_bits_per_key; + static_cast(table_properties_->num_entries) * + bloom_bits_per_key; if (num_bloom_bits > 0) { enable_bloom_ = true; - bloom_.SetTotalBits(&arena_, num_bloom_bits, options_.bloom_locality, - huge_page_tlb_size, options_.info_log.get()); + bloom_.SetTotalBits(&arena_, num_bloom_bits, ioptions_.bloom_locality, + huge_page_tlb_size, ioptions_.info_log); } } } else { @@ -348,20 +352,20 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, bloom_.SetRawData( const_cast( reinterpret_cast(bloom_block->data())), - bloom_block->size() * 8, num_blocks); + static_cast(bloom_block->size()) * 8, num_blocks); } - PlainTableIndexBuilder index_builder(&arena_, options_, index_sparseness, + PlainTableIndexBuilder index_builder(&arena_, ioptions_, index_sparseness, hash_table_ratio, huge_page_tlb_size); std::vector prefix_hashes; if (!index_in_file) { - Status s = PopulateIndexRecordList(&index_builder, &prefix_hashes); + s = PopulateIndexRecordList(&index_builder, &prefix_hashes); if (!s.ok()) { return s; } } else { - Status s = index_.InitFromRawData(*index_block); + s = index_.InitFromRawData(*index_block); if (!s.ok()) { return s; } @@ -377,14 +381,14 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, // Fill two table properties. 
if (!index_in_file) { props->user_collected_properties["plain_table_hash_table_size"] = - std::to_string(index_.GetIndexSize() * PlainTableIndex::kOffsetLen); + ToString(index_.GetIndexSize() * PlainTableIndex::kOffsetLen); props->user_collected_properties["plain_table_sub_index_size"] = - std::to_string(index_.GetSubIndexSize()); + ToString(index_.GetSubIndexSize()); } else { props->user_collected_properties["plain_table_hash_table_size"] = - std::to_string(0); + ToString(0); props->user_collected_properties["plain_table_sub_index_size"] = - std::to_string(0); + ToString(0); } return Status::OK(); @@ -422,7 +426,7 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, uint32_t file_offset = GetFixed32Element(base_ptr, mid); size_t tmp; Status s = PlainTableKeyDecoder(encoding_type_, user_key_len_, - options_.prefix_extractor.get()) + ioptions_.prefix_extractor) .NextKey(file_data_.data() + file_offset, file_data_.data() + data_end_offset_, &mid_key, nullptr, &tmp); @@ -451,7 +455,7 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix, size_t tmp; uint32_t low_key_offset = GetFixed32Element(base_ptr, low); Status s = PlainTableKeyDecoder(encoding_type_, user_key_len_, - options_.prefix_extractor.get()) + ioptions_.prefix_extractor) .NextKey(file_data_.data() + low_key_offset, file_data_.data() + data_end_offset_, &low_key, nullptr, &tmp); @@ -507,7 +511,7 @@ Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset, return Status::Corruption( "Unexpected EOF when reading the next value's size."); } - *offset = *offset + (value_ptr - start) + value_size; + *offset = *offset + static_cast(value_ptr - start) + value_size; if (*offset > data_end_offset_) { return Status::Corruption("Unexpected EOF when reading the next value. 
"); } @@ -524,10 +528,7 @@ void PlainTableReader::Prepare(const Slice& target) { } Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, - void* arg, - bool (*saver)(void*, const ParsedInternalKey&, - const Slice&), - void (*mark_key_may_exist)(void*)) { + GetContext* get_context) { // Check bloom filter first. Slice prefix_slice; uint32_t prefix_hash; @@ -565,9 +566,9 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, } Slice found_value; PlainTableKeyDecoder decoder(encoding_type_, user_key_len_, - options_.prefix_extractor.get()); + ioptions_.prefix_extractor); while (offset < data_end_offset_) { - Status s = Next(&decoder, &offset, &found_key, nullptr, &found_value); + s = Next(&decoder, &offset, &found_key, nullptr, &found_value); if (!s.ok()) { return s; } @@ -579,8 +580,10 @@ Status PlainTableReader::Get(const ReadOptions& ro, const Slice& target, } prefix_match = true; } + // TODO(ljin): since we know the key comparison result here, + // can we enable the fast path? 
if (internal_comparator_.Compare(found_key, parsed_target) >= 0) { - if (!(*saver)(arg, found_key, found_value)) { + if (!get_context->SaveValue(found_key, found_value)) { break; } } diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index 4a626979a..9d0df974e 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -36,6 +36,7 @@ class TableCache; class TableReader; class InternalKeyComparator; class PlainTableKeyDecoder; +class GetContext; using std::unique_ptr; using std::unordered_map; @@ -52,7 +53,8 @@ extern const uint32_t kPlainTableVariableLength; // The implementation of IndexedTableReader requires output file is mmaped class PlainTableReader: public TableReader { public: - static Status Open(const Options& options, const EnvOptions& soptions, + static Status Open(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, unique_ptr&& file, uint64_t file_size, unique_ptr* table, @@ -64,10 +66,8 @@ class PlainTableReader: public TableReader { void Prepare(const Slice& target); - Status Get(const ReadOptions&, const Slice& key, void* arg, - bool (*result_handler)(void* arg, const ParsedInternalKey& k, - const Slice& v), - void (*mark_key_may_exist)(void*) = nullptr); + Status Get(const ReadOptions&, const Slice& key, + GetContext* get_context) override; uint64_t ApproximateOffsetOf(const Slice& key); @@ -82,8 +82,9 @@ class PlainTableReader: public TableReader { return arena_.MemoryAllocatedBytes(); } - PlainTableReader(const Options& options, unique_ptr&& file, - const EnvOptions& storage_options, + PlainTableReader(const ImmutableCFOptions& ioptions, + unique_ptr&& file, + const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, EncodingType encoding_type, uint64_t file_size, const TableProperties* table_properties); @@ -122,7 +123,7 @@ class PlainTableReader: public TableReader { // sst file that stores data. 
const uint32_t data_start_offset_ = 0; const uint32_t data_end_offset_; - const size_t user_key_len_; + const uint32_t user_key_len_; const SliceTransform* prefix_extractor_; static const size_t kNumInternalBytes = 8; @@ -132,9 +133,9 @@ class PlainTableReader: public TableReader { DynamicBloom bloom_; Arena arena_; - const Options& options_; + const ImmutableCFOptions& ioptions_; unique_ptr file_; - uint32_t file_size_; + uint64_t file_size_; std::shared_ptr table_properties_; bool IsFixedLength() const { diff --git a/table/table_properties.cc b/table/table_properties.cc index c7e141943..1ee34a671 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -3,9 +3,12 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. +#include "table/table_properties_internal.h" #include "rocksdb/table_properties.h" #include "rocksdb/iterator.h" #include "rocksdb/env.h" +#include "port/port.h" +#include "util/string_util.h" namespace rocksdb { @@ -30,7 +33,7 @@ namespace { const std::string& prop_delim, const std::string& kv_delim) { AppendProperty( - props, key, std::to_string(value), prop_delim, kv_delim + props, key, ToString(value), prop_delim, kv_delim ); } } diff --git a/table/table_properties_internal.h b/table/table_properties_internal.h new file mode 100644 index 000000000..9ef8ad432 --- /dev/null +++ b/table/table_properties_internal.h @@ -0,0 +1,18 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include "rocksdb/status.h" +#include "rocksdb/iterator.h" + +namespace rocksdb { + +// Seek to the properties block. 
+// If it successfully seeks to the properties block, "is_found" will be +// set to true. +Status SeekToPropertiesBlock(Iterator* meta_iter, bool* is_found); + +} // namespace rocksdb diff --git a/table/table_reader.h b/table/table_reader.h index 22f5a859e..d3801442e 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -18,6 +18,7 @@ class Slice; class Arena; struct ReadOptions; struct TableProperties; +class GetContext; // A Table is a sorted map from strings to strings. Tables are // immutable and persistent. A Table may be safely accessed from @@ -55,23 +56,22 @@ class TableReader { // Report an approximation of how much memory has been used. virtual size_t ApproximateMemoryUsage() const = 0; - // Calls (*result_handler)(handle_context, ...) repeatedly, starting with - // the entry found after a call to Seek(key), until result_handler returns - // false, where k is the actual internal key for a row found and v as the - // value of the key. May not make such a call if filter policy says that key - // is not present. + // Calls get_context->SaveValue() repeatedly, starting with + // the entry found after a call to Seek(key), until it returns false. + // May not make such a call if filter policy says that key is not present. // - // mark_key_may_exist_handler needs to be called when it is configured to be - // memory only and the key is not found in the block cache, with - // the parameter to be handle_context. + // get_context->MarkKeyMayExist needs to be called when it is configured to be + // memory only and the key is not found in the block cache. 
// // readOptions is the options for the read // key is the key to search for - virtual Status Get( - const ReadOptions& readOptions, const Slice& key, void* handle_context, - bool (*result_handler)(void* arg, const ParsedInternalKey& k, - const Slice& v), - void (*mark_key_may_exist_handler)(void* handle_context) = nullptr) = 0; + virtual Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context) = 0; + + // convert db file to a human readable form + virtual Status DumpTable(WritableFile* out_file) { + return Status::NotSupported("DumpTable() not supported"); + } }; } // namespace rocksdb diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index ed2c7c52d..a75424e82 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -18,10 +18,10 @@ int main() { #include "rocksdb/table.h" #include "db/db_impl.h" #include "db/dbformat.h" -#include "port/atomic_pointer.h" #include "table/block_based_table_factory.h" #include "table/plain_table_factory.h" #include "table/table_builder.h" +#include "table/get_context.h" #include "util/histogram.h" #include "util/testharness.h" #include "util/testutil.h" @@ -48,11 +48,6 @@ static std::string MakeKey(int i, int j, bool through_db) { return key.Encode().ToString(); } -static bool DummySaveValue(void* arg, const ParsedInternalKey& ikey, - const Slice& v) { - return false; -} - uint64_t Now(Env* env, bool measured_by_nanosecond) { return measured_by_nanosecond ? 
env->NowNanos() : env->NowMicros(); } @@ -88,10 +83,12 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, TableBuilder* tb = nullptr; DB* db = nullptr; Status s; + const ImmutableCFOptions ioptions(opts); if (!through_db) { env->NewWritableFile(file_name, &file, env_options); - tb = opts.table_factory->NewTableBuilder(opts, ikc, file.get(), - CompressionType::kNoCompression); + tb = opts.table_factory->NewTableBuilder(ioptions, ikc, file.get(), + CompressionType::kNoCompression, + CompressionOptions()); } else { s = DB::Open(opts, dbname, &db); ASSERT_OK(s); @@ -118,18 +115,17 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, unique_ptr table_reader; unique_ptr raf; if (!through_db) { - Status s = env->NewRandomAccessFile(file_name, &raf, env_options); + s = env->NewRandomAccessFile(file_name, &raf, env_options); uint64_t file_size; env->GetFileSize(file_name, &file_size); s = opts.table_factory->NewTableReader( - opts, env_options, ikc, std::move(raf), file_size, &table_reader); + ioptions, env_options, ikc, std::move(raf), file_size, &table_reader); } Random rnd(301); std::string result; HistogramImpl hist; - void* arg = nullptr; for (int it = 0; it < num_iter; it++) { for (int i = 0; i < num_keys1; i++) { for (int j = 0; j < num_keys2; j++) { @@ -145,8 +141,13 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, std::string key = MakeKey(r1, r2, through_db); uint64_t start_time = Now(env, measured_by_nanosecond); if (!through_db) { - s = table_reader->Get(read_options, key, arg, DummySaveValue, - nullptr); + std::string value; + MergeContext merge_context; + GetContext get_context(ioptions.comparator, ioptions.merge_operator, + ioptions.info_log, ioptions.statistics, + GetContext::kNotFound, Slice(key), &value, + nullptr, &merge_context); + s = table_reader->Get(read_options, key, &get_context); } else { s = db->Get(read_options, key, &result); } @@ -256,11 +257,18 @@ int main(int argc, char** argv) { 
options.compression = rocksdb::CompressionType::kNoCompression; if (FLAGS_table_factory == "cuckoo_hash") { +#ifndef ROCKSDB_LITE options.allow_mmap_reads = true; env_options.use_mmap_reads = true; - - tf.reset(rocksdb::NewCuckooTableFactory(0.75)); + rocksdb::CuckooTableOptions table_options; + table_options.hash_table_ratio = 0.75; + tf.reset(rocksdb::NewCuckooTableFactory(table_options)); +#else + fprintf(stderr, "Cuckoo table is not supported in lite mode\n"); + exit(1); +#endif // ROCKSDB_LITE } else if (FLAGS_table_factory == "plain_table") { +#ifndef ROCKSDB_LITE options.allow_mmap_reads = true; env_options.use_mmap_reads = true; @@ -272,6 +280,10 @@ int main(int argc, char** argv) { tf.reset(new rocksdb::PlainTableFactory(plain_table_options)); options.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform( FLAGS_prefix_len)); +#else + fprintf(stderr, "Plain table is not supported in lite mode\n"); + exit(1); +#endif // ROCKSDB_LITE } else if (FLAGS_table_factory == "block_based") { tf.reset(new rocksdb::BlockBasedTableFactory()); } else { diff --git a/table/table_test.cc b/table/table_test.cc index 929cdf832..31883c3c7 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -20,6 +20,7 @@ #include "db/dbformat.h" #include "db/memtable.h" #include "db/write_batch_internal.h" +#include "db/writebuffer.h" #include "rocksdb/cache.h" #include "rocksdb/db.h" @@ -37,11 +38,14 @@ #include "table/format.h" #include "table/meta_blocks.h" #include "table/plain_table_factory.h" +#include "table/get_context.h" +#include "util/compression.h" #include "util/random.h" #include "util/statistics.h" #include "util/testharness.h" #include "util/testutil.h" +#include "util/scoped_arena_iterator.h" namespace rocksdb { @@ -194,6 +198,7 @@ class Constructor { // been added so far. 
Returns the keys in sorted order in "*keys" // and stores the key/value pairs in "*kvmap" void Finish(const Options& options, + const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, std::vector* keys, KVMap* kvmap) { @@ -206,12 +211,14 @@ class Constructor { keys->push_back(it->first); } data_.clear(); - Status s = FinishImpl(options, table_options, internal_comparator, *kvmap); + Status s = FinishImpl(options, ioptions, table_options, + internal_comparator, *kvmap); ASSERT_TRUE(s.ok()) << s.ToString(); } // Construct the data structure from the data in "data" virtual Status FinishImpl(const Options& options, + const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, const KVMap& data) = 0; @@ -220,8 +227,12 @@ class Constructor { virtual const KVMap& data() { return data_; } + virtual bool IsArenaMode() const { return false; } + virtual DB* db() const { return nullptr; } // Overridden in DBConstructor + virtual bool AnywayDeleteIterator() const { return false; } + protected: const InternalKeyComparator* last_internal_key_; @@ -239,26 +250,23 @@ class BlockConstructor: public Constructor { delete block_; } virtual Status FinishImpl(const Options& options, + const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, - const KVMap& data) { + const KVMap& kv_map) { delete block_; block_ = nullptr; - BlockBuilder builder(table_options.block_restart_interval, - &internal_comparator); + BlockBuilder builder(table_options.block_restart_interval); - for (KVMap::const_iterator it = data.begin(); - it != data.end(); - ++it) { - builder.Add(it->first, it->second); + for (const auto kv : kv_map) { + builder.Add(kv.first, kv.second); } // Open the block data_ = builder.Finish().ToString(); BlockContents contents; contents.data = data_; contents.cachable = 
false; - contents.heap_allocated = false; - block_ = new Block(contents); + block_ = new Block(std::move(contents)); return Status::OK(); } virtual Iterator* NewIterator() const { @@ -276,8 +284,15 @@ class BlockConstructor: public Constructor { // A helper class that converts internal format keys into user keys class KeyConvertingIterator: public Iterator { public: - explicit KeyConvertingIterator(Iterator* iter) : iter_(iter) { } - virtual ~KeyConvertingIterator() { delete iter_; } + KeyConvertingIterator(Iterator* iter, bool arena_mode = false) + : iter_(iter), arena_mode_(arena_mode) {} + virtual ~KeyConvertingIterator() { + if (arena_mode_) { + iter_->~Iterator(); + } else { + delete iter_; + } + } virtual bool Valid() const { return iter_->Valid(); } virtual void Seek(const Slice& target) { ParsedInternalKey ikey(target, kMaxSequenceNumber, kTypeValue); @@ -292,12 +307,12 @@ class KeyConvertingIterator: public Iterator { virtual Slice key() const { assert(Valid()); - ParsedInternalKey key; - if (!ParseInternalKey(iter_->key(), &key)) { + ParsedInternalKey parsed_key; + if (!ParseInternalKey(iter_->key(), &parsed_key)) { status_ = Status::Corruption("malformed internal key"); return Slice("corrupted key"); } - return key.user_key; + return parsed_key.user_key; } virtual Slice value() const { return iter_->value(); } @@ -308,6 +323,7 @@ class KeyConvertingIterator: public Iterator { private: mutable Status status_; Iterator* iter_; + bool arena_mode_; // No copying allowed KeyConvertingIterator(const KeyConvertingIterator&); @@ -323,25 +339,25 @@ class TableConstructor: public Constructor { ~TableConstructor() { Reset(); } virtual Status FinishImpl(const Options& options, + const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, - const KVMap& data) { + const KVMap& kv_map) { Reset(); sink_.reset(new StringSink()); unique_ptr builder; - 
builder.reset(options.table_factory->NewTableBuilder( - options, internal_comparator, sink_.get(), options.compression)); + builder.reset(ioptions.table_factory->NewTableBuilder( + ioptions, internal_comparator, sink_.get(), options.compression, + CompressionOptions())); - for (KVMap::const_iterator it = data.begin(); - it != data.end(); - ++it) { + for (const auto kv : kv_map) { if (convert_to_internal_key_) { - ParsedInternalKey ikey(it->first, kMaxSequenceNumber, kTypeValue); + ParsedInternalKey ikey(kv.first, kMaxSequenceNumber, kTypeValue); std::string encoded; AppendInternalKey(&encoded, ikey); - builder->Add(encoded, it->second); + builder->Add(encoded, kv.second); } else { - builder->Add(it->first, it->second); + builder->Add(kv.first, kv.second); } ASSERT_TRUE(builder->status().ok()); } @@ -353,9 +369,9 @@ class TableConstructor: public Constructor { // Open the table uniq_id_ = cur_uniq_id_++; source_.reset(new StringSource(sink_->contents(), uniq_id_, - options.allow_mmap_reads)); - return options.table_factory->NewTableReader( - options, soptions, internal_comparator, std::move(source_), + ioptions.allow_mmap_reads)); + return ioptions.table_factory->NewTableReader( + ioptions, soptions, internal_comparator, std::move(source_), sink_->contents().size(), &table_reader_); } @@ -373,12 +389,12 @@ class TableConstructor: public Constructor { return table_reader_->ApproximateOffsetOf(key); } - virtual Status Reopen(const Options& options) { + virtual Status Reopen(const ImmutableCFOptions& ioptions) { source_.reset( new StringSource(sink_->contents(), uniq_id_, - options.allow_mmap_reads)); - return options.table_factory->NewTableReader( - options, soptions, *last_internal_key_, std::move(source_), + ioptions.allow_mmap_reads)); + return ioptions.table_factory->NewTableReader( + ioptions, soptions, *last_internal_key_, std::move(source_), sink_->contents().size(), &table_reader_); } @@ -386,6 +402,10 @@ class TableConstructor: public Constructor { return 
table_reader_.get(); } + virtual bool AnywayDeleteIterator() const override { + return convert_to_internal_key_; + } + private: void Reset() { uniq_id_ = 0; @@ -393,12 +413,12 @@ class TableConstructor: public Constructor { sink_.reset(); source_.reset(); } - bool convert_to_internal_key_; uint64_t uniq_id_; unique_ptr sink_; unique_ptr source_; unique_ptr table_reader_; + bool convert_to_internal_key_; TableConstructor(); @@ -409,42 +429,51 @@ uint64_t TableConstructor::cur_uniq_id_ = 1; class MemTableConstructor: public Constructor { public: - explicit MemTableConstructor(const Comparator* cmp) + explicit MemTableConstructor(const Comparator* cmp, WriteBuffer* wb) : Constructor(cmp), internal_comparator_(cmp), + write_buffer_(wb), table_factory_(new SkipListFactory) { - Options options; - options.memtable_factory = table_factory_; - memtable_ = new MemTable(internal_comparator_, options); + options_.memtable_factory = table_factory_; + ImmutableCFOptions ioptions(options_); + memtable_ = new MemTable(internal_comparator_, ioptions, + MutableCFOptions(options_, ioptions), wb); memtable_->Ref(); } ~MemTableConstructor() { delete memtable_->Unref(); } - virtual Status FinishImpl(const Options& options, + virtual Status FinishImpl(const Options&, const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, - const KVMap& data) { + const KVMap& kv_map) { delete memtable_->Unref(); - Options memtable_options; - memtable_options.memtable_factory = table_factory_; - memtable_ = new MemTable(internal_comparator_, memtable_options); + ImmutableCFOptions mem_ioptions(ioptions); + memtable_ = new MemTable(internal_comparator_, mem_ioptions, + MutableCFOptions(options_, mem_ioptions), + write_buffer_); memtable_->Ref(); int seq = 1; - for (KVMap::const_iterator it = data.begin(); - it != data.end(); - ++it) { - memtable_->Add(seq, kTypeValue, it->first, it->second); + for (const auto kv : kv_map) { + 
memtable_->Add(seq, kTypeValue, kv.first, kv.second); seq++; } return Status::OK(); } virtual Iterator* NewIterator() const { - return new KeyConvertingIterator(memtable_->NewIterator(ReadOptions())); + return new KeyConvertingIterator( + memtable_->NewIterator(ReadOptions(), &arena_), true); } + virtual bool AnywayDeleteIterator() const override { return true; } + + virtual bool IsArenaMode() const override { return true; } + private: + mutable Arena arena_; InternalKeyComparator internal_comparator_; + Options options_; + WriteBuffer* write_buffer_; MemTable* memtable_; std::shared_ptr table_factory_; }; @@ -461,17 +490,16 @@ class DBConstructor: public Constructor { delete db_; } virtual Status FinishImpl(const Options& options, + const ImmutableCFOptions& ioptions, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, - const KVMap& data) { + const KVMap& kv_map) { delete db_; db_ = nullptr; NewDB(); - for (KVMap::const_iterator it = data.begin(); - it != data.end(); - ++it) { + for (const auto kv : kv_map) { WriteBatch batch; - batch.Put(it->first, it->second); + batch.Put(kv.first, kv.second); ASSERT_TRUE(db_->Write(WriteOptions(), &batch).ok()); } return Status::OK(); @@ -506,9 +534,8 @@ static bool SnappyCompressionSupported() { #ifdef SNAPPY std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::Snappy_Compress(Options().compression_opts, - in.data(), in.size(), - &out); + return Snappy_Compress(Options().compression_opts, in.data(), in.size(), + &out); #else return false; #endif @@ -518,9 +545,8 @@ static bool ZlibCompressionSupported() { #ifdef ZLIB std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::Zlib_Compress(Options().compression_opts, - in.data(), in.size(), - &out); + return Zlib_Compress(Options().compression_opts, 2, in.data(), in.size(), + &out); #else return false; #endif @@ -530,9 +556,8 @@ static bool BZip2CompressionSupported() { #ifdef BZIP2 
std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::BZip2_Compress(Options().compression_opts, - in.data(), in.size(), - &out); + return BZip2_Compress(Options().compression_opts, 2, in.data(), in.size(), + &out); #else return false; #endif @@ -542,8 +567,8 @@ static bool LZ4CompressionSupported() { #ifdef LZ4 std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::LZ4_Compress(Options().compression_opts, in.data(), in.size(), - &out); + return LZ4_Compress(Options().compression_opts, 2, in.data(), in.size(), + &out); #else return false; #endif @@ -553,8 +578,8 @@ static bool LZ4HCCompressionSupported() { #ifdef LZ4 std::string out; Slice in = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; - return port::LZ4HC_Compress(Options().compression_opts, in.data(), in.size(), - &out); + return LZ4HC_Compress(Options().compression_opts, 2, in.data(), in.size(), + &out); #else return false; #endif @@ -575,6 +600,7 @@ struct TestArgs { bool reverse_compare; int restart_interval; CompressionType compression; + uint32_t format_version; }; static std::vector GenerateArgList() { @@ -588,22 +614,26 @@ static std::vector GenerateArgList() { std::vector restart_intervals = {16, 1, 1024}; // Only add compression if it is supported - std::vector compression_types; - compression_types.push_back(kNoCompression); + std::vector> compression_types; + compression_types.emplace_back(kNoCompression, false); if (SnappyCompressionSupported()) { - compression_types.push_back(kSnappyCompression); + compression_types.emplace_back(kSnappyCompression, false); } if (ZlibCompressionSupported()) { - compression_types.push_back(kZlibCompression); + compression_types.emplace_back(kZlibCompression, false); + compression_types.emplace_back(kZlibCompression, true); } if (BZip2CompressionSupported()) { - compression_types.push_back(kBZip2Compression); + compression_types.emplace_back(kBZip2Compression, false); + compression_types.emplace_back(kBZip2Compression, true); } if 
(LZ4CompressionSupported()) { - compression_types.push_back(kLZ4Compression); + compression_types.emplace_back(kLZ4Compression, false); + compression_types.emplace_back(kLZ4Compression, true); } if (LZ4HCCompressionSupported()) { - compression_types.push_back(kLZ4HCCompression); + compression_types.emplace_back(kLZ4HCCompression, false); + compression_types.emplace_back(kLZ4HCCompression, true); } for (auto test_type : test_types) { @@ -615,7 +645,7 @@ static std::vector GenerateArgList() { one_arg.type = test_type; one_arg.reverse_compare = reverse_compare; one_arg.restart_interval = restart_intervals[0]; - one_arg.compression = compression_types[0]; + one_arg.compression = compression_types[0].first; test_args.push_back(one_arg); continue; } @@ -626,7 +656,8 @@ static std::vector GenerateArgList() { one_arg.type = test_type; one_arg.reverse_compare = reverse_compare; one_arg.restart_interval = restart_interval; - one_arg.compression = compression_type; + one_arg.compression = compression_type.first; + one_arg.format_version = compression_type.second ? 
2 : 1; test_args.push_back(one_arg); } } @@ -671,7 +702,9 @@ class FixedOrLessPrefixTransform : public SliceTransform { class Harness { public: - Harness() : constructor_(nullptr) { } + Harness() + : ioptions_(options_), constructor_(nullptr), + write_buffer_(options_.db_write_buffer_size) {} void Init(const TestArgs& args) { delete constructor_; @@ -695,6 +728,7 @@ class Harness { new FlushBlockBySizePolicyFactory()); table_options_.block_size = 256; table_options_.block_restart_interval = args.restart_interval; + table_options_.format_version = args.format_version; options_.table_factory.reset( new BlockBasedTableFactory(table_options_)); constructor_ = new TableConstructor(options_.comparator); @@ -748,7 +782,8 @@ class Harness { table_options_.block_size = 256; options_.table_factory.reset( new BlockBasedTableFactory(table_options_)); - constructor_ = new MemTableConstructor(options_.comparator); + constructor_ = new MemTableConstructor(options_.comparator, + &write_buffer_); break; case DB_TEST: table_options_.block_size = 256; @@ -757,6 +792,7 @@ class Harness { constructor_ = new DBConstructor(options_.comparator); break; } + ioptions_ = ImmutableCFOptions(options_); } ~Harness() { @@ -770,8 +806,8 @@ class Harness { void Test(Random* rnd) { std::vector keys; KVMap data; - constructor_->Finish(options_, table_options_, *internal_comparator_, - &keys, &data); + constructor_->Finish(options_, ioptions_, table_options_, + *internal_comparator_, &keys, &data); TestForwardScan(keys, data); if (support_prev_) { @@ -792,7 +828,11 @@ class Harness { iter->Next(); } ASSERT_TRUE(!iter->Valid()); - delete iter; + if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { + iter->~Iterator(); + } else { + delete iter; + } } void TestBackwardScan(const std::vector& keys, @@ -807,7 +847,11 @@ class Harness { iter->Prev(); } ASSERT_TRUE(!iter->Valid()); - delete iter; + if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { + 
iter->~Iterator(); + } else { + delete iter; + } } void TestRandomAccess(Random* rnd, @@ -877,7 +921,11 @@ class Harness { } } } - delete iter; + if (constructor_->IsArenaMode() && !constructor_->AnywayDeleteIterator()) { + iter->~Iterator(); + } else { + delete iter; + } } std::string ToString(const KVMap& data, const KVMap::const_iterator& it) { @@ -909,7 +957,7 @@ class Harness { if (keys.empty()) { return "foo"; } else { - const int index = rnd->Uniform(keys.size()); + const int index = rnd->Uniform(static_cast(keys.size())); std::string result = keys[index]; switch (rnd->Uniform(support_prev_ ? 3 : 1)) { case 0: @@ -940,8 +988,10 @@ class Harness { private: Options options_ = Options(); + ImmutableCFOptions ioptions_; BlockBasedTableOptions table_options_ = BlockBasedTableOptions(); Constructor* constructor_; + WriteBuffer write_buffer_; bool support_prev_; bool only_support_prefix_seek_; shared_ptr internal_comparator_; @@ -1000,9 +1050,9 @@ TEST(TablePropertyTest, PrefixScanTest) { pos->first.compare(0, prefix.size(), prefix) == 0; ++pos) { ++num; - auto key = prefix + "." + std::to_string(num); + auto key = prefix + "." + ToString(num); ASSERT_EQ(key, pos->first); - ASSERT_EQ(std::to_string(num), pos->second); + ASSERT_EQ(ToString(num), pos->second); } ASSERT_EQ(3, num); } @@ -1039,7 +1089,8 @@ TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) { table_options.block_restart_interval = 1; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - c.Finish(options, table_options, + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto& props = *c.GetTableReader()->GetTableProperties(); @@ -1054,7 +1105,7 @@ TEST(BlockBasedTableTest, BasicBlockBasedTableProperties) { ASSERT_EQ("", props.filter_policy_name); // no filter policy is used // Verify data size. 
- BlockBuilder block_builder(1, options.comparator); + BlockBuilder block_builder(1); for (const auto& item : kvmap) { block_builder.Add(item.first, item.second); } @@ -1072,7 +1123,8 @@ TEST(BlockBasedTableTest, FilterPolicyNameProperties) { Options options; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - c.Finish(options, table_options, + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto& props = *c.GetTableReader()->GetTableProperties(); ASSERT_EQ("rocksdb.BuiltinBloomFilter", props.filter_policy_name); @@ -1123,7 +1175,8 @@ TEST(BlockBasedTableTest, TotalOrderSeekOnHashIndex) { c.Add("cccc2", std::string('a', 56)); std::vector keys; KVMap kvmap; - c.Finish(options, table_options, + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); auto props = c.GetTableReader()->GetTableProperties(); ASSERT_EQ(7u, props->num_data_blocks); @@ -1167,7 +1220,7 @@ static std::string RandomString(Random* rnd, int len) { return r; } -void AddInternalKey(TableConstructor* c, const std::string prefix, +void AddInternalKey(TableConstructor* c, const std::string& prefix, int suffix_len = 800) { static Random rnd(1023); InternalKey k(prefix + RandomString(&rnd, 800), 0, kTypeValue); @@ -1207,7 +1260,8 @@ TEST(TableTest, HashIndexTest) { std::unique_ptr comparator( new InternalKeyComparator(BytewiseComparator())); - c.Finish(options, table_options, *comparator, &keys, &kvmap); + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, *comparator, &keys, &kvmap); auto reader = c.GetTableReader(); auto props = reader->GetTableProperties(); @@ -1315,7 +1369,8 @@ TEST(BlockBasedTableTest, IndexSizeStat) { table_options.block_restart_interval = 1; options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - 
c.Finish(options, table_options, + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, GetPlainInternalComparator(options.comparator), &ks, &kvmap); auto index_size = c.GetTableReader()->GetTableProperties()->index_size; ASSERT_GT(index_size, last_index_size); @@ -1341,7 +1396,8 @@ TEST(BlockBasedTableTest, NumBlockStat) { std::vector ks; KVMap kvmap; - c.Finish(options, table_options, + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, GetPlainInternalComparator(options.comparator), &ks, &kvmap); ASSERT_EQ(kvmap.size(), c.GetTableReader()->GetTableProperties()->num_data_blocks); @@ -1362,31 +1418,32 @@ class BlockCachePropertiesSnapshot { filter_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT); } - void AssertIndexBlockStat(int64_t index_block_cache_miss, - int64_t index_block_cache_hit) { - ASSERT_EQ(index_block_cache_miss, this->index_block_cache_miss); - ASSERT_EQ(index_block_cache_hit, this->index_block_cache_hit); + void AssertIndexBlockStat(int64_t expected_index_block_cache_miss, + int64_t expected_index_block_cache_hit) { + ASSERT_EQ(expected_index_block_cache_miss, index_block_cache_miss); + ASSERT_EQ(expected_index_block_cache_hit, index_block_cache_hit); } - void AssertFilterBlockStat(int64_t filter_block_cache_miss, - int64_t filter_block_cache_hit) { - ASSERT_EQ(filter_block_cache_miss, this->filter_block_cache_miss); - ASSERT_EQ(filter_block_cache_hit, this->filter_block_cache_hit); + void AssertFilterBlockStat(int64_t expected_filter_block_cache_miss, + int64_t expected_filter_block_cache_hit) { + ASSERT_EQ(expected_filter_block_cache_miss, filter_block_cache_miss); + ASSERT_EQ(expected_filter_block_cache_hit, filter_block_cache_hit); } // Check if the fetched props matches the expected ones. // TODO(kailiu) Use this only when you disabled filter policy! 
- void AssertEqual(int64_t index_block_cache_miss, - int64_t index_block_cache_hit, int64_t data_block_cache_miss, - int64_t data_block_cache_hit) const { - ASSERT_EQ(index_block_cache_miss, this->index_block_cache_miss); - ASSERT_EQ(index_block_cache_hit, this->index_block_cache_hit); - ASSERT_EQ(data_block_cache_miss, this->data_block_cache_miss); - ASSERT_EQ(data_block_cache_hit, this->data_block_cache_hit); - ASSERT_EQ(index_block_cache_miss + data_block_cache_miss, - this->block_cache_miss); - ASSERT_EQ(index_block_cache_hit + data_block_cache_hit, - this->block_cache_hit); + void AssertEqual(int64_t expected_index_block_cache_miss, + int64_t expected_index_block_cache_hit, + int64_t expected_data_block_cache_miss, + int64_t expected_data_block_cache_hit) const { + ASSERT_EQ(expected_index_block_cache_miss, index_block_cache_miss); + ASSERT_EQ(expected_index_block_cache_hit, index_block_cache_hit); + ASSERT_EQ(expected_data_block_cache_miss, data_block_cache_miss); + ASSERT_EQ(expected_data_block_cache_hit, data_block_cache_hit); + ASSERT_EQ(expected_index_block_cache_miss + expected_data_block_cache_miss, + block_cache_miss); + ASSERT_EQ(expected_index_block_cache_hit + expected_data_block_cache_hit, + block_cache_hit); } private: @@ -1407,8 +1464,6 @@ TEST(BlockBasedTableTest, BlockCacheDisabledTest) { options.create_if_missing = true; options.statistics = CreateDBStatistics(); BlockBasedTableOptions table_options; - // Intentionally commented out: table_options.cache_index_and_filter_blocks = - // true; table_options.block_cache = NewLRUCache(1024); table_options.filter_policy.reset(NewBloomFilterPolicy(10)); options.table_factory.reset(new BlockBasedTableFactory(table_options)); @@ -1417,7 +1472,8 @@ TEST(BlockBasedTableTest, BlockCacheDisabledTest) { TableConstructor c(BytewiseComparator(), true); c.Add("key", "value"); - c.Finish(options, table_options, + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, 
GetPlainInternalComparator(options.comparator), &keys, &kvmap); // preloading filter/index blocks is enabled. @@ -1433,8 +1489,11 @@ TEST(BlockBasedTableTest, BlockCacheDisabledTest) { } { + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, Slice(), nullptr, + nullptr, nullptr); // a hack that just to trigger BlockBasedTable::GetFilter. - reader->Get(ReadOptions(), "non-exist-key", nullptr, nullptr, nullptr); + reader->Get(ReadOptions(), "non-exist-key", &get_context); BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertIndexBlockStat(0, 0); props.AssertFilterBlockStat(0, 0); @@ -1459,10 +1518,11 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { TableConstructor c(BytewiseComparator()); c.Add("key", "value"); - c.Finish(options, table_options, + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); // preloading filter/index blocks is prohibited. - auto reader = dynamic_cast(c.GetTableReader()); + auto* reader = dynamic_cast(c.GetTableReader()); ASSERT_TRUE(!reader->TEST_filter_block_preloaded()); ASSERT_TRUE(!reader->TEST_index_reader_preloaded()); @@ -1508,36 +1568,20 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { // release the iterator so that the block cache can reset correctly. 
iter.reset(); - // -- PART 2: Open without block cache - table_options.no_block_cache = true; - table_options.block_cache.reset(); - options.table_factory.reset(new BlockBasedTableFactory(table_options)); - options.statistics = CreateDBStatistics(); // reset the stats - c.Reopen(options); - table_options.no_block_cache = false; - - { - iter.reset(c.NewIterator()); - iter->SeekToFirst(); - ASSERT_EQ("key", iter->key().ToString()); - BlockCachePropertiesSnapshot props(options.statistics.get()); - // Nothing is affected at all - props.AssertEqual(0, 0, 0, 0); - } - - // -- PART 3: Open with very small block cache + // -- PART 2: Open with very small block cache // In this test, no block will ever get hit since the block cache is // too small to fit even one entry. table_options.block_cache = NewLRUCache(1); + options.statistics = CreateDBStatistics(); options.table_factory.reset(new BlockBasedTableFactory(table_options)); - c.Reopen(options); + const ImmutableCFOptions ioptions2(options); + c.Reopen(ioptions2); { BlockCachePropertiesSnapshot props(options.statistics.get()); props.AssertEqual(1, // index block miss 0, 0, 0); } - { // Both index and data block get accessed. // It first cache index block then data block. 
But since the cache size @@ -1557,6 +1601,37 @@ TEST(BlockBasedTableTest, FilterBlockInBlockCache) { props.AssertEqual(2, 0, 0 + 1, // data block miss 0); } + iter.reset(); + + // -- PART 3: Open table with bloom filter enabled but not in SST file + table_options.block_cache = NewLRUCache(4096); + table_options.cache_index_and_filter_blocks = false; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + TableConstructor c3(BytewiseComparator()); + std::string user_key = "k01"; + InternalKey internal_key(user_key, 0, kTypeValue); + c3.Add(internal_key.Encode().ToString(), "hello"); + ImmutableCFOptions ioptions3(options); + // Generate table without filter policy + c3.Finish(options, ioptions3, table_options, + GetPlainInternalComparator(options.comparator), &keys, &kvmap); + // Open table with filter policy + table_options.filter_policy.reset(NewBloomFilterPolicy(1)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + options.statistics = CreateDBStatistics(); + ImmutableCFOptions ioptions4(options); + ASSERT_OK(c3.Reopen(ioptions4)); + reader = dynamic_cast(c3.GetTableReader()); + ASSERT_TRUE(!reader->TEST_filter_block_preloaded()); + std::string value; + GetContext get_context(options.comparator, nullptr, nullptr, nullptr, + GetContext::kNotFound, user_key, &value, + nullptr, nullptr); + ASSERT_OK(reader->Get(ReadOptions(), user_key, &get_context)); + ASSERT_EQ(value, "hello"); + BlockCachePropertiesSnapshot props(options.statistics.get()); + props.AssertFilterBlockStat(0, 0); } TEST(BlockBasedTableTest, BlockCacheLeak) { @@ -1584,7 +1659,8 @@ TEST(BlockBasedTableTest, BlockCacheLeak) { c.Add("k07", std::string(100000, 'x')); std::vector keys; KVMap kvmap; - c.Finish(opt, table_options, *ikc, &keys, &kvmap); + const ImmutableCFOptions ioptions(opt); + c.Finish(opt, ioptions, table_options, *ikc, &keys, &kvmap); unique_ptr iter(c.NewIterator()); iter->SeekToFirst(); @@ -1595,7 +1671,8 @@ TEST(BlockBasedTableTest, 
BlockCacheLeak) { } ASSERT_OK(iter->status()); - ASSERT_OK(c.Reopen(opt)); + const ImmutableCFOptions ioptions1(opt); + ASSERT_OK(c.Reopen(ioptions1)); auto table_reader = dynamic_cast(c.GetTableReader()); for (const std::string& key : keys) { ASSERT_TRUE(table_reader->TEST_KeyInCache(ReadOptions(), key)); @@ -1604,7 +1681,8 @@ TEST(BlockBasedTableTest, BlockCacheLeak) { // rerun with different block cache table_options.block_cache = NewLRUCache(16 * 1024 * 1024); opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); - ASSERT_OK(c.Reopen(opt)); + const ImmutableCFOptions ioptions2(opt); + ASSERT_OK(c.Reopen(ioptions2)); table_reader = dynamic_cast(c.GetTableReader()); for (const std::string& key : keys) { ASSERT_TRUE(!table_reader->TEST_KeyInCache(ReadOptions(), key)); @@ -1620,9 +1698,11 @@ TEST(PlainTableTest, BasicPlainTableProperties) { PlainTableFactory factory(plain_table_options); StringSink sink; Options options; + const ImmutableCFOptions ioptions(options); InternalKeyComparator ikc(options.comparator); std::unique_ptr builder( - factory.NewTableBuilder(options, ikc, &sink, kNoCompression)); + factory.NewTableBuilder(ioptions, ikc, &sink, kNoCompression, + CompressionOptions())); for (char c = 'a'; c <= 'z'; ++c) { std::string key(8, c); @@ -1665,7 +1745,9 @@ TEST(GeneralTableTest, ApproximateOffsetOfPlain) { options.compression = kNoCompression; BlockBasedTableOptions table_options; table_options.block_size = 1024; - c.Finish(options, table_options, internal_comparator, &keys, &kvmap); + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, internal_comparator, + &keys, &kvmap); ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); @@ -1695,7 +1777,8 @@ static void DoCompressionTest(CompressionType comp) { options.compression = comp; BlockBasedTableOptions table_options; table_options.block_size = 1024; - c.Finish(options, table_options, ikc, 
&keys, &kvmap); + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, ikc, &keys, &kvmap); ASSERT_TRUE(Between(c.ApproximateOffsetOf("abc"), 0, 0)); ASSERT_TRUE(Between(c.ApproximateOffsetOf("k01"), 0, 0)); @@ -1797,7 +1880,10 @@ TEST(MemTableTest, Simple) { auto table_factory = std::make_shared(); Options options; options.memtable_factory = table_factory; - MemTable* memtable = new MemTable(cmp, options); + ImmutableCFOptions ioptions(options); + WriteBuffer wb(options.db_write_buffer_size); + MemTable* memtable = new MemTable(cmp, ioptions, + MutableCFOptions(options, ioptions), &wb); memtable->Ref(); WriteBatch batch; WriteBatchInternal::SetSequence(&batch, 100); @@ -1805,10 +1891,11 @@ TEST(MemTableTest, Simple) { batch.Put(std::string("k2"), std::string("v2")); batch.Put(std::string("k3"), std::string("v3")); batch.Put(std::string("largekey"), std::string("vlarge")); - ColumnFamilyMemTablesDefault cf_mems_default(memtable, &options); + ColumnFamilyMemTablesDefault cf_mems_default(memtable); ASSERT_TRUE(WriteBatchInternal::InsertInto(&batch, &cf_mems_default).ok()); - Iterator* iter = memtable->NewIterator(ReadOptions()); + Arena arena; + ScopedArenaIterator iter(memtable->NewIterator(ReadOptions(), &arena)); iter->SeekToFirst(); while (iter->Valid()) { fprintf(stderr, "key: '%s' -> '%s'\n", @@ -1817,7 +1904,6 @@ TEST(MemTableTest, Simple) { iter->Next(); } - delete iter; delete memtable->Unref(); } @@ -1868,7 +1954,7 @@ TEST(Harness, FooterTests) { { // upconvert legacy block based std::string encoded; - Footer footer(kLegacyBlockBasedTableMagicNumber); + Footer footer(kLegacyBlockBasedTableMagicNumber, 0); BlockHandle meta_index(10, 5), index(20, 15); footer.set_metaindex_handle(meta_index); footer.set_index_handle(index); @@ -1882,11 +1968,12 @@ TEST(Harness, FooterTests) { ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); 
ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.version(), 0U); } { // xxhash block based std::string encoded; - Footer footer(kBlockBasedTableMagicNumber); + Footer footer(kBlockBasedTableMagicNumber, 1); BlockHandle meta_index(10, 5), index(20, 15); footer.set_metaindex_handle(meta_index); footer.set_index_handle(index); @@ -1901,11 +1988,12 @@ TEST(Harness, FooterTests) { ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.version(), 1U); } { // upconvert legacy plain table std::string encoded; - Footer footer(kLegacyPlainTableMagicNumber); + Footer footer(kLegacyPlainTableMagicNumber, 0); BlockHandle meta_index(10, 5), index(20, 15); footer.set_metaindex_handle(meta_index); footer.set_index_handle(index); @@ -1919,11 +2007,12 @@ TEST(Harness, FooterTests) { ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.version(), 0U); } { // xxhash block based std::string encoded; - Footer footer(kPlainTableMagicNumber); + Footer footer(kPlainTableMagicNumber, 1); BlockHandle meta_index(10, 5), index(20, 15); footer.set_metaindex_handle(meta_index); footer.set_index_handle(index); @@ -1938,6 +2027,26 @@ TEST(Harness, FooterTests) { ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.version(), 1U); + } + { + // version == 2 + std::string encoded; + Footer footer(kBlockBasedTableMagicNumber, 2); + BlockHandle meta_index(10, 5), index(20, 15); + footer.set_metaindex_handle(meta_index); + 
footer.set_index_handle(index); + footer.EncodeTo(&encoded); + Footer decoded_footer; + Slice encoded_slice(encoded); + decoded_footer.DecodeFrom(&encoded_slice); + ASSERT_EQ(decoded_footer.table_magic_number(), kBlockBasedTableMagicNumber); + ASSERT_EQ(decoded_footer.checksum(), kCRC32c); + ASSERT_EQ(decoded_footer.metaindex_handle().offset(), meta_index.offset()); + ASSERT_EQ(decoded_footer.metaindex_handle().size(), meta_index.size()); + ASSERT_EQ(decoded_footer.index_handle().offset(), index.offset()); + ASSERT_EQ(decoded_footer.index_handle().size(), index.size()); + ASSERT_EQ(decoded_footer.version(), 2U); } } diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h index d955dd763..030193597 100644 --- a/table/two_level_iterator.h +++ b/table/two_level_iterator.h @@ -19,8 +19,8 @@ class InternalKeyComparator; class Arena; struct TwoLevelIteratorState { - explicit TwoLevelIteratorState(bool check_prefix_may_match) - : check_prefix_may_match(check_prefix_may_match) {} + explicit TwoLevelIteratorState(bool _check_prefix_may_match) + : check_prefix_may_match(_check_prefix_may_match) {} virtual ~TwoLevelIteratorState() {} virtual Iterator* NewSecondaryIterator(const Slice& handle) = 0; diff --git a/tools/auto_sanity_test.sh b/tools/auto_sanity_test.sh index 2d63c0a85..138c855c0 100755 --- a/tools/auto_sanity_test.sh +++ b/tools/auto_sanity_test.sh @@ -37,6 +37,11 @@ echo "Running db sanity check with commits $commit_new and $commit_old." echo "=============================================================" echo "Making build $commit_new" +git checkout $commit_new +if [ $? 
-ne 0 ]; then + echo "[ERROR] Can't checkout $commit_new" + exit 1 +fi makestuff mv db_sanity_test new_db_sanity_test echo "Creating db based on the new commit --- $commit_new" @@ -44,6 +49,11 @@ echo "Creating db based on the new commit --- $commit_new" echo "=============================================================" echo "Making build $commit_old" +git checkout $commit_old +if [ $? -ne 0 ]; then + echo "[ERROR] Can't checkout $commit_old" + exit 1 +fi makestuff mv db_sanity_test old_db_sanity_test echo "Creating db based on the old commit --- $commit_old" diff --git a/tools/benchmark.sh b/tools/benchmark.sh new file mode 100755 index 000000000..135209384 --- /dev/null +++ b/tools/benchmark.sh @@ -0,0 +1,246 @@ +#!/bin/bash +# REQUIRE: db_bench binary exists in the current directory + +if [ $# -ne 1 ]; then + echo "./benchmark.sh [bulkload/fillseq/overwrite/filluniquerandom/readrandom/readwhilewriting]" + exit 0 +fi + +# size constants +K=1024 +M=$((1024 * K)) +G=$((1024 * M)) + +if [ -z $DB_DIR ]; then + echo "DB_DIR is not defined" + exit 0 +fi + +if [ -z $WAL_DIR ]; then + echo "WAL_DIR is not defined" + exit 0 +fi + +output_dir=${OUTPUT_DIR:-/tmp/} +if [ ! 
-d $output_dir ]; then + mkdir -p $output_dir +fi + +num_read_threads=${NUM_READ_THREADS:-16} +writes_per_second=${WRITES_PER_SEC:-$((80 * K))} # (only for readwhilewriting) +num_nexts_per_seek=${NUM_NEXTS_PER_SEEK:-10} # (only for rangescanwhilewriting) +cache_size=$((1 * G)) +duration=${DURATION:-0} + +num_keys=${NUM_KEYS:-$((1 * G))} +key_size=20 +value_size=800 + +const_params=" + --db=$DB_DIR \ + --wal_dir=$WAL_DIR \ + \ + --num_levels=6 \ + --key_size=$key_size \ + --value_size=$value_size \ + --block_size=4096 \ + --cache_size=$cache_size \ + --cache_numshardbits=6 \ + --compression_type=zlib \ + --min_level_to_compress=2 \ + --compression_ratio=0.5 \ + \ + --hard_rate_limit=2 \ + --rate_limit_delay_max_milliseconds=1000000 \ + --write_buffer_size=$((128 * M)) \ + --max_write_buffer_number=3 \ + --target_file_size_base=$((128 * M)) \ + --max_bytes_for_level_base=$((1 * G)) \ + \ + --verify_checksum=1 \ + --delete_obsolete_files_period_micros=$((60 * M)) \ + --max_grandparent_overlap_factor=10 \ + \ + --statistics=1 \ + --stats_per_interval=1 \ + --stats_interval=$((1 * M)) \ + --histogram=1 \ + \ + --memtablerep=skip_list \ + --bloom_bits=10 \ + --open_files=$((20 * K))" + +l0_config=" + --level0_file_num_compaction_trigger=4 \ + --level0_slowdown_writes_trigger=8 \ + --level0_stop_writes_trigger=12" + +if [ $duration -gt 0 ]; then + const_params="$const_params --duration=$duration" +fi + +params_r="$const_params $l0_config --max_background_compactions=4 --max_background_flushes=1" +params_w="$const_params $l0_config --max_background_compactions=16 --max_background_flushes=16" +params_bulkload="$const_params --max_background_compactions=16 --max_background_flushes=16 \ + --level0_file_num_compaction_trigger=$((10 * M)) \ + --level0_slowdown_writes_trigger=$((10 * M)) \ + --level0_stop_writes_trigger=$((10 * M))" + +function run_bulkload { + echo "Bulk loading $num_keys random keys into database..." 
+ cmd="./db_bench $params_bulkload --benchmarks=fillrandom \ + --use_existing_db=0 \ + --num=$num_keys \ + --disable_auto_compactions=1 \ + --sync=0 \ + --disable_data_sync=0 \ + --threads=1 2>&1 | tee $output_dir/benchmark_bulkload_fillrandom.log" + echo $cmd | tee $output_dir/benchmark_bulkload_fillrandom.log + eval $cmd + echo "Compacting..." + cmd="./db_bench $params_w --benchmarks=compact \ + --use_existing_db=1 \ + --num=$num_keys \ + --disable_auto_compactions=1 \ + --sync=0 \ + --disable_data_sync=0 \ + --threads=1 2>&1 | tee $output_dir/benchmark_bulkload_compact.log" + echo $cmd | tee $output_dir/benchmark_bulkload_compact.log + eval $cmd +} + +function run_fillseq { + echo "Loading $num_keys keys sequentially into database..." + cmd="./db_bench $params_w --benchmarks=fillseq \ + --use_existing_db=0 \ + --num=$num_keys \ + --sync=1 \ + --disable_data_sync=0 \ + --threads=1 2>&1 | tee $output_dir/benchmark_fillseq.log" + echo $cmd | tee $output_dir/benchmark_fillseq.log + eval $cmd +} + +function run_overwrite { + echo "Loading $num_keys keys sequentially into database..." + cmd="./db_bench $params_w --benchmarks=overwrite \ + --use_existing_db=1 \ + --num=$num_keys \ + --sync=1 \ + --disable_data_sync=0 \ + --threads=1 2>&1 | tee $output_dir/benchmark_overwrite.log" + echo $cmd | tee $output_dir/benchmark_overwrite.log + eval $cmd +} + +function run_filluniquerandom { + echo "Loading $num_keys unique keys randomly into database..." + cmd="./db_bench $params_w --benchmarks=filluniquerandom \ + --use_existing_db=0 \ + --num=$num_keys \ + --sync=1 \ + --disable_data_sync=0 \ + --threads=1 2>&1 | tee $output_dir/benchmark_filluniquerandom.log" + echo $cmd | tee $output_dir/benchmark_filluniquerandom.log + eval $cmd +} + +function run_readrandom { + echo "Reading $num_keys random keys from database..." 
+ cmd="./db_bench $params_r --benchmarks=readrandom \ + --use_existing_db=1 \ + --num=$num_keys \ + --threads=$num_read_threads \ + --disable_auto_compactions=1 \ + 2>&1 | tee $output_dir/benchmark_readrandom.log" + echo $cmd | tee $output_dir/benchmark_readrandom.log + eval $cmd +} + +function run_readwhilewriting { + echo "Reading $num_keys random keys from database whiling writing.." + cmd="./db_bench $params_r --benchmarks=readwhilewriting \ + --use_existing_db=1 \ + --num=$num_keys \ + --sync=1 \ + --disable_data_sync=0 \ + --threads=$num_read_threads \ + --writes_per_second=$writes_per_second \ + 2>&1 | tee $output_dir/benchmark_readwhilewriting.log" + echo $cmd | tee $output_dir/benchmark_readwhilewriting.log + eval $cmd +} + +function run_rangescanwhilewriting { + echo "Range scan $num_keys random keys from database whiling writing.." + cmd="./db_bench $params_r --benchmarks=seekrandomwhilewriting \ + --use_existing_db=1 \ + --num=$num_keys \ + --sync=1 \ + --disable_data_sync=0 \ + --threads=$num_read_threads \ + --writes_per_second=$writes_per_second \ + --seek_nexts=$num_nexts_per_seek \ + 2>&1 | tee $output_dir/benchmark_rangescanwhilewriting.log" + echo $cmd | tee $output_dir/benchmark_rangescanwhilewriting.log + eval $cmd +} + +function now() { + echo `date +"%s"` +} + +report="$output_dir/report.txt" + +echo "===== Benchmark =====" + +# Run!!! 
+IFS=',' read -a jobs <<< $1 +for job in ${jobs[@]}; do + + if [ $job != debug ]; then + echo "Start $job at `date`" | tee -a $report + fi + + start=$(now) + if [ $job = bulkload ]; then + run_bulkload + elif [ $job = fillseq ]; then + run_fillseq + elif [ $job = overwrite ]; then + run_overwrite + elif [ $job = filluniquerandom ]; then + run_filluniquerandom + elif [ $job = readrandom ]; then + run_readrandom + elif [ $job = readwhilewriting ]; then + run_readwhilewriting + elif [ $job = rangescanwhilewriting ]; then + run_rangescanwhilewriting + elif [ $job = debug ]; then + num_keys=10000; # debug + echo "Setting num_keys to $num_keys" + else + echo "unknown job $job" + exit + fi + end=$(now) + + if [ $job != debug ]; then + echo "Complete $job in $((end-start)) seconds" | tee -a $report + fi + + if [[ $job = readrandom || $job = readwhilewriting || $job == rangescanwhilewriting ]]; then + lat=$(grep "micros\/op" "$output_dir/benchmark_$job.log" | grep "ops\/sec" | awk '{print $3}') + qps=$(grep "micros\/op" "$output_dir/benchmark_$job.log" | grep "ops\/sec" | awk '{print $5}') + line=$(grep "rocksdb.db.get.micros" "$output_dir/benchmark_$job.log") + p50=$(echo $line | awk '{print $7}') + p99=$(echo $line | awk '{print $13}') + print_percentile=$(echo "$p50 != 0 || $p99 != 0" | bc); + if [ $print_percentile == "1" ]; then + echo "Read latency p50 = $p50 us, p99 = $p99 us" | tee -a $report + fi + echo "QPS = $qps ops/sec" | tee -a $report + echo "Avg Latency = $lat micros/op " | tee -a $report + fi +done diff --git a/tools/blob_store_bench.cc b/tools/blob_store_bench.cc deleted file mode 100644 index 60a0b84a6..000000000 --- a/tools/blob_store_bench.cc +++ /dev/null @@ -1,280 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#include -#include -#include - -#include "rocksdb/env.h" -#include "util/blob_store.h" -#include "util/testutil.h" - -#define KB 1024LL -#define MB 1024*1024LL -// BlobStore does costly asserts to make sure it's running correctly, which -// significantly impacts benchmark runtime. -// NDEBUG will compile out those asserts. -#ifndef NDEBUG -#define NDEBUG -#endif - -using namespace rocksdb; -using namespace std; - -// used by all threads -uint64_t timeout_sec; -Env *env; -BlobStore* bs; - -namespace { -std::string RandomString(Random* rnd, uint64_t len) { - std::string r; - test::RandomString(rnd, len, &r); - return r; -} -} // namespace - -struct Result { - uint32_t writes; - uint32_t reads; - uint32_t deletes; - uint64_t data_written; - uint64_t data_read; - - void print() { - printf("Total writes = %u\n", writes); - printf("Total reads = %u\n", reads); - printf("Total deletes = %u\n", deletes); - printf("Write throughput = %lf MB/s\n", - (double)data_written / (1024*1024.0) / timeout_sec); - printf("Read throughput = %lf MB/s\n", - (double)data_read / (1024*1024.0) / timeout_sec); - printf("Total throughput = %lf MB/s\n", - (double)(data_read + data_written) / (1024*1024.0) / timeout_sec); - } - - Result() { - writes = reads = deletes = data_read = data_written = 0; - } - - Result (uint32_t writes, uint32_t reads, uint32_t deletes, - uint64_t data_written, uint64_t data_read) : - writes(writes), reads(reads), deletes(deletes), - data_written(data_written), data_read(data_read) {} - -}; - -namespace { -Result operator + (const Result &a, const Result &b) { - return Result(a.writes + b.writes, a.reads + b.reads, - a.deletes + b.deletes, a.data_written + b.data_written, - a.data_read + b.data_read); -} -} // namespace - -struct WorkerThread { - uint64_t data_size_from, data_size_to; - double read_ratio; - uint64_t working_set_size; // start deleting once you reach 
this - Result result; - atomic stopped; - - WorkerThread(uint64_t data_size_from, uint64_t data_size_to, - double read_ratio, uint64_t working_set_size) : - data_size_from(data_size_from), data_size_to(data_size_to), - read_ratio(read_ratio), working_set_size(working_set_size), - stopped(false) {} - - WorkerThread(const WorkerThread& wt) : - data_size_from(wt.data_size_from), data_size_to(wt.data_size_to), - read_ratio(wt.read_ratio), working_set_size(wt.working_set_size), - stopped(false) {} -}; - -static void WorkerThreadBody(void* arg) { - WorkerThread* t = reinterpret_cast(arg); - Random rnd(5); - string buf; - vector> blobs; - vector random_strings; - - for (int i = 0; i < 10; ++i) { - random_strings.push_back(RandomString(&rnd, t->data_size_to)); - } - - uint64_t total_size = 0; - - uint64_t start_micros = env->NowMicros(); - while (env->NowMicros() - start_micros < timeout_sec * 1000 * 1000) { - if (blobs.size() && rand() < RAND_MAX * t->read_ratio) { - // read - int bi = rand() % blobs.size(); - Status s = bs->Get(blobs[bi].first, &buf); - assert(s.ok()); - t->result.data_read += buf.size(); - t->result.reads++; - } else { - // write - uint64_t size = rand() % (t->data_size_to - t->data_size_from) + - t->data_size_from; - total_size += size; - string put_str = random_strings[rand() % random_strings.size()]; - blobs.push_back(make_pair(Blob(), size)); - Status s = bs->Put(Slice(put_str.data(), size), &blobs.back().first); - assert(s.ok()); - t->result.data_written += size; - t->result.writes++; - } - - while (total_size >= t->working_set_size) { - // delete random - int bi = rand() % blobs.size(); - total_size -= blobs[bi].second; - bs->Delete(blobs[bi].first); - blobs.erase(blobs.begin() + bi); - t->result.deletes++; - } - } - t->stopped.store(true); -} - -namespace { -Result StartBenchmark(vector& config) { - for (auto w : config) { - env->StartThread(WorkerThreadBody, w); - } - - Result result; - - for (auto w : config) { - while (!w->stopped.load()); - 
result = result + w->result; - } - - for (auto w : config) { - delete w; - } - - delete bs; - - return result; -} - -vector SetupBenchmarkBalanced() { - string test_path; - env->GetTestDirectory(&test_path); - test_path.append("/blob_store"); - - // config start - uint32_t block_size = 16*KB; - uint32_t file_size = 1*MB; - double read_write_ratio = 0.5; - uint64_t data_read_from = 16*KB; - uint64_t data_read_to = 32*KB; - int number_of_threads = 10; - uint64_t working_set_size = 5*MB; - timeout_sec = 5; - // config end - - bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env); - - vector config; - - for (int i = 0; i < number_of_threads; ++i) { - config.push_back(new WorkerThread(data_read_from, - data_read_to, - read_write_ratio, - working_set_size)); - }; - - return config; -} - -vector SetupBenchmarkWriteHeavy() { - string test_path; - env->GetTestDirectory(&test_path); - test_path.append("/blob_store"); - - // config start - uint32_t block_size = 16*KB; - uint32_t file_size = 1*MB; - double read_write_ratio = 0.1; - uint64_t data_read_from = 16*KB; - uint64_t data_read_to = 32*KB; - int number_of_threads = 10; - uint64_t working_set_size = 5*MB; - timeout_sec = 5; - // config end - - bs = new BlobStore(test_path, block_size, file_size / block_size, 10000, env); - - vector config; - - for (int i = 0; i < number_of_threads; ++i) { - config.push_back(new WorkerThread(data_read_from, - data_read_to, - read_write_ratio, - working_set_size)); - }; - - return config; -} - -vector SetupBenchmarkReadHeavy() { - string test_path; - env->GetTestDirectory(&test_path); - test_path.append("/blob_store"); - - // config start - uint32_t block_size = 16*KB; - uint32_t file_size = 1*MB; - double read_write_ratio = 0.9; - uint64_t data_read_from = 16*KB; - uint64_t data_read_to = 32*KB; - int number_of_threads = 10; - uint64_t working_set_size = 5*MB; - timeout_sec = 5; - // config end - - bs = new BlobStore(test_path, block_size, file_size / block_size, 
10000, env); - - vector config; - - for (int i = 0; i < number_of_threads; ++i) { - config.push_back(new WorkerThread(data_read_from, - data_read_to, - read_write_ratio, - working_set_size)); - }; - - return config; -} -} // namespace - -int main(int argc, const char** argv) { - srand(33); - env = Env::Default(); - - { - printf("--- Balanced read/write benchmark ---\n"); - vector config = SetupBenchmarkBalanced(); - Result r = StartBenchmark(config); - r.print(); - } - { - printf("--- Write heavy benchmark ---\n"); - vector config = SetupBenchmarkWriteHeavy(); - Result r = StartBenchmark(config); - r.print(); - } - { - printf("--- Read heavy benchmark ---\n"); - vector config = SetupBenchmarkReadHeavy(); - Result r = StartBenchmark(config); - r.print(); - } - - return 0; -} diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 8d0b4f5f7..77bd6ef27 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -98,6 +98,7 @@ def main(argv): --filter_deletes=%s --memtablerep=prefix_hash --prefix_size=7 + --set_options_one_in=10000 """ % (ops_per_thread, threads, write_buf_size, diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc index 5970bb684..b745d7b37 100644 --- a/tools/db_repl_stress.cc +++ b/tools/db_repl_stress.cc @@ -3,6 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. +#ifndef ROCKSDB_LITE #ifndef GFLAGS #include int main() { @@ -12,13 +13,13 @@ int main() { #else #include +#include #include #include "db/write_batch_internal.h" #include "rocksdb/db.h" #include "rocksdb/types.h" -#include "port/atomic_pointer.h" #include "util/testutil.h" // Run a thread to perform Put's. 
@@ -58,7 +59,7 @@ static void DataPumpThreadBody(void* arg) { } struct ReplicationThread { - port::AtomicPointer stop; + std::atomic stop; DB* db; volatile size_t no_read; }; @@ -68,11 +69,11 @@ static void ReplicationThreadBody(void* arg) { DB* db = t->db; unique_ptr iter; SequenceNumber currentSeqNum = 1; - while (t->stop.Acquire_Load() != nullptr) { + while (!t->stop.load(std::memory_order_acquire)) { iter.reset(); Status s; while(!db->GetUpdatesSince(currentSeqNum, &iter).ok()) { - if (t->stop.Acquire_Load() == nullptr) { + if (t->stop.load(std::memory_order_acquire)) { return; } } @@ -129,11 +130,11 @@ int main(int argc, const char** argv) { ReplicationThread replThread; replThread.db = db; replThread.no_read = 0; - replThread.stop.Release_Store(env); // store something to make it non-null. + replThread.stop.store(false, std::memory_order_release); env->StartThread(ReplicationThreadBody, &replThread); while(replThread.no_read < FLAGS_num_inserts); - replThread.stop.Release_Store(nullptr); + replThread.stop.store(true, std::memory_order_release); if (replThread.no_read < dataPump.no_records) { // no. read should be => than inserted. fprintf(stderr, "No. 
of Record's written and read not same\nRead : %zu" @@ -145,3 +146,11 @@ int main(int argc, const char** argv) { } #endif // GFLAGS + +#else // ROCKSDB_LITE +#include +int main(int argc, char** argv) { + fprintf(stderr, "Not supported in lite mode.\n"); + return 1; +} +#endif // ROCKSDB_LITE diff --git a/tools/db_sanity_test.cc b/tools/db_sanity_test.cc index 4ae120c21..dee180c87 100644 --- a/tools/db_sanity_test.cc +++ b/tools/db_sanity_test.cc @@ -8,14 +8,17 @@ #include #include -#include "include/rocksdb/db.h" -#include "include/rocksdb/options.h" -#include "include/rocksdb/env.h" -#include "include/rocksdb/slice.h" -#include "include/rocksdb/status.h" -#include "include/rocksdb/comparator.h" -#include "include/rocksdb/table.h" -#include "include/rocksdb/slice_transform.h" +#include "rocksdb/db.h" +#include "rocksdb/options.h" +#include "rocksdb/env.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "rocksdb/comparator.h" +#include "rocksdb/table.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/filter_policy.h" +#include "port/port.h" +#include "util/string_util.h" namespace rocksdb { @@ -42,14 +45,14 @@ class SanityTest { return s; } for (int i = 0; i < 1000000; ++i) { - std::string k = "key" + std::to_string(i); - std::string v = "value" + std::to_string(i); + std::string k = "key" + ToString(i); + std::string v = "value" + ToString(i); s = db->Put(WriteOptions(), Slice(k), Slice(v)); if (!s.ok()) { return s; } } - return Status::OK(); + return db->Flush(FlushOptions()); } Status Verify() { DB* db; @@ -60,8 +63,8 @@ class SanityTest { return s; } for (int i = 0; i < 1000000; ++i) { - std::string k = "key" + std::to_string(i); - std::string v = "value" + std::to_string(i); + std::string k = "key" + ToString(i); + std::string v = "value" + ToString(i); std::string result; s = db->Get(ReadOptions(), Slice(k), &result); if (!s.ok()) { @@ -130,6 +133,49 @@ class SanityTestZlibCompression : public SanityTest { Options options_; }; +class 
SanityTestZlibCompressionVersion2 : public SanityTest { + public: + explicit SanityTestZlibCompressionVersion2(const std::string& path) + : SanityTest(path) { + options_.compression = kZlibCompression; + BlockBasedTableOptions table_options; + table_options.format_version = 2; + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + } + virtual Options GetOptions() const { return options_; } + virtual std::string Name() const { return "ZlibCompressionVersion2"; } + + private: + Options options_; +}; + +class SanityTestLZ4Compression : public SanityTest { + public: + explicit SanityTestLZ4Compression(const std::string& path) + : SanityTest(path) { + options_.compression = kLZ4Compression; + } + virtual Options GetOptions() const { return options_; } + virtual std::string Name() const { return "LZ4Compression"; } + + private: + Options options_; +}; + +class SanityTestLZ4HCCompression : public SanityTest { + public: + explicit SanityTestLZ4HCCompression(const std::string& path) + : SanityTest(path) { + options_.compression = kLZ4HCCompression; + } + virtual Options GetOptions() const { return options_; } + virtual std::string Name() const { return "LZ4HCCompression"; } + + private: + Options options_; +}; + +#ifndef ROCKSDB_LITE class SanityTestPlainTableFactory : public SanityTest { public: explicit SanityTestPlainTableFactory(const std::string& path) @@ -145,20 +191,42 @@ class SanityTestPlainTableFactory : public SanityTest { private: Options options_; }; +#endif // ROCKSDB_LITE + +class SanityTestBloomFilter : public SanityTest { + public: + explicit SanityTestBloomFilter(const std::string& path) : SanityTest(path) { + BlockBasedTableOptions table_options; + table_options.filter_policy.reset(NewBloomFilterPolicy(10)); + options_.table_factory.reset(NewBlockBasedTableFactory(table_options)); + } + ~SanityTestBloomFilter() {} + virtual Options GetOptions() const { return options_; } + virtual std::string Name() const { return "BloomFilter"; } + + 
private: + Options options_; +}; namespace { bool RunSanityTests(const std::string& command, const std::string& path) { std::vector sanity_tests = { - new SanityTestBasic(path), - new SanityTestSpecialComparator(path), + new SanityTestBasic(path), new SanityTestSpecialComparator(path), new SanityTestZlibCompression(path), - new SanityTestPlainTableFactory(path)}; + new SanityTestZlibCompressionVersion2(path), + new SanityTestLZ4Compression(path), + new SanityTestLZ4HCCompression(path), +#ifndef ROCKSDB_LITE + new SanityTestPlainTableFactory(path), +#endif // ROCKSDB_LITE + new SanityTestBloomFilter(path)}; if (command == "create") { fprintf(stderr, "Creating...\n"); } else { fprintf(stderr, "Verifying...\n"); } + bool result = true; for (auto sanity_test : sanity_tests) { Status s; fprintf(stderr, "%s -- ", sanity_test->Name().c_str()); @@ -171,12 +239,12 @@ bool RunSanityTests(const std::string& command, const std::string& path) { fprintf(stderr, "%s\n", s.ToString().c_str()); if (!s.ok()) { fprintf(stderr, "FAIL\n"); - return false; + result = false; } delete sanity_test; } - return true; + return result; } } // namespace diff --git a/tools/db_stress.cc b/tools/db_stress.cc index cffcb1c47..e33eeed73 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -28,9 +28,12 @@ int main() { } #else +#define __STDC_FORMAT_MACROS +#include #include #include #include +#include #include #include "db/db_impl.h" #include "db/version_set.h" @@ -51,6 +54,7 @@ int main() { #include "util/logging.h" #include "hdfs/env_hdfs.h" #include "utilities/merge_operators.h" +#include "util/string_util.h" using GFLAGS::ParseCommandLineFlags; using GFLAGS::RegisterFlagValidator; @@ -112,7 +116,11 @@ DEFINE_bool(verbose, false, "Verbose"); DEFINE_bool(progress_reports, true, "If true, db_stress will report number of finished operations"); -DEFINE_int32(write_buffer_size, rocksdb::Options().write_buffer_size, +DEFINE_uint64(db_write_buffer_size, rocksdb::Options().db_write_buffer_size, + 
"Number of bytes to buffer in all memtables before compacting"); + +DEFINE_int32(write_buffer_size, + static_cast(rocksdb::Options().write_buffer_size), "Number of bytes to buffer in memtable before compacting"); DEFINE_int32(max_write_buffer_number, @@ -153,7 +161,8 @@ DEFINE_int32(level0_stop_writes_trigger, rocksdb::Options().level0_stop_writes_trigger, "Number of files in level-0 that will trigger put stop."); -DEFINE_int32(block_size, rocksdb::BlockBasedTableOptions().block_size, +DEFINE_int32(block_size, + static_cast(rocksdb::BlockBasedTableOptions().block_size), "Number of bytes in a block."); DEFINE_int32(max_background_compactions, @@ -190,6 +199,12 @@ DEFINE_int32(clear_column_family_one_in, 1000000, "it again. If N == 0, never drop/create column families. " "When test_batches_snapshots is true, this flag has no effect"); +DEFINE_int32(set_options_one_in, 0, + "With a chance of 1/N, change some random options"); + +DEFINE_int32(set_in_place_one_in, 0, + "With a chance of 1/N, toggle in place support option"); + DEFINE_int64(cache_size, 2 * KB * KB * KB, "Number of bytes to use as a cache of uncompressed data."); @@ -208,6 +223,9 @@ static const bool FLAGS_reopen_dummy __attribute__((unused)) = DEFINE_int32(bloom_bits, 10, "Bloom filter bits per key. 
" "Negative means use default settings."); +DEFINE_bool(use_block_based_filter, false, "use block based filter" + "instead of full filter for block based table"); + DEFINE_string(db, "", "Use the db with the following name."); DEFINE_bool(verify_checksum, false, @@ -334,6 +352,8 @@ static const bool FLAGS_purge_redundant_percent_dummy __attribute__((unused)) = DEFINE_bool(filter_deletes, false, "On true, deletes use KeyMayExist to drop" " the delete if key not present"); +DEFINE_bool(in_place_update, false, "On true, does inplace update in memtable"); + enum RepFactory { kSkipList, kHashSkipList, @@ -368,7 +388,7 @@ static bool ValidatePrefixSize(const char* flagname, int32_t value) { return true; } DEFINE_int32(prefix_size, 7, "Control the prefix size for HashSkipListRep"); -static const bool FLAGS_prefix_size_dummy = +static const bool FLAGS_prefix_size_dummy __attribute__((unused)) = RegisterFlagValidator(&FLAGS_prefix_size, &ValidatePrefixSize); DEFINE_bool(use_merge, false, "On true, replaces all writes with a Merge " @@ -561,9 +581,9 @@ class SharedState { explicit SharedState(StressTest* stress_test) : cv_(&mu_), - seed_(FLAGS_seed), + seed_(static_cast(FLAGS_seed)), max_key_(FLAGS_max_key), - log2_keys_per_lock_(FLAGS_log2_keys_per_lock), + log2_keys_per_lock_(static_cast(FLAGS_log2_keys_per_lock)), num_threads_(FLAGS_threads), num_initialized_(0), num_populated_(0), @@ -739,11 +759,8 @@ struct ThreadState { SharedState* shared; Stats stats; - ThreadState(uint32_t index, SharedState *shared) - : tid(index), - rand(1000 + index + shared->GetSeed()), - shared(shared) { - } + ThreadState(uint32_t index, SharedState* _shared) + : tid(index), rand(1000 + index + _shared->GetSeed()), shared(_shared) {} }; } // namespace @@ -756,10 +773,12 @@ class StressTest { ? NewLRUCache(FLAGS_compressed_cache_size) : nullptr), filter_policy_(FLAGS_bloom_bits >= 0 - ? NewBloomFilterPolicy(FLAGS_bloom_bits) - : nullptr), + ? FLAGS_use_block_based_filter + ? 
NewBloomFilterPolicy(FLAGS_bloom_bits, true) + : NewBloomFilterPolicy(FLAGS_bloom_bits, false) + : nullptr), db_(nullptr), - new_column_family_name_(0), + new_column_family_name_(1), num_times_reopened_(0) { if (FLAGS_destroy_db_initially) { std::vector files; @@ -781,8 +800,129 @@ class StressTest { delete db_; } + bool BuildOptionsTable() { + if (FLAGS_set_options_one_in <= 0) { + return true; + } + options_table_ = { + {"write_buffer_size", + { + ToString(FLAGS_write_buffer_size), + ToString(FLAGS_write_buffer_size * 2), + ToString(FLAGS_write_buffer_size * 4) + } + }, + {"max_write_buffer_number", + { + ToString(FLAGS_max_write_buffer_number), + ToString(FLAGS_max_write_buffer_number * 2), + ToString(FLAGS_max_write_buffer_number * 4) + } + }, + {"arena_block_size", + { + ToString(Options().arena_block_size), + ToString(FLAGS_write_buffer_size / 4), + ToString(FLAGS_write_buffer_size / 8), + } + }, + {"memtable_prefix_bloom_bits", {"0", "8", "10"}}, + {"memtable_prefix_bloom_probes", {"4", "5", "6"}}, + {"memtable_prefix_bloom_huge_page_tlb_size", + { + "0", + ToString(2 * 1024 * 1024) + } + }, + {"max_successive_merges", {"0", "2", "4"}}, + {"filter_deletes", {"0", "1"}}, + {"inplace_update_num_locks", {"100", "200", "300"}}, + // TODO(ljin): enable test for this option + // {"disable_auto_compactions", {"100", "200", "300"}}, + {"soft_rate_limit", {"0", "0.5", "0.9"}}, + {"hard_rate_limit", {"0", "1.1", "2.0"}}, + {"level0_file_num_compaction_trigger", + { + ToString(FLAGS_level0_file_num_compaction_trigger), + ToString(FLAGS_level0_file_num_compaction_trigger + 2), + ToString(FLAGS_level0_file_num_compaction_trigger + 4), + } + }, + {"level0_slowdown_writes_trigger", + { + ToString(FLAGS_level0_slowdown_writes_trigger), + ToString(FLAGS_level0_slowdown_writes_trigger + 2), + ToString(FLAGS_level0_slowdown_writes_trigger + 4), + } + }, + {"level0_stop_writes_trigger", + { + ToString(FLAGS_level0_stop_writes_trigger), + 
ToString(FLAGS_level0_stop_writes_trigger + 2), + ToString(FLAGS_level0_stop_writes_trigger + 4), + } + }, + {"max_grandparent_overlap_factor", + { + ToString(Options().max_grandparent_overlap_factor - 5), + ToString(Options().max_grandparent_overlap_factor), + ToString(Options().max_grandparent_overlap_factor + 5), + } + }, + {"expanded_compaction_factor", + { + ToString(Options().expanded_compaction_factor - 5), + ToString(Options().expanded_compaction_factor), + ToString(Options().expanded_compaction_factor + 5), + } + }, + {"source_compaction_factor", + { + ToString(Options().source_compaction_factor), + ToString(Options().source_compaction_factor * 2), + ToString(Options().source_compaction_factor * 4), + } + }, + {"target_file_size_base", + { + ToString(FLAGS_target_file_size_base), + ToString(FLAGS_target_file_size_base * 2), + ToString(FLAGS_target_file_size_base * 4), + } + }, + {"target_file_size_multiplier", + { + ToString(FLAGS_target_file_size_multiplier), + "1", + "2", + } + }, + {"max_bytes_for_level_base", + { + ToString(FLAGS_max_bytes_for_level_base / 2), + ToString(FLAGS_max_bytes_for_level_base), + ToString(FLAGS_max_bytes_for_level_base * 2), + } + }, + {"max_bytes_for_level_multiplier", + { + ToString(FLAGS_max_bytes_for_level_multiplier), + "1", + "2", + } + }, + {"max_mem_compaction_level", {"0", "1", "2"}}, + {"max_sequential_skip_in_iterations", {"4", "8", "12"}}, + }; + for (const auto& iter : options_table_) { + options_index_.push_back(iter.first); + } + return true; + } + bool Run() { PrintEnv(); + BuildOptionsTable(); Open(); SharedState shared(this); uint32_t n = shared.GetNumThreads(); @@ -1163,6 +1303,33 @@ class StressTest { return s; } + Status SetOptions(ThreadState* thread) { + assert(FLAGS_set_options_one_in > 0); + std::unordered_map opts; + std::string name = options_index_[ + thread->rand.Next() % options_index_.size()]; + int value_idx = thread->rand.Next() % options_table_[name].size(); + if (name == "soft_rate_limit" || 
name == "hard_rate_limit") { + opts["soft_rate_limit"] = options_table_["soft_rate_limit"][value_idx]; + opts["hard_rate_limit"] = options_table_["hard_rate_limit"][value_idx]; + } else if (name == "level0_file_num_compaction_trigger" || + name == "level0_slowdown_writes_trigger" || + name == "level0_stop_writes_trigger") { + opts["level0_file_num_compaction_trigger"] = + options_table_["level0_file_num_compaction_trigger"][value_idx]; + opts["level0_slowdown_writes_trigger"] = + options_table_["level0_slowdown_writes_trigger"][value_idx]; + opts["level0_stop_writes_trigger"] = + options_table_["level0_stop_writes_trigger"][value_idx]; + } else { + opts[name] = options_table_[name][value_idx]; + } + + int rand_cf_idx = thread->rand.Next() % FLAGS_column_families; + auto cfh = column_families_[rand_cf_idx]; + return db_->SetOptions(cfh, opts); + } + void OperateDb(ThreadState* thread) { ReadOptions read_opts(FLAGS_verify_checksum, true); WriteOptions write_opts; @@ -1199,13 +1366,24 @@ class StressTest { } } + // Change Options + if (FLAGS_set_options_one_in > 0 && + thread->rand.OneIn(FLAGS_set_options_one_in)) { + SetOptions(thread); + } + + if (FLAGS_set_in_place_one_in > 0 && + thread->rand.OneIn(FLAGS_set_in_place_one_in)) { + options_.inplace_update_support ^= options_.inplace_update_support; + } + if (!FLAGS_test_batches_snapshots && FLAGS_clear_column_family_one_in != 0 && FLAGS_column_families > 1) { if (thread->rand.OneIn(FLAGS_clear_column_family_one_in)) { // drop column family and then create it again (can't drop default) int cf = thread->rand.Next() % (FLAGS_column_families - 1) + 1; std::string new_name = - std::to_string(new_column_family_name_.fetch_add(1)); + ToString(new_column_family_name_.fetch_add(1)); { MutexLock l(thread->shared->GetMutex()); fprintf( @@ -1217,12 +1395,20 @@ class StressTest { Status s __attribute__((unused)); s = db_->DropColumnFamily(column_families_[cf]); delete column_families_[cf]; - assert(s.ok()); + if (!s.ok()) { + 
fprintf(stderr, "dropping column family error: %s\n", + s.ToString().c_str()); + std::terminate(); + } s = db_->CreateColumnFamily(ColumnFamilyOptions(options_), new_name, &column_families_[cf]); column_family_names_[cf] = new_name; thread->shared->ClearColumnFamily(cf); - assert(s.ok()); + if (!s.ok()) { + fprintf(stderr, "creating column family error: %s\n", + s.ToString().c_str()); + std::terminate(); + } thread->shared->UnlockColumnFamily(cf); } } @@ -1273,7 +1459,7 @@ class StressTest { assert(count <= (static_cast(1) << ((8 - FLAGS_prefix_size) * 8))); if (iter->status().ok()) { - thread->stats.AddPrefixes(1, count); + thread->stats.AddPrefixes(1, static_cast(count)); } else { thread->stats.AddErrors(1); } @@ -1297,22 +1483,32 @@ class StressTest { } } thread->shared->Put(rand_column_family, rand_key, value_base); + Status s; if (FLAGS_use_merge) { - db_->Merge(write_opts, column_family, key, v); + s = db_->Merge(write_opts, column_family, key, v); } else { - db_->Put(write_opts, column_family, key, v); + s = db_->Put(write_opts, column_family, key, v); + } + if (!s.ok()) { + fprintf(stderr, "put or merge error: %s\n", s.ToString().c_str()); + std::terminate(); } thread->stats.AddBytesForWrites(1, sz); } else { MultiPut(thread, write_opts, column_family, key, v, sz); } - PrintKeyValue(rand_column_family, rand_key, value, sz); + PrintKeyValue(rand_column_family, static_cast(rand_key), + value, sz); } else if (writeBound <= prob_op && prob_op < delBound) { // OPERATION delete if (!FLAGS_test_batches_snapshots) { thread->shared->Delete(rand_column_family, rand_key); - db_->Delete(write_opts, column_family, key); + Status s = db_->Delete(write_opts, column_family, key); thread->stats.AddDeletes(1); + if (!s.ok()) { + fprintf(stderr, "delete error: %s\n", s.ToString().c_str()); + std::terminate(); + } } else { MultiDelete(thread, write_opts, column_family, key); } @@ -1366,16 +1562,19 @@ class StressTest { from_db = iter->value().ToString(); iter->Next(); } else 
if (iter->key().compare(k) < 0) { - VerificationAbort(shared, "An out of range key was found", cf, i); + VerificationAbort(shared, "An out of range key was found", + static_cast(cf), i); } } else { // The iterator found no value for the key in question, so do not // move to the next item in the iterator s = Status::NotFound(Slice()); } - VerifyValue(cf, i, options, shared, from_db, s, true); + VerifyValue(static_cast(cf), i, options, shared, from_db, s, + true); if (from_db.length()) { - PrintKeyValue(cf, i, from_db.data(), from_db.length()); + PrintKeyValue(static_cast(cf), static_cast(i), + from_db.data(), from_db.length()); } } } else { @@ -1388,9 +1587,11 @@ class StressTest { std::string keystr = Key(i); Slice k = keystr; Status s = db_->Get(options, column_families_[cf], k, &from_db); - VerifyValue(cf, i, options, shared, from_db, s, true); + VerifyValue(static_cast(cf), i, options, shared, from_db, s, + true); if (from_db.length()) { - PrintKeyValue(cf, i, from_db.data(), from_db.length()); + PrintKeyValue(static_cast(cf), static_cast(i), + from_db.data(), from_db.length()); } } } @@ -1486,6 +1687,8 @@ class StressTest { fprintf(stdout, "Write percentage : %d%%\n", FLAGS_writepercent); fprintf(stdout, "Delete percentage : %d%%\n", FLAGS_delpercent); fprintf(stdout, "Iterate percentage : %d%%\n", FLAGS_iterpercent); + fprintf(stdout, "DB-write-buffer-size: %" PRIu64 "\n", + FLAGS_db_write_buffer_size); fprintf(stdout, "Write-buffer-size : %d\n", FLAGS_write_buffer_size); fprintf(stdout, "Iterations : %lu\n", @@ -1502,6 +1705,8 @@ class StressTest { FLAGS_purge_redundant_percent); fprintf(stdout, "Deletes use filter : %d\n", FLAGS_filter_deletes); + fprintf(stdout, "Do update in place : %d\n", + FLAGS_in_place_update); fprintf(stdout, "Num keys per lock : %d\n", 1 << FLAGS_log2_keys_per_lock); @@ -1552,9 +1757,11 @@ class StressTest { block_based_options.block_cache = cache_; block_based_options.block_cache_compressed = compressed_cache_; 
block_based_options.block_size = FLAGS_block_size; + block_based_options.format_version = 2; block_based_options.filter_policy = filter_policy_; options_.table_factory.reset( NewBlockBasedTableFactory(block_based_options)); + options_.db_write_buffer_size = FLAGS_db_write_buffer_size; options_.write_buffer_size = FLAGS_write_buffer_size; options_.max_write_buffer_number = FLAGS_max_write_buffer_number; options_.min_write_buffer_number_to_merge = @@ -1585,22 +1792,31 @@ class StressTest { options_.create_if_missing = true; options_.max_manifest_file_size = 10 * 1024; options_.filter_deletes = FLAGS_filter_deletes; + options_.inplace_update_support = FLAGS_in_place_update; if ((FLAGS_prefix_size == 0) == (FLAGS_rep_factory == kHashSkipList)) { fprintf(stderr, "prefix_size should be non-zero iff memtablerep == prefix_hash\n"); exit(1); } switch (FLAGS_rep_factory) { - case kHashSkipList: - options_.memtable_factory.reset(NewHashSkipListRepFactory(10000)); - break; case kSkipList: // no need to do anything break; +#ifndef ROCKSDB_LITE + case kHashSkipList: + options_.memtable_factory.reset(NewHashSkipListRepFactory(10000)); + break; case kVectorRep: options_.memtable_factory.reset(new VectorRepFactory()); break; +#else + default: + fprintf(stderr, + "RocksdbLite only supports skip list mem table. 
Skip " + "--rep_factory\n"); +#endif // ROCKSDB_LITE } + static Random purge_percent(1000); // no benefit from non-determinism here if (static_cast(purge_percent.Uniform(100)) < FLAGS_purge_redundant_percent - 1) { @@ -1675,7 +1891,7 @@ class StressTest { cf_descriptors.emplace_back(name, ColumnFamilyOptions(options_)); } while (cf_descriptors.size() < (size_t)FLAGS_column_families) { - std::string name = std::to_string(new_column_family_name_.load()); + std::string name = ToString(new_column_family_name_.load()); new_column_family_name_++; cf_descriptors.emplace_back(name, ColumnFamilyOptions(options_)); column_family_names_.push_back(name); @@ -1686,9 +1902,14 @@ class StressTest { assert(!s.ok() || column_families_.size() == static_cast(FLAGS_column_families)); } else { +#ifndef ROCKSDB_LITE DBWithTTL* db_with_ttl; s = DBWithTTL::Open(options_, FLAGS_db, &db_with_ttl, FLAGS_ttl); db_ = db_with_ttl; +#else + fprintf(stderr, "TTL is not supported in RocksDBLite\n"); + exit(1); +#endif } if (!s.ok()) { fprintf(stderr, "open error: %s\n", s.ToString().c_str()); @@ -1728,6 +1949,8 @@ class StressTest { std::vector column_family_names_; std::atomic new_column_family_name_; int num_times_reopened_; + std::unordered_map> options_table_; + std::vector options_index_; }; } // namespace rocksdb diff --git a/tools/ldb.cc b/tools/ldb.cc index 4581b8011..cb5ef5204 100644 --- a/tools/ldb.cc +++ b/tools/ldb.cc @@ -3,6 +3,7 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
// +#ifndef ROCKSDB_LITE #include "rocksdb/ldb_tool.h" @@ -11,3 +12,10 @@ int main(int argc, char** argv) { tool.Run(argc, argv); return 0; } +#else +#include +int main(int argc, char** argv) { + fprintf(stderr, "Not supported in lite mode.\n"); + return 1; +} +#endif // ROCKSDB_LITE diff --git a/tools/ldb_test.py b/tools/ldb_test.py index b4ef5221f..f248f88cd 100644 --- a/tools/ldb_test.py +++ b/tools/ldb_test.py @@ -378,6 +378,22 @@ class LDBTestCase(unittest.TestCase): my_check_output("rm -f %s" % sstFilePath, shell=True) self.assertRunFAIL("checkconsistency") + def dumpLiveFiles(self, params, dumpFile): + return 0 == run_err_null("./ldb dump_live_files %s > %s" % ( + params, dumpFile)) + + def testDumpLiveFiles(self): + print "Running testDumpLiveFiles..." + + dbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + self.assertRunOK("put x1 y1 --create_if_missing", "OK") + self.assertRunOK("put x2 y2", "OK") + dumpFilePath = os.path.join(self.TMP_DIR, "dump1") + self.assertTrue(self.dumpLiveFiles("--db=%s" % dbPath, dumpFilePath)) + self.assertRunOK("delete x1", "OK") + self.assertRunOK("put x3 y3", "OK") + dumpFilePath = os.path.join(self.TMP_DIR, "dump2") + self.assertTrue(self.dumpLiveFiles("--db=%s" % dbPath, dumpFilePath)) if __name__ == "__main__": unittest.main() diff --git a/tools/rdb/.gitignore b/tools/rdb/.gitignore new file mode 100644 index 000000000..378eac25d --- /dev/null +++ b/tools/rdb/.gitignore @@ -0,0 +1 @@ +build diff --git a/tools/rdb/API.md b/tools/rdb/API.md new file mode 100644 index 000000000..f25949706 --- /dev/null +++ b/tools/rdb/API.md @@ -0,0 +1,178 @@ +# JavaScript API + +## DBWrapper + +### Constructor + + # Creates a new database wrapper object + RDB() + +### Open + + # Open a new or existing RocksDB database. + # + # db_name (string) - Location of the database (inside the + # `/tmp` directory). + # column_families (string[]) - Names of additional column families + # beyond the default. 
If there are no other + # column families, this argument can be + # left off. + # + # Returns true if the database was opened successfully, or false otherwise + db_obj.(db_name, column_families = []) + +### Get + + # Get the value of a given key. + # + # key (string) - Which key to get the value of. + # column_family (string) - Which column family to check for the key. + # This argument can be left off for the default + # column family + # + # Returns the value (string) that is associated with the given key if + # one exists, or null otherwise. + db_obj.get(key, column_family = { default }) + +### Put + + # Associate a value with a key. + # + # key (string) - Which key to associate the value with. + # value (string) - The value to associate with the key. + # column_family (string) - Which column family to put the key-value pair + # in. This argument can be left off for the + # default column family. + # + # Returns true if the key-value pair was successfully stored in the + # database, or false otherwise. + db_obj.put(key, value, column_family = { default }) + +### Delete + + # Delete a value associated with a given key. + # + # key (string) - Which key to delete the value of.. + # column_family (string) - Which column family to check for the key. + # This argument can be left off for the default + # column family + # + # Returns true if an error occured while trying to delete the key in + # the database, or false otherwise. Note that this is NOT the same as + # whether a value was deleted; in the case of a specified key not having + # a value, this will still return true. Use the `get` method prior to + # this method to check if a value existed before the call to `delete`. + db_obj.delete(key, column_family = { default }) + +### Dump + + # Print out all the key-value pairs in a given column family of the + # database. + # + # column_family (string) - Which column family to dump the pairs from. + # This argument can be left off for the default + # column family. 
+ # + # Returns true if the keys were successfully read from the database, or + # false otherwise. + db_obj.dump(column_family = { default }) + +### WriteBatch + + # Execute an atomic batch of writes (i.e. puts and deletes) to the + # database. + # + # cf_batches (BatchObject[]; see below) - Put and Delete writes grouped + # by column family to execute + # atomically. + # + # Returns true if the argument array was well-formed and was + # successfully written to the database, or false otherwise. + db_obj.writeBatch(cf_batches) + +### CreateColumnFamily + + # Create a new column familiy for the database. + # + # column_family_name (string) - Name of the new column family. + # + # Returns true if the new column family was successfully created, or + # false otherwise. + db_obj.createColumnFamily(column_family_name) + +### CompactRange + + # Compact the underlying storage for a given range. + # + # In addition to the endpoints of the range, the method is overloaded to + # accept a non-default column family, a set of options, or both. + # + # begin (string) - First key in the range to compact. + # end (string) - Last key in the range to compact. + # options (object) - Contains a subset of the following key-value + # pairs: + # * 'target_level' => int + # * 'target_path_id' => int + # column_family (string) - Which column family to compact the range in. + db_obj.compactRange(begin, end) + db_obj.compactRange(begin, end, options) + db_obj.compactRange(begin, end, column_family) + db_obj.compactRange(begin, end, options, column_family) + + + +### Close + + # Close an a database and free the memory associated with it. + # + # Return null. 
+ # db_obj.close() + + +## BatchObject + +### Structure + +A BatchObject must have at least one of the following key-value pairs: + +* 'put' => Array of ['string1', 'string1'] pairs, each of which signifies that +the key 'string1' should be associated with the value 'string2' +* 'delete' => Array of strings, each of which is a key whose value should be +deleted. + +The following key-value pair is optional: + +* 'column_family' => The name (string) of the column family to apply the +changes to. + +### Examples + + # Writes the key-value pairs 'firstname' => 'Saghm' and + # 'lastname' => 'Rossi' atomically to the database. + db_obj.writeBatch([ + { + put: [ ['firstname', 'Saghm'], ['lastname', 'Rossi'] ] + } + ]); + + + # Deletes the values associated with 'firstname' and 'lastname' in + # the default column family and adds the key 'number_of_people' with + # with the value '2'. Additionally, adds the key-value pair + # 'name' => 'Saghm Rossi' to the column family 'user1' and the pair + # 'name' => 'Matt Blaze' to the column family 'user2'. All writes + # are done atomically. + db_obj.writeBatch([ + { + put: [ ['number_of_people', '2'] ], + delete: ['firstname', 'lastname'] + }, + { + put: [ ['name', 'Saghm Rossi'] ], + column_family: 'user1' + }, + { + put: [ ['name', Matt Blaze'] ], + column_family: 'user2' + } + ]); diff --git a/tools/rdb/README.md b/tools/rdb/README.md new file mode 100644 index 000000000..2cc9acad2 --- /dev/null +++ b/tools/rdb/README.md @@ -0,0 +1,40 @@ +# RDB - RocksDB Shell + +RDB is a NodeJS-based shell interface to RocksDB. It can also be used as a +JavaScript binding for RocksDB within a Node application. + +## Setup/Compilation + +### Requirements + +* static RocksDB library (i.e. librocksdb.a) +* libsnappy +* node (tested onv0.10.33, no guarantees on anything else!) 
+* node-gyp +* python2 (for node-gyp; tested with 2.7.8) + +### Installation + +NOTE: If your default `python` binary is not a version of python2, add +the arguments `--python /path/to/python2` to the the `node-gyp` commands. + +1. Make sure you have the static library (i.e. "librocksdb.a") in the root +directory of your rocksdb installation. If not, `cd` there and run +`make static_lib`. + +2. Run `node-gyp configure` to generate the build. + +3. Run `node-gyp build` to compile RDB. + +## Usage + +### Running the shell + +Assuming everything compiled correctly, you can run the `rdb` executable +located in the root of the `tools/rdb` directory to start the shell. The file is +just a shell script that runs the node shell and loads the constructor for the +RDB object into the top-level function `RDB`. + +### JavaScript API + +See `API.md` for how to use RocksDB from the shell. diff --git a/tools/rdb/binding.gyp b/tools/rdb/binding.gyp new file mode 100644 index 000000000..89145541c --- /dev/null +++ b/tools/rdb/binding.gyp @@ -0,0 +1,25 @@ +{ + "targets": [ + { + "target_name": "rdb", + "sources": [ + "rdb.cc", + "db_wrapper.cc", + "db_wrapper.h" + ], + "cflags_cc!": [ + "-fno-exceptions" + ], + "cflags_cc+": [ + "-std=c++11", + ], + "include_dirs+": [ + "../../include" + ], + "libraries": [ + "../../../librocksdb.a", + "-lsnappy" + ], + } + ] +} diff --git a/tools/rdb/db_wrapper.cc b/tools/rdb/db_wrapper.cc new file mode 100644 index 000000000..34725379d --- /dev/null +++ b/tools/rdb/db_wrapper.cc @@ -0,0 +1,525 @@ +#include +#include +#include +#include +#include + +#include "db_wrapper.h" +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/options.h" + +namespace { + void printWithBackSlashes(std::string str) { + for (std::string::size_type i = 0; i < str.size(); i++) { + if (str[i] == '\\' || str[i] == '"') { + std::cout << "\\"; + } + + std::cout << str[i]; + } + } + + bool has_key_for_array(Local obj, std::string key) { + return 
obj->Has(String::NewSymbol(key.c_str())) && + obj->Get(String::NewSymbol(key.c_str()))->IsArray(); + } +} + +using namespace v8; + + +Persistent DBWrapper::constructor; + +DBWrapper::DBWrapper() { + options_.IncreaseParallelism(); + options_.OptimizeLevelStyleCompaction(); + options_.disable_auto_compactions = true; + options_.create_if_missing = true; +} + +DBWrapper::~DBWrapper() { + delete db_; +} + +bool DBWrapper::HasFamilyNamed(std::string& name, DBWrapper* db) { + return db->columnFamilies_.find(name) != db->columnFamilies_.end(); +} + + +void DBWrapper::Init(Handle exports) { + Local tpl = FunctionTemplate::New(New); + tpl->SetClassName(String::NewSymbol("DBWrapper")); + tpl->InstanceTemplate()->SetInternalFieldCount(8); + tpl->PrototypeTemplate()->Set(String::NewSymbol("open"), + FunctionTemplate::New(Open)->GetFunction()); + tpl->PrototypeTemplate()->Set(String::NewSymbol("get"), + FunctionTemplate::New(Get)->GetFunction()); + tpl->PrototypeTemplate()->Set(String::NewSymbol("put"), + FunctionTemplate::New(Put)->GetFunction()); + tpl->PrototypeTemplate()->Set(String::NewSymbol("delete"), + FunctionTemplate::New(Delete)->GetFunction()); + tpl->PrototypeTemplate()->Set(String::NewSymbol("dump"), + FunctionTemplate::New(Dump)->GetFunction()); + tpl->PrototypeTemplate()->Set(String::NewSymbol("createColumnFamily"), + FunctionTemplate::New(CreateColumnFamily)->GetFunction()); + tpl->PrototypeTemplate()->Set(String::NewSymbol("writeBatch"), + FunctionTemplate::New(WriteBatch)->GetFunction()); + tpl->PrototypeTemplate()->Set(String::NewSymbol("compactRange"), + FunctionTemplate::New(CompactRange)->GetFunction()); + + constructor = Persistent::New(tpl->GetFunction()); + exports->Set(String::NewSymbol("DBWrapper"), constructor); +} + +Handle DBWrapper::Open(const Arguments& args) { + HandleScope scope; + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); + + if (!(args[0]->IsString() && + (args[1]->IsUndefined() || args[1]->IsArray()))) { + return 
scope.Close(Boolean::New(false)); + } + + std::string db_file = *v8::String::Utf8Value(args[0]->ToString()); + + std::vector cfs = { rocksdb::kDefaultColumnFamilyName }; + + if (!args[1]->IsUndefined()) { + Handle array = Handle::Cast(args[1]); + for (uint i = 0; i < array->Length(); i++) { + if (!array->Get(i)->IsString()) { + return scope.Close(Boolean::New(false)); + } + + cfs.push_back(*v8::String::Utf8Value(array->Get(i)->ToString())); + } + } + + if (cfs.size() == 1) { + db_wrapper->status_ = rocksdb::DB::Open( + db_wrapper->options_, db_file, &db_wrapper->db_); + + return scope.Close(Boolean::New(db_wrapper->status_.ok())); + } + + std::vector families; + + for (std::vector::size_type i = 0; i < cfs.size(); i++) { + families.push_back(rocksdb::ColumnFamilyDescriptor( + cfs[i], rocksdb::ColumnFamilyOptions())); + } + + std::vector handles; + db_wrapper->status_ = rocksdb::DB::Open( + db_wrapper->options_, db_file, families, &handles, &db_wrapper->db_); + + if (!db_wrapper->status_.ok()) { + return scope.Close(Boolean::New(db_wrapper->status_.ok())); + } + + for (std::vector::size_type i = 0; i < handles.size(); i++) { + db_wrapper->columnFamilies_[cfs[i]] = handles[i]; + } + + return scope.Close(Boolean::New(true)); +} + + +Handle DBWrapper::New(const Arguments& args) { + HandleScope scope; + Handle to_return; + + if (args.IsConstructCall()) { + DBWrapper* db_wrapper = new DBWrapper(); + db_wrapper->Wrap(args.This()); + + return args.This(); + } + + const int argc = 0; + Local argv[0] = {}; + + return scope.Close(constructor->NewInstance(argc, argv)); +} + +Handle DBWrapper::Get(const Arguments& args) { + HandleScope scope; + + if (!(args[0]->IsString() && + (args[1]->IsUndefined() || args[1]->IsString()))) { + return scope.Close(Null()); + } + + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); + std::string key = *v8::String::Utf8Value(args[0]->ToString()); + std::string cf = *v8::String::Utf8Value(args[1]->ToString()); + std::string value; + + if 
(args[1]->IsUndefined()) { + db_wrapper->status_ = db_wrapper->db_->Get( + rocksdb::ReadOptions(), key, &value); + } else if (db_wrapper->HasFamilyNamed(cf, db_wrapper)) { + db_wrapper->status_ = db_wrapper->db_->Get( + rocksdb::ReadOptions(), db_wrapper->columnFamilies_[cf], key, &value); + } else { + return scope.Close(Null()); + } + + Handle v = db_wrapper->status_.ok() ? + String::NewSymbol(value.c_str()) : Null(); + + return scope.Close(v); +} + +Handle DBWrapper::Put(const Arguments& args) { + HandleScope scope; + + if (!(args[0]->IsString() && args[1]->IsString() && + (args[2]->IsUndefined() || args[2]->IsString()))) { + return scope.Close(Boolean::New(false)); + } + + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); + std::string key = *v8::String::Utf8Value(args[0]->ToString()); + std::string value = *v8::String::Utf8Value(args[1]->ToString()); + std::string cf = *v8::String::Utf8Value(args[2]->ToString()); + + if (args[2]->IsUndefined()) { + db_wrapper->status_ = db_wrapper->db_->Put( + rocksdb::WriteOptions(), key, value + ); + } else if (db_wrapper->HasFamilyNamed(cf, db_wrapper)) { + db_wrapper->status_ = db_wrapper->db_->Put( + rocksdb::WriteOptions(), + db_wrapper->columnFamilies_[cf], + key, + value + ); + } else { + return scope.Close(Boolean::New(false)); + } + + + return scope.Close(Boolean::New(db_wrapper->status_.ok())); +} + +Handle DBWrapper::Delete(const Arguments& args) { + HandleScope scope; + + if (!args[0]->IsString()) { + return scope.Close(Boolean::New(false)); + } + + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); + std::string arg0 = *v8::String::Utf8Value(args[0]->ToString()); + std::string arg1 = *v8::String::Utf8Value(args[1]->ToString()); + + if (args[1]->IsUndefined()) { + db_wrapper->status_ = db_wrapper->db_->Delete( + rocksdb::WriteOptions(), arg0); + } else { + if (!db_wrapper->HasFamilyNamed(arg1, db_wrapper)) { + return scope.Close(Boolean::New(false)); + } + db_wrapper->status_ = db_wrapper->db_->Delete( 
+ rocksdb::WriteOptions(), db_wrapper->columnFamilies_[arg1], arg0); + } + + return scope.Close(Boolean::New(db_wrapper->status_.ok())); +} + +Handle DBWrapper::Dump(const Arguments& args) { + HandleScope scope; + std::unique_ptr iterator; + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); + std::string arg0 = *v8::String::Utf8Value(args[0]->ToString()); + + if (args[0]->IsUndefined()) { + iterator.reset(db_wrapper->db_->NewIterator(rocksdb::ReadOptions())); + } else { + if (!db_wrapper->HasFamilyNamed(arg0, db_wrapper)) { + return scope.Close(Boolean::New(false)); + } + + iterator.reset(db_wrapper->db_->NewIterator( + rocksdb::ReadOptions(), db_wrapper->columnFamilies_[arg0])); + } + + iterator->SeekToFirst(); + + while (iterator->Valid()) { + std::cout << "\""; + printWithBackSlashes(iterator->key().ToString()); + std::cout << "\" => \""; + printWithBackSlashes(iterator->value().ToString()); + std::cout << "\"\n"; + iterator->Next(); + } + + return scope.Close(Boolean::New(true)); +} + +Handle DBWrapper::CreateColumnFamily(const Arguments& args) { + HandleScope scope; + + if (!args[0]->IsString()) { + return scope.Close(Boolean::New(false)); + } + + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); + std::string cf_name = *v8::String::Utf8Value(args[0]->ToString()); + + if (db_wrapper->HasFamilyNamed(cf_name, db_wrapper)) { + return scope.Close(Boolean::New(false)); + } + + rocksdb::ColumnFamilyHandle* cf; + db_wrapper->status_ = db_wrapper->db_->CreateColumnFamily( + rocksdb::ColumnFamilyOptions(), cf_name, &cf); + + if (!db_wrapper->status_.ok()) { + return scope.Close(Boolean::New(false)); + } + + db_wrapper->columnFamilies_[cf_name] = cf; + + return scope.Close(Boolean::New(true)); +} + +bool DBWrapper::AddToBatch(rocksdb::WriteBatch& batch, bool del, + Handle array) { + Handle put_pair; + for (uint i = 0; i < array->Length(); i++) { + if (del) { + if (!array->Get(i)->IsString()) { + return false; + } + + 
batch.Delete(*v8::String::Utf8Value(array->Get(i)->ToString())); + continue; + } + + if (!array->Get(i)->IsArray()) { + return false; + } + + put_pair = Handle::Cast(array->Get(i)); + + if (!put_pair->Get(0)->IsString() || !put_pair->Get(1)->IsString()) { + return false; + } + + batch.Put( + *v8::String::Utf8Value(put_pair->Get(0)->ToString()), + *v8::String::Utf8Value(put_pair->Get(1)->ToString())); + } + + return true; +} + +bool DBWrapper::AddToBatch(rocksdb::WriteBatch& batch, bool del, + Handle array, DBWrapper* db_wrapper, + std::string cf) { + Handle put_pair; + for (uint i = 0; i < array->Length(); i++) { + if (del) { + if (!array->Get(i)->IsString()) { + return false; + } + + batch.Delete( + db_wrapper->columnFamilies_[cf], + *v8::String::Utf8Value(array->Get(i)->ToString())); + continue; + } + + if (!array->Get(i)->IsArray()) { + return false; + } + + put_pair = Handle::Cast(array->Get(i)); + + if (!put_pair->Get(0)->IsString() || !put_pair->Get(1)->IsString()) { + return false; + } + + batch.Put( + db_wrapper->columnFamilies_[cf], + *v8::String::Utf8Value(put_pair->Get(0)->ToString()), + *v8::String::Utf8Value(put_pair->Get(1)->ToString())); + } + + return true; +} + +Handle DBWrapper::WriteBatch(const Arguments& args) { + HandleScope scope; + + if (!args[0]->IsArray()) { + return scope.Close(Boolean::New(false)); + } + + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); + Handle sub_batches = Handle::Cast(args[0]); + Local sub_batch; + rocksdb::WriteBatch batch; + bool well_formed; + + for (uint i = 0; i < sub_batches->Length(); i++) { + if (!sub_batches->Get(i)->IsObject()) { + return scope.Close(Boolean::New(false)); + } + sub_batch = sub_batches->Get(i)->ToObject(); + + if (sub_batch->Has(String::NewSymbol("column_family"))) { + if (!has_key_for_array(sub_batch, "put") && + !has_key_for_array(sub_batch, "delete")) { + return scope.Close(Boolean::New(false)); + } + + well_formed = db_wrapper->AddToBatch( + batch, false, + 
Handle::Cast(sub_batch->Get(String::NewSymbol("put"))), + db_wrapper, *v8::String::Utf8Value(sub_batch->Get( + String::NewSymbol("column_family")))); + + well_formed = db_wrapper->AddToBatch( + batch, true, + Handle::Cast(sub_batch->Get(String::NewSymbol("delete"))), + db_wrapper, *v8::String::Utf8Value(sub_batch->Get( + String::NewSymbol("column_family")))); + } else { + well_formed = db_wrapper->AddToBatch( + batch, false, + Handle::Cast(sub_batch->Get(String::NewSymbol("put")))); + well_formed = db_wrapper->AddToBatch( + batch, true, + Handle::Cast(sub_batch->Get(String::NewSymbol("delete")))); + + if (!well_formed) { + return scope.Close(Boolean::New(false)); + } + } + } + + db_wrapper->status_ = db_wrapper->db_->Write(rocksdb::WriteOptions(), &batch); + + return scope.Close(Boolean::New(db_wrapper->status_.ok())); +} + +Handle DBWrapper::CompactRangeDefault(const Arguments& args) { + HandleScope scope; + + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); + rocksdb::Slice begin = *v8::String::Utf8Value(args[0]->ToString()); + rocksdb::Slice end = *v8::String::Utf8Value(args[1]->ToString()); + db_wrapper->status_ = db_wrapper->db_->CompactRange(&end, &begin); + + return scope.Close(Boolean::New(db_wrapper->status_.ok())); +} + +Handle DBWrapper::CompactColumnFamily(const Arguments& args) { + HandleScope scope; + + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); + rocksdb::Slice begin = *v8::String::Utf8Value(args[0]->ToString()); + rocksdb::Slice end = *v8::String::Utf8Value(args[1]->ToString()); + std::string cf = *v8::String::Utf8Value(args[2]->ToString()); + db_wrapper->status_ = db_wrapper->db_->CompactRange( + db_wrapper->columnFamilies_[cf], &begin, &end); + + return scope.Close(Boolean::New(db_wrapper->status_.ok())); +} + +Handle DBWrapper::CompactOptions(const Arguments& args) { + HandleScope scope; + + if (!args[2]->IsObject()) { + return scope.Close(Boolean::New(false)); + } + + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); 
+ rocksdb::Slice begin = *v8::String::Utf8Value(args[0]->ToString()); + rocksdb::Slice end = *v8::String::Utf8Value(args[1]->ToString()); + Local options = args[2]->ToObject(); + int target_level = -1, target_path_id = 0; + + if (options->Has(String::NewSymbol("target_level")) && + options->Get(String::NewSymbol("target_level"))->IsInt32()) { + target_level = (int)(options->Get( + String::NewSymbol("target_level"))->ToInt32()->Value()); + + if (options->Has(String::NewSymbol("target_path_id")) || + options->Get(String::NewSymbol("target_path_id"))->IsInt32()) { + target_path_id = (int)(options->Get( + String::NewSymbol("target_path_id"))->ToInt32()->Value()); + } + } + + db_wrapper->status_ = db_wrapper->db_->CompactRange( + &begin, &end, true, target_level, target_path_id + ); + + return scope.Close(Boolean::New(db_wrapper->status_.ok())); +} + +Handle DBWrapper::CompactAll(const Arguments& args) { + HandleScope scope; + + if (!args[2]->IsObject() || !args[3]->IsString()) { + return scope.Close(Boolean::New(false)); + } + + DBWrapper* db_wrapper = ObjectWrap::Unwrap(args.This()); + rocksdb::Slice begin = *v8::String::Utf8Value(args[0]->ToString()); + rocksdb::Slice end = *v8::String::Utf8Value(args[1]->ToString()); + Local options = args[2]->ToObject(); + std::string cf = *v8::String::Utf8Value(args[3]->ToString()); + + int target_level = -1, target_path_id = 0; + + if (options->Has(String::NewSymbol("target_level")) && + options->Get(String::NewSymbol("target_level"))->IsInt32()) { + target_level = (int)(options->Get( + String::NewSymbol("target_level"))->ToInt32()->Value()); + + if (options->Has(String::NewSymbol("target_path_id")) || + options->Get(String::NewSymbol("target_path_id"))->IsInt32()) { + target_path_id = (int)(options->Get( + String::NewSymbol("target_path_id"))->ToInt32()->Value()); + } + } + + db_wrapper->status_ = db_wrapper->db_->CompactRange( + db_wrapper->columnFamilies_[cf], &begin, &end, true, target_level, + target_path_id); + + return 
scope.Close(Boolean::New(db_wrapper->status_.ok())); +} + +Handle DBWrapper::CompactRange(const Arguments& args) { + HandleScope scope; + + if (!args[0]->IsString() || !args[1]->IsString()) { + return scope.Close(Boolean::New(false)); + } + + switch(args.Length()) { + case 2: + return CompactRangeDefault(args); + case 3: + return args[2]->IsString() ? CompactColumnFamily(args) : + CompactOptions(args); + default: + return CompactAll(args); + } +} + +Handle DBWrapper::Close(const Arguments& args) { + HandleScope scope; + + delete ObjectWrap::Unwrap(args.This()); + + return scope.Close(Null()); +} diff --git a/tools/rdb/db_wrapper.h b/tools/rdb/db_wrapper.h new file mode 100644 index 000000000..9d1c8f886 --- /dev/null +++ b/tools/rdb/db_wrapper.h @@ -0,0 +1,58 @@ +#ifndef DBWRAPPER_H +#define DBWRAPPER_H + +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/slice.h" +#include "rocksdb/options.h" + +using namespace v8; + +// Used to encapsulate a particular instance of an opened database. +// +// This object should not be used directly in C++; it exists solely to provide +// a mapping from a JavaScript object to a C++ code that can use the RocksDB +// API. 
+class DBWrapper : public node::ObjectWrap { + public: + static void Init(Handle exports); + + private: + explicit DBWrapper(); + ~DBWrapper(); + + // Helper methods + static bool HasFamilyNamed(std::string& name, DBWrapper* db); + static bool AddToBatch(rocksdb::WriteBatch& batch, bool del, + Handle array); + static bool AddToBatch(rocksdb::WriteBatch& batch, bool del, + Handle array, DBWrapper* db_wrapper, std::string cf); + static Handle CompactRangeDefault(const v8::Arguments& args); + static Handle CompactColumnFamily(const Arguments& args); + static Handle CompactOptions(const Arguments& args); + static Handle CompactAll(const Arguments& args); + + // C++ mappings of API methods + static Persistent constructor; + static Handle Open(const Arguments& args); + static Handle New(const Arguments& args); + static Handle Get(const Arguments& args); + static Handle Put(const Arguments& args); + static Handle Delete(const Arguments& args); + static Handle Dump(const Arguments& args); + static Handle WriteBatch(const Arguments& args); + static Handle CreateColumnFamily(const Arguments& args); + static Handle CompactRange(const Arguments& args); + static Handle Close(const Arguments& args); + + // Internal fields + rocksdb::Options options_; + rocksdb::Status status_; + rocksdb::DB* db_; + std::unordered_map + columnFamilies_; +}; + +#endif diff --git a/tools/rdb/rdb b/tools/rdb/rdb new file mode 100755 index 000000000..82cd17fb7 --- /dev/null +++ b/tools/rdb/rdb @@ -0,0 +1,3 @@ +#!/bin/bash + +node -e "RDB = require('./build/Release/rdb').DBWrapper; console.log('Loaded rocksdb in variable RDB'); repl = require('repl').start('> ');" diff --git a/tools/rdb/rdb.cc b/tools/rdb/rdb.cc new file mode 100644 index 000000000..8710e4623 --- /dev/null +++ b/tools/rdb/rdb.cc @@ -0,0 +1,15 @@ +#ifndef BUILDING_NODE_EXTENSION +#define BUILDING_NODE_EXTENSION +#endif + +#include +#include +#include "db_wrapper.h" + +using namespace v8; + +void InitAll(Handle exports) { + 
DBWrapper::Init(exports); +} + +NODE_MODULE(rdb, InitAll) diff --git a/tools/rdb/unit_test.js b/tools/rdb/unit_test.js new file mode 100644 index 000000000..d74ee8ce5 --- /dev/null +++ b/tools/rdb/unit_test.js @@ -0,0 +1,124 @@ +assert = require('assert') +RDB = require('./build/Release/rdb').DBWrapper +exec = require('child_process').exec +util = require('util') + +DB_NAME = '/tmp/rocksdbtest-' + process.getuid() + +a = RDB() +assert.equal(a.open(DB_NAME, ['b']), false) + +exec( + util.format( + "node -e \"RDB = require('./build/Release/rdb').DBWrapper; \ + a = RDB('%s'); a.createColumnFamily('b')\"", + DB_NAME + ).exitCode, null +) + + +exec( + util.format( + "node -e \"RDB = require('./build/Release/rdb').DBWrapper; \ + a = RDB('%s', ['b'])\"", + DB_NAME + ).exitCode, null +) + +exec('rm -rf ' + DB_NAME) + +a = RDB() +assert.equal(a.open(DB_NAME, ['a']), false) +assert(a.open(DB_NAME), true) +assert(a.createColumnFamily('temp')) + +b = RDB() +assert.equal(b.open(DB_NAME), false) + +exec('rm -rf ' + DB_NAME) + +DB_NAME += 'b' + +a = RDB() +assert(a.open(DB_NAME)) +assert.equal(a.constructor.name, 'DBWrapper') +assert.equal(a.createColumnFamily(), false) +assert.equal(a.createColumnFamily(1), false) +assert.equal(a.createColumnFamily(['']), false) +assert(a.createColumnFamily('b')) +assert.equal(a.createColumnFamily('b'), false) + +// Get and Put +assert.equal(a.get(1), null) +assert.equal(a.get(['a']), null) +assert.equal(a.get('a', 1), null) +assert.equal(a.get(1, 'a'), null) +assert.equal(a.get(1, 1), null) + +assert.equal(a.put(1), false) +assert.equal(a.put(['a']), false) +assert.equal(a.put('a', 1), false) +assert.equal(a.put(1, 'a'), false) +assert.equal(a.put(1, 1), false) +assert.equal(a.put('a', 'a', 1), false) +assert.equal(a.put('a', 1, 'a'), false) +assert.equal(a.put(1, 'a', 'a'), false) +assert.equal(a.put('a', 1, 1), false) +assert.equal(a.put(1, 'a', 1), false) +assert.equal(a.put(1, 1, 'a'), false) +assert.equal(a.put(1, 1, 1), false) + + 
+assert.equal(a.get(), null) +assert.equal(a.get('a'), null) +assert.equal(a.get('a', 'c'), null) +assert.equal(a.put(), false) +assert.equal(a.put('a'), false) +assert.equal(a.get('a', 'b', 'c'), null) + +assert(a.put('a', 'axe')) +assert(a.put('a', 'first')) +assert.equal(a.get('a'), 'first') +assert.equal(a.get('a', 'b'), null) +assert.equal(a.get('a', 'c'), null) + +assert(a.put('a', 'apple', 'b')) +assert.equal(a.get('a', 'b'), 'apple') +assert.equal(a.get('a'), 'first') +assert(a.put('b', 'butter', 'b'), 'butter') +assert(a.put('b', 'banana', 'b')) +assert.equal(a.get('b', 'b'), 'banana') +assert.equal(a.get('b'), null) +assert.equal(a.get('b', 'c'), null) + +// Delete +assert.equal(a.delete(1), false) +assert.equal(a.delete('a', 1), false) +assert.equal(a.delete(1, 'a'), false) +assert.equal(a.delete(1, 1), false) + +assert.equal(a.delete('b'), true) +assert(a.delete('a')) +assert.equal(a.get('a'), null) +assert.equal(a.get('a', 'b'), 'apple') +assert.equal(a.delete('c', 'c'), false) +assert.equal(a.delete('c', 'b'), true) +assert(a.delete('b', 'b')) +assert.equal(a.get('b', 'b'), null) + +// Dump +console.log("MARKER 1") +assert(a.dump()) +console.log("Should be no output between 'MARKER 1' and here\n") +console.log('Next line should be "a" => "apple"') +assert(a.dump('b')) + +console.log("\nMARKER 2") +assert.equal(a.dump('c'), false) +console.log("Should be no output between 'MARKER 2' and here\n") + +// WriteBatch + + +// Clean up test database +exec('rm -rf ' + DB_NAME) diff --git a/tools/reduce_levels_test.cc b/tools/reduce_levels_test.cc index b41f36d01..b1d58e10e 100644 --- a/tools/reduce_levels_test.cc +++ b/tools/reduce_levels_test.cc @@ -76,6 +76,7 @@ Status ReduceLevelTest::OpenDB(bool create_if_missing, int num_levels, opt.num_levels = num_levels; opt.create_if_missing = create_if_missing; opt.max_mem_compaction_level = mem_table_compact_level; + opt.max_background_flushes = 0; rocksdb::Status st = rocksdb::DB::Open(opt, dbname_, &db_); if 
(!st.ok()) { fprintf(stderr, "Can't open the db:%s\n", st.ToString().c_str()); diff --git a/tools/run_flash_bench.sh b/tools/run_flash_bench.sh new file mode 100755 index 000000000..2d2fd2ade --- /dev/null +++ b/tools/run_flash_bench.sh @@ -0,0 +1,52 @@ +#!/bin/bash +# REQUIRE: benchmark.sh exists in the current directory +# After execution of this script, log files are generated in $output_dir. +# report.txt provides a high level statistics + +# Size constants +K=1024 +M=$((1024 * K)) +G=$((1024 * M)) + +n=$((1 * G)) +wps=$((80 * K)) +duration=$((12 * 60 * 60)) +num_read_threads=24 + +# Update these parameters before execution !!! +db_dir="/tmp/rocksdb/" +wal_dir="/tmp/rocksdb/" +output_dir="/tmp/output" + + +# Test 1: bulk load +OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ + ./benchmark.sh bulkload + +# Test 2: sequential fill +OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ + ./benchmark.sh fillseq + +# Test 3: overwrite +OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ + ./benchmark.sh overwrite + +# Prepare: populate DB with random data +OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ + ./benchmark.sh filluniquerandom + +# Test 4: random read +OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ + DURATION=$duration NUM_READ_THREADS=$num_read_threads \ + ./benchmark.sh readrandom + +# Test 5: random read while writing +OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ + DURATION=$duration NUM_READ_THREADS=$num_read_threads WRITES_PER_SECOND=$wps \ + ./benchmark.sh readwhilewriting + +# Test 6: random seek + next()'s while writing +OUTPUT_DIR=$output_dir NUM_KEYS=$n DB_DIR=$db_dir WAL_DIR=$wal_dir \ + DURATION=$duration NUM_READ_THREADS=$num_read_threads WRITES_PER_SECOND=$wps \ + NUM_NEXTS_PER_SEEK=10 \ + ./benchmark.sh rangescanwhilewriting diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc index 9b130c7c6..403893779 100644 --- 
a/tools/sst_dump.cc +++ b/tools/sst_dump.cc @@ -3,418 +3,19 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. // +#ifndef ROCKSDB_LITE -#include -#include -#include -#include +#include "rocksdb/sst_dump_tool.h" -#include "db/dbformat.h" -#include "db/memtable.h" -#include "db/write_batch_internal.h" -#include "rocksdb/db.h" -#include "rocksdb/env.h" -#include "rocksdb/iterator.h" -#include "rocksdb/slice_transform.h" -#include "rocksdb/table.h" -#include "rocksdb/table_properties.h" -#include "table/block_based_table_factory.h" -#include "table/plain_table_factory.h" -#include "table/meta_blocks.h" -#include "table/block.h" -#include "table/block_builder.h" -#include "table/format.h" -#include "util/ldb_cmd.h" -#include "util/random.h" -#include "util/testharness.h" -#include "util/testutil.h" - -namespace rocksdb { - -class SstFileReader { - public: - explicit SstFileReader(const std::string& file_name, - bool verify_checksum, - bool output_hex); - - Status ReadSequential(bool print_kv, - uint64_t read_num, - bool has_from, - const std::string& from_key, - bool has_to, - const std::string& to_key); - - Status ReadTableProperties( - std::shared_ptr* table_properties); - uint64_t GetReadNumber() { return read_num_; } - TableProperties* GetInitTableProperties() { return table_properties_.get(); } - - private: - Status NewTableReader(const std::string& file_path); - Status ReadTableProperties(uint64_t table_magic_number, - RandomAccessFile* file, uint64_t file_size); - Status SetTableOptionsByMagicNumber(uint64_t table_magic_number); - Status SetOldTableOptions(); - - std::string file_name_; - uint64_t read_num_; - bool verify_checksum_; - bool output_hex_; - EnvOptions soptions_; - - Status init_result_; - unique_ptr table_reader_; - unique_ptr file_; - // options_ and internal_comparator_ will also be used in - // ReadSequential internally (specifically, 
seek-related operations) - Options options_; - InternalKeyComparator internal_comparator_; - unique_ptr table_properties_; -}; - -SstFileReader::SstFileReader(const std::string& file_path, - bool verify_checksum, - bool output_hex) - :file_name_(file_path), read_num_(0), verify_checksum_(verify_checksum), - output_hex_(output_hex), internal_comparator_(BytewiseComparator()) { - fprintf(stdout, "Process %s\n", file_path.c_str()); - - init_result_ = NewTableReader(file_name_); -} - -extern uint64_t kBlockBasedTableMagicNumber; -extern uint64_t kLegacyBlockBasedTableMagicNumber; -extern uint64_t kPlainTableMagicNumber; -extern uint64_t kLegacyPlainTableMagicNumber; - -Status SstFileReader::NewTableReader(const std::string& file_path) { - uint64_t magic_number; - - // read table magic number - Footer footer; - - unique_ptr file; - uint64_t file_size; - Status s = options_.env->NewRandomAccessFile(file_path, &file_, soptions_); - if (s.ok()) { - s = options_.env->GetFileSize(file_path, &file_size); - } - if (s.ok()) { - s = ReadFooterFromFile(file_.get(), file_size, &footer); - } - if (s.ok()) { - magic_number = footer.table_magic_number(); - } - - if (s.ok()) { - if (magic_number == kPlainTableMagicNumber || - magic_number == kLegacyPlainTableMagicNumber) { - soptions_.use_mmap_reads = true; - options_.env->NewRandomAccessFile(file_path, &file_, soptions_); - } - options_.comparator = &internal_comparator_; - // For old sst format, ReadTableProperties might fail but file can be read - if (ReadTableProperties(magic_number, file_.get(), file_size).ok()) { - SetTableOptionsByMagicNumber(magic_number); - } else { - SetOldTableOptions(); - } - } - - if (s.ok()) { - s = options_.table_factory->NewTableReader( - options_, soptions_, internal_comparator_, std::move(file_), file_size, - &table_reader_); - } - return s; -} - -Status SstFileReader::ReadTableProperties(uint64_t table_magic_number, - RandomAccessFile* file, - uint64_t file_size) { - TableProperties* 
table_properties = nullptr; - Status s = rocksdb::ReadTableProperties(file, file_size, table_magic_number, - options_.env, options_.info_log.get(), - &table_properties); - if (s.ok()) { - table_properties_.reset(table_properties); - } else { - fprintf(stdout, "Not able to read table properties\n"); - } - return s; -} - -Status SstFileReader::SetTableOptionsByMagicNumber( - uint64_t table_magic_number) { - assert(table_properties_); - if (table_magic_number == kBlockBasedTableMagicNumber || - table_magic_number == kLegacyBlockBasedTableMagicNumber) { - options_.table_factory = std::make_shared(); - fprintf(stdout, "Sst file format: block-based\n"); - auto& props = table_properties_->user_collected_properties; - auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); - if (pos != props.end()) { - auto index_type_on_file = static_cast( - DecodeFixed32(pos->second.c_str())); - if (index_type_on_file == - BlockBasedTableOptions::IndexType::kHashSearch) { - options_.prefix_extractor.reset(NewNoopTransform()); - } - } - } else if (table_magic_number == kPlainTableMagicNumber || - table_magic_number == kLegacyPlainTableMagicNumber) { - options_.allow_mmap_reads = true; - - PlainTableOptions plain_table_options; - plain_table_options.user_key_len = kPlainTableVariableLength; - plain_table_options.bloom_bits_per_key = 0; - plain_table_options.hash_table_ratio = 0; - plain_table_options.index_sparseness = 1; - plain_table_options.huge_page_tlb_size = 0; - plain_table_options.encoding_type = kPlain; - plain_table_options.full_scan_mode = true; - - options_.table_factory.reset(NewPlainTableFactory(plain_table_options)); - fprintf(stdout, "Sst file format: plain table\n"); - } else { - char error_msg_buffer[80]; - snprintf(error_msg_buffer, sizeof(error_msg_buffer) - 1, - "Unsupported table magic number --- %lx", - (long)table_magic_number); - return Status::InvalidArgument(error_msg_buffer); - } - - return Status::OK(); -} - -Status SstFileReader::SetOldTableOptions() 
{ - assert(table_properties_ == nullptr); - options_.table_factory = std::make_shared(); - fprintf(stdout, "Sst file format: block-based(old version)\n"); - - return Status::OK(); -} - -Status SstFileReader::ReadSequential(bool print_kv, - uint64_t read_num, - bool has_from, - const std::string& from_key, - bool has_to, - const std::string& to_key) { - if (!table_reader_) { - return init_result_; - } - - Iterator* iter = table_reader_->NewIterator(ReadOptions(verify_checksum_, - false)); - uint64_t i = 0; - if (has_from) { - InternalKey ikey(from_key, kMaxSequenceNumber, kValueTypeForSeek); - iter->Seek(ikey.Encode()); - } else { - iter->SeekToFirst(); - } - for (; iter->Valid(); iter->Next()) { - Slice key = iter->key(); - Slice value = iter->value(); - ++i; - if (read_num > 0 && i > read_num) - break; - - ParsedInternalKey ikey; - if (!ParseInternalKey(key, &ikey)) { - std::cerr << "Internal Key [" - << key.ToString(true /* in hex*/) - << "] parse error!\n"; - continue; - } - - // If end marker was specified, we stop before it - if (has_to && BytewiseComparator()->Compare(ikey.user_key, to_key) >= 0) { - break; - } - - if (print_kv) { - fprintf(stdout, "%s => %s\n", - ikey.DebugString(output_hex_).c_str(), - value.ToString(output_hex_).c_str()); - } - } - - read_num_ += i; - - Status ret = iter->status(); - delete iter; - return ret; -} - -Status SstFileReader::ReadTableProperties( - std::shared_ptr* table_properties) { - if (!table_reader_) { - return init_result_; - } - - *table_properties = table_reader_->GetTableProperties(); - return init_result_; -} - -} // namespace rocksdb - -static void print_help() { - fprintf(stderr, - "sst_dump [--command=check|scan|none] [--verify_checksum] " - "--file=data_dir_OR_sst_file" - " [--output_hex]" - " [--input_key_hex]" - " [--from=]" - " [--to=]" - " [--read_num=NUM]" - " [--show_properties]\n"); -} - -namespace { -string HexToString(const string& str) { - string parsed; - if (str[0] != '0' || str[1] != 'x') { - 
fprintf(stderr, "Invalid hex input %s. Must start with 0x\n", - str.c_str()); - throw "Invalid hex input"; - } - - for (unsigned int i = 2; i < str.length();) { - int c; - sscanf(str.c_str() + i, "%2X", &c); - parsed.push_back(c); - i += 2; - } - return parsed; +int main(int argc, char** argv) { + rocksdb::SSTDumpTool tool; + tool.Run(argc, argv); + return 0; } -} // namespace - +#else +#include int main(int argc, char** argv) { - const char* dir_or_file = nullptr; - uint64_t read_num = -1; - std::string command; - - char junk; - uint64_t n; - bool verify_checksum = false; - bool output_hex = false; - bool input_key_hex = false; - bool has_from = false; - bool has_to = false; - bool show_properties = false; - std::string from_key; - std::string to_key; - for (int i = 1; i < argc; i++) { - if (strncmp(argv[i], "--file=", 7) == 0) { - dir_or_file = argv[i] + 7; - } else if (strcmp(argv[i], "--output_hex") == 0) { - output_hex = true; - } else if (strcmp(argv[i], "--input_key_hex") == 0) { - input_key_hex = true; - } else if (sscanf(argv[i], - "--read_num=%lu%c", - (unsigned long*)&n, &junk) == 1) { - read_num = n; - } else if (strcmp(argv[i], "--verify_checksum") == 0) { - verify_checksum = true; - } else if (strncmp(argv[i], "--command=", 10) == 0) { - command = argv[i] + 10; - } else if (strncmp(argv[i], "--from=", 7) == 0) { - from_key = argv[i] + 7; - has_from = true; - } else if (strncmp(argv[i], "--to=", 5) == 0) { - to_key = argv[i] + 5; - has_to = true; - } else if (strcmp(argv[i], "--show_properties") == 0) { - show_properties = true; - } else { - print_help(); - exit(1); - } - } - - if (input_key_hex) { - if (has_from) { - from_key = HexToString(from_key); - } - if (has_to) { - to_key = HexToString(to_key); - } - } - - if (dir_or_file == nullptr) { - print_help(); - exit(1); - } - - std::vector filenames; - rocksdb::Env* env = rocksdb::Env::Default(); - rocksdb::Status st = env->GetChildren(dir_or_file, &filenames); - bool dir = true; - if (!st.ok()) { - 
filenames.clear(); - filenames.push_back(dir_or_file); - dir = false; - } - - fprintf(stdout, "from [%s] to [%s]\n", - rocksdb::Slice(from_key).ToString(true).c_str(), - rocksdb::Slice(to_key).ToString(true).c_str()); - - uint64_t total_read = 0; - for (size_t i = 0; i < filenames.size(); i++) { - std::string filename = filenames.at(i); - if (filename.length() <= 4 || - filename.rfind(".sst") != filename.length() - 4) { - // ignore - continue; - } - if (dir) { - filename = std::string(dir_or_file) + "/" + filename; - } - rocksdb::SstFileReader reader(filename, verify_checksum, - output_hex); - rocksdb::Status st; - // scan all files in give file path. - if (command == "" || command == "scan" || command == "check") { - st = reader.ReadSequential(command != "check", - read_num > 0 ? (read_num - total_read) : - read_num, - has_from, from_key, has_to, to_key); - if (!st.ok()) { - fprintf(stderr, "%s: %s\n", filename.c_str(), - st.ToString().c_str()); - } - total_read += reader.GetReadNumber(); - if (read_num > 0 && total_read > read_num) { - break; - } - } - if (show_properties) { - const rocksdb::TableProperties* table_properties; - - std::shared_ptr - table_properties_from_reader; - st = reader.ReadTableProperties(&table_properties_from_reader); - if (!st.ok()) { - fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str()); - fprintf(stderr, "Try to use initial table properties\n"); - table_properties = reader.GetInitTableProperties(); - } else { - table_properties = table_properties_from_reader.get(); - } - if (table_properties != nullptr) { - fprintf(stdout, - "Table Properties:\n" - "------------------------------\n" - " %s", - table_properties->ToString("\n ", ": ").c_str()); - fprintf(stdout, "# deleted keys: %zd\n", - rocksdb::GetDeletedKeys( - table_properties->user_collected_properties)); - } - } - } + fprintf(stderr, "Not supported in lite mode.\n"); + return 1; } +#endif // ROCKSDB_LITE diff --git a/util/allocator.h b/util/allocator.h new file 
mode 100644 index 000000000..58bf0da31 --- /dev/null +++ b/util/allocator.h @@ -0,0 +1,32 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// Abstract interface for allocating memory in blocks. This memory is freed +// when the allocator object is destroyed. See the Arena class for more info. + +#pragma once +#include +#include + +namespace rocksdb { + +class Logger; + +class Allocator { + public: + virtual ~Allocator() {} + + virtual char* Allocate(size_t bytes) = 0; + virtual char* AllocateAligned(size_t bytes, size_t huge_page_size = 0, + Logger* logger = nullptr) = 0; + + virtual size_t BlockSize() const = 0; +}; + +} // namespace rocksdb diff --git a/util/arena.cc b/util/arena.cc index 6efe687c6..3f00f0845 100644 --- a/util/arena.cc +++ b/util/arena.cc @@ -32,13 +32,20 @@ size_t OptimizeBlockSize(size_t block_size) { return block_size; } -Arena::Arena(size_t block_size) : kBlockSize(OptimizeBlockSize(block_size)) { +Arena::Arena(size_t block_size, size_t huge_page_size) + : kBlockSize(OptimizeBlockSize(block_size)) { assert(kBlockSize >= kMinBlockSize && kBlockSize <= kMaxBlockSize && kBlockSize % kAlignUnit == 0); alloc_bytes_remaining_ = sizeof(inline_block_); blocks_memory_ += alloc_bytes_remaining_; aligned_alloc_ptr_ = inline_block_; unaligned_alloc_ptr_ = inline_block_ + alloc_bytes_remaining_; +#ifdef MAP_HUGETLB + hugetlb_size_ = huge_page_size; + if (hugetlb_size_ && kBlockSize > hugetlb_size_) { + hugetlb_size_ = ((kBlockSize - 1U) / hugetlb_size_ + 1U) * hugetlb_size_; + } +#endif } 
Arena::~Arena() { @@ -62,20 +69,49 @@ char* Arena::AllocateFallback(size_t bytes, bool aligned) { } // We waste the remaining space in the current block. - auto block_head = AllocateNewBlock(kBlockSize); - alloc_bytes_remaining_ = kBlockSize - bytes; + size_t size; + char* block_head = nullptr; + if (hugetlb_size_) { + size = hugetlb_size_; + block_head = AllocateFromHugePage(size); + } + if (!block_head) { + size = kBlockSize; + block_head = AllocateNewBlock(size); + } + alloc_bytes_remaining_ = size - bytes; if (aligned) { aligned_alloc_ptr_ = block_head + bytes; - unaligned_alloc_ptr_ = block_head + kBlockSize; + unaligned_alloc_ptr_ = block_head + size; return block_head; } else { aligned_alloc_ptr_ = block_head; - unaligned_alloc_ptr_ = block_head + kBlockSize - bytes; + unaligned_alloc_ptr_ = block_head + size - bytes; return unaligned_alloc_ptr_; } } +char* Arena::AllocateFromHugePage(size_t bytes) { +#ifdef MAP_HUGETLB + if (hugetlb_size_ == 0) { + return nullptr; + } + + void* addr = mmap(nullptr, bytes, (PROT_READ | PROT_WRITE), + (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), 0, 0); + + if (addr == MAP_FAILED) { + return nullptr; + } + huge_blocks_.push_back(MmapInfo(addr, bytes)); + blocks_memory_ += bytes; + return reinterpret_cast(addr); +#else + return nullptr; +#endif +} + char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size, Logger* logger) { assert((kAlignUnit & (kAlignUnit - 1)) == @@ -88,17 +124,14 @@ char* Arena::AllocateAligned(size_t bytes, size_t huge_page_size, size_t reserved_size = ((bytes - 1U) / huge_page_size + 1U) * huge_page_size; assert(reserved_size >= bytes); - void* addr = mmap(nullptr, reserved_size, (PROT_READ | PROT_WRITE), - (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB), 0, 0); - if (addr == MAP_FAILED) { + char* addr = AllocateFromHugePage(reserved_size); + if (addr == nullptr) { Warn(logger, "AllocateAligned fail to allocate huge TLB pages: %s", strerror(errno)); // fail back to malloc } else { - blocks_memory_ += 
reserved_size; - huge_blocks_.push_back(MmapInfo(addr, reserved_size)); - return reinterpret_cast(addr); + return addr; } } #endif diff --git a/util/arena.h b/util/arena.h index 0855c205c..644a12947 100644 --- a/util/arena.h +++ b/util/arena.h @@ -7,7 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -// Arena is an implementation of Arena class. For a request of small size, +// Arena is an implementation of Allocator class. For a request of small size, // it allocates a block with pre-defined block size. For a request of big // size, it uses malloc to directly get the requested size. @@ -17,15 +17,13 @@ #include #include #include -#include "util/arena.h" +#include "util/allocator.h" namespace rocksdb { -class Logger; - const size_t kInlineSize = 2048; -class Arena { +class Arena : public Allocator { public: // No copying allowed Arena(const Arena&) = delete; @@ -35,10 +33,13 @@ class Arena { static const size_t kMinBlockSize; static const size_t kMaxBlockSize; - explicit Arena(size_t block_size = kMinBlockSize); + // huge_page_size: if 0, don't use huge page TLB. If > 0 (should set to the + // supported hugepage size of the system), block allocation will try huge + // page TLB first. If allocation fails, will fall back to normal case. + explicit Arena(size_t block_size = kMinBlockSize, size_t huge_page_size = 0); ~Arena(); - char* Allocate(size_t bytes); + char* Allocate(size_t bytes) override; // huge_page_size: if >0, will try to allocate from huage page TLB. // The argument will be the size of the page size for huge page TLB. Bytes @@ -53,7 +54,7 @@ class Arena { // huge_page_tlb_size > 0, we highly recommend a logger is passed in. // Otherwise, the error message will be printed out to stderr directly. 
char* AllocateAligned(size_t bytes, size_t huge_page_size = 0, - Logger* logger = nullptr); + Logger* logger = nullptr) override; // Returns an estimate of the total memory usage of data allocated // by the arena (exclude the space allocated but not yet used for future @@ -69,9 +70,9 @@ class Arena { // If an allocation is too big, we'll allocate an irregular block with the // same size of that allocation. - virtual size_t IrregularBlockNum() const { return irregular_block_num; } + size_t IrregularBlockNum() const { return irregular_block_num; } - size_t BlockSize() const { return kBlockSize; } + size_t BlockSize() const override { return kBlockSize; } private: char inline_block_[kInlineSize]; @@ -100,6 +101,8 @@ class Arena { // How many bytes left in currently active block? size_t alloc_bytes_remaining_ = 0; + size_t hugetlb_size_ = 0; + char* AllocateFromHugePage(size_t bytes); char* AllocateFallback(size_t bytes, bool aligned); char* AllocateNewBlock(size_t block_bytes); diff --git a/util/arena_test.cc b/util/arena_test.cc index 7b6cfd0af..7f55a7e53 100644 --- a/util/arena_test.cc +++ b/util/arena_test.cc @@ -13,17 +13,21 @@ namespace rocksdb { +namespace { +const size_t kHugePageSize = 2 * 1024 * 1024; +} // namespace class ArenaTest {}; TEST(ArenaTest, Empty) { Arena arena0; } -TEST(ArenaTest, MemoryAllocatedBytes) { +namespace { +void MemoryAllocatedBytesTest(size_t huge_page_size) { const int N = 17; size_t req_sz; // requested size size_t bsz = 8192; // block size size_t expected_memory_allocated; - Arena arena(bsz); + Arena arena(bsz, huge_page_size); // requested size > quarter of a block: // allocate requested size separately @@ -44,8 +48,15 @@ TEST(ArenaTest, MemoryAllocatedBytes) { for (int i = 0; i < N; i++) { arena.Allocate(req_sz); } - expected_memory_allocated += bsz; - ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); + if (huge_page_size) { + ASSERT_TRUE(arena.MemoryAllocatedBytes() == + expected_memory_allocated + bsz || + 
arena.MemoryAllocatedBytes() == + expected_memory_allocated + huge_page_size); + } else { + expected_memory_allocated += bsz; + ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); + } // requested size > quarter of a block: // allocate requested size separately @@ -54,16 +65,23 @@ TEST(ArenaTest, MemoryAllocatedBytes) { arena.Allocate(req_sz); } expected_memory_allocated += req_sz * N; - ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); + if (huge_page_size) { + ASSERT_TRUE(arena.MemoryAllocatedBytes() == + expected_memory_allocated + bsz || + arena.MemoryAllocatedBytes() == + expected_memory_allocated + huge_page_size); + } else { + ASSERT_EQ(arena.MemoryAllocatedBytes(), expected_memory_allocated); + } } // Make sure we didn't count the allocate but not used memory space in // Arena::ApproximateMemoryUsage() -TEST(ArenaTest, ApproximateMemoryUsageTest) { +static void ApproximateMemoryUsageTest(size_t huge_page_size) { const size_t kBlockSize = 4096; const size_t kEntrySize = kBlockSize / 8; const size_t kZero = 0; - Arena arena(kBlockSize); + Arena arena(kBlockSize, huge_page_size); ASSERT_EQ(kZero, arena.ApproximateMemoryUsage()); // allocate inline bytes @@ -78,7 +96,12 @@ TEST(ArenaTest, ApproximateMemoryUsageTest) { // first allocation arena.AllocateAligned(kEntrySize); auto mem_usage = arena.MemoryAllocatedBytes(); - ASSERT_EQ(mem_usage, kBlockSize + Arena::kInlineSize); + if (huge_page_size) { + ASSERT_TRUE(mem_usage == kBlockSize + Arena::kInlineSize || + mem_usage == huge_page_size + Arena::kInlineSize); + } else { + ASSERT_EQ(mem_usage, kBlockSize + Arena::kInlineSize); + } auto usage = arena.ApproximateMemoryUsage(); ASSERT_LT(usage, mem_usage); for (size_t i = 1; i < num_blocks; ++i) { @@ -87,12 +110,17 @@ TEST(ArenaTest, ApproximateMemoryUsageTest) { ASSERT_EQ(arena.ApproximateMemoryUsage(), usage + kEntrySize); usage = arena.ApproximateMemoryUsage(); } - ASSERT_GT(usage, mem_usage); + if (huge_page_size) { + 
ASSERT_TRUE(usage > mem_usage || + usage + huge_page_size - kBlockSize == mem_usage); + } else { + ASSERT_GT(usage, mem_usage); + } } -TEST(ArenaTest, Simple) { +static void SimpleTest(size_t huge_page_size) { std::vector> allocated; - Arena arena; + Arena arena(Arena::kMinBlockSize, huge_page_size); const int N = 100000; size_t bytes = 0; Random rnd(301); @@ -136,7 +164,22 @@ TEST(ArenaTest, Simple) { } } } +} // namespace +TEST(ArenaTest, MemoryAllocatedBytes) { + MemoryAllocatedBytesTest(0); + MemoryAllocatedBytesTest(kHugePageSize); +} + +TEST(ArenaTest, ApproximateMemoryUsage) { + ApproximateMemoryUsageTest(0); + ApproximateMemoryUsageTest(kHugePageSize); +} + +TEST(ArenaTest, Simple) { + SimpleTest(0); + SimpleTest(kHugePageSize); +} } // namespace rocksdb int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/util/auto_roll_logger.cc b/util/auto_roll_logger.cc index 4812d1c4d..684abfc30 100644 --- a/util/auto_roll_logger.cc +++ b/util/auto_roll_logger.cc @@ -18,8 +18,7 @@ Status AutoRollLogger::ResetLogger() { return status_; } - if (logger_->GetLogFileSize() == - (size_t)Logger::DO_NOT_SUPPORT_GET_LOG_FILE_SIZE) { + if (logger_->GetLogFileSize() == Logger::kDoNotSupportGetLogFileSize) { status_ = Status::NotSupported( "The underlying logger doesn't support GetLogFileSize()"); } @@ -38,6 +37,27 @@ void AutoRollLogger::RollLogFile() { env_->RenameFile(log_fname_, old_fname); } +string AutoRollLogger::ValistToString(const char* format, va_list args) const { + // Any log messages longer than 1024 will get truncated. + // The user is responsible for chopping longer messages into multi line log + static const int MAXBUFFERSIZE = 1024; + char buffer[MAXBUFFERSIZE]; + + int count = vsnprintf(buffer, MAXBUFFERSIZE, format, args); + (void) count; + assert(count >= 0); + + return buffer; +} + +void AutoRollLogger::LogInternal(const char* format, ...) 
{ + mutex_.AssertHeld(); + va_list args; + va_start(args, format); + logger_->Logv(format, args); + va_end(args); +} + void AutoRollLogger::Logv(const char* format, va_list ap) { assert(GetStatus().ok()); @@ -52,6 +72,8 @@ void AutoRollLogger::Logv(const char* format, va_list ap) { // can't really log the error if creating a new LOG file failed return; } + + WriteHeaderInfo(); } // pin down the current logger_ instance before releasing the mutex. @@ -67,6 +89,29 @@ void AutoRollLogger::Logv(const char* format, va_list ap) { logger->Logv(format, ap); } +void AutoRollLogger::WriteHeaderInfo() { + mutex_.AssertHeld(); + for (auto header : headers_) { + LogInternal("%s", header.c_str()); + } +} + +void AutoRollLogger::LogHeader(const char* format, va_list args) { + // header message are to be retained in memory. Since we cannot make any + // assumptions about the data contained in va_list, we will retain them as + // strings + va_list tmp; + va_copy(tmp, args); + string data = ValistToString(format, tmp); + va_end(tmp); + + MutexLock l(&mutex_); + headers_.push_back(data); + + // Log the original message to the current log + logger_->Logv(format, args); +} + bool AutoRollLogger::LogExpired() { if (cached_now_access_count >= call_NowMicros_every_N_records_) { cached_now = static_cast(env_->NowMicros() * 1e-6); diff --git a/util/auto_roll_logger.h b/util/auto_roll_logger.h index c592d79ce..486a1eae3 100644 --- a/util/auto_roll_logger.h +++ b/util/auto_roll_logger.h @@ -7,6 +7,8 @@ // where enough posix functionality is available. #pragma once +#include + #include "db/filename.h" #include "port/port.h" #include "util/posix_logger.h" @@ -38,8 +40,13 @@ class AutoRollLogger : public Logger { ResetLogger(); } + using Logger::Logv; void Logv(const char* format, va_list ap); + // Write a header entry to the log. All header information will be written + // again every time the log rolls over. 
+ virtual void LogHeader(const char* format, va_list ap) override; + // check if the logger has encountered any problem. Status GetStatus() { return status_; @@ -57,10 +64,15 @@ class AutoRollLogger : public Logger { } private: - bool LogExpired(); Status ResetLogger(); void RollLogFile(); + // Log message to logger without rolling + void LogInternal(const char* format, ...); + // Serialize the va_list to a string + std::string ValistToString(const char* format, va_list args) const; + // Write the logs marked as headers to the new log file + void WriteHeaderInfo(); std::string log_fname_; // Current active info log's file name. std::string dbname_; @@ -72,6 +84,8 @@ class AutoRollLogger : public Logger { Status status_; const size_t kMaxLogFileSize; const size_t kLogFileTimeToRoll; + // header information + std::list headers_; // to avoid frequent env->NowMicros() calls, we cached the current time uint64_t cached_now; uint64_t ctime_; diff --git a/util/auto_roll_logger_test.cc b/util/auto_roll_logger_test.cc index 366ca084e..7f75edf99 100755 --- a/util/auto_roll_logger_test.cc +++ b/util/auto_roll_logger_test.cc @@ -122,7 +122,7 @@ uint64_t AutoRollLoggerTest::RollLogFileByTimeTest( } // -- Make the log file expire - sleep(time); + sleep(static_cast(time)); LogMessage(logger, log_message.c_str()); // At this time, the new log file should be created. 
@@ -285,6 +285,47 @@ TEST(AutoRollLoggerTest, InfoLogLevel) { inFile.close(); } +// Test the logger Header function for roll over logs +// We expect the new logs creates as roll over to carry the headers specified +TEST(AutoRollLoggerTest, LogHeaderTest) { + static const size_t MAX_HEADERS = 10; + static const size_t LOG_MAX_SIZE = 1024 * 5; + static const std::string HEADER_STR = "Log header line"; + + InitTestDb(); + + AutoRollLogger logger(Env::Default(), kTestDir, /*db_log_dir=*/ "", + LOG_MAX_SIZE, /*log_file_time_to_roll=*/ 0); + + // log some headers + for (size_t i = 0; i < MAX_HEADERS; i++) { + Header(&logger, "%s %d", HEADER_STR.c_str(), i); + } + + // log enough data to cause a roll over + size_t i = 0; + while (logger.GetLogFileSize() < LOG_MAX_SIZE) { + Info(&logger, (kSampleMessage + ":LogHeaderTest line %d").c_str(), i); + ++i; + } + + // verify that the new log contains all the header logs + std::stringstream ssbuf; + std::string line; + size_t count = 0; + + std::ifstream inFile(AutoRollLoggerTest::kLogFile.c_str()); + ssbuf << inFile.rdbuf(); + + while (getline(ssbuf, line)) { + if (line.find(HEADER_STR) != std::string::npos) { + count++; + } + } + + ASSERT_EQ(count, MAX_HEADERS); +} + TEST(AutoRollLoggerTest, LogFileExistence) { rocksdb::DB* db; rocksdb::Options options; diff --git a/util/autovector.h b/util/autovector.h index e143c46cb..9362536d3 100644 --- a/util/autovector.h +++ b/util/autovector.h @@ -201,16 +201,12 @@ class autovector { // will check boundry const_reference at(size_type n) const { - if (n >= size()) { - throw std::out_of_range("autovector: index out of range"); - } + assert(n < size()); return (*this)[n]; } reference at(size_type n) { - if (n >= size()) { - throw std::out_of_range("autovector: index out of range"); - } + assert(n < size()); return (*this)[n]; } diff --git a/util/autovector_test.cc b/util/autovector_test.cc index 25ebaa24b..4ff982250 100644 --- a/util/autovector_test.cc +++ b/util/autovector_test.cc @@ -5,6 
+5,7 @@ #include #include +#include #include "rocksdb/env.h" #include "util/autovector.h" @@ -48,11 +49,11 @@ TEST(AutoVectorTest, PushBackAndPopBack) { } TEST(AutoVectorTest, EmplaceBack) { - typedef std::pair ValueType; - autovector vec; + typedef std::pair ValType; + autovector vec; for (size_t i = 0; i < 1000 * kSize; ++i) { - vec.emplace_back(i, std::to_string(i + 123)); + vec.emplace_back(i, ToString(i + 123)); ASSERT_TRUE(!vec.empty()); if (i < kSize) { ASSERT_TRUE(vec.only_in_stack()); @@ -62,7 +63,7 @@ TEST(AutoVectorTest, EmplaceBack) { ASSERT_EQ(i + 1, vec.size()); ASSERT_EQ(i, vec[i].first); - ASSERT_EQ(std::to_string(i + 123), vec[i].second); + ASSERT_EQ(ToString(i + 123), vec[i].second); } vec.clear(); @@ -128,7 +129,7 @@ TEST(AutoVectorTest, CopyAndAssignment) { TEST(AutoVectorTest, Iterators) { autovector vec; for (size_t i = 0; i < kSize * 1000; ++i) { - vec.push_back(std::to_string(i)); + vec.push_back(ToString(i)); } // basic operator test diff --git a/util/benchharness.cc b/util/benchharness.cc index 8cd37007b..e533ed454 100644 --- a/util/benchharness.cc +++ b/util/benchharness.cc @@ -16,6 +16,26 @@ #include #include #include +#include "util/string_util.h" + +#ifndef GFLAGS +bool FLAGS_benchmark = false; +uint64_t FLAGS_bm_min_usec = 100; +int64_t FLAGS_bm_min_iters = 1; +int32_t FLAGS_bm_max_secs = 1; +#else +#include +DEFINE_bool(benchmark, false, "Run benchmarks."); + +DEFINE_uint64(bm_min_usec, 100, + "Minimum # of microseconds we'll accept for each benchmark."); + +DEFINE_int64(bm_min_iters, 1, + "Minimum # of iterations we'll try for each benchmark."); + +DEFINE_int32(bm_max_secs, 1, + "Maximum # of seconds we'll spend on each benchmark."); +#endif // GFLAGS using std::function; using std::get; @@ -28,18 +48,6 @@ using std::string; using std::tuple; using std::vector; -DEFINE_bool(benchmark, false, "Run benchmarks."); - -DEFINE_int64(bm_min_usec, 100, - "Minimum # of microseconds we'll accept for each benchmark."); - 
-DEFINE_int64(bm_min_iters, 1, - "Minimum # of iterations we'll try for each benchmark."); - -DEFINE_int32(bm_max_secs, 1, - "Maximum # of seconds we'll spend on each benchmark."); - - namespace rocksdb { namespace benchmark { @@ -206,7 +214,8 @@ static double RunBenchmarkGetNSPerIteration(const BenchmarkFun& fun, size_t actualEpochs = 0; for (; actualEpochs < epochs; ++actualEpochs) { - for (unsigned int n = FLAGS_bm_min_iters; n < (1UL << 30); n *= 2) { + for (unsigned int n = static_cast(FLAGS_bm_min_iters); + n < (1UL << 30); n *= 2) { auto const nsecs = fun(n); if (nsecs < minNanoseconds) { continue; @@ -275,7 +284,7 @@ static const ScaleInfo kMetricSuffixes[] { static string HumanReadable(double n, unsigned int decimals, const ScaleInfo* scales) { if (std::isinf(n) || std::isnan(n)) { - return std::to_string(n); + return ToString(n); } const double absValue = fabs(n); diff --git a/util/benchharness.h b/util/benchharness.h index 4fdef520c..948fdf2ff 100644 --- a/util/benchharness.h +++ b/util/benchharness.h @@ -9,8 +9,6 @@ #pragma once -#include - #include #include #include diff --git a/util/benchharness_test.cc b/util/benchharness_test.cc index 75ff65892..f2c910edb 100644 --- a/util/benchharness_test.cc +++ b/util/benchharness_test.cc @@ -10,35 +10,35 @@ namespace rocksdb { BENCHMARK(insertFrontVector) { - std::vector v; - for (int i = 0; i < 100; i++) { + std::vector v; + for (size_t i = 0; i < 100; i++) { v.insert(v.begin(), i); } } BENCHMARK_RELATIVE(insertBackVector) { - std::vector v; + std::vector v; for (size_t i = 0; i < 100; i++) { v.insert(v.end(), i); } } BENCHMARK_N(insertFrontVector_n, n) { - std::vector v; + std::vector v; for (size_t i = 0; i < n; i++) { v.insert(v.begin(), i); } } BENCHMARK_RELATIVE_N(insertBackVector_n, n) { - std::vector v; + std::vector v; for (size_t i = 0; i < n; i++) { v.insert(v.end(), i); } } BENCHMARK_N(insertFrontEnd_n, n) { - std::vector v; + std::vector v; for (size_t i = 0; i < n; i++) { v.insert(v.begin(), i); } 
@@ -48,7 +48,7 @@ BENCHMARK_N(insertFrontEnd_n, n) { } BENCHMARK_RELATIVE_N(insertFrontEndSuspend_n, n) { - std::vector v; + std::vector v; for (size_t i = 0; i < n; i++) { v.insert(v.begin(), i); } diff --git a/util/blob_store.cc b/util/blob_store.cc deleted file mode 100644 index daaf4bc02..000000000 --- a/util/blob_store.cc +++ /dev/null @@ -1,270 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#ifndef ROCKSDB_LITE -#include "util/blob_store.h" - -namespace rocksdb { - -using namespace std; - -// BlobChunk -bool BlobChunk::ImmediatelyBefore(const BlobChunk& chunk) const { - // overlapping!? - assert(!Overlap(chunk)); - // size == 0 is a marker, not a block - return size != 0 && - bucket_id == chunk.bucket_id && - offset + size == chunk.offset; -} - -bool BlobChunk::Overlap(const BlobChunk &chunk) const { - return size != 0 && chunk.size != 0 && bucket_id == chunk.bucket_id && - ((offset >= chunk.offset && offset < chunk.offset + chunk.size) || - (chunk.offset >= offset && chunk.offset < offset + size)); -} - -// Blob -string Blob::ToString() const { - string ret; - for (auto chunk : chunks) { - PutFixed32(&ret, chunk.bucket_id); - PutFixed32(&ret, chunk.offset); - PutFixed32(&ret, chunk.size); - } - return ret; -} - -Blob::Blob(const std::string& blob) { - for (uint32_t i = 0; i < blob.size(); ) { - uint32_t t[3] = {0}; - for (int j = 0; j < 3 && i + sizeof(uint32_t) - 1 < blob.size(); - ++j, i += sizeof(uint32_t)) { - t[j] = DecodeFixed32(blob.data() + i); - } - chunks.push_back(BlobChunk(t[0], t[1], t[2])); - } -} - -// FreeList -Status FreeList::Free(const Blob& blob) { - // add it back to the free list - for (auto chunk : blob.chunks) { - free_blocks_ += chunk.size; - if (fifo_free_chunks_.size() && - fifo_free_chunks_.back().ImmediatelyBefore(chunk)) { - 
fifo_free_chunks_.back().size += chunk.size; - } else { - fifo_free_chunks_.push_back(chunk); - } - } - - return Status::OK(); -} - -Status FreeList::Allocate(uint32_t blocks, Blob* blob) { - if (free_blocks_ < blocks) { - return Status::Incomplete(""); - } - - blob->chunks.clear(); - free_blocks_ -= blocks; - - while (blocks > 0) { - assert(fifo_free_chunks_.size() > 0); - auto& front = fifo_free_chunks_.front(); - if (front.size > blocks) { - blob->chunks.push_back(BlobChunk(front.bucket_id, front.offset, blocks)); - front.offset += blocks; - front.size -= blocks; - blocks = 0; - } else { - blob->chunks.push_back(front); - blocks -= front.size; - fifo_free_chunks_.pop_front(); - } - } - assert(blocks == 0); - - return Status::OK(); -} - -bool FreeList::Overlap(const Blob &blob) const { - for (auto chunk : blob.chunks) { - for (auto itr = fifo_free_chunks_.begin(); - itr != fifo_free_chunks_.end(); - ++itr) { - if (itr->Overlap(chunk)) { - return true; - } - } - } - return false; -} - -// BlobStore -BlobStore::BlobStore(const string& directory, - uint64_t block_size, - uint32_t blocks_per_bucket, - uint32_t max_buckets, - Env* env) : - directory_(directory), - block_size_(block_size), - blocks_per_bucket_(blocks_per_bucket), - env_(env), - max_buckets_(max_buckets) { - env_->CreateDirIfMissing(directory_); - - storage_options_.use_mmap_writes = false; - storage_options_.use_mmap_reads = false; - - buckets_size_ = 0; - buckets_ = new unique_ptr[max_buckets_]; - - CreateNewBucket(); -} - -BlobStore::~BlobStore() { - // TODO we don't care about recovery for now - delete [] buckets_; -} - -Status BlobStore::Put(const Slice& value, Blob* blob) { - // convert size to number of blocks - Status s = Allocate((value.size() + block_size_ - 1) / block_size_, blob); - if (!s.ok()) { - return s; - } - auto size_left = (uint64_t) value.size(); - - uint64_t offset = 0; // in bytes, not blocks - for (auto chunk : blob->chunks) { - uint64_t write_size = min(chunk.size * 
block_size_, size_left); - assert(chunk.bucket_id < buckets_size_); - s = buckets_[chunk.bucket_id].get()->Write(chunk.offset * block_size_, - Slice(value.data() + offset, - write_size)); - if (!s.ok()) { - Delete(*blob); - return s; - } - offset += write_size; - size_left -= write_size; - if (write_size < chunk.size * block_size_) { - // if we have any space left in the block, fill it up with zeros - string zero_string(chunk.size * block_size_ - write_size, 0); - s = buckets_[chunk.bucket_id].get()->Write(chunk.offset * block_size_ + - write_size, - Slice(zero_string)); - } - } - - if (size_left > 0) { - Delete(*blob); - return Status::Corruption("Tried to write more data than fits in the blob"); - } - - return Status::OK(); -} - -Status BlobStore::Get(const Blob& blob, - string* value) const { - { - // assert that it doesn't overlap with free list - // it will get compiled out for release - MutexLock l(&free_list_mutex_); - assert(!free_list_.Overlap(blob)); - } - - value->resize(blob.Size() * block_size_); - - uint64_t offset = 0; // in bytes, not blocks - for (auto chunk : blob.chunks) { - Slice result; - assert(chunk.bucket_id < buckets_size_); - Status s; - s = buckets_[chunk.bucket_id].get()->Read(chunk.offset * block_size_, - chunk.size * block_size_, - &result, - &value->at(offset)); - if (!s.ok()) { - value->clear(); - return s; - } - if (result.size() < chunk.size * block_size_) { - value->clear(); - return Status::Corruption("Could not read in from file"); - } - offset += chunk.size * block_size_; - } - - // remove the '\0's at the end of the string - value->erase(find(value->begin(), value->end(), '\0'), value->end()); - - return Status::OK(); -} - -Status BlobStore::Delete(const Blob& blob) { - MutexLock l(&free_list_mutex_); - return free_list_.Free(blob); -} - -Status BlobStore::Sync() { - for (size_t i = 0; i < buckets_size_; ++i) { - Status s = buckets_[i].get()->Sync(); - if (!s.ok()) { - return s; - } - } - return Status::OK(); -} - -Status 
BlobStore::Allocate(uint32_t blocks, Blob* blob) { - MutexLock l(&free_list_mutex_); - Status s; - - s = free_list_.Allocate(blocks, blob); - if (!s.ok()) { - s = CreateNewBucket(); - if (!s.ok()) { - return s; - } - s = free_list_.Allocate(blocks, blob); - } - - return s; -} - -// called with free_list_mutex_ held -Status BlobStore::CreateNewBucket() { - MutexLock l(&buckets_mutex_); - - if (buckets_size_ >= max_buckets_) { - return Status::NotSupported("Max size exceeded\n"); - } - - int new_bucket_id = buckets_size_; - - char fname[200]; - sprintf(fname, "%s/%d.bs", directory_.c_str(), new_bucket_id); - - Status s = env_->NewRandomRWFile(string(fname), - &buckets_[new_bucket_id], - storage_options_); - if (!s.ok()) { - return s; - } - - // whether Allocate succeeds or not, does not affect the overall correctness - // of this function - calling Allocate is really optional - // (also, tmpfs does not support allocate) - buckets_[new_bucket_id].get()->Allocate(0, block_size_ * blocks_per_bucket_); - - buckets_size_ = new_bucket_id + 1; - - return free_list_.Free(Blob(new_bucket_id, 0, blocks_per_bucket_)); -} - -} // namespace rocksdb -#endif // ROCKSDB_LITE diff --git a/util/blob_store.h b/util/blob_store.h deleted file mode 100644 index ce8633740..000000000 --- a/util/blob_store.h +++ /dev/null @@ -1,163 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#ifndef ROCKSDB_LITE -#pragma once -#include "rocksdb/env.h" -#include "rocksdb/status.h" -#include "port/port.h" -#include "util/mutexlock.h" -#include "util/coding.h" - -#include -#include -#include -#include -#include -#include -#include - -namespace rocksdb { - -struct BlobChunk { - uint32_t bucket_id; - uint32_t offset; // in blocks - uint32_t size; // in blocks - BlobChunk() {} - BlobChunk(uint32_t bucket_id, uint32_t offset, uint32_t size) : - bucket_id(bucket_id), offset(offset), size(size) {} - - // returns true if it's immediately before chunk - bool ImmediatelyBefore(const BlobChunk& chunk) const; - // returns true if chunks overlap - bool Overlap(const BlobChunk &chunk) const; -}; - -// We represent each Blob as a string in format: -// bucket_id offset size|bucket_id offset size... -// The string can be used to reference the Blob stored on external -// device/file -// Not thread-safe! -struct Blob { - // Generates the string - std::string ToString() const; - // Parses the previously generated string - explicit Blob(const std::string& blob); - // Creates unfragmented Blob - Blob(uint32_t bucket_id, uint32_t offset, uint32_t size) { - SetOneChunk(bucket_id, offset, size); - } - Blob() {} - - void SetOneChunk(uint32_t bucket_id, uint32_t offset, uint32_t size) { - chunks.clear(); - chunks.push_back(BlobChunk(bucket_id, offset, size)); - } - - uint32_t Size() const { // in blocks - uint32_t ret = 0; - for (auto chunk : chunks) { - ret += chunk.size; - } - assert(ret > 0); - return ret; - } - - // bucket_id, offset, size - std::vector chunks; -}; - -// Keeps a list of free chunks -// NOT thread-safe. Externally synchronized -class FreeList { - public: - FreeList() : - free_blocks_(0) {} - ~FreeList() {} - - // Allocates a a blob. Stores the allocated blob in - // 'blob'. Returns non-OK status if it failed to allocate. - // Thread-safe - Status Allocate(uint32_t blocks, Blob* blob); - // Frees the blob for reuse. 
Thread-safe - Status Free(const Blob& blob); - - // returns true if blob is overlapping with any of the - // chunks stored in free list - bool Overlap(const Blob &blob) const; - - private: - std::deque fifo_free_chunks_; - uint32_t free_blocks_; - mutable port::Mutex mutex_; -}; - -// thread-safe -class BlobStore { - public: - // directory - wherever the blobs should be stored. It will be created - // if missing - // block_size - self explanatory - // blocks_per_bucket - how many blocks we want to keep in one bucket. - // Bucket is a device or a file that we use to store the blobs. - // If we don't have enough blocks to allocate a new blob, we will - // try to create a new file or device. - // max_buckets - maximum number of buckets BlobStore will create - // BlobStore max size in bytes is - // max_buckets * blocks_per_bucket * block_size - // env - env for creating new files - BlobStore(const std::string& directory, - uint64_t block_size, - uint32_t blocks_per_bucket, - uint32_t max_buckets, - Env* env); - ~BlobStore(); - - // Allocates space for value.size bytes (rounded up to be multiple of - // block size) and writes value.size bytes from value.data to a backing store. - // Sets Blob blob that can than be used for addressing the - // stored value. Returns non-OK status on error. - Status Put(const Slice& value, Blob* blob); - // Value needs to have enough space to store all the loaded stuff. - // This function is thread safe! - Status Get(const Blob& blob, std::string* value) const; - // Frees the blob for reuse, but does not delete the data - // on the backing store. 
- Status Delete(const Blob& blob); - // Sync all opened files that are modified - Status Sync(); - - private: - const std::string directory_; - // block_size_ is uint64_t because when we multiply with - // blocks_size_ we want the result to be uint64_t or - // we risk overflowing - const uint64_t block_size_; - const uint32_t blocks_per_bucket_; - Env* env_; - EnvOptions storage_options_; - // protected by free_list_mutex_ - FreeList free_list_; - // free_list_mutex_ is locked BEFORE buckets_mutex_ - mutable port::Mutex free_list_mutex_; - // protected by buckets_mutex_ - // array of buckets - unique_ptr* buckets_; - // number of buckets in the array - uint32_t buckets_size_; - uint32_t max_buckets_; - mutable port::Mutex buckets_mutex_; - - // Calls FreeList allocate. If free list can't allocate - // new blob, creates new bucket and tries again - // Thread-safe - Status Allocate(uint32_t blocks, Blob* blob); - - // Creates a new backing store and adds all the blocks - // from the new backing store to the free list - Status CreateNewBucket(); -}; - -} // namespace rocksdb -#endif // ROCKSDB_LITE diff --git a/util/blob_store_test.cc b/util/blob_store_test.cc deleted file mode 100644 index f199f5ddd..000000000 --- a/util/blob_store_test.cc +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. 
- -#include "util/blob_store.h" - -#include "util/testharness.h" -#include "util/testutil.h" -#include "util/random.h" - -#include -#include - -namespace rocksdb { - -using namespace std; - -class BlobStoreTest { }; - -TEST(BlobStoreTest, RangeParseTest) { - Blob e; - for (int i = 0; i < 5; ++i) { - e.chunks.push_back(BlobChunk(rand(), rand(), rand())); - } - string x = e.ToString(); - Blob nx(x); - - ASSERT_EQ(nx.ToString(), x); -} - -// make sure we're reusing the freed space -TEST(BlobStoreTest, SanityTest) { - const uint64_t block_size = 10; - const uint32_t blocks_per_file = 20; - Random random(5); - - BlobStore blob_store(test::TmpDir() + "/blob_store_test", - block_size, - blocks_per_file, - 1000, - Env::Default()); - - string buf; - - // put string of size 170 - test::RandomString(&random, 170, &buf); - Blob r1; - ASSERT_OK(blob_store.Put(Slice(buf), &r1)); - // use the first file - for (size_t i = 0; i < r1.chunks.size(); ++i) { - ASSERT_EQ(r1.chunks[0].bucket_id, 0u); - } - - // put string of size 30 - test::RandomString(&random, 30, &buf); - Blob r2; - ASSERT_OK(blob_store.Put(Slice(buf), &r2)); - // use the first file - for (size_t i = 0; i < r2.chunks.size(); ++i) { - ASSERT_EQ(r2.chunks[0].bucket_id, 0u); - } - - // delete blob of size 170 - ASSERT_OK(blob_store.Delete(r1)); - - // put a string of size 100 - test::RandomString(&random, 100, &buf); - Blob r3; - ASSERT_OK(blob_store.Put(Slice(buf), &r3)); - // use the first file - for (size_t i = 0; i < r3.chunks.size(); ++i) { - ASSERT_EQ(r3.chunks[0].bucket_id, 0u); - } - - // put a string of size 70 - test::RandomString(&random, 70, &buf); - Blob r4; - ASSERT_OK(blob_store.Put(Slice(buf), &r4)); - // use the first file - for (size_t i = 0; i < r4.chunks.size(); ++i) { - ASSERT_EQ(r4.chunks[0].bucket_id, 0u); - } - - // put a string of size 5 - test::RandomString(&random, 5, &buf); - Blob r5; - ASSERT_OK(blob_store.Put(Slice(buf), &r5)); - // now you get to use the second file - for (size_t i = 0; i < 
r5.chunks.size(); ++i) { - ASSERT_EQ(r5.chunks[0].bucket_id, 1u); - } -} - -TEST(BlobStoreTest, FragmentedChunksTest) { - const uint64_t block_size = 10; - const uint32_t blocks_per_file = 20; - Random random(5); - - BlobStore blob_store(test::TmpDir() + "/blob_store_test", - block_size, - blocks_per_file, - 1000, - Env::Default()); - - string buf; - - vector r(4); - - // put 4 strings of size 50 - for (int k = 0; k < 4; ++k) { - test::RandomString(&random, 50, &buf); - ASSERT_OK(blob_store.Put(Slice(buf), &r[k])); - // use the first file - for (size_t i = 0; i < r[k].chunks.size(); ++i) { - ASSERT_EQ(r[k].chunks[0].bucket_id, 0u); - } - } - - // delete the first and third - ASSERT_OK(blob_store.Delete(r[0])); - ASSERT_OK(blob_store.Delete(r[2])); - - // put string of size 100. it should reuse space that we deleting - // by deleting first and third strings of size 50 - test::RandomString(&random, 100, &buf); - Blob r2; - ASSERT_OK(blob_store.Put(Slice(buf), &r2)); - // use the first file - for (size_t i = 0; i < r2.chunks.size(); ++i) { - ASSERT_EQ(r2.chunks[0].bucket_id, 0u); - } -} - -TEST(BlobStoreTest, CreateAndStoreTest) { - const uint64_t block_size = 10; - const uint32_t blocks_per_file = 1000; - const int max_blurb_size = 300; - Random random(5); - - BlobStore blob_store(test::TmpDir() + "/blob_store_test", - block_size, - blocks_per_file, - 10000, - Env::Default()); - vector> ranges; - - for (int i = 0; i < 2000; ++i) { - int decision = rand() % 5; - if (decision <= 2 || ranges.size() == 0) { - string buf; - int size_blocks = (rand() % max_blurb_size + 1); - int string_size = size_blocks * block_size - (rand() % block_size); - test::RandomString(&random, string_size, &buf); - Blob r; - ASSERT_OK(blob_store.Put(Slice(buf), &r)); - ranges.push_back(make_pair(r, buf)); - } else if (decision == 3) { - int ti = rand() % ranges.size(); - string out_buf; - ASSERT_OK(blob_store.Get(ranges[ti].first, &out_buf)); - ASSERT_EQ(ranges[ti].second, out_buf); - } else { - 
int ti = rand() % ranges.size(); - ASSERT_OK(blob_store.Delete(ranges[ti].first)); - ranges.erase(ranges.begin() + ti); - } - } - ASSERT_OK(blob_store.Sync()); -} - -TEST(BlobStoreTest, MaxSizeTest) { - const uint64_t block_size = 10; - const uint32_t blocks_per_file = 100; - const int max_buckets = 10; - Random random(5); - - BlobStore blob_store(test::TmpDir() + "/blob_store_test", - block_size, - blocks_per_file, - max_buckets, - Env::Default()); - string buf; - for (int i = 0; i < max_buckets; ++i) { - test::RandomString(&random, 1000, &buf); - Blob r; - ASSERT_OK(blob_store.Put(Slice(buf), &r)); - } - - test::RandomString(&random, 1000, &buf); - Blob r; - // should fail because max size - Status s = blob_store.Put(Slice(buf), &r); - ASSERT_EQ(s.ok(), false); -} - -} // namespace rocksdb - -int main(int argc, char** argv) { - return rocksdb::test::RunAllTests(); -} diff --git a/util/bloom.cc b/util/bloom.cc index 723adf843..007d4f273 100644 --- a/util/bloom.cc +++ b/util/bloom.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2014, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -10,42 +10,268 @@ #include "rocksdb/filter_policy.h" #include "rocksdb/slice.h" +#include "table/block_based_filter_block.h" +#include "table/full_filter_block.h" #include "util/hash.h" +#include "util/coding.h" namespace rocksdb { +class BlockBasedFilterBlockBuilder; +class FullFilterBlockBuilder; + namespace { +class FullFilterBitsBuilder : public FilterBitsBuilder { + public: + explicit FullFilterBitsBuilder(const size_t bits_per_key, + const size_t num_probes) + : bits_per_key_(bits_per_key), + num_probes_(num_probes) { + assert(bits_per_key_); + } + + ~FullFilterBitsBuilder() {} + + virtual void AddKey(const Slice& key) override { + uint32_t hash = BloomHash(key); + if (hash_entries_.size() == 0 || hash != hash_entries_.back()) { + hash_entries_.push_back(hash); + } + } + + // Create a filter that for hashes [0, n-1], the filter is allocated here + // When creating filter, it is ensured that + // total_bits = num_lines * CACHE_LINE_SIZE * 8 + // dst len is >= 5, 1 for num_probes, 4 for num_lines + // Then total_bits = (len - 5) * 8, and cache_line_size could be calulated + // +----------------------------------------------------------------+ + // | filter data with length total_bits/8 | + // +----------------------------------------------------------------+ + // | | + // | ... | + // | | + // +----------------------------------------------------------------+ + // | ... 
| num_probes : 1 byte | num_lines : 4 bytes | + // +----------------------------------------------------------------+ + virtual Slice Finish(std::unique_ptr* buf) override { + uint32_t total_bits, num_lines; + char* data = ReserveSpace(static_cast(hash_entries_.size()), + &total_bits, &num_lines); + assert(data); + + if (total_bits != 0 && num_lines != 0) { + for (auto h : hash_entries_) { + AddHash(h, data, num_lines, total_bits); + } + } + data[total_bits/8] = static_cast(num_probes_); + EncodeFixed32(data + total_bits/8 + 1, static_cast(num_lines)); + + const char* const_data = data; + buf->reset(const_data); + hash_entries_.clear(); + + return Slice(data, total_bits / 8 + 5); + } -class BloomFilterPolicy : public FilterPolicy { private: size_t bits_per_key_; - size_t k_; - uint32_t (*hash_func_)(const Slice& key); + size_t num_probes_; + std::vector hash_entries_; - void initialize() { - // We intentionally round down to reduce probing cost a little bit - k_ = static_cast(bits_per_key_ * 0.69); // 0.69 =~ ln(2) - if (k_ < 1) k_ = 1; - if (k_ > 30) k_ = 30; + // Get totalbits that optimized for cpu cache line + uint32_t GetTotalBitsForLocality(uint32_t total_bits); + + // Reserve space for new filter + char* ReserveSpace(const int num_entry, uint32_t* total_bits, + uint32_t* num_lines); + + // Assuming single threaded access to this function. + void AddHash(uint32_t h, char* data, uint32_t num_lines, + uint32_t total_bits); + + // No Copy allowed + FullFilterBitsBuilder(const FullFilterBitsBuilder&); + void operator=(const FullFilterBitsBuilder&); +}; + +uint32_t FullFilterBitsBuilder::GetTotalBitsForLocality(uint32_t total_bits) { + uint32_t num_lines = + (total_bits + CACHE_LINE_SIZE * 8 - 1) / (CACHE_LINE_SIZE * 8); + + // Make num_lines an odd number to make sure more bits are involved + // when determining which block. 
+ if (num_lines % 2 == 0) { + num_lines++; + } + return num_lines * (CACHE_LINE_SIZE * 8); +} + +char* FullFilterBitsBuilder::ReserveSpace(const int num_entry, + uint32_t* total_bits, uint32_t* num_lines) { + assert(bits_per_key_); + char* data = nullptr; + if (num_entry != 0) { + uint32_t total_bits_tmp = num_entry * static_cast(bits_per_key_); + + *total_bits = GetTotalBitsForLocality(total_bits_tmp); + *num_lines = *total_bits / (CACHE_LINE_SIZE * 8); + assert(*total_bits > 0 && *total_bits % 8 == 0); + } else { + // filter is empty, just leave space for metadata + *total_bits = 0; + *num_lines = 0; } + // Reserve space for Filter + uint32_t sz = *total_bits / 8; + sz += 5; // 4 bytes for num_lines, 1 byte for num_probes + + data = new char[sz]; + memset(data, 0, sz); + return data; +} + +inline void FullFilterBitsBuilder::AddHash(uint32_t h, char* data, + uint32_t num_lines, uint32_t total_bits) { + assert(num_lines > 0 && total_bits > 0); + + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + uint32_t b = (h % num_lines) * (CACHE_LINE_SIZE * 8); + + for (uint32_t i = 0; i < num_probes_; ++i) { + // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized + // to a simple operation by compiler. 
+ const uint32_t bitpos = b + (h % (CACHE_LINE_SIZE * 8)); + data[bitpos / 8] |= (1 << (bitpos % 8)); + + h += delta; + } +} + +class FullFilterBitsReader : public FilterBitsReader { public: - explicit BloomFilterPolicy(int bits_per_key, - uint32_t (*hash_func)(const Slice& key)) - : bits_per_key_(bits_per_key), hash_func_(hash_func) { - initialize(); + explicit FullFilterBitsReader(const Slice& contents) + : data_(const_cast(contents.data())), + data_len_(static_cast(contents.size())), + num_probes_(0), + num_lines_(0) { + assert(data_); + GetFilterMeta(contents, &num_probes_, &num_lines_); + // Sanitize broken parameter + if (num_lines_ != 0 && (data_len_-5) % num_lines_ != 0) { + num_lines_ = 0; + num_probes_ = 0; + } + } + + ~FullFilterBitsReader() {} + + virtual bool MayMatch(const Slice& entry) override { + if (data_len_ <= 5) { // remain same with original filter + return false; + } + // Other Error params, including a broken filter, regarded as match + if (num_probes_ == 0 || num_lines_ == 0) return true; + uint32_t hash = BloomHash(entry); + return HashMayMatch(hash, Slice(data_, data_len_), + num_probes_, num_lines_); } - explicit BloomFilterPolicy(int bits_per_key) - : bits_per_key_(bits_per_key) { - hash_func_ = BloomHash; + + private: + // Filter meta data + char* data_; + uint32_t data_len_; + size_t num_probes_; + uint32_t num_lines_; + + // Get num_probes, and num_lines from filter + // If filter format broken, set both to 0. + void GetFilterMeta(const Slice& filter, size_t* num_probes, + uint32_t* num_lines); + + // "filter" contains the data appended by a preceding call to + // CreateFilterFromHash() on this class. This method must return true if + // the key was in the list of keys passed to CreateFilter(). + // This method may return true or false if the key was not on the + // list, but it should aim to return false with a high probability. 
+ // + // hash: target to be checked + // filter: the whole filter, including meta data bytes + // num_probes: number of probes, read before hand + // num_lines: filter metadata, read before hand + // Before calling this function, need to ensure the input meta data + // is valid. + bool HashMayMatch(const uint32_t& hash, const Slice& filter, + const size_t& num_probes, const uint32_t& num_lines); + + // No Copy allowed + FullFilterBitsReader(const FullFilterBitsReader&); + void operator=(const FullFilterBitsReader&); +}; + +void FullFilterBitsReader::GetFilterMeta(const Slice& filter, + size_t* num_probes, uint32_t* num_lines) { + uint32_t len = static_cast(filter.size()); + if (len <= 5) { + // filter is empty or broken + *num_probes = 0; + *num_lines = 0; + return; + } + + *num_probes = filter.data()[len - 5]; + *num_lines = DecodeFixed32(filter.data() + len - 4); +} + +bool FullFilterBitsReader::HashMayMatch(const uint32_t& hash, + const Slice& filter, const size_t& num_probes, + const uint32_t& num_lines) { + uint32_t len = static_cast(filter.size()); + if (len <= 5) return false; // remain the same with original filter + + // It is ensured the params are valid before calling it + assert(num_probes != 0); + assert(num_lines != 0 && (len - 5) % num_lines == 0); + uint32_t cache_line_size = (len - 5) / num_lines; + const char* data = filter.data(); + + uint32_t h = hash; + const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits + uint32_t b = (h % num_lines) * (cache_line_size * 8); + + for (uint32_t i = 0; i < num_probes; ++i) { + // Since CACHE_LINE_SIZE is defined as 2^n, this line will be optimized + // to a simple and operation by compiler. 
+ const uint32_t bitpos = b + (h % (cache_line_size * 8)); + if (((data[bitpos / 8]) & (1 << (bitpos % 8))) == 0) { + return false; + } + + h += delta; + } + + return true; +} + +// An implementation of filter policy +class BloomFilterPolicy : public FilterPolicy { + public: + explicit BloomFilterPolicy(int bits_per_key, bool use_block_based_builder) + : bits_per_key_(bits_per_key), hash_func_(BloomHash), + use_block_based_builder_(use_block_based_builder) { initialize(); } - virtual const char* Name() const { + ~BloomFilterPolicy() { + } + + virtual const char* Name() const override { return "rocksdb.BuiltinBloomFilter"; } - virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const { + virtual void CreateFilter(const Slice* keys, int n, + std::string* dst) const override { // Compute bloom filter size (in both bits and bytes) size_t bits = n * bits_per_key_; @@ -58,14 +284,14 @@ class BloomFilterPolicy : public FilterPolicy { const size_t init_size = dst->size(); dst->resize(init_size + bytes, 0); - dst->push_back(static_cast(k_)); // Remember # of probes in filter + dst->push_back(static_cast(num_probes_)); // Remember # of probes char* array = &(*dst)[init_size]; for (size_t i = 0; i < (size_t)n; i++) { // Use double-hashing to generate a sequence of hash values. // See analysis in [Kirsch,Mitzenmacher 2006]. 
uint32_t h = hash_func_(keys[i]); const uint32_t delta = (h >> 17) | (h << 15); // Rotate right 17 bits - for (size_t j = 0; j < k_; j++) { + for (size_t j = 0; j < num_probes_; j++) { const uint32_t bitpos = h % bits; array[bitpos/8] |= (1 << (bitpos % 8)); h += delta; @@ -73,7 +299,8 @@ class BloomFilterPolicy : public FilterPolicy { } } - virtual bool KeyMayMatch(const Slice& key, const Slice& bloom_filter) const { + virtual bool KeyMayMatch(const Slice& key, + const Slice& bloom_filter) const override { const size_t len = bloom_filter.size(); if (len < 2) return false; @@ -98,11 +325,43 @@ class BloomFilterPolicy : public FilterPolicy { } return true; } + + virtual FilterBitsBuilder* GetFilterBitsBuilder() const override { + if (use_block_based_builder_) { + return nullptr; + } + + return new FullFilterBitsBuilder(bits_per_key_, num_probes_); + } + + virtual FilterBitsReader* GetFilterBitsReader(const Slice& contents) + const override { + return new FullFilterBitsReader(contents); + } + + // If choose to use block based builder + bool UseBlockBasedBuilder() { return use_block_based_builder_; } + + private: + size_t bits_per_key_; + size_t num_probes_; + uint32_t (*hash_func_)(const Slice& key); + + const bool use_block_based_builder_; + + void initialize() { + // We intentionally round down to reduce probing cost a little bit + num_probes_ = static_cast(bits_per_key_ * 0.69); // 0.69 =~ ln(2) + if (num_probes_ < 1) num_probes_ = 1; + if (num_probes_ > 30) num_probes_ = 30; + } }; -} -const FilterPolicy* NewBloomFilterPolicy(int bits_per_key) { - return new BloomFilterPolicy(bits_per_key); +} // namespace + +const FilterPolicy* NewBloomFilterPolicy(int bits_per_key, + bool use_block_based_builder) { + return new BloomFilterPolicy(bits_per_key, use_block_based_builder); } } // namespace rocksdb diff --git a/util/bloom_test.cc b/util/bloom_test.cc index 881e3b0f5..3d8764b7e 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -16,12 +16,13 @@ int main() { 
#else #include +#include #include "rocksdb/filter_policy.h" - #include "util/logging.h" #include "util/testharness.h" #include "util/testutil.h" +#include "util/arena.h" using GFLAGS::ParseCommandLineFlags; @@ -36,6 +37,19 @@ static Slice Key(int i, char* buffer) { return Slice(buffer, sizeof(i)); } +static int NextLength(int length) { + if (length < 10) { + length += 1; + } else if (length < 100) { + length += 10; + } else if (length < 1000) { + length += 100; + } else { + length += 1000; + } + return length; +} + class BloomTest { private: const FilterPolicy* policy_; @@ -43,7 +57,8 @@ class BloomTest { std::vector keys_; public: - BloomTest() : policy_(NewBloomFilterPolicy(FLAGS_bits_per_key)) { } + BloomTest() : policy_( + NewBloomFilterPolicy(FLAGS_bits_per_key)) {} ~BloomTest() { delete policy_; @@ -64,7 +79,8 @@ class BloomTest { key_slices.push_back(Slice(keys_[i])); } filter_.clear(); - policy_->CreateFilter(&key_slices[0], key_slices.size(), &filter_); + policy_->CreateFilter(&key_slices[0], static_cast(key_slices.size()), + &filter_); keys_.clear(); if (kVerbose >= 2) DumpFilter(); } @@ -117,19 +133,6 @@ TEST(BloomTest, Small) { ASSERT_TRUE(! 
Matches("foo")); } -static int NextLength(int length) { - if (length < 10) { - length += 1; - } else if (length < 100) { - length += 10; - } else if (length < 1000) { - length += 100; - } else { - length += 1000; - } - return length; -} - TEST(BloomTest, VaryingLengths) { char buffer[sizeof(int)]; @@ -171,6 +174,121 @@ TEST(BloomTest, VaryingLengths) { // Different bits-per-byte +class FullBloomTest { + private: + const FilterPolicy* policy_; + std::unique_ptr bits_builder_; + std::unique_ptr bits_reader_; + std::unique_ptr buf_; + size_t filter_size_; + + public: + FullBloomTest() : + policy_(NewBloomFilterPolicy(FLAGS_bits_per_key, false)), + filter_size_(0) { + Reset(); + } + + ~FullBloomTest() { + delete policy_; + } + + void Reset() { + bits_builder_.reset(policy_->GetFilterBitsBuilder()); + bits_reader_.reset(nullptr); + buf_.reset(nullptr); + filter_size_ = 0; + } + + void Add(const Slice& s) { + bits_builder_->AddKey(s); + } + + void Build() { + Slice filter = bits_builder_->Finish(&buf_); + bits_reader_.reset(policy_->GetFilterBitsReader(filter)); + filter_size_ = filter.size(); + } + + size_t FilterSize() const { + return filter_size_; + } + + bool Matches(const Slice& s) { + if (bits_reader_ == nullptr) { + Build(); + } + return bits_reader_->MayMatch(s); + } + + double FalsePositiveRate() { + char buffer[sizeof(int)]; + int result = 0; + for (int i = 0; i < 10000; i++) { + if (Matches(Key(i + 1000000000, buffer))) { + result++; + } + } + return result / 10000.0; + } +}; + +TEST(FullBloomTest, FullEmptyFilter) { + // Empty filter is not match, at this level + ASSERT_TRUE(!Matches("hello")); + ASSERT_TRUE(!Matches("world")); +} + +TEST(FullBloomTest, FullSmall) { + Add("hello"); + Add("world"); + ASSERT_TRUE(Matches("hello")); + ASSERT_TRUE(Matches("world")); + ASSERT_TRUE(!Matches("x")); + ASSERT_TRUE(!Matches("foo")); +} + +TEST(FullBloomTest, FullVaryingLengths) { + char buffer[sizeof(int)]; + + // Count number of filters that significantly exceed the 
false positive rate + int mediocre_filters = 0; + int good_filters = 0; + + for (int length = 1; length <= 10000; length = NextLength(length)) { + Reset(); + for (int i = 0; i < length; i++) { + Add(Key(i, buffer)); + } + Build(); + + ASSERT_LE(FilterSize(), (size_t)((length * 10 / 8) + 128 + 5)) << length; + + // All added keys must match + for (int i = 0; i < length; i++) { + ASSERT_TRUE(Matches(Key(i, buffer))) + << "Length " << length << "; key " << i; + } + + // Check false positive rate + double rate = FalsePositiveRate(); + if (kVerbose >= 1) { + fprintf(stderr, "False positives: %5.2f%% @ length = %6d ; bytes = %6d\n", + rate*100.0, length, static_cast(FilterSize())); + } + ASSERT_LE(rate, 0.02); // Must not be over 2% + if (rate > 0.0125) + mediocre_filters++; // Allowed, but not too often + else + good_filters++; + } + if (kVerbose >= 1) { + fprintf(stderr, "Filters: %d good, %d mediocre\n", + good_filters, mediocre_filters); + } + ASSERT_LE(mediocre_filters, good_filters/5); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/util/cache.cc b/util/cache.cc index f1c48a829..d64ab00e2 100644 --- a/util/cache.cc +++ b/util/cache.cc @@ -26,8 +26,27 @@ namespace { // LRU cache implementation -// An entry is a variable length heap-allocated structure. Entries -// are kept in a circular doubly linked list ordered by access time. +// An entry is a variable length heap-allocated structure. +// Entries are referenced by cache and/or by any external entity. +// The cache keeps all its entries in table. Some elements +// are also stored on LRU list. +// +// LRUHandle can be in these states: +// 1. Referenced externally AND in hash table. +// In that case the entry is *not* in the LRU. (refs > 1 && in_cache == true) +// 2. Not referenced externally and in hash table. In that case the entry is +// in the LRU and can be freed. (refs == 1 && in_cache == true) +// 3. Referenced externally and not in hash table. 
In that case the entry is +// in not on LRU and not in table. (refs >= 1 && in_cache == false) +// +// All newly created LRUHandles are in state 1. If you call LRUCache::Release +// on entry in state 1, it will go into state 2. To move from state 1 to +// state 3, either call LRUCache::Erase or LRUCache::Insert with the same key. +// To move from state 2 to state 1, use LRUCache::Lookup. +// Before destruction, make sure that no handles are in state 1. This means +// that any successful LRUCache::Lookup/LRUCache::Insert have a matching +// RUCache::Release (to move into state 2) or LRUCache::Erase (for state 3) + struct LRUHandle { void* value; void (*deleter)(const Slice&, void* value); @@ -36,7 +55,9 @@ struct LRUHandle { LRUHandle* prev; size_t charge; // TODO(opt): Only allow uint32_t? size_t key_length; - uint32_t refs; + uint32_t refs; // a number of refs to this entry + // cache itself is counted as 1 + bool in_cache; // true, if this entry is referenced by the hash table uint32_t hash; // Hash of key(); used for fast sharding and comparisons char key_data[1]; // Beginning of key @@ -49,6 +70,12 @@ struct LRUHandle { return Slice(key_data, key_length); } } + + void Free() { + assert((refs == 1 && in_cache) || (refs == 0 && !in_cache)); + (*deleter)(key(), value); + free(this); + } }; // We provide our own simple hash table since it removes a whole bunch @@ -59,7 +86,28 @@ struct LRUHandle { class HandleTable { public: HandleTable() : length_(0), elems_(0), list_(nullptr) { Resize(); } - ~HandleTable() { delete[] list_; } + + template + void ApplyToAllCacheEntries(T func) { + for (uint32_t i = 0; i < length_; i++) { + LRUHandle* h = list_[i]; + while (h != nullptr) { + auto n = h->next_hash; + assert(h->in_cache); + func(h); + h = n; + } + } + } + + ~HandleTable() { + ApplyToAllCacheEntries([](LRUHandle* h) { + if (h->refs == 1) { + h->Free(); + } + }); + delete[] list_; + } LRUHandle* Lookup(const Slice& key, uint32_t hash) { return *FindPointer(key, hash); 
@@ -145,7 +193,7 @@ class LRUCache { // Separate from constructor so caller can easily make an array of LRUCache void SetCapacity(size_t capacity) { capacity_ = capacity; } - void SetRemoveScanCountLimit(size_t remove_scan_count_limit) { + void SetRemoveScanCountLimit(uint32_t remove_scan_count_limit) { remove_scan_count_limit_ = remove_scan_count_limit; } @@ -173,8 +221,6 @@ class LRUCache { // Just reduce the reference count by 1. // Return true if last reference bool Unref(LRUHandle* e); - // Call deleter and free - void FreeEntry(LRUHandle* e); // Initialized before use. size_t capacity_; @@ -188,6 +234,7 @@ class LRUCache { // Dummy head of LRU list. // lru.prev is newest entry, lru.next is oldest entry. + // LRU contains items which can be evicted, ie reference only by cache LRUHandle lru_; HandleTable table_; @@ -200,16 +247,7 @@ LRUCache::LRUCache() lru_.prev = &lru_; } -LRUCache::~LRUCache() { - for (LRUHandle* e = lru_.next; e != &lru_; ) { - LRUHandle* next = e->next; - assert(e->refs == 1); // Error if caller has an unreleased handle - if (Unref(e)) { - FreeEntry(e); - } - e = next; - } -} +LRUCache::~LRUCache() {} bool LRUCache::Unref(LRUHandle* e) { assert(e->refs > 0); @@ -217,47 +255,48 @@ bool LRUCache::Unref(LRUHandle* e) { return e->refs == 0; } -void LRUCache::FreeEntry(LRUHandle* e) { - assert(e->refs == 0); - (*e->deleter)(e->key(), e->value); - free(e); -} +// Call deleter and free void LRUCache::ApplyToAllCacheEntries(void (*callback)(void*, size_t), bool thread_safe) { if (thread_safe) { mutex_.Lock(); } - for (auto e = lru_.next; e != &lru_; e = e->next) { - callback(e->value, e->charge); - } + table_.ApplyToAllCacheEntries([callback](LRUHandle* h) { + callback(h->value, h->charge); + }); if (thread_safe) { mutex_.Unlock(); } } void LRUCache::LRU_Remove(LRUHandle* e) { + assert(e->next != nullptr); + assert(e->prev != nullptr); e->next->prev = e->prev; e->prev->next = e->next; - usage_ -= e->charge; + e->prev = e->next = nullptr; } void 
LRUCache::LRU_Append(LRUHandle* e) { // Make "e" newest entry by inserting just before lru_ + assert(e->next == nullptr); + assert(e->prev == nullptr); e->next = &lru_; e->prev = lru_.prev; e->prev->next = e; e->next->prev = e; - usage_ += e->charge; } Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) { MutexLock l(&mutex_); LRUHandle* e = table_.Lookup(key, hash); if (e != nullptr) { + assert(e->in_cache); + if (e->refs == 1) { + LRU_Remove(e); + } e->refs++; - LRU_Remove(e); - LRU_Append(e); } return reinterpret_cast(e); } @@ -268,9 +307,31 @@ void LRUCache::Release(Cache::Handle* handle) { { MutexLock l(&mutex_); last_reference = Unref(e); + if (last_reference) { + usage_ -= e->charge; + } + if (e->refs == 1 && e->in_cache) { + // The item is still in cache, and nobody else holds a reference to it + if (usage_ > capacity_) { + // the cache is full + // The LRU list must be empty since the cache is full + assert(lru_.next == &lru_); + // take this opportunity and remove the item + table_.Remove(e->key(), e->hash); + e->in_cache = false; + Unref(e); + usage_ -= e->charge; + last_reference = true; + } else { + // put the item on the list to be potentially freed + LRU_Append(e); + } + } } + + // free outside of mutex if (last_reference) { - FreeEntry(e); + e->Free(); } } @@ -278,8 +339,11 @@ Cache::Handle* LRUCache::Insert( const Slice& key, uint32_t hash, void* value, size_t charge, void (*deleter)(const Slice& key, void* value)) { - LRUHandle* e = reinterpret_cast( - malloc(sizeof(LRUHandle)-1 + key.size())); + // Allocate the memory here outside of the mutex + // If the cache is full, we'll have to release it + // It shouldn't happen very often though. 
+ LRUHandle* e = + reinterpret_cast(malloc(sizeof(LRUHandle) - 1 + key.size())); autovector last_reference_list; e->value = value; @@ -288,47 +352,40 @@ Cache::Handle* LRUCache::Insert( e->key_length = key.size(); e->hash = hash; e->refs = 2; // One from LRUCache, one for the returned handle + e->next = e->prev = nullptr; + e->in_cache = true; memcpy(e->key_data, key.data(), key.size()); { MutexLock l(&mutex_); - LRU_Append(e); - - LRUHandle* old = table_.Insert(e); - if (old != nullptr) { - LRU_Remove(old); - if (Unref(old)) { - last_reference_list.push_back(old); - } - } - - if (remove_scan_count_limit_ > 0) { - // Try to free the space by evicting the entries that are only - // referenced by the cache first. - LRUHandle* cur = lru_.next; - for (unsigned int scanCount = 0; - usage_ > capacity_ && cur != &lru_ - && scanCount < remove_scan_count_limit_; scanCount++) { - LRUHandle* next = cur->next; - if (cur->refs <= 1) { - LRU_Remove(cur); - table_.Remove(cur->key(), cur->hash); - if (Unref(cur)) { - last_reference_list.push_back(cur); - } - } - cur = next; - } - } - // Free the space following strict LRU policy until enough space - // is freed. 
- while (usage_ > capacity_ && lru_.next != &lru_) { + // is freed or the lru list is empty + while (usage_ + charge > capacity_ && lru_.next != &lru_) { LRUHandle* old = lru_.next; + assert(old->in_cache); + assert(old->refs == + 1); // LRU list contains elements which may be evicted LRU_Remove(old); table_.Remove(old->key(), old->hash); + old->in_cache = false; + Unref(old); + usage_ -= old->charge; + last_reference_list.push_back(old); + } + + // insert into the cache + // note that the cache might get larger than its capacity if not enough + // space was freed + LRUHandle* old = table_.Insert(e); + usage_ += e->charge; + if (old != nullptr) { + old->in_cache = false; if (Unref(old)) { + usage_ -= old->charge; + // old is on LRU because it's in cache and its reference count + // was just 1 (Unref returned 0) + LRU_Remove(old); last_reference_list.push_back(old); } } @@ -337,7 +394,7 @@ Cache::Handle* LRUCache::Insert( // we free the entries here outside of mutex for // performance reasons for (auto entry : last_reference_list) { - FreeEntry(entry); + entry->Free(); } return reinterpret_cast(e); @@ -350,14 +407,21 @@ void LRUCache::Erase(const Slice& key, uint32_t hash) { MutexLock l(&mutex_); e = table_.Remove(key, hash); if (e != nullptr) { - LRU_Remove(e); last_reference = Unref(e); + if (last_reference) { + usage_ -= e->charge; + } + if (last_reference && e->in_cache) { + LRU_Remove(e); + } + e->in_cache = false; } } + // mutex not held here // last_reference will only be true if e != nullptr if (last_reference) { - FreeEntry(e); + e->Free(); } } diff --git a/util/cache_bench.cc b/util/cache_bench.cc new file mode 100644 index 000000000..92df77267 --- /dev/null +++ b/util/cache_bench.cc @@ -0,0 +1,276 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif +#ifndef GFLAGS +#include +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else + +#include +#include +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/cache.h" +#include "rocksdb/env.h" +#include "port/port.h" +#include "util/mutexlock.h" +#include "util/random.h" + +using GFLAGS::ParseCommandLineFlags; + +static const uint32_t KB = 1024; + +DEFINE_int32(threads, 16, "Number of concurrent threads to run."); +DEFINE_int64(cache_size, 8 * KB * KB, + "Number of bytes to use as a cache of uncompressed data."); +DEFINE_int32(num_shard_bits, 4, "shard_bits."); + +DEFINE_int64(max_key, 1 * KB * KB * KB, "Max number of key to place in cache"); +DEFINE_uint64(ops_per_thread, 1200000, "Number of operations per thread."); + +DEFINE_bool(populate_cache, false, "Populate cache before operations"); +DEFINE_int32(insert_percent, 40, + "Ratio of insert to total workload (expressed as a percentage)"); +DEFINE_int32(lookup_percent, 50, + "Ratio of lookup to total workload (expressed as a percentage)"); +DEFINE_int32(erase_percent, 10, + "Ratio of erase to total workload (expressed as a percentage)"); + +namespace rocksdb { + +class CacheBench; +namespace { +void deleter(const Slice& key, void* value) { + delete reinterpret_cast(value); +} + +// State shared by all concurrent executions of the same benchmark. 
+class SharedState { + public: + explicit SharedState(CacheBench* cache_bench) + : cv_(&mu_), + num_threads_(FLAGS_threads), + num_initialized_(0), + start_(false), + num_done_(0), + cache_bench_(cache_bench) { + } + + ~SharedState() {} + + port::Mutex* GetMutex() { + return &mu_; + } + + port::CondVar* GetCondVar() { + return &cv_; + } + + CacheBench* GetCacheBench() const { + return cache_bench_; + } + + void IncInitialized() { + num_initialized_++; + } + + void IncDone() { + num_done_++; + } + + bool AllInitialized() const { + return num_initialized_ >= num_threads_; + } + + bool AllDone() const { + return num_done_ >= num_threads_; + } + + void SetStart() { + start_ = true; + } + + bool Started() const { + return start_; + } + + private: + port::Mutex mu_; + port::CondVar cv_; + + const uint64_t num_threads_; + uint64_t num_initialized_; + bool start_; + uint64_t num_done_; + + CacheBench* cache_bench_; +}; + +// Per-thread state for concurrent executions of the same benchmark. +struct ThreadState { + uint32_t tid; + Random rnd; + SharedState* shared; + + ThreadState(uint32_t index, SharedState* _shared) + : tid(index), rnd(1000 + index), shared(_shared) {} +}; +} // namespace + +class CacheBench { + public: + CacheBench() : + cache_(NewLRUCache(FLAGS_cache_size, FLAGS_num_shard_bits)), + num_threads_(FLAGS_threads) {} + + ~CacheBench() {} + + void PopulateCache() { + Random rnd(1); + for (int64_t i = 0; i < FLAGS_cache_size; i++) { + uint64_t rand_key = rnd.Next() % FLAGS_max_key; + // Cast uint64* to be char*, data would be copied to cache + Slice key(reinterpret_cast(&rand_key), 8); + // do insert + auto handle = cache_->Insert(key, new char[10], 1, &deleter); + cache_->Release(handle); + } + } + + bool Run() { + rocksdb::Env* env = rocksdb::Env::Default(); + + PrintEnv(); + SharedState shared(this); + std::vector threads(num_threads_); + for (uint32_t i = 0; i < num_threads_; i++) { + threads[i] = new ThreadState(i, &shared); + env->StartThread(ThreadBody, 
threads[i]); + } + { + MutexLock l(shared.GetMutex()); + while (!shared.AllInitialized()) { + shared.GetCondVar()->Wait(); + } + // Record start time + uint64_t start_time = env->NowMicros(); + + // Start all threads + shared.SetStart(); + shared.GetCondVar()->SignalAll(); + + // Wait threads to complete + while (!shared.AllDone()) { + shared.GetCondVar()->Wait(); + } + + // Record end time + uint64_t end_time = env->NowMicros(); + double elapsed = static_cast(end_time - start_time) * 1e-6; + uint32_t qps = static_cast( + static_cast(FLAGS_threads * FLAGS_ops_per_thread) / elapsed); + fprintf(stdout, "Complete in %.3f s; QPS = %u\n", elapsed, qps); + } + return true; + } + + private: + std::shared_ptr cache_; + uint32_t num_threads_; + + static void ThreadBody(void* v) { + ThreadState* thread = reinterpret_cast(v); + SharedState* shared = thread->shared; + + { + MutexLock l(shared->GetMutex()); + shared->IncInitialized(); + if (shared->AllInitialized()) { + shared->GetCondVar()->SignalAll(); + } + while (!shared->Started()) { + shared->GetCondVar()->Wait(); + } + } + thread->shared->GetCacheBench()->OperateCache(thread); + + { + MutexLock l(shared->GetMutex()); + shared->IncDone(); + if (shared->AllDone()) { + shared->GetCondVar()->SignalAll(); + } + } + } + + void OperateCache(ThreadState* thread) { + for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) { + uint64_t rand_key = thread->rnd.Next() % FLAGS_max_key; + // Cast uint64* to be char*, data would be copied to cache + Slice key(reinterpret_cast(&rand_key), 8); + int32_t prob_op = thread->rnd.Uniform(100); + if (prob_op >= 0 && prob_op < FLAGS_insert_percent) { + // do insert + auto handle = cache_->Insert(key, new char[10], 1, &deleter); + cache_->Release(handle); + } else if (prob_op -= FLAGS_insert_percent && + prob_op < FLAGS_lookup_percent) { + // do lookup + auto handle = cache_->Lookup(key); + if (handle) { + cache_->Release(handle); + } + } else if (prob_op -= FLAGS_lookup_percent && + prob_op < 
FLAGS_erase_percent) { + // do erase + cache_->Erase(key); + } + } + } + + void PrintEnv() const { + printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion); + printf("Number of threads : %d\n", FLAGS_threads); + printf("Ops per thread : %" PRIu64 "\n", FLAGS_ops_per_thread); + printf("Cache size : %" PRIu64 "\n", FLAGS_cache_size); + printf("Num shard bits : %d\n", FLAGS_num_shard_bits); + printf("Max key : %" PRIu64 "\n", FLAGS_max_key); + printf("Populate cache : %d\n", FLAGS_populate_cache); + printf("Insert percentage : %d%%\n", FLAGS_insert_percent); + printf("Lookup percentage : %d%%\n", FLAGS_lookup_percent); + printf("Erase percentage : %d%%\n", FLAGS_erase_percent); + printf("----------------------------\n"); + } +}; +} // namespace rocksdb + +int main(int argc, char** argv) { + ParseCommandLineFlags(&argc, &argv, true); + + if (FLAGS_threads <= 0) { + fprintf(stderr, "threads number <= 0\n"); + exit(1); + } + + rocksdb::CacheBench bench; + if (FLAGS_populate_cache) { + bench.PopulateCache(); + } + if (bench.Run()) { + return 0; + } else { + return 1; + } +} + +#endif // GFLAGS diff --git a/util/cache_test.cc b/util/cache_test.cc index c12cdb7e1..ea71124b2 100644 --- a/util/cache_test.cc +++ b/util/cache_test.cc @@ -28,7 +28,9 @@ static int DecodeKey(const Slice& k) { return DecodeFixed32(k.data()); } static void* EncodeValue(uintptr_t v) { return reinterpret_cast(v); } -static int DecodeValue(void* v) { return reinterpret_cast(v); } +static int DecodeValue(void* v) { + return static_cast(reinterpret_cast(v)); +} class CacheTest { public: @@ -131,7 +133,7 @@ TEST(CacheTest, UsageTest) { // make sure the cache will be overloaded for (uint64_t i = 1; i < kCapacity; ++i) { - auto key = std::to_string(i); + auto key = ToString(i); cache->Release( cache->Insert(key, (void*)value, key.size() + 5, dumbDeleter) ); @@ -188,25 +190,30 @@ TEST(CacheTest, EntriesArePinned) { Insert(100, 101); Cache::Handle* h1 = cache_->Lookup(EncodeKey(100)); 
ASSERT_EQ(101, DecodeValue(cache_->Value(h1))); + ASSERT_EQ(1U, cache_->GetUsage()); Insert(100, 102); Cache::Handle* h2 = cache_->Lookup(EncodeKey(100)); ASSERT_EQ(102, DecodeValue(cache_->Value(h2))); ASSERT_EQ(0U, deleted_keys_.size()); + ASSERT_EQ(2U, cache_->GetUsage()); cache_->Release(h1); ASSERT_EQ(1U, deleted_keys_.size()); ASSERT_EQ(100, deleted_keys_[0]); ASSERT_EQ(101, deleted_values_[0]); + ASSERT_EQ(1U, cache_->GetUsage()); Erase(100); ASSERT_EQ(-1, Lookup(100)); ASSERT_EQ(1U, deleted_keys_.size()); + ASSERT_EQ(1U, cache_->GetUsage()); cache_->Release(h2); ASSERT_EQ(2U, deleted_keys_.size()); ASSERT_EQ(100, deleted_keys_[1]); ASSERT_EQ(102, deleted_values_[1]); + ASSERT_EQ(0U, cache_->GetUsage()); } TEST(CacheTest, EvictionPolicy) { @@ -271,76 +278,28 @@ TEST(CacheTest, EvictionPolicyRef) { cache_->Release(h204); } -TEST(CacheTest, EvictionPolicyRef2) { - std::vector handles; - - Insert(100, 101); - // Insert entries much more than Cache capacity - for (int i = 0; i < kCacheSize + 100; i++) { - Insert(1000 + i, 2000 + i); - if (i < kCacheSize ) { - handles.push_back(cache_->Lookup(EncodeKey(1000 + i))); - } - } - - // Make sure referenced keys are also possible to be deleted - // if there are not sufficient non-referenced keys - for (int i = 0; i < 5; i++) { - ASSERT_EQ(-1, Lookup(1000 + i)); - } +TEST(CacheTest, ErasedHandleState) { + // insert a key and get two handles + Insert(100, 1000); + Cache::Handle* h1 = cache_->Lookup(EncodeKey(100)); + Cache::Handle* h2 = cache_->Lookup(EncodeKey(100)); + ASSERT_EQ(h1, h2); + ASSERT_EQ(DecodeValue(cache_->Value(h1)), 1000); + ASSERT_EQ(DecodeValue(cache_->Value(h2)), 1000); - for (int i = kCacheSize; i < kCacheSize + 100; i++) { - ASSERT_EQ(2000 + i, Lookup(1000 + i)); - } + // delete the key from the cache + Erase(100); + // can no longer find in the cache ASSERT_EQ(-1, Lookup(100)); - // Cleaning up all the handles - while (handles.size() > 0) { - cache_->Release(handles.back()); - handles.pop_back(); - } 
-} - -TEST(CacheTest, EvictionPolicyRefLargeScanLimit) { - std::vector handles2; - - // Cache2 has a cache RemoveScanCountLimit higher than cache size - // so it would trigger a boundary condition. - - // Populate the cache with 10 more keys than its size. - // Reference all keys except one close to the end. - for (int i = 0; i < kCacheSize2 + 10; i++) { - Insert2(1000 + i, 2000+i); - if (i != kCacheSize2 ) { - handles2.push_back(cache2_->Lookup(EncodeKey(1000 + i))); - } - } - - // Make sure referenced keys are also possible to be deleted - // if there are not sufficient non-referenced keys - for (int i = 0; i < 3; i++) { - ASSERT_EQ(-1, Lookup2(1000 + i)); - } - // The non-referenced value is deleted even if it's accessed - // recently. - ASSERT_EQ(-1, Lookup2(1000 + kCacheSize2)); - // Other values recently accessed are not deleted since they - // are referenced. - for (int i = kCacheSize2 - 10; i < kCacheSize2 + 10; i++) { - if (i != kCacheSize2) { - ASSERT_EQ(2000 + i, Lookup2(1000 + i)); - } - } + // release one handle + cache_->Release(h1); + // still can't find in cache + ASSERT_EQ(-1, Lookup(100)); - // Cleaning up all the handles - while (handles2.size() > 0) { - cache2_->Release(handles2.back()); - handles2.pop_back(); - } + cache_->Release(h2); } - - TEST(CacheTest, HeavyEntries) { // Add a bunch of light and heavy entries and then count the combined // size of items still in the cache, which must be approximately the @@ -377,21 +336,21 @@ TEST(CacheTest, NewId) { class Value { private: - int v_; + size_t v_; public: - explicit Value(int v) : v_(v) { } + explicit Value(size_t v) : v_(v) { } ~Value() { std::cout << v_ << " is destructed\n"; } }; namespace { void deleter(const Slice& key, void* value) { - delete (Value *)value; + delete static_cast(value); } } // namespace -TEST(CacheTest, BadEviction) { - int n = 10; +TEST(CacheTest, OverCapacity) { + size_t n = 10; // a LRUCache with n entries and one shard only std::shared_ptr cache = NewLRUCache(n, 
0); @@ -399,25 +358,42 @@ TEST(CacheTest, BadEviction) { std::vector handles(n+1); // Insert n+1 entries, but not releasing. - for (int i = 0; i < n+1; i++) { - std::string key = std::to_string(i+1); + for (size_t i = 0; i < n + 1; i++) { + std::string key = ToString(i+1); handles[i] = cache->Insert(key, new Value(i+1), 1, &deleter); } // Guess what's in the cache now? - for (int i = 0; i < n+1; i++) { - std::string key = std::to_string(i+1); + for (size_t i = 0; i < n + 1; i++) { + std::string key = ToString(i+1); auto h = cache->Lookup(key); std::cout << key << (h?" found\n":" not found\n"); - // Only the first entry should be missing - ASSERT_TRUE(h || i == 0); + ASSERT_TRUE(h != nullptr); if (h) cache->Release(h); } - for (int i = 0; i < n+1; i++) { + // the cache is over capacity since nothing could be evicted + ASSERT_EQ(n + 1U, cache->GetUsage()); + for (size_t i = 0; i < n + 1; i++) { cache->Release(handles[i]); } - std::cout << "Poor entries\n"; + + // cache is under capacity now since elements were released + ASSERT_EQ(n, cache->GetUsage()); + + // element 0 is evicted and the rest is there + // This is consistent with the LRU policy since the element 0 + // was released first + for (size_t i = 0; i < n + 1; i++) { + std::string key = ToString(i+1); + auto h = cache->Lookup(key); + if (h) { + ASSERT_NE(i, 0U); + cache->Release(h); + } else { + ASSERT_EQ(i, 0U); + } + } } namespace { diff --git a/util/coding.h b/util/coding.h index fa6652668..a72f7dbec 100644 --- a/util/coding.h +++ b/util/coding.h @@ -157,7 +157,7 @@ inline void PutFixed64(std::string* dst, uint64_t value) { inline void PutVarint32(std::string* dst, uint32_t v) { char buf[5]; char* ptr = EncodeVarint32(buf, v); - dst->append(buf, ptr - buf); + dst->append(buf, static_cast(ptr - buf)); } inline char* EncodeVarint64(char* dst, uint64_t v) { @@ -174,11 +174,11 @@ inline char* EncodeVarint64(char* dst, uint64_t v) { inline void PutVarint64(std::string* dst, uint64_t v) { char buf[10]; char* 
ptr = EncodeVarint64(buf, v); - dst->append(buf, ptr - buf); + dst->append(buf, static_cast(ptr - buf)); } inline void PutLengthPrefixedSlice(std::string* dst, const Slice& value) { - PutVarint32(dst, value.size()); + PutVarint32(dst, static_cast(value.size())); dst->append(value.data(), value.size()); } @@ -219,7 +219,7 @@ inline bool GetVarint32(Slice* input, uint32_t* value) { if (q == nullptr) { return false; } else { - *input = Slice(q, limit - q); + *input = Slice(q, static_cast(limit - q)); return true; } } @@ -231,7 +231,7 @@ inline bool GetVarint64(Slice* input, uint64_t* value) { if (q == nullptr) { return false; } else { - *input = Slice(q, limit - q); + *input = Slice(q, static_cast(limit - q)); return true; } } diff --git a/util/comparator.cc b/util/comparator.cc index adeacac0a..bbf0262f0 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -69,13 +69,29 @@ class BytewiseComparatorImpl : public Comparator { // *key is a run of 0xffs. Leave it alone. } }; -} // namespace + +class ReverseBytewiseComparatorImpl : public BytewiseComparatorImpl { + public: + ReverseBytewiseComparatorImpl() { } + + virtual const char* Name() const { + return "rocksdb.ReverseBytewiseComparator"; + } + + virtual int Compare(const Slice& a, const Slice& b) const { + return -a.compare(b); + } +}; + +}// namespace static port::OnceType once = LEVELDB_ONCE_INIT; static const Comparator* bytewise; +static const Comparator* rbytewise; static void InitModule() { bytewise = new BytewiseComparatorImpl; + rbytewise= new ReverseBytewiseComparatorImpl; } const Comparator* BytewiseComparator() { @@ -83,4 +99,9 @@ const Comparator* BytewiseComparator() { return bytewise; } +const Comparator* ReverseBytewiseComparator() { + port::InitOnce(&once, InitModule); + return rbytewise; +} + } // namespace rocksdb diff --git a/util/compression.h b/util/compression.h new file mode 100644 index 000000000..664036353 --- /dev/null +++ b/util/compression.h @@ -0,0 +1,525 @@ +// Copyright (c) 2013, 
Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +#pragma once + +#include +#include + +#include "rocksdb/options.h" +#include "util/coding.h" + +#ifdef SNAPPY +#include +#endif + +#ifdef ZLIB +#include +#endif + +#ifdef BZIP2 +#include +#endif + +#if defined(LZ4) +#include +#include +#endif + +namespace rocksdb { + +// compress_format_version can have two values: +// 1 -- decompressed sizes for BZip2 and Zlib are not included in the compressed +// block. Also, decompressed sizes for LZ4 are encoded in platform-dependent +// way. +// 2 -- Zlib, BZip2 and LZ4 encode decompressed size as Varint32 just before the +// start of compressed block. Snappy format is the same as version 1. 
+ +inline bool Snappy_Compress(const CompressionOptions& opts, const char* input, + size_t length, ::std::string* output) { +#ifdef SNAPPY + output->resize(snappy::MaxCompressedLength(length)); + size_t outlen; + snappy::RawCompress(input, length, &(*output)[0], &outlen); + output->resize(outlen); + return true; +#endif + + return false; +} + +inline bool Snappy_GetUncompressedLength(const char* input, size_t length, + size_t* result) { +#ifdef SNAPPY + return snappy::GetUncompressedLength(input, length, result); +#else + return false; +#endif +} + +inline bool Snappy_Uncompress(const char* input, size_t length, + char* output) { +#ifdef SNAPPY + return snappy::RawUncompress(input, length, output); +#else + return false; +#endif +} + +namespace compression { +// returns size +inline size_t PutDecompressedSizeInfo(std::string* output, uint32_t length) { + PutVarint32(output, length); + return output->size(); +} + +inline bool GetDecompressedSizeInfo(const char** input_data, + size_t* input_length, + uint32_t* output_len) { + auto new_input_data = + GetVarint32Ptr(*input_data, *input_data + *input_length, output_len); + if (new_input_data == nullptr) { + return false; + } + *input_length -= (new_input_data - *input_data); + *input_data = new_input_data; + return true; +} +} // namespace compression + +// compress_format_version == 1 -- decompressed size is not included in the +// block header +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +inline bool Zlib_Compress(const CompressionOptions& opts, + uint32_t compress_format_version, + const char* input, size_t length, + ::std::string* output) { +#ifdef ZLIB + if (length > std::numeric_limits::max()) { + // Can't compress more than 4GB + return false; + } + + size_t output_header_len = 0; + if (compress_format_version == 2) { + output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast(length)); + } + // Resize output to be the plain 
data length. + // This may not be big enough if the compression actually expands data. + output->resize(output_header_len + length); + + // The memLevel parameter specifies how much memory should be allocated for + // the internal compression state. + // memLevel=1 uses minimum memory but is slow and reduces compression ratio. + // memLevel=9 uses maximum memory for optimal speed. + // The default value is 8. See zconf.h for more details. + static const int memLevel = 8; + z_stream _stream; + memset(&_stream, 0, sizeof(z_stream)); + int st = deflateInit2(&_stream, opts.level, Z_DEFLATED, opts.window_bits, + memLevel, opts.strategy); + if (st != Z_OK) { + return false; + } + + // Compress the input, and put compressed data in output. + _stream.next_in = (Bytef *)input; + _stream.avail_in = static_cast(length); + + // Initialize the output size. + _stream.avail_out = static_cast(length); + _stream.next_out = reinterpret_cast(&(*output)[output_header_len]); + + bool done = false; + while (!done) { + st = deflate(&_stream, Z_FINISH); + switch (st) { + case Z_STREAM_END: + done = true; + break; + case Z_OK: + // No output space. This means the compression is bigger than + // decompressed size. Just fail the compression in that case. 
+ // Intentional fallback (to failure case) + case Z_BUF_ERROR: + default: + deflateEnd(&_stream); + return false; + } + } + + output->resize(output->size() - _stream.avail_out + output_header_len); + deflateEnd(&_stream); + return true; +#endif + return false; +} + +// compress_format_version == 1 -- decompressed size is not included in the +// block header +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +inline char* Zlib_Uncompress(const char* input_data, size_t input_length, + int* decompress_size, + uint32_t compress_format_version, + int windowBits = -14) { +#ifdef ZLIB + uint32_t output_len = 0; + if (compress_format_version == 2) { + if (!compression::GetDecompressedSizeInfo(&input_data, &input_length, + &output_len)) { + return nullptr; + } + } else { + // Assume the decompressed data size will 5x of compressed size, but round + // to the page size + size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096; + output_len = static_cast( + std::min(proposed_output_len, + static_cast(std::numeric_limits::max()))); + } + + z_stream _stream; + memset(&_stream, 0, sizeof(z_stream)); + + // For raw inflate, the windowBits should be -8..-15. + // If windowBits is bigger than zero, it will use either zlib + // header or gzip header. Adding 32 to it will do automatic detection. + int st = inflateInit2(&_stream, + windowBits > 0 ? windowBits + 32 : windowBits); + if (st != Z_OK) { + return nullptr; + } + + _stream.next_in = (Bytef *)input_data; + _stream.avail_in = static_cast(input_length); + + char* output = new char[output_len]; + + _stream.next_out = (Bytef *)output; + _stream.avail_out = static_cast(output_len); + + bool done = false; + while (!done) { + st = inflate(&_stream, Z_SYNC_FLUSH); + switch (st) { + case Z_STREAM_END: + done = true; + break; + case Z_OK: { + // No output space. Increase the output space by 20%. 
+ // We should never run out of output space if + // compress_format_version == 2 + assert(compress_format_version != 2); + size_t old_sz = output_len; + size_t output_len_delta = static_cast(output_len * 0.2); + output_len += output_len_delta < 10 ? 10 : output_len_delta; + char* tmp = new char[output_len]; + memcpy(tmp, output, old_sz); + delete[] output; + output = tmp; + + // Set more output. + _stream.next_out = (Bytef *)(output + old_sz); + _stream.avail_out = static_cast(output_len - old_sz); + break; + } + case Z_BUF_ERROR: + default: + delete[] output; + inflateEnd(&_stream); + return nullptr; + } + } + + // If we encoded decompressed block size, we should have no bytes left + assert(compress_format_version != 2 || _stream.avail_out == 0); + *decompress_size = static_cast(output_len - _stream.avail_out); + inflateEnd(&_stream); + return output; +#endif + + return nullptr; +} + +// compress_format_version == 1 -- decompressed size is not included in the +// block header +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +inline bool BZip2_Compress(const CompressionOptions& opts, + uint32_t compress_format_version, + const char* input, size_t length, + ::std::string* output) { +#ifdef BZIP2 + if (length > std::numeric_limits::max()) { + // Can't compress more than 4GB + return false; + } + size_t output_header_len = 0; + if (compress_format_version == 2) { + output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast(length)); + } + // Resize output to be the plain data length. + // This may not be big enough if the compression actually expands data. + output->resize(output_header_len + length); + + + bz_stream _stream; + memset(&_stream, 0, sizeof(bz_stream)); + + // Block size 1 is 100K. + // 0 is for silent. 
+ // 30 is the default workFactor + int st = BZ2_bzCompressInit(&_stream, 1, 0, 30); + if (st != BZ_OK) { + return false; + } + + // Compress the input, and put compressed data in output. + _stream.next_in = (char *)input; + _stream.avail_in = static_cast(length); + + // Initialize the output size. + _stream.avail_out = static_cast(length); + _stream.next_out = reinterpret_cast(&(*output)[output_header_len]); + + while (_stream.next_in != nullptr && _stream.avail_in != 0) { + st = BZ2_bzCompress(&_stream, BZ_FINISH); + switch (st) { + case BZ_STREAM_END: + break; + case BZ_FINISH_OK: + // No output space. This means the compression is bigger than + // decompressed size. Just fail the compression in that case + // Intentional fallback (to failure case) + case BZ_SEQUENCE_ERROR: + default: + BZ2_bzCompressEnd(&_stream); + return false; + } + } + + output->resize(output->size() - _stream.avail_out + output_header_len); + BZ2_bzCompressEnd(&_stream); + return true; +#endif + return false; +} + +// compress_format_version == 1 -- decompressed size is not included in the +// block header +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +inline char* BZip2_Uncompress(const char* input_data, size_t input_length, + int* decompress_size, + uint32_t compress_format_version) { +#ifdef BZIP2 + uint32_t output_len = 0; + if (compress_format_version == 2) { + if (!compression::GetDecompressedSizeInfo(&input_data, &input_length, + &output_len)) { + return nullptr; + } + } else { + // Assume the decompressed data size will 5x of compressed size, but round + // to the next page size + size_t proposed_output_len = ((input_length * 5) & (~(4096 - 1))) + 4096; + output_len = static_cast( + std::min(proposed_output_len, + static_cast(std::numeric_limits::max()))); + } + + bz_stream _stream; + memset(&_stream, 0, sizeof(bz_stream)); + + int st = BZ2_bzDecompressInit(&_stream, 0, 0); + if (st != BZ_OK) { + return nullptr; + } + + 
_stream.next_in = (char *)input_data; + _stream.avail_in = static_cast(input_length); + + char* output = new char[output_len]; + + _stream.next_out = (char *)output; + _stream.avail_out = static_cast(output_len); + + bool done = false; + while (!done) { + st = BZ2_bzDecompress(&_stream); + switch (st) { + case BZ_STREAM_END: + done = true; + break; + case BZ_OK: { + // No output space. Increase the output space by 20%. + // We should never run out of output space if + // compress_format_version == 2 + assert(compress_format_version != 2); + uint32_t old_sz = output_len; + output_len = output_len * 1.2; + char* tmp = new char[output_len]; + memcpy(tmp, output, old_sz); + delete[] output; + output = tmp; + + // Set more output. + _stream.next_out = (char *)(output + old_sz); + _stream.avail_out = static_cast(output_len - old_sz); + break; + } + default: + delete[] output; + BZ2_bzDecompressEnd(&_stream); + return nullptr; + } + } + + // If we encoded decompressed block size, we should have no bytes left + assert(compress_format_version != 2 || _stream.avail_out == 0); + *decompress_size = static_cast(output_len - _stream.avail_out); + BZ2_bzDecompressEnd(&_stream); + return output; +#endif + return nullptr; +} + +// compress_format_version == 1 -- decompressed size is included in the +// block header using memcpy, which makes database non-portable) +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +inline bool LZ4_Compress(const CompressionOptions& opts, + uint32_t compress_format_version, const char* input, + size_t length, ::std::string* output) { +#ifdef LZ4 + if (length > std::numeric_limits::max()) { + // Can't compress more than 4GB + return false; + } + + size_t output_header_len = 0; + if (compress_format_version == 2) { + // new encoding, using varint32 to store size information + output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast(length)); + } else { + // legacy 
encoding, which is not really portable (depends on big/little + // endianness) + output_header_len = 8; + output->resize(output_header_len); + char* p = const_cast(output->c_str()); + memcpy(p, &length, sizeof(length)); + } + + int compressBound = LZ4_compressBound(static_cast(length)); + output->resize(static_cast(output_header_len + compressBound)); + int outlen = + LZ4_compress_limitedOutput(input, &(*output)[output_header_len], + static_cast(length), compressBound); + if (outlen == 0) { + return false; + } + output->resize(static_cast(output_header_len + outlen)); + return true; +#endif + return false; +} + +// compress_format_version == 1 -- decompressed size is included in the +// block header using memcpy, which makes database non-portable) +// compress_format_version == 2 -- decompressed size is included in the block +// header in varint32 format +inline char* LZ4_Uncompress(const char* input_data, size_t input_length, + int* decompress_size, + uint32_t compress_format_version) { +#ifdef LZ4 + uint32_t output_len = 0; + if (compress_format_version == 2) { + // new encoding, using varint32 to store size information + if (!compression::GetDecompressedSizeInfo(&input_data, &input_length, + &output_len)) { + return nullptr; + } + } else { + // legacy encoding, which is not really portable (depends on big/little + // endianness) + if (input_length < 8) { + return nullptr; + } + memcpy(&output_len, input_data, sizeof(output_len)); + input_length -= 8; + input_data += 8; + } + char* output = new char[output_len]; + *decompress_size = + LZ4_decompress_safe(input_data, output, static_cast(input_length), + static_cast(output_len)); + if (*decompress_size < 0) { + delete[] output; + return nullptr; + } + assert(*decompress_size == static_cast(output_len)); + return output; +#endif + return nullptr; +} + +// compress_format_version == 1 -- decompressed size is included in the +// block header using memcpy, which makes database non-portable) +// compress_format_version 
== 2 -- decompressed size is included in the block +// header in varint32 format +inline bool LZ4HC_Compress(const CompressionOptions& opts, + uint32_t compress_format_version, const char* input, + size_t length, ::std::string* output) { +#ifdef LZ4 + if (length > std::numeric_limits::max()) { + // Can't compress more than 4GB + return false; + } + + size_t output_header_len = 0; + if (compress_format_version == 2) { + // new encoding, using varint32 to store size information + output_header_len = compression::PutDecompressedSizeInfo( + output, static_cast(length)); + } else { + // legacy encoding, which is not really portable (depends on big/little + // endianness) + output_header_len = 8; + output->resize(output_header_len); + char* p = const_cast(output->c_str()); + memcpy(p, &length, sizeof(length)); + } + + int compressBound = LZ4_compressBound(static_cast(length)); + output->resize(static_cast(output_header_len + compressBound)); + int outlen; +#ifdef LZ4_VERSION_MAJOR // they only started defining this since r113 + outlen = LZ4_compressHC2_limitedOutput(input, &(*output)[output_header_len], + static_cast(length), + compressBound, opts.level); +#else + outlen = + LZ4_compressHC_limitedOutput(input, &(*output)[output_header_len], + static_cast(length), compressBound); +#endif + if (outlen == 0) { + return false; + } + output->resize(static_cast(output_header_len + outlen)); + return true; +#endif + return false; +} + +} // namespace rocksdb diff --git a/util/crc32c.cc b/util/crc32c.cc index d27fb4be9..8f1a09e17 100644 --- a/util/crc32c.cc +++ b/util/crc32c.cc @@ -298,14 +298,14 @@ static inline uint64_t LE_LOAD64(const uint8_t *p) { #endif static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) { - uint32_t c = *l ^ LE_LOAD32(*p); + uint32_t c = static_cast(*l ^ LE_LOAD32(*p)); *p += 4; *l = table3_[c & 0xff] ^ table2_[(c >> 8) & 0xff] ^ table1_[(c >> 16) & 0xff] ^ table0_[c >> 24]; // DO it twice. 
- c = *l ^ LE_LOAD32(*p); + c = static_cast(*l ^ LE_LOAD32(*p)); *p += 4; *l = table3_[c & 0xff] ^ table2_[(c >> 8) & 0xff] ^ @@ -362,7 +362,7 @@ uint32_t ExtendImpl(uint32_t crc, const char* buf, size_t size) { } #undef STEP1 #undef ALIGN - return l ^ 0xffffffffu; + return static_cast(l ^ 0xffffffffu); } // Detect if SS42 or not. diff --git a/util/db_info_dummper.cc b/util/db_info_dumper.cc similarity index 76% rename from util/db_info_dummper.cc rename to util/db_info_dumper.cc index d5dd97ad2..9c709282c 100644 --- a/util/db_info_dummper.cc +++ b/util/db_info_dumper.cc @@ -2,20 +2,21 @@ // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. -// -// Must not be included from any .h files to avoid polluting the namespace -// with macros. +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include #include #include +#include "db/filename.h" #include "rocksdb/options.h" #include "rocksdb/env.h" -#include "db/filename.h" +#include "util/db_info_dumper.h" namespace rocksdb { @@ -33,10 +34,11 @@ void DumpDBFileSummary(const DBOptions& options, const std::string& dbname) { uint64_t file_size; std::string file_info, wal_info; - Log(options.info_log, "DB SUMMARY\n"); + Log(InfoLogLevel::INFO_LEVEL, options.info_log, "DB SUMMARY\n"); // Get files in dbname dir if (!env->GetChildren(dbname, &files).ok()) { - Log(options.info_log, "Error when reading %s dir\n", dbname.c_str()); + Log(InfoLogLevel::ERROR_LEVEL, + options.info_log, "Error when reading %s dir\n", dbname.c_str()); } std::sort(files.begin(), files.end()); for (std::string file : files) { @@ -45,14 +47,17 @@ void DumpDBFileSummary(const DBOptions& options, const std::string& dbname) { } switch (type) { case kCurrentFile: - Log(options.info_log, "CURRENT file: %s\n", file.c_str()); + 
Log(InfoLogLevel::INFO_LEVEL, options.info_log, + "CURRENT file: %s\n", file.c_str()); break; case kIdentityFile: - Log(options.info_log, "IDENTITY file: %s\n", file.c_str()); + Log(InfoLogLevel::INFO_LEVEL, options.info_log, + "IDENTITY file: %s\n", file.c_str()); break; case kDescriptorFile: env->GetFileSize(dbname + "/" + file, &file_size); - Log(options.info_log, "MANIFEST file: %s size: %" PRIu64 " Bytes\n", + Log(InfoLogLevel::INFO_LEVEL, options.info_log, + "MANIFEST file: %s size: %" PRIu64 " Bytes\n", file.c_str(), file_size); break; case kLogFile: @@ -76,7 +81,8 @@ void DumpDBFileSummary(const DBOptions& options, const std::string& dbname) { for (auto& db_path : options.db_paths) { if (dbname.compare(db_path.path) != 0) { if (!env->GetChildren(db_path.path, &files).ok()) { - Log(options.info_log, "Error when reading %s dir\n", + Log(InfoLogLevel::ERROR_LEVEL, options.info_log, + "Error when reading %s dir\n", db_path.path.c_str()); continue; } @@ -89,7 +95,8 @@ void DumpDBFileSummary(const DBOptions& options, const std::string& dbname) { } } } - Log(options.info_log, "SST files in %s dir, Total Num: %" PRIu64 ", files: %s\n", + Log(InfoLogLevel::INFO_LEVEL, options.info_log, + "SST files in %s dir, Total Num: %" PRIu64 ", files: %s\n", db_path.path.c_str(), file_num, file_info.c_str()); file_num = 0; file_info.clear(); @@ -98,7 +105,8 @@ void DumpDBFileSummary(const DBOptions& options, const std::string& dbname) { // Get wal file in wal_dir if (dbname.compare(options.wal_dir) != 0) { if (!env->GetChildren(options.wal_dir, &files).ok()) { - Log(options.info_log, "Error when reading %s dir\n", + Log(InfoLogLevel::ERROR_LEVEL, options.info_log, + "Error when reading %s dir\n", options.wal_dir.c_str()); return; } @@ -115,7 +123,8 @@ void DumpDBFileSummary(const DBOptions& options, const std::string& dbname) { } } } - Log(options.info_log, "Write Ahead Log file in %s: %s\n", + Log(InfoLogLevel::INFO_LEVEL, options.info_log, + "Write Ahead Log file in %s: 
%s\n", options.wal_dir.c_str(), wal_info.c_str()); } } // namespace rocksdb diff --git a/util/db_info_dumper.h b/util/db_info_dumper.h new file mode 100644 index 000000000..ed0a63ded --- /dev/null +++ b/util/db_info_dumper.h @@ -0,0 +1,13 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#pragma once + +#include + +#include "rocksdb/options.h" + +namespace rocksdb { +void DumpDBFileSummary(const DBOptions& options, const std::string& dbname); +} // namespace rocksdb diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc index 73c2c9436..ffe8157cc 100644 --- a/util/dynamic_bloom.cc +++ b/util/dynamic_bloom.cc @@ -9,6 +9,7 @@ #include "port/port.h" #include "rocksdb/slice.h" +#include "util/allocator.h" #include "util/hash.h" namespace rocksdb { @@ -29,13 +30,13 @@ uint32_t GetTotalBitsForLocality(uint32_t total_bits) { } } -DynamicBloom::DynamicBloom(Arena* arena, uint32_t total_bits, uint32_t locality, - uint32_t num_probes, +DynamicBloom::DynamicBloom(Allocator* allocator, uint32_t total_bits, + uint32_t locality, uint32_t num_probes, uint32_t (*hash_func)(const Slice& key), size_t huge_page_tlb_size, Logger* logger) : DynamicBloom(num_probes, hash_func) { - SetTotalBits(arena, total_bits, locality, huge_page_tlb_size, logger); + SetTotalBits(allocator, total_bits, locality, huge_page_tlb_size, logger); } DynamicBloom::DynamicBloom(uint32_t num_probes, @@ -52,7 +53,7 @@ void DynamicBloom::SetRawData(unsigned char* raw_data, uint32_t total_bits, kNumBlocks = num_blocks; } -void DynamicBloom::SetTotalBits(Arena* arena, +void DynamicBloom::SetTotalBits(Allocator* allocator, uint32_t total_bits, uint32_t locality, size_t huge_page_tlb_size, Logger* logger) { @@ -67,9 +68,9 @@ void DynamicBloom::SetTotalBits(Arena* arena, if 
(kNumBlocks > 0) { sz += CACHE_LINE_SIZE - 1; } - assert(arena); + assert(allocator); raw_ = reinterpret_cast( - arena->AllocateAligned(sz, huge_page_tlb_size, logger)); + allocator->AllocateAligned(sz, huge_page_tlb_size, logger)); memset(raw_, 0, sz); if (kNumBlocks > 0 && (reinterpret_cast(raw_) % CACHE_LINE_SIZE)) { data_ = raw_ + CACHE_LINE_SIZE - diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h index 927710d24..a6e4d7367 100644 --- a/util/dynamic_bloom.h +++ b/util/dynamic_bloom.h @@ -9,8 +9,7 @@ #include "rocksdb/slice.h" -#include -#include +#include "port/port_posix.h" #include #include @@ -18,11 +17,12 @@ namespace rocksdb { class Slice; +class Allocator; class Logger; class DynamicBloom { public: - // arena: pass arena to bloom filter, hence trace the usage of memory + // allocator: pass allocator to bloom filter, hence trace the usage of memory // total_bits: fixed total bits for the bloom // num_probes: number of hash probes for a single key // locality: If positive, optimize for cache line locality, 0 otherwise. 
@@ -32,7 +32,7 @@ class DynamicBloom { // it to be allocated, like: // sysctl -w vm.nr_hugepages=20 // See linux doc Documentation/vm/hugetlbpage.txt - explicit DynamicBloom(Arena* arena, + explicit DynamicBloom(Allocator* allocator, uint32_t total_bits, uint32_t locality = 0, uint32_t num_probes = 6, uint32_t (*hash_func)(const Slice& key) = nullptr, @@ -42,8 +42,9 @@ class DynamicBloom { explicit DynamicBloom(uint32_t num_probes = 6, uint32_t (*hash_func)(const Slice& key) = nullptr); - void SetTotalBits(Arena* arena, uint32_t total_bits, uint32_t locality, - size_t huge_page_tlb_size, Logger* logger); + void SetTotalBits(Allocator* allocator, uint32_t total_bits, + uint32_t locality, size_t huge_page_tlb_size, + Logger* logger); ~DynamicBloom() {} diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc index 3e55488f2..a8b1c529b 100644 --- a/util/dynamic_bloom_test.cc +++ b/util/dynamic_bloom_test.cc @@ -11,13 +11,17 @@ int main() { } #else +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include #include "dynamic_bloom.h" #include "port/port.h" +#include "util/arena.h" #include "util/logging.h" #include "util/testharness.h" #include "util/testutil.h" @@ -150,15 +154,15 @@ TEST(DynamicBloomTest, perf) { return; } - for (uint64_t m = 1; m <= 8; ++m) { + for (uint32_t m = 1; m <= 8; ++m) { Arena arena; - const uint64_t num_keys = m * 8 * 1024 * 1024; - fprintf(stderr, "testing %" PRIu64 "M keys\n", m * 8); + const uint32_t num_keys = m * 8 * 1024 * 1024; + fprintf(stderr, "testing %" PRIu32 "M keys\n", m * 8); DynamicBloom std_bloom(&arena, num_keys * 10, 0, num_probes); timer.Start(); - for (uint64_t i = 1; i <= num_keys; ++i) { + for (uint32_t i = 1; i <= num_keys; ++i) { std_bloom.Add(Slice(reinterpret_cast(&i), 8)); } @@ -166,9 +170,9 @@ TEST(DynamicBloomTest, perf) { fprintf(stderr, "standard bloom, avg add latency %" PRIu64 "\n", elapsed / num_keys); - uint64_t count = 0; + uint32_t count = 0; 
timer.Start(); - for (uint64_t i = 1; i <= num_keys; ++i) { + for (uint32_t i = 1; i <= num_keys; ++i) { if (std_bloom.MayContain(Slice(reinterpret_cast(&i), 8))) { ++count; } @@ -182,7 +186,7 @@ TEST(DynamicBloomTest, perf) { DynamicBloom blocked_bloom(&arena, num_keys * 10, 1, num_probes); timer.Start(); - for (uint64_t i = 1; i <= num_keys; ++i) { + for (uint32_t i = 1; i <= num_keys; ++i) { blocked_bloom.Add(Slice(reinterpret_cast(&i), 8)); } @@ -193,9 +197,9 @@ TEST(DynamicBloomTest, perf) { count = 0; timer.Start(); - for (uint64_t i = 1; i <= num_keys; ++i) { + for (uint32_t i = 1; i <= num_keys; ++i) { if (blocked_bloom.MayContain( - Slice(reinterpret_cast(&i), 8))) { + Slice(reinterpret_cast(&i), 8))) { ++count; } } diff --git a/util/env.cc b/util/env.cc index 91ae0784b..a95205273 100644 --- a/util/env.cc +++ b/util/env.cc @@ -41,7 +41,7 @@ void LogFlush(Logger *info_log) { } void Log(Logger* info_log, const char* format, ...) { - if (info_log) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::INFO_LEVEL) { va_list ap; va_start(ap, format); info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap); @@ -51,7 +51,7 @@ void Log(Logger* info_log, const char* format, ...) { void Log(const InfoLogLevel log_level, Logger* info_log, const char* format, ...) { - if (info_log) { + if (info_log && info_log->GetInfoLogLevel() <= log_level) { va_list ap; va_start(ap, format); info_log->Logv(log_level, format, ap); @@ -59,8 +59,17 @@ void Log(const InfoLogLevel log_level, Logger* info_log, const char* format, } } -void Debug(Logger* info_log, const char* format, ...) { +void Header(Logger* info_log, const char* format, ...) { if (info_log) { + va_list ap; + va_start(ap, format); + info_log->LogHeader(format, ap); + va_end(ap); + } +} + +void Debug(Logger* info_log, const char* format, ...) 
{ + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::DEBUG_LEVEL) { va_list ap; va_start(ap, format); info_log->Logv(InfoLogLevel::DEBUG_LEVEL, format, ap); @@ -69,7 +78,7 @@ void Debug(Logger* info_log, const char* format, ...) { } void Info(Logger* info_log, const char* format, ...) { - if (info_log) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::INFO_LEVEL) { va_list ap; va_start(ap, format); info_log->Logv(InfoLogLevel::INFO_LEVEL, format, ap); @@ -78,7 +87,7 @@ void Info(Logger* info_log, const char* format, ...) { } void Warn(Logger* info_log, const char* format, ...) { - if (info_log) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::WARN_LEVEL) { va_list ap; va_start(ap, format); info_log->Logv(InfoLogLevel::WARN_LEVEL, format, ap); @@ -86,7 +95,7 @@ void Warn(Logger* info_log, const char* format, ...) { } } void Error(Logger* info_log, const char* format, ...) { - if (info_log) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::ERROR_LEVEL) { va_list ap; va_start(ap, format); info_log->Logv(InfoLogLevel::ERROR_LEVEL, format, ap); @@ -94,7 +103,7 @@ void Error(Logger* info_log, const char* format, ...) { } } void Fatal(Logger* info_log, const char* format, ...) { - if (info_log) { + if (info_log && info_log->GetInfoLogLevel() <= InfoLogLevel::FATAL_LEVEL) { va_list ap; va_start(ap, format); info_log->Logv(InfoLogLevel::FATAL_LEVEL, format, ap); @@ -118,6 +127,15 @@ void Log(const InfoLogLevel log_level, const shared_ptr& info_log, } } +void Header(const shared_ptr& info_log, const char* format, ...) { + if (info_log) { + va_list ap; + va_start(ap, format); + info_log->LogHeader(format, ap); + va_end(ap); + } +} + void Debug(const shared_ptr& info_log, const char* format, ...) 
{ if (info_log) { va_list ap; diff --git a/util/env_hdfs.cc b/util/env_hdfs.cc index 1618e5468..b0a3b6751 100644 --- a/util/env_hdfs.cc +++ b/util/env_hdfs.cc @@ -52,18 +52,22 @@ class HdfsReadableFile : virtual public SequentialFile, public: HdfsReadableFile(hdfsFS fileSys, const std::string& fname) : fileSys_(fileSys), filename_(fname), hfile_(nullptr) { - Log(mylog, "[hdfs] HdfsReadableFile opening file %s\n", + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile opening file %s\n", filename_.c_str()); hfile_ = hdfsOpenFile(fileSys_, filename_.c_str(), O_RDONLY, 0, 0, 0); - Log(mylog, "[hdfs] HdfsReadableFile opened file %s hfile_=0x%p\n", - filename_.c_str(), hfile_); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile opened file %s hfile_=0x%p\n", + filename_.c_str(), hfile_); } virtual ~HdfsReadableFile() { - Log(mylog, "[hdfs] HdfsReadableFile closing file %s\n", - filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile closing file %s\n", + filename_.c_str()); hdfsCloseFile(fileSys_, hfile_); - Log(mylog, "[hdfs] HdfsReadableFile closed file %s\n", + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile closed file %s\n", filename_.c_str()); hfile_ = nullptr; } @@ -75,7 +79,8 @@ class HdfsReadableFile : virtual public SequentialFile, // sequential access, read data at current offset in file virtual Status Read(size_t n, Slice* result, char* scratch) { Status s; - Log(mylog, "[hdfs] HdfsReadableFile reading %s %ld\n", + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile reading %s %ld\n", filename_.c_str(), n); char* buffer = scratch; @@ -97,7 +102,8 @@ class HdfsReadableFile : virtual public SequentialFile, } assert(total_bytes_read <= n); - Log(mylog, "[hdfs] HdfsReadableFile read %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile read %s\n", filename_.c_str()); if (bytes_read < 0) { s = IOError(filename_, errno); @@ -112,10 
+118,12 @@ class HdfsReadableFile : virtual public SequentialFile, virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { Status s; - Log(mylog, "[hdfs] HdfsReadableFile preading %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile preading %s\n", filename_.c_str()); ssize_t bytes_read = hdfsPread(fileSys_, hfile_, offset, (void*)scratch, (tSize)n); - Log(mylog, "[hdfs] HdfsReadableFile pread %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile pread %s\n", filename_.c_str()); *result = Slice(scratch, (bytes_read < 0) ? 0 : bytes_read); if (bytes_read < 0) { // An error: return a non-ok status @@ -125,7 +133,8 @@ class HdfsReadableFile : virtual public SequentialFile, } virtual Status Skip(uint64_t n) { - Log(mylog, "[hdfs] HdfsReadableFile skip %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile skip %s\n", filename_.c_str()); // get current offset from file tOffset current = hdfsTell(fileSys_, hfile_); if (current < 0) { @@ -144,7 +153,8 @@ class HdfsReadableFile : virtual public SequentialFile, // returns true if we are at the end of file, false otherwise bool feof() { - Log(mylog, "[hdfs] HdfsReadableFile feof %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile feof %s\n", filename_.c_str()); if (hdfsTell(fileSys_, hfile_) == fileSize()) { return true; } @@ -153,7 +163,8 @@ class HdfsReadableFile : virtual public SequentialFile, // the current size of the file tOffset fileSize() { - Log(mylog, "[hdfs] HdfsReadableFile fileSize %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsReadableFile fileSize %s\n", filename_.c_str()); hdfsFileInfo* pFileInfo = hdfsGetPathInfo(fileSys_, filename_.c_str()); tOffset size = 0L; if (pFileInfo != nullptr) { @@ -176,16 +187,20 @@ class HdfsWritableFile: public WritableFile { public: HdfsWritableFile(hdfsFS 
fileSys, const std::string& fname) : fileSys_(fileSys), filename_(fname) , hfile_(nullptr) { - Log(mylog, "[hdfs] HdfsWritableFile opening %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsWritableFile opening %s\n", filename_.c_str()); hfile_ = hdfsOpenFile(fileSys_, filename_.c_str(), O_WRONLY, 0, 0, 0); - Log(mylog, "[hdfs] HdfsWritableFile opened %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsWritableFile opened %s\n", filename_.c_str()); assert(hfile_ != nullptr); } virtual ~HdfsWritableFile() { if (hfile_ != nullptr) { - Log(mylog, "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str()); hdfsCloseFile(fileSys_, hfile_); - Log(mylog, "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str()); hfile_ = nullptr; } } @@ -202,11 +217,13 @@ class HdfsWritableFile: public WritableFile { } virtual Status Append(const Slice& data) { - Log(mylog, "[hdfs] HdfsWritableFile Append %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsWritableFile Append %s\n", filename_.c_str()); const char* src = data.data(); size_t left = data.size(); size_t ret = hdfsWrite(fileSys_, hfile_, src, left); - Log(mylog, "[hdfs] HdfsWritableFile Appended %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsWritableFile Appended %s\n", filename_.c_str()); if (ret != left) { return IOError(filename_, errno); } @@ -219,14 +236,16 @@ class HdfsWritableFile: public WritableFile { virtual Status Sync() { Status s; - Log(mylog, "[hdfs] HdfsWritableFile Sync %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsWritableFile Sync %s\n", filename_.c_str()); if (hdfsFlush(fileSys_, hfile_) == -1) { return IOError(filename_, errno); } if (hdfsHSync(fileSys_, 
hfile_) == -1) { return IOError(filename_, errno); } - Log(mylog, "[hdfs] HdfsWritableFile Synced %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsWritableFile Synced %s\n", filename_.c_str()); return Status::OK(); } @@ -239,11 +258,13 @@ class HdfsWritableFile: public WritableFile { } virtual Status Close() { - Log(mylog, "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsWritableFile closing %s\n", filename_.c_str()); if (hdfsCloseFile(fileSys_, hfile_) != 0) { return IOError(filename_, errno); } - Log(mylog, "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsWritableFile closed %s\n", filename_.c_str()); hfile_ = nullptr; return Status::OK(); } @@ -258,13 +279,15 @@ class HdfsLogger : public Logger { public: HdfsLogger(HdfsWritableFile* f, uint64_t (*gettid)()) : file_(f), gettid_(gettid) { - Log(mylog, "[hdfs] HdfsLogger opened %s\n", - file_->getName().c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsLogger opened %s\n", + file_->getName().c_str()); } virtual ~HdfsLogger() { - Log(mylog, "[hdfs] HdfsLogger closed %s\n", - file_->getName().c_str()); + Log(InfoLogLevel::DEBUG_LEVEL, mylog, + "[hdfs] HdfsLogger closed %s\n", + file_->getName().c_str()); delete file_; if (mylog != nullptr && mylog == this) { mylog = nullptr; @@ -417,9 +440,10 @@ Status HdfsEnv::NewDirectory(const std::string& name, result->reset(new HdfsDirectory(0)); return Status::OK(); default: // fail if the directory doesn't exist - Log(mylog, "NewDirectory hdfsExists call failed"); + Log(InfoLogLevel::FATAL_LEVEL, + mylog, "NewDirectory hdfsExists call failed"); throw HdfsFatalException("hdfsExists call failed with error " + - std::to_string(value) + " on path " + name + + ToString(value) + " on path " + name + ".\n"); } } @@ -433,9 +457,10 @@ bool HdfsEnv::FileExists(const std::string& fname) { case HDFS_DOESNT_EXIST: 
return false; default: // anything else should be an error - Log(mylog, "FileExists hdfsExists call failed"); + Log(InfoLogLevel::FATAL_LEVEL, + mylog, "FileExists hdfsExists call failed"); throw HdfsFatalException("hdfsExists call failed with error " + - std::to_string(value) + " on path " + fname + + ToString(value) + " on path " + fname + ".\n"); } } @@ -461,7 +486,8 @@ Status HdfsEnv::GetChildren(const std::string& path, } } else { // numEntries < 0 indicates error - Log(mylog, "hdfsListDirectory call failed with error "); + Log(InfoLogLevel::FATAL_LEVEL, mylog, + "hdfsListDirectory call failed with error "); throw HdfsFatalException( "hdfsListDirectory call failed negative error.\n"); } @@ -470,9 +496,10 @@ Status HdfsEnv::GetChildren(const std::string& path, case HDFS_DOESNT_EXIST: // directory does not exist, exit break; default: // anything else should be an error - Log(mylog, "GetChildren hdfsExists call failed"); + Log(InfoLogLevel::FATAL_LEVEL, mylog, + "GetChildren hdfsExists call failed"); throw HdfsFatalException("hdfsExists call failed with error " + - std::to_string(value) + ".\n"); + ToString(value) + ".\n"); } return Status::OK(); } @@ -500,9 +527,10 @@ Status HdfsEnv::CreateDirIfMissing(const std::string& name) { case HDFS_DOESNT_EXIST: return CreateDir(name); default: // anything else should be an error - Log(mylog, "CreateDirIfMissing hdfsExists call failed"); + Log(InfoLogLevel::FATAL_LEVEL, mylog, + "CreateDirIfMissing hdfsExists call failed"); throw HdfsFatalException("hdfsExists call failed with error " + - std::to_string(value) + ".\n"); + ToString(value) + ".\n"); } }; diff --git a/util/env_posix.cc b/util/env_posix.cc index cf917e874..4adf58bcc 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -7,6 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include #include #include #include @@ -41,6 +42,8 @@ #include "util/random.h" #include "util/iostats_context_imp.h" #include "util/rate_limiter.h" +#include "util/thread_status_updater.h" +#include "util/thread_status_util.h" // Get nano time for mach systems #ifdef __MACH__ @@ -86,6 +89,10 @@ int Fadvise(int fd, off_t offset, size_t len, int advice) { #endif } +ThreadStatusUpdater* CreateThreadStatusUpdater() { + return new ThreadStatusUpdater(); +} + // list of pathnames that are locked static std::set lockedFiles; static port::Mutex mutex_lockedFiles; @@ -200,7 +207,7 @@ class PosixSequentialFile: public SequentialFile { } virtual Status Skip(uint64_t n) { - if (fseek(file_, n, SEEK_CUR)) { + if (fseek(file_, static_cast(n), SEEK_CUR)) { return IOError(filename_, errno); } return Status::OK(); @@ -485,7 +492,7 @@ class PosixMmapFile : public WritableFile { const char* src = data.data(); size_t left = data.size(); TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS); - PrepareWrite(GetFileSize(), left); + PrepareWrite(static_cast(GetFileSize()), left); while (left > 0) { assert(base_ <= dst_); assert(dst_ <= limit_); @@ -682,7 +689,7 @@ class PosixWritableFile : public WritableFile { TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); - PrepareWrite(GetFileSize(), left); + PrepareWrite(static_cast(GetFileSize()), left); // if there is no space in the cache, then flush if (cursize_ + left > capacity_) { s = Flush(); @@ -736,14 +743,29 @@ class PosixWritableFile : public WritableFile { GetPreallocationStatus(&block_size, &last_allocated_block); if (last_allocated_block > 0) { // trim the extra space preallocated at the end of the file + // NOTE(ljin): we probably don't want to surface failure as an IOError, + // but it will be nice to log these errors. 
int dummy __attribute__((unused)); - dummy = ftruncate(fd_, filesize_); // ignore errors + dummy = ftruncate(fd_, filesize_); +#ifdef ROCKSDB_FALLOCATE_PRESENT + // in some file systems, ftruncate only trims trailing space if the + // new file size is smaller than the current size. Calling fallocate + // with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused + // blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following + // filesystems: + // XFS (since Linux 2.6.38) + // ext4 (since Linux 3.0) + // Btrfs (since Linux 3.7) + // tmpfs (since Linux 3.5) + // We ignore error since failure of this operation does not affect + // correctness. + fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + filesize_, block_size * last_allocated_block - filesize_); +#endif } if (close(fd_) < 0) { - if (s.ok()) { - s = IOError(filename_, errno); - } + s = IOError(filename_, errno); } fd_ = -1; return s; @@ -1047,7 +1069,7 @@ class PosixFileLock : public FileLock { void PthreadCall(const char* label, int result) { if (result != 0) { fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); - exit(1); + abort(); } } @@ -1055,10 +1077,16 @@ class PosixEnv : public Env { public: PosixEnv(); - virtual ~PosixEnv(){ + virtual ~PosixEnv() { for (const auto tid : threads_to_join_) { pthread_join(tid, nullptr); } + for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { + thread_pools_[pool_id].JoinAllThreads(); + } + // All threads must be joined before the deletion of + // thread_status_updater_. 
+ delete thread_status_updater_; } void SetFD_CLOEXEC(int fd, const EnvOptions* options) { @@ -1272,6 +1300,17 @@ class PosixEnv : public Env { return result; } + virtual Status LinkFile(const std::string& src, const std::string& target) { + Status result; + if (link(src.c_str(), target.c_str()) != 0) { + if (errno == EXDEV) { + return Status::NotSupported("No cross FS links allowed"); + } + result = IOError(src, errno); + } + return result; + } + virtual Status LockFile(const std::string& fname, FileLock** lock) { *lock = nullptr; Status result; @@ -1324,6 +1363,12 @@ class PosixEnv : public Env { return Status::OK(); } + virtual Status GetThreadList( + std::vector* thread_list) override { + assert(thread_status_updater_); + return thread_status_updater_->GetThreadList(thread_list); + } + static uint64_t gettid(pthread_t tid) { uint64_t thread_id = 0; memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid))); @@ -1350,25 +1395,13 @@ class PosixEnv : public Env { } virtual uint64_t NowMicros() { - struct timeval tv; - // TODO(kailiu) MAC DON'T HAVE THIS - gettimeofday(&tv, nullptr); - return static_cast(tv.tv_sec) * 1000000 + tv.tv_usec; + return std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); } virtual uint64_t NowNanos() { -#ifdef OS_LINUX - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; -#elif __MACH__ - clock_serv_t cclock; - mach_timespec_t ts; - host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock); - clock_get_time(cclock, &ts); - mach_port_deallocate(mach_task_self(), cclock); -#endif - return static_cast(ts.tv_sec) * 1000000000 + ts.tv_nsec; + return std::chrono::duration_cast( + std::chrono::steady_clock::now().time_since_epoch()).count(); } virtual void SleepForMicroseconds(int micros) { @@ -1376,7 +1409,7 @@ class PosixEnv : public Env { } virtual Status GetHostName(char* name, uint64_t len) { - int ret = 
gethostname(name, len); + int ret = gethostname(name, static_cast(len)); if (ret < 0) { if (errno == EFAULT || errno == EINVAL) return Status::InvalidArgument(strerror(errno)); @@ -1418,6 +1451,12 @@ class PosixEnv : public Env { thread_pools_[pri].SetBackgroundThreads(num); } + // Allow increasing the number of worker threads. + virtual void IncBackgroundThreadsIfNeeded(int num, Priority pri) { + assert(pri >= Priority::LOW && pri <= Priority::HIGH); + thread_pools_[pri].IncBackgroundThreadsIfNeeded(num); + } + virtual void LowerThreadPoolIOPriority(Priority pool = LOW) override { assert(pool >= Priority::LOW && pool <= Priority::HIGH); #ifdef OS_LINUX @@ -1508,12 +1547,17 @@ class PosixEnv : public Env { queue_(), queue_len_(0), exit_all_threads_(false), - low_io_priority_(false) { + low_io_priority_(false), + env_(nullptr) { PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr)); PthreadCall("cvar_init", pthread_cond_init(&bgsignal_, nullptr)); } ~ThreadPool() { + assert(bgthreads_.size() == 0U); + } + + void JoinAllThreads() { PthreadCall("lock", pthread_mutex_lock(&mu_)); assert(!exit_all_threads_); exit_all_threads_ = true; @@ -1522,6 +1566,11 @@ class PosixEnv : public Env { for (const auto tid : bgthreads_) { pthread_join(tid, nullptr); } + bgthreads_.clear(); + } + + void SetHostEnv(Env* env) { + env_ = env; } void LowerIOPriority() { @@ -1549,6 +1598,17 @@ class PosixEnv : public Env { return static_cast(thread_id) >= total_threads_limit_; } + // Return the thread priority. + // This would allow its member-thread to know its priority. + Env::Priority GetThreadPriority() { + return priority_; + } + + // Set the thread priority. + void SetThreadPriority(Env::Priority priority) { + priority_ = priority; + } + void BGThread(size_t thread_id) { bool low_io_priority = false; while (true) { @@ -1575,16 +1635,13 @@ class PosixEnv : public Env { WakeUpAllThreads(); } PthreadCall("unlock", pthread_mutex_unlock(&mu_)); - // TODO(sdong): temp logging. 
Need to help debugging. Remove it when - // the feature is proved to be stable. - fprintf(stdout, "Bg thread %zu terminates %llx\n", thread_id, - static_cast(gettid())); break; } void (*function)(void*) = queue_.front().function; void* arg = queue_.front().arg; queue_.pop_front(); - queue_len_.store(queue_.size(), std::memory_order_relaxed); + queue_len_.store(static_cast(queue_.size()), + std::memory_order_relaxed); bool decrease_io_priority = (low_io_priority != low_io_priority_); PthreadCall("unlock", pthread_mutex_unlock(&mu_)); @@ -1629,8 +1686,18 @@ class PosixEnv : public Env { BGThreadMetadata* meta = reinterpret_cast(arg); size_t thread_id = meta->thread_id_; ThreadPool* tp = meta->thread_pool_; +#if ROCKSDB_USING_THREAD_STATUS + // for thread-status + ThreadStatusUtil::SetThreadType(tp->env_, + (tp->GetThreadPriority() == Env::Priority::HIGH ? + ThreadStatus::HIGH_PRIORITY : + ThreadStatus::LOW_PRIORITY)); +#endif delete meta; tp->BGThread(thread_id); +#if ROCKSDB_USING_THREAD_STATUS + ThreadStatusUtil::UnregisterThread(); +#endif return nullptr; } @@ -1638,21 +1705,29 @@ class PosixEnv : public Env { PthreadCall("signalall", pthread_cond_broadcast(&bgsignal_)); } - void SetBackgroundThreads(int num) { + void SetBackgroundThreadsInternal(int num, bool allow_reduce) { PthreadCall("lock", pthread_mutex_lock(&mu_)); if (exit_all_threads_) { PthreadCall("unlock", pthread_mutex_unlock(&mu_)); return; } - if (num != total_threads_limit_) { - total_threads_limit_ = num; + if (num > total_threads_limit_ || + (num < total_threads_limit_ && allow_reduce)) { + total_threads_limit_ = std::max(1, num); WakeUpAllThreads(); StartBGThreads(); } - assert(total_threads_limit_ > 0); PthreadCall("unlock", pthread_mutex_unlock(&mu_)); } + void IncBackgroundThreadsIfNeeded(int num) { + SetBackgroundThreadsInternal(num, false); + } + + void SetBackgroundThreads(int num) { + SetBackgroundThreadsInternal(num, true); + } + void StartBGThreads() { // Start background thread if 
necessary while ((int)bgthreads_.size() < total_threads_limit_) { @@ -1690,7 +1765,8 @@ class PosixEnv : public Env { queue_.push_back(BGItem()); queue_.back().function = function; queue_.back().arg = arg; - queue_len_.store(queue_.size(), std::memory_order_relaxed); + queue_len_.store(static_cast(queue_.size()), + std::memory_order_relaxed); if (!HasExcessiveThread()) { // Wake up at least one waiting thread. @@ -1721,6 +1797,8 @@ class PosixEnv : public Env { std::atomic_uint queue_len_; // Queue length. Used for stats reporting bool exit_all_threads_; bool low_io_priority_; + Env::Priority priority_; + Env* env_; }; std::vector thread_pools_; @@ -1735,6 +1813,13 @@ PosixEnv::PosixEnv() : checkedDiskForMmap_(false), page_size_(getpagesize()), thread_pools_(Priority::TOTAL) { PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr)); + for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) { + thread_pools_[pool_id].SetThreadPriority( + static_cast(pool_id)); + // This allows later initializing the thread-local-env of each thread. 
+ thread_pools_[pool_id].SetHostEnv(this); + } + thread_status_updater_ = CreateThreadStatusUpdater(); } void PosixEnv::Schedule(void (*function)(void*), void* arg, Priority pri) { diff --git a/util/env_test.cc b/util/env_test.cc index c0d00ce94..9e484c77f 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -11,12 +11,18 @@ #include #include +#include #ifdef OS_LINUX #include #include #endif +#ifdef ROCKSDB_FALLOCATE_PRESENT +#include +#include +#endif + #include "rocksdb/env.h" #include "port/port.h" #include "util/coding.h" @@ -39,30 +45,31 @@ class EnvPosixTest { }; static void SetBool(void* ptr) { - reinterpret_cast(ptr)->NoBarrier_Store(ptr); + reinterpret_cast*>(ptr) + ->store(true, std::memory_order_relaxed); } TEST(EnvPosixTest, RunImmediately) { - port::AtomicPointer called (nullptr); + std::atomic called(false); env_->Schedule(&SetBool, &called); Env::Default()->SleepForMicroseconds(kDelayMicros); - ASSERT_TRUE(called.NoBarrier_Load() != nullptr); + ASSERT_TRUE(called.load(std::memory_order_relaxed)); } TEST(EnvPosixTest, RunMany) { - port::AtomicPointer last_id (nullptr); + std::atomic last_id(0); struct CB { - port::AtomicPointer* last_id_ptr; // Pointer to shared slot - uintptr_t id; // Order# for the execution of this callback + std::atomic* last_id_ptr; // Pointer to shared slot + int id; // Order# for the execution of this callback - CB(port::AtomicPointer* p, int i) : last_id_ptr(p), id(i) { } + CB(std::atomic* p, int i) : last_id_ptr(p), id(i) {} static void Run(void* v) { CB* cb = reinterpret_cast(v); - void* cur = cb->last_id_ptr->NoBarrier_Load(); - ASSERT_EQ(cb->id-1, reinterpret_cast(cur)); - cb->last_id_ptr->Release_Store(reinterpret_cast(cb->id)); + int cur = cb->last_id_ptr->load(std::memory_order_relaxed); + ASSERT_EQ(cb->id - 1, cur); + cb->last_id_ptr->store(cb->id, std::memory_order_release); } }; @@ -77,8 +84,8 @@ TEST(EnvPosixTest, RunMany) { env_->Schedule(&CB::Run, &cb4); Env::Default()->SleepForMicroseconds(kDelayMicros); - 
void* cur = last_id.Acquire_Load(); - ASSERT_EQ(4U, reinterpret_cast(cur)); + int cur = last_id.load(std::memory_order_acquire); + ASSERT_EQ(4, cur); } struct State { @@ -134,10 +141,8 @@ TEST(EnvPosixTest, TwoPools) { { MutexLock l(&mu_); num_running_++; - std::cout << "Pool " << pool_name_ << ": " - << num_running_ << " running threads.\n"; // make sure we don't have more than pool_size_ jobs running. - ASSERT_LE(num_running_, pool_size_); + ASSERT_LE(num_running_, pool_size_.load()); } // sleep for 1 sec @@ -155,11 +160,16 @@ TEST(EnvPosixTest, TwoPools) { return num_finished_; } + void Reset(int pool_size) { + pool_size_.store(pool_size); + num_finished_ = 0; + } + private: port::Mutex mu_; int num_running_; int num_finished_; - int pool_size_; + std::atomic pool_size_; std::string pool_name_; }; @@ -198,6 +208,35 @@ TEST(EnvPosixTest, TwoPools) { ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); ASSERT_EQ(0U, env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + + // call IncBackgroundThreadsIfNeeded to two pools. One increasing and + // the other decreasing + env_->IncBackgroundThreadsIfNeeded(kLowPoolSize - 1, Env::Priority::LOW); + env_->IncBackgroundThreadsIfNeeded(kHighPoolSize + 1, Env::Priority::HIGH); + high_pool_job.Reset(kHighPoolSize + 1); + low_pool_job.Reset(kLowPoolSize); + + // schedule same number of jobs in each pool + for (int i = 0; i < kJobs; i++) { + env_->Schedule(&CB::Run, &low_pool_job); + env_->Schedule(&CB::Run, &high_pool_job, Env::Priority::HIGH); + } + // Wait a short while for the jobs to be dispatched. 
+ Env::Default()->SleepForMicroseconds(kDelayMicros); + ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize), + env_->GetThreadPoolQueueLen()); + ASSERT_EQ((unsigned int)(kJobs - kLowPoolSize), + env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + ASSERT_EQ((unsigned int)(kJobs - (kHighPoolSize + 1)), + env_->GetThreadPoolQueueLen(Env::Priority::HIGH)); + + // wait for all jobs to finish + while (low_pool_job.NumFinished() < kJobs || + high_pool_job.NumFinished() < kJobs) { + env_->SleepForMicroseconds(kDelayMicros); + } + + env_->SetBackgroundThreads(kHighPoolSize, Env::Priority::HIGH); } TEST(EnvPosixTest, DecreaseNumBgThreads) { @@ -392,6 +431,9 @@ TEST(EnvPosixTest, DecreaseNumBgThreads) { } #ifdef OS_LINUX +// Travis doesn't support fallocate or getting unique ID from files for whatever +// reason. +#ifndef TRAVIS // To make sure the Env::GetUniqueId() related tests work correctly, The files // should be stored in regular storage like "hard disk" or "flash device". // Otherwise we cannot get the correct id. @@ -475,6 +517,31 @@ TEST(EnvPosixTest, RandomAccessUniqueID) { #ifdef ROCKSDB_FALLOCATE_PRESENT TEST(EnvPosixTest, AllocateTest) { std::string fname = GetOnDiskTestDir() + "/preallocate_testfile"; + + // Try fallocate in a file to see whether the target file system supports it. + // Skip the test if fallocate is not supported. 
+ std::string fname_test_fallocate = + GetOnDiskTestDir() + "/preallocate_testfile_2"; + int fd = -1; + do { + fd = open(fname_test_fallocate.c_str(), O_CREAT | O_RDWR | O_TRUNC, 0644); + } while (fd < 0 && errno == EINTR); + ASSERT_GT(fd, 0); + + int alloc_status = fallocate(fd, 0, 0, 1); + + int err_number = 0; + if (alloc_status != 0) { + err_number = errno; + fprintf(stderr, "Warning: fallocate() fails, %s\n", strerror(err_number)); + } + close(fd); + ASSERT_OK(env_->DeleteFile(fname_test_fallocate)); + if (alloc_status != 0 && err_number == EOPNOTSUPP) { + // The filesystem containing the file does not support fallocate + return; + } + EnvOptions soptions; soptions.use_mmap_writes = false; unique_ptr wfile; @@ -483,7 +550,8 @@ TEST(EnvPosixTest, AllocateTest) { // allocate 100 MB size_t kPreallocateSize = 100 * 1024 * 1024; size_t kBlockSize = 512; - std::string data = "test"; + size_t kPageSize = 4096; + std::string data(1024 * 1024, 'a'); wfile->SetPreallocationBlockSize(kPreallocateSize); ASSERT_OK(wfile->Append(Slice(data))); ASSERT_OK(wfile->Flush()); @@ -496,8 +564,7 @@ TEST(EnvPosixTest, AllocateTest) { // we only require that number of allocated blocks is at least what we expect. // It looks like some FS give us more blocks that we asked for. That's fine. // It might be worth investigating further. - auto st_blocks = f_stat.st_blocks; - ASSERT_LE((unsigned int)(kPreallocateSize / kBlockSize), st_blocks); + ASSERT_LE((unsigned int)(kPreallocateSize / kBlockSize), f_stat.st_blocks); // close the file, should deallocate the blocks wfile.reset(); @@ -505,9 +572,11 @@ TEST(EnvPosixTest, AllocateTest) { stat(fname.c_str(), &f_stat); ASSERT_EQ((unsigned int)data.size(), f_stat.st_size); // verify that preallocated blocks were deallocated on file close - ASSERT_GT(st_blocks, f_stat.st_blocks); + // Because the FS might give us more blocks, we add a full page to the size + // and expect the number of blocks to be less or equal to that. 
+ ASSERT_GE((f_stat.st_size + kPageSize + kBlockSize - 1) / kBlockSize, (unsigned int)f_stat.st_blocks); } -#endif +#endif // ROCKSDB_FALLOCATE_PRESENT // Returns true if any of the strings in ss are the prefix of another string. bool HasPrefix(const std::unordered_set& ss) { @@ -532,7 +601,8 @@ TEST(EnvPosixTest, RandomAccessUniqueIDConcurrent) { // Create the files std::vector fnames; for (int i = 0; i < 1000; ++i) { - fnames.push_back(GetOnDiskTestDir() + "/" + "testfile" + std::to_string(i)); + fnames.push_back( + GetOnDiskTestDir() + "/" + "testfile" + ToString(i)); // Create file. unique_ptr wfile; @@ -638,7 +708,8 @@ TEST(EnvPosixTest, InvalidateCache) { // Delete the file ASSERT_OK(env_->DeleteFile(fname)); } -#endif +#endif // not TRAVIS +#endif // OS_LINUX TEST(EnvPosixTest, PosixRandomRWFileTest) { EnvOptions soptions; @@ -664,6 +735,7 @@ TEST(EnvPosixTest, PosixRandomRWFileTest) { class TestLogger : public Logger { public: + using Logger::Logv; virtual void Logv(const char* format, va_list ap) override { log_count++; @@ -734,6 +806,72 @@ TEST(EnvPosixTest, LogBufferTest) { ASSERT_EQ(10, test_logger.char_x_count); } +class TestLogger2 : public Logger { + public: + explicit TestLogger2(size_t max_log_size) : max_log_size_(max_log_size) {} + using Logger::Logv; + virtual void Logv(const char* format, va_list ap) override { + char new_format[2000]; + std::fill_n(new_format, sizeof(new_format), '2'); + { + va_list backup_ap; + va_copy(backup_ap, ap); + int n = vsnprintf(new_format, sizeof(new_format) - 1, format, backup_ap); + // 48 bytes for extra information + bytes allocated + ASSERT_TRUE( + n <= 48 + static_cast(max_log_size_ - sizeof(struct timeval))); + ASSERT_TRUE(n > static_cast(max_log_size_ - sizeof(struct timeval))); + va_end(backup_ap); + } + } + size_t max_log_size_; +}; + +TEST(EnvPosixTest, LogBufferMaxSizeTest) { + char bytes9000[9000]; + std::fill_n(bytes9000, sizeof(bytes9000), '1'); + bytes9000[sizeof(bytes9000) - 1] = '\0'; + + for 
(size_t max_log_size = 256; max_log_size <= 1024; + max_log_size += 1024 - 256) { + TestLogger2 test_logger(max_log_size); + test_logger.SetInfoLogLevel(InfoLogLevel::INFO_LEVEL); + LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, &test_logger); + LogToBuffer(&log_buffer, max_log_size, "%s", bytes9000); + log_buffer.FlushBufferToLog(); + } +} + +TEST(EnvPosixTest, Preallocation) { + const std::string src = test::TmpDir() + "/" + "testfile"; + unique_ptr srcfile; + const EnvOptions soptions; + ASSERT_OK(env_->NewWritableFile(src, &srcfile, soptions)); + srcfile->SetPreallocationBlockSize(1024 * 1024); + + // No writes should mean no preallocation + size_t block_size, last_allocated_block; + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 0UL); + + // Small write should preallocate one block + srcfile->Append("test"); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 1UL); + + // Write an entire preallocation block, make sure we increased by two. + std::string buf(block_size, ' '); + srcfile->Append(buf); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 2UL); + + // Write five more blocks at once, ensure we're where we need to be. + buf = std::string(block_size * 5, ' '); + srcfile->Append(buf); + srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); + ASSERT_EQ(last_allocated_block, 7UL); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/util/file_util.cc b/util/file_util.cc new file mode 100644 index 000000000..c75d59c5f --- /dev/null +++ b/util/file_util.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include +#include +#include "util/file_util.h" +#include "rocksdb/env.h" +#include "db/filename.h" + +namespace rocksdb { + +// Utility function to copy a file up to a specified length +Status CopyFile(Env* env, const std::string& source, + const std::string& destination, uint64_t size) { + const EnvOptions soptions; + unique_ptr srcfile; + Status s; + s = env->NewSequentialFile(source, &srcfile, soptions); + unique_ptr destfile; + if (s.ok()) { + s = env->NewWritableFile(destination, &destfile, soptions); + } else { + return s; + } + + if (size == 0) { + // default argument means copy everything + if (s.ok()) { + s = env->GetFileSize(source, &size); + } else { + return s; + } + } + + char buffer[4096]; + Slice slice; + while (size > 0) { + uint64_t bytes_to_read = + std::min(static_cast(sizeof(buffer)), size); + if (s.ok()) { + s = srcfile->Read(bytes_to_read, &slice, buffer); + } + if (s.ok()) { + if (slice.size() == 0) { + return Status::Corruption("file too small"); + } + s = destfile->Append(slice); + } + if (!s.ok()) { + return s; + } + size -= slice.size(); + } + return Status::OK(); +} + +} // namespace rocksdb diff --git a/util/file_util.h b/util/file_util.h new file mode 100644 index 000000000..84b37345b --- /dev/null +++ b/util/file_util.h @@ -0,0 +1,18 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#include + +#pragma once +#include "rocksdb/status.h" +#include "rocksdb/types.h" +#include "rocksdb/env.h" + +namespace rocksdb { + +extern Status CopyFile(Env* env, const std::string& source, + const std::string& destination, uint64_t size = 0); + +} // namespace rocksdb diff --git a/util/hash.cc b/util/hash.cc index e38c186c3..427f0d138 100644 --- a/util/hash.cc +++ b/util/hash.cc @@ -18,7 +18,7 @@ uint32_t Hash(const char* data, size_t n, uint32_t seed) { const uint32_t m = 0xc6a4a793; const uint32_t r = 24; const char* limit = data + n; - uint32_t h = seed ^ (n * m); + uint32_t h = static_cast(seed ^ (n * m)); // Pick up four bytes at a time while (data + 4 <= limit) { @@ -31,14 +31,26 @@ uint32_t Hash(const char* data, size_t n, uint32_t seed) { // Pick up remaining bytes switch (limit - data) { + // Note: It would be better if this was cast to unsigned char, but that + // would be a disk format change since we previously didn't have any cast + // at all (so gcc used signed char). + // To understand the difference between shifting unsigned and signed chars, + // let's use 250 as an example. unsigned char will be 250, while signed char + // will be -6. Bit-wise, they are equivalent: 11111010. However, when + // converting negative number (signed char) to int, it will be converted + // into negative int (of equivalent value, which is -6), while converting + // positive number (unsigned char) will be converted to 250. 
Bitwise, + // this looks like this: + // signed char 11111010 -> int 11111111111111111111111111111010 + // unsigned char 11111010 -> int 00000000000000000000000011111010 case 3: - h += data[2] << 16; - // fall through + h += static_cast(static_cast(data[2]) << 16); + // fall through case 2: - h += data[1] << 8; - // fall through + h += static_cast(static_cast(data[1]) << 8); + // fall through case 1: - h += data[0]; + h += static_cast(static_cast(data[0])); h *= m; h ^= (h >> r); break; diff --git a/util/hash.h b/util/hash.h index 6d9bebaf8..cab8d4677 100644 --- a/util/hash.h +++ b/util/hash.h @@ -24,4 +24,5 @@ inline uint32_t BloomHash(const Slice& key) { inline uint32_t GetSliceHash(const Slice& s) { return Hash(s.data(), s.size(), 397); } -} + +} // namespace rocksdb diff --git a/util/hash_cuckoo_rep.cc b/util/hash_cuckoo_rep.cc index a9a79a274..3ac5ba746 100644 --- a/util/hash_cuckoo_rep.cc +++ b/util/hash_cuckoo_rep.cc @@ -52,25 +52,26 @@ struct CuckooStep { class HashCuckooRep : public MemTableRep { public: explicit HashCuckooRep(const MemTableRep::KeyComparator& compare, - Arena* arena, const size_t bucket_count, + MemTableAllocator* allocator, + const size_t bucket_count, const unsigned int hash_func_count) - : MemTableRep(arena), + : MemTableRep(allocator), compare_(compare), - arena_(arena), + allocator_(allocator), bucket_count_(bucket_count), cuckoo_path_max_depth_(kDefaultCuckooPathMaxDepth), occupied_count_(0), hash_function_count_(hash_func_count), backup_table_(nullptr) { char* mem = reinterpret_cast( - arena_->Allocate(sizeof(std::atomic) * bucket_count_)); + allocator_->Allocate(sizeof(std::atomic) * bucket_count_)); cuckoo_array_ = new (mem) std::atomic[bucket_count_]; for (unsigned int bid = 0; bid < bucket_count_; ++bid) { cuckoo_array_[bid].store(nullptr, std::memory_order_relaxed); } cuckoo_path_ = reinterpret_cast( - arena_->Allocate(sizeof(int*) * (cuckoo_path_max_depth_ + 1))); + allocator_->Allocate(sizeof(int) * (cuckoo_path_max_depth_ + 
1))); is_nearly_full_ = false; } @@ -181,8 +182,8 @@ class HashCuckooRep : public MemTableRep { private: const MemTableRep::KeyComparator& compare_; - // the pointer to Arena to allocate memory, immutable after construction. - Arena* const arena_; + // the pointer to Allocator to allocate memory, immutable after construction. + MemTableAllocator* const allocator_; // the number of hash bucket in the hash table. const size_t bucket_count_; // the maxinum depth of the cuckoo path. @@ -213,9 +214,10 @@ class HashCuckooRep : public MemTableRep { static const int kMurmurHashSeeds[HashCuckooRepFactory::kMaxHashCount] = { 545609244, 1769731426, 763324157, 13099088, 592422103, 1899789565, 248369300, 1984183468, 1613664382, 1491157517}; - return MurmurHash(slice.data(), slice.size(), - kMurmurHashSeeds[hash_func_id]) % - bucket_count_; + return static_cast( + MurmurHash(slice.data(), static_cast(slice.size()), + kMurmurHashSeeds[hash_func_id]) % + bucket_count_); } // A cuckoo path is a sequence of bucket ids, where each id points to a @@ -320,7 +322,7 @@ void HashCuckooRep::Insert(KeyHandle handle) { if (backup_table_.get() == nullptr) { VectorRepFactory factory(10); backup_table_.reset( - factory.CreateMemTableRep(compare_, arena_, nullptr, nullptr)); + factory.CreateMemTableRep(compare_, allocator_, nullptr, nullptr)); is_nearly_full_ = true; } backup_table_->Insert(key); @@ -600,7 +602,7 @@ void HashCuckooRep::Iterator::SeekToLast() { } // anom namespace MemTableRep* HashCuckooRepFactory::CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Arena* arena, + const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator, const SliceTransform* transform, Logger* logger) { // The estimated average fullness. The write performance of any close hash // degrades as the fullness of the mem-table increases. 
Setting kFullness @@ -619,7 +621,8 @@ MemTableRep* HashCuckooRepFactory::CreateMemTableRep( if (hash_function_count > kMaxHashCount) { hash_function_count = kMaxHashCount; } - return new HashCuckooRep(compare, arena, bucket_count, hash_function_count); + return new HashCuckooRep(compare, allocator, bucket_count, + hash_function_count); } MemTableRepFactory* NewHashCuckooRepFactory(size_t write_buffer_size, diff --git a/util/hash_cuckoo_rep.h b/util/hash_cuckoo_rep.h index 669b6b7d4..9f374a978 100644 --- a/util/hash_cuckoo_rep.h +++ b/util/hash_cuckoo_rep.h @@ -28,7 +28,7 @@ class HashCuckooRepFactory : public MemTableRepFactory { virtual ~HashCuckooRepFactory() {} virtual MemTableRep* CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Arena* arena, + const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator, const SliceTransform* transform, Logger* logger) override; virtual const char* Name() const override { return "HashCuckooRepFactory"; } diff --git a/util/hash_linklist_rep.cc b/util/hash_linklist_rep.cc index 8e3dc5826..3e98f3d00 100644 --- a/util/hash_linklist_rep.cc +++ b/util/hash_linklist_rep.cc @@ -8,12 +8,12 @@ #include "util/hash_linklist_rep.h" #include +#include #include "rocksdb/memtablerep.h" #include "util/arena.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "port/port.h" -#include "port/atomic_pointer.h" #include "util/histogram.h" #include "util/murmurhash.h" #include "db/memtable.h" @@ -24,17 +24,30 @@ namespace { typedef const char* Key; typedef SkipList MemtableSkipList; -typedef port::AtomicPointer Pointer; +typedef std::atomic Pointer; // A data structure used as the header of a link list of a hash bucket. 
struct BucketHeader { Pointer next; - uint32_t num_entries; + std::atomic num_entries; explicit BucketHeader(void* n, uint32_t count) : next(n), num_entries(count) {} - bool IsSkipListBucket() { return next.NoBarrier_Load() == this; } + bool IsSkipListBucket() { + return next.load(std::memory_order_relaxed) == this; + } + + uint32_t GetNumEntries() const { + return num_entries.load(std::memory_order_relaxed); + } + + // REQUIRES: called from single-threaded Insert() + void IncNumEntries() { + // Only one thread can do write at one time. No need to do atomic + // incremental. Update it with relaxed load and store. + num_entries.store(GetNumEntries() + 1, std::memory_order_relaxed); + } }; // A data structure used as the header of a skip list of a hash bucket. @@ -43,10 +56,10 @@ struct SkipListBucketHeader { MemtableSkipList skip_list; explicit SkipListBucketHeader(const MemTableRep::KeyComparator& cmp, - Arena* arena, uint32_t count) + MemTableAllocator* allocator, uint32_t count) : Counting_header(this, // Pointing to itself to indicate header type. count), - skip_list(cmp, arena) {} + skip_list(cmp, allocator) {} }; struct Node { @@ -55,24 +68,23 @@ struct Node { Node* Next() { // Use an 'acquire load' so that we observe a fully initialized // version of the returned Node. - return reinterpret_cast(next_.Acquire_Load()); + return next_.load(std::memory_order_acquire); } void SetNext(Node* x) { // Use a 'release store' so that anybody who reads through this // pointer observes a fully initialized version of the inserted node. - next_.Release_Store(x); + next_.store(x, std::memory_order_release); } // No-barrier variants that can be safely used in a few locations. 
Node* NoBarrier_Next() { - return reinterpret_cast(next_.NoBarrier_Load()); + return next_.load(std::memory_order_relaxed); } - void NoBarrier_SetNext(Node* x) { - next_.NoBarrier_Store(x); - } + void NoBarrier_SetNext(Node* x) { next_.store(x, std::memory_order_relaxed); } private: - port::AtomicPointer next_; + std::atomic next_; + public: char key[0]; }; @@ -142,10 +154,11 @@ struct Node { // which can be significant decrease of memory utilization. class HashLinkListRep : public MemTableRep { public: - HashLinkListRep(const MemTableRep::KeyComparator& compare, Arena* arena, - const SliceTransform* transform, size_t bucket_size, - uint32_t threshold_use_skiplist, size_t huge_page_tlb_size, - Logger* logger, int bucket_entries_logging_threshold, + HashLinkListRep(const MemTableRep::KeyComparator& compare, + MemTableAllocator* allocator, const SliceTransform* transform, + size_t bucket_size, uint32_t threshold_use_skiplist, + size_t huge_page_tlb_size, Logger* logger, + int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash); virtual KeyHandle Allocate(const size_t len, char** buf) override; @@ -165,7 +178,7 @@ class HashLinkListRep : public MemTableRep { virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override; virtual MemTableRep::Iterator* GetDynamicPrefixIterator( - Arena* arena = nullptr) override; + Arena* arena = nullptr) override; private: friend class DynamicIterator; @@ -174,7 +187,7 @@ class HashLinkListRep : public MemTableRep { // Maps slices (which are transformed user keys) to buckets of keys sharing // the same transform. 
- port::AtomicPointer* buckets_; + Pointer* buckets_; const uint32_t threshold_use_skiplist_; @@ -199,11 +212,12 @@ class HashLinkListRep : public MemTableRep { } size_t GetHash(const Slice& slice) const { - return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_; + return MurmurHash(slice.data(), static_cast(slice.size()), 0) % + bucket_size_; } Pointer* GetBucket(size_t i) const { - return static_cast(buckets_[i].Acquire_Load()); + return static_cast(buckets_[i].load(std::memory_order_acquire)); } Pointer* GetBucket(const Slice& slice) const { @@ -231,8 +245,8 @@ class HashLinkListRep : public MemTableRep { class FullListIterator : public MemTableRep::Iterator { public: - explicit FullListIterator(MemtableSkipList* list, Arena* arena) - : iter_(list), full_list_(list), arena_(arena) {} + explicit FullListIterator(MemtableSkipList* list, Allocator* allocator) + : iter_(list), full_list_(list), allocator_(allocator) {} virtual ~FullListIterator() { } @@ -286,7 +300,7 @@ class HashLinkListRep : public MemTableRep { MemtableSkipList::Iterator iter_; // To destruct with the iterator. std::unique_ptr full_list_; - std::unique_ptr arena_; + std::unique_ptr allocator_; std::string tmp_; // For passing to EncodeKey }; @@ -451,13 +465,14 @@ class HashLinkListRep : public MemTableRep { }; HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare, - Arena* arena, const SliceTransform* transform, + MemTableAllocator* allocator, + const SliceTransform* transform, size_t bucket_size, uint32_t threshold_use_skiplist, size_t huge_page_tlb_size, Logger* logger, int bucket_entries_logging_threshold, bool if_log_bucket_dist_when_flash) - : MemTableRep(arena), + : MemTableRep(allocator), bucket_size_(bucket_size), // Threshold to use skip list doesn't make sense if less than 3, so we // force it to be minimum of 3 to simplify implementation. 
@@ -467,13 +482,13 @@ HashLinkListRep::HashLinkListRep(const MemTableRep::KeyComparator& compare, logger_(logger), bucket_entries_logging_threshold_(bucket_entries_logging_threshold), if_log_bucket_dist_when_flash_(if_log_bucket_dist_when_flash) { - char* mem = arena_->AllocateAligned(sizeof(port::AtomicPointer) * bucket_size, + char* mem = allocator_->AllocateAligned(sizeof(Pointer) * bucket_size, huge_page_tlb_size, logger); - buckets_ = new (mem) port::AtomicPointer[bucket_size]; + buckets_ = new (mem) Pointer[bucket_size]; for (size_t i = 0; i < bucket_size_; ++i) { - buckets_[i].NoBarrier_Store(nullptr); + buckets_[i].store(nullptr, std::memory_order_relaxed); } } @@ -481,7 +496,7 @@ HashLinkListRep::~HashLinkListRep() { } KeyHandle HashLinkListRep::Allocate(const size_t len, char** buf) { - char* mem = arena_->AllocateAligned(sizeof(Node) + len); + char* mem = allocator_->AllocateAligned(sizeof(Node) + len); Node* x = new (mem) Node(); *buf = x->key; return static_cast(x); @@ -492,21 +507,21 @@ SkipListBucketHeader* HashLinkListRep::GetSkipListBucketHeader( if (first_next_pointer == nullptr) { return nullptr; } - if (first_next_pointer->NoBarrier_Load() == nullptr) { + if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) { // Single entry bucket return nullptr; } // Counting header BucketHeader* header = reinterpret_cast(first_next_pointer); if (header->IsSkipListBucket()) { - assert(header->num_entries > threshold_use_skiplist_); + assert(header->GetNumEntries() > threshold_use_skiplist_); auto* skip_list_bucket_header = reinterpret_cast(header); - assert(skip_list_bucket_header->Counting_header.next.NoBarrier_Load() == - header); + assert(skip_list_bucket_header->Counting_header.next.load( + std::memory_order_relaxed) == header); return skip_list_bucket_header; } - assert(header->num_entries <= threshold_use_skiplist_); + assert(header->GetNumEntries() <= threshold_use_skiplist_); return nullptr; } @@ -514,17 +529,18 @@ Node* 
HashLinkListRep::GetLinkListFirstNode(Pointer* first_next_pointer) const { if (first_next_pointer == nullptr) { return nullptr; } - if (first_next_pointer->NoBarrier_Load() == nullptr) { + if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) { // Single entry bucket return reinterpret_cast(first_next_pointer); } // Counting header BucketHeader* header = reinterpret_cast(first_next_pointer); if (!header->IsSkipListBucket()) { - assert(header->num_entries <= threshold_use_skiplist_); - return reinterpret_cast(header->next.NoBarrier_Load()); + assert(header->GetNumEntries() <= threshold_use_skiplist_); + return reinterpret_cast( + header->next.load(std::memory_order_acquire)); } - assert(header->num_entries > threshold_use_skiplist_); + assert(header->GetNumEntries() > threshold_use_skiplist_); return nullptr; } @@ -534,19 +550,20 @@ void HashLinkListRep::Insert(KeyHandle handle) { Slice internal_key = GetLengthPrefixedSlice(x->key); auto transformed = GetPrefix(internal_key); auto& bucket = buckets_[GetHash(transformed)]; - Pointer* first_next_pointer = static_cast(bucket.NoBarrier_Load()); + Pointer* first_next_pointer = + static_cast(bucket.load(std::memory_order_relaxed)); if (first_next_pointer == nullptr) { // Case 1. empty bucket // NoBarrier_SetNext() suffices since we will add a barrier when // we publish a pointer to "x" in prev[i]. x->NoBarrier_SetNext(nullptr); - bucket.Release_Store(x); + bucket.store(x, std::memory_order_release); return; } BucketHeader* header = nullptr; - if (first_next_pointer->NoBarrier_Load() == nullptr) { + if (first_next_pointer->load(std::memory_order_relaxed) == nullptr) { // Case 2. only one entry in the bucket // Need to convert to a Counting bucket and turn to case 4. Node* first = reinterpret_cast(first_next_pointer); @@ -555,40 +572,43 @@ void HashLinkListRep::Insert(KeyHandle handle) { // the new node. Otherwise, we might need to change next pointer of first. 
// In that case, a reader might sees the next pointer is NULL and wrongly // think the node is a bucket header. - auto* mem = arena_->AllocateAligned(sizeof(BucketHeader)); + auto* mem = allocator_->AllocateAligned(sizeof(BucketHeader)); header = new (mem) BucketHeader(first, 1); - bucket.Release_Store(header); + bucket.store(header, std::memory_order_release); } else { header = reinterpret_cast(first_next_pointer); if (header->IsSkipListBucket()) { // Case 4. Bucket is already a skip list - assert(header->num_entries > threshold_use_skiplist_); + assert(header->GetNumEntries() > threshold_use_skiplist_); auto* skip_list_bucket_header = reinterpret_cast(header); - skip_list_bucket_header->Counting_header.num_entries++; + // Only one thread can execute Insert() at one time. No need to do atomic + // incremental. + skip_list_bucket_header->Counting_header.IncNumEntries(); skip_list_bucket_header->skip_list.Insert(x->key); return; } } if (bucket_entries_logging_threshold_ > 0 && - header->num_entries == + header->GetNumEntries() == static_cast(bucket_entries_logging_threshold_)) { Info(logger_, "HashLinkedList bucket %zu has more than %d " "entries. Key to insert: %s", - GetHash(transformed), header->num_entries, + GetHash(transformed), header->GetNumEntries(), GetLengthPrefixedSlice(x->key).ToString(true).c_str()); } - if (header->num_entries == threshold_use_skiplist_) { + if (header->GetNumEntries() == threshold_use_skiplist_) { // Case 3. number of entries reaches the threshold so need to convert to // skip list. 
LinkListIterator bucket_iter( - this, reinterpret_cast(first_next_pointer->NoBarrier_Load())); - auto mem = arena_->AllocateAligned(sizeof(SkipListBucketHeader)); + this, reinterpret_cast( + first_next_pointer->load(std::memory_order_relaxed))); + auto mem = allocator_->AllocateAligned(sizeof(SkipListBucketHeader)); SkipListBucketHeader* new_skip_list_header = new (mem) - SkipListBucketHeader(compare_, arena_, header->num_entries + 1); + SkipListBucketHeader(compare_, allocator_, header->GetNumEntries() + 1); auto& skip_list = new_skip_list_header->skip_list; // Add all current entries to the skip list @@ -599,16 +619,17 @@ void HashLinkListRep::Insert(KeyHandle handle) { // insert the new entry skip_list.Insert(x->key); // Set the bucket - bucket.Release_Store(new_skip_list_header); + bucket.store(new_skip_list_header, std::memory_order_release); } else { // Case 5. Need to insert to the sorted linked list without changing the // header. - Node* first = reinterpret_cast(header->next.NoBarrier_Load()); + Node* first = + reinterpret_cast(header->next.load(std::memory_order_relaxed)); assert(first != nullptr); // Advance counter unless the bucket needs to be advanced to skip list. // In that case, we need to make sure the previous count never exceeds // threshold_use_skiplist_ to avoid readers to cast to wrong format. - header->num_entries++; + header->IncNumEntries(); Node* cur = first; Node* prev = nullptr; @@ -640,7 +661,7 @@ void HashLinkListRep::Insert(KeyHandle handle) { if (prev) { prev->SetNext(x); } else { - header->next.Release_Store(static_cast(x)); + header->next.store(static_cast(x), std::memory_order_release); } } } @@ -663,7 +684,7 @@ bool HashLinkListRep::Contains(const char* key) const { } size_t HashLinkListRep::ApproximateMemoryUsage() { - // Memory is always allocated from the arena. + // Memory is always allocated from the allocator. 
return 0; } @@ -694,7 +715,7 @@ void HashLinkListRep::Get(const LookupKey& k, void* callback_args, MemTableRep::Iterator* HashLinkListRep::GetIterator(Arena* alloc_arena) { // allocate a new arena of similar size to the one currently in use - Arena* new_arena = new Arena(arena_->BlockSize()); + Arena* new_arena = new Arena(allocator_->BlockSize()); auto list = new MemtableSkipList(compare_, new_arena); HistogramImpl keys_per_bucket_hist; @@ -778,9 +799,9 @@ Node* HashLinkListRep::FindGreaterOrEqualInBucket(Node* head, } // anon namespace MemTableRep* HashLinkListRepFactory::CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Arena* arena, + const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator, const SliceTransform* transform, Logger* logger) { - return new HashLinkListRep(compare, arena, transform, bucket_count_, + return new HashLinkListRep(compare, allocator, transform, bucket_count_, threshold_use_skiplist_, huge_page_tlb_size_, logger, bucket_entries_logging_threshold_, if_log_bucket_dist_when_flash_); diff --git a/util/hash_linklist_rep.h b/util/hash_linklist_rep.h index 0df35b545..629272394 100644 --- a/util/hash_linklist_rep.h +++ b/util/hash_linklist_rep.h @@ -29,7 +29,7 @@ class HashLinkListRepFactory : public MemTableRepFactory { virtual ~HashLinkListRepFactory() {} virtual MemTableRep* CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Arena* arena, + const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator, const SliceTransform* transform, Logger* logger) override; virtual const char* Name() const override { diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc index 1c7a459bd..4fb226811 100644 --- a/util/hash_skiplist_rep.cc +++ b/util/hash_skiplist_rep.cc @@ -7,12 +7,13 @@ #ifndef ROCKSDB_LITE #include "util/hash_skiplist_rep.h" +#include + #include "rocksdb/memtablerep.h" #include "util/arena.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include 
"port/port.h" -#include "port/atomic_pointer.h" #include "util/murmurhash.h" #include "db/memtable.h" #include "db/skiplist.h" @@ -22,9 +23,10 @@ namespace { class HashSkipListRep : public MemTableRep { public: - HashSkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena, - const SliceTransform* transform, size_t bucket_size, - int32_t skiplist_height, int32_t skiplist_branching_factor); + HashSkipListRep(const MemTableRep::KeyComparator& compare, + MemTableAllocator* allocator, const SliceTransform* transform, + size_t bucket_size, int32_t skiplist_height, + int32_t skiplist_branching_factor); virtual void Insert(KeyHandle handle) override; @@ -54,20 +56,21 @@ class HashSkipListRep : public MemTableRep { // Maps slices (which are transformed user keys) to buckets of keys sharing // the same transform. - port::AtomicPointer* buckets_; + std::atomic* buckets_; // The user-supplied transform whose domain is the user keys. const SliceTransform* transform_; const MemTableRep::KeyComparator& compare_; // immutable after construction - Arena* const arena_; + MemTableAllocator* const allocator_; inline size_t GetHash(const Slice& slice) const { - return MurmurHash(slice.data(), slice.size(), 0) % bucket_size_; + return MurmurHash(slice.data(), static_cast(slice.size()), 0) % + bucket_size_; } inline Bucket* GetBucket(size_t i) const { - return static_cast(buckets_[i].Acquire_Load()); + return buckets_[i].load(std::memory_order_acquire); } inline Bucket* GetBucket(const Slice& slice) const { return GetBucket(GetHash(slice)); @@ -219,22 +222,23 @@ class HashSkipListRep : public MemTableRep { }; HashSkipListRep::HashSkipListRep(const MemTableRep::KeyComparator& compare, - Arena* arena, const SliceTransform* transform, + MemTableAllocator* allocator, + const SliceTransform* transform, size_t bucket_size, int32_t skiplist_height, int32_t skiplist_branching_factor) - : MemTableRep(arena), + : MemTableRep(allocator), bucket_size_(bucket_size), 
skiplist_height_(skiplist_height), skiplist_branching_factor_(skiplist_branching_factor), transform_(transform), compare_(compare), - arena_(arena) { - auto mem = - arena->AllocateAligned(sizeof(port::AtomicPointer) * bucket_size); - buckets_ = new (mem) port::AtomicPointer[bucket_size]; + allocator_(allocator) { + auto mem = allocator->AllocateAligned( + sizeof(std::atomic) * bucket_size); + buckets_ = new (mem) std::atomic[bucket_size]; for (size_t i = 0; i < bucket_size_; ++i) { - buckets_[i].NoBarrier_Store(nullptr); + buckets_[i].store(nullptr, std::memory_order_relaxed); } } @@ -246,10 +250,10 @@ HashSkipListRep::Bucket* HashSkipListRep::GetInitializedBucket( size_t hash = GetHash(transformed); auto bucket = GetBucket(hash); if (bucket == nullptr) { - auto addr = arena_->AllocateAligned(sizeof(Bucket)); - bucket = new (addr) Bucket(compare_, arena_, skiplist_height_, + auto addr = allocator_->AllocateAligned(sizeof(Bucket)); + bucket = new (addr) Bucket(compare_, allocator_, skiplist_height_, skiplist_branching_factor_); - buckets_[hash].Release_Store(static_cast(bucket)); + buckets_[hash].store(bucket, std::memory_order_release); } return bucket; } @@ -290,7 +294,7 @@ void HashSkipListRep::Get(const LookupKey& k, void* callback_args, MemTableRep::Iterator* HashSkipListRep::GetIterator(Arena* arena) { // allocate a new arena of similar size to the one currently in use - Arena* new_arena = new Arena(arena_->BlockSize()); + Arena* new_arena = new Arena(allocator_->BlockSize()); auto list = new Bucket(compare_, new_arena); for (size_t i = 0; i < bucket_size_; ++i) { auto bucket = GetBucket(i); @@ -321,9 +325,9 @@ MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator(Arena* arena) { } // anon namespace MemTableRep* HashSkipListRepFactory::CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Arena* arena, + const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator, const SliceTransform* transform, Logger* logger) { - return new 
HashSkipListRep(compare, arena, transform, bucket_count_, + return new HashSkipListRep(compare, allocator, transform, bucket_count_, skiplist_height_, skiplist_branching_factor_); } diff --git a/util/hash_skiplist_rep.h b/util/hash_skiplist_rep.h index 6fec60a47..15d0fc77f 100644 --- a/util/hash_skiplist_rep.h +++ b/util/hash_skiplist_rep.h @@ -26,7 +26,7 @@ class HashSkipListRepFactory : public MemTableRepFactory { virtual ~HashSkipListRepFactory() {} virtual MemTableRep* CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Arena* arena, + const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator, const SliceTransform* transform, Logger* logger) override; virtual const char* Name() const override { diff --git a/util/histogram.cc b/util/histogram.cc index 968769cef..67621a5fc 100644 --- a/util/histogram.cc +++ b/util/histogram.cc @@ -53,14 +53,14 @@ HistogramBucketMapper::HistogramBucketMapper() } } -const size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const { +size_t HistogramBucketMapper::IndexForValue(const uint64_t value) const { if (value >= maxBucketValue_) { return bucketValues_.size() - 1; } else if ( value >= minBucketValue_ ) { std::map::const_iterator lowerBound = valueIndexMap_.lower_bound(value); if (lowerBound != valueIndexMap_.end()) { - return lowerBound->second; + return static_cast(lowerBound->second); } else { return 0; } diff --git a/util/histogram.h b/util/histogram.h index d95588dc2..77ed9bed7 100644 --- a/util/histogram.h +++ b/util/histogram.h @@ -23,10 +23,10 @@ class HistogramBucketMapper { HistogramBucketMapper(); // converts a value to the bucket index. - const size_t IndexForValue(const uint64_t value) const; + size_t IndexForValue(const uint64_t value) const; // number of buckets required. 
- const size_t BucketCount() const { + size_t BucketCount() const { return bucketValues_.size(); } @@ -38,7 +38,7 @@ class HistogramBucketMapper { return minBucketValue_; } - uint64_t BucketLimit(const uint64_t bucketNumber) const { + uint64_t BucketLimit(const size_t bucketNumber) const { assert(bucketNumber < BucketCount()); return bucketValues_[bucketNumber]; } @@ -65,6 +65,8 @@ class HistogramImpl { virtual double StandardDeviation() const; virtual void Data(HistogramData * const data) const; + virtual ~HistogramImpl() {} + private: // To be able to use HistogramImpl as thread local variable, its constructor // has to be static. That's why we're using manually values from BucketMapper diff --git a/util/instrumented_mutex.cc b/util/instrumented_mutex.cc new file mode 100644 index 000000000..2e240cc82 --- /dev/null +++ b/util/instrumented_mutex.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "util/perf_context_imp.h" +#include "util/instrumented_mutex.h" +#include "util/thread_status_util.h" + +namespace rocksdb { +void InstrumentedMutex::Lock() { + PERF_TIMER_GUARD(db_mutex_lock_nanos); + uint64_t wait_time_micros = 0; + if (env_ != nullptr && stats_ != nullptr) { + { + StopWatch sw(env_, nullptr, 0, &wait_time_micros); + LockInternal(); + } + RecordTick(stats_, stats_code_, wait_time_micros); + } else { + LockInternal(); + } +} + +void InstrumentedMutex::LockInternal() { +#ifndef NDEBUG + ThreadStatusUtil::TEST_StateDelay(ThreadStatus::STATE_MUTEX_WAIT); +#endif + mutex_.Lock(); +} + +void InstrumentedCondVar::Wait() { + PERF_TIMER_GUARD(db_condition_wait_nanos); + uint64_t wait_time_micros = 0; + if (env_ != nullptr && stats_ != nullptr) { + { + StopWatch sw(env_, nullptr, 0, &wait_time_micros); + WaitInternal(); + } + RecordTick(stats_, stats_code_, wait_time_micros); + } else { + WaitInternal(); + } +} + +void InstrumentedCondVar::WaitInternal() { +#ifndef NDEBUG + ThreadStatusUtil::TEST_StateDelay(ThreadStatus::STATE_MUTEX_WAIT); +#endif + cond_.Wait(); +} + +bool InstrumentedCondVar::TimedWait(uint64_t abs_time_us) { + PERF_TIMER_GUARD(db_condition_wait_nanos); + uint64_t wait_time_micros = 0; + bool result = false; + if (env_ != nullptr && stats_ != nullptr) { + { + StopWatch sw(env_, nullptr, 0, &wait_time_micros); + result = TimedWaitInternal(abs_time_us); + } + RecordTick(stats_, stats_code_, wait_time_micros); + } else { + result = TimedWaitInternal(abs_time_us); + } + return result; +} + +bool InstrumentedCondVar::TimedWaitInternal(uint64_t abs_time_us) { +#ifndef NDEBUG + ThreadStatusUtil::TEST_StateDelay(ThreadStatus::STATE_MUTEX_WAIT); +#endif + return cond_.TimedWait(abs_time_us); +} + +} // namespace rocksdb diff --git a/util/instrumented_mutex.h b/util/instrumented_mutex.h new file mode 100644 index 000000000..3f233494a --- /dev/null +++ b/util/instrumented_mutex.h @@ -0,0 +1,98 @@ +// Copyright (c) 2015, Facebook, Inc. 
All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include "port/port.h" +#include "rocksdb/env.h" +#include "rocksdb/statistics.h" +#include "rocksdb/thread_status.h" +#include "util/statistics.h" +#include "util/stop_watch.h" + +namespace rocksdb { +class InstrumentedCondVar; + +// A wrapper class for port::Mutex that provides additional layer +// for collecting stats and instrumentation. +class InstrumentedMutex { + public: + explicit InstrumentedMutex(bool adaptive = false) + : mutex_(adaptive), stats_(nullptr), env_(nullptr), + stats_code_(0) {} + + InstrumentedMutex( + Statistics* stats, Env* env, + int stats_code, bool adaptive = false) + : mutex_(adaptive), stats_(stats), env_(env), + stats_code_(stats_code) {} + + void Lock(); + + void Unlock() { + mutex_.Unlock(); + } + + void AssertHeld() { + mutex_.AssertHeld(); + } + + private: + void LockInternal(); + friend class InstrumentedCondVar; + port::Mutex mutex_; + Statistics* stats_; + Env* env_; + int stats_code_; +}; + +// A wrapper class for port::Mutex that provides additional layer +// for collecting stats and instrumentation. 
+class InstrumentedMutexLock { + public: + explicit InstrumentedMutexLock(InstrumentedMutex* mutex) : mutex_(mutex) { + mutex_->Lock(); + } + + ~InstrumentedMutexLock() { + mutex_->Unlock(); + } + + private: + InstrumentedMutex* const mutex_; + InstrumentedMutexLock(const InstrumentedMutexLock&) = delete; + void operator=(const InstrumentedMutexLock&) = delete; +}; + +class InstrumentedCondVar { + public: + explicit InstrumentedCondVar(InstrumentedMutex* instrumented_mutex) + : cond_(&(instrumented_mutex->mutex_)), + stats_(instrumented_mutex->stats_), + env_(instrumented_mutex->env_), + stats_code_(instrumented_mutex->stats_code_) {} + + void Wait(); + + bool TimedWait(uint64_t abs_time_us); + + void Signal() { + cond_.Signal(); + } + + void SignalAll() { + cond_.SignalAll(); + } + + private: + void WaitInternal(); + bool TimedWaitInternal(uint64_t abs_time_us); + port::CondVar cond_; + Statistics* stats_; + Env* env_; + int stats_code_; +}; + +} // namespace rocksdb diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 1aa3856a3..5547fc085 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -10,10 +10,14 @@ #include "db/db_impl.h" #include "db/log_reader.h" #include "db/filename.h" +#include "db/writebuffer.h" #include "db/write_batch_internal.h" #include "rocksdb/write_batch.h" #include "rocksdb/cache.h" +#include "rocksdb/table_properties.h" #include "util/coding.h" +#include "util/sst_dump_tool_imp.h" +#include "util/scoped_arena_iterator.h" #include "utilities/ttl/db_ttl_impl.h" #include @@ -39,9 +43,11 @@ const string LDBCommand::ARG_FROM = "from"; const string LDBCommand::ARG_TO = "to"; const string LDBCommand::ARG_MAX_KEYS = "max_keys"; const string LDBCommand::ARG_BLOOM_BITS = "bloom_bits"; +const string LDBCommand::ARG_FIX_PREFIX_LEN = "fix_prefix_len"; const string LDBCommand::ARG_COMPRESSION_TYPE = "compression_type"; const string LDBCommand::ARG_BLOCK_SIZE = "block_size"; const string LDBCommand::ARG_AUTO_COMPACTION = "auto_compaction"; +const string 
LDBCommand::ARG_DB_WRITE_BUFFER_SIZE = "db_write_buffer_size"; const string LDBCommand::ARG_WRITE_BUFFER_SIZE = "write_buffer_size"; const string LDBCommand::ARG_FILE_SIZE = "file_size"; const string LDBCommand::ARG_CREATE_IF_MISSING = "create_if_missing"; @@ -90,7 +96,7 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( for (const auto& arg : args) { if (arg[0] == '-' && arg[1] == '-'){ - vector splits = stringSplit(arg, '='); + vector splits = StringSplit(arg, '='); if (splits.size() == 2) { string optionKey = splits[0].substr(OPTION_PREFIX.size()); option_map[optionKey] = splits[1]; @@ -161,6 +167,8 @@ LDBCommand* LDBCommand::SelectCommand( return new ManifestDumpCommand(cmdParams, option_map, flags); } else if (cmd == ListColumnFamiliesCommand::Name()) { return new ListColumnFamiliesCommand(cmdParams, option_map, flags); + } else if (cmd == DBFileDumperCommand::Name()) { + return new DBFileDumperCommand(cmdParams, option_map, flags); } else if (cmd == InternalDumpCommand::Name()) { return new InternalDumpCommand(cmdParams, option_map, flags); } else if (cmd == CheckConsistencyCommand::Name()) { @@ -220,9 +228,11 @@ Options LDBCommand::PrepareOptionsForOpenDB() { map::const_iterator itr; BlockBasedTableOptions table_options; + bool use_table_options = false; int bits; if (ParseIntOption(option_map_, ARG_BLOOM_BITS, bits, exec_state_)) { if (bits > 0) { + use_table_options = true; table_options.filter_policy.reset(NewBloomFilterPolicy(bits)); } else { exec_state_ = LDBCommandExecuteResult::FAILED(ARG_BLOOM_BITS + @@ -233,14 +243,18 @@ Options LDBCommand::PrepareOptionsForOpenDB() { int block_size; if (ParseIntOption(option_map_, ARG_BLOCK_SIZE, block_size, exec_state_)) { if (block_size > 0) { + use_table_options = true; table_options.block_size = block_size; - opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); } else { exec_state_ = LDBCommandExecuteResult::FAILED(ARG_BLOCK_SIZE + " must be > 0."); } } + if (use_table_options) { + 
opt.table_factory.reset(NewBlockBasedTableFactory(table_options)); + } + itr = option_map_.find(ARG_AUTO_COMPACTION); if (itr != option_map_.end()) { opt.disable_auto_compactions = ! StringToBool(itr->second); @@ -268,6 +282,17 @@ Options LDBCommand::PrepareOptionsForOpenDB() { } } + int db_write_buffer_size; + if (ParseIntOption(option_map_, ARG_DB_WRITE_BUFFER_SIZE, + db_write_buffer_size, exec_state_)) { + if (db_write_buffer_size >= 0) { + opt.db_write_buffer_size = db_write_buffer_size; + } else { + exec_state_ = LDBCommandExecuteResult::FAILED(ARG_DB_WRITE_BUFFER_SIZE + + " must be >= 0."); + } + } + int write_buffer_size; if (ParseIntOption(option_map_, ARG_WRITE_BUFFER_SIZE, write_buffer_size, exec_state_)) { @@ -293,6 +318,18 @@ Options LDBCommand::PrepareOptionsForOpenDB() { opt.db_paths.emplace_back(db_path_, std::numeric_limits::max()); } + int fix_prefix_len; + if (ParseIntOption(option_map_, ARG_FIX_PREFIX_LEN, fix_prefix_len, + exec_state_)) { + if (fix_prefix_len > 0) { + opt.prefix_extractor.reset( + NewFixedPrefixTransform(static_cast(fix_prefix_len))); + } else { + exec_state_ = + LDBCommandExecuteResult::FAILED(ARG_FIX_PREFIX_LEN + " must be > 0."); + } + } + return opt; } @@ -324,7 +361,7 @@ bool LDBCommand::ParseKeyValue(const string& line, string* key, string* value, bool LDBCommand::ValidateCmdLineOptions() { for (map::const_iterator itr = option_map_.begin(); - itr != option_map_.end(); itr++) { + itr != option_map_.end(); ++itr) { if (find(valid_cmd_line_options_.begin(), valid_cmd_line_options_.end(), itr->first) == valid_cmd_line_options_.end()) { @@ -334,7 +371,7 @@ bool LDBCommand::ValidateCmdLineOptions() { } for (vector::const_iterator itr = flags_.begin(); - itr != flags_.end(); itr++) { + itr != flags_.end(); ++itr) { if (find(valid_cmd_line_options_.begin(), valid_cmd_line_options_.end(), *itr) == valid_cmd_line_options_.end()) { @@ -405,6 +442,8 @@ void CompactorCommand::DoCommand() { delete end; } +// 
---------------------------------------------------------------------------- + const string DBLoaderCommand::ARG_DISABLE_WAL = "disable_wal"; const string DBLoaderCommand::ARG_BULK_LOAD = "bulk_load"; const string DBLoaderCommand::ARG_COMPACT = "compact"; @@ -480,6 +519,31 @@ void DBLoaderCommand::DoCommand() { // ---------------------------------------------------------------------------- +namespace { + +void DumpManifestFile(std::string file, bool verbose, bool hex) { + Options options; + EnvOptions sopt; + std::string dbname("dummy"); + std::shared_ptr tc( + NewLRUCache(options.max_open_files - 10, options.table_cache_numshardbits, + options.table_cache_remove_scan_count_limit)); + // Notice we are using the default options not through SanitizeOptions(), + // if VersionSet::DumpManifest() depends on any option done by + // SanitizeOptions(), we need to initialize it manually. + options.db_paths.emplace_back("dummy", 0); + WriteController wc; + WriteBuffer wb(options.db_write_buffer_size); + VersionSet versions(dbname, &options, sopt, tc.get(), &wb, &wc); + Status s = versions.DumpManifest(options, file, verbose, hex); + if (!s.ok()) { + printf("Error in processing file %s %s\n", file.c_str(), + s.ToString().c_str()); + } +} + +} // namespace + const string ManifestDumpCommand::ARG_VERBOSE = "verbose"; const string ManifestDumpCommand::ARG_PATH = "path"; @@ -540,6 +604,7 @@ void ManifestDumpCommand::DoCommand() { } else { exec_state_ = LDBCommandExecuteResult::FAILED( "Multiple MANIFEST files found; use --path to select one"); + closedir(d); return; } } @@ -551,23 +616,7 @@ void ManifestDumpCommand::DoCommand() { printf("Processing Manifest file %s\n", manifestfile.c_str()); } - Options options; - EnvOptions sopt; - std::string file(manifestfile); - std::string dbname("dummy"); - std::shared_ptr tc(NewLRUCache( - options.max_open_files - 10, options.table_cache_numshardbits, - options.table_cache_remove_scan_count_limit)); - // Notice we are using the default 
options not through SanitizeOptions(), - // if VersionSet::DumpManifest() depends on any option done by - // SanitizeOptions(), we need to initialize it manually. - options.db_paths.emplace_back("dummy", 0); - VersionSet versions(dbname, &options, sopt, tc.get()); - Status s = versions.DumpManifest(options, file, verbose_, is_key_hex_); - if (!s.ok()) { - printf("Error in processing file %s %s\n", manifestfile.c_str(), - s.ToString().c_str()); - } + DumpManifestFile(manifestfile, verbose_, is_key_hex_); if (verbose_) { printf("Processing Manifest file %s done\n", manifestfile.c_str()); } @@ -739,7 +788,8 @@ void InternalDumpCommand::DoCommand() { uint64_t c=0; uint64_t s1=0,s2=0; // Setup internal key iterator - auto iter = unique_ptr(idb->TEST_NewInternalIterator()); + Arena arena; + ScopedArenaIterator iter(idb->TEST_NewInternalIterator(&arena)); Status st = iter->status(); if (!st.ok()) { exec_state_ = LDBCommandExecuteResult::FAILED("Iterator error:" @@ -948,8 +998,9 @@ void DBDumperCommand::DoCommand() { uint64_t s1=0,s2=0; // At this point, bucket_size=0 => time_range=0 - uint64_t num_buckets = (bucket_size >= time_range) ? 1 : - ((time_range + bucket_size - 1) / bucket_size); + int num_buckets = (bucket_size >= time_range) + ? 
1 + : ((time_range + bucket_size - 1) / bucket_size); vector bucket_counts(num_buckets, 0); if (is_db_ttl_ && !count_only_ && timestamp_ && !count_delim_) { fprintf(stdout, "Dumping key-values from %s to %s\n", @@ -1086,7 +1137,9 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, NewLRUCache(opt.max_open_files - 10, opt.table_cache_numshardbits, opt.table_cache_remove_scan_count_limit)); const InternalKeyComparator cmp(opt.comparator); - VersionSet versions(db_path_, &opt, soptions, tc.get()); + WriteController wc; + WriteBuffer wb(opt.db_write_buffer_size); + VersionSet versions(db_path_, &opt, soptions, tc.get(), &wb, &wc); std::vector dummy; ColumnFamilyDescriptor dummy_descriptor(kDefaultColumnFamilyName, ColumnFamilyOptions(opt)); @@ -1101,7 +1154,7 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, int max = -1; auto default_cfd = versions.GetColumnFamilySet()->GetDefault(); for (int i = 0; i < default_cfd->NumberLevels(); i++) { - if (default_cfd->current()->NumLevelFiles(i)) { + if (default_cfd->current()->storage_info()->NumLevelFiles(i)) { max = i; } } @@ -1268,15 +1321,15 @@ void ChangeCompactionStyleCommand::DoCommand() { // level 0 should have only 1 file if (i == 0 && num_files != 1) { exec_state_ = LDBCommandExecuteResult::FAILED("Number of db files at " - "level 0 after compaction is " + std::to_string(num_files) + + "level 0 after compaction is " + ToString(num_files) + ", not 1.\n"); return; } // other levels should have no file if (i > 0 && num_files != 0) { exec_state_ = LDBCommandExecuteResult::FAILED("Number of db files at " - "level " + std::to_string(i) + " after compaction is " + - std::to_string(num_files) + ", not 0.\n"); + "level " + ToString(i) + " after compaction is " + + ToString(num_files) + ", not 0.\n"); return; } } @@ -1285,9 +1338,19 @@ void ChangeCompactionStyleCommand::DoCommand() { files_per_level.c_str()); } +// ---------------------------------------------------------------------------- + 
+namespace { + +struct StdErrReporter : public log::Reader::Reporter { + virtual void Corruption(size_t bytes, const Status& s) { + cerr << "Corruption detected in log file " << s.ToString() << "\n"; + } +}; + class InMemoryHandler : public WriteBatch::Handler { public: - InMemoryHandler(stringstream& row, bool print_values) : Handler(),row_(row) { + InMemoryHandler(stringstream& row, bool print_values) : Handler(), row_(row) { print_values_ = print_values; } @@ -1317,13 +1380,63 @@ class InMemoryHandler : public WriteBatch::Handler { row_ << LDBCommand::StringToHex(key.ToString()) << " "; } - virtual ~InMemoryHandler() { }; + virtual ~InMemoryHandler() {} private: stringstream & row_; bool print_values_; }; +void DumpWalFile(std::string wal_file, bool print_header, bool print_values, + LDBCommandExecuteResult* exec_state) { + unique_ptr file; + Env* env_ = Env::Default(); + EnvOptions soptions; + Status status = env_->NewSequentialFile(wal_file, &file, soptions); + if (!status.ok()) { + if (exec_state) { + *exec_state = LDBCommandExecuteResult::FAILED("Failed to open WAL file " + + status.ToString()); + } else { + cerr << "Error: Failed to open WAL file " << status.ToString() + << std::endl; + } + } else { + StdErrReporter reporter; + log::Reader reader(move(file), &reporter, true, 0); + string scratch; + WriteBatch batch; + Slice record; + stringstream row; + if (print_header) { + cout << "Sequence,Count,ByteSize,Physical Offset,Key(s)"; + if (print_values) { + cout << " : value "; + } + cout << "\n"; + } + while (reader.ReadRecord(&record, &scratch)) { + row.str(""); + if (record.size() < 12) { + reporter.Corruption(record.size(), + Status::Corruption("log record too small")); + } else { + WriteBatchInternal::SetContents(&batch, record); + row << WriteBatchInternal::Sequence(&batch) << ","; + row << WriteBatchInternal::Count(&batch) << ","; + row << WriteBatchInternal::ByteSize(&batch) << ","; + row << reader.LastRecordOffset() << ","; + InMemoryHandler 
handler(row, print_values); + batch.Iterate(&handler); + row << "\n"; + } + cout << row.str(); + } + } +} + +} // namespace + const string WALDumperCommand::ARG_WAL_FILE = "walfile"; const string WALDumperCommand::ARG_PRINT_VALUE = "print_value"; const string WALDumperCommand::ARG_PRINT_HEADER = "header"; @@ -1361,53 +1474,10 @@ void WALDumperCommand::Help(string& ret) { } void WALDumperCommand::DoCommand() { - struct StdErrReporter : public log::Reader::Reporter { - virtual void Corruption(size_t bytes, const Status& s) { - cerr<<"Corruption detected in log file "< file; - Env* env_ = Env::Default(); - EnvOptions soptions; - Status status = env_->NewSequentialFile(wal_file_, &file, soptions); - if (!status.ok()) { - exec_state_ = LDBCommandExecuteResult::FAILED("Failed to open WAL file " + - status.ToString()); - } else { - StdErrReporter reporter; - log::Reader reader(move(file), &reporter, true, 0); - string scratch; - WriteBatch batch; - Slice record; - stringstream row; - if (print_header_) { - cout<<"Sequence,Count,ByteSize,Physical Offset,Key(s)"; - if (print_values_) { - cout << " : value "; - } - cout << "\n"; - } - while(reader.ReadRecord(&record, &scratch)) { - row.str(""); - if (record.size() < 12) { - reporter.Corruption( - record.size(), Status::Corruption("log record too small")); - } else { - WriteBatchInternal::SetContents(&batch, record); - row<& params, const map& options, const vector& flags) : @@ -1446,6 +1516,7 @@ void GetCommand::DoCommand() { } } +// ---------------------------------------------------------------------------- ApproxSizeCommand::ApproxSizeCommand(const vector& params, const map& options, const vector& flags) : @@ -1497,6 +1568,7 @@ void ApproxSizeCommand::DoCommand() { */ } +// ---------------------------------------------------------------------------- BatchPutCommand::BatchPutCommand(const vector& params, const map& options, const vector& flags) : @@ -1533,7 +1605,7 @@ void BatchPutCommand::DoCommand() { WriteBatch batch; 
for (vector>::const_iterator itr - = key_values_.begin(); itr != key_values_.end(); itr++) { + = key_values_.begin(); itr != key_values_.end(); ++itr) { batch.Put(itr->first, itr->second); } Status st = db_->Write(WriteOptions(), &batch); @@ -1550,6 +1622,7 @@ Options BatchPutCommand::PrepareOptionsForOpenDB() { return opt; } +// ---------------------------------------------------------------------------- ScanCommand::ScanCommand(const vector& params, const map& options, const vector& flags) : @@ -1661,6 +1734,7 @@ void ScanCommand::DoCommand() { delete it; } +// ---------------------------------------------------------------------------- DeleteCommand::DeleteCommand(const vector& params, const map& options, const vector& flags) : @@ -1740,6 +1814,7 @@ Options PutCommand::PrepareOptionsForOpenDB() { return opt; } +// ---------------------------------------------------------------------------- const char* DBQuerierCommand::HELP_CMD = "help"; const char* DBQuerierCommand::GET_CMD = "get"; @@ -1821,6 +1896,8 @@ void DBQuerierCommand::DoCommand() { } } +// ---------------------------------------------------------------------------- + CheckConsistencyCommand::CheckConsistencyCommand(const vector& params, const map& options, const vector& flags) : LDBCommand(options, flags, false, @@ -1849,5 +1926,117 @@ void CheckConsistencyCommand::DoCommand() { } } +// ---------------------------------------------------------------------------- + +namespace { + +void DumpSstFile(std::string filename, bool output_hex, bool show_properties) { + std::string from_key; + std::string to_key; + if (filename.length() <= 4 || + filename.rfind(".sst") != filename.length() - 4) { + std::cout << "Invalid sst file name." 
<< std::endl; + return; + } + // no verification + rocksdb::SstFileReader reader(filename, false, output_hex); + Status st = reader.ReadSequential(true, -1, false, // has_from + from_key, false, // has_to + to_key); + if (!st.ok()) { + std::cerr << "Error in reading SST file " << filename << st.ToString() + << std::endl; + return; + } + + if (show_properties) { + const rocksdb::TableProperties* table_properties; + + std::shared_ptr + table_properties_from_reader; + st = reader.ReadTableProperties(&table_properties_from_reader); + if (!st.ok()) { + std::cerr << filename << ": " << st.ToString() + << ". Try to use initial table properties" << std::endl; + table_properties = reader.GetInitTableProperties(); + } else { + table_properties = table_properties_from_reader.get(); + } + if (table_properties != nullptr) { + std::cout << std::endl << "Table Properties:" << std::endl; + std::cout << table_properties->ToString("\n") << std::endl; + std::cout << "# deleted keys: " + << rocksdb::GetDeletedKeys( + table_properties->user_collected_properties) + << std::endl; + } + } +} + +} // namespace + +DBFileDumperCommand::DBFileDumperCommand(const vector& params, + const map& options, + const vector& flags) + : LDBCommand(options, flags, true, BuildCmdLineOptions({})) {} + +void DBFileDumperCommand::Help(string& ret) { + ret.append(" "); + ret.append(DBFileDumperCommand::Name()); + ret.append("\n"); +} + +void DBFileDumperCommand::DoCommand() { + if (!db_) { + return; + } + Status s; + + std::cout << "Manifest File" << std::endl; + std::cout << "==============================" << std::endl; + std::string manifest_filename; + s = ReadFileToString(db_->GetEnv(), CurrentFileName(db_->GetName()), + &manifest_filename); + if (!s.ok() || manifest_filename.empty() || + manifest_filename.back() != '\n') { + std::cerr << "Error when reading CURRENT file " + << CurrentFileName(db_->GetName()) << std::endl; + } + // remove the trailing '\n' + 
manifest_filename.resize(manifest_filename.size() - 1); + string manifest_filepath = db_->GetName() + "/" + manifest_filename; + std::cout << manifest_filepath << std::endl; + DumpManifestFile(manifest_filepath, false, false); + std::cout << std::endl; + + std::cout << "SST Files" << std::endl; + std::cout << "==============================" << std::endl; + std::vector metadata; + db_->GetLiveFilesMetaData(&metadata); + for (auto& fileMetadata : metadata) { + std::string filename = fileMetadata.db_path + fileMetadata.name; + std::cout << filename << " level:" << fileMetadata.level << std::endl; + std::cout << "------------------------------" << std::endl; + DumpSstFile(filename, false, true); + std::cout << std::endl; + } + std::cout << std::endl; + + std::cout << "Write Ahead Log Files" << std::endl; + std::cout << "==============================" << std::endl; + rocksdb::VectorLogPtr wal_files; + s = db_->GetSortedWalFiles(wal_files); + if (!s.ok()) { + std::cerr << "Error when getting WAL files" << std::endl; + } else { + for (auto& wal : wal_files) { + // TODO(qyang): option.wal_dir should be passed into ldb command + std::string filename = db_->GetOptions().wal_dir + wal->PathName(); + std::cout << filename << std::endl; + DumpWalFile(filename, true, true, &exec_state_); + } + } +} + } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/util/ldb_cmd.h b/util/ldb_cmd.h index 0553fe64a..e75433e76 100644 --- a/util/ldb_cmd.h +++ b/util/ldb_cmd.h @@ -4,12 +4,17 @@ // of patent rights can be found in the PATENTS file in the same directory. 
// #pragma once + +#ifndef ROCKSDB_LITE + #include #include #include #include #include #include +#include +#include #include "db/version_set.h" #include "rocksdb/env.h" @@ -46,9 +51,11 @@ public: static const string ARG_TO; static const string ARG_MAX_KEYS; static const string ARG_BLOOM_BITS; + static const string ARG_FIX_PREFIX_LEN; static const string ARG_COMPRESSION_TYPE; static const string ARG_BLOCK_SIZE; static const string ARG_AUTO_COMPACTION; + static const string ARG_DB_WRITE_BUFFER_SIZE; static const string ARG_WRITE_BUFFER_SIZE; static const string ARG_FILE_SIZE; static const string ARG_CREATE_IF_MISSING; @@ -284,9 +291,10 @@ protected: * passed in. */ vector BuildCmdLineOptions(vector options) { - vector ret = {ARG_DB, ARG_BLOOM_BITS, ARG_BLOCK_SIZE, - ARG_AUTO_COMPACTION, ARG_COMPRESSION_TYPE, - ARG_WRITE_BUFFER_SIZE, ARG_FILE_SIZE}; + vector ret = {ARG_DB, ARG_BLOOM_BITS, + ARG_BLOCK_SIZE, ARG_AUTO_COMPACTION, + ARG_COMPRESSION_TYPE, ARG_WRITE_BUFFER_SIZE, + ARG_FILE_SIZE, ARG_FIX_PREFIX_LEN}; ret.insert(ret.end(), options.begin(), options.end()); return ret; } @@ -386,6 +394,19 @@ private: string to_; }; +class DBFileDumperCommand : public LDBCommand { + public: + static string Name() { return "dump_live_files"; } + + DBFileDumperCommand(const vector& params, + const map& options, + const vector& flags); + + static void Help(string& ret); + + virtual void DoCommand(); +}; + class DBDumperCommand: public LDBCommand { public: static string Name() { return "dump"; } @@ -728,3 +749,5 @@ public: }; } // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/util/ldb_cmd_execute_result.h b/util/ldb_cmd_execute_result.h index b9121b2b0..b8e6c4634 100644 --- a/util/ldb_cmd_execute_result.h +++ b/util/ldb_cmd_execute_result.h @@ -13,15 +13,10 @@ public: EXEC_NOT_STARTED = 0, EXEC_SUCCEED = 1, EXEC_FAILED = 2, }; - LDBCommandExecuteResult() { - state_ = EXEC_NOT_STARTED; - message_ = ""; - } + LDBCommandExecuteResult() : state_(EXEC_NOT_STARTED), 
message_("") {} - LDBCommandExecuteResult(State state, std::string& msg) { - state_ = state; - message_ = msg; - } + LDBCommandExecuteResult(State state, std::string& msg) : + state_(state), message_(msg) {} std::string ToString() { std::string ret; diff --git a/util/ldb_tool.cc b/util/ldb_tool.cc index 271dba350..fe84fa933 100644 --- a/util/ldb_tool.cc +++ b/util/ldb_tool.cc @@ -47,11 +47,14 @@ public: " with 'put','get','scan','dump','query','batchput'" " : DB supports ttl and value is internally timestamp-suffixed\n"); ret.append(" --" + LDBCommand::ARG_BLOOM_BITS + "=\n"); + ret.append(" --" + LDBCommand::ARG_FIX_PREFIX_LEN + "=\n"); ret.append(" --" + LDBCommand::ARG_COMPRESSION_TYPE + "=\n"); ret.append(" --" + LDBCommand::ARG_BLOCK_SIZE + "=\n"); ret.append(" --" + LDBCommand::ARG_AUTO_COMPACTION + "=\n"); + ret.append(" --" + LDBCommand::ARG_DB_WRITE_BUFFER_SIZE + + "=\n"); ret.append(" --" + LDBCommand::ARG_WRITE_BUFFER_SIZE + "=\n"); ret.append(" --" + LDBCommand::ARG_FILE_SIZE + "=\n"); @@ -77,6 +80,7 @@ public: DBLoaderCommand::Help(ret); ManifestDumpCommand::Help(ret); ListColumnFamiliesCommand::Help(ret); + DBFileDumperCommand::Help(ret); InternalDumpCommand::Help(ret); fprintf(stderr, "%s\n", ret.c_str()); diff --git a/util/log_buffer.cc b/util/log_buffer.cc index 726c01442..ddddaec9f 100644 --- a/util/log_buffer.cc +++ b/util/log_buffer.cc @@ -13,17 +13,17 @@ LogBuffer::LogBuffer(const InfoLogLevel log_level, Logger*info_log) : log_level_(log_level), info_log_(info_log) {} -void LogBuffer::AddLogToBuffer(const char* format, va_list ap) { +void LogBuffer::AddLogToBuffer(size_t max_log_size, const char* format, + va_list ap) { if (!info_log_ || log_level_ < info_log_->GetInfoLogLevel()) { // Skip the level because of its level. 
return; } - const size_t kLogSizeLimit = 512; - char* alloc_mem = arena_.AllocateAligned(kLogSizeLimit); + char* alloc_mem = arena_.AllocateAligned(max_log_size); BufferedLog* buffered_log = new (alloc_mem) BufferedLog(); char* p = buffered_log->message; - char* limit = alloc_mem + kLogSizeLimit - 1; + char* limit = alloc_mem + max_log_size - 1; // store the time gettimeofday(&(buffered_log->now_tv), nullptr); @@ -61,11 +61,22 @@ void LogBuffer::FlushBufferToLog() { logs_.clear(); } +void LogToBuffer(LogBuffer* log_buffer, size_t max_log_size, const char* format, + ...) { + if (log_buffer != nullptr) { + va_list ap; + va_start(ap, format); + log_buffer->AddLogToBuffer(max_log_size, format, ap); + va_end(ap); + } +} + void LogToBuffer(LogBuffer* log_buffer, const char* format, ...) { + const size_t kDefaultMaxLogSize = 512; if (log_buffer != nullptr) { va_list ap; va_start(ap, format); - log_buffer->AddLogToBuffer(format, ap); + log_buffer->AddLogToBuffer(kDefaultMaxLogSize, format, ap); va_end(ap); } } diff --git a/util/log_buffer.h b/util/log_buffer.h index 2a24bf854..2d790086e 100644 --- a/util/log_buffer.h +++ b/util/log_buffer.h @@ -21,8 +21,9 @@ class LogBuffer { // info_log: logger to write the logs to LogBuffer(const InfoLogLevel log_level, Logger* info_log); - // Add a log entry to the buffer. - void AddLogToBuffer(const char* format, va_list ap); + // Add a log entry to the buffer. Use default max_log_size. + // max_log_size indicates maximum log size, including some metadata. + void AddLogToBuffer(size_t max_log_size, const char* format, va_list ap); size_t IsEmpty() const { return logs_.empty(); } @@ -44,6 +45,10 @@ class LogBuffer { // Add log to the LogBuffer for a delayed info logging. It can be used when // we want to add some logs inside a mutex. +// max_log_size indicates maximum log size, including some metadata. 
+extern void LogToBuffer(LogBuffer* log_buffer, size_t max_log_size, + const char* format, ...); +// Same as previous function, but with default max log size. extern void LogToBuffer(LogBuffer* log_buffer, const char* format, ...); } // namespace rocksdb diff --git a/util/logging.cc b/util/logging.cc index 1b5549d73..98d96b82b 100644 --- a/util/logging.cc +++ b/util/logging.cc @@ -9,7 +9,10 @@ #include "util/logging.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include @@ -42,7 +45,7 @@ int AppendHumanBytes(uint64_t bytes, char* output, int len) { void AppendNumberTo(std::string* str, uint64_t num) { char buf[30]; - snprintf(buf, sizeof(buf), "%llu", (unsigned long long) num); + snprintf(buf, sizeof(buf), "%" PRIu64, num); str->append(buf); } diff --git a/util/logging.h b/util/logging.h index ce0269726..7ca8ae0a3 100644 --- a/util/logging.h +++ b/util/logging.h @@ -19,7 +19,6 @@ namespace rocksdb { class Slice; -class WritableFile; // Append a human-readable size in bytes int AppendHumanBytes(uint64_t bytes, char* output, int len); diff --git a/helpers/memenv/memenv.cc b/util/memenv.cc similarity index 84% rename from helpers/memenv/memenv.cc rename to util/memenv.cc index 185e7d822..e2db2e140 100644 --- a/helpers/memenv/memenv.cc +++ b/util/memenv.cc @@ -13,8 +13,30 @@ namespace rocksdb { +#ifndef ROCKSDB_LITE + namespace { +std::string NormalizeFileName(const std::string fname) { + if (fname.find("//") == std::string::npos) { + return fname; + } + std::string out_name = ""; + bool is_slash = false; + for (char c : fname) { + if (c == '/' && is_slash) { + continue; + } + out_name.append(1, c); + if (c == '/') { + is_slash = true; + } else { + is_slash = false; + } + } + return out_name; +} + class FileState { public: // FileStates are reference counted. 
The initial reference count is zero @@ -238,40 +260,43 @@ virtual Status NewSequentialFile(const std::string& fname, unique_ptr* result, const EnvOptions& soptions) { + std::string nfname = NormalizeFileName(fname); MutexLock lock(&mutex_); - if (file_map_.find(fname) == file_map_.end()) { + if (file_map_.find(nfname) == file_map_.end()) { *result = NULL; return Status::IOError(fname, "File not found"); } - result->reset(new SequentialFileImpl(file_map_[fname])); + result->reset(new SequentialFileImpl(file_map_[nfname])); return Status::OK(); } virtual Status NewRandomAccessFile(const std::string& fname, unique_ptr* result, const EnvOptions& soptions) { + std::string nfname = NormalizeFileName(fname); MutexLock lock(&mutex_); - if (file_map_.find(fname) == file_map_.end()) { + if (file_map_.find(nfname) == file_map_.end()) { *result = NULL; return Status::IOError(fname, "File not found"); } - result->reset(new RandomAccessFileImpl(file_map_[fname])); + result->reset(new RandomAccessFileImpl(file_map_[nfname])); return Status::OK(); } virtual Status NewWritableFile(const std::string& fname, unique_ptr* result, const EnvOptions& soptions) { + std::string nfname = NormalizeFileName(fname); MutexLock lock(&mutex_); - if (file_map_.find(fname) != file_map_.end()) { - DeleteFileInternal(fname); + if (file_map_.find(nfname) != file_map_.end()) { + DeleteFileInternal(nfname); } FileState* file = new FileState(); file->Ref(); - file_map_[fname] = file; + file_map_[nfname] = file; result->reset(new WritableFileImpl(file)); return Status::OK(); @@ -284,8 +309,9 @@ class InMemoryEnv : public EnvWrapper { } virtual bool FileExists(const std::string& fname) { + std::string nfname = NormalizeFileName(fname); MutexLock lock(&mutex_); - return file_map_.find(fname) != file_map_.end(); + return file_map_.find(nfname) != file_map_.end(); } virtual Status GetChildren(const std::string& dir, @@ -315,12 +341,13 @@ class InMemoryEnv : public EnvWrapper { } virtual Status DeleteFile(const std::string&
fname) { + std::string nfname = NormalizeFileName(fname); MutexLock lock(&mutex_); - if (file_map_.find(fname) == file_map_.end()) { + if (file_map_.find(nfname) == file_map_.end()) { return Status::IOError(fname, "File not found"); } - DeleteFileInternal(fname); + DeleteFileInternal(nfname); return Status::OK(); } @@ -337,12 +364,14 @@ class InMemoryEnv : public EnvWrapper { } virtual Status GetFileSize(const std::string& fname, uint64_t* file_size) { + std::string nfname = NormalizeFileName(fname); MutexLock lock(&mutex_); - if (file_map_.find(fname) == file_map_.end()) { + + if (file_map_.find(nfname) == file_map_.end()) { return Status::IOError(fname, "File not found"); } - *file_size = file_map_[fname]->Size(); + *file_size = file_map_[nfname]->Size(); return Status::OK(); } @@ -351,16 +380,17 @@ class InMemoryEnv : public EnvWrapper { return Status::NotSupported("getFileMTime", "Not supported in MemEnv"); } - virtual Status RenameFile(const std::string& src, - const std::string& target) { + virtual Status RenameFile(const std::string& src, const std::string& dest) { + std::string nsrc = NormalizeFileName(src); + std::string ndest = NormalizeFileName(dest); MutexLock lock(&mutex_); - if (file_map_.find(src) == file_map_.end()) { + if (file_map_.find(nsrc) == file_map_.end()) { return Status::IOError(src, "File not found"); } - DeleteFileInternal(target); - file_map_[target] = file_map_[src]; - file_map_.erase(src); + DeleteFileInternal(dest); + file_map_[ndest] = file_map_[nsrc]; + file_map_.erase(nsrc); return Status::OK(); } @@ -392,4 +422,10 @@ Env* NewMemEnv(Env* base_env) { return new InMemoryEnv(base_env); } +#else // ROCKSDB_LITE + +Env* NewMemEnv(Env* base_env) { return nullptr; } + +#endif // !ROCKSDB_LITE + } // namespace rocksdb diff --git a/helpers/memenv/memenv_test.cc b/util/memenv_test.cc similarity index 96% rename from helpers/memenv/memenv_test.cc rename to util/memenv_test.cc index ea3ed61a0..6154893f0 100644 --- 
a/helpers/memenv/memenv_test.cc +++ b/util/memenv_test.cc @@ -222,6 +222,15 @@ TEST(MemEnvTest, DBTest) { } delete db; + + options.create_if_missing = false; + ASSERT_OK(DB::Open(options, "/dir/db", &db)); + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + delete db; } } // namespace rocksdb diff --git a/util/mock_env.cc b/util/mock_env.cc new file mode 100644 index 000000000..2b357cefe --- /dev/null +++ b/util/mock_env.cc @@ -0,0 +1,699 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "util/mock_env.h" +#include +#include +#include +#include "util/rate_limiter.h" +#include "util/random.h" +#include "util/murmurhash.h" + +namespace rocksdb { + +class MemFile { + public: + explicit MemFile(const std::string& fn, bool _is_lock_file = false) + : fn_(fn), + refs_(0), + is_lock_file_(_is_lock_file), + locked_(false), + size_(0), + modified_time_(Now()), + rnd_(static_cast( + MurmurHash(fn.data(), static_cast(fn.size()), 0))), + fsynced_bytes_(0) {} + + void Ref() { + MutexLock lock(&mutex_); + ++refs_; + } + + bool is_lock_file() const { return is_lock_file_; } + + bool Lock() { + assert(is_lock_file_); + MutexLock lock(&mutex_); + if (locked_) { + return false; + } else { + locked_ = true; + return true; + } + } + + void Unlock() { + assert(is_lock_file_); + MutexLock lock(&mutex_); + locked_ = false; + } + + void Unref() { + bool do_delete = false; + { + MutexLock lock(&mutex_); + --refs_; + assert(refs_ >= 0); + if (refs_ <= 0) { + do_delete = true; + } + } + + if (do_delete) { + delete this; + } + } + + uint64_t Size() const { + return size_; + } + + void Truncate(size_t size) { + MutexLock lock(&mutex_); + if (size < size_) { + data_.resize(size); + size_ = size; + } + } + + void CorruptBuffer() { + if (fsynced_bytes_ >= size_) { + return; + } + uint64_t buffered_bytes = size_ - fsynced_bytes_; + uint64_t start = + fsynced_bytes_ + rnd_.Uniform(static_cast(buffered_bytes)); + uint64_t end = std::min(start + 512, size_.load()); + MutexLock lock(&mutex_); + for (uint64_t pos = start; pos < end; ++pos) { + data_[pos] = static_cast(rnd_.Uniform(256)); + } + } + + Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const { + MutexLock lock(&mutex_); + if (offset > Size()) { + return Status::IOError("Offset greater than file size."); + } + const uint64_t available = Size() - offset; + if (n > available) { + n = available; + } + if (n == 0) { + *result = Slice(); + return Status::OK(); + } + if (scratch) { + memcpy(scratch, 
&(data_[offset]), n); + *result = Slice(scratch, n); + } else { + *result = Slice(&(data_[offset]), n); + } + return Status::OK(); + } + + Status Append(const Slice& data) { + MutexLock lock(&mutex_); + data_.append(data.data(), data.size()); + size_ = data_.size(); + modified_time_ = Now(); + return Status::OK(); + } + + Status Fsync() { + fsynced_bytes_ = size_.load(); + return Status::OK(); + } + + uint64_t ModifiedTime() const { + return modified_time_; + } + + private: + uint64_t Now() { + return std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch()).count(); + } + + // Private since only Unref() should be used to delete it. + ~MemFile() { + assert(refs_ == 0); + } + + // No copying allowed. + MemFile(const MemFile&); + void operator=(const MemFile&); + + const std::string fn_; + mutable port::Mutex mutex_; + int refs_; + bool is_lock_file_; + bool locked_; + + // Data written into this file, all bytes before fsynced_bytes are + // persistent. + std::string data_; + std::atomic size_; + std::atomic modified_time_; + + Random rnd_; + std::atomic fsynced_bytes_; +}; + +namespace { + +class MockSequentialFile : public SequentialFile { + public: + explicit MockSequentialFile(MemFile* file) : file_(file), pos_(0) { + file_->Ref(); + } + + ~MockSequentialFile() { + file_->Unref(); + } + + virtual Status Read(size_t n, Slice* result, char* scratch) { + Status s = file_->Read(pos_, n, result, scratch); + if (s.ok()) { + pos_ += result->size(); + } + return s; + } + + virtual Status Skip(uint64_t n) { + if (pos_ > file_->Size()) { + return Status::IOError("pos_ > file_->Size()"); + } + const size_t available = file_->Size() - pos_; + if (n > available) { + n = available; + } + pos_ += n; + return Status::OK(); + } + + private: + MemFile* file_; + size_t pos_; +}; + +class MockRandomAccessFile : public RandomAccessFile { + public: + explicit MockRandomAccessFile(MemFile* file) : file_(file) { + file_->Ref(); + } + + ~MockRandomAccessFile() { 
+ file_->Unref(); + } + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + return file_->Read(offset, n, result, scratch); + } + + private: + MemFile* file_; +}; + +class MockWritableFile : public WritableFile { + public: + MockWritableFile(MemFile* file, RateLimiter* rate_limiter) + : file_(file), + rate_limiter_(rate_limiter) { + file_->Ref(); + } + + ~MockWritableFile() { + file_->Unref(); + } + + virtual Status Append(const Slice& data) { + uint64_t bytes_written = 0; + while (bytes_written < data.size()) { + auto bytes = RequestToken(data.size() - bytes_written); + Status s = file_->Append(Slice(data.data() + bytes_written, bytes)); + if (!s.ok()) { + return s; + } + bytes_written += bytes; + } + return Status::OK(); + } + + virtual Status Close() { + return file_->Fsync(); + } + + virtual Status Flush() { + return Status::OK(); + } + + virtual Status Sync() { + return file_->Fsync(); + } + + virtual uint64_t GetFileSize() { + return file_->Size(); + } + + private: + inline size_t RequestToken(size_t bytes) { + if (rate_limiter_ && io_priority_ < Env::IO_TOTAL) { + bytes = std::min(bytes, + static_cast(rate_limiter_->GetSingleBurstBytes())); + rate_limiter_->Request(bytes, io_priority_); + } + return bytes; + } + + MemFile* file_; + RateLimiter* rate_limiter_; +}; + +class MockEnvDirectory : public Directory { + public: + virtual Status Fsync() { return Status::OK(); } +}; + +class MockEnvFileLock : public FileLock { + public: + explicit MockEnvFileLock(const std::string& fname) + : fname_(fname) {} + + std::string FileName() const { + return fname_; + } + + private: + const std::string fname_; +}; + +class TestMemLogger : public Logger { + private: + std::unique_ptr file_; + std::atomic_size_t log_size_; + static const uint64_t flush_every_seconds_ = 5; + std::atomic_uint_fast64_t last_flush_micros_; + Env* env_; + bool flush_pending_; + + public: + TestMemLogger(std::unique_ptr f, Env* env, + const InfoLogLevel 
log_level = InfoLogLevel::ERROR_LEVEL) + : Logger(log_level), + file_(std::move(f)), + log_size_(0), + last_flush_micros_(0), + env_(env), + flush_pending_(false) {} + virtual ~TestMemLogger() { + } + + virtual void Flush() { + if (flush_pending_) { + flush_pending_ = false; + } + last_flush_micros_ = env_->NowMicros(); + } + + using Logger::Logv; + virtual void Logv(const char* format, va_list ap) { + // We try twice: the first time with a fixed-size stack allocated buffer, + // and the second time with a much larger dynamically allocated buffer. + char buffer[500]; + for (int iter = 0; iter < 2; iter++) { + char* base; + int bufsize; + if (iter == 0) { + bufsize = sizeof(buffer); + base = buffer; + } else { + bufsize = 30000; + base = new char[bufsize]; + } + char* p = base; + char* limit = base + bufsize; + + struct timeval now_tv; + gettimeofday(&now_tv, nullptr); + const time_t seconds = now_tv.tv_sec; + struct tm t; + localtime_r(&seconds, &t); + p += snprintf(p, limit - p, + "%04d/%02d/%02d-%02d:%02d:%02d.%06d ", + t.tm_year + 1900, + t.tm_mon + 1, + t.tm_mday, + t.tm_hour, + t.tm_min, + t.tm_sec, + static_cast(now_tv.tv_usec)); + + // Print the message + if (p < limit) { + va_list backup_ap; + va_copy(backup_ap, ap); + p += vsnprintf(p, limit - p, format, backup_ap); + va_end(backup_ap); + } + + // Truncate to available space if necessary + if (p >= limit) { + if (iter == 0) { + continue; // Try again with larger buffer + } else { + p = limit - 1; + } + } + + // Add newline if necessary + if (p == base || p[-1] != '\n') { + *p++ = '\n'; + } + + assert(p <= limit); + const size_t write_size = p - base; + + file_->Append(Slice(base, write_size)); + flush_pending_ = true; + log_size_ += write_size; + uint64_t now_micros = static_cast(now_tv.tv_sec) * 1000000 + + now_tv.tv_usec; + if (now_micros - last_flush_micros_ >= flush_every_seconds_ * 1000000) { + flush_pending_ = false; + last_flush_micros_ = now_micros; + } + if (base != buffer) { + delete[] base; + } 
+ break; + } + } + size_t GetLogFileSize() const { + return log_size_; + } +}; + +} // Anonymous namespace + +MockEnv::MockEnv(Env* base_env) + : EnvWrapper(base_env) {} + +MockEnv::~MockEnv() { + for (FileSystem::iterator i = file_map_.begin(); i != file_map_.end(); ++i) { + i->second->Unref(); + } +} + + // Partial implementation of the Env interface. +Status MockEnv::NewSequentialFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& soptions) { + auto fn = NormalizePath(fname); + MutexLock lock(&mutex_); + if (file_map_.find(fn) == file_map_.end()) { + *result = NULL; + return Status::IOError(fn, "File not found"); + } + auto* f = file_map_[fn]; + if (f->is_lock_file()) { + return Status::InvalidArgument(fn, "Cannot open a lock file."); + } + result->reset(new MockSequentialFile(f)); + return Status::OK(); +} + +Status MockEnv::NewRandomAccessFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& soptions) { + auto fn = NormalizePath(fname); + MutexLock lock(&mutex_); + if (file_map_.find(fn) == file_map_.end()) { + *result = NULL; + return Status::IOError(fn, "File not found"); + } + auto* f = file_map_[fn]; + if (f->is_lock_file()) { + return Status::InvalidArgument(fn, "Cannot open a lock file."); + } + result->reset(new MockRandomAccessFile(f)); + return Status::OK(); +} + +Status MockEnv::NewWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& env_options) { + auto fn = NormalizePath(fname); + MutexLock lock(&mutex_); + if (file_map_.find(fn) != file_map_.end()) { + DeleteFileInternal(fn); + } + MemFile* file = new MemFile(fn, false); + file->Ref(); + file_map_[fn] = file; + + result->reset(new MockWritableFile(file, env_options.rate_limiter)); + return Status::OK(); +} + +Status MockEnv::NewRandomRWFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) { + return Status::OK(); +} + +Status MockEnv::NewDirectory(const std::string& name, + unique_ptr* result) { + 
result->reset(new MockEnvDirectory()); + return Status::OK(); +} + +bool MockEnv::FileExists(const std::string& fname) { + auto fn = NormalizePath(fname); + MutexLock lock(&mutex_); + if (file_map_.find(fn) != file_map_.end()) { + // File exists + return true; + } + // Now also check if fn exists as a dir + for (const auto& iter : file_map_) { + const std::string& filename = iter.first; + if (filename.size() >= fn.size() + 1 && + filename[fn.size()] == '/' && + Slice(filename).starts_with(Slice(fn))) { + return true; + } + } + return false; +} + +Status MockEnv::GetChildren(const std::string& dir, + std::vector* result) { + auto d = NormalizePath(dir); + { + MutexLock lock(&mutex_); + result->clear(); + for (const auto& iter : file_map_) { + const std::string& filename = iter.first; + + if (filename.size() >= d.size() + 1 && filename[d.size()] == '/' && + Slice(filename).starts_with(Slice(d))) { + size_t next_slash = filename.find('/', d.size() + 1); + if (next_slash != std::string::npos) { + result->push_back(filename.substr( + d.size() + 1, next_slash - d.size() - 1)); + } else { + result->push_back(filename.substr(d.size() + 1)); + } + } + } + } + result->erase(std::unique(result->begin(), result->end()), result->end()); + return Status::OK(); +} + +void MockEnv::DeleteFileInternal(const std::string& fname) { + assert(fname == NormalizePath(fname)); + const auto& pair = file_map_.find(fname); + if (pair != file_map_.end()) { + pair->second->Unref(); + file_map_.erase(fname); + } +} + +Status MockEnv::DeleteFile(const std::string& fname) { + auto fn = NormalizePath(fname); + MutexLock lock(&mutex_); + if (file_map_.find(fn) == file_map_.end()) { + return Status::IOError(fn, "File not found"); + } + + DeleteFileInternal(fn); + return Status::OK(); +} + +Status MockEnv::CreateDir(const std::string& dirname) { + return Status::OK(); +} + +Status MockEnv::CreateDirIfMissing(const std::string& dirname) { + return Status::OK(); +} + +Status MockEnv::DeleteDir(const 
std::string& dirname) { + return Status::OK(); +} + +Status MockEnv::GetFileSize(const std::string& fname, uint64_t* file_size) { + auto fn = NormalizePath(fname); + MutexLock lock(&mutex_); + auto iter = file_map_.find(fn); + if (iter == file_map_.end()) { + return Status::IOError(fn, "File not found"); + } + + *file_size = iter->second->Size(); + return Status::OK(); +} + +Status MockEnv::GetFileModificationTime(const std::string& fname, + uint64_t* time) { + auto fn = NormalizePath(fname); + MutexLock lock(&mutex_); + auto iter = file_map_.find(fn); + if (iter == file_map_.end()) { + return Status::IOError(fn, "File not found"); + } + *time = iter->second->ModifiedTime(); + return Status::OK(); +} + +Status MockEnv::RenameFile(const std::string& src, const std::string& dest) { + auto s = NormalizePath(src); + auto t = NormalizePath(dest); + MutexLock lock(&mutex_); + if (file_map_.find(s) == file_map_.end()) { + return Status::IOError(s, "File not found"); + } + + DeleteFileInternal(t); + file_map_[t] = file_map_[s]; + file_map_.erase(s); + return Status::OK(); +} + +Status MockEnv::LinkFile(const std::string& src, const std::string& dest) { + auto s = NormalizePath(src); + auto t = NormalizePath(dest); + MutexLock lock(&mutex_); + if (file_map_.find(s) == file_map_.end()) { + return Status::IOError(s, "File not found"); + } + + DeleteFileInternal(t); + file_map_[t] = file_map_[s]; + return Status::OK(); +} + +Status MockEnv::NewLogger(const std::string& fname, + shared_ptr* result) { + auto fn = NormalizePath(fname); + MutexLock lock(&mutex_); + auto iter = file_map_.find(fn); + MemFile* file = nullptr; + if (iter == file_map_.end()) { + file = new MemFile(fn, false); + file->Ref(); + file_map_[fn] = file; + } else { + file = iter->second; + } + std::unique_ptr f(new MockWritableFile(file, nullptr)); + result->reset(new TestMemLogger(std::move(f), this)); + return Status::OK(); +} + +Status MockEnv::LockFile(const std::string& fname, FileLock** flock) { + auto 
fn = NormalizePath(fname); + { + MutexLock lock(&mutex_); + if (file_map_.find(fn) != file_map_.end()) { + if (!file_map_[fn]->is_lock_file()) { + return Status::InvalidArgument(fname, "Not a lock file."); + } + if (!file_map_[fn]->Lock()) { + return Status::IOError(fn, "Lock is already held."); + } + } else { + auto* file = new MemFile(fn, true); + file->Ref(); + file->Lock(); + file_map_[fn] = file; + } + } + *flock = new MockEnvFileLock(fn); + return Status::OK(); +} + +Status MockEnv::UnlockFile(FileLock* flock) { + std::string fn = dynamic_cast(flock)->FileName(); + { + MutexLock lock(&mutex_); + if (file_map_.find(fn) != file_map_.end()) { + if (!file_map_[fn]->is_lock_file()) { + return Status::InvalidArgument(fn, "Not a lock file."); + } + file_map_[fn]->Unlock(); + } + } + delete flock; + return Status::OK(); +} + +Status MockEnv::GetTestDirectory(std::string* path) { + *path = "/test"; + return Status::OK(); +} + +// Non-virtual functions, specific to MockEnv +Status MockEnv::Truncate(const std::string& fname, size_t size) { + auto fn = NormalizePath(fname); + MutexLock lock(&mutex_); + auto iter = file_map_.find(fn); + if (iter == file_map_.end()) { + return Status::IOError(fn, "File not found"); + } + iter->second->Truncate(size); + return Status::OK(); +} + +Status MockEnv::CorruptBuffer(const std::string& fname) { + auto fn = NormalizePath(fname); + MutexLock lock(&mutex_); + auto iter = file_map_.find(fn); + if (iter == file_map_.end()) { + return Status::IOError(fn, "File not found"); + } + iter->second->CorruptBuffer(); + return Status::OK(); +} + +std::string MockEnv::NormalizePath(const std::string path) { + std::string dst; + for (auto c : path) { + if (!dst.empty() && c == '/' && dst.back() == '/') { + continue; + } + dst.push_back(c); + } + return dst; +} + +} // namespace rocksdb diff --git a/util/mock_env.h b/util/mock_env.h new file mode 100644 index 000000000..bbd191d78 --- /dev/null +++ b/util/mock_env.h @@ -0,0 +1,97 @@ +// Copyright (c) 
2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include +#include +#include +#include +#include "rocksdb/env.h" +#include "rocksdb/status.h" +#include "port/port.h" +#include "util/mutexlock.h" + +namespace rocksdb { + +class MemFile; +class MockEnv : public EnvWrapper { + public: + explicit MockEnv(Env* base_env); + + virtual ~MockEnv(); + + // Partial implementation of the Env interface. + virtual Status NewSequentialFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& soptions); + + virtual Status NewRandomAccessFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& soptions); + + virtual Status NewWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& env_options); + + virtual Status NewRandomRWFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options); + + virtual Status NewDirectory(const std::string& name, + unique_ptr* result); + + virtual bool FileExists(const std::string& fname); + + virtual Status GetChildren(const std::string& dir, + std::vector* result); + + void DeleteFileInternal(const std::string& fname); + + virtual Status DeleteFile(const std::string& fname); + + virtual Status CreateDir(const std::string& dirname); + + virtual Status CreateDirIfMissing(const std::string& dirname); + + virtual Status DeleteDir(const std::string& dirname); + + virtual Status GetFileSize(const std::string& fname, uint64_t* file_size); + + virtual Status GetFileModificationTime(const std::string& fname, + uint64_t* time); + 
+ virtual Status RenameFile(const std::string& src, + const std::string& target); + + virtual Status LinkFile(const std::string& src, const std::string& target); + + virtual Status NewLogger(const std::string& fname, + shared_ptr* result); + + virtual Status LockFile(const std::string& fname, FileLock** flock); + + virtual Status UnlockFile(FileLock* flock); + + virtual Status GetTestDirectory(std::string* path); + + // Non-virtual functions, specific to MockEnv + Status Truncate(const std::string& fname, size_t size); + + Status CorruptBuffer(const std::string& fname); + + private: + std::string NormalizePath(const std::string path); + + // Map from filenames to MemFile objects, representing a simple file system. + typedef std::map FileSystem; + port::Mutex mutex_; + FileSystem file_map_; // Protected by mutex_. +}; + +} // namespace rocksdb diff --git a/util/mock_env_test.cc b/util/mock_env_test.cc new file mode 100644 index 000000000..521f0fb1c --- /dev/null +++ b/util/mock_env_test.cc @@ -0,0 +1,271 @@ +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include +#include +#include +#include "util/mock_env.h" +#include "db/db_impl.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "util/testharness.h" + +namespace rocksdb { + +class MockEnvTest { + public: + Env* env_; + const EnvOptions soptions_; + + MockEnvTest() + : env_(new MockEnv(Env::Default())) { + } + ~MockEnvTest() { + delete env_; + } +}; + +TEST(MockEnvTest, Basics) { + uint64_t file_size; + unique_ptr writable_file; + std::vector children; + + ASSERT_OK(env_->CreateDir("/dir")); + + // Check that the directory is empty. 
+ ASSERT_TRUE(!env_->FileExists("/dir/non_existent")); + ASSERT_TRUE(!env_->GetFileSize("/dir/non_existent", &file_size).ok()); + ASSERT_OK(env_->GetChildren("/dir", &children)); + ASSERT_EQ(0U, children.size()); + + // Create a file. + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); + writable_file.reset(); + + // Check that the file exists. + ASSERT_TRUE(env_->FileExists("/dir/f")); + ASSERT_OK(env_->GetFileSize("/dir/f", &file_size)); + ASSERT_EQ(0U, file_size); + ASSERT_OK(env_->GetChildren("/dir", &children)); + ASSERT_EQ(1U, children.size()); + ASSERT_EQ("f", children[0]); + + // Write to the file. + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); + ASSERT_OK(writable_file->Append("abc")); + writable_file.reset(); + + // Check for expected size. + ASSERT_OK(env_->GetFileSize("/dir/f", &file_size)); + ASSERT_EQ(3U, file_size); + + // Check that renaming works. + ASSERT_TRUE(!env_->RenameFile("/dir/non_existent", "/dir/g").ok()); + ASSERT_OK(env_->RenameFile("/dir/f", "/dir/g")); + ASSERT_TRUE(!env_->FileExists("/dir/f")); + ASSERT_TRUE(env_->FileExists("/dir/g")); + ASSERT_OK(env_->GetFileSize("/dir/g", &file_size)); + ASSERT_EQ(3U, file_size); + + // Check that opening non-existent file fails. + unique_ptr seq_file; + unique_ptr rand_file; + ASSERT_TRUE(!env_->NewSequentialFile("/dir/non_existent", &seq_file, + soptions_).ok()); + ASSERT_TRUE(!seq_file); + ASSERT_TRUE(!env_->NewRandomAccessFile("/dir/non_existent", &rand_file, + soptions_).ok()); + ASSERT_TRUE(!rand_file); + + // Check that deleting works. 
+ ASSERT_TRUE(!env_->DeleteFile("/dir/non_existent").ok()); + ASSERT_OK(env_->DeleteFile("/dir/g")); + ASSERT_TRUE(!env_->FileExists("/dir/g")); + ASSERT_OK(env_->GetChildren("/dir", &children)); + ASSERT_EQ(0U, children.size()); + ASSERT_OK(env_->DeleteDir("/dir")); +} + +TEST(MockEnvTest, ReadWrite) { + unique_ptr writable_file; + unique_ptr seq_file; + unique_ptr rand_file; + Slice result; + char scratch[100]; + + ASSERT_OK(env_->CreateDir("/dir")); + + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); + ASSERT_OK(writable_file->Append("hello ")); + ASSERT_OK(writable_file->Append("world")); + writable_file.reset(); + + // Read sequentially. + ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_)); + ASSERT_OK(seq_file->Read(5, &result, scratch)); // Read "hello". + ASSERT_EQ(0, result.compare("hello")); + ASSERT_OK(seq_file->Skip(1)); + ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Read "world". + ASSERT_EQ(0, result.compare("world")); + ASSERT_OK(seq_file->Read(1000, &result, scratch)); // Try reading past EOF. + ASSERT_EQ(0U, result.size()); + ASSERT_OK(seq_file->Skip(100)); // Try to skip past end of file. + ASSERT_OK(seq_file->Read(1000, &result, scratch)); + ASSERT_EQ(0U, result.size()); + + // Random reads. + ASSERT_OK(env_->NewRandomAccessFile("/dir/f", &rand_file, soptions_)); + ASSERT_OK(rand_file->Read(6, 5, &result, scratch)); // Read "world". + ASSERT_EQ(0, result.compare("world")); + ASSERT_OK(rand_file->Read(0, 5, &result, scratch)); // Read "hello". + ASSERT_EQ(0, result.compare("hello")); + ASSERT_OK(rand_file->Read(10, 100, &result, scratch)); // Read "d". + ASSERT_EQ(0, result.compare("d")); + + // Too high offset. + ASSERT_TRUE(!rand_file->Read(1000, 5, &result, scratch).ok()); +} + +TEST(MockEnvTest, Locks) { + FileLock* lock; + + // These are no-ops, but we test they return success. 
+ ASSERT_OK(env_->LockFile("some file", &lock)); + ASSERT_OK(env_->UnlockFile(lock)); +} + +TEST(MockEnvTest, Misc) { + std::string test_dir; + ASSERT_OK(env_->GetTestDirectory(&test_dir)); + ASSERT_TRUE(!test_dir.empty()); + + unique_ptr writable_file; + ASSERT_OK(env_->NewWritableFile("/a/b", &writable_file, soptions_)); + + // These are no-ops, but we test they return success. + ASSERT_OK(writable_file->Sync()); + ASSERT_OK(writable_file->Flush()); + ASSERT_OK(writable_file->Close()); + writable_file.reset(); +} + +TEST(MockEnvTest, LargeWrite) { + const size_t kWriteSize = 300 * 1024; + char* scratch = new char[kWriteSize * 2]; + + std::string write_data; + for (size_t i = 0; i < kWriteSize; ++i) { + write_data.append(1, static_cast(i)); + } + + unique_ptr writable_file; + ASSERT_OK(env_->NewWritableFile("/dir/f", &writable_file, soptions_)); + ASSERT_OK(writable_file->Append("foo")); + ASSERT_OK(writable_file->Append(write_data)); + writable_file.reset(); + + unique_ptr seq_file; + Slice result; + ASSERT_OK(env_->NewSequentialFile("/dir/f", &seq_file, soptions_)); + ASSERT_OK(seq_file->Read(3, &result, scratch)); // Read "foo". 
+ ASSERT_EQ(0, result.compare("foo")); + + size_t read = 0; + std::string read_data; + while (read < kWriteSize) { + ASSERT_OK(seq_file->Read(kWriteSize - read, &result, scratch)); + read_data.append(result.data(), result.size()); + read += result.size(); + } + ASSERT_TRUE(write_data == read_data); + delete [] scratch; +} + +TEST(MockEnvTest, Corrupt) { + const std::string kGood = "this is a good string, synced to disk"; + const std::string kCorrupted = "this part may be corrupted"; + const std::string kFileName = "/dir/f"; + unique_ptr writable_file; + ASSERT_OK(env_->NewWritableFile(kFileName, &writable_file, soptions_)); + ASSERT_OK(writable_file->Append(kGood)); + ASSERT_TRUE(writable_file->GetFileSize() == kGood.size()); + + std::string scratch; + scratch.resize(kGood.size() + kCorrupted.size() + 16); + Slice result; + unique_ptr rand_file; + ASSERT_OK(env_->NewRandomAccessFile(kFileName, &rand_file, soptions_)); + ASSERT_OK(rand_file->Read(0, kGood.size(), &result, &(scratch[0]))); + ASSERT_EQ(result.compare(kGood), 0); + + // Sync + corrupt => no change + ASSERT_OK(writable_file->Fsync()); + ASSERT_OK(dynamic_cast(env_)->CorruptBuffer(kFileName)); + result.clear(); + ASSERT_OK(rand_file->Read(0, kGood.size(), &result, &(scratch[0]))); + ASSERT_EQ(result.compare(kGood), 0); + + // Add new data and corrupt it + ASSERT_OK(writable_file->Append(kCorrupted)); + ASSERT_TRUE(writable_file->GetFileSize() == kGood.size() + kCorrupted.size()); + result.clear(); + ASSERT_OK(rand_file->Read(kGood.size(), kCorrupted.size(), + &result, &(scratch[0]))); + ASSERT_EQ(result.compare(kCorrupted), 0); + // Corrupted + ASSERT_OK(dynamic_cast(env_)->CorruptBuffer(kFileName)); + result.clear(); + ASSERT_OK(rand_file->Read(kGood.size(), kCorrupted.size(), + &result, &(scratch[0]))); + ASSERT_NE(result.compare(kCorrupted), 0); +} + +TEST(MockEnvTest, DBTest) { + Options options; + options.create_if_missing = true; + options.env = env_; + DB* db; + + const Slice keys[] = 
{Slice("aaa"), Slice("bbb"), Slice("ccc")}; + const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")}; + + ASSERT_OK(DB::Open(options, "/dir/db", &db)); + for (size_t i = 0; i < 3; ++i) { + ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i])); + } + + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + + Iterator* iterator = db->NewIterator(ReadOptions()); + iterator->SeekToFirst(); + for (size_t i = 0; i < 3; ++i) { + ASSERT_TRUE(iterator->Valid()); + ASSERT_TRUE(keys[i] == iterator->key()); + ASSERT_TRUE(vals[i] == iterator->value()); + iterator->Next(); + } + ASSERT_TRUE(!iterator->Valid()); + delete iterator; + + DBImpl* dbi = reinterpret_cast(db); + ASSERT_OK(dbi->TEST_FlushMemTable()); + + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + + delete db; +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} diff --git a/util/murmurhash.h b/util/murmurhash.h index faa86556d..40ee357a7 100644 --- a/util/murmurhash.h +++ b/util/murmurhash.h @@ -36,7 +36,7 @@ typedef unsigned int murmur_t; namespace rocksdb { struct murmur_hash { size_t operator()(const Slice& slice) const { - return MurmurHash(slice.data(), slice.size(), 0); + return MurmurHash(slice.data(), static_cast(slice.size()), 0); } }; } // rocksdb diff --git a/util/mutable_cf_options.cc b/util/mutable_cf_options.cc new file mode 100644 index 000000000..4ec2a4138 --- /dev/null +++ b/util/mutable_cf_options.cc @@ -0,0 +1,137 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include +#include +#include +#include +#include "rocksdb/env.h" +#include "rocksdb/options.h" +#include "rocksdb/immutable_options.h" +#include "util/mutable_cf_options.h" + +namespace rocksdb { + +namespace { +// Multiple two operands. If they overflow, return op1. +uint64_t MultiplyCheckOverflow(uint64_t op1, int op2) { + if (op1 == 0) { + return 0; + } + if (op2 <= 0) { + return op1; + } + uint64_t casted_op2 = (uint64_t) op2; + if (std::numeric_limits::max() / op1 < casted_op2) { + return op1; + } + return op1 * casted_op2; +} +} // anonymous namespace + +void MutableCFOptions::RefreshDerivedOptions( + const ImmutableCFOptions& ioptions) { + max_file_size.resize(ioptions.num_levels); + level_max_bytes.resize(ioptions.num_levels); + for (int i = 0; i < ioptions.num_levels; ++i) { + if (i == 0 && ioptions.compaction_style == kCompactionStyleUniversal) { + max_file_size[i] = ULLONG_MAX; + level_max_bytes[i] = max_bytes_for_level_base; + } else if (i > 1) { + max_file_size[i] = MultiplyCheckOverflow(max_file_size[i - 1], + target_file_size_multiplier); + level_max_bytes[i] = MultiplyCheckOverflow( + MultiplyCheckOverflow(level_max_bytes[i - 1], + max_bytes_for_level_multiplier), + max_bytes_for_level_multiplier_additional[i - 1]); + } else { + max_file_size[i] = target_file_size_base; + level_max_bytes[i] = max_bytes_for_level_base; + } + } +} + +uint64_t MutableCFOptions::MaxFileSizeForLevel(int level) const { + assert(level >= 0); + assert(level < (int)max_file_size.size()); + return max_file_size[level]; +} +uint64_t MutableCFOptions::MaxBytesForLevel(int level) const { + // Note: the result for level zero is not really used since we set + // the level-0 compaction threshold based on number of files. 
+ assert(level >= 0); + assert(level < (int)level_max_bytes.size()); + return level_max_bytes[level]; +} +uint64_t MutableCFOptions::MaxGrandParentOverlapBytes(int level) const { + return MaxFileSizeForLevel(level) * max_grandparent_overlap_factor; +} +uint64_t MutableCFOptions::ExpandedCompactionByteSizeLimit(int level) const { + return MaxFileSizeForLevel(level) * expanded_compaction_factor; +} + +void MutableCFOptions::Dump(Logger* log) const { + // Memtable related options + Log(log, " write_buffer_size: %zu", write_buffer_size); + Log(log, " max_write_buffer_number: %d", + max_write_buffer_number); + Log(log, " arena_block_size: %zu", arena_block_size); + Log(log, " memtable_prefix_bloom_bits: %" PRIu32, + memtable_prefix_bloom_bits); + Log(log, " memtable_prefix_bloom_probes: %" PRIu32, + memtable_prefix_bloom_probes); + Log(log, " memtable_prefix_bloom_huge_page_tlb_size: %zu", + memtable_prefix_bloom_huge_page_tlb_size); + Log(log, " max_successive_merges: %zu", + max_successive_merges); + Log(log, " filter_deletes: %d", + filter_deletes); + Log(log, " disable_auto_compactions: %d", + disable_auto_compactions); + Log(log, " soft_rate_limit: %lf", + soft_rate_limit); + Log(log, " hard_rate_limit: %lf", + hard_rate_limit); + Log(log, " level0_file_num_compaction_trigger: %d", + level0_file_num_compaction_trigger); + Log(log, " level0_slowdown_writes_trigger: %d", + level0_slowdown_writes_trigger); + Log(log, " level0_stop_writes_trigger: %d", + level0_stop_writes_trigger); + Log(log, " max_grandparent_overlap_factor: %d", + max_grandparent_overlap_factor); + Log(log, " expanded_compaction_factor: %d", + expanded_compaction_factor); + Log(log, " source_compaction_factor: %d", + source_compaction_factor); + Log(log, " target_file_size_base: %" PRIu64, + target_file_size_base); + Log(log, " target_file_size_multiplier: %d", + target_file_size_multiplier); + Log(log, " max_bytes_for_level_base: %" PRIu64, + max_bytes_for_level_base); + Log(log, " 
max_bytes_for_level_multiplier: %d", + max_bytes_for_level_multiplier); + std::string result; + char buf[10]; + for (const auto m : max_bytes_for_level_multiplier_additional) { + snprintf(buf, sizeof(buf), "%d, ", m); + result += buf; + } + result.resize(result.size() - 2); + Log(log, "max_bytes_for_level_multiplier_additional: %s", result.c_str()); + Log(log, " max_mem_compaction_level: %d", + max_mem_compaction_level); + Log(log, " verify_checksums_in_compaction: %d", + verify_checksums_in_compaction); + Log(log, " max_sequential_skip_in_iterations: %" PRIu64, + max_sequential_skip_in_iterations); +} + +} // namespace rocksdb diff --git a/util/mutable_cf_options.h b/util/mutable_cf_options.h new file mode 100644 index 000000000..9f876ace0 --- /dev/null +++ b/util/mutable_cf_options.h @@ -0,0 +1,131 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#pragma once + +#include +#include "rocksdb/options.h" +#include "rocksdb/immutable_options.h" + +namespace rocksdb { + +struct MutableCFOptions { + MutableCFOptions(const Options& options, const ImmutableCFOptions& ioptions) + : write_buffer_size(options.write_buffer_size), + max_write_buffer_number(options.max_write_buffer_number), + arena_block_size(options.arena_block_size), + memtable_prefix_bloom_bits(options.memtable_prefix_bloom_bits), + memtable_prefix_bloom_probes(options.memtable_prefix_bloom_probes), + memtable_prefix_bloom_huge_page_tlb_size( + options.memtable_prefix_bloom_huge_page_tlb_size), + max_successive_merges(options.max_successive_merges), + filter_deletes(options.filter_deletes), + inplace_update_num_locks(options.inplace_update_num_locks), + disable_auto_compactions(options.disable_auto_compactions), + soft_rate_limit(options.soft_rate_limit), + hard_rate_limit(options.hard_rate_limit), + level0_file_num_compaction_trigger( + options.level0_file_num_compaction_trigger), + level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger), + level0_stop_writes_trigger(options.level0_stop_writes_trigger), + max_grandparent_overlap_factor(options.max_grandparent_overlap_factor), + expanded_compaction_factor(options.expanded_compaction_factor), + source_compaction_factor(options.source_compaction_factor), + target_file_size_base(options.target_file_size_base), + target_file_size_multiplier(options.target_file_size_multiplier), + max_bytes_for_level_base(options.max_bytes_for_level_base), + max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier), + max_bytes_for_level_multiplier_additional( + options.max_bytes_for_level_multiplier_additional), + max_mem_compaction_level(options.max_mem_compaction_level), + verify_checksums_in_compaction(options.verify_checksums_in_compaction), + max_sequential_skip_in_iterations( + options.max_sequential_skip_in_iterations) + { + RefreshDerivedOptions(ioptions); + } + MutableCFOptions() + : 
write_buffer_size(0), + max_write_buffer_number(0), + arena_block_size(0), + memtable_prefix_bloom_bits(0), + memtable_prefix_bloom_probes(0), + memtable_prefix_bloom_huge_page_tlb_size(0), + max_successive_merges(0), + filter_deletes(false), + inplace_update_num_locks(0), + disable_auto_compactions(false), + soft_rate_limit(0), + hard_rate_limit(0), + level0_file_num_compaction_trigger(0), + level0_slowdown_writes_trigger(0), + level0_stop_writes_trigger(0), + max_grandparent_overlap_factor(0), + expanded_compaction_factor(0), + source_compaction_factor(0), + target_file_size_base(0), + target_file_size_multiplier(0), + max_bytes_for_level_base(0), + max_bytes_for_level_multiplier(0), + max_mem_compaction_level(0), + verify_checksums_in_compaction(false), + max_sequential_skip_in_iterations(0) + {} + + // Must be called after any change to MutableCFOptions + void RefreshDerivedOptions(const ImmutableCFOptions& ioptions); + + // Get the max file size in a given level. + uint64_t MaxFileSizeForLevel(int level) const; + // Returns maximum total bytes of data on a given level. + uint64_t MaxBytesForLevel(int level) const; + // Returns maximum total overlap bytes with grandparent + // level (i.e., level+2) before we stop building a single + // file in level->level+1 compaction. 
+ uint64_t MaxGrandParentOverlapBytes(int level) const; + uint64_t ExpandedCompactionByteSizeLimit(int level) const; + + void Dump(Logger* log) const; + + // Memtable related options + size_t write_buffer_size; + int max_write_buffer_number; + size_t arena_block_size; + uint32_t memtable_prefix_bloom_bits; + uint32_t memtable_prefix_bloom_probes; + size_t memtable_prefix_bloom_huge_page_tlb_size; + size_t max_successive_merges; + bool filter_deletes; + size_t inplace_update_num_locks; + + // Compaction related options + bool disable_auto_compactions; + double soft_rate_limit; + double hard_rate_limit; + int level0_file_num_compaction_trigger; + int level0_slowdown_writes_trigger; + int level0_stop_writes_trigger; + int max_grandparent_overlap_factor; + int expanded_compaction_factor; + int source_compaction_factor; + uint64_t target_file_size_base; + int target_file_size_multiplier; + uint64_t max_bytes_for_level_base; + int max_bytes_for_level_multiplier; + std::vector max_bytes_for_level_multiplier_additional; + int max_mem_compaction_level; + bool verify_checksums_in_compaction; + + // Misc options + uint64_t max_sequential_skip_in_iterations; + + // Derived options + // Per-level target file size. + std::vector max_file_size; + // Per-level max bytes + std::vector level_max_bytes; +}; + +} // namespace rocksdb diff --git a/util/options.cc b/util/options.cc index b16c6f2f5..fbfa74ccc 100644 --- a/util/options.cc +++ b/util/options.cc @@ -8,11 +8,16 @@ // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "rocksdb/options.h" +#include "rocksdb/immutable_options.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include +#include "db/writebuffer.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/comparator.h" @@ -28,6 +33,45 @@ namespace rocksdb { +ImmutableCFOptions::ImmutableCFOptions(const Options& options) + : compaction_style(options.compaction_style), + compaction_options_universal(options.compaction_options_universal), + compaction_options_fifo(options.compaction_options_fifo), + prefix_extractor(options.prefix_extractor.get()), + comparator(options.comparator), + merge_operator(options.merge_operator.get()), + compaction_filter(options.compaction_filter), + compaction_filter_factory(options.compaction_filter_factory.get()), + compaction_filter_factory_v2(options.compaction_filter_factory_v2.get()), + inplace_update_support(options.inplace_update_support), + inplace_callback(options.inplace_callback), + info_log(options.info_log.get()), + statistics(options.statistics.get()), + env(options.env), + allow_mmap_reads(options.allow_mmap_reads), + allow_mmap_writes(options.allow_mmap_writes), + db_paths(options.db_paths), + memtable_factory(options.memtable_factory.get()), + table_factory(options.table_factory.get()), + table_properties_collector_factories( + options.table_properties_collector_factories), + advise_random_on_open(options.advise_random_on_open), + bloom_locality(options.bloom_locality), + purge_redundant_kvs_while_flush(options.purge_redundant_kvs_while_flush), + min_partial_merge_operands(options.min_partial_merge_operands), + disable_data_sync(options.disableDataSync), + use_fsync(options.use_fsync), + compression(options.compression), + compression_per_level(options.compression_per_level), + compression_opts(options.compression_opts), + access_hint_on_compaction_start(options.access_hint_on_compaction_start), + num_levels(options.num_levels) +#ifndef ROCKSDB_LITE + , 
listeners(options.listeners) {} +#else // ROCKSDB_LITE + {} +#endif // ROCKSDB_LITE + ColumnFamilyOptions::ColumnFamilyOptions() : comparator(BytewiseComparator()), merge_operator(nullptr), @@ -74,7 +118,12 @@ ColumnFamilyOptions::ColumnFamilyOptions() memtable_prefix_bloom_huge_page_tlb_size(0), bloom_locality(0), max_successive_merges(0), - min_partial_merge_operands(2) { + min_partial_merge_operands(2) +#ifndef ROCKSDB_LITE + , listeners() { +#else // ROCKSDB_LITE + { +#endif // ROCKSDB_LITE assert(memtable_factory.get() != nullptr); } @@ -134,7 +183,12 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options) options.memtable_prefix_bloom_huge_page_tlb_size), bloom_locality(options.bloom_locality), max_successive_merges(options.max_successive_merges), - min_partial_merge_operands(options.min_partial_merge_operands) { + min_partial_merge_operands(options.min_partial_merge_operands) +#ifndef ROCKSDB_LITE + , listeners(options.listeners) { +#else // ROCKSDB_LITE + { +#endif // ROCKSDB_LITE assert(memtable_factory.get() != nullptr); if (max_bytes_for_level_multiplier_additional.size() < static_cast(num_levels)) { @@ -150,7 +204,11 @@ DBOptions::DBOptions() env(Env::Default()), rate_limiter(nullptr), info_log(nullptr), +#ifdef NDEBUG info_log_level(INFO_LEVEL), +#else + info_log_level(DEBUG_LEVEL), +#endif // NDEBUG max_open_files(5000), max_total_wal_size(0), statistics(nullptr), @@ -177,10 +235,11 @@ DBOptions::DBOptions() skip_log_error_on_recovery(false), stats_dump_period_sec(3600), advise_random_on_open(true), + db_write_buffer_size(0), access_hint_on_compaction_start(NORMAL), use_adaptive_mutex(false), - allow_thread_local(true), - bytes_per_sync(0) {} + bytes_per_sync(0), + enable_thread_tracking(false) {} DBOptions::DBOptions(const Options& options) : create_if_missing(options.create_if_missing), @@ -220,10 +279,11 @@ DBOptions::DBOptions(const Options& options) skip_log_error_on_recovery(options.skip_log_error_on_recovery), 
stats_dump_period_sec(options.stats_dump_period_sec), advise_random_on_open(options.advise_random_on_open), + db_write_buffer_size(options.db_write_buffer_size), access_hint_on_compaction_start(options.access_hint_on_compaction_start), use_adaptive_mutex(options.use_adaptive_mutex), - allow_thread_local(options.allow_thread_local), - bytes_per_sync(options.bytes_per_sync) {} + bytes_per_sync(options.bytes_per_sync), + enable_thread_tracking(options.enable_thread_tracking) {} static const char* const access_hints[] = { "NONE", "NORMAL", "SEQUENTIAL", "WILLNEED" @@ -240,8 +300,8 @@ void DBOptions::Dump(Logger* log) const { Log(log, " Options.disableDataSync: %d", disableDataSync); Log(log, " Options.use_fsync: %d", use_fsync); Log(log, " Options.max_log_file_size: %zu", max_log_file_size); - Log(log, "Options.max_manifest_file_size: %lu", - (unsigned long)max_manifest_file_size); + Log(log, "Options.max_manifest_file_size: %" PRIu64, + max_manifest_file_size); Log(log, " Options.log_file_time_to_roll: %zu", log_file_time_to_roll); Log(log, " Options.keep_log_file_num: %zu", keep_log_file_num); Log(log, " Options.allow_os_buffer: %d", allow_os_buffer); @@ -257,16 +317,16 @@ void DBOptions::Dump(Logger* log) const { table_cache_numshardbits); Log(log, " Options.table_cache_remove_scan_count_limit: %d", table_cache_remove_scan_count_limit); - Log(log, " Options.delete_obsolete_files_period_micros: %lu", - (unsigned long)delete_obsolete_files_period_micros); + Log(log, " Options.delete_obsolete_files_period_micros: %" PRIu64, + delete_obsolete_files_period_micros); Log(log, " Options.max_background_compactions: %d", max_background_compactions); Log(log, " Options.max_background_flushes: %d", max_background_flushes); - Log(log, " Options.WAL_ttl_seconds: %lu", - (unsigned long)WAL_ttl_seconds); - Log(log, " Options.WAL_size_limit_MB: %lu", - (unsigned long)WAL_size_limit_MB); + Log(log, " Options.WAL_ttl_seconds: %" PRIu64, + WAL_ttl_seconds); + Log(log, " 
Options.WAL_size_limit_MB: %" PRIu64, + WAL_size_limit_MB); Log(log, " Options.manifest_preallocation_size: %zu", manifest_preallocation_size); Log(log, " Options.allow_os_buffer: %d", @@ -277,20 +337,22 @@ void DBOptions::Dump(Logger* log) const { allow_mmap_writes); Log(log, " Options.is_fd_close_on_exec: %d", is_fd_close_on_exec); - Log(log, " Options.skip_log_error_on_recovery: %d", - skip_log_error_on_recovery); Log(log, " Options.stats_dump_period_sec: %u", stats_dump_period_sec); Log(log, " Options.advise_random_on_open: %d", advise_random_on_open); + Log(log, " Options.db_write_buffer_size: %zd", + db_write_buffer_size); Log(log, " Options.access_hint_on_compaction_start: %s", access_hints[access_hint_on_compaction_start]); Log(log, " Options.use_adaptive_mutex: %d", use_adaptive_mutex); Log(log, " Options.rate_limiter: %p", rate_limiter.get()); - Log(log, " Options.bytes_per_sync: %lu", - (unsigned long)bytes_per_sync); + Log(log, " Options.bytes_per_sync: %" PRIu64, + bytes_per_sync); + Log(log, " enable_thread_tracking: %d", + enable_thread_tracking); } // DBOptions::Dump void ColumnFamilyOptions::Dump(Logger* log) const { @@ -338,20 +400,20 @@ void ColumnFamilyOptions::Dump(Logger* log) const { level0_stop_writes_trigger); Log(log," Options.max_mem_compaction_level: %d", max_mem_compaction_level); - Log(log," Options.target_file_size_base: %d", + Log(log," Options.target_file_size_base: %" PRIu64, target_file_size_base); Log(log," Options.target_file_size_multiplier: %d", target_file_size_multiplier); - Log(log," Options.max_bytes_for_level_base: %lu", - (unsigned long)max_bytes_for_level_base); + Log(log," Options.max_bytes_for_level_base: %" PRIu64, + max_bytes_for_level_base); Log(log," Options.max_bytes_for_level_multiplier: %d", max_bytes_for_level_multiplier); for (int i = 0; i < num_levels; i++) { Log(log,"Options.max_bytes_for_level_multiplier_addtl[%d]: %d", i, max_bytes_for_level_multiplier_additional[i]); } - Log(log," 
Options.max_sequential_skip_in_iterations: %lu", - (unsigned long)max_sequential_skip_in_iterations); + Log(log," Options.max_sequential_skip_in_iterations: %" PRIu64, + max_sequential_skip_in_iterations); Log(log," Options.expanded_compaction_factor: %d", expanded_compaction_factor); Log(log," Options.source_compaction_factor: %d", @@ -386,7 +448,7 @@ void ColumnFamilyOptions::Dump(Logger* log) const { "max_size_amplification_percent: %u", compaction_options_universal.max_size_amplification_percent); Log(log, - "Options.compaction_options_universal.compression_size_percent: %u", + "Options.compaction_options_universal.compression_size_percent: %d", compaction_options_universal.compression_size_percent); Log(log, "Options.compaction_options_fifo.max_table_files_size: %" PRIu64, compaction_options_fifo.max_table_files_size); @@ -447,6 +509,15 @@ Options::PrepareForBulkLoad() // increasing the total time needed for compactions. num_levels = 2; + // Need to allow more write buffers to allow more parallelism + // of flushes. + max_write_buffer_number = 6; + min_write_buffer_number_to_merge = 1; + + // When compaction is disabled, more parallel flush threads can + // help with write throughput. + max_background_flushes = 4; + // Prevent a memtable flush to automatically promote files // to L1. This is helpful so that all files that are // input to the manual compaction are all at L0. 
@@ -457,6 +528,7 @@ Options::PrepareForBulkLoad() return this; } +#ifndef ROCKSDB_LITE // Optimization functions ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup( uint64_t block_cache_size_mb) { @@ -465,17 +537,15 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeForPointLookup( block_based_options.index_type = BlockBasedTableOptions::kHashSearch; block_based_options.filter_policy.reset(NewBloomFilterPolicy(10)); block_based_options.block_cache = - NewLRUCache(block_cache_size_mb * 1024 * 1024); + NewLRUCache(static_cast(block_cache_size_mb * 1024 * 1024)); table_factory.reset(new BlockBasedTableFactory(block_based_options)); -#ifndef ROCKSDB_LITE memtable_factory.reset(NewHashLinkListRepFactory()); -#endif return this; } ColumnFamilyOptions* ColumnFamilyOptions::OptimizeLevelStyleCompaction( uint64_t memtable_memory_budget) { - write_buffer_size = memtable_memory_budget / 4; + write_buffer_size = static_cast(memtable_memory_budget / 4); // merge two memtables when flushing to L0 min_write_buffer_number_to_merge = 2; // this means we'll use 50% extra memory in the worst case, but will reduce @@ -507,7 +577,7 @@ ColumnFamilyOptions* ColumnFamilyOptions::OptimizeLevelStyleCompaction( ColumnFamilyOptions* ColumnFamilyOptions::OptimizeUniversalStyleCompaction( uint64_t memtable_memory_budget) { - write_buffer_size = memtable_memory_budget / 4; + write_buffer_size = static_cast(memtable_memory_budget / 4); // merge two memtables when flushing to L0 min_write_buffer_number_to_merge = 2; // this means we'll use 50% extra memory in the worst case, but will reduce @@ -526,5 +596,6 @@ DBOptions* DBOptions::IncreaseParallelism(int total_threads) { env->SetBackgroundThreads(1, Env::HIGH); return this; } +#endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/util/options_builder.cc b/util/options_builder.cc index 06ce670f0..3ac3debd7 100644 --- a/util/options_builder.cc +++ b/util/options_builder.cc @@ -20,6 +20,7 @@ CompactionStyle 
PickCompactionStyle(size_t write_buffer_size, int read_amp_threshold, int write_amp_threshold, uint64_t target_db_size) { +#ifndef ROCKSDB_LITE // Estimate read amplification and write amplification of two compaction // styles. If there is hard limit to force a choice, make the choice. // Otherwise, calculate a score based on threshold and expected value of @@ -70,6 +71,9 @@ CompactionStyle PickCompactionStyle(size_t write_buffer_size, } else { return kCompactionStyleUniversal; } +#else + return kCompactionStyleLevel; +#endif // !ROCKSDB_LITE } // Pick mem table size @@ -95,16 +99,18 @@ void PickWriteBufferSize(size_t total_write_buffer_limit, Options* options) { options->write_buffer_size = write_buffer_size; options->max_write_buffer_number = - total_write_buffer_limit / write_buffer_size; + static_cast(total_write_buffer_limit / write_buffer_size); options->min_write_buffer_number_to_merge = 1; } +#ifndef ROCKSDB_LITE void OptimizeForUniversal(Options* options) { options->level0_file_num_compaction_trigger = 2; options->level0_slowdown_writes_trigger = 30; options->level0_stop_writes_trigger = 40; options->max_open_files = -1; } +#endif // Optimize parameters for level-based compaction void OptimizeForLevel(int read_amplification_threshold, @@ -147,10 +153,10 @@ void OptimizeForLevel(int read_amplification_threshold, // This doesn't consider compaction and overheads of mem tables. But usually // it is in the same order of magnitude. - int expected_level0_compaction_size = + size_t expected_level0_compaction_size = options->level0_file_num_compaction_trigger * options->write_buffer_size; // Enlarge level1 target file size if level0 compaction size is larger. 
- int max_bytes_for_level_base = 10 * kBytesForOneMb; + uint64_t max_bytes_for_level_base = 10 * kBytesForOneMb; if (expected_level0_compaction_size > max_bytes_for_level_base) { max_bytes_for_level_base = expected_level0_compaction_size; } @@ -158,9 +164,9 @@ void OptimizeForLevel(int read_amplification_threshold, // Now always set level multiplier to be 10 options->max_bytes_for_level_multiplier = kBytesForLevelMultiplier; - const int kMinFileSize = 2 * kBytesForOneMb; + const uint64_t kMinFileSize = 2 * kBytesForOneMb; // Allow at least 3-way parallelism for compaction between level 1 and 2. - int max_file_size = max_bytes_for_level_base / 3; + uint64_t max_file_size = max_bytes_for_level_base / 3; if (max_file_size < kMinFileSize) { options->target_file_size_base = kMinFileSize; } else { @@ -184,9 +190,13 @@ Options GetOptions(size_t total_write_buffer_limit, options.compaction_style = PickCompactionStyle(write_buffer_size, read_amplification_threshold, write_amplification_threshold, target_db_size); +#ifndef ROCKSDB_LITE if (options.compaction_style == kCompactionStyleUniversal) { OptimizeForUniversal(&options); } else { +#else + { +#endif // !ROCKSDB_LITE OptimizeForLevel(read_amplification_threshold, write_amplification_threshold, target_db_size, &options); } diff --git a/util/options_helper.cc b/util/options_helper.cc new file mode 100644 index 000000000..d720a91e6 --- /dev/null +++ b/util/options_helper.cc @@ -0,0 +1,659 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include +#include +#include +#include "rocksdb/cache.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/table.h" +#include "rocksdb/utilities/convenience.h" +#include "util/options_helper.h" + +namespace rocksdb { + +#ifndef ROCKSDB_LITE + +namespace { +CompressionType ParseCompressionType(const std::string& type) { + if (type == "kNoCompression") { + return kNoCompression; + } else if (type == "kSnappyCompression") { + return kSnappyCompression; + } else if (type == "kZlibCompression") { + return kZlibCompression; + } else if (type == "kBZip2Compression") { + return kBZip2Compression; + } else if (type == "kLZ4Compression") { + return kLZ4Compression; + } else if (type == "kLZ4HCCompression") { + return kLZ4HCCompression; + } else { + throw std::invalid_argument("Unknown compression type: " + type); + } + return kNoCompression; +} + +BlockBasedTableOptions::IndexType ParseBlockBasedTableIndexType( + const std::string& type) { + if (type == "kBinarySearch") { + return BlockBasedTableOptions::kBinarySearch; + } else if (type == "kHashSearch") { + return BlockBasedTableOptions::kHashSearch; + } + throw std::invalid_argument("Unknown index type: " + type); +} + +ChecksumType ParseBlockBasedTableChecksumType( + const std::string& type) { + if (type == "kNoChecksum") { + return kNoChecksum; + } else if (type == "kCRC32c") { + return kCRC32c; + } else if (type == "kxxHash") { + return kxxHash; + } + throw std::invalid_argument("Unknown checksum type: " + type); +} + +bool ParseBoolean(const std::string& type, const std::string& value) { + if (value == "true" || value == "1") { + return true; + } else if (value == "false" || value == "0") { + return false; + } + throw std::invalid_argument(type); +} + +uint64_t ParseUint64(const std::string& value) { + size_t endchar; + uint64_t num = std::stoull(value.c_str(), &endchar); + + if (endchar < value.length()) { + char c = value[endchar]; + if (c == 
'k' || c == 'K') + num <<= 10LL; + else if (c == 'm' || c == 'M') + num <<= 20LL; + else if (c == 'g' || c == 'G') + num <<= 30LL; + else if (c == 't' || c == 'T') + num <<= 40LL; + } + + return num; +} + +size_t ParseSizeT(const std::string& value) { + return static_cast(ParseUint64(value)); +} + +uint32_t ParseUint32(const std::string& value) { + uint64_t num = ParseUint64(value); + if ((num >> 32LL) == 0) { + return static_cast(num); + } else { + throw std::out_of_range(value); + } +} + +int ParseInt(const std::string& value) { + size_t endchar; + int num = std::stoi(value.c_str(), &endchar); + + if (endchar < value.length()) { + char c = value[endchar]; + if (c == 'k' || c == 'K') + num <<= 10; + else if (c == 'm' || c == 'M') + num <<= 20; + else if (c == 'g' || c == 'G') + num <<= 30; + } + + return num; +} + +double ParseDouble(const std::string& value) { + return std::stod(value); +} + +CompactionStyle ParseCompactionStyle(const std::string& type) { + if (type == "kCompactionStyleLevel") { + return kCompactionStyleLevel; + } else if (type == "kCompactionStyleUniversal") { + return kCompactionStyleUniversal; + } else if (type == "kCompactionStyleFIFO") { + return kCompactionStyleFIFO; + } else { + throw std::invalid_argument("unknown compaction style: " + type); + } + return kCompactionStyleLevel; +} +} // anonymous namespace + +template +bool ParseMemtableOptions(const std::string& name, const std::string& value, + OptionsType* new_options) { + if (name == "write_buffer_size") { + new_options->write_buffer_size = ParseSizeT(value); + } else if (name == "arena_block_size") { + new_options->arena_block_size = ParseSizeT(value); + } else if (name == "memtable_prefix_bloom_bits") { + new_options->memtable_prefix_bloom_bits = ParseUint32(value); + } else if (name == "memtable_prefix_bloom_probes") { + new_options->memtable_prefix_bloom_probes = ParseUint32(value); + } else if (name == "memtable_prefix_bloom_huge_page_tlb_size") { + 
new_options->memtable_prefix_bloom_huge_page_tlb_size = + ParseSizeT(value); + } else if (name == "max_successive_merges") { + new_options->max_successive_merges = ParseSizeT(value); + } else if (name == "filter_deletes") { + new_options->filter_deletes = ParseBoolean(name, value); + } else if (name == "max_write_buffer_number") { + new_options->max_write_buffer_number = ParseInt(value); + } else if (name == "inplace_update_num_locks") { + new_options->inplace_update_num_locks = ParseSizeT(value); + } else { + return false; + } + return true; +} + +template +bool ParseCompactionOptions(const std::string& name, const std::string& value, + OptionsType* new_options) { + if (name == "disable_auto_compactions") { + new_options->disable_auto_compactions = ParseBoolean(name, value); + } else if (name == "soft_rate_limit") { + new_options->soft_rate_limit = ParseDouble(value); + } else if (name == "hard_rate_limit") { + new_options->hard_rate_limit = ParseDouble(value); + } else if (name == "level0_file_num_compaction_trigger") { + new_options->level0_file_num_compaction_trigger = ParseInt(value); + } else if (name == "level0_slowdown_writes_trigger") { + new_options->level0_slowdown_writes_trigger = ParseInt(value); + } else if (name == "level0_stop_writes_trigger") { + new_options->level0_stop_writes_trigger = ParseInt(value); + } else if (name == "max_grandparent_overlap_factor") { + new_options->max_grandparent_overlap_factor = ParseInt(value); + } else if (name == "expanded_compaction_factor") { + new_options->expanded_compaction_factor = ParseInt(value); + } else if (name == "source_compaction_factor") { + new_options->source_compaction_factor = ParseInt(value); + } else if (name == "target_file_size_base") { + new_options->target_file_size_base = ParseInt(value); + } else if (name == "target_file_size_multiplier") { + new_options->target_file_size_multiplier = ParseInt(value); + } else if (name == "max_bytes_for_level_base") { + new_options->max_bytes_for_level_base 
= ParseUint64(value); + } else if (name == "max_bytes_for_level_multiplier") { + new_options->max_bytes_for_level_multiplier = ParseInt(value); + } else if (name == "max_bytes_for_level_multiplier_additional") { + new_options->max_bytes_for_level_multiplier_additional.clear(); + size_t start = 0; + while (true) { + size_t end = value.find(':', start); + if (end == std::string::npos) { + new_options->max_bytes_for_level_multiplier_additional.push_back( + ParseInt(value.substr(start))); + break; + } else { + new_options->max_bytes_for_level_multiplier_additional.push_back( + ParseInt(value.substr(start, end - start))); + start = end + 1; + } + } + } else if (name == "max_mem_compaction_level") { + new_options->max_mem_compaction_level = ParseInt(value); + } else if (name == "verify_checksums_in_compaction") { + new_options->verify_checksums_in_compaction = ParseBoolean(name, value); + } else { + return false; + } + return true; +} + +template +bool ParseMiscOptions(const std::string& name, const std::string& value, + OptionsType* new_options) { + if (name == "max_sequential_skip_in_iterations") { + new_options->max_sequential_skip_in_iterations = ParseUint64(value); + } else { + return false; + } + return true; +} + +Status GetMutableOptionsFromStrings( + const MutableCFOptions& base_options, + const std::unordered_map& options_map, + MutableCFOptions* new_options) { + assert(new_options); + *new_options = base_options; + for (const auto& o : options_map) { + try { + if (ParseMemtableOptions(o.first, o.second, new_options)) { + } else if (ParseCompactionOptions(o.first, o.second, new_options)) { + } else if (ParseMiscOptions(o.first, o.second, new_options)) { + } else { + return Status::InvalidArgument( + "unsupported dynamic option: " + o.first); + } + } catch (std::exception& e) { + return Status::InvalidArgument("error parsing " + o.first + ":" + + std::string(e.what())); + } + } + return Status::OK(); +} + +namespace { + +std::string trim(const std::string& str) 
{ + size_t start = 0; + size_t end = str.size() - 1; + while (isspace(str[start]) != 0 && start <= end) { + ++start; + } + while (isspace(str[end]) != 0 && start <= end) { + --end; + } + if (start <= end) { + return str.substr(start, end - start + 1); + } + return std::string(); +} + +} // anonymous namespace + +Status StringToMap(const std::string& opts_str, + std::unordered_map* opts_map) { + assert(opts_map); + // Example: + // opts_str = "write_buffer_size=1024;max_write_buffer_number=2;" + // "nested_opt={opt1=1;opt2=2};max_bytes_for_level_base=100" + size_t pos = 0; + std::string opts = trim(opts_str); + while (pos < opts.size()) { + size_t eq_pos = opts.find('=', pos); + if (eq_pos == std::string::npos) { + return Status::InvalidArgument("Mismatched key value pair, '=' expected"); + } + std::string key = trim(opts.substr(pos, eq_pos - pos)); + if (key.empty()) { + return Status::InvalidArgument("Empty key found"); + } + + // skip space after '=' and look for '{' for possible nested options + pos = eq_pos + 1; + while (pos < opts.size() && isspace(opts[pos])) { + ++pos; + } + // Empty value at the end + if (pos >= opts.size()) { + (*opts_map)[key] = ""; + break; + } + if (opts[pos] == '{') { + int count = 1; + size_t brace_pos = pos + 1; + while (brace_pos < opts.size()) { + if (opts[brace_pos] == '{') { + ++count; + } else if (opts[brace_pos] == '}') { + --count; + if (count == 0) { + break; + } + } + ++brace_pos; + } + // found the matching closing brace + if (count == 0) { + (*opts_map)[key] = trim(opts.substr(pos + 1, brace_pos - pos - 1)); + // skip all whitespace and move to the next ';' + // brace_pos points to the next position after the matching '}' + pos = brace_pos + 1; + while (pos < opts.size() && isspace(opts[pos])) { + ++pos; + } + if (pos < opts.size() && opts[pos] != ';') { + return Status::InvalidArgument( + "Unexpected chars after nested options"); + } + ++pos; + } else { + return Status::InvalidArgument( + "Mismatched curly braces for 
nested options"); + } + } else { + size_t sc_pos = opts.find(';', pos); + if (sc_pos == std::string::npos) { + (*opts_map)[key] = trim(opts.substr(pos)); + // It either ends with a trailing semi-colon or the last key-value pair + break; + } else { + (*opts_map)[key] = trim(opts.substr(pos, sc_pos - pos)); + } + pos = sc_pos + 1; + } + } + + return Status::OK(); +} + + +Status GetBlockBasedTableOptionsFromMap( + const BlockBasedTableOptions& table_options, + const std::unordered_map& opts_map, + BlockBasedTableOptions* new_table_options) { + + assert(new_table_options); + *new_table_options = table_options; + for (const auto& o : opts_map) { + try { + if (o.first == "cache_index_and_filter_blocks") { + new_table_options->cache_index_and_filter_blocks = + ParseBoolean(o.first, o.second); + } else if (o.first == "index_type") { + new_table_options->index_type = ParseBlockBasedTableIndexType(o.second); + } else if (o.first == "hash_index_allow_collision") { + new_table_options->hash_index_allow_collision = + ParseBoolean(o.first, o.second); + } else if (o.first == "checksum") { + new_table_options->checksum = + ParseBlockBasedTableChecksumType(o.second); + } else if (o.first == "no_block_cache") { + new_table_options->no_block_cache = ParseBoolean(o.first, o.second); + } else if (o.first == "block_cache") { + new_table_options->block_cache = NewLRUCache(ParseSizeT(o.second)); + } else if (o.first == "block_cache_compressed") { + new_table_options->block_cache_compressed = + NewLRUCache(ParseSizeT(o.second)); + } else if (o.first == "block_size") { + new_table_options->block_size = ParseSizeT(o.second); + } else if (o.first == "block_size_deviation") { + new_table_options->block_size_deviation = ParseInt(o.second); + } else if (o.first == "block_restart_interval") { + new_table_options->block_restart_interval = ParseInt(o.second); + } else if (o.first == "filter_policy") { + // Expect the following format + // bloomfilter:int:bool + const std::string kName = 
"bloomfilter:"; + if (o.second.compare(0, kName.size(), kName) != 0) { + return Status::InvalidArgument("Invalid filter policy name"); + } + size_t pos = o.second.find(':', kName.size()); + if (pos == std::string::npos) { + return Status::InvalidArgument("Invalid filter policy config, " + "missing bits_per_key"); + } + int bits_per_key = ParseInt( + trim(o.second.substr(kName.size(), pos - kName.size()))); + bool use_block_based_builder = + ParseBoolean("use_block_based_builder", + trim(o.second.substr(pos + 1))); + new_table_options->filter_policy.reset( + NewBloomFilterPolicy(bits_per_key, use_block_based_builder)); + } else if (o.first == "whole_key_filtering") { + new_table_options->whole_key_filtering = + ParseBoolean(o.first, o.second); + } else { + return Status::InvalidArgument("Unrecognized option: " + o.first); + } + } catch (std::exception& e) { + return Status::InvalidArgument("error parsing " + o.first + ":" + + std::string(e.what())); + } + } + return Status::OK(); +} + +Status GetBlockBasedTableOptionsFromString( + const BlockBasedTableOptions& table_options, + const std::string& opts_str, + BlockBasedTableOptions* new_table_options) { + std::unordered_map opts_map; + Status s = StringToMap(opts_str, &opts_map); + if (!s.ok()) { + return s; + } + return GetBlockBasedTableOptionsFromMap(table_options, opts_map, + new_table_options); +} + +Status GetColumnFamilyOptionsFromMap( + const ColumnFamilyOptions& base_options, + const std::unordered_map& opts_map, + ColumnFamilyOptions* new_options) { + assert(new_options); + *new_options = base_options; + for (const auto& o : opts_map) { + try { + if (ParseMemtableOptions(o.first, o.second, new_options)) { + } else if (ParseCompactionOptions(o.first, o.second, new_options)) { + } else if (ParseMiscOptions(o.first, o.second, new_options)) { + } else if (o.first == "block_based_table_factory") { + // Nested options + BlockBasedTableOptions table_opt; + Status table_opt_s = GetBlockBasedTableOptionsFromString( + 
BlockBasedTableOptions(), o.second, &table_opt); + if (!table_opt_s.ok()) { + return table_opt_s; + } + new_options->table_factory.reset(NewBlockBasedTableFactory(table_opt)); + } else if (o.first == "min_write_buffer_number_to_merge") { + new_options->min_write_buffer_number_to_merge = ParseInt(o.second); + } else if (o.first == "compression") { + new_options->compression = ParseCompressionType(o.second); + } else if (o.first == "compression_per_level") { + new_options->compression_per_level.clear(); + size_t start = 0; + while (true) { + size_t end = o.second.find(':', start); + if (end == std::string::npos) { + new_options->compression_per_level.push_back( + ParseCompressionType(o.second.substr(start))); + break; + } else { + new_options->compression_per_level.push_back( + ParseCompressionType(o.second.substr(start, end - start))); + start = end + 1; + } + } + } else if (o.first == "compression_opts") { + size_t start = 0; + size_t end = o.second.find(':'); + if (end == std::string::npos) { + return Status::InvalidArgument("invalid config value for: " + + o.first); + } + new_options->compression_opts.window_bits = + ParseInt(o.second.substr(start, end - start)); + start = end + 1; + end = o.second.find(':', start); + if (end == std::string::npos) { + return Status::InvalidArgument("invalid config value for: " + + o.first); + } + new_options->compression_opts.level = + ParseInt(o.second.substr(start, end - start)); + start = end + 1; + if (start >= o.second.size()) { + return Status::InvalidArgument("invalid config value for: " + + o.first); + } + new_options->compression_opts.strategy = + ParseInt(o.second.substr(start, o.second.size() - start)); + } else if (o.first == "num_levels") { + new_options->num_levels = ParseInt(o.second); + } else if (o.first == "purge_redundant_kvs_while_flush") { + new_options->purge_redundant_kvs_while_flush = + ParseBoolean(o.first, o.second); + } else if (o.first == "compaction_style") { + new_options->compaction_style = 
ParseCompactionStyle(o.second); + } else if (o.first == "compaction_options_universal") { + // TODO(ljin): add support + return Status::NotSupported("Not supported: " + o.first); + } else if (o.first == "compaction_options_fifo") { + new_options->compaction_options_fifo.max_table_files_size + = ParseUint64(o.second); + } else if (o.first == "bloom_locality") { + new_options->bloom_locality = ParseUint32(o.second); + } else if (o.first == "min_partial_merge_operands") { + new_options->min_partial_merge_operands = ParseUint32(o.second); + } else if (o.first == "inplace_update_support") { + new_options->inplace_update_support = ParseBoolean(o.first, o.second); + } else if (o.first == "prefix_extractor") { + const std::string kFixedPrefixName = "fixed:"; + const std::string kCappedPrefixName = "capped:"; + auto& pe_value = o.second; + if (pe_value.size() > kFixedPrefixName.size() && + pe_value.compare(0, kFixedPrefixName.size(), kFixedPrefixName) == + 0) { + int prefix_length = + ParseInt(trim(o.second.substr(kFixedPrefixName.size()))); + new_options->prefix_extractor.reset( + NewFixedPrefixTransform(prefix_length)); + } else if (pe_value.size() > kCappedPrefixName.size() && + pe_value.compare(0, kCappedPrefixName.size(), + kCappedPrefixName) == 0) { + int prefix_length = + ParseInt(trim(pe_value.substr(kCappedPrefixName.size()))); + new_options->prefix_extractor.reset( + NewCappedPrefixTransform(prefix_length)); + } else { + return Status::InvalidArgument("Invalid Prefix Extractor type: " + + pe_value); + } + } else { + return Status::InvalidArgument("Unrecognized option: " + o.first); + } + } catch (std::exception& e) { + return Status::InvalidArgument("error parsing " + o.first + ":" + + std::string(e.what())); + } + } + return Status::OK(); +} + +Status GetColumnFamilyOptionsFromString( + const ColumnFamilyOptions& base_options, + const std::string& opts_str, + ColumnFamilyOptions* new_options) { + std::unordered_map opts_map; + Status s = StringToMap(opts_str, 
&opts_map); + if (!s.ok()) { + return s; + } + return GetColumnFamilyOptionsFromMap(base_options, opts_map, new_options); +} + +Status GetDBOptionsFromMap( + const DBOptions& base_options, + const std::unordered_map& opts_map, + DBOptions* new_options) { + assert(new_options); + *new_options = base_options; + for (const auto& o : opts_map) { + try { + if (o.first == "create_if_missing") { + new_options->create_if_missing = ParseBoolean(o.first, o.second); + } else if (o.first == "create_missing_column_families") { + new_options->create_missing_column_families = + ParseBoolean(o.first, o.second); + } else if (o.first == "error_if_exists") { + new_options->error_if_exists = ParseBoolean(o.first, o.second); + } else if (o.first == "paranoid_checks") { + new_options->paranoid_checks = ParseBoolean(o.first, o.second); + } else if (o.first == "max_open_files") { + new_options->max_open_files = ParseInt(o.second); + } else if (o.first == "max_total_wal_size") { + new_options->max_total_wal_size = ParseUint64(o.second); + } else if (o.first == "disable_data_sync") { + new_options->disableDataSync = ParseBoolean(o.first, o.second); + } else if (o.first == "use_fsync") { + new_options->use_fsync = ParseBoolean(o.first, o.second); + } else if (o.first == "db_paths") { + // TODO(ljin): add support + return Status::NotSupported("Not supported: " + o.first); + } else if (o.first == "db_log_dir") { + new_options->db_log_dir = o.second; + } else if (o.first == "wal_dir") { + new_options->wal_dir = o.second; + } else if (o.first == "delete_obsolete_files_period_micros") { + new_options->delete_obsolete_files_period_micros = + ParseUint64(o.second); + } else if (o.first == "max_background_compactions") { + new_options->max_background_compactions = ParseInt(o.second); + } else if (o.first == "max_background_flushes") { + new_options->max_background_flushes = ParseInt(o.second); + } else if (o.first == "max_log_file_size") { + new_options->max_log_file_size = ParseSizeT(o.second); + } 
else if (o.first == "log_file_time_to_roll") { + new_options->log_file_time_to_roll = ParseSizeT(o.second); + } else if (o.first == "keep_log_file_num") { + new_options->keep_log_file_num = ParseSizeT(o.second); + } else if (o.first == "max_manifest_file_size") { + new_options->max_manifest_file_size = ParseUint64(o.second); + } else if (o.first == "table_cache_numshardbits") { + new_options->table_cache_numshardbits = ParseInt(o.second); + } else if (o.first == "table_cache_remove_scan_count_limit") { + new_options->table_cache_remove_scan_count_limit = ParseInt(o.second); + } else if (o.first == "WAL_ttl_seconds") { + new_options->WAL_ttl_seconds = ParseUint64(o.second); + } else if (o.first == "WAL_size_limit_MB") { + new_options->WAL_size_limit_MB = ParseUint64(o.second); + } else if (o.first == "manifest_preallocation_size") { + new_options->manifest_preallocation_size = ParseSizeT(o.second); + } else if (o.first == "allow_os_buffer") { + new_options->allow_os_buffer = ParseBoolean(o.first, o.second); + } else if (o.first == "allow_mmap_reads") { + new_options->allow_mmap_reads = ParseBoolean(o.first, o.second); + } else if (o.first == "allow_mmap_writes") { + new_options->allow_mmap_writes = ParseBoolean(o.first, o.second); + } else if (o.first == "is_fd_close_on_exec") { + new_options->is_fd_close_on_exec = ParseBoolean(o.first, o.second); + } else if (o.first == "skip_log_error_on_recovery") { + new_options->skip_log_error_on_recovery = + ParseBoolean(o.first, o.second); + } else if (o.first == "stats_dump_period_sec") { + new_options->stats_dump_period_sec = ParseUint32(o.second); + } else if (o.first == "advise_random_on_open") { + new_options->advise_random_on_open = ParseBoolean(o.first, o.second); + } else if (o.first == "db_write_buffer_size") { + new_options->db_write_buffer_size = ParseUint64(o.second); + } else if (o.first == "use_adaptive_mutex") { + new_options->use_adaptive_mutex = ParseBoolean(o.first, o.second); + } else if (o.first == 
"bytes_per_sync") { + new_options->bytes_per_sync = ParseUint64(o.second); + } else { + return Status::InvalidArgument("Unrecognized option: " + o.first); + } + } catch (std::exception& e) { + return Status::InvalidArgument("error parsing " + o.first + ":" + + std::string(e.what())); + } + } + return Status::OK(); +} + +Status GetDBOptionsFromString( + const DBOptions& base_options, + const std::string& opts_str, + DBOptions* new_options) { + std::unordered_map opts_map; + Status s = StringToMap(opts_str, &opts_map); + if (!s.ok()) { + return s; + } + return GetDBOptionsFromMap(base_options, opts_map, new_options); +} + +#endif // ROCKSDB_LITE +} // namespace rocksdb diff --git a/util/options_helper.h b/util/options_helper.h new file mode 100644 index 000000000..02c788114 --- /dev/null +++ b/util/options_helper.h @@ -0,0 +1,20 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include +#include +#include "util/mutable_cf_options.h" +#include "rocksdb/status.h" + +namespace rocksdb { + +Status GetMutableOptionsFromStrings( + const MutableCFOptions& base_options, + const std::unordered_map& options_map, + MutableCFOptions* new_options); + +} // namespace rocksdb diff --git a/util/options_test.cc b/util/options_test.cc index be07a83f5..0c1e3ce21 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -7,15 +7,28 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + +#include #include -#include +#include "rocksdb/cache.h" #include "rocksdb/options.h" +#include "rocksdb/table.h" +#include "rocksdb/utilities/convenience.h" +#include "rocksdb/utilities/leveldb_options.h" +#include "table/block_based_table_factory.h" #include "util/testharness.h" +#ifndef GFLAGS +bool FLAGS_enable_print = false; +#else +#include using GFLAGS::ParseCommandLineFlags; DEFINE_bool(enable_print, false, "Print options generated to console."); +#endif // GFLAGS namespace rocksdb { @@ -23,6 +36,7 @@ class OptionsTest {}; class StderrLogger : public Logger { public: + using Logger::Logv; virtual void Logv(const char* format, va_list ap) override { vprintf(format, ap); printf("\n"); @@ -65,16 +79,582 @@ TEST(OptionsTest, LooseCondition) { options = PrintAndGetOptions(128 * 1024 * 1024, 8, 100); ASSERT_EQ(options.compaction_style, kCompactionStyleLevel); +#ifndef ROCKSDB_LITE // Universal compaction is not supported in ROCKSDB_LITE // Tight write amplification options = PrintAndGetOptions(128 * 1024 * 1024, 64, 10); ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal); +#endif // !ROCKSDB_LITE // Both tight amplifications PrintAndGetOptions(128 * 1024 * 1024, 4, 8); } + +#ifndef ROCKSDB_LITE // GetOptionsFromMap is not supported in ROCKSDB_LITE +TEST(OptionsTest, GetOptionsFromMapTest) { + std::unordered_map cf_options_map = { + {"write_buffer_size", "1"}, + {"max_write_buffer_number", "2"}, + {"min_write_buffer_number_to_merge", "3"}, + {"compression", "kSnappyCompression"}, + {"compression_per_level", "kNoCompression:" + "kSnappyCompression:" + "kZlibCompression:" + "kBZip2Compression:" + "kLZ4Compression:" + "kLZ4HCCompression"}, + {"compression_opts", "4:5:6"}, + {"num_levels", "7"}, + {"level0_file_num_compaction_trigger", "8"}, + {"level0_slowdown_writes_trigger", "9"}, + {"level0_stop_writes_trigger", "10"}, + {"max_mem_compaction_level", "11"}, + {"target_file_size_base", 
"12"}, + {"target_file_size_multiplier", "13"}, + {"max_bytes_for_level_base", "14"}, + {"max_bytes_for_level_multiplier", "15"}, + {"max_bytes_for_level_multiplier_additional", "16:17:18"}, + {"expanded_compaction_factor", "19"}, + {"source_compaction_factor", "20"}, + {"max_grandparent_overlap_factor", "21"}, + {"soft_rate_limit", "1.1"}, + {"hard_rate_limit", "2.1"}, + {"arena_block_size", "22"}, + {"disable_auto_compactions", "true"}, + {"purge_redundant_kvs_while_flush", "1"}, + {"compaction_style", "kCompactionStyleLevel"}, + {"verify_checksums_in_compaction", "false"}, + {"compaction_options_fifo", "23"}, + {"filter_deletes", "0"}, + {"max_sequential_skip_in_iterations", "24"}, + {"inplace_update_support", "true"}, + {"inplace_update_num_locks", "25"}, + {"memtable_prefix_bloom_bits", "26"}, + {"memtable_prefix_bloom_probes", "27"}, + {"memtable_prefix_bloom_huge_page_tlb_size", "28"}, + {"bloom_locality", "29"}, + {"max_successive_merges", "30"}, + {"min_partial_merge_operands", "31"}, + {"prefix_extractor", "fixed:31"} + }; + + std::unordered_map db_options_map = { + {"create_if_missing", "false"}, + {"create_missing_column_families", "true"}, + {"error_if_exists", "false"}, + {"paranoid_checks", "true"}, + {"max_open_files", "32"}, + {"max_total_wal_size", "33"}, + {"disable_data_sync", "false"}, + {"use_fsync", "true"}, + {"db_log_dir", "/db_log_dir"}, + {"wal_dir", "/wal_dir"}, + {"delete_obsolete_files_period_micros", "34"}, + {"max_background_compactions", "35"}, + {"max_background_flushes", "36"}, + {"max_log_file_size", "37"}, + {"log_file_time_to_roll", "38"}, + {"keep_log_file_num", "39"}, + {"max_manifest_file_size", "40"}, + {"table_cache_numshardbits", "41"}, + {"table_cache_remove_scan_count_limit", "42"}, + {"WAL_ttl_seconds", "43"}, + {"WAL_size_limit_MB", "44"}, + {"manifest_preallocation_size", "45"}, + {"allow_os_buffer", "false"}, + {"allow_mmap_reads", "true"}, + {"allow_mmap_writes", "false"}, + {"is_fd_close_on_exec", "true"}, + 
{"skip_log_error_on_recovery", "false"}, + {"stats_dump_period_sec", "46"}, + {"advise_random_on_open", "true"}, + {"use_adaptive_mutex", "false"}, + {"bytes_per_sync", "47"}, + }; + + ColumnFamilyOptions base_cf_opt; + ColumnFamilyOptions new_cf_opt; + ASSERT_OK(GetColumnFamilyOptionsFromMap( + base_cf_opt, cf_options_map, &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 1U); + ASSERT_EQ(new_cf_opt.max_write_buffer_number, 2); + ASSERT_EQ(new_cf_opt.min_write_buffer_number_to_merge, 3); + ASSERT_EQ(new_cf_opt.compression, kSnappyCompression); + ASSERT_EQ(new_cf_opt.compression_per_level.size(), 6U); + ASSERT_EQ(new_cf_opt.compression_per_level[0], kNoCompression); + ASSERT_EQ(new_cf_opt.compression_per_level[1], kSnappyCompression); + ASSERT_EQ(new_cf_opt.compression_per_level[2], kZlibCompression); + ASSERT_EQ(new_cf_opt.compression_per_level[3], kBZip2Compression); + ASSERT_EQ(new_cf_opt.compression_per_level[4], kLZ4Compression); + ASSERT_EQ(new_cf_opt.compression_per_level[5], kLZ4HCCompression); + ASSERT_EQ(new_cf_opt.compression_opts.window_bits, 4); + ASSERT_EQ(new_cf_opt.compression_opts.level, 5); + ASSERT_EQ(new_cf_opt.compression_opts.strategy, 6); + ASSERT_EQ(new_cf_opt.num_levels, 7); + ASSERT_EQ(new_cf_opt.level0_file_num_compaction_trigger, 8); + ASSERT_EQ(new_cf_opt.level0_slowdown_writes_trigger, 9); + ASSERT_EQ(new_cf_opt.level0_stop_writes_trigger, 10); + ASSERT_EQ(new_cf_opt.max_mem_compaction_level, 11); + ASSERT_EQ(new_cf_opt.target_file_size_base, static_cast(12)); + ASSERT_EQ(new_cf_opt.target_file_size_multiplier, 13); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_base, 14U); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier, 15); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional.size(), 3U); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[0], 16); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[1], 17); + ASSERT_EQ(new_cf_opt.max_bytes_for_level_multiplier_additional[2], 18); + 
ASSERT_EQ(new_cf_opt.expanded_compaction_factor, 19); + ASSERT_EQ(new_cf_opt.source_compaction_factor, 20); + ASSERT_EQ(new_cf_opt.max_grandparent_overlap_factor, 21); + ASSERT_EQ(new_cf_opt.soft_rate_limit, 1.1); + ASSERT_EQ(new_cf_opt.hard_rate_limit, 2.1); + ASSERT_EQ(new_cf_opt.arena_block_size, 22U); + ASSERT_EQ(new_cf_opt.disable_auto_compactions, true); + ASSERT_EQ(new_cf_opt.purge_redundant_kvs_while_flush, true); + ASSERT_EQ(new_cf_opt.compaction_style, kCompactionStyleLevel); + ASSERT_EQ(new_cf_opt.verify_checksums_in_compaction, false); + ASSERT_EQ(new_cf_opt.compaction_options_fifo.max_table_files_size, + static_cast(23)); + ASSERT_EQ(new_cf_opt.filter_deletes, false); + ASSERT_EQ(new_cf_opt.max_sequential_skip_in_iterations, + static_cast(24)); + ASSERT_EQ(new_cf_opt.inplace_update_support, true); + ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 25U); + ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_bits, 26U); + ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_probes, 27U); + ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_huge_page_tlb_size, 28U); + ASSERT_EQ(new_cf_opt.bloom_locality, 29U); + ASSERT_EQ(new_cf_opt.max_successive_merges, 30U); + ASSERT_EQ(new_cf_opt.min_partial_merge_operands, 31U); + ASSERT_TRUE(new_cf_opt.prefix_extractor != nullptr); + ASSERT_EQ(std::string(new_cf_opt.prefix_extractor->Name()), + "rocksdb.FixedPrefix.31"); + + cf_options_map["write_buffer_size"] = "hello"; + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + base_cf_opt, cf_options_map, &new_cf_opt)); + cf_options_map["write_buffer_size"] = "1"; + ASSERT_OK(GetColumnFamilyOptionsFromMap( + base_cf_opt, cf_options_map, &new_cf_opt)); + cf_options_map["unknown_option"] = "1"; + ASSERT_NOK(GetColumnFamilyOptionsFromMap( + base_cf_opt, cf_options_map, &new_cf_opt)); + + DBOptions base_db_opt; + DBOptions new_db_opt; + ASSERT_OK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt)); + ASSERT_EQ(new_db_opt.create_if_missing, false); + 
ASSERT_EQ(new_db_opt.create_missing_column_families, true); + ASSERT_EQ(new_db_opt.error_if_exists, false); + ASSERT_EQ(new_db_opt.paranoid_checks, true); + ASSERT_EQ(new_db_opt.max_open_files, 32); + ASSERT_EQ(new_db_opt.max_total_wal_size, static_cast(33)); + ASSERT_EQ(new_db_opt.disableDataSync, false); + ASSERT_EQ(new_db_opt.use_fsync, true); + ASSERT_EQ(new_db_opt.db_log_dir, "/db_log_dir"); + ASSERT_EQ(new_db_opt.wal_dir, "/wal_dir"); + ASSERT_EQ(new_db_opt.delete_obsolete_files_period_micros, + static_cast(34)); + ASSERT_EQ(new_db_opt.max_background_compactions, 35); + ASSERT_EQ(new_db_opt.max_background_flushes, 36); + ASSERT_EQ(new_db_opt.max_log_file_size, 37U); + ASSERT_EQ(new_db_opt.log_file_time_to_roll, 38U); + ASSERT_EQ(new_db_opt.keep_log_file_num, 39U); + ASSERT_EQ(new_db_opt.max_manifest_file_size, static_cast(40)); + ASSERT_EQ(new_db_opt.table_cache_numshardbits, 41); + ASSERT_EQ(new_db_opt.table_cache_remove_scan_count_limit, 42); + ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast(43)); + ASSERT_EQ(new_db_opt.WAL_size_limit_MB, static_cast(44)); + ASSERT_EQ(new_db_opt.manifest_preallocation_size, 45U); + ASSERT_EQ(new_db_opt.allow_os_buffer, false); + ASSERT_EQ(new_db_opt.allow_mmap_reads, true); + ASSERT_EQ(new_db_opt.allow_mmap_writes, false); + ASSERT_EQ(new_db_opt.is_fd_close_on_exec, true); + ASSERT_EQ(new_db_opt.skip_log_error_on_recovery, false); + ASSERT_EQ(new_db_opt.stats_dump_period_sec, 46U); + ASSERT_EQ(new_db_opt.advise_random_on_open, true); + ASSERT_EQ(new_db_opt.use_adaptive_mutex, false); + ASSERT_EQ(new_db_opt.bytes_per_sync, static_cast(47)); +} +#endif // !ROCKSDB_LITE + +#ifndef ROCKSDB_LITE // GetOptionsFromString is not supported in ROCKSDB_LITE +TEST(OptionsTest, GetOptionsFromStringTest) { + ColumnFamilyOptions base_cf_opt; + ColumnFamilyOptions new_cf_opt; + base_cf_opt.table_factory.reset(); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, "", &new_cf_opt)); + 
ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=5", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 5U); + ASSERT_TRUE(new_cf_opt.table_factory == nullptr); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=6;", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 6U); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + " write_buffer_size = 7 ", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 7U); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + " write_buffer_size = 8 ; ", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 8U); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=9;max_write_buffer_number=10", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 9U); + ASSERT_EQ(new_cf_opt.max_write_buffer_number, 10); + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=11; max_write_buffer_number = 12 ;", + &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 11U); + ASSERT_EQ(new_cf_opt.max_write_buffer_number, 12); + // Wrong name "max_write_buffer_number_" + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=13;max_write_buffer_number_=14;", + &new_cf_opt)); + // Wrong key/value pair + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=13;max_write_buffer_number;", &new_cf_opt)); + // Error Parsing value + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=13;max_write_buffer_number=;", &new_cf_opt)); + // Missing option name + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=13; =100;", &new_cf_opt)); + // Units (k) + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "memtable_prefix_bloom_bits=14k;max_write_buffer_number=-15K", + &new_cf_opt)); + ASSERT_EQ(new_cf_opt.memtable_prefix_bloom_bits, 14UL*1024UL); + ASSERT_EQ(new_cf_opt.max_write_buffer_number, -15*1024); + // Units 
(m) + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "max_write_buffer_number=16m;inplace_update_num_locks=17M", + &new_cf_opt)); + ASSERT_EQ(new_cf_opt.max_write_buffer_number, 16*1024*1024); + ASSERT_EQ(new_cf_opt.inplace_update_num_locks, 17*1024UL*1024UL); + // Units (g) + ASSERT_OK(GetColumnFamilyOptionsFromString( + base_cf_opt, + "write_buffer_size=18g;prefix_extractor=capped:8;" + "arena_block_size=19G", + &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 18*1024UL*1024UL*1024UL); + ASSERT_EQ(new_cf_opt.arena_block_size, 19*1024UL*1024UL*1024UL); + ASSERT_TRUE(new_cf_opt.prefix_extractor.get() != nullptr); + std::string prefix_name(new_cf_opt.prefix_extractor->Name()); + ASSERT_EQ(prefix_name, "rocksdb.CappedPrefix.8"); + + // Units (t) + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=20t;arena_block_size=21T", &new_cf_opt)); + ASSERT_EQ(new_cf_opt.write_buffer_size, 20*1024UL*1024UL*1024UL*1024UL); + ASSERT_EQ(new_cf_opt.arena_block_size, 21*1024UL*1024UL*1024UL*1024UL); + + // Nested block based table options + // Empty + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={};arena_block_size=1024", + &new_cf_opt)); + ASSERT_TRUE(new_cf_opt.table_factory != nullptr); + // Non-empty + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_cache=1M;block_size=4;};" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_TRUE(new_cf_opt.table_factory != nullptr); + // Last one + ASSERT_OK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_cache=1M;block_size=4;}", + &new_cf_opt)); + ASSERT_TRUE(new_cf_opt.table_factory != nullptr); + // Mismatch curly braces + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + 
"write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={{{block_size=4;};" + "arena_block_size=1024", + &new_cf_opt)); + // Unexpected chars after closing curly brace + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_size=4;}};" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_size=4;}xdfa;" + "arena_block_size=1024", + &new_cf_opt)); + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={block_size=4;}xdfa", + &new_cf_opt)); + // Invalid block based table option + ASSERT_NOK(GetColumnFamilyOptionsFromString(base_cf_opt, + "write_buffer_size=10;max_write_buffer_number=16;" + "block_based_table_factory={xx_block_size=4;}", + &new_cf_opt)); +} +#endif // !ROCKSDB_LITE + +#ifndef ROCKSDB_LITE // GetBlockBasedTableOptionsFromString is not supported +TEST(OptionsTest, GetBlockBasedTableOptionsFromString) { + BlockBasedTableOptions table_opt; + BlockBasedTableOptions new_opt; + // make sure default values are overwritten by something else + ASSERT_OK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;index_type=kHashSearch;" + "checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;" + "block_cache=1M;block_cache_compressed=1k;block_size=1024;" + "block_size_deviation=8;block_restart_interval=4;" + "filter_policy=bloomfilter:4:true;whole_key_filtering=1", + &new_opt)); + ASSERT_TRUE(new_opt.cache_index_and_filter_blocks); + ASSERT_EQ(new_opt.index_type, BlockBasedTableOptions::kHashSearch); + ASSERT_EQ(new_opt.checksum, ChecksumType::kxxHash); + ASSERT_TRUE(new_opt.hash_index_allow_collision); + ASSERT_TRUE(new_opt.no_block_cache); + ASSERT_TRUE(new_opt.block_cache != nullptr); + 
ASSERT_EQ(new_opt.block_cache->GetCapacity(), 1024UL*1024UL); + ASSERT_TRUE(new_opt.block_cache_compressed != nullptr); + ASSERT_EQ(new_opt.block_cache_compressed->GetCapacity(), 1024UL); + ASSERT_EQ(new_opt.block_size, 1024UL); + ASSERT_EQ(new_opt.block_size_deviation, 8); + ASSERT_EQ(new_opt.block_restart_interval, 4); + ASSERT_TRUE(new_opt.filter_policy != nullptr); + + // unknown option + ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;index_type=kBinarySearch;" + "bad_option=1", + &new_opt)); + + // unrecognized index type + ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;index_type=kBinarySearchXX", + &new_opt)); + + // unrecognized checksum type + ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;checksum=kxxHashXX", + &new_opt)); + + // unrecognized filter policy name + ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;" + "filter_policy=bloomfilterxx:4:true", + &new_opt)); + // unrecognized filter policy config + ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt, + "cache_index_and_filter_blocks=1;" + "filter_policy=bloomfilter:4", + &new_opt)); +} +#endif // !ROCKSDB_LITE + +Status StringToMap( + const std::string& opts_str, + std::unordered_map* opts_map); + +#ifndef ROCKSDB_LITE // StringToMap is not supported in ROCKSDB_LITE +TEST(OptionsTest, StringToMapTest) { + std::unordered_map opts_map; + // Regular options + ASSERT_OK(StringToMap("k1=v1;k2=v2;k3=v3", &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_EQ(opts_map["k2"], "v2"); + ASSERT_EQ(opts_map["k3"], "v3"); + // Value with '=' + opts_map.clear(); + ASSERT_OK(StringToMap("k1==v1;k2=v2=;", &opts_map)); + ASSERT_EQ(opts_map["k1"], "=v1"); + ASSERT_EQ(opts_map["k2"], "v2="); + // Overwritten option + opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k1=v2;k3=v3", &opts_map)); + ASSERT_EQ(opts_map["k1"], "v2"); + 
ASSERT_EQ(opts_map["k3"], "v3"); + // Empty value + opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k2=;k3=v3;k4=", &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_TRUE(opts_map.find("k2") != opts_map.end()); + ASSERT_EQ(opts_map["k2"], ""); + ASSERT_EQ(opts_map["k3"], "v3"); + ASSERT_TRUE(opts_map.find("k4") != opts_map.end()); + ASSERT_EQ(opts_map["k4"], ""); + opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k2=;k3=v3;k4= ", &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_TRUE(opts_map.find("k2") != opts_map.end()); + ASSERT_EQ(opts_map["k2"], ""); + ASSERT_EQ(opts_map["k3"], "v3"); + ASSERT_TRUE(opts_map.find("k4") != opts_map.end()); + ASSERT_EQ(opts_map["k4"], ""); + opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k2=;k3=", &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_TRUE(opts_map.find("k2") != opts_map.end()); + ASSERT_EQ(opts_map["k2"], ""); + ASSERT_TRUE(opts_map.find("k3") != opts_map.end()); + ASSERT_EQ(opts_map["k3"], ""); + opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k2=;k3=;", &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_TRUE(opts_map.find("k2") != opts_map.end()); + ASSERT_EQ(opts_map["k2"], ""); + ASSERT_TRUE(opts_map.find("k3") != opts_map.end()); + ASSERT_EQ(opts_map["k3"], ""); + // Regular nested options + opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k2={nk1=nv1;nk2=nv2};k3=v3", &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_EQ(opts_map["k2"], "nk1=nv1;nk2=nv2"); + ASSERT_EQ(opts_map["k3"], "v3"); + // Multi-level nested options + opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k2={nk1=nv1;nk2={nnk1=nnk2}};" + "k3={nk1={nnk1={nnnk1=nnnv1;nnnk2;nnnv2}}};k4=v4", + &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_EQ(opts_map["k2"], "nk1=nv1;nk2={nnk1=nnk2}"); + ASSERT_EQ(opts_map["k3"], "nk1={nnk1={nnnk1=nnnv1;nnnk2;nnnv2}}"); + ASSERT_EQ(opts_map["k4"], "v4"); + // Garbage inside curly braces + opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k2={dfad=};k3={=};k4=v4", + 
&opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_EQ(opts_map["k2"], "dfad="); + ASSERT_EQ(opts_map["k3"], "="); + ASSERT_EQ(opts_map["k4"], "v4"); + // Empty nested options + opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k2={};", &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_EQ(opts_map["k2"], ""); + opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k2={{{{}}}{}{}};", &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_EQ(opts_map["k2"], "{{{}}}{}{}"); + // With random spaces + opts_map.clear(); + ASSERT_OK(StringToMap(" k1 = v1 ; k2= {nk1=nv1; nk2={nnk1=nnk2}} ; " + "k3={ { } }; k4= v4 ", + &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_EQ(opts_map["k2"], "nk1=nv1; nk2={nnk1=nnk2}"); + ASSERT_EQ(opts_map["k3"], "{ }"); + ASSERT_EQ(opts_map["k4"], "v4"); + + // Empty key + ASSERT_NOK(StringToMap("k1=v1;k2=v2;=", &opts_map)); + ASSERT_NOK(StringToMap("=v1;k2=v2", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2v2;", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2=v2;fadfa", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2=v2;;", &opts_map)); + // Mismatch curly braces + ASSERT_NOK(StringToMap("k1=v1;k2={;k3=v3", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2={{};k3=v3", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2={}};k3=v3", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2={{}{}}};k3=v3", &opts_map)); + // However this is valid! 
+ opts_map.clear(); + ASSERT_OK(StringToMap("k1=v1;k2=};k3=v3", &opts_map)); + ASSERT_EQ(opts_map["k1"], "v1"); + ASSERT_EQ(opts_map["k2"], "}"); + ASSERT_EQ(opts_map["k3"], "v3"); + + // Invalid chars after closing curly brace + ASSERT_NOK(StringToMap("k1=v1;k2={{}}{};k3=v3", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2={{}}cfda;k3=v3", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2={{}} cfda;k3=v3", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2={{}} cfda", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2={{}}{}", &opts_map)); + ASSERT_NOK(StringToMap("k1=v1;k2={{dfdl}adfa}{}", &opts_map)); +} +#endif // ROCKSDB_LITE + +#ifndef ROCKSDB_LITE // StringToMap is not supported in ROCKSDB_LITE +TEST(OptionsTest, StringToMapRandomTest) { + std::unordered_map opts_map; + // Make sure segfault is not hit by semi-random strings + + std::vector bases = { + "a={aa={};tt={xxx={}}};c=defff", + "a={aa={};tt={xxx={}}};c=defff;d={{}yxx{}3{xx}}", + "abc={{}{}{}{{{}}}{{}{}{}{}{}{}{}"}; + + for (std::string base : bases) { + for (int rand_seed = 301; rand_seed < 401; rand_seed++) { + Random rnd(rand_seed); + for (int attempt = 0; attempt < 10; attempt++) { + std::string str = base; + // Replace random position to space + size_t pos = static_cast( + rnd.Uniform(static_cast(base.size()))); + str[pos] = ' '; + Status s = StringToMap(str, &opts_map); + ASSERT_TRUE(s.ok() || s.IsInvalidArgument()); + opts_map.clear(); + } + } + } + + // Random Construct a string + std::vector chars = {'{', '}', ' ', '=', ';', 'c'}; + for (int rand_seed = 301; rand_seed < 1301; rand_seed++) { + Random rnd(rand_seed); + int len = rnd.Uniform(30); + std::string str = ""; + for (int attempt = 0; attempt < len; attempt++) { + // Add a random character + size_t pos = static_cast( + rnd.Uniform(static_cast(chars.size()))); + str.append(1, chars[pos]); + } + Status s = StringToMap(str, &opts_map); + ASSERT_TRUE(s.ok() || s.IsInvalidArgument()); + s = StringToMap("name=" + str, &opts_map); + 
ASSERT_TRUE(s.ok() || s.IsInvalidArgument()); + opts_map.clear(); + } +} +#endif // !ROCKSDB_LITE + +TEST(OptionsTest, ConvertOptionsTest) { + LevelDBOptions leveldb_opt; + Options converted_opt = ConvertOptions(leveldb_opt); + + ASSERT_EQ(converted_opt.create_if_missing, leveldb_opt.create_if_missing); + ASSERT_EQ(converted_opt.error_if_exists, leveldb_opt.error_if_exists); + ASSERT_EQ(converted_opt.paranoid_checks, leveldb_opt.paranoid_checks); + ASSERT_EQ(converted_opt.env, leveldb_opt.env); + ASSERT_EQ(converted_opt.info_log.get(), leveldb_opt.info_log); + ASSERT_EQ(converted_opt.write_buffer_size, leveldb_opt.write_buffer_size); + ASSERT_EQ(converted_opt.max_open_files, leveldb_opt.max_open_files); + ASSERT_EQ(converted_opt.compression, leveldb_opt.compression); + + std::shared_ptr table_factory = + std::dynamic_pointer_cast( + converted_opt.table_factory); + + ASSERT_TRUE(table_factory.get() != nullptr); + + const BlockBasedTableOptions table_opt = table_factory->GetTableOptions(); + + ASSERT_EQ(table_opt.block_cache->GetCapacity(), 8UL << 20); + ASSERT_EQ(table_opt.block_size, leveldb_opt.block_size); + ASSERT_EQ(table_opt.block_restart_interval, + leveldb_opt.block_restart_interval); + ASSERT_EQ(table_opt.filter_policy.get(), leveldb_opt.filter_policy); +} + } // namespace rocksdb int main(int argc, char** argv) { +#ifdef GFLAGS ParseCommandLineFlags(&argc, &argv, true); +#endif // GFLAGS return rocksdb::test::RunAllTests(); } diff --git a/util/perf_context.cc b/util/perf_context.cc index 5443471d5..e89856513 100644 --- a/util/perf_context.cc +++ b/util/perf_context.cc @@ -51,6 +51,8 @@ void PerfContext::Reset() { find_next_user_entry_time = 0; write_pre_and_post_process_time = 0; write_memtable_time = 0; + db_mutex_lock_nanos = 0; + db_condition_wait_nanos = 0; #endif } @@ -82,7 +84,9 @@ std::string PerfContext::ToString() const { << OUTPUT(seek_internal_seek_time) << OUTPUT(find_next_user_entry_time) << OUTPUT(write_pre_and_post_process_time) - << 
OUTPUT(write_memtable_time); + << OUTPUT(write_memtable_time) + << OUTPUT(db_mutex_lock_nanos) + << OUTPUT(db_condition_wait_nanos); return ss.str(); #endif } diff --git a/util/perf_context_imp.h b/util/perf_context_imp.h index dc4ae95e5..e39790105 100644 --- a/util/perf_context_imp.h +++ b/util/perf_context_imp.h @@ -11,11 +11,10 @@ namespace rocksdb { #if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE) -#define PERF_TIMER_DECLARE() -#define PERF_TIMER_START(metric) -#define PERF_TIMER_AUTO(metric) +#define PERF_TIMER_GUARD(metric) #define PERF_TIMER_MEASURE(metric) #define PERF_TIMER_STOP(metric) +#define PERF_TIMER_START(metric) #define PERF_COUNTER_ADD(metric, value) #else @@ -24,10 +23,15 @@ extern __thread PerfLevel perf_level; class PerfStepTimer { public: - PerfStepTimer() + PerfStepTimer(uint64_t* metric) : enabled_(perf_level >= PerfLevel::kEnableTime), env_(enabled_ ? Env::Default() : nullptr), - start_(0) { + start_(0), + metric_(metric) { + } + + ~PerfStepTimer() { + Stop(); } void Start() { @@ -36,17 +40,17 @@ class PerfStepTimer { } } - void Measure(uint64_t* metric) { + void Measure() { if (start_) { uint64_t now = env_->NowNanos(); - *metric += now - start_; + *metric_ += now - start_; start_ = now; } } - void Stop(uint64_t* metric) { + void Stop() { if (start_) { - *metric += env_->NowNanos() - start_; + *metric_ += env_->NowNanos() - start_; start_ = 0; } } @@ -55,29 +59,25 @@ class PerfStepTimer { const bool enabled_; Env* const env_; uint64_t start_; + uint64_t* metric_; }; -// Declare the local timer object to be used later on -#define PERF_TIMER_DECLARE() \ - PerfStepTimer perf_step_timer; +// Stop the timer and update the metric +#define PERF_TIMER_STOP(metric) \ + perf_step_timer_ ## metric.Stop(); -// Set start time of the timer #define PERF_TIMER_START(metric) \ - perf_step_timer.Start(); + perf_step_timer_ ## metric.Start(); // Declare and set start time of the timer -#define PERF_TIMER_AUTO(metric) \ - PerfStepTimer 
perf_step_timer; \ - perf_step_timer.Start(); +#define PERF_TIMER_GUARD(metric) \ + PerfStepTimer perf_step_timer_ ## metric(&(perf_context.metric)); \ + perf_step_timer_ ## metric.Start(); // Update metric with time elapsed since last START. start time is reset // to current timestamp. #define PERF_TIMER_MEASURE(metric) \ - perf_step_timer.Measure(&(perf_context.metric)); - -// Update metric with time elapsed since last START. But start time is not set. -#define PERF_TIMER_STOP(metric) \ - perf_step_timer.Stop(&(perf_context.metric)); + perf_step_timer_ ## metric.Measure(); // Increase metric value #define PERF_COUNTER_ADD(metric, value) \ diff --git a/util/posix_logger.h b/util/posix_logger.h index 6aba769f1..6faa844ba 100644 --- a/util/posix_logger.h +++ b/util/posix_logger.h @@ -58,6 +58,8 @@ class PosixLogger : public Logger { } last_flush_micros_ = env_->NowMicros(); } + + using Logger::Logv; virtual void Logv(const char* format, va_list ap) { const uint64_t thread_id = (*gettid_)(); @@ -123,14 +125,15 @@ class PosixLogger : public Logger { // space, pre-allocate more space to avoid overly large // allocations from filesystem allocsize options. 
const size_t log_size = log_size_; - const int last_allocation_chunk = + const size_t last_allocation_chunk = ((kDebugLogChunkSize - 1 + log_size) / kDebugLogChunkSize); - const int desired_allocation_chunk = + const size_t desired_allocation_chunk = ((kDebugLogChunkSize - 1 + log_size + write_size) / kDebugLogChunkSize); if (last_allocation_chunk != desired_allocation_chunk) { - fallocate(fd_, FALLOC_FL_KEEP_SIZE, 0, - desired_allocation_chunk * kDebugLogChunkSize); + fallocate( + fd_, FALLOC_FL_KEEP_SIZE, 0, + static_cast(desired_allocation_chunk * kDebugLogChunkSize)); } #endif diff --git a/util/rate_limiter.cc b/util/rate_limiter.cc index cde86f3c9..2beefd58f 100644 --- a/util/rate_limiter.cc +++ b/util/rate_limiter.cc @@ -15,8 +15,8 @@ namespace rocksdb { // Pending request struct GenericRateLimiter::Req { - explicit Req(int64_t bytes, port::Mutex* mu) : - bytes(bytes), cv(mu), granted(false) {} + explicit Req(int64_t _bytes, port::Mutex* _mu) + : bytes(_bytes), cv(_mu), granted(false) {} int64_t bytes; port::CondVar cv; bool granted; @@ -47,7 +47,8 @@ GenericRateLimiter::GenericRateLimiter( GenericRateLimiter::~GenericRateLimiter() { MutexLock g(&request_mutex_); stop_ = true; - requests_to_wait_ = queue_[Env::IO_LOW].size() + queue_[Env::IO_HIGH].size(); + requests_to_wait_ = static_cast(queue_[Env::IO_LOW].size() + + queue_[Env::IO_HIGH].size()); for (auto& r : queue_[Env::IO_HIGH]) { r->cv.Signal(); } @@ -60,7 +61,7 @@ GenericRateLimiter::~GenericRateLimiter() { } void GenericRateLimiter::Request(int64_t bytes, const Env::IOPriority pri) { - assert(bytes < refill_bytes_per_period_); + assert(bytes <= refill_bytes_per_period_); MutexLock g(&request_mutex_); if (stop_) { diff --git a/util/rate_limiter_test.cc b/util/rate_limiter_test.cc index 1b72e4ed0..269582ff1 100644 --- a/util/rate_limiter_test.cc +++ b/util/rate_limiter_test.cc @@ -7,7 +7,10 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
See the AUTHORS file for names of contributors. +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include #include "util/testharness.h" @@ -27,34 +30,33 @@ TEST(RateLimiterTest, StartStop) { TEST(RateLimiterTest, Rate) { auto* env = Env::Default(); struct Arg { - Arg(int64_t target_rate, int burst) - : limiter(new GenericRateLimiter(target_rate, 100 * 1000, 10)), - request_size(target_rate / 10), - burst(burst) {} + Arg(int32_t _target_rate, int _burst) + : limiter(new GenericRateLimiter(_target_rate, 100 * 1000, 10)), + request_size(_target_rate / 10), + burst(_burst) {} std::unique_ptr limiter; - int64_t request_size; + int32_t request_size; int burst; }; auto writer = [](void* p) { - auto* env = Env::Default(); + auto* thread_env = Env::Default(); auto* arg = static_cast(p); // Test for 2 seconds - auto until = env->NowMicros() + 2 * 1000000; - Random r((uint32_t)(env->NowNanos() % - std::numeric_limits::max())); - while (env->NowMicros() < until) { + auto until = thread_env->NowMicros() + 2 * 1000000; + Random r((uint32_t)(thread_env->NowNanos() % + std::numeric_limits::max())); + while (thread_env->NowMicros() < until) { for (int i = 0; i < static_cast(r.Skewed(arg->burst) + 1); ++i) { arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1, Env::IO_HIGH); } - arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1, - Env::IO_LOW); + arg->limiter->Request(r.Uniform(arg->request_size - 1) + 1, Env::IO_LOW); } }; for (int i = 1; i <= 16; i*=2) { - int64_t target = i * 1024 * 10; + int32_t target = i * 1024 * 10; Arg arg(target, i / 4 + 1); auto start = env->NowMicros(); for (int t = 0; t < i; ++t) { @@ -65,7 +67,7 @@ TEST(RateLimiterTest, Rate) { auto elapsed = env->NowMicros() - start; double rate = arg.limiter->GetTotalBytesThrough() * 1000000.0 / elapsed; - fprintf(stderr, "request size [1 - %" PRIi64 "], limit %" PRIi64 + fprintf(stderr, "request size [1 - %" PRIi32 "], limit %" PRIi32 " KB/sec, actual rate: %lf KB/sec, 
elapsed %.2lf seconds\n", arg.request_size - 1, target / 1024, rate / 1024, elapsed / 1000000.0); diff --git a/util/scoped_arena_iterator.h b/util/scoped_arena_iterator.h new file mode 100644 index 000000000..2021d2dc2 --- /dev/null +++ b/util/scoped_arena_iterator.h @@ -0,0 +1,28 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once + +#include "rocksdb/iterator.h" + +namespace rocksdb { +class ScopedArenaIterator { + public: + explicit ScopedArenaIterator(Iterator* iter = nullptr) : iter_(iter) {} + + Iterator* operator->() { return iter_; } + + void set(Iterator* iter) { iter_ = iter; } + + Iterator* get() { return iter_; } + + ~ScopedArenaIterator() { iter_->~Iterator(); } + + private: + Iterator* iter_; +}; +} // namespace rocksdb diff --git a/util/signal_test.cc b/util/signal_test.cc index f51fa548e..b23ad6a98 100644 --- a/util/signal_test.cc +++ b/util/signal_test.cc @@ -9,6 +9,7 @@ namespace { void f0() { char *p = nullptr; + // cppcheck-suppress nullPointer *p = 10; /* SIGSEGV here!! 
*/ } diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc index a3c940d0e..ee57372fa 100644 --- a/util/skiplistrep.cc +++ b/util/skiplistrep.cc @@ -12,9 +12,17 @@ namespace rocksdb { namespace { class SkipListRep : public MemTableRep { SkipList skip_list_; + const MemTableRep::KeyComparator& cmp_; + const SliceTransform* transform_; + const size_t lookahead_; + + friend class LookaheadIterator; public: - explicit SkipListRep(const MemTableRep::KeyComparator& compare, Arena* arena) - : MemTableRep(arena), skip_list_(compare, arena) { + explicit SkipListRep(const MemTableRep::KeyComparator& compare, + MemTableAllocator* allocator, + const SliceTransform* transform, const size_t lookahead) + : MemTableRep(allocator), skip_list_(compare, allocator), cmp_(compare), + transform_(transform), lookahead_(lookahead) { } // Insert key into the list. @@ -29,7 +37,7 @@ public: } virtual size_t ApproximateMemoryUsage() override { - // All memory is allocated through arena; nothing to report here + // All memory is allocated through allocator; nothing to report here return 0; } @@ -106,11 +114,110 @@ public: std::string tmp_; // For passing to EncodeKey }; + // Iterator over the contents of a skip list which also keeps track of the + // previously visited node. In Seek(), it examines a few nodes after it + // first, falling back to O(log n) search from the head of the list only if + // the target key hasn't been found. 
+ class LookaheadIterator : public MemTableRep::Iterator { + public: + explicit LookaheadIterator(const SkipListRep& rep) : + rep_(rep), iter_(&rep_.skip_list_), prev_(iter_) {} + + virtual ~LookaheadIterator() override {} + + virtual bool Valid() const override { + return iter_.Valid(); + } + + virtual const char *key() const override { + assert(Valid()); + return iter_.key(); + } + + virtual void Next() override { + assert(Valid()); + + bool advance_prev = true; + if (prev_.Valid()) { + auto k1 = rep_.UserKey(prev_.key()); + auto k2 = rep_.UserKey(iter_.key()); + + if (k1.compare(k2) == 0) { + // same user key, don't move prev_ + advance_prev = false; + } else if (rep_.transform_) { + // only advance prev_ if it has the same prefix as iter_ + auto t1 = rep_.transform_->Transform(k1); + auto t2 = rep_.transform_->Transform(k2); + advance_prev = t1.compare(t2) == 0; + } + } + + if (advance_prev) { + prev_ = iter_; + } + iter_.Next(); + } + + virtual void Prev() override { + assert(Valid()); + iter_.Prev(); + prev_ = iter_; + } + + virtual void Seek(const Slice& internal_key, const char *memtable_key) + override { + const char *encoded_key = + (memtable_key != nullptr) ? 
+ memtable_key : EncodeKey(&tmp_, internal_key); + + if (prev_.Valid() && rep_.cmp_(encoded_key, prev_.key()) >= 0) { + // prev_.key() is smaller or equal to our target key; do a quick + // linear search (at most lookahead_ steps) starting from prev_ + iter_ = prev_; + + size_t cur = 0; + while (cur++ <= rep_.lookahead_ && iter_.Valid()) { + if (rep_.cmp_(encoded_key, iter_.key()) <= 0) { + return; + } + Next(); + } + } + + iter_.Seek(encoded_key); + prev_ = iter_; + } + + virtual void SeekToFirst() override { + iter_.SeekToFirst(); + prev_ = iter_; + } + + virtual void SeekToLast() override { + iter_.SeekToLast(); + prev_ = iter_; + } + + protected: + std::string tmp_; // For passing to EncodeKey + + private: + const SkipListRep& rep_; + SkipList::Iterator iter_; + SkipList::Iterator prev_; + }; + virtual MemTableRep::Iterator* GetIterator(Arena* arena = nullptr) override { - if (arena == nullptr) { - return new SkipListRep::Iterator(&skip_list_); + if (lookahead_ > 0) { + void *mem = + arena ? arena->AllocateAligned(sizeof(SkipListRep::LookaheadIterator)) + : operator new(sizeof(SkipListRep::LookaheadIterator)); + return new (mem) SkipListRep::LookaheadIterator(*this); } else { - auto mem = arena->AllocateAligned(sizeof(SkipListRep::Iterator)); + void *mem = + arena ? 
arena->AllocateAligned(sizeof(SkipListRep::Iterator)) + : operator new(sizeof(SkipListRep::Iterator)); return new (mem) SkipListRep::Iterator(&skip_list_); } } @@ -118,9 +225,9 @@ public: } MemTableRep* SkipListFactory::CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Arena* arena, - const SliceTransform*, Logger* logger) { - return new SkipListRep(compare, arena); + const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator, + const SliceTransform* transform, Logger* logger) { + return new SkipListRep(compare, allocator, transform, lookahead_); } } // namespace rocksdb diff --git a/util/slice.cc b/util/slice.cc index 5a1f4f10e..734ea974b 100644 --- a/util/slice.cc +++ b/util/slice.cc @@ -9,6 +9,7 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/slice.h" +#include "util/string_util.h" namespace rocksdb { @@ -22,7 +23,7 @@ class FixedPrefixTransform : public SliceTransform { public: explicit FixedPrefixTransform(size_t prefix_len) : prefix_len_(prefix_len), - name_("rocksdb.FixedPrefix." + std::to_string(prefix_len_)) {} + name_("rocksdb.FixedPrefix." + ToString(prefix_len_)) {} virtual const char* Name() const { return name_.c_str(); } @@ -38,6 +39,38 @@ class FixedPrefixTransform : public SliceTransform { virtual bool InRange(const Slice& dst) const { return (dst.size() == prefix_len_); } + + virtual bool SameResultWhenAppended(const Slice& prefix) const { + return InDomain(prefix); + } +}; + +class CappedPrefixTransform : public SliceTransform { + private: + size_t cap_len_; + std::string name_; + + public: + explicit CappedPrefixTransform(size_t cap_len) + : cap_len_(cap_len), + name_("rocksdb.CappedPrefix." 
+ ToString(cap_len_)) {} + + virtual const char* Name() const { return name_.c_str(); } + + virtual Slice Transform(const Slice& src) const { + assert(InDomain(src)); + return Slice(src.data(), std::min(cap_len_, src.size())); + } + + virtual bool InDomain(const Slice& src) const { return true; } + + virtual bool InRange(const Slice& dst) const { + return (dst.size() <= cap_len_); + } + + virtual bool SameResultWhenAppended(const Slice& prefix) const { + return prefix.size() >= cap_len_; + } }; class NoopTransform : public SliceTransform { @@ -59,6 +92,10 @@ class NoopTransform : public SliceTransform { virtual bool InRange(const Slice& dst) const { return true; } + + virtual bool SameResultWhenAppended(const Slice& prefix) const { + return false; + } }; } @@ -67,6 +104,10 @@ const SliceTransform* NewFixedPrefixTransform(size_t prefix_len) { return new FixedPrefixTransform(prefix_len); } +const SliceTransform* NewCappedPrefixTransform(size_t cap_len) { + return new CappedPrefixTransform(cap_len); +} + const SliceTransform* NewNoopTransform() { return new NoopTransform; } diff --git a/util/slice_transform_test.cc b/util/slice_transform_test.cc new file mode 100644 index 000000000..9f0e34b15 --- /dev/null +++ b/util/slice_transform_test.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "rocksdb/slice_transform.h" + +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/statistics.h" +#include "rocksdb/table.h" +#include "util/testharness.h" + +namespace rocksdb { + +class SliceTransformTest {}; + +TEST(SliceTransformTest, CapPrefixTransform) { + std::string s; + s = "abcdefge"; + + unique_ptr transform; + + transform.reset(NewCappedPrefixTransform(6)); + ASSERT_EQ(transform->Transform(s).ToString(), "abcdef"); + ASSERT_TRUE(transform->SameResultWhenAppended("123456")); + ASSERT_TRUE(transform->SameResultWhenAppended("1234567")); + ASSERT_TRUE(!transform->SameResultWhenAppended("12345")); + + transform.reset(NewCappedPrefixTransform(8)); + ASSERT_EQ(transform->Transform(s).ToString(), "abcdefge"); + + transform.reset(NewCappedPrefixTransform(10)); + ASSERT_EQ(transform->Transform(s).ToString(), "abcdefge"); + + transform.reset(NewCappedPrefixTransform(0)); + ASSERT_EQ(transform->Transform(s).ToString(), ""); + + transform.reset(NewCappedPrefixTransform(0)); + ASSERT_EQ(transform->Transform("").ToString(), ""); +} + +class SliceTransformDBTest { + private: + std::string dbname_; + Env* env_; + DB* db_; + + public: + SliceTransformDBTest() : env_(Env::Default()), db_(nullptr) { + dbname_ = test::TmpDir() + "/slice_transform_db_test"; + ASSERT_OK(DestroyDB(dbname_, last_options_)); + } + + ~SliceTransformDBTest() { + delete db_; + ASSERT_OK(DestroyDB(dbname_, last_options_)); + } + + DB* db() { return db_; } + + // Return the current option configuration. 
+ Options* GetOptions() { return &last_options_; } + + void DestroyAndReopen() { + // Destroy using last options + Destroy(); + ASSERT_OK(TryReopen()); + } + + void Destroy() { + delete db_; + db_ = nullptr; + ASSERT_OK(DestroyDB(dbname_, last_options_)); + } + + Status TryReopen() { + delete db_; + db_ = nullptr; + last_options_.create_if_missing = true; + + return DB::Open(last_options_, dbname_, &db_); + } + + Options last_options_; +}; + +namespace { +uint64_t TestGetTickerCount(const Options& options, Tickers ticker_type) { + return options.statistics->getTickerCount(ticker_type); +} +} // namespace + +TEST(SliceTransformDBTest, CapPrefix) { + last_options_.prefix_extractor.reset(NewCappedPrefixTransform(8)); + last_options_.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions bbto; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + last_options_.table_factory.reset(NewBlockBasedTableFactory(bbto)); + ASSERT_OK(TryReopen()); + + ReadOptions ro; + FlushOptions fo; + WriteOptions wo; + + ASSERT_OK(db()->Put(wo, "barbarbar", "foo")); + ASSERT_OK(db()->Put(wo, "barbarbar2", "foo2")); + ASSERT_OK(db()->Put(wo, "foo", "bar")); + ASSERT_OK(db()->Put(wo, "foo3", "bar3")); + ASSERT_OK(db()->Flush(fo)); + + unique_ptr iter(db()->NewIterator(ro)); + + iter->Seek("foo"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value().ToString(), "bar"); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 0U); + + iter->Seek("foo2"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 1U); + + iter->Seek("barbarbar"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(iter->value().ToString(), "foo"); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 1U); + + iter->Seek("barfoofoo"); + ASSERT_OK(iter->status()); + 
ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 2U); + + iter->Seek("foobarbar"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + ASSERT_EQ(TestGetTickerCount(last_options_, BLOOM_FILTER_PREFIX_USEFUL), 3U); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/util/sst_dump_test.cc b/util/sst_dump_test.cc new file mode 100644 index 000000000..f3fa1664d --- /dev/null +++ b/util/sst_dump_test.cc @@ -0,0 +1,152 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include +#include "rocksdb/sst_dump_tool.h" + +#include "rocksdb/filter_policy.h" +#include "table/block_based_table_factory.h" +#include "table/table_builder.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +const uint32_t optLength = 100; + +namespace { +static std::string MakeKey(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "k_%04d", i); + InternalKey key(std::string(buf), 0, ValueType::kTypeValue); + return key.Encode().ToString(); +} + +static std::string MakeValue(int i) { + char buf[100]; + snprintf(buf, sizeof(buf), "v_%04d", i); + InternalKey key(std::string(buf), 0, ValueType::kTypeValue); + return key.Encode().ToString(); +} + +void createSST(const std::string& file_name, + const BlockBasedTableOptions& table_options) { + std::shared_ptr tf; + tf.reset(new rocksdb::BlockBasedTableFactory(table_options)); + + unique_ptr file; + Env* env = Env::Default(); + EnvOptions env_options; + ReadOptions read_options; + Options opts; + const ImmutableCFOptions imoptions(opts); + rocksdb::InternalKeyComparator ikc(opts.comparator); + unique_ptr tb; + + env->NewWritableFile(file_name, &file, env_options); + opts.table_factory = tf; + tb.reset(opts.table_factory->NewTableBuilder(imoptions, ikc, file.get(), + CompressionType::kNoCompression, + CompressionOptions())); + + // Populate slightly more than 1K keys + uint32_t num_keys = 1024; + for (uint32_t i = 0; i < num_keys; i++) { + tb->Add(MakeKey(i), MakeValue(i)); + } + tb->Finish(); + file->Close(); +} + +void cleanup(const std::string& file_name) { + Env* env = Env::Default(); + env->DeleteFile(file_name); + std::string outfile_name = file_name.substr(0, file_name.length() - 4); + outfile_name.append("_dump.txt"); + env->DeleteFile(outfile_name); +} +} // namespace + +// Test for sst dump tool "raw" mode +class SSTDumpToolTest { + public: + BlockBasedTableOptions table_options_; + + SSTDumpToolTest() {} + + ~SSTDumpToolTest() {} +}; + +TEST(SSTDumpToolTest, 
EmptyFilter) { + std::string file_name = "rocksdb_sst_test.sst"; + createSST(file_name, table_options_); + + char* usage[3]; + for (int i = 0; i < 3; i++) { + usage[i] = new char[optLength]; + } + snprintf(usage[0], optLength, "./sst_dump"); + snprintf(usage[1], optLength, "--command=raw"); + snprintf(usage[2], optLength, "--file=rocksdb_sst_test.sst"); + + rocksdb::SSTDumpTool tool; + ASSERT_TRUE(!tool.Run(3, usage)); + + cleanup(file_name); + for (int i = 0; i < 3; i++) { + delete[] usage[i]; + } +} + +TEST(SSTDumpToolTest, FilterBlock) { + table_options_.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, true)); + std::string file_name = "rocksdb_sst_test.sst"; + createSST(file_name, table_options_); + + char* usage[3]; + for (int i = 0; i < 3; i++) { + usage[i] = new char[optLength]; + } + snprintf(usage[0], optLength, "./sst_dump"); + snprintf(usage[1], optLength, "--command=raw"); + snprintf(usage[2], optLength, "--file=rocksdb_sst_test.sst"); + + rocksdb::SSTDumpTool tool; + ASSERT_TRUE(!tool.Run(3, usage)); + + cleanup(file_name); + for (int i = 0; i < 3; i++) { + delete[] usage[i]; + } +} + +TEST(SSTDumpToolTest, FullFilterBlock) { + table_options_.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false)); + std::string file_name = "rocksdb_sst_test.sst"; + createSST(file_name, table_options_); + + char* usage[3]; + for (int i = 0; i < 3; i++) { + usage[i] = new char[optLength]; + } + snprintf(usage[0], optLength, "./sst_dump"); + snprintf(usage[1], optLength, "--command=raw"); + snprintf(usage[2], optLength, "--file=rocksdb_sst_test.sst"); + + rocksdb::SSTDumpTool tool; + ASSERT_TRUE(!tool.Run(3, usage)); + + cleanup(file_name); + for (int i = 0; i < 3; i++) { + delete[] usage[i]; + } +} +} // namespace rocksdb + +int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); } diff --git a/util/sst_dump_tool.cc b/util/sst_dump_tool.cc new file mode 100644 index 000000000..1d0270c72 --- /dev/null +++ b/util/sst_dump_tool.cc @@ -0,0 +1,397 
@@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#ifndef ROCKSDB_LITE + +#include "util/sst_dump_tool_imp.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include + +namespace rocksdb { + +SstFileReader::SstFileReader(const std::string& file_path, + bool verify_checksum, + bool output_hex) + :file_name_(file_path), read_num_(0), verify_checksum_(verify_checksum), + output_hex_(output_hex), ioptions_(options_), + internal_comparator_(BytewiseComparator()) { + fprintf(stdout, "Process %s\n", file_path.c_str()); + + init_result_ = NewTableReader(file_name_); +} + +extern const uint64_t kBlockBasedTableMagicNumber; +extern const uint64_t kLegacyBlockBasedTableMagicNumber; +extern const uint64_t kPlainTableMagicNumber; +extern const uint64_t kLegacyPlainTableMagicNumber; + +Status SstFileReader::NewTableReader(const std::string& file_path) { + uint64_t magic_number; + + // read table magic number + Footer footer; + + unique_ptr file; + uint64_t file_size; + Status s = options_.env->NewRandomAccessFile(file_path, &file_, soptions_); + if (s.ok()) { + s = options_.env->GetFileSize(file_path, &file_size); + } + if (s.ok()) { + s = ReadFooterFromFile(file_.get(), file_size, &footer); + } + if (s.ok()) { + magic_number = footer.table_magic_number(); + } + + if (s.ok()) { + if (magic_number == kPlainTableMagicNumber || + magic_number == kLegacyPlainTableMagicNumber) { + soptions_.use_mmap_reads = true; + options_.env->NewRandomAccessFile(file_path, &file_, soptions_); + } + options_.comparator = &internal_comparator_; + // For old sst format, ReadTableProperties might fail but file can be read + if (ReadTableProperties(magic_number, file_.get(), file_size).ok()) { + 
SetTableOptionsByMagicNumber(magic_number); + } else { + SetOldTableOptions(); + } + } + + if (s.ok()) { + s = options_.table_factory->NewTableReader( + ioptions_, soptions_, internal_comparator_, std::move(file_), file_size, + &table_reader_); + } + return s; +} + +Status SstFileReader::DumpTable(const std::string& out_filename) { + unique_ptr out_file; + Env* env = Env::Default(); + env->NewWritableFile(out_filename, &out_file, soptions_); + Status s = table_reader_->DumpTable(out_file.get()); + out_file->Close(); + return s; +} + +Status SstFileReader::ReadTableProperties(uint64_t table_magic_number, + RandomAccessFile* file, + uint64_t file_size) { + TableProperties* table_properties = nullptr; + Status s = rocksdb::ReadTableProperties(file, file_size, table_magic_number, + options_.env, options_.info_log.get(), + &table_properties); + if (s.ok()) { + table_properties_.reset(table_properties); + } else { + fprintf(stdout, "Not able to read table properties\n"); + } + return s; +} + +Status SstFileReader::SetTableOptionsByMagicNumber( + uint64_t table_magic_number) { + assert(table_properties_); + if (table_magic_number == kBlockBasedTableMagicNumber || + table_magic_number == kLegacyBlockBasedTableMagicNumber) { + options_.table_factory = std::make_shared(); + fprintf(stdout, "Sst file format: block-based\n"); + auto& props = table_properties_->user_collected_properties; + auto pos = props.find(BlockBasedTablePropertyNames::kIndexType); + if (pos != props.end()) { + auto index_type_on_file = static_cast( + DecodeFixed32(pos->second.c_str())); + if (index_type_on_file == + BlockBasedTableOptions::IndexType::kHashSearch) { + options_.prefix_extractor.reset(NewNoopTransform()); + } + } + } else if (table_magic_number == kPlainTableMagicNumber || + table_magic_number == kLegacyPlainTableMagicNumber) { + options_.allow_mmap_reads = true; + + PlainTableOptions plain_table_options; + plain_table_options.user_key_len = kPlainTableVariableLength; + 
plain_table_options.bloom_bits_per_key = 0; + plain_table_options.hash_table_ratio = 0; + plain_table_options.index_sparseness = 1; + plain_table_options.huge_page_tlb_size = 0; + plain_table_options.encoding_type = kPlain; + plain_table_options.full_scan_mode = true; + + options_.table_factory.reset(NewPlainTableFactory(plain_table_options)); + fprintf(stdout, "Sst file format: plain table\n"); + } else { + char error_msg_buffer[80]; + snprintf(error_msg_buffer, sizeof(error_msg_buffer) - 1, + "Unsupported table magic number --- %lx", + (long)table_magic_number); + return Status::InvalidArgument(error_msg_buffer); + } + + return Status::OK(); +} + +Status SstFileReader::SetOldTableOptions() { + assert(table_properties_ == nullptr); + options_.table_factory = std::make_shared(); + fprintf(stdout, "Sst file format: block-based(old version)\n"); + + return Status::OK(); +} + +Status SstFileReader::ReadSequential(bool print_kv, + uint64_t read_num, + bool has_from, + const std::string& from_key, + bool has_to, + const std::string& to_key) { + if (!table_reader_) { + return init_result_; + } + + Iterator* iter = table_reader_->NewIterator(ReadOptions(verify_checksum_, + false)); + uint64_t i = 0; + if (has_from) { + InternalKey ikey(from_key, kMaxSequenceNumber, kValueTypeForSeek); + iter->Seek(ikey.Encode()); + } else { + iter->SeekToFirst(); + } + for (; iter->Valid(); iter->Next()) { + Slice key = iter->key(); + Slice value = iter->value(); + ++i; + if (read_num > 0 && i > read_num) + break; + + ParsedInternalKey ikey; + if (!ParseInternalKey(key, &ikey)) { + std::cerr << "Internal Key [" + << key.ToString(true /* in hex*/) + << "] parse error!\n"; + continue; + } + + // If end marker was specified, we stop before it + if (has_to && BytewiseComparator()->Compare(ikey.user_key, to_key) >= 0) { + break; + } + + if (print_kv) { + fprintf(stdout, "%s => %s\n", + ikey.DebugString(output_hex_).c_str(), + value.ToString(output_hex_).c_str()); + } + } + + read_num_ += i; + 
+ Status ret = iter->status(); + delete iter; + return ret; +} + +Status SstFileReader::ReadTableProperties( + std::shared_ptr* table_properties) { + if (!table_reader_) { + return init_result_; + } + + *table_properties = table_reader_->GetTableProperties(); + return init_result_; +} + +namespace { + +void print_help() { + fprintf(stderr, + "sst_dump [--command=check|scan|none|raw] [--verify_checksum] " + "--file=data_dir_OR_sst_file" + " [--output_hex]" + " [--input_key_hex]" + " [--from=]" + " [--to=]" + " [--read_num=NUM]" + " [--show_properties]\n"); +} + +string HexToString(const string& str) { + string parsed; + if (str[0] != '0' || str[1] != 'x') { + fprintf(stderr, "Invalid hex input %s. Must start with 0x\n", + str.c_str()); + throw "Invalid hex input"; + } + + for (unsigned int i = 2; i < str.length();) { + int c; + sscanf(str.c_str() + i, "%2X", &c); + parsed.push_back(c); + i += 2; + } + return parsed; +} + +} // namespace + +int SSTDumpTool::Run(int argc, char** argv) { + const char* dir_or_file = nullptr; + uint64_t read_num = -1; + std::string command; + + char junk; + uint64_t n; + bool verify_checksum = false; + bool output_hex = false; + bool input_key_hex = false; + bool has_from = false; + bool has_to = false; + bool show_properties = false; + std::string from_key; + std::string to_key; + for (int i = 1; i < argc; i++) { + if (strncmp(argv[i], "--file=", 7) == 0) { + dir_or_file = argv[i] + 7; + } else if (strcmp(argv[i], "--output_hex") == 0) { + output_hex = true; + } else if (strcmp(argv[i], "--input_key_hex") == 0) { + input_key_hex = true; + } else if (sscanf(argv[i], + "--read_num=%lu%c", + (unsigned long*)&n, &junk) == 1) { + read_num = n; + } else if (strcmp(argv[i], "--verify_checksum") == 0) { + verify_checksum = true; + } else if (strncmp(argv[i], "--command=", 10) == 0) { + command = argv[i] + 10; + } else if (strncmp(argv[i], "--from=", 7) == 0) { + from_key = argv[i] + 7; + has_from = true; + } else if (strncmp(argv[i], "--to=", 
5) == 0) { + to_key = argv[i] + 5; + has_to = true; + } else if (strcmp(argv[i], "--show_properties") == 0) { + show_properties = true; + } else { + print_help(); + exit(1); + } + } + + if (input_key_hex) { + if (has_from) { + from_key = HexToString(from_key); + } + if (has_to) { + to_key = HexToString(to_key); + } + } + + if (dir_or_file == nullptr) { + print_help(); + exit(1); + } + + std::vector filenames; + rocksdb::Env* env = rocksdb::Env::Default(); + rocksdb::Status st = env->GetChildren(dir_or_file, &filenames); + bool dir = true; + if (!st.ok()) { + filenames.clear(); + filenames.push_back(dir_or_file); + dir = false; + } + + fprintf(stdout, "from [%s] to [%s]\n", + rocksdb::Slice(from_key).ToString(true).c_str(), + rocksdb::Slice(to_key).ToString(true).c_str()); + + uint64_t total_read = 0; + for (size_t i = 0; i < filenames.size(); i++) { + std::string filename = filenames.at(i); + if (filename.length() <= 4 || + filename.rfind(".sst") != filename.length() - 4) { + // ignore + continue; + } + if (dir) { + filename = std::string(dir_or_file) + "/" + filename; + } + + rocksdb::SstFileReader reader(filename, verify_checksum, + output_hex); + if (!reader.getStatus().ok()) { + fprintf(stderr, "%s: %s\n", filename.c_str(), + reader.getStatus().ToString().c_str()); + exit(1); + } + + if (command == "raw") { + std::string out_filename = filename.substr(0, filename.length() - 4); + out_filename.append("_dump.txt"); + + st = reader.DumpTable(out_filename); + if (!st.ok()) { + fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str()); + exit(1); + } else { + fprintf(stdout, "raw dump written to file %s\n", &out_filename[0]); + } + continue; + } + + // scan all files in give file path. + if (command == "" || command == "scan" || command == "check") { + st = reader.ReadSequential(command != "check", + read_num > 0 ? 
(read_num - total_read) : + read_num, + has_from, from_key, has_to, to_key); + if (!st.ok()) { + fprintf(stderr, "%s: %s\n", filename.c_str(), + st.ToString().c_str()); + } + total_read += reader.GetReadNumber(); + if (read_num > 0 && total_read > read_num) { + break; + } + } + if (show_properties) { + const rocksdb::TableProperties* table_properties; + + std::shared_ptr + table_properties_from_reader; + st = reader.ReadTableProperties(&table_properties_from_reader); + if (!st.ok()) { + fprintf(stderr, "%s: %s\n", filename.c_str(), st.ToString().c_str()); + fprintf(stderr, "Try to use initial table properties\n"); + table_properties = reader.GetInitTableProperties(); + } else { + table_properties = table_properties_from_reader.get(); + } + if (table_properties != nullptr) { + fprintf(stdout, + "Table Properties:\n" + "------------------------------\n" + " %s", + table_properties->ToString("\n ", ": ").c_str()); + fprintf(stdout, "# deleted keys: %" PRIu64 "\n", + rocksdb::GetDeletedKeys( + table_properties->user_collected_properties)); + } + } + } + return 0; +} +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/util/sst_dump_tool_imp.h b/util/sst_dump_tool_imp.h new file mode 100644 index 000000000..7e975a534 --- /dev/null +++ b/util/sst_dump_tool_imp.h @@ -0,0 +1,81 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+#ifndef ROCKSDB_LITE +#pragma once + +#include "rocksdb/sst_dump_tool.h" + +#include +#include +#include + +#include "db/dbformat.h" +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/immutable_options.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/status.h" +#include "rocksdb/table.h" +#include "rocksdb/table_properties.h" +#include "table/block.h" +#include "table/block_based_table_factory.h" +#include "table/block_builder.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "table/plain_table_factory.h" +#include "util/ldb_cmd.h" +#include "util/random.h" +#include "util/testharness.h" +#include "util/testutil.h" + +namespace rocksdb { + +class SstFileReader { + public: + explicit SstFileReader(const std::string& file_name, bool verify_checksum, + bool output_hex); + + Status ReadSequential(bool print_kv, uint64_t read_num, bool has_from, + const std::string& from_key, bool has_to, + const std::string& to_key); + + Status ReadTableProperties( + std::shared_ptr* table_properties); + uint64_t GetReadNumber() { return read_num_; } + TableProperties* GetInitTableProperties() { return table_properties_.get(); } + + Status DumpTable(const std::string& out_filename); + Status getStatus() { return init_result_; } + + private: + Status NewTableReader(const std::string& file_path); + Status ReadTableProperties(uint64_t table_magic_number, + RandomAccessFile* file, uint64_t file_size); + Status SetTableOptionsByMagicNumber(uint64_t table_magic_number); + Status SetOldTableOptions(); + + std::string file_name_; + uint64_t read_num_; + bool verify_checksum_; + bool output_hex_; + EnvOptions soptions_; + + Status init_result_; + unique_ptr table_reader_; + unique_ptr file_; + // options_ and internal_comparator_ will also be used in + // ReadSequential internally (specifically, seek-related operations) + Options options_; + const 
ImmutableCFOptions ioptions_; + InternalKeyComparator internal_comparator_; + unique_ptr table_properties_; +}; + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/util/statistics.cc b/util/statistics.cc index 24957c9b6..ba7670bb4 100644 --- a/util/statistics.cc +++ b/util/statistics.cc @@ -5,7 +5,10 @@ // #include "util/statistics.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include #include "rocksdb/statistics.h" #include "port/likely.h" @@ -41,8 +44,8 @@ void StatisticsImpl::histogramData(uint32_t histogramType, HistogramData* const data) const { assert( enable_internal_stats_ ? - histogramType < INTERNAL_TICKER_ENUM_MAX : - histogramType < TICKER_ENUM_MAX); + histogramType < INTERNAL_HISTOGRAM_ENUM_MAX : + histogramType < HISTOGRAM_ENUM_MAX); // Return its own ticker version histograms_[histogramType].Data(data); } diff --git a/util/status.cc b/util/status.cc index 3165a497d..f0112d3e1 100644 --- a/util/status.cc +++ b/util/status.cc @@ -21,11 +21,10 @@ const char* Status::CopyState(const char* state) { return result; } -Status::Status(Code code, const Slice& msg, const Slice& msg2) : - code_(code) { - assert(code != kOk); - const uint32_t len1 = msg.size(); - const uint32_t len2 = msg2.size(); +Status::Status(Code _code, const Slice& msg, const Slice& msg2) : code_(_code) { + assert(code_ != kOk); + const uint32_t len1 = static_cast(msg.size()); + const uint32_t len2 = static_cast(msg2.size()); const uint32_t size = len1 + (len2 ? 
(2 + len2) : 0); char* result = new char[size + 4]; memcpy(result, &size, sizeof(size)); @@ -71,6 +70,9 @@ std::string Status::ToString() const { case kTimedOut: type = "Operation timed out: "; break; + case kAborted: + type = "Operation aborted: "; + break; default: snprintf(tmp, sizeof(tmp), "Unknown code(%d): ", static_cast(code())); diff --git a/util/string_util.cc b/util/string_util.cc index 97b7f9de9..4e0bc4668 100644 --- a/util/string_util.cc +++ b/util/string_util.cc @@ -10,7 +10,7 @@ namespace rocksdb { -std::vector stringSplit(std::string arg, char delim) { +std::vector StringSplit(const std::string& arg, char delim) { std::vector splits; std::stringstream ss(arg); std::string item; diff --git a/util/string_util.h b/util/string_util.h index 676f4aae8..2238a569b 100644 --- a/util/string_util.h +++ b/util/string_util.h @@ -10,6 +10,19 @@ #pragma once namespace rocksdb { -extern std::vector stringSplit(std::string arg, char delim); +extern std::vector StringSplit(const std::string& arg, char delim); + +template +inline std::string ToString(T value) { +#ifndef OS_ANDROID + return std::to_string(value); +#else + // Andorid doesn't support all of C++11, std::to_string() being + // one of the not supported features. 
+ std::ostringstream os; + os << value; + return os.str(); +#endif +} } // namespace rocksdb diff --git a/util/testharness.cc b/util/testharness.cc index 4208d2c46..967a8f20a 100644 --- a/util/testharness.cc +++ b/util/testharness.cc @@ -41,18 +41,29 @@ bool RegisterTest(const char* base, const char* name, void (*func)()) { int RunAllTests() { port::InstallStackTraceHandler(); - const char* matcher = getenv("ROCKSDB_TESTS"); + const char* one_matcher = getenv("ROCKSDB_TESTS"); + const char* from_matcher = getenv("ROCKSDB_TESTS_FROM"); int num = 0; + bool tests_on = (one_matcher == nullptr && from_matcher == nullptr); if (tests != nullptr) { for (unsigned int i = 0; i < tests->size(); i++) { const Test& t = (*tests)[i]; - if (matcher != nullptr) { - std::string name = t.base; - name.push_back('.'); - name.append(t.name); - if (strstr(name.c_str(), matcher) == nullptr) { - continue; + if (tests_on == false) { + if (one_matcher != nullptr || from_matcher != nullptr) { + std::string name = t.base; + name.push_back('.'); + name.append(t.name); + if (from_matcher != nullptr && + strstr(name.c_str(), from_matcher) != nullptr) { + tests_on = true; + } + if (!tests_on) { + if (one_matcher == nullptr || + strstr(name.c_str(), one_matcher) == nullptr) { + continue; + } + } } } fprintf(stderr, "==== Test %s.%s\n", t.base, t.name); @@ -64,9 +75,9 @@ int RunAllTests() { return 0; } -std::string TmpDir() { +std::string TmpDir(Env* env) { std::string dir; - Status s = Env::Default()->GetTestDirectory(&dir); + Status s = env->GetTestDirectory(&dir); ASSERT_TRUE(s.ok()) << s.ToString(); return dir; } diff --git a/util/testharness.h b/util/testharness.h index 52c29848d..e57b98a6f 100644 --- a/util/testharness.h +++ b/util/testharness.h @@ -12,18 +12,22 @@ #include #include #include +#include #include "port/stack_trace.h" #include "rocksdb/env.h" #include "rocksdb/slice.h" #include "util/random.h" +#include "util/string_util.h" namespace rocksdb { namespace test { // Run some of the 
tests registered by the TEST() macro. If the -// environment variable "ROCKSDB_TESTS" is not set, runs all tests. -// Otherwise, runs only the tests whose name contains the value of -// "ROCKSDB_TESTS" as a substring. E.g., suppose the tests are: +// environment variable "ROCKSDB_TESTS" and "ROCKSDB_TESTS_FROM" +// are not set, runs all tests. Otherwise, run all tests after +// ROCKSDB_TESTS_FROM and those specified by ROCKSDB_TESTS. +// Partial name match also works for ROCKSDB_TESTS and +// ROCKSDB_TESTS_FROM. E.g., suppose the tests are: // TEST(Foo, Hello) { ... } // TEST(Foo, World) { ... } // ROCKSDB_TESTS=Hello will run the first test @@ -35,7 +39,7 @@ namespace test { extern int RunAllTests(); // Return the directory to use for temporary storage. -extern std::string TmpDir(); +extern std::string TmpDir(Env* env = Env::Default()); // Return a randomization seed for this run. Typically returns the // same number on repeated invocations of this binary, but automated @@ -80,6 +84,14 @@ class Tester { return *this; } + Tester& IsNotOk(const Status& s) { + if (s.ok()) { + ss_ << " Error status expected"; + ok_ = false; + } + return *this; + } + #define BINARY_OP(name,op) \ template \ Tester& name(const X& x, const Y& y) { \ @@ -110,6 +122,7 @@ class Tester { #define ASSERT_TRUE(c) ::rocksdb::test::Tester(__FILE__, __LINE__).Is((c), #c) #define ASSERT_OK(s) ::rocksdb::test::Tester(__FILE__, __LINE__).IsOk((s)) +#define ASSERT_NOK(s) ::rocksdb::test::Tester(__FILE__, __LINE__).IsNotOk((s)) #define ASSERT_EQ(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsEq((a),(b)) #define ASSERT_NE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsNe((a),(b)) #define ASSERT_GE(a,b) ::rocksdb::test::Tester(__FILE__, __LINE__).IsGe((a),(b)) diff --git a/util/testutil.cc b/util/testutil.cc index 363b8ff19..20f22c2dc 100644 --- a/util/testutil.cc +++ b/util/testutil.cc @@ -23,6 +23,15 @@ Slice RandomString(Random* rnd, int len, std::string* dst) { return Slice(*dst); } +extern 
std::string RandomHumanReadableString(Random* rnd, int len) { + std::string ret; + ret.resize(len); + for (int i = 0; i < len; ++i) { + ret[i] = static_cast('a' + rnd->Uniform(26)); + } + return ret; +} + std::string RandomKey(Random* rnd, int len) { // Make sure to generate a wide variety of characters so we // test the boundary conditions for short-key optimizations. diff --git a/util/testutil.h b/util/testutil.h index c615fc1e7..b489e9175 100644 --- a/util/testutil.h +++ b/util/testutil.h @@ -21,6 +21,8 @@ namespace test { // references the generated data. extern Slice RandomString(Random* rnd, int len, std::string* dst); +extern std::string RandomHumanReadableString(Random* rnd, int len); + // Return a random key with the specified length that may contain interesting // characters (e.g. \x00, \xff, etc.). extern std::string RandomKey(Random* rnd, int len); @@ -76,6 +78,36 @@ class PlainInternalKeyComparator : public InternalKeyComparator { } }; +// A test comparator which compare two strings in this way: +// (1) first compare prefix of 8 bytes in alphabet order, +// (2) if two strings share the same prefix, sort the other part of the string +// in the reverse alphabet order. +// This helps simulate the case of compounded key of [entity][timestamp] and +// latest timestamp first. 
+class SimpleSuffixReverseComparator : public Comparator { + public: + SimpleSuffixReverseComparator() {} + + virtual const char* Name() const { return "SimpleSuffixReverseComparator"; } + + virtual int Compare(const Slice& a, const Slice& b) const { + Slice prefix_a = Slice(a.data(), 8); + Slice prefix_b = Slice(b.data(), 8); + int prefix_comp = prefix_a.compare(prefix_b); + if (prefix_comp != 0) { + return prefix_comp; + } else { + Slice suffix_a = Slice(a.data() + 8, a.size() - 8); + Slice suffix_b = Slice(b.data() + 8, b.size() - 8); + return -(suffix_a.compare(suffix_b)); + } + } + virtual void FindShortestSeparator(std::string* start, + const Slice& limit) const {} + + virtual void FindShortSuccessor(std::string* key) const {} +}; + // Returns a user key comparator that can be used for comparing two uint64_t // slices. Instead of comparing slices byte-wise, it compares all the 8 bytes // at once. Assumes same endian-ness is used though the database's lifetime. diff --git a/util/thread_list_test.cc b/util/thread_list_test.cc new file mode 100644 index 000000000..737b78fe3 --- /dev/null +++ b/util/thread_list_test.cc @@ -0,0 +1,343 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include +#include + +#include "util/thread_status_updater.h" +#include "util/testharness.h" +#include "rocksdb/db.h" + +#if ROCKSDB_USING_THREAD_STATUS + +namespace rocksdb { + +class SimulatedBackgroundTask { + public: + SimulatedBackgroundTask( + const void* db_key, const std::string& db_name, + const void* cf_key, const std::string& cf_name, + const ThreadStatus::OperationType operation_type = + ThreadStatus::OP_UNKNOWN, + const ThreadStatus::StateType state_type = + ThreadStatus::STATE_UNKNOWN) + : db_key_(db_key), db_name_(db_name), + cf_key_(cf_key), cf_name_(cf_name), + operation_type_(operation_type), state_type_(state_type), + should_run_(true), running_count_(0) { + Env::Default()->GetThreadStatusUpdater()->NewColumnFamilyInfo( + db_key_, db_name_, cf_key_, cf_name_); + } + + ~SimulatedBackgroundTask() { + Env::Default()->GetThreadStatusUpdater()->EraseDatabaseInfo(db_key_); + } + + void Run() { + std::unique_lock l(mutex_); + running_count_++; + Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(cf_key_); + Env::Default()->GetThreadStatusUpdater()->SetThreadOperation( + operation_type_); + Env::Default()->GetThreadStatusUpdater()->SetThreadState(state_type_); + while (should_run_) { + bg_cv_.wait(l); + } + Env::Default()->GetThreadStatusUpdater()->ClearThreadState(); + Env::Default()->GetThreadStatusUpdater()->ClearThreadOperation(); + Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(0); + running_count_--; + bg_cv_.notify_all(); + } + + void FinishAllTasks() { + std::unique_lock l(mutex_); + should_run_ = false; + bg_cv_.notify_all(); + } + + void WaitUntilScheduled(int job_count, Env* env) { + while (running_count_ < job_count) { + env->SleepForMicroseconds(1000); + } + } + + void WaitUntilDone() { + std::unique_lock l(mutex_); + while (running_count_ > 0) { + bg_cv_.wait(l); + } + } + + static void DoSimulatedTask(void* arg) { + reinterpret_cast(arg)->Run(); + } + + private: + const void* db_key_; + const std::string 
db_name_; + const void* cf_key_; + const std::string cf_name_; + const ThreadStatus::OperationType operation_type_; + const ThreadStatus::StateType state_type_; + std::mutex mutex_; + std::condition_variable bg_cv_; + bool should_run_; + std::atomic running_count_; +}; + +class ThreadListTest { + public: + ThreadListTest() { + } +}; + +TEST(ThreadListTest, EventTables) { + // verify the global tables for operations and states are properly indexed. + for (int type = 0; type != ThreadStatus::NUM_OP_TYPES; ++type) { + ASSERT_EQ(global_operation_table[type].type, type); + ASSERT_EQ(global_operation_table[type].name, + ThreadStatus::GetOperationName( + ThreadStatus::OperationType(type))); + } + + for (int type = 0; type != ThreadStatus::NUM_STATE_TYPES; ++type) { + ASSERT_EQ(global_state_table[type].type, type); + ASSERT_EQ(global_state_table[type].name, + ThreadStatus::GetStateName( + ThreadStatus::StateType(type))); + } +} + +TEST(ThreadListTest, SimpleColumnFamilyInfoTest) { + Env* env = Env::Default(); + const int kHighPriorityThreads = 3; + const int kLowPriorityThreads = 5; + const int kSimulatedHighPriThreads = kHighPriorityThreads - 1; + const int kSimulatedLowPriThreads = kLowPriorityThreads / 3; + env->SetBackgroundThreads(kHighPriorityThreads, Env::HIGH); + env->SetBackgroundThreads(kLowPriorityThreads, Env::LOW); + + SimulatedBackgroundTask running_task( + reinterpret_cast(1234), "running", + reinterpret_cast(5678), "pikachu"); + + for (int test = 0; test < kSimulatedHighPriThreads; ++test) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, + &running_task, Env::Priority::HIGH); + } + for (int test = 0; test < kSimulatedLowPriThreads; ++test) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, + &running_task, Env::Priority::LOW); + } + running_task.WaitUntilScheduled( + kSimulatedHighPriThreads + kSimulatedLowPriThreads, env); + + std::vector thread_list; + + // Verify the number of running threads in each pool. 
+ env->GetThreadList(&thread_list); + int running_count[ThreadStatus::NUM_THREAD_TYPES] = {0}; + for (auto thread_status : thread_list) { + if (thread_status.cf_name == "pikachu" && + thread_status.db_name == "running") { + running_count[thread_status.thread_type]++; + } + } + ASSERT_EQ( + running_count[ThreadStatus::HIGH_PRIORITY], + kSimulatedHighPriThreads); + ASSERT_EQ( + running_count[ThreadStatus::LOW_PRIORITY], + kSimulatedLowPriThreads); + ASSERT_EQ( + running_count[ThreadStatus::USER], 0); + + running_task.FinishAllTasks(); + running_task.WaitUntilDone(); + + // Verify none of the threads are running + env->GetThreadList(&thread_list); + + for (int i = 0; i < ThreadStatus::NUM_THREAD_TYPES; ++i) { + running_count[i] = 0; + } + for (auto thread_status : thread_list) { + if (thread_status.cf_name == "pikachu" && + thread_status.db_name == "running") { + running_count[thread_status.thread_type]++; + } + } + + ASSERT_EQ( + running_count[ThreadStatus::HIGH_PRIORITY], 0); + ASSERT_EQ( + running_count[ThreadStatus::LOW_PRIORITY], 0); + ASSERT_EQ( + running_count[ThreadStatus::USER], 0); +} + +namespace { + void UpdateStatusCounts( + const std::vector& thread_list, + int operation_counts[], int state_counts[]) { + for (auto thread_status : thread_list) { + operation_counts[thread_status.operation_type]++; + state_counts[thread_status.state_type]++; + } + } + + void VerifyAndResetCounts( + const int correct_counts[], int collected_counts[], int size) { + for (int i = 0; i < size; ++i) { + ASSERT_EQ(collected_counts[i], correct_counts[i]); + collected_counts[i] = 0; + } + } + + void UpdateCount( + int operation_counts[], int from_event, int to_event, int amount) { + operation_counts[from_event] -= amount; + operation_counts[to_event] += amount; + } +} // namespace + +TEST(ThreadListTest, SimpleEventTest) { + Env* env = Env::Default(); + + // simulated tasks + const int kFlushWriteTasks = 3; + SimulatedBackgroundTask flush_write_task( + reinterpret_cast(1234), 
"running", + reinterpret_cast(5678), "pikachu", + ThreadStatus::OP_FLUSH); + + const int kCompactionWriteTasks = 4; + SimulatedBackgroundTask compaction_write_task( + reinterpret_cast(1234), "running", + reinterpret_cast(5678), "pikachu", + ThreadStatus::OP_COMPACTION); + + const int kCompactionReadTasks = 5; + SimulatedBackgroundTask compaction_read_task( + reinterpret_cast(1234), "running", + reinterpret_cast(5678), "pikachu", + ThreadStatus::OP_COMPACTION); + + const int kCompactionWaitTasks = 6; + SimulatedBackgroundTask compaction_wait_task( + reinterpret_cast(1234), "running", + reinterpret_cast(5678), "pikachu", + ThreadStatus::OP_COMPACTION); + + // setup right answers + int correct_operation_counts[ThreadStatus::NUM_OP_TYPES] = {0}; + correct_operation_counts[ThreadStatus::OP_FLUSH] = + kFlushWriteTasks; + correct_operation_counts[ThreadStatus::OP_COMPACTION] = + kCompactionWriteTasks + kCompactionReadTasks + kCompactionWaitTasks; + + env->SetBackgroundThreads( + correct_operation_counts[ThreadStatus::OP_FLUSH], Env::HIGH); + env->SetBackgroundThreads( + correct_operation_counts[ThreadStatus::OP_COMPACTION], Env::LOW); + + // schedule the simulated tasks + for (int t = 0; t < kFlushWriteTasks; ++t) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, + &flush_write_task, Env::Priority::HIGH); + } + flush_write_task.WaitUntilScheduled(kFlushWriteTasks, env); + + for (int t = 0; t < kCompactionWriteTasks; ++t) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, + &compaction_write_task, Env::Priority::LOW); + } + compaction_write_task.WaitUntilScheduled(kCompactionWriteTasks, env); + + for (int t = 0; t < kCompactionReadTasks; ++t) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, + &compaction_read_task, Env::Priority::LOW); + } + compaction_read_task.WaitUntilScheduled(kCompactionReadTasks, env); + + for (int t = 0; t < kCompactionWaitTasks; ++t) { + env->Schedule(&SimulatedBackgroundTask::DoSimulatedTask, + 
&compaction_wait_task, Env::Priority::LOW); + } + compaction_wait_task.WaitUntilScheduled(kCompactionWaitTasks, env); + + // verify the thread-status + int operation_counts[ThreadStatus::NUM_OP_TYPES] = {0}; + int state_counts[ThreadStatus::NUM_STATE_TYPES] = {0}; + + std::vector thread_list; + env->GetThreadList(&thread_list); + UpdateStatusCounts(thread_list, operation_counts, state_counts); + VerifyAndResetCounts(correct_operation_counts, operation_counts, + ThreadStatus::NUM_OP_TYPES); + + // terminate compaction-wait tasks and see if the thread-status + // reflects this update + compaction_wait_task.FinishAllTasks(); + compaction_wait_task.WaitUntilDone(); + UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION, + ThreadStatus::OP_UNKNOWN, kCompactionWaitTasks); + + env->GetThreadList(&thread_list); + UpdateStatusCounts(thread_list, operation_counts, state_counts); + VerifyAndResetCounts(correct_operation_counts, operation_counts, + ThreadStatus::NUM_OP_TYPES); + + // terminate flush-write tasks and see if the thread-status + // reflects this update + flush_write_task.FinishAllTasks(); + flush_write_task.WaitUntilDone(); + UpdateCount(correct_operation_counts, ThreadStatus::OP_FLUSH, + ThreadStatus::OP_UNKNOWN, kFlushWriteTasks); + + env->GetThreadList(&thread_list); + UpdateStatusCounts(thread_list, operation_counts, state_counts); + VerifyAndResetCounts(correct_operation_counts, operation_counts, + ThreadStatus::NUM_OP_TYPES); + + // terminate compaction-write tasks and see if the thread-status + // reflects this update + compaction_write_task.FinishAllTasks(); + compaction_write_task.WaitUntilDone(); + UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION, + ThreadStatus::OP_UNKNOWN, kCompactionWriteTasks); + + env->GetThreadList(&thread_list); + UpdateStatusCounts(thread_list, operation_counts, state_counts); + VerifyAndResetCounts(correct_operation_counts, operation_counts, + ThreadStatus::NUM_OP_TYPES); + + // terminate 
compaction-write tasks and see if the thread-status + // reflects this update + compaction_read_task.FinishAllTasks(); + compaction_read_task.WaitUntilDone(); + UpdateCount(correct_operation_counts, ThreadStatus::OP_COMPACTION, + ThreadStatus::OP_UNKNOWN, kCompactionReadTasks); + + env->GetThreadList(&thread_list); + UpdateStatusCounts(thread_list, operation_counts, state_counts); + VerifyAndResetCounts(correct_operation_counts, operation_counts, + ThreadStatus::NUM_OP_TYPES); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + return rocksdb::test::RunAllTests(); +} + +#else + +int main(int argc, char** argv) { + return 0; +} + +#endif // ROCKSDB_USING_THREAD_STATUS diff --git a/util/thread_local.cc b/util/thread_local.cc index bc8a4c7d2..af0c8e12b 100644 --- a/util/thread_local.cc +++ b/util/thread_local.cc @@ -10,7 +10,7 @@ #include "util/thread_local.h" #include "util/mutexlock.h" #include "port/likely.h" - +#include namespace rocksdb { @@ -36,7 +36,7 @@ void ThreadLocalPtr::StaticMeta::OnThreadExit(void* ptr) { // Unref stored pointers of current thread from all instances uint32_t id = 0; for (auto& e : tls->entries) { - void* raw = e.ptr.load(std::memory_order_relaxed); + void* raw = e.ptr.load(); if (raw != nullptr) { auto unref = inst->GetHandler(id); if (unref != nullptr) { @@ -51,7 +51,7 @@ void ThreadLocalPtr::StaticMeta::OnThreadExit(void* ptr) { ThreadLocalPtr::StaticMeta::StaticMeta() : next_instance_id_(0) { if (pthread_key_create(&pthread_key_, &OnThreadExit) != 0) { - throw std::runtime_error("pthread_key_create failed"); + abort(); } head_.next = &head_; head_.prev = &head_; @@ -98,7 +98,7 @@ ThreadLocalPtr::ThreadData* ThreadLocalPtr::StaticMeta::GetThreadLocal() { inst->RemoveThreadData(tls_); } delete tls_; - throw std::runtime_error("pthread_setspecific failed"); + abort(); } } return tls_; @@ -109,7 +109,7 @@ void* ThreadLocalPtr::StaticMeta::Get(uint32_t id) const { if (UNLIKELY(id >= tls->entries.size())) { return nullptr; } 
- return tls->entries[id].ptr.load(std::memory_order_relaxed); + return tls->entries[id].ptr.load(std::memory_order_acquire); } void ThreadLocalPtr::StaticMeta::Reset(uint32_t id, void* ptr) { @@ -119,7 +119,7 @@ void ThreadLocalPtr::StaticMeta::Reset(uint32_t id, void* ptr) { MutexLock l(&mutex_); tls->entries.resize(id + 1); } - tls->entries[id].ptr.store(ptr, std::memory_order_relaxed); + tls->entries[id].ptr.store(ptr, std::memory_order_release); } void* ThreadLocalPtr::StaticMeta::Swap(uint32_t id, void* ptr) { @@ -129,7 +129,7 @@ void* ThreadLocalPtr::StaticMeta::Swap(uint32_t id, void* ptr) { MutexLock l(&mutex_); tls->entries.resize(id + 1); } - return tls->entries[id].ptr.exchange(ptr, std::memory_order_relaxed); + return tls->entries[id].ptr.exchange(ptr, std::memory_order_acquire); } bool ThreadLocalPtr::StaticMeta::CompareAndSwap(uint32_t id, void* ptr, @@ -140,8 +140,8 @@ bool ThreadLocalPtr::StaticMeta::CompareAndSwap(uint32_t id, void* ptr, MutexLock l(&mutex_); tls->entries.resize(id + 1); } - return tls->entries[id].ptr.compare_exchange_strong(expected, ptr, - std::memory_order_relaxed, std::memory_order_relaxed); + return tls->entries[id].ptr.compare_exchange_strong( + expected, ptr, std::memory_order_release, std::memory_order_relaxed); } void ThreadLocalPtr::StaticMeta::Scrape(uint32_t id, autovector* ptrs, @@ -150,7 +150,7 @@ void ThreadLocalPtr::StaticMeta::Scrape(uint32_t id, autovector* ptrs, for (ThreadData* t = head_.next; t != &head_; t = t->next) { if (id < t->entries.size()) { void* ptr = - t->entries[id].ptr.exchange(replacement, std::memory_order_relaxed); + t->entries[id].ptr.exchange(replacement, std::memory_order_acquire); if (ptr != nullptr) { ptrs->push_back(ptr); } @@ -198,8 +198,7 @@ void ThreadLocalPtr::StaticMeta::ReclaimId(uint32_t id) { auto unref = GetHandler(id); for (ThreadData* t = head_.next; t != &head_; t = t->next) { if (id < t->entries.size()) { - void* ptr = - t->entries[id].ptr.exchange(nullptr, 
std::memory_order_relaxed); + void* ptr = t->entries[id].ptr.exchange(nullptr); if (ptr != nullptr && unref != nullptr) { unref(ptr); } diff --git a/util/thread_local.h b/util/thread_local.h index a037a9ceb..6884ed138 100644 --- a/util/thread_local.h +++ b/util/thread_local.h @@ -26,13 +26,14 @@ namespace rocksdb { // (2) a ThreadLocalPtr is destroyed typedef void (*UnrefHandler)(void* ptr); -// Thread local storage that only stores value of pointer type. The storage -// distinguish data coming from different thread and different ThreadLocalPtr -// instances. For example, if a regular thread_local variable A is declared -// in DBImpl, two DBImpl objects would share the same A. ThreadLocalPtr avoids -// the confliction. The total storage size equals to # of threads * # of -// ThreadLocalPtr instances. It is not efficient in terms of space, but it -// should serve most of our use cases well and keep code simple. +// ThreadLocalPtr stores only values of pointer type. Different from +// the usual thread-local-storage, ThreadLocalPtr has the ability to +// distinguish data coming from different threads and different +// ThreadLocalPtr instances. For example, if a regular thread_local +// variable A is declared in DBImpl, two DBImpl objects would share +// the same A. However, a ThreadLocalPtr that is defined under the +// scope of DBImpl can avoid such confliction. As a result, its memory +// usage would be O(# of threads * # of ThreadLocalPtr instances). 
class ThreadLocalPtr { public: explicit ThreadLocalPtr(UnrefHandler handler = nullptr); diff --git a/util/thread_local_test.cc b/util/thread_local_test.cc index 70dfa956e..155ef243c 100644 --- a/util/thread_local_test.cc +++ b/util/thread_local_test.cc @@ -24,11 +24,11 @@ class ThreadLocalTest { namespace { struct Params { - Params(port::Mutex* m, port::CondVar* c, int* unref, int n, + Params(port::Mutex* m, port::CondVar* c, int* u, int n, UnrefHandler handler = nullptr) : mu(m), cv(c), - unref(unref), + unref(u), total(n), started(0), completed(0), @@ -112,24 +112,24 @@ TEST(ThreadLocalTest, SequentialReadWriteTest) { p.tls2 = &tls2; auto func = [](void* ptr) { - auto& p = *static_cast(ptr); - - ASSERT_TRUE(p.tls1.Get() == nullptr); - p.tls1.Reset(reinterpret_cast(1)); - ASSERT_TRUE(p.tls1.Get() == reinterpret_cast(1)); - p.tls1.Reset(reinterpret_cast(2)); - ASSERT_TRUE(p.tls1.Get() == reinterpret_cast(2)); - - ASSERT_TRUE(p.tls2->Get() == nullptr); - p.tls2->Reset(reinterpret_cast(1)); - ASSERT_TRUE(p.tls2->Get() == reinterpret_cast(1)); - p.tls2->Reset(reinterpret_cast(2)); - ASSERT_TRUE(p.tls2->Get() == reinterpret_cast(2)); - - p.mu->Lock(); - ++(p.completed); - p.cv->SignalAll(); - p.mu->Unlock(); + auto& params = *static_cast(ptr); + + ASSERT_TRUE(params.tls1.Get() == nullptr); + params.tls1.Reset(reinterpret_cast(1)); + ASSERT_TRUE(params.tls1.Get() == reinterpret_cast(1)); + params.tls1.Reset(reinterpret_cast(2)); + ASSERT_TRUE(params.tls1.Get() == reinterpret_cast(2)); + + ASSERT_TRUE(params.tls2->Get() == nullptr); + params.tls2->Reset(reinterpret_cast(1)); + ASSERT_TRUE(params.tls2->Get() == reinterpret_cast(1)); + params.tls2->Reset(reinterpret_cast(2)); + ASSERT_TRUE(params.tls2->Get() == reinterpret_cast(2)); + + params.mu->Lock(); + ++(params.completed); + params.cv->SignalAll(); + params.mu->Unlock(); }; for (int iter = 0; iter < 1024; ++iter) { diff --git a/util/thread_operation.h b/util/thread_operation.h new file mode 100644 index 
000000000..45521e227 --- /dev/null +++ b/util/thread_operation.h @@ -0,0 +1,69 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// This file defines the structures for thread operation and state. +// Thread operations are used to describe high level action of a +// thread such as doing compaction or flush, while thread state +// are used to describe lower-level action such as reading / +// writing a file or waiting for a mutex. Operations and states +// are designed to be independent. Typically, a thread usually involves +// in one operation and one state at any specific point in time. + +#pragma once + +#include "include/rocksdb/thread_status.h" + +#include + +namespace rocksdb { + +#if ROCKSDB_USING_THREAD_STATUS + +// The structure that describes a major thread operation. +struct OperationInfo { + const ThreadStatus::OperationType type; + const std::string name; +}; + +// The global operation table. +// +// When updating a status of a thread, the pointer of the OperationInfo +// of the current ThreadStatusData will be pointing to one of the +// rows in this global table. +// +// Note that it's not designed to be constant as in the future we +// might consider adding global count to the OperationInfo. +static OperationInfo global_operation_table[] = { + {ThreadStatus::OP_UNKNOWN, ""}, + {ThreadStatus::OP_COMPACTION, "Compaction"}, + {ThreadStatus::OP_FLUSH, "Flush"} +}; + +// The structure that describes a state. +struct StateInfo { + const ThreadStatus::StateType type; + const std::string name; +}; + +// The global state table. +// +// When updating a status of a thread, the pointer of the StateInfo +// of the current ThreadStatusData will be pointing to one of the +// rows in this global table. 
+static StateInfo global_state_table[] = { + {ThreadStatus::STATE_UNKNOWN, ""}, + {ThreadStatus::STATE_MUTEX_WAIT, "Mutex Wait"}, +}; + +#else + +struct OperationInfo { +}; + +struct StateInfo { +}; + +#endif // ROCKSDB_USING_THREAD_STATUS +} // namespace rocksdb diff --git a/util/thread_status_impl.cc b/util/thread_status_impl.cc new file mode 100644 index 000000000..faeadf302 --- /dev/null +++ b/util/thread_status_impl.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// + +#include "rocksdb/thread_status.h" +#include "util/thread_operation.h" + +namespace rocksdb { + +#if ROCKSDB_USING_THREAD_STATUS +const std::string& ThreadStatus::GetOperationName( + ThreadStatus::OperationType op_type) { + return global_operation_table[op_type].name; +} + +const std::string& ThreadStatus::GetStateName( + ThreadStatus::StateType state_type) { + return global_state_table[state_type].name; +} + +#else + +const std::string& ThreadStatus::GetOperationName( + ThreadStatus::OperationType op_type) { + static std::string dummy_str = ""; + return dummy_str; +} + +const std::string& ThreadStatus::GetStateName( + ThreadStatus::StateType state_type) { + static std::string dummy_str = ""; + return dummy_str; +} + +#endif // ROCKSDB_USING_THREAD_STATUS +} // namespace rocksdb diff --git a/util/thread_status_updater.cc b/util/thread_status_updater.cc new file mode 100644 index 000000000..25f7b1c5c --- /dev/null +++ b/util/thread_status_updater.cc @@ -0,0 +1,256 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include +#include "port/likely.h" +#include "util/mutexlock.h" +#include "util/thread_status_updater.h" + +namespace rocksdb { + +#if ROCKSDB_USING_THREAD_STATUS + +__thread ThreadStatusData* ThreadStatusUpdater::thread_status_data_ = nullptr; + +void ThreadStatusUpdater::UnregisterThread() { + if (thread_status_data_ != nullptr) { + std::lock_guard lck(thread_list_mutex_); + thread_data_set_.erase(thread_status_data_); + delete thread_status_data_; + thread_status_data_ = nullptr; + } +} + +void ThreadStatusUpdater::SetThreadType( + ThreadStatus::ThreadType ttype) { + auto* data = InitAndGet(); + data->thread_type.store(ttype, std::memory_order_relaxed); +} + +void ThreadStatusUpdater::ResetThreadStatus() { + ClearThreadState(); + ClearThreadOperation(); + SetColumnFamilyInfoKey(nullptr); +} + +void ThreadStatusUpdater::SetColumnFamilyInfoKey( + const void* cf_key) { + auto* data = InitAndGet(); + // set the tracking flag based on whether cf_key is non-null or not. + // If enable_thread_tracking is set to false, the input cf_key + // would be nullptr. 
+ data->enable_tracking = (cf_key != nullptr); + data->cf_key.store(cf_key, std::memory_order_relaxed); +} + +const void* ThreadStatusUpdater::GetColumnFamilyInfoKey() { + auto* data = InitAndGet(); + if (data->enable_tracking == false) { + return nullptr; + } + return data->cf_key.load(std::memory_order_relaxed); +} + +void ThreadStatusUpdater::SetThreadOperation( + const ThreadStatus::OperationType type) { + auto* data = InitAndGet(); + if (!data->enable_tracking) { + assert(data->cf_key.load(std::memory_order_relaxed) == nullptr); + return; + } + data->operation_type.store(type, std::memory_order_relaxed); +} + +void ThreadStatusUpdater::ClearThreadOperation() { + auto* data = InitAndGet(); + if (!data->enable_tracking) { + assert(data->cf_key.load(std::memory_order_relaxed) == nullptr); + return; + } + data->operation_type.store( + ThreadStatus::OP_UNKNOWN, std::memory_order_relaxed); +} + +void ThreadStatusUpdater::SetThreadState( + const ThreadStatus::StateType type) { + auto* data = InitAndGet(); + if (!data->enable_tracking) { + assert(data->cf_key.load(std::memory_order_relaxed) == nullptr); + return; + } + data->state_type.store(type, std::memory_order_relaxed); +} + +void ThreadStatusUpdater::ClearThreadState() { + auto* data = InitAndGet(); + if (!data->enable_tracking) { + assert(data->cf_key.load(std::memory_order_relaxed) == nullptr); + return; + } + data->state_type.store( + ThreadStatus::STATE_UNKNOWN, std::memory_order_relaxed); +} + +Status ThreadStatusUpdater::GetThreadList( + std::vector* thread_list) { + thread_list->clear(); + std::vector> valid_list; + + std::lock_guard lck(thread_list_mutex_); + for (auto* thread_data : thread_data_set_) { + assert(thread_data); + auto thread_type = thread_data->thread_type.load( + std::memory_order_relaxed); + // Since any change to cf_info_map requires thread_list_mutex, + // which is currently held by GetThreadList(), here we can safely + // use "memory_order_relaxed" to load the cf_key. 
+ auto cf_key = thread_data->cf_key.load( + std::memory_order_relaxed); + auto iter = cf_info_map_.find(cf_key); + assert(cf_key == 0 || iter != cf_info_map_.end()); + auto* cf_info = iter != cf_info_map_.end() ? + iter->second.get() : nullptr; + const std::string* db_name = nullptr; + const std::string* cf_name = nullptr; + ThreadStatus::OperationType op_type = ThreadStatus::OP_UNKNOWN; + ThreadStatus::StateType state_type = ThreadStatus::STATE_UNKNOWN; + if (cf_info != nullptr) { + db_name = &cf_info->db_name; + cf_name = &cf_info->cf_name; + op_type = thread_data->operation_type.load( + std::memory_order_relaxed); + // display lower-level info only when higher-level info is available. + if (op_type != ThreadStatus::OP_UNKNOWN) { + state_type = thread_data->state_type.load( + std::memory_order_relaxed); + } + } + thread_list->emplace_back( + thread_data->thread_id, thread_type, + db_name ? *db_name : "", + cf_name ? *cf_name : "", + op_type, state_type); + } + + return Status::OK(); +} + +ThreadStatusData* ThreadStatusUpdater::InitAndGet() { + if (UNLIKELY(thread_status_data_ == nullptr)) { + thread_status_data_ = new ThreadStatusData(); + thread_status_data_->thread_id = reinterpret_cast( + thread_status_data_); + std::lock_guard lck(thread_list_mutex_); + thread_data_set_.insert(thread_status_data_); + } + return thread_status_data_; +} + +void ThreadStatusUpdater::NewColumnFamilyInfo( + const void* db_key, const std::string& db_name, + const void* cf_key, const std::string& cf_name) { + // Acquiring same lock as GetThreadList() to guarantee + // a consistent view of global column family table (cf_info_map). 
+ std::lock_guard lck(thread_list_mutex_); + + cf_info_map_[cf_key].reset( + new ConstantColumnFamilyInfo(db_key, db_name, cf_name)); + db_key_map_[db_key].insert(cf_key); +} + +void ThreadStatusUpdater::EraseColumnFamilyInfo(const void* cf_key) { + // Acquiring same lock as GetThreadList() to guarantee + // a consistent view of global column family table (cf_info_map). + std::lock_guard lck(thread_list_mutex_); + auto cf_pair = cf_info_map_.find(cf_key); + assert(cf_pair != cf_info_map_.end()); + + auto* cf_info = cf_pair->second.get(); + assert(cf_info); + + // Remove its entry from db_key_map_ by the following steps: + // 1. Obtain the entry in db_key_map_ whose set contains cf_key + // 2. Remove it from the set. + auto db_pair = db_key_map_.find(cf_info->db_key); + assert(db_pair != db_key_map_.end()); + size_t result __attribute__((unused)) = db_pair->second.erase(cf_key); + assert(result); + + cf_pair->second.reset(); + result = cf_info_map_.erase(cf_key); + assert(result); +} + +void ThreadStatusUpdater::EraseDatabaseInfo(const void* db_key) { + // Acquiring same lock as GetThreadList() to guarantee + // a consistent view of global column family table (cf_info_map). + std::lock_guard lck(thread_list_mutex_); + auto db_pair = db_key_map_.find(db_key); + if (UNLIKELY(db_pair == db_key_map_.end())) { + // In some occasional cases such as DB::Open fails, we won't + // register ColumnFamilyInfo for a db. 
+ return; + } + + size_t result __attribute__((unused)) = 0; + for (auto cf_key : db_pair->second) { + auto cf_pair = cf_info_map_.find(cf_key); + assert(cf_pair != cf_info_map_.end()); + cf_pair->second.reset(); + result = cf_info_map_.erase(cf_key); + assert(result); + } + db_key_map_.erase(db_key); +} + +#else + +void ThreadStatusUpdater::UnregisterThread() { +} + +void ThreadStatusUpdater::ResetThreadStatus() { +} + +void ThreadStatusUpdater::SetThreadType( + ThreadStatus::ThreadType ttype) { +} + +void ThreadStatusUpdater::SetColumnFamilyInfoKey( + const void* cf_key) { +} + +void ThreadStatusUpdater::SetThreadOperation( + const ThreadStatus::OperationType type) { +} + +void ThreadStatusUpdater::ClearThreadOperation() { +} + +void ThreadStatusUpdater::SetThreadState( + const ThreadStatus::StateType type) { +} + +void ThreadStatusUpdater::ClearThreadState() { +} + +Status ThreadStatusUpdater::GetThreadList( + std::vector* thread_list) { + return Status::NotSupported( + "GetThreadList is not supported in the current running environment."); +} + +void ThreadStatusUpdater::NewColumnFamilyInfo( + const void* db_key, const std::string& db_name, + const void* cf_key, const std::string& cf_name) { +} + +void ThreadStatusUpdater::EraseColumnFamilyInfo(const void* cf_key) { +} + +void ThreadStatusUpdater::EraseDatabaseInfo(const void* db_key) { +} + +#endif // ROCKSDB_USING_THREAD_STATUS +} // namespace rocksdb diff --git a/util/thread_status_updater.h b/util/thread_status_updater.h new file mode 100644 index 000000000..5d4e55bb2 --- /dev/null +++ b/util/thread_status_updater.h @@ -0,0 +1,195 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// The implementation of ThreadStatus. 
+// +// Note that we make get and set access to ThreadStatusData lockless. +// As a result, ThreadStatusData as a whole is not atomic. However, +// we guarantee consistent ThreadStatusData all the time whenever +// user call GetThreadList(). This consistency guarantee is done +// by having the following constraint in the internal implementation +// of set and get order: +// +// 1. When reset any information in ThreadStatusData, always start from +// clearing up the lower-level information first. +// 2. When setting any information in ThreadStatusData, always start from +// setting the higher-level information. +// 3. When returning ThreadStatusData to the user, fields are fetched from +// higher-level to lower-level. In addition, where there's a nullptr +// in one field, then all fields that has lower-level than that field +// should be ignored. +// +// The high to low level information would be: +// thread_id > thread_type > db > cf > operation > state +// +// This means user might not always get full information, but whenever +// returned by the GetThreadList() is guaranteed to be consistent. +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rocksdb/status.h" +#include "rocksdb/thread_status.h" +#include "port/port_posix.h" +#include "util/thread_operation.h" + +namespace rocksdb { + +class ColumnFamilyHandle; + +// The structure that keeps constant information about a column family. +struct ConstantColumnFamilyInfo { +#if ROCKSDB_USING_THREAD_STATUS + public: + ConstantColumnFamilyInfo( + const void* _db_key, + const std::string& _db_name, + const std::string& _cf_name) : + db_key(_db_key), db_name(_db_name), cf_name(_cf_name) {} + const void* db_key; + const std::string db_name; + const std::string cf_name; +#endif // ROCKSDB_USING_THREAD_STATUS +}; + +// the internal data-structure that is used to reflect the current +// status of a thread using a set of atomic pointers. 
+struct ThreadStatusData { +#if ROCKSDB_USING_THREAD_STATUS + explicit ThreadStatusData() : thread_id(0), enable_tracking(false) { + thread_type.store(ThreadStatus::USER); + cf_key.store(nullptr); + operation_type.store(ThreadStatus::OP_UNKNOWN); + state_type.store(ThreadStatus::STATE_UNKNOWN); + } + + uint64_t thread_id; + + // A flag to indicate whether the thread tracking is enabled + // in the current thread. This value will be updated based on whether + // the associated Options::enable_thread_tracking is set to true + // in ThreadStatusUtil::SetColumnFamily(). + // + // If set to false, then SetThreadOperation and SetThreadState + // will be no-op. + bool enable_tracking; + + std::atomic thread_type; + std::atomic cf_key; + std::atomic operation_type; + std::atomic state_type; +#endif // ROCKSDB_USING_THREAD_STATUS +}; + +// The class that stores and updates the status of the current thread +// using a thread-local ThreadStatusData. +// +// In most cases, you should use ThreadStatusUtil to update +// the status of the current thread instead of using ThreadStatusUpdater +// directly. +// +// @see ThreadStatusUtil +class ThreadStatusUpdater { + public: + ThreadStatusUpdater() {} + + // Releases all ThreadStatusData of all active threads. + virtual ~ThreadStatusUpdater() {} + + // Unregister the current thread. + void UnregisterThread(); + + // Reset the status of the current thread. This includes resetting + // ColumnFamilyInfoKey, ThreadOperation, and ThreadState. + void ResetThreadStatus(); + + // Set the thread type of the current thread. + void SetThreadType(ThreadStatus::ThreadType ttype); + + // Update the column-family info of the current thread by setting + // its thread-local pointer of ThreadStateInfo to the correct entry. + void SetColumnFamilyInfoKey(const void* cf_key); + + // returns the column family info key. + const void* GetColumnFamilyInfoKey(); + + // Update the thread operation of the current thread. 
+ void SetThreadOperation(const ThreadStatus::OperationType type); + + // Clear thread operation of the current thread. + void ClearThreadOperation(); + + // Update the thread state of the current thread. + void SetThreadState(const ThreadStatus::StateType type); + + // Clear the thread state of the current thread. + void ClearThreadState(); + + // Obtain the status of all active registered threads. + Status GetThreadList( + std::vector* thread_list); + + // Create an entry in the global ColumnFamilyInfo table for the + // specified column family. This function should be called only + // when the current thread does not hold db_mutex. + void NewColumnFamilyInfo( + const void* db_key, const std::string& db_name, + const void* cf_key, const std::string& cf_name); + + // Erase all ConstantColumnFamilyInfo that is associated with the + // specified db instance. This function should be called only when + // the current thread does not hold db_mutex. + void EraseDatabaseInfo(const void* db_key); + + // Erase the ConstantColumnFamilyInfo that is associated with the + // specified ColumnFamilyData. This function should be called only + // when the current thread does not hold db_mutex. + void EraseColumnFamilyInfo(const void* cf_key); + + // Verifies whether the input ColumnFamilyHandles matches + // the information stored in the current cf_info_map. + void TEST_VerifyColumnFamilyInfoMap( + const std::vector& handles, + bool check_exist); + + protected: +#if ROCKSDB_USING_THREAD_STATUS + // The thread-local variable for storing thread status. + static __thread ThreadStatusData* thread_status_data_; + + // Obtain the pointer to the thread status data. It also performs + // initialization when necessary. + ThreadStatusData* InitAndGet(); + + // The mutex that protects cf_info_map and db_key_map. + std::mutex thread_list_mutex_; + + // The current status data of all active threads. 
+  std::unordered_set thread_data_set_; + + // A global map that keeps the column family information. It is stored + // globally instead of inside DB to avoid the situation where DB is + // closing while the GetThreadList function has already gotten the pointer to its + // ConstantColumnFamilyInfo. + std::unordered_map< + const void*, std::unique_ptr> cf_info_map_; + + // A db_key to cf_key map that allows erasing elements in cf_info_map + // associated to the same db_key faster. + std::unordered_map< + const void*, std::unordered_set> db_key_map_; + +#else + static ThreadStatusData* thread_status_data_; +#endif // ROCKSDB_USING_THREAD_STATUS +}; + +} // namespace rocksdb diff --git a/util/thread_status_updater_debug.cc b/util/thread_status_updater_debug.cc new file mode 100644 index 000000000..274f427d3 --- /dev/null +++ b/util/thread_status_updater_debug.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include + +#include "util/thread_status_updater.h" +#include "db/column_family.h" + +namespace rocksdb { + +#ifndef NDEBUG +#if ROCKSDB_USING_THREAD_STATUS +void ThreadStatusUpdater::TEST_VerifyColumnFamilyInfoMap( + const std::vector& handles, + bool check_exist) { + std::unique_lock lock(thread_list_mutex_); + if (check_exist) { + assert(cf_info_map_.size() == handles.size()); + } + for (auto* handle : handles) { + auto* cfd = reinterpret_cast(handle)->cfd(); + auto iter __attribute__((unused)) = cf_info_map_.find(cfd); + if (check_exist) { + assert(iter != cf_info_map_.end()); + assert(iter->second); + assert(iter->second->cf_name == cfd->GetName()); + } else { + assert(iter == cf_info_map_.end()); + } + } +} + +#else + +void ThreadStatusUpdater::TEST_VerifyColumnFamilyInfoMap( + const std::vector& handles, + bool check_exist) { +} + +#endif // ROCKSDB_USING_THREAD_STATUS +#endif // !NDEBUG + + +} // namespace rocksdb diff --git a/util/thread_status_util.cc b/util/thread_status_util.cc new file mode 100644 index 000000000..970f79ae8 --- /dev/null +++ b/util/thread_status_util.cc @@ -0,0 +1,146 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+ +#include "rocksdb/env.h" +#include "util/thread_status_updater.h" +#include "util/thread_status_util.h" + +namespace rocksdb { + + +#if ROCKSDB_USING_THREAD_STATUS +__thread ThreadStatusUpdater* + ThreadStatusUtil::thread_updater_local_cache_ = nullptr; +__thread bool ThreadStatusUtil::thread_updater_initialized_ = false; + +void ThreadStatusUtil::SetThreadType( + const Env* env, ThreadStatus::ThreadType thread_type) { + if (!MaybeInitThreadLocalUpdater(env)) { + return; + } + assert(thread_updater_local_cache_); + thread_updater_local_cache_->SetThreadType(thread_type); +} + +void ThreadStatusUtil::UnregisterThread() { + thread_updater_initialized_ = false; + if (thread_updater_local_cache_ != nullptr) { + thread_updater_local_cache_->UnregisterThread(); + thread_updater_local_cache_ = nullptr; + } +} + +void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* cfd) { + if (!MaybeInitThreadLocalUpdater(cfd->ioptions()->env)) { + return; + } + assert(thread_updater_local_cache_); + if (cfd != nullptr && cfd->options()->enable_thread_tracking) { + thread_updater_local_cache_->SetColumnFamilyInfoKey(cfd); + } else { + // When cfd == nullptr or enable_thread_tracking == false, we set + // ColumnFamilyInfoKey to nullptr, which makes SetThreadOperation + // and SetThreadState become no-op. + thread_updater_local_cache_->SetColumnFamilyInfoKey(nullptr); + } +} + +void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType op) { + if (thread_updater_local_cache_ == nullptr) { + // thread_updater_local_cache_ must be set in SetColumnFamily + // or other ThreadStatusUtil functions. + return; + } + + thread_updater_local_cache_->SetThreadOperation(op); +} + +void ThreadStatusUtil::SetThreadState(ThreadStatus::StateType state) { + if (thread_updater_local_cache_ == nullptr) { + // thread_updater_local_cache_ must be set in SetColumnFamily + // or other ThreadStatusUtil functions. 
+ return; + } + + thread_updater_local_cache_->SetThreadState(state); +} + +void ThreadStatusUtil::ResetThreadStatus() { + if (thread_updater_local_cache_ == nullptr) { + return; + } + thread_updater_local_cache_->ResetThreadStatus(); +} + +void ThreadStatusUtil::NewColumnFamilyInfo( + const DB* db, const ColumnFamilyData* cfd) { + if (!MaybeInitThreadLocalUpdater(cfd->ioptions()->env)) { + return; + } + assert(thread_updater_local_cache_); + if (thread_updater_local_cache_) { + thread_updater_local_cache_->NewColumnFamilyInfo( + db, db->GetName(), cfd, cfd->GetName()); + } +} + +void ThreadStatusUtil::EraseColumnFamilyInfo( + const ColumnFamilyData* cfd) { + if (thread_updater_local_cache_ == nullptr) { + return; + } + thread_updater_local_cache_->EraseColumnFamilyInfo(cfd); +} + +void ThreadStatusUtil::EraseDatabaseInfo(const DB* db) { + if (thread_updater_local_cache_ == nullptr) { + return; + } + thread_updater_local_cache_->EraseDatabaseInfo(db); +} + +bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* env) { + if (!thread_updater_initialized_ && env != nullptr) { + thread_updater_initialized_ = true; + thread_updater_local_cache_ = env->GetThreadStatusUpdater(); + } + return (thread_updater_local_cache_ != nullptr); +} + +#else + +ThreadStatusUpdater* ThreadStatusUtil::thread_updater_local_cache_ = nullptr; +bool ThreadStatusUtil::thread_updater_initialized_ = false; + +bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* env) { + return false; +} + +void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* cfd) { +} + +void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType op) { +} + +void ThreadStatusUtil::SetThreadState(ThreadStatus::StateType state) { +} + +void ThreadStatusUtil::NewColumnFamilyInfo( + const DB* db, const ColumnFamilyData* cfd) { +} + +void ThreadStatusUtil::EraseColumnFamilyInfo( + const ColumnFamilyData* cfd) { +} + +void ThreadStatusUtil::EraseDatabaseInfo(const DB* db) { +} + +void 
ThreadStatusUtil::ResetThreadStatus() { +} + +#endif // ROCKSDB_USING_THREAD_STATUS + +} // namespace rocksdb diff --git a/util/thread_status_util.h b/util/thread_status_util.h new file mode 100644 index 000000000..8428d492c --- /dev/null +++ b/util/thread_status_util.h @@ -0,0 +1,111 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include "db/column_family.h" +#include "rocksdb/env.h" +#include "rocksdb/thread_status.h" +#include "util/thread_status_updater.h" + +namespace rocksdb { +class ColumnFamilyData; + + +// The static utility class for updating thread-local status. +// +// The thread-local status is updated via the thread-local cached +// pointer thread_updater_local_cache_. During each function call, +// when ThreadStatusUtil finds thread_updater_local_cache_ is +// left uninitialized (determined by thread_updater_initialized_), +// it will try to initialize it using the return value of +// Env::GetThreadStatusUpdater(). When thread_updater_local_cache_ +// is initialized by a non-null pointer, each function call will +// then update the status of the current thread. Otherwise, +// all function calls to ThreadStatusUtil will be no-op. +class ThreadStatusUtil { + public: + // Set the thread type of the current thread. + static void SetThreadType( + const Env* env, ThreadStatus::ThreadType thread_type); + + // Unregister the current thread. + static void UnregisterThread(); + + // Create an entry in the global ColumnFamilyInfo table for the + // specified column family. This function should be called only + // when the current thread does not hold db_mutex. 
+ static void NewColumnFamilyInfo( + const DB* db, const ColumnFamilyData* cfd); + + // Erase the ConstantColumnFamilyInfo that is associated with the + // specified ColumnFamilyData. This function should be called only + // when the current thread does not hold db_mutex. + static void EraseColumnFamilyInfo(const ColumnFamilyData* cfd); + + // Erase all ConstantColumnFamilyInfo that is associated with the + // specified db instance. This function should be called only when + // the current thread does not hold db_mutex. + static void EraseDatabaseInfo(const DB* db); + + // Update the thread status to indicate the current thread is doing + // something related to the specified column family. + static void SetColumnFamily(const ColumnFamilyData* cfd); + + static void SetThreadOperation(ThreadStatus::OperationType type); + + static void SetThreadState(ThreadStatus::StateType type); + + static void ResetThreadStatus(); + +#ifndef NDEBUG + static void TEST_SetOperationDelay( + const ThreadStatus::OperationType operation, int micro); + static void TEST_OperationDelay( + const ThreadStatus::OperationType operation); + static void TEST_SetStateDelay( + const ThreadStatus::StateType state, int micro); + static void TEST_StateDelay(const ThreadStatus::StateType state); +#endif + + protected: + // Initialize the thread-local ThreadStatusUpdater when it finds + // the cached value is nullptr. Returns true if it has cached + // a non-null pointer. + static bool MaybeInitThreadLocalUpdater(const Env* env); + +#if ROCKSDB_USING_THREAD_STATUS + // A boolean flag indicating whether thread_updater_local_cache_ + // is initialized. It is set to true when an Env uses any + // ThreadStatusUtil functions using the current thread other + // than UnregisterThread(). It will be set to false when + // UnregisterThread() is called. 
+ // + // When this variable is set to true, thread_updater_local_cache_ + // will not be updated until this variable is again set to false + // in UnregisterThread(). + static __thread bool thread_updater_initialized_; + + // The thread-local cached ThreadStatusUpdater that caches the + // thread_status_updater_ of the first Env that uses any ThreadStatusUtil + // function other than UnregisterThread(). This variable will + // be cleared when UnregisterThread() is called. + // + // When this variable is set to a non-null pointer, then the status + // of the current thread will be updated when a function of + // ThreadStatusUtil is called. Otherwise, all functions of + // ThreadStatusUtil will be no-op. + // + // When thread_updater_initialized_ is set to true, this variable + // will not be updated until this thread_updater_initialized_ is + // again set to false in UnregisterThread(). + static __thread ThreadStatusUpdater* thread_updater_local_cache_; +#else + static bool thread_updater_initialized_; + static ThreadStatusUpdater* thread_updater_local_cache_; +#endif +}; + +} // namespace rocksdb diff --git a/util/thread_status_util_debug.cc b/util/thread_status_util_debug.cc new file mode 100644 index 000000000..5a86af26a --- /dev/null +++ b/util/thread_status_util_debug.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include + +#include "rocksdb/env.h" +#include "util/thread_status_updater.h" +#include "util/thread_status_util.h" + +namespace rocksdb { + +#ifndef NDEBUG +// the delay for debugging purpose. 
+static std::atomic operations_delay[ThreadStatus::NUM_OP_TYPES]; +static std::atomic states_delay[ThreadStatus::NUM_STATE_TYPES]; + +void ThreadStatusUtil::TEST_SetStateDelay( + const ThreadStatus::StateType state, int micro) { + states_delay[state].store(micro, std::memory_order_relaxed); +} + +void ThreadStatusUtil::TEST_StateDelay( + const ThreadStatus::StateType state) { + Env::Default()->SleepForMicroseconds( + states_delay[state].load(std::memory_order_relaxed)); +} + +void ThreadStatusUtil::TEST_SetOperationDelay( + const ThreadStatus::OperationType operation, int micro) { + operations_delay[operation].store(micro, std::memory_order_relaxed); +} + +void ThreadStatusUtil::TEST_OperationDelay( + const ThreadStatus::OperationType operation) { + Env::Default()->SleepForMicroseconds( + operations_delay[operation].load(std::memory_order_relaxed)); +} +#endif // !NDEBUG + +} // namespace rocksdb diff --git a/util/vectorrep.cc b/util/vectorrep.cc index e61b8ad08..ee38bc304 100644 --- a/util/vectorrep.cc +++ b/util/vectorrep.cc @@ -25,7 +25,8 @@ using namespace stl_wrappers; class VectorRep : public MemTableRep { public: - VectorRep(const KeyComparator& compare, Arena* arena, size_t count); + VectorRep(const KeyComparator& compare, MemTableAllocator* allocator, + size_t count); // Insert key into the collection. 
(The caller will pack key and value into a // single buffer and pass that in as the parameter to Insert) @@ -131,8 +132,9 @@ size_t VectorRep::ApproximateMemoryUsage() { ); } -VectorRep::VectorRep(const KeyComparator& compare, Arena* arena, size_t count) - : MemTableRep(arena), +VectorRep::VectorRep(const KeyComparator& compare, MemTableAllocator* allocator, + size_t count) + : MemTableRep(allocator), bucket_(new Bucket()), immutable_(false), sorted_(false), @@ -282,9 +284,9 @@ MemTableRep::Iterator* VectorRep::GetIterator(Arena* arena) { } // anon namespace MemTableRep* VectorRepFactory::CreateMemTableRep( - const MemTableRep::KeyComparator& compare, Arena* arena, + const MemTableRep::KeyComparator& compare, MemTableAllocator* allocator, const SliceTransform*, Logger* logger) { - return new VectorRep(compare, arena, count_); + return new VectorRep(compare, allocator, count_); } } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/util/xfunc.cc b/util/xfunc.cc new file mode 100644 index 000000000..9a2482272 --- /dev/null +++ b/util/xfunc.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+
+#include <string>
+#include "rocksdb/options.h"
+#include "util/xfunc.h"
+
+#ifdef XFUNC
+
+namespace rocksdb {
+
+std::string XFuncPoint::xfunc_test_;
+bool XFuncPoint::initialized_ = false;
+bool XFuncPoint::enabled_ = false;
+
+void GetXFTestOptions(Options* options, int skip_policy) {
+  if (XFuncPoint::Check("inplace_lock_test") &&
+      (!(skip_policy & kSkipNoSnapshot))) {
+    options->inplace_update_support = true;
+  }
+}
+
+}  // namespace rocksdb
+
+#endif  // XFUNC
diff --git a/util/xfunc.h b/util/xfunc.h
new file mode 100644
index 000000000..51122b7aa
--- /dev/null
+++ b/util/xfunc.h
@@ -0,0 +1,99 @@
+// Copyright (c) 2014, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+#pragma once
+
+#include <string>
+
+namespace rocksdb {
+
+/*
+ * If ROCKSDB_XFTEST_FORCE has a value of 1, XFUNC is forced to be defined.
+ * If ROCKSDB_XFTEST_FORCE has a value other than 1,
+ * XFUNC is forced to be undefined.
+ * If ROCKSDB_XFTEST_FORCE is undefined, XFUNC is defined based on NDEBUG,
+ * with XFUNC only being set for debug builds.
+ */
+#if defined(ROCKSDB_XFTEST_FORCE)
+#if (ROCKSDB_XFTEST_FORCE == 1)
+#define XFUNC
+#endif
+#elif NDEBUG
+#else
+#define XFUNC
+#endif
+
+#ifndef XFUNC
+#define XFUNC_TEST(condition, location, lfname, fname, ...)
+#else
+
+struct Options;
+void GetXFTestOptions(Options* options, int skip_policy);
+
+// This class provides the facility to run custom code to test a specific
+// feature typically with all existing unit tests.
+// A developer could specify cross functional test points in the codebase
+// via XFUNC_TEST.
+// Each xfunc test represents a position in the execution stream of a thread.
+// Whenever that particular piece of code is called, the given cross-functional
+// test point is executed.
+// eg.
on DBOpen, a particular option can be set. +// on Get, a particular option can be set, or a specific check can be invoked. +// XFUNC_TEST(TestName, location, lfname, FunctionName, Args) +// Turn on a specific cross functional test by setting the environment variable +// ROCKSDB_XFUNC_TEST + +class XFuncPoint { + public: + // call once at the beginning of a test to get the test name + static void Init() { + char* s = getenv("ROCKSDB_XFUNC_TEST"); + if (s == nullptr) { + xfunc_test_ = ""; + enabled_ = false; + } else { + xfunc_test_ = s; + enabled_ = true; + } + initialized_ = true; + } + + static bool Initialized() { return initialized_; } + + static bool Check(std::string test) { + return (enabled_ && + ((test.compare("") == 0) || (test.compare(xfunc_test_) == 0))); + } + + private: + static std::string xfunc_test_; + static bool initialized_; + static bool enabled_; +}; + +// Use XFUNC_TEST to specify cross functional test points inside the code base. +// By setting ROCKSDB_XFUNC_TEST, all XFUNC_TEST having that +// value in the condition field will be executed. +// The second argument specifies a string representing the calling location +// The third argument, lfname, is the name of the function which will be created +// and called. +// The fourth argument fname represents the function to be called +// The arguments following that are the arguments to fname +// See Options::Options in options.h for an example use case. +// XFUNC_TEST is no op in release build. +#define XFUNC_TEST(condition, location, lfname, fname, ...) 
\ + { \ + if (!XFuncPoint::Initialized()) { \ + XFuncPoint::Init(); \ + } \ + if (XFuncPoint::Check(condition)) { \ + std::function lfname = std::bind(fname, __VA_ARGS__); \ + lfname(); \ + } \ + } + +#endif // XFUNC + +enum SkipPolicy { kSkipNone = 0, kSkipNoSnapshot = 1, kSkipNoPrefix = 2 }; +} // namespace rocksdb diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 436f4c2d6..0cac257e3 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -15,9 +15,12 @@ #include "util/crc32c.h" #include "rocksdb/transaction_log.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif #include +#include #include #include #include @@ -52,7 +55,8 @@ class BackupRateLimiter { (bytes_since_start_ * kMicrosInSecond) / max_bytes_per_second_; if (should_take_micros > interval) { - env_->SleepForMicroseconds(should_take_micros - interval); + env_->SleepForMicroseconds( + static_cast(should_take_micros - interval)); now = env_->NowMicros(); } // reset interval @@ -70,6 +74,27 @@ class BackupRateLimiter { }; } // namespace +void BackupStatistics::IncrementNumberSuccessBackup() { + number_success_backup++; +} +void BackupStatistics::IncrementNumberFailBackup() { + number_fail_backup++; +} + +uint32_t BackupStatistics::GetNumberSuccessBackup() const { + return number_success_backup; +} +uint32_t BackupStatistics::GetNumberFailBackup() const { + return number_fail_backup; +} + +std::string BackupStatistics::ToString() const { + char result[50]; + snprintf(result, sizeof(result), "# success backup: %u, # fail backup: %u", + GetNumberSuccessBackup(), GetNumberFailBackup()); + return result; +} + void BackupableDBOptions::Dump(Logger* logger) const { Log(logger, " Options.backup_dir: %s", backup_dir.c_str()); Log(logger, " Options.backup_env: %p", backup_env); @@ -97,8 +122,10 @@ class BackupEngineImpl : public BackupEngine { void StopBackup() { stop_backup_.store(true, 
std::memory_order_release); } + Status GarbageCollect(); void GetBackupInfo(std::vector* backup_info); + void GetCorruptedBackups(std::vector* corrupt_backup_ids); Status RestoreDBFromBackup(BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, const RestoreOptions& restore_options = @@ -118,19 +145,26 @@ class BackupEngineImpl : public BackupEngine { FileInfo(const std::string& fname, uint64_t sz, uint32_t checksum) : refs(0), filename(fname), size(sz), checksum_value(checksum) {} + FileInfo(const FileInfo&) = delete; + FileInfo& operator=(const FileInfo&) = delete; + int refs; const std::string filename; const uint64_t size; - uint32_t checksum_value; + const uint32_t checksum_value; }; class BackupMeta { public: BackupMeta(const std::string& meta_filename, - std::unordered_map* file_infos, Env* env) + std::unordered_map>* file_infos, + Env* env) : timestamp_(0), size_(0), meta_filename_(meta_filename), file_infos_(file_infos), env_(env) {} + BackupMeta(const BackupMeta&) = delete; + BackupMeta& operator=(const BackupMeta&) = delete; + ~BackupMeta() {} void RecordTimestamp() { @@ -142,6 +176,7 @@ class BackupEngineImpl : public BackupEngine { uint64_t GetSize() const { return size_; } + uint32_t GetNumberFiles() { return static_cast(files_.size()); } void SetSequenceNumber(uint64_t sequence_number) { sequence_number_ = sequence_number; } @@ -149,7 +184,7 @@ class BackupEngineImpl : public BackupEngine { return sequence_number_; } - Status AddFile(const FileInfo& file_info); + Status AddFile(std::shared_ptr file_info); void Delete(bool delete_meta = true); @@ -157,7 +192,14 @@ class BackupEngineImpl : public BackupEngine { return files_.empty(); } - const std::vector& GetFiles() { + std::shared_ptr GetFile(const std::string& filename) const { + auto it = file_infos_->find(filename); + if (it == file_infos_->end()) + return nullptr; + return it->second; + } + + const std::vector>& GetFiles() { return files_; } @@ -172,8 +214,8 @@ class 
BackupEngineImpl : public BackupEngine { uint64_t size_; std::string const meta_filename_; // files with relative paths (without "/" prefix!!) - std::vector files_; - std::unordered_map* file_infos_; + std::vector> files_; + std::unordered_map>* file_infos_; Env* env_; static const size_t max_backup_meta_file_size_ = 10 * 1024 * 1024; // 10MB @@ -260,16 +302,13 @@ class BackupEngineImpl : public BackupEngine { uint64_t size_limit, uint32_t* checksum_value); - // Will delete all the files we don't need anymore - // If full_scan == true, it will do the full scan of files/ directory - // and delete all the files that are not referenced from backuped_file_infos__ - void GarbageCollection(bool full_scan); - // backup state data BackupID latest_backup_id_; - std::map backups_; - std::unordered_map backuped_file_infos_; - std::vector obsolete_backups_; + std::map> backups_; + std::map>> corrupt_backups_; + std::unordered_map> backuped_file_infos_; std::atomic stop_backup_; // options data @@ -286,6 +325,7 @@ class BackupEngineImpl : public BackupEngine { static const size_t kDefaultCopyFileBufferSize = 5 * 1024 * 1024LL; // 5MB size_t copy_file_buffer_size_; bool read_only_; + BackupStatistics backup_statistics_; }; BackupEngine* BackupEngine::NewBackupEngine( @@ -293,6 +333,13 @@ BackupEngine* BackupEngine::NewBackupEngine( return new BackupEngineImpl(db_env, options); } +Status BackupEngine::Open(Env* env, + const BackupableDBOptions& options, + BackupEngine** backup_engine_ptr) { + *backup_engine_ptr = new BackupEngineImpl(env, options); + return Status::OK(); +} + BackupEngineImpl::BackupEngineImpl(Env* db_env, const BackupableDBOptions& options, bool read_only) @@ -344,38 +391,32 @@ BackupEngineImpl::BackupEngineImpl(Env* db_env, continue; } assert(backups_.find(backup_id) == backups_.end()); - backups_.insert(std::make_pair( - backup_id, BackupMeta(GetBackupMetaFile(backup_id), - &backuped_file_infos_, backup_env_))); + backups_.insert(std::move( + 
std::make_pair(backup_id, unique_ptr(new BackupMeta( + GetBackupMetaFile(backup_id), + &backuped_file_infos_, backup_env_))))); } if (options_.destroy_old_data) { // Destory old data assert(!read_only_); - for (auto& backup : backups_) { - backup.second.Delete(); - obsolete_backups_.push_back(backup.first); - } - backups_.clear(); + PurgeOldBackups(0); + (void) GarbageCollect(); // start from beginning latest_backup_id_ = 0; - // GarbageCollection() will do the actual deletion } else { // Load data from storage // load the backups if any for (auto& backup : backups_) { - Status s = backup.second.LoadFromFile(options_.backup_dir); + Status s = backup.second->LoadFromFile(options_.backup_dir); if (!s.ok()) { Log(options_.info_log, "Backup %u corrupted -- %s", backup.first, s.ToString().c_str()); - if (!read_only_) { - Log(options_.info_log, "-> Deleting backup %u", backup.first); - } - backup.second.Delete(!read_only_); - obsolete_backups_.push_back(backup.first); + corrupt_backups_.insert(std::make_pair( + backup.first, std::make_pair(s, std::move(backup.second)))); } } - // delete obsolete backups from the structure - for (auto ob : obsolete_backups_) { - backups_.erase(ob); + + for (const auto& corrupt : corrupt_backups_) { + backups_.erase(backups_.find(corrupt.first)); } Status s = GetLatestBackupFileContents(&latest_backup_id_); @@ -391,16 +432,17 @@ BackupEngineImpl::BackupEngineImpl(Env* db_env, } // delete any backups that claim to be later than latest - for (auto itr = backups_.upper_bound(latest_backup_id_); - itr != backups_.end();) { - itr->second.Delete(); - obsolete_backups_.push_back(itr->first); - itr = backups_.erase(itr); + std::vector later_ids; + for (auto itr = backups_.lower_bound(latest_backup_id_ + 1); + itr != backups_.end(); itr++) { + later_ids.push_back(itr->first); + } + for (auto id : later_ids) { + DeleteBackup(id); } if (!read_only_) { PutLatestBackupFileContents(latest_backup_id_); // Ignore errors - GarbageCollection(true); } 
Log(options_.info_log, "Initialized BackupEngine, the latest backup is %u.", latest_backup_id_); @@ -433,13 +475,16 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { BackupID new_backup_id = latest_backup_id_ + 1; assert(backups_.find(new_backup_id) == backups_.end()); - auto ret = backups_.insert(std::make_pair( - new_backup_id, BackupMeta(GetBackupMetaFile(new_backup_id), - &backuped_file_infos_, backup_env_))); + auto ret = backups_.insert(std::move( + std::make_pair(new_backup_id, unique_ptr(new BackupMeta( + GetBackupMetaFile(new_backup_id), + &backuped_file_infos_, backup_env_))))); assert(ret.second == true); auto& new_backup = ret.first->second; - new_backup.RecordTimestamp(); - new_backup.SetSequenceNumber(sequence_number); + new_backup->RecordTimestamp(); + new_backup->SetSequenceNumber(sequence_number); + + auto start_backup = backup_env_-> NowMicros(); Log(options_.info_log, "Started the backup process -- creating backup %u", new_backup_id); @@ -472,7 +517,7 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { // * if it's kTableFile, then it's shared // * if it's kDescriptorFile, limit the size to manifest_file_size s = BackupFile(new_backup_id, - &new_backup, + new_backup.get(), options_.share_table_files && type == kTableFile, db->GetName(), /* src_dir */ live_files[i], /* src_fname */ @@ -487,7 +532,7 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { // we only care about live log files // copy the file into backup_dir/files// s = BackupFile(new_backup_id, - &new_backup, + new_backup.get(), false, /* not shared */ db->GetOptions().wal_dir, live_wal_files[i]->PathName(), @@ -505,9 +550,11 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { GetAbsolutePath(GetPrivateFileRel(new_backup_id, false))); } + auto backup_time = backup_env_->NowMicros() - start_backup; + if (s.ok()) { // persist the backup metadata on the disk - s = 
new_backup.StoreToFile(options_.sync); + s = new_backup->StoreToFile(options_.sync); } if (s.ok()) { // install the newly created backup meta! (atomic) @@ -535,11 +582,17 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { } } + if (s.ok()) { + backup_statistics_.IncrementNumberSuccessBackup(); + } if (!s.ok()) { + backup_statistics_.IncrementNumberFailBackup(); // clean all the files we might have created Log(options_.info_log, "Backup failed -- %s", s.ToString().c_str()); + Log(options_.info_log, "Backup Statistics %s\n", + backup_statistics_.ToString().c_str()); backups_.erase(new_backup_id); - GarbageCollection(true); + (void) GarbageCollect(); return s; } @@ -547,6 +600,17 @@ Status BackupEngineImpl::CreateNewBackup(DB* db, bool flush_before_backup) { // in the LATEST_BACKUP file latest_backup_id_ = new_backup_id; Log(options_.info_log, "Backup DONE. All is good"); + + // backup_speed is in byte/second + double backup_speed = new_backup->GetSize() / (1.048576 * backup_time); + Log(options_.info_log, "Backup number of files: %u", + new_backup->GetNumberFiles()); + Log(options_.info_log, "Backup size: %" PRIu64 " bytes", + new_backup->GetSize()); + Log(options_.info_log, "Backup time: %" PRIu64 " microseconds", backup_time); + Log(options_.info_log, "Backup speed: %.3f MB/s", backup_speed); + Log(options_.info_log, "Backup Statistics %s", + backup_statistics_.ToString().c_str()); return s; } @@ -554,13 +618,15 @@ Status BackupEngineImpl::PurgeOldBackups(uint32_t num_backups_to_keep) { assert(!read_only_); Log(options_.info_log, "Purging old backups, keeping %u", num_backups_to_keep); - while (num_backups_to_keep < backups_.size()) { - Log(options_.info_log, "Deleting backup %u", backups_.begin()->first); - backups_.begin()->second.Delete(); - obsolete_backups_.push_back(backups_.begin()->first); - backups_.erase(backups_.begin()); + std::vector to_delete; + auto itr = backups_.begin(); + while ((backups_.size() - to_delete.size()) > 
num_backups_to_keep) { + to_delete.push_back(itr->first); + itr++; + } + for (auto backup_id : to_delete) { + DeleteBackup(backup_id); } - GarbageCollection(false); return Status::OK(); } @@ -568,35 +634,74 @@ Status BackupEngineImpl::DeleteBackup(BackupID backup_id) { assert(!read_only_); Log(options_.info_log, "Deleting backup %u", backup_id); auto backup = backups_.find(backup_id); - if (backup == backups_.end()) { - return Status::NotFound("Backup not found"); + if (backup != backups_.end()) { + backup->second->Delete(); + backups_.erase(backup); + } else { + auto corrupt = corrupt_backups_.find(backup_id); + if (corrupt == corrupt_backups_.end()) { + return Status::NotFound("Backup not found"); + } + corrupt->second.second->Delete(); + corrupt_backups_.erase(corrupt); } - backup->second.Delete(); - obsolete_backups_.push_back(backup_id); - backups_.erase(backup); - GarbageCollection(false); + + std::vector to_delete; + for (auto& itr : backuped_file_infos_) { + if (itr.second->refs == 0) { + Status s = backup_env_->DeleteFile(GetAbsolutePath(itr.first)); + Log(options_.info_log, "Deleting %s -- %s", itr.first.c_str(), + s.ToString().c_str()); + to_delete.push_back(itr.first); + } + } + for (auto& td : to_delete) { + backuped_file_infos_.erase(td); + } + + // take care of private dirs -- GarbageCollect() will take care of them + // if they are not empty + std::string private_dir = GetPrivateFileRel(backup_id); + Status s = backup_env_->DeleteDir(GetAbsolutePath(private_dir)); + Log(options_.info_log, "Deleting private dir %s -- %s", + private_dir.c_str(), s.ToString().c_str()); return Status::OK(); } void BackupEngineImpl::GetBackupInfo(std::vector* backup_info) { backup_info->reserve(backups_.size()); for (auto& backup : backups_) { - if (!backup.second.Empty()) { - backup_info->push_back(BackupInfo( - backup.first, backup.second.GetTimestamp(), backup.second.GetSize())); + if (!backup.second->Empty()) { + backup_info->push_back(BackupInfo( + backup.first, 
backup.second->GetTimestamp(), + backup.second->GetSize(), + backup.second->GetNumberFiles())); } } } +void +BackupEngineImpl::GetCorruptedBackups( + std::vector* corrupt_backup_ids) { + corrupt_backup_ids->reserve(corrupt_backups_.size()); + for (auto& backup : corrupt_backups_) { + corrupt_backup_ids->push_back(backup.first); + } +} + Status BackupEngineImpl::RestoreDBFromBackup( BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, const RestoreOptions& restore_options) { + auto corrupt_itr = corrupt_backups_.find(backup_id); + if (corrupt_itr != corrupt_backups_.end()) { + return corrupt_itr->second.first; + } auto backup_itr = backups_.find(backup_id); if (backup_itr == backups_.end()) { return Status::NotFound("Backup not found"); } auto& backup = backup_itr->second; - if (backup.Empty()) { + if (backup->Empty()) { return Status::NotFound("Backup not found"); } @@ -644,7 +749,8 @@ Status BackupEngineImpl::RestoreDBFromBackup( options_.restore_rate_limit, copy_file_buffer_size_)); } Status s; - for (auto& file : backup.GetFiles()) { + for (const auto& file_info : backup->GetFiles()) { + const std::string &file = file_info->filename; std::string dst; // 1. 
extract the filename size_t slash = file.find_last_of('/'); @@ -679,9 +785,7 @@ Status BackupEngineImpl::RestoreDBFromBackup( break; } - const auto iter = backuped_file_infos_.find(file); - assert(iter != backuped_file_infos_.end()); - if (iter->second.checksum_value != checksum_value) { + if (file_info->checksum_value != checksum_value) { s = Status::Corruption("Checksum check failed"); break; } @@ -895,7 +999,8 @@ Status BackupEngineImpl::BackupFile(BackupID backup_id, BackupMeta* backup, } } if (s.ok()) { - s = backup->AddFile(FileInfo(dst_relative, size, checksum_value)); + s = backup->AddFile(std::make_shared( + dst_relative, size, checksum_value)); } return s; } @@ -957,115 +1062,91 @@ void BackupEngineImpl::DeleteChildren(const std::string& dir, } } -void BackupEngineImpl::GarbageCollection(bool full_scan) { +Status BackupEngineImpl::GarbageCollect() { assert(!read_only_); Log(options_.info_log, "Starting garbage collection"); - std::vector to_delete; - for (auto& itr : backuped_file_infos_) { - if (itr.second.refs == 0) { - Status s = backup_env_->DeleteFile(GetAbsolutePath(itr.first)); - Log(options_.info_log, "Deleting %s -- %s", itr.first.c_str(), - s.ToString().c_str()); - to_delete.push_back(itr.first); - } - } - for (auto& td : to_delete) { - backuped_file_infos_.erase(td); - } - if (!full_scan) { - // take care of private dirs -- if full_scan == true, then full_scan will - // take care of them - for (auto backup_id : obsolete_backups_) { - std::string private_dir = GetPrivateFileRel(backup_id); - Status s = backup_env_->DeleteDir(GetAbsolutePath(private_dir)); - Log(options_.info_log, "Deleting private dir %s -- %s", - private_dir.c_str(), s.ToString().c_str()); - } - } - obsolete_backups_.clear(); - - if (full_scan) { - Log(options_.info_log, "Starting full scan garbage collection"); - // delete obsolete shared files - std::vector shared_children; - backup_env_->GetChildren(GetAbsolutePath(GetSharedFileRel()), - &shared_children); - for (auto& child 
: shared_children) { - std::string rel_fname = GetSharedFileRel(child); - // if it's not refcounted, delete it - if (backuped_file_infos_.find(rel_fname) == backuped_file_infos_.end()) { - // this might be a directory, but DeleteFile will just fail in that - // case, so we're good - Status s = backup_env_->DeleteFile(GetAbsolutePath(rel_fname)); - if (s.ok()) { - Log(options_.info_log, "Deleted %s", rel_fname.c_str()); - } + + // delete obsolete shared files + std::vector shared_children; + backup_env_->GetChildren(GetAbsolutePath(GetSharedFileRel()), + &shared_children); + for (auto& child : shared_children) { + std::string rel_fname = GetSharedFileRel(child); + // if it's not refcounted, delete it + if (backuped_file_infos_.find(rel_fname) == backuped_file_infos_.end()) { + // this might be a directory, but DeleteFile will just fail in that + // case, so we're good + Status s = backup_env_->DeleteFile(GetAbsolutePath(rel_fname)); + if (s.ok()) { + Log(options_.info_log, "Deleted %s", rel_fname.c_str()); } } + } - // delete obsolete private files - std::vector private_children; - backup_env_->GetChildren(GetAbsolutePath(GetPrivateDirRel()), - &private_children); - for (auto& child : private_children) { - BackupID backup_id = 0; - bool tmp_dir = child.find(".tmp") != std::string::npos; - sscanf(child.c_str(), "%u", &backup_id); - if (!tmp_dir && // if it's tmp_dir, delete it - (backup_id == 0 || backups_.find(backup_id) != backups_.end())) { - // it's either not a number or it's still alive. 
continue - continue; - } - // here we have to delete the dir and all its children - std::string full_private_path = - GetAbsolutePath(GetPrivateFileRel(backup_id, tmp_dir)); - std::vector subchildren; - backup_env_->GetChildren(full_private_path, &subchildren); - for (auto& subchild : subchildren) { - Status s = backup_env_->DeleteFile(full_private_path + subchild); - if (s.ok()) { - Log(options_.info_log, "Deleted %s", - (full_private_path + subchild).c_str()); - } + // delete obsolete private files + std::vector private_children; + backup_env_->GetChildren(GetAbsolutePath(GetPrivateDirRel()), + &private_children); + for (auto& child : private_children) { + BackupID backup_id = 0; + bool tmp_dir = child.find(".tmp") != std::string::npos; + sscanf(child.c_str(), "%u", &backup_id); + if (!tmp_dir && // if it's tmp_dir, delete it + (backup_id == 0 || backups_.find(backup_id) != backups_.end())) { + // it's either not a number or it's still alive. continue + continue; + } + // here we have to delete the dir and all its children + std::string full_private_path = + GetAbsolutePath(GetPrivateFileRel(backup_id, tmp_dir)); + std::vector subchildren; + backup_env_->GetChildren(full_private_path, &subchildren); + for (auto& subchild : subchildren) { + Status s = backup_env_->DeleteFile(full_private_path + subchild); + if (s.ok()) { + Log(options_.info_log, "Deleted %s", + (full_private_path + subchild).c_str()); } - // finally delete the private dir - Status s = backup_env_->DeleteDir(full_private_path); - Log(options_.info_log, "Deleted dir %s -- %s", full_private_path.c_str(), - s.ToString().c_str()); } + // finally delete the private dir + Status s = backup_env_->DeleteDir(full_private_path); + Log(options_.info_log, "Deleted dir %s -- %s", full_private_path.c_str(), + s.ToString().c_str()); } + + return Status::OK(); } // ------- BackupMeta class -------- -Status BackupEngineImpl::BackupMeta::AddFile(const FileInfo& file_info) { - size_ += file_info.size; - 
files_.push_back(file_info.filename); - - auto itr = file_infos_->find(file_info.filename); +Status BackupEngineImpl::BackupMeta::AddFile( + std::shared_ptr file_info) { + auto itr = file_infos_->find(file_info->filename); if (itr == file_infos_->end()) { - auto ret = file_infos_->insert({file_info.filename, file_info}); + auto ret = file_infos_->insert({file_info->filename, file_info}); if (ret.second) { - ret.first->second.refs = 1; + itr = ret.first; + itr->second->refs = 1; } else { // if this happens, something is seriously wrong return Status::Corruption("In memory metadata insertion error"); } } else { - if (itr->second.checksum_value != file_info.checksum_value) { + if (itr->second->checksum_value != file_info->checksum_value) { return Status::Corruption("Checksum mismatch for existing backup file"); } - ++itr->second.refs; // increase refcount if already present + ++itr->second->refs; // increase refcount if already present } + size_ += file_info->size; + files_.push_back(itr->second); + return Status::OK(); } void BackupEngineImpl::BackupMeta::Delete(bool delete_meta) { for (const auto& file : files_) { - auto itr = file_infos_->find(file); - assert(itr != file_infos_->end()); - --(itr->second.refs); // decrease refcount + --file->refs; // decrease refcount } files_.clear(); // delete meta file @@ -1102,24 +1183,31 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile( buf[data.size()] = 0; uint32_t num_files = 0; - int bytes_read = 0; - sscanf(data.data(), "%" PRId64 "%n", ×tamp_, &bytes_read); - data.remove_prefix(bytes_read + 1); // +1 for '\n' - sscanf(data.data(), "%" PRIu64 "%n", &sequence_number_, &bytes_read); - data.remove_prefix(bytes_read + 1); // +1 for '\n' - sscanf(data.data(), "%u%n", &num_files, &bytes_read); - data.remove_prefix(bytes_read + 1); // +1 for '\n' + char *next; + timestamp_ = strtoull(data.data(), &next, 10); + data.remove_prefix(next - data.data() + 1); // +1 for '\n' + sequence_number_ = strtoull(data.data(), &next, 10); + 
data.remove_prefix(next - data.data() + 1); // +1 for '\n' + num_files = static_cast(strtoul(data.data(), &next, 10)); + data.remove_prefix(next - data.data() + 1); // +1 for '\n' + + std::vector> files; - std::vector files; + Slice checksum_prefix("crc32 "); for (uint32_t i = 0; s.ok() && i < num_files; ++i) { auto line = GetSliceUntil(&data, '\n'); std::string filename = GetSliceUntil(&line, ' ').ToString(); uint64_t size; - s = env_->GetFileSize(backup_dir + "/" + filename, &size); - if (!s.ok()) { - return s; + const std::shared_ptr file_info = GetFile(filename); + if (file_info) { + size = file_info->size; + } else { + s = env_->GetFileSize(backup_dir + "/" + filename, &size); + if (!s.ok()) { + return s; + } } if (line.empty()) { @@ -1127,18 +1215,18 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile( } uint32_t checksum_value = 0; - if (line.starts_with("crc32 ")) { - line.remove_prefix(6); - sscanf(line.data(), "%u", &checksum_value); - if (memcmp(line.data(), std::to_string(checksum_value).c_str(), - line.size() - 1) != 0) { + if (line.starts_with(checksum_prefix)) { + line.remove_prefix(checksum_prefix.size()); + checksum_value = static_cast( + strtoul(line.data(), nullptr, 10)); + if (line != std::to_string(checksum_value)) { return Status::Corruption("Invalid checksum value"); } } else { return Status::Corruption("Unknown checksum type"); } - files.emplace_back(filename, size, checksum_value); + files.emplace_back(new FileInfo(filename, size, checksum_value)); } if (s.ok() && data.size() > 0) { @@ -1147,6 +1235,7 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile( } if (s.ok()) { + files_.reserve(files.size()); for (const auto& file_info : files) { s = AddFile(file_info); if (!s.ok()) { @@ -1176,12 +1265,9 @@ Status BackupEngineImpl::BackupMeta::StoreToFile(bool sync) { sequence_number_); len += snprintf(buf.get() + len, buf_size - len, "%zu\n", files_.size()); for (const auto& file : files_) { - const auto& iter = file_infos_->find(file); - - 
assert(iter != file_infos_->end()); // use crc32 for now, switch to something else if needed len += snprintf(buf.get() + len, buf_size - len, "%s crc32 %u\n", - file.c_str(), iter->second.checksum_value); + file->filename.c_str(), file->checksum_value); } s = backup_meta_file->Append(Slice(buf.get(), (size_t)len)); @@ -1209,6 +1295,10 @@ class BackupEngineReadOnlyImpl : public BackupEngineReadOnly { backup_engine_->GetBackupInfo(backup_info); } + virtual void GetCorruptedBackups(std::vector* corrupt_backup_ids) { + backup_engine_->GetCorruptedBackups(corrupt_backup_ids); + } + virtual Status RestoreDBFromBackup( BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, const RestoreOptions& restore_options = RestoreOptions()) { @@ -1254,6 +1344,11 @@ void BackupableDB::GetBackupInfo(std::vector* backup_info) { backup_engine_->GetBackupInfo(backup_info); } +void +BackupableDB::GetCorruptedBackups(std::vector* corrupt_backup_ids) { + backup_engine_->GetCorruptedBackups(corrupt_backup_ids); +} + Status BackupableDB::PurgeOldBackups(uint32_t num_backups_to_keep) { return backup_engine_->PurgeOldBackups(num_backups_to_keep); } @@ -1266,6 +1361,10 @@ void BackupableDB::StopBackup() { backup_engine_->StopBackup(); } +Status BackupableDB::GarbageCollect() { + return backup_engine_->GarbageCollect(); +} + // --- RestoreBackupableDB methods ------ RestoreBackupableDB::RestoreBackupableDB(Env* db_env, @@ -1281,6 +1380,11 @@ RestoreBackupableDB::GetBackupInfo(std::vector* backup_info) { backup_engine_->GetBackupInfo(backup_info); } +void RestoreBackupableDB::GetCorruptedBackups( + std::vector* corrupt_backup_ids) { + backup_engine_->GetCorruptedBackups(corrupt_backup_ids); +} + Status RestoreBackupableDB::RestoreDBFromBackup( BackupID backup_id, const std::string& db_dir, const std::string& wal_dir, const RestoreOptions& restore_options) { @@ -1303,6 +1407,10 @@ Status RestoreBackupableDB::DeleteBackup(BackupID backup_id) { return 
backup_engine_->DeleteBackup(backup_id); } +Status RestoreBackupableDB::GarbageCollect() { + return backup_engine_->GarbageCollect(); +} + } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 1d876cd50..46fc7cb6f 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -228,7 +228,7 @@ class FileManager : public EnvWrapper { public: explicit FileManager(Env* t) : EnvWrapper(t), rnd_(5) {} - Status DeleteRandomFileInDir(const std::string dir) { + Status DeleteRandomFileInDir(const std::string& dir) { std::vector children; GetChildren(dir, &children); if (children.size() <= 2) { // . and .. @@ -636,7 +636,34 @@ TEST(BackupableDBTest, CorruptionsTest) { ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/2")); s = restore_db_->RestoreDBFromBackup(2, dbname_, dbname_); ASSERT_TRUE(!s.ok()); + + // make sure that no corrupt backups have actually been deleted! 
+ ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/1")); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/2")); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/3")); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/4")); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/5")); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/1")); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/2")); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/3")); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/4")); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/5")); + + // delete the corrupt backups and then make sure they're actually deleted + ASSERT_OK(restore_db_->DeleteBackup(5)); + ASSERT_OK(restore_db_->DeleteBackup(4)); + ASSERT_OK(restore_db_->DeleteBackup(3)); ASSERT_OK(restore_db_->DeleteBackup(2)); + (void) restore_db_->GarbageCollect(); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/5") == false); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/5") == false); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/4") == false); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/4") == false); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/3") == false); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/3") == false); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/meta/2") == false); + ASSERT_TRUE(file_manager_->FileExists(backupdir_ + "/private/2") == false); + CloseRestoreDB(); AssertBackupConsistency(0, 0, keys_iteration * 1, keys_iteration * 5); @@ -867,6 +894,8 @@ TEST(BackupableDBTest, DeleteTmpFiles) { file_manager_->WriteToFile(private_tmp_file, "tmp"); ASSERT_EQ(true, file_manager_->FileExists(private_tmp_dir)); OpenBackupableDB(); + // Need to call this explicitly to delete tmp files + (void) db_->GarbageCollect(); CloseBackupableDB(); ASSERT_EQ(false, 
file_manager_->FileExists(shared_tmp)); ASSERT_EQ(false, file_manager_->FileExists(private_tmp_file)); @@ -916,7 +945,7 @@ TEST(BackupableDBTest, RateLimiting) { auto backup_time = env_->NowMicros() - start_backup; auto rate_limited_backup_time = (bytes_written * kMicrosPerSec) / backupable_options_->backup_rate_limit; - ASSERT_GT(backup_time, 0.9 * rate_limited_backup_time); + ASSERT_GT(backup_time, 0.8 * rate_limited_backup_time); CloseBackupableDB(); @@ -927,7 +956,7 @@ TEST(BackupableDBTest, RateLimiting) { CloseRestoreDB(); auto rate_limited_restore_time = (bytes_written * kMicrosPerSec) / backupable_options_->restore_rate_limit; - ASSERT_GT(restore_time, 0.9 * rate_limited_restore_time); + ASSERT_GT(restore_time, 0.8 * rate_limited_restore_time); AssertBackupConsistency(0, 0, 100000, 100010); } diff --git a/utilities/checkpoint/checkpoint.cc b/utilities/checkpoint/checkpoint.cc new file mode 100644 index 000000000..b180bbd38 --- /dev/null +++ b/utilities/checkpoint/checkpoint.cc @@ -0,0 +1,168 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2012 Facebook. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef ROCKSDB_LITE + +#include "rocksdb/utilities/checkpoint.h" + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include +#include +#include +#include "db/filename.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "util/file_util.h" + +namespace rocksdb { + +class CheckpointImpl : public Checkpoint { + public: + // Creates a Checkpoint object to be used for creating openable sbapshots + explicit CheckpointImpl(DB* db) : db_(db) {} + + // Builds an openable snapshot of RocksDB on the same disk, which + // accepts an output directory on the same disk, and under the directory + // (1) hard-linked SST files pointing to existing live SST files + // SST files will be copied if output directory is on a different filesystem + // (2) a copied manifest files and other files + // The directory should not already exist and will be created by this API. + // The directory will be an absolute path + using Checkpoint::CreateCheckpoint; + virtual Status CreateCheckpoint(const std::string& checkpoint_dir); + + private: + DB* db_; +}; + +Status Checkpoint::Create(DB* db, Checkpoint** checkpoint_ptr) { + *checkpoint_ptr = new CheckpointImpl(db); + return Status::OK(); +} + +Status Checkpoint::CreateCheckpoint(const std::string& checkpoint_dir) { + return Status::NotSupported(""); +} + +// Builds an openable snapshot of RocksDB +Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir) { + Status s; + std::vector live_files; + uint64_t manifest_file_size = 0; + uint64_t sequence_number = db_->GetLatestSequenceNumber(); + bool same_fs = true; + + if (db_->GetEnv()->FileExists(checkpoint_dir)) { + return Status::InvalidArgument("Directory exists"); + } + + s = db_->DisableFileDeletions(); + if (s.ok()) { + // this will return live_files prefixed with "/" + s = db_->GetLiveFiles(live_files, &manifest_file_size, true); + } + if (!s.ok()) { + db_->EnableFileDeletions(false); + return s; + } + + Log(db_->GetOptions().info_log, + 
"Started the snapshot process -- creating snapshot in directory %s", + checkpoint_dir.c_str()); + + std::string full_private_path = checkpoint_dir + ".tmp"; + + // create snapshot directory + s = db_->GetEnv()->CreateDir(full_private_path); + + // copy/hard link live_files + for (size_t i = 0; s.ok() && i < live_files.size(); ++i) { + uint64_t number; + FileType type; + bool ok = ParseFileName(live_files[i], &number, &type); + if (!ok) { + s = Status::Corruption("Can't parse file name. This is very bad"); + break; + } + // we should only get sst, manifest and current files here + assert(type == kTableFile || type == kDescriptorFile || + type == kCurrentFile); + assert(live_files[i].size() > 0 && live_files[i][0] == '/'); + std::string src_fname = live_files[i]; + + // rules: + // * if it's kTableFile, then it's shared + // * if it's kDescriptorFile, limit the size to manifest_file_size + // * always copy if cross-device link + if ((type == kTableFile) && same_fs) { + Log(db_->GetOptions().info_log, "Hard Linking %s", src_fname.c_str()); + s = db_->GetEnv()->LinkFile(db_->GetName() + src_fname, + full_private_path + src_fname); + if (s.IsNotSupported()) { + same_fs = false; + s = Status::OK(); + } + } + if ((type != kTableFile) || (!same_fs)) { + Log(db_->GetOptions().info_log, "Copying %s", src_fname.c_str()); + s = CopyFile(db_->GetEnv(), db_->GetName() + src_fname, + full_private_path + src_fname, + (type == kDescriptorFile) ? 
manifest_file_size : 0); + } + } + + // we copied all the files, enable file deletions + db_->EnableFileDeletions(false); + + if (s.ok()) { + // move tmp private backup to real snapshot directory + s = db_->GetEnv()->RenameFile(full_private_path, checkpoint_dir); + } + if (s.ok()) { + unique_ptr checkpoint_directory; + db_->GetEnv()->NewDirectory(checkpoint_dir, &checkpoint_directory); + if (checkpoint_directory != nullptr) { + s = checkpoint_directory->Fsync(); + } + } + + if (!s.ok()) { + // clean all the files we might have created + Log(db_->GetOptions().info_log, "Snapshot failed -- %s", + s.ToString().c_str()); + // we have to delete the dir and all its children + std::vector subchildren; + db_->GetEnv()->GetChildren(full_private_path, &subchildren); + for (auto& subchild : subchildren) { + Status s1 = db_->GetEnv()->DeleteFile(full_private_path + subchild); + if (s1.ok()) { + Log(db_->GetOptions().info_log, "Deleted %s", + (full_private_path + subchild).c_str()); + } + } + // finally delete the private dir + Status s1 = db_->GetEnv()->DeleteDir(full_private_path); + Log(db_->GetOptions().info_log, "Deleted dir %s -- %s", + full_private_path.c_str(), s1.ToString().c_str()); + return s; + } + + // here we know that we succeeded and installed the new snapshot + Log(db_->GetOptions().info_log, "Snapshot DONE. All is good"); + Log(db_->GetOptions().info_log, "Snapshot sequence number: %" PRIu64, + sequence_number); + + return s; +} +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/utilities/compacted_db/compacted_db_impl.cc b/utilities/compacted_db/compacted_db_impl.cc new file mode 100644 index 000000000..3bd27e46a --- /dev/null +++ b/utilities/compacted_db/compacted_db_impl.cc @@ -0,0 +1,159 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#ifndef ROCKSDB_LITE +#include "utilities/compacted_db/compacted_db_impl.h" +#include "db/db_impl.h" +#include "db/version_set.h" +#include "table/get_context.h" + +namespace rocksdb { + +extern void MarkKeyMayExist(void* arg); +extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, + const Slice& v, bool hit_and_return); + +CompactedDBImpl::CompactedDBImpl( + const DBOptions& options, const std::string& dbname) + : DBImpl(options, dbname) { +} + +CompactedDBImpl::~CompactedDBImpl() { +} + +size_t CompactedDBImpl::FindFile(const Slice& key) { + size_t left = 0; + size_t right = files_.num_files - 1; + while (left < right) { + size_t mid = (left + right) >> 1; + const FdWithKeyRange& f = files_.files[mid]; + if (user_comparator_->Compare(ExtractUserKey(f.largest_key), key) < 0) { + // Key at "mid.largest" is < "target". Therefore all + // files at or before "mid" are uninteresting. + left = mid + 1; + } else { + // Key at "mid.largest" is >= "target". Therefore all files + // after "mid" are uninteresting. 
+ right = mid; + } + } + return right; +} + +Status CompactedDBImpl::Get(const ReadOptions& options, + ColumnFamilyHandle*, const Slice& key, std::string* value) { + GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, + GetContext::kNotFound, key, value, nullptr, nullptr); + LookupKey lkey(key, kMaxSequenceNumber); + files_.files[FindFile(key)].fd.table_reader->Get( + options, lkey.internal_key(), &get_context); + if (get_context.State() == GetContext::kFound) { + return Status::OK(); + } + return Status::NotFound(); +} + +std::vector CompactedDBImpl::MultiGet(const ReadOptions& options, + const std::vector&, + const std::vector& keys, std::vector* values) { + autovector reader_list; + for (const auto& key : keys) { + const FdWithKeyRange& f = files_.files[FindFile(key)]; + if (user_comparator_->Compare(key, ExtractUserKey(f.smallest_key)) < 0) { + reader_list.push_back(nullptr); + } else { + LookupKey lkey(key, kMaxSequenceNumber); + f.fd.table_reader->Prepare(lkey.internal_key()); + reader_list.push_back(f.fd.table_reader); + } + } + std::vector statuses(keys.size(), Status::NotFound()); + values->resize(keys.size()); + int idx = 0; + for (auto* r : reader_list) { + if (r != nullptr) { + GetContext get_context(user_comparator_, nullptr, nullptr, nullptr, + GetContext::kNotFound, keys[idx], &(*values)[idx], + nullptr, nullptr); + LookupKey lkey(keys[idx], kMaxSequenceNumber); + r->Get(options, lkey.internal_key(), &get_context); + if (get_context.State() == GetContext::kFound) { + statuses[idx] = Status::OK(); + } + } + ++idx; + } + return statuses; +} + +Status CompactedDBImpl::Init(const Options& options) { + mutex_.Lock(); + ColumnFamilyDescriptor cf(kDefaultColumnFamilyName, + ColumnFamilyOptions(options)); + Status s = Recover({ cf }, true /* read only */, false); + if (s.ok()) { + cfd_ = reinterpret_cast( + DefaultColumnFamily())->cfd(); + delete cfd_->InstallSuperVersion(new SuperVersion(), &mutex_); + } + mutex_.Unlock(); + if (!s.ok()) { + 
return s; + } + NewThreadStatusCfInfo(cfd_); + version_ = cfd_->GetSuperVersion()->current; + user_comparator_ = cfd_->user_comparator(); + auto* vstorage = version_->storage_info(); + const LevelFilesBrief& l0 = vstorage->LevelFilesBrief(0); + // L0 should not have files + if (l0.num_files > 1) { + return Status::NotSupported("L0 contain more than 1 file"); + } + if (l0.num_files == 1) { + if (vstorage->num_non_empty_levels() > 1) { + return Status::NotSupported("Both L0 and other level contain files"); + } + files_ = l0; + return Status::OK(); + } + + for (int i = 1; i < vstorage->num_non_empty_levels() - 1; ++i) { + if (vstorage->LevelFilesBrief(i).num_files > 0) { + return Status::NotSupported("Other levels also contain files"); + } + } + + int level = vstorage->num_non_empty_levels() - 1; + if (vstorage->LevelFilesBrief(level).num_files > 0) { + files_ = vstorage->LevelFilesBrief(level); + return Status::OK(); + } + return Status::NotSupported("no file exists"); +} + +Status CompactedDBImpl::Open(const Options& options, + const std::string& dbname, DB** dbptr) { + *dbptr = nullptr; + + if (options.max_open_files != -1) { + return Status::InvalidArgument("require max_open_files = -1"); + } + if (options.merge_operator.get() != nullptr) { + return Status::InvalidArgument("merge operator is not supported"); + } + DBOptions db_options(options); + std::unique_ptr db(new CompactedDBImpl(db_options, dbname)); + Status s = db->Init(options); + if (s.ok()) { + Log(INFO_LEVEL, db->db_options_.info_log, + "Opened the db as fully compacted mode"); + LogFlush(db->db_options_.info_log); + *dbptr = db.release(); + } + return s; +} + +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/compacted_db/compacted_db_impl.h b/utilities/compacted_db/compacted_db_impl.h new file mode 100644 index 000000000..e1ac92dc4 --- /dev/null +++ b/utilities/compacted_db/compacted_db_impl.h @@ -0,0 +1,96 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once +#ifndef ROCKSDB_LITE +#include "db/db_impl.h" +#include +#include + +namespace rocksdb { + +class CompactedDBImpl : public DBImpl { + public: + CompactedDBImpl(const DBOptions& options, const std::string& dbname); + virtual ~CompactedDBImpl(); + + static Status Open(const Options& options, const std::string& dbname, + DB** dbptr); + + // Implementations of the DB interface + using DB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) override; + using DB::MultiGet; + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector&, + const std::vector& keys, std::vector* values) + override; + + using DBImpl::Put; + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + using DBImpl::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + using DBImpl::Delete; + virtual Status Delete(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + virtual Status Write(const WriteOptions& options, + WriteBatch* updates) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + using DBImpl::CompactRange; + virtual Status CompactRange(ColumnFamilyHandle* column_family, + const Slice* begin, const Slice* end, + bool reduce_level = false, int target_level = -1, + uint32_t 
target_path_id = 0) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + + virtual Status DisableFileDeletions() override { + return Status::NotSupported("Not supported in compacted db mode."); + } + virtual Status EnableFileDeletions(bool force) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + virtual Status GetLiveFiles(std::vector&, + uint64_t* manifest_file_size, + bool flush_memtable = true) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + using DBImpl::Flush; + virtual Status Flush(const FlushOptions& options, + ColumnFamilyHandle* column_family) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + + private: + friend class DB; + inline size_t FindFile(const Slice& key); + Status Init(const Options& options); + + ColumnFamilyData* cfd_; + Version* version_; + const Comparator* user_comparator_; + LevelFilesBrief files_; + + // No copying allowed + CompactedDBImpl(const CompactedDBImpl&); + void operator=(const CompactedDBImpl&); +}; +} +#endif // ROCKSDB_LITE diff --git a/utilities/document/document_db.cc b/utilities/document/document_db.cc index c12a1f253..04d88714b 100644 --- a/utilities/document/document_db.cc +++ b/utilities/document/document_db.cc @@ -33,7 +33,7 @@ namespace { // > 0 <=> lhs == rhs // TODO(icanadi) move this to JSONDocument? 
int DocumentCompare(const JSONDocument& lhs, const JSONDocument& rhs) { - assert(rhs.IsObject() == false && rhs.IsObject() == false && + assert(lhs.IsObject() == false && rhs.IsObject() == false && lhs.type() == rhs.type()); switch (lhs.type()) { @@ -312,8 +312,11 @@ bool EncodeJSONPrimitive(const JSONDocument& json, std::string* dst) { break; case JSONDocument::kInt64: dst->push_back(kInt64); - // TODO(icanadi) oops, this will not work correctly for negative numbers - PutFixed64(dst, static_cast(json.GetInt64())); + { + auto val = json.GetInt64(); + dst->push_back((val < 0) ? '0' : '1'); + PutFixed64(dst, static_cast(val)); + } break; case JSONDocument::kString: dst->push_back(kString); @@ -376,7 +379,7 @@ class IndexKey { class SimpleSortedIndex : public Index { public: - SimpleSortedIndex(const std::string field, const std::string& name) + SimpleSortedIndex(const std::string& field, const std::string& name) : field_(field), name_(name) {} virtual const char* Name() const override { return name_.c_str(); } @@ -385,13 +388,13 @@ class SimpleSortedIndex : public Index { override { auto value = document.Get(field_); if (value == nullptr) { - // null if (!EncodeJSONPrimitive(JSONDocument(JSONDocument::kNull), key)) { assert(false); } - } - if (!EncodeJSONPrimitive(*value, key)) { - assert(false); + } else { + if (!EncodeJSONPrimitive(*value, key)) { + assert(false); + } } } virtual const Comparator* GetComparator() const override { @@ -407,7 +410,6 @@ class SimpleSortedIndex : public Index { assert(interval != nullptr); // because index is useful Direction direction; - std::string op; const JSONDocument* limit; if (interval->lower_bound != nullptr) { limit = interval->lower_bound; @@ -736,6 +738,7 @@ class DocumentDBImpl : public DocumentDB { CreateColumnFamily(ColumnFamilyOptions(rocksdb_options_), InternalSecondaryIndexName(index.name), &cf_handle); if (!s.ok()) { + delete index_obj; return s; } diff --git a/utilities/document/document_db_test.cc 
b/utilities/document/document_db_test.cc index d4c632cce..bacef9a50 100644 --- a/utilities/document/document_db_test.cc +++ b/utilities/document/document_db_test.cc @@ -56,7 +56,7 @@ class DocumentDBTest { } } - JSONDocument* Parse(const std::string doc) { + JSONDocument* Parse(const std::string& doc) { return JSONDocument::ParseJSON(ConvertQuotes(doc).c_str()); } @@ -164,7 +164,9 @@ TEST(DocumentDBTest, ComplexQueryTest) { "{'_id': 8, 'job_name': 'rock', 'priority': 3, 'progress': 93.24}", "{'_id': 9, 'job_name': 'steady', 'priority': 3, 'progress': 9.1}", "{'_id': 10, 'job_name': 'white', 'priority': 1, 'progress': 61.4}", - "{'_id': 11, 'job_name': 'who', 'priority': 4, 'progress': 39.41}", }; + "{'_id': 11, 'job_name': 'who', 'priority': 4, 'progress': 39.41}", + "{'_id': 12, 'job_name': 'who', 'priority': -1, 'progress': 39.42}", + "{'_id': 13, 'job_name': 'who', 'priority': -2, 'progress': 39.42}", }; // add index on the fly! CreateIndexes({job_name_index}); @@ -185,6 +187,15 @@ TEST(DocumentDBTest, ComplexQueryTest) { AssertCursorIDs(cursor.get(), {4, 8}); } + // -1 <= priority <= 1, index priority + { + std::unique_ptr query(Parse( + "[{'$filter': {'priority': {'$lte': 1, '$gte': -1}," + " '$index': 'priority'}}]")); + std::unique_ptr cursor(db_->Query(ReadOptions(), *query)); + AssertCursorIDs(cursor.get(), {6, 10, 12}); + } + // 2 < priority < 4 AND progress > 10.0, index progress { std::unique_ptr query(Parse( @@ -209,7 +220,7 @@ TEST(DocumentDBTest, ComplexQueryTest) { "[{'$filter': {'progress': {'$gt': 5.0, '$gte': 35.0, '$lt': 65.5}, " "'$index': 'progress'}}]")); std::unique_ptr cursor(db_->Query(ReadOptions(), *query)); - AssertCursorIDs(cursor.get(), {2, 5, 10, 11}); + AssertCursorIDs(cursor.get(), {2, 5, 10, 11, 12, 13}); } // 2 < priority <= 4, index priority @@ -244,6 +255,35 @@ TEST(DocumentDBTest, ComplexQueryTest) { ASSERT_OK(db_->Update(ReadOptions(), WriteOptions(), *query, *update)); } + // priority < 0 + { + std::unique_ptr query( + 
Parse("[{'$filter': {'priority': {'$lt': 0}, '$index': 'priority'}}]")); + std::unique_ptr cursor(db_->Query(ReadOptions(), *query)); + ASSERT_OK(cursor->status()); + AssertCursorIDs(cursor.get(), {12, 13}); + } + + // -2 < priority < 0 + { + std::unique_ptr query( + Parse("[{'$filter': {'priority': {'$gt': -2, '$lt': 0}," + " '$index': 'priority'}}]")); + std::unique_ptr cursor(db_->Query(ReadOptions(), *query)); + ASSERT_OK(cursor->status()); + AssertCursorIDs(cursor.get(), {12}); + } + + // -2 <= priority < 0 + { + std::unique_ptr query( + Parse("[{'$filter': {'priority': {'$gte': -2, '$lt': 0}," + " '$index': 'priority'}}]")); + std::unique_ptr cursor(db_->Query(ReadOptions(), *query)); + ASSERT_OK(cursor->status()); + AssertCursorIDs(cursor.get(), {12, 13}); + } + // 4 < priority { std::unique_ptr query( diff --git a/utilities/document/json_document.cc b/utilities/document/json_document.cc index 641f4ee09..254574113 100644 --- a/utilities/document/json_document.cc +++ b/utilities/document/json_document.cc @@ -6,11 +6,16 @@ #include "rocksdb/utilities/json_document.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + #include + #include -#include #include +#include +#include #include #include "third-party/rapidjson/reader.h" @@ -28,9 +33,9 @@ JSONDocument::JSONDocument(const std::string& s) : type_(kString) { JSONDocument::JSONDocument(const char* s) : type_(kString) { new (&data_.s) std::string(s); } -JSONDocument::JSONDocument(Type type) : type_(type) { +JSONDocument::JSONDocument(Type _type) : type_(_type) { // TODO(icanadi) make all of this better by using templates - switch (type) { + switch (type_) { case kNull: break; case kObject: @@ -542,11 +547,11 @@ bool JSONDocument::DeserializeInternal(Slice* input) { } data_.a.resize(size); for (size_t i = 0; i < size; ++i) { - Type type; - if (!GetNextType(input, &type)) { + Type t; + if (!GetNextType(input, &t)) { return false; } - data_.a[i] = new JSONDocument(type); + data_.a[i] = new 
JSONDocument(t); if (!data_.a[i]->DeserializeInternal(input)) { return false; } @@ -579,10 +584,10 @@ bool JSONDocument::DeserializeInternal(Slice* input) { for (uint32_t i = 0; ok && i < num_elements; ++i) { Slice key; ok = GetLengthPrefixedSlice(input, &key); - Type type; - ok = ok && GetNextType(input, &type); + Type t; + ok = ok && GetNextType(input, &t); if (ok) { - std::unique_ptr value(new JSONDocument(type)); + std::unique_ptr value(new JSONDocument(t)); ok = value->DeserializeInternal(input); if (ok) { data_.o.insert({key.ToString(), value.get()}); diff --git a/utilities/geodb/geodb_impl.cc b/utilities/geodb/geodb_impl.cc index f63c91c3e..2cb9209e1 100644 --- a/utilities/geodb/geodb_impl.cc +++ b/utilities/geodb/geodb_impl.cc @@ -7,7 +7,9 @@ #include "utilities/geodb/geodb_impl.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif #include #include @@ -15,6 +17,7 @@ #include #include "db/filename.h" #include "util/coding.h" +#include "util/string_util.h" // // There are two types of keys. The first type of key-values @@ -81,7 +84,7 @@ Status GeoDBImpl::GetByPosition(const GeoPosition& pos, Status GeoDBImpl::GetById(const Slice& id, GeoObject* object) { Status status; - Slice quadkey; + std::string quadkey; // create an iterator so that we can get a consistent picture // of the database. 
@@ -94,7 +97,7 @@ Status GeoDBImpl::GetById(const Slice& id, GeoObject* object) { iter->Seek(key2); if (iter->Valid() && iter->status().ok()) { if (iter->key().compare(key2) == 0) { - quadkey = iter->value(); + quadkey = iter->value().ToString(); } } if (quadkey.size() == 0) { @@ -105,7 +108,7 @@ Status GeoDBImpl::GetById(const Slice& id, GeoObject* object) { // // Seek to the quadkey + id prefix // - std::string prefix = MakeKey1Prefix(quadkey.ToString(), id); + std::string prefix = MakeKey1Prefix(quadkey, id); iter->Seek(Slice(prefix)); assert(iter->Valid()); if (!iter->Valid() || !iter->status().ok()) { @@ -114,9 +117,8 @@ Status GeoDBImpl::GetById(const Slice& id, GeoObject* object) { } // split the key into p + quadkey + id + lat + lon - std::vector parts; Slice key = iter->key(); - StringSplit(&parts, key.ToString(), ':'); + std::vector parts = StringSplit(key.ToString(), ':'); assert(parts.size() == 5); assert(parts[0] == "p"); assert(parts[1] == quadkey); @@ -178,9 +180,8 @@ Status GeoDBImpl::SearchRadial(const GeoPosition& pos, number_of_values > 0 && iter->Valid() && iter->status().ok(); iter->Next()) { // split the key into p + quadkey + id + lat + lon - std::vector parts; Slice key = iter->key(); - StringSplit(&parts, key.ToString(), ':'); + std::vector parts = StringSplit(key.ToString(), ':'); assert(parts.size() == 5); assert(parts[0] == "p"); std::string* quadkey = &parts[1]; @@ -190,8 +191,8 @@ Status GeoDBImpl::SearchRadial(const GeoPosition& pos, // we are looking for. 
auto res = std::mismatch(qid.begin(), qid.end(), quadkey->begin()); if (res.first == qid.end()) { - GeoPosition pos(atof(parts[3].c_str()), atof(parts[4].c_str())); - GeoObject obj(pos, parts[4], iter->value().ToString()); + GeoPosition obj_pos(atof(parts[3].c_str()), atof(parts[4].c_str())); + GeoObject obj(obj_pos, parts[4], iter->value().ToString()); values->push_back(obj); number_of_values--; } else { @@ -241,16 +242,6 @@ std::string GeoDBImpl::MakeQuadKeyPrefix(std::string quadkey) { return key; } -void GeoDBImpl::StringSplit(std::vector* tokens, - const std::string &text, char sep) { - std::size_t start = 0, end = 0; - while ((end = text.find(sep, start)) != std::string::npos) { - tokens->push_back(text.substr(start, end - start)); - start = end + 1; - } - tokens->push_back(text.substr(start)); -} - // convert degrees to radians double GeoDBImpl::radians(double x) { return (x * PI) / 180; @@ -395,10 +386,10 @@ std::string GeoDBImpl::TileToQuadKey(const Tile& tile, int levelOfDetail) { // Convert a quadkey to a tile and its level of detail // void GeoDBImpl::QuadKeyToTile(std::string quadkey, Tile* tile, - int *levelOfDetail) { + int* levelOfDetail) { tile->x = tile->y = 0; - *levelOfDetail = quadkey.size(); - const char* key = reinterpret_cast(quadkey.c_str()); + *levelOfDetail = static_cast(quadkey.size()); + const char* key = reinterpret_cast(quadkey.c_str()); for (int i = *levelOfDetail; i > 0; i--) { int mask = 1 << (i - 1); switch (key[*levelOfDetail - i]) { diff --git a/utilities/geodb/geodb_impl.h b/utilities/geodb/geodb_impl.h index c7e410458..94b2d6ceb 100644 --- a/utilities/geodb/geodb_impl.h +++ b/utilities/geodb/geodb_impl.h @@ -169,11 +169,6 @@ class GeoDBImpl : public GeoDB { double radius, std::vector* quadKeys); - // splits a string into its components - static void StringSplit(std::vector* tokens, - const std::string &text, - char sep); - // // Create keys for accessing rocksdb table(s) // diff --git 
a/utilities/leveldb_options/leveldb_options.cc b/utilities/leveldb_options/leveldb_options.cc new file mode 100644 index 000000000..cb7dfb8ea --- /dev/null +++ b/utilities/leveldb_options/leveldb_options.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "rocksdb/utilities/leveldb_options.h" +#include "rocksdb/cache.h" +#include "rocksdb/comparator.h" +#include "rocksdb/env.h" +#include "rocksdb/filter_policy.h" +#include "rocksdb/options.h" +#include "rocksdb/table.h" + +namespace rocksdb { + +LevelDBOptions::LevelDBOptions() + : comparator(BytewiseComparator()), + create_if_missing(false), + error_if_exists(false), + paranoid_checks(false), + env(Env::Default()), + info_log(nullptr), + write_buffer_size(4 << 20), + max_open_files(1000), + block_cache(nullptr), + block_size(4096), + block_restart_interval(16), + compression(kSnappyCompression), + filter_policy(nullptr) {} + +Options ConvertOptions(const LevelDBOptions& leveldb_options) { + Options options = Options(); + options.create_if_missing = leveldb_options.create_if_missing; + options.error_if_exists = leveldb_options.error_if_exists; + options.paranoid_checks = leveldb_options.paranoid_checks; + options.env = leveldb_options.env; + options.info_log.reset(leveldb_options.info_log); + options.write_buffer_size = leveldb_options.write_buffer_size; + options.max_open_files = leveldb_options.max_open_files; + options.compression = leveldb_options.compression; + + BlockBasedTableOptions table_options; + 
table_options.block_cache.reset(leveldb_options.block_cache); + table_options.block_size = leveldb_options.block_size; + table_options.block_restart_interval = leveldb_options.block_restart_interval; + table_options.filter_policy.reset(leveldb_options.filter_policy); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + return options; +} + +} // namespace rocksdb diff --git a/utilities/merge_operators/uint64add.cc b/utilities/merge_operators/uint64add.cc index 9d78651ec..d5083e300 100644 --- a/utilities/merge_operators/uint64add.cc +++ b/utilities/merge_operators/uint64add.cc @@ -1,3 +1,8 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + #include #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" @@ -45,7 +50,8 @@ class UInt64AddOperator : public AssociativeMergeOperator { result = DecodeFixed64(value.data()); } else if (logger != nullptr) { // If value is corrupted, treat it as 0 - Log(logger, "uint64 value corruption, size: %zu > %zu", + Log(InfoLogLevel::ERROR_LEVEL, logger, + "uint64 value corruption, size: %zu > %zu", value.size(), sizeof(uint64_t)); } diff --git a/utilities/redis/redis_list_iterator.h b/utilities/redis/redis_list_iterator.h index b776ada24..6d0b1a6af 100644 --- a/utilities/redis/redis_list_iterator.h +++ b/utilities/redis/redis_list_iterator.h @@ -67,7 +67,7 @@ class RedisListIterator { /// attempted, a RedisListException will immediately be thrown. 
RedisListIterator(const std::string& list_data) : data_(list_data.data()), - num_bytes_(list_data.size()), + num_bytes_(static_cast(list_data.size())), cur_byte_(0), cur_elem_(0), cur_elem_length_(0), @@ -135,11 +135,11 @@ class RedisListIterator { // Ensure we are in a valid state CheckErrors(); - const int kOrigSize = result_.size(); + const int kOrigSize = static_cast(result_.size()); result_.resize(kOrigSize + SizeOf(elem)); - EncodeFixed32(result_.data() + kOrigSize, elem.size()); - memcpy(result_.data() + kOrigSize + sizeof(uint32_t), - elem.data(), + EncodeFixed32(result_.data() + kOrigSize, + static_cast(elem.size())); + memcpy(result_.data() + kOrigSize + sizeof(uint32_t), elem.data(), elem.size()); ++length_; ++cur_elem_; @@ -169,7 +169,7 @@ class RedisListIterator { int Size() const { // result_ holds the currently written data // data_[cur_byte..num_bytes-1] is the remainder of the data - return result_.size() + (num_bytes_ - cur_byte_); + return static_cast(result_.size() + (num_bytes_ - cur_byte_)); } // Reached the end? @@ -209,7 +209,7 @@ class RedisListIterator { /// E.G. This can be used to compute the bytes we want to Reserve(). static uint32_t SizeOf(const Slice& elem) { // [Integer Length . 
Data] - return sizeof(uint32_t) + elem.size(); + return static_cast(sizeof(uint32_t) + elem.size()); } private: // Private functions diff --git a/utilities/redis/redis_lists_test.cc b/utilities/redis/redis_lists_test.cc index b05c6c798..302f02d7c 100644 --- a/utilities/redis/redis_lists_test.cc +++ b/utilities/redis/redis_lists_test.cc @@ -745,9 +745,9 @@ TEST(RedisListsTest, PersistenceMultiKeyTest) { namespace { void MakeUpper(std::string* const s) { - int len = s->length(); - for(int i=0; i + int len = static_cast(s->length()); + for (int i = 0; i < len; ++i) { + (*s)[i] = toupper((*s)[i]); // C-version defined in } } diff --git a/utilities/spatialdb/spatial_db.cc b/utilities/spatialdb/spatial_db.cc index 8b9e49bd4..2a4f7b14e 100644 --- a/utilities/spatialdb/spatial_db.cc +++ b/utilities/spatialdb/spatial_db.cc @@ -7,11 +7,17 @@ #include "rocksdb/utilities/spatial_db.h" +#ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS +#endif + +#include +#include #include #include #include -#include +#include +#include #include #include @@ -19,6 +25,7 @@ #include "rocksdb/options.h" #include "rocksdb/memtablerep.h" #include "rocksdb/slice_transform.h" +#include "rocksdb/statistics.h" #include "rocksdb/table.h" #include "rocksdb/db.h" #include "rocksdb/utilities/stackable_db.h" @@ -218,6 +225,7 @@ std::string FeatureSet::DebugString() const { switch (iter.second.type()) { case Variant::kNull: out.append("null"); + break; case Variant::kBool: if (iter.second.get_bool()) { out.append("true"); @@ -364,7 +372,7 @@ class SpatialIndexCursor : public Cursor { } delete spatial_iterator; - valid_ = valid_ && primary_key_ids_.size() > 0; + valid_ = valid_ && !primary_key_ids_.empty(); if (valid_) { primary_keys_iterator_ = primary_key_ids_.begin(); @@ -512,6 +520,7 @@ class SpatialDBImpl : public SpatialDB { return Status::InvalidArgument("Spatial indexes can't be empty"); } + const size_t kWriteOutEveryBytes = 1024 * 1024; // 1MB uint64_t id = next_id_.fetch_add(1); for (const 
auto& si : spatial_indexes) { @@ -533,6 +542,13 @@ class SpatialDBImpl : public SpatialDB { &key, GetQuadKeyFromTile(x, y, spatial_index.tile_bits)); PutFixed64BigEndian(&key, id); batch.Put(itr->second.column_family, key, Slice()); + if (batch.GetDataSize() >= kWriteOutEveryBytes) { + Status s = Write(write_options, &batch); + batch.Clear(); + if (!s.ok()) { + return s; + } + } } } } @@ -548,26 +564,49 @@ class SpatialDBImpl : public SpatialDB { return Write(write_options, &batch); } - virtual Status Compact() override { - Status s, t; + virtual Status Compact(int num_threads) override { + std::vector column_families; + column_families.push_back(data_column_family_); + for (auto& iter : name_to_index_) { - t = Flush(FlushOptions(), iter.second.column_family); - if (!t.ok()) { - s = t; - } - t = CompactRange(iter.second.column_family, nullptr, nullptr); - if (!t.ok()) { - s = t; - } + column_families.push_back(iter.second.column_family); } - t = Flush(FlushOptions(), data_column_family_); - if (!t.ok()) { - s = t; + + std::mutex state_mutex; + std::condition_variable cv; + Status s; + int threads_running = 0; + + std::vector threads; + + for (auto cfh : column_families) { + threads.emplace_back([&, cfh] { + { + std::unique_lock lk(state_mutex); + cv.wait(lk, [&] { return threads_running < num_threads; }); + threads_running++; + } + + Status t = Flush(FlushOptions(), cfh); + if (t.ok()) { + t = CompactRange(cfh, nullptr, nullptr); + } + + { + std::unique_lock lk(state_mutex); + threads_running--; + if (s.ok() && !t.ok()) { + s = t; + } + cv.notify_one(); + } + }); } - t = CompactRange(data_column_family_, nullptr, nullptr); - if (!t.ok()) { - s = t; + + for (auto& t : threads) { + t.join(); } + return s; } @@ -621,6 +660,7 @@ class SpatialDBImpl : public SpatialDB { namespace { DBOptions GetDBOptions(const SpatialDBOptions& options) { DBOptions db_options; + db_options.max_open_files = 50000; db_options.max_background_compactions = 3 * options.num_threads / 4; 
db_options.max_background_flushes = options.num_threads - db_options.max_background_compactions; @@ -628,8 +668,12 @@ DBOptions GetDBOptions(const SpatialDBOptions& options) { Env::LOW); db_options.env->SetBackgroundThreads(db_options.max_background_flushes, Env::HIGH); + db_options.statistics = CreateDBStatistics(); if (options.bulk_load) { + db_options.stats_dump_period_sec = 600; db_options.disableDataSync = true; + } else { + db_options.stats_dump_period_sec = 1800; // 30min } return db_options; } @@ -639,6 +683,8 @@ ColumnFamilyOptions GetColumnFamilyOptions(const SpatialDBOptions& options, ColumnFamilyOptions column_family_options; column_family_options.write_buffer_size = 128 * 1024 * 1024; // 128MB column_family_options.max_write_buffer_number = 4; + column_family_options.max_bytes_for_level_base = 256 * 1024 * 1024; // 256MB + column_family_options.target_file_size_base = 64 * 1024 * 1024; // 64MB column_family_options.level0_file_num_compaction_trigger = 2; column_family_options.level0_slowdown_writes_trigger = 16; column_family_options.level0_slowdown_writes_trigger = 32; diff --git a/utilities/spatialdb/spatial_db_test.cc b/utilities/spatialdb/spatial_db_test.cc index 166920b57..0484f8c02 100644 --- a/utilities/spatialdb/spatial_db_test.cc +++ b/utilities/spatialdb/spatial_db_test.cc @@ -245,7 +245,10 @@ TEST(SpatialDBTest, RandomizedTest) { elements.push_back(make_pair(blob, bbox)); } - db_->Compact(); + // parallel + db_->Compact(2); + // serial + db_->Compact(1); for (int i = 0; i < 1000; ++i) { BoundingBox int_bbox = RandomBoundingBox(128, &rnd, 10); diff --git a/utilities/spatialdb/utils.h b/utilities/spatialdb/utils.h index eaf3c9b4e..c65ccf561 100644 --- a/utilities/spatialdb/utils.h +++ b/utilities/spatialdb/utils.h @@ -27,7 +27,7 @@ inline uint64_t GetQuadKeyFromTile(uint64_t tile_x, uint64_t tile_y, uint32_t tile_bits) { uint64_t quad_key = 0; for (uint32_t i = 0; i < tile_bits; ++i) { - uint32_t mask = (1LL << i); + uint64_t mask = 
static_cast(1LL << i); quad_key |= (tile_x & mask) << i; quad_key |= (tile_y & mask) << (i + 1); } diff --git a/utilities/ttl/db_ttl_impl.cc b/utilities/ttl/db_ttl_impl.cc index 4d2d8406e..622e668b1 100644 --- a/utilities/ttl/db_ttl_impl.cc +++ b/utilities/ttl/db_ttl_impl.cc @@ -202,8 +202,18 @@ std::vector DBWithTTLImpl::MultiGet( const ReadOptions& options, const std::vector& column_family, const std::vector& keys, std::vector* values) { - return std::vector( - keys.size(), Status::NotSupported("MultiGet not supported with TTL")); + auto statuses = db_->MultiGet(options, column_family, keys, values); + for (size_t i = 0; i < keys.size(); ++i) { + if (!statuses[i].ok()) { + continue; + } + statuses[i] = SanityCheckTimestamp((*values)[i]); + if (!statuses[i].ok()) { + continue; + } + statuses[i] = StripTS(&(*values)[i]); + } + return statuses; } bool DBWithTTLImpl::KeyMayExist(const ReadOptions& options, diff --git a/utilities/ttl/db_ttl_impl.h b/utilities/ttl/db_ttl_impl.h index 84fb55568..6ca1ac157 100644 --- a/utilities/ttl/db_ttl_impl.h +++ b/utilities/ttl/db_ttl_impl.h @@ -206,7 +206,7 @@ class TtlCompactionFilterFactory : public CompactionFilterFactory { class TtlMergeOperator : public MergeOperator { public: - explicit TtlMergeOperator(const std::shared_ptr merge_op, + explicit TtlMergeOperator(const std::shared_ptr& merge_op, Env* env) : user_merge_op_(merge_op), env_(env) { assert(merge_op); @@ -219,7 +219,8 @@ class TtlMergeOperator : public MergeOperator { override { const uint32_t ts_len = DBWithTTLImpl::kTSLength; if (existing_value && existing_value->size() < ts_len) { - Log(logger, "Error: Could not remove timestamp from existing value."); + Log(InfoLogLevel::ERROR_LEVEL, logger, + "Error: Could not remove timestamp from existing value."); return false; } @@ -227,7 +228,8 @@ class TtlMergeOperator : public MergeOperator { std::deque operands_without_ts; for (const auto& operand : operands) { if (operand.size() < ts_len) { - Log(logger, "Error: Could 
not remove timestamp from operand value."); + Log(InfoLogLevel::ERROR_LEVEL, logger, + "Error: Could not remove timestamp from operand value."); return false; } operands_without_ts.push_back(operand.substr(0, operand.size() - ts_len)); @@ -253,7 +255,7 @@ class TtlMergeOperator : public MergeOperator { // Augment the *new_value with the ttl time-stamp int64_t curtime; if (!env_->GetCurrentTime(&curtime).ok()) { - Log(logger, + Log(InfoLogLevel::ERROR_LEVEL, logger, "Error: Could not get current time to be attached internally " "to the new value."); return false; @@ -274,7 +276,8 @@ class TtlMergeOperator : public MergeOperator { for (const auto& operand : operand_list) { if (operand.size() < ts_len) { - Log(logger, "Error: Could not remove timestamp from value."); + Log(InfoLogLevel::ERROR_LEVEL, logger, + "Error: Could not remove timestamp from value."); return false; } @@ -292,7 +295,7 @@ class TtlMergeOperator : public MergeOperator { // Augment the *new_value with the ttl time-stamp int64_t curtime; if (!env_->GetCurrentTime(&curtime).ok()) { - Log(logger, + Log(InfoLogLevel::ERROR_LEVEL, logger, "Error: Could not get current time to be attached internally " "to the new value."); return false; diff --git a/utilities/ttl/ttl_test.cc b/utilities/ttl/ttl_test.cc index e6d64e54e..73756a704 100644 --- a/utilities/ttl/ttl_test.cc +++ b/utilities/ttl/ttl_test.cc @@ -94,7 +94,8 @@ class TtlTest { void MakeKVMap(int64_t num_entries) { kvmap_.clear(); int digits = 1; - for (int dummy = num_entries; dummy /= 10 ; ++digits); + for (int64_t dummy = num_entries; dummy /= 10; ++digits) { + } int digits_in_i = 1; for (int64_t i = 0; i < num_entries; i++) { std::string key = "key"; @@ -110,17 +111,18 @@ class TtlTest { AppendNumberTo(&value, i); kvmap_[key] = value; } - ASSERT_EQ((int)kvmap_.size(), num_entries);//check all insertions done + ASSERT_EQ(static_cast(kvmap_.size()), + num_entries); // check all insertions done } // Makes a write-batch with key-vals from kvmap_ and 
'Write''s it - void MakePutWriteBatch(const BatchOperation* batch_ops, int num_ops) { - ASSERT_LE(num_ops, (int)kvmap_.size()); + void MakePutWriteBatch(const BatchOperation* batch_ops, int64_t num_ops) { + ASSERT_LE(num_ops, static_cast(kvmap_.size())); static WriteOptions wopts; static FlushOptions flush_opts; WriteBatch batch; kv_it_ = kvmap_.begin(); - for (int i = 0; i < num_ops && kv_it_ != kvmap_.end(); i++, kv_it_++) { + for (int64_t i = 0; i < num_ops && kv_it_ != kvmap_.end(); i++, ++kv_it_) { switch (batch_ops[i]) { case PUT: batch.Put(kv_it_->first, kv_it_->second); @@ -137,15 +139,16 @@ class TtlTest { } // Puts num_entries starting from start_pos_map from kvmap_ into the database - void PutValues(int start_pos_map, int num_entries, bool flush = true, + void PutValues(int64_t start_pos_map, int64_t num_entries, bool flush = true, ColumnFamilyHandle* cf = nullptr) { ASSERT_TRUE(db_ttl_); - ASSERT_LE(start_pos_map + num_entries, (int)kvmap_.size()); + ASSERT_LE(start_pos_map + num_entries, static_cast(kvmap_.size())); static WriteOptions wopts; static FlushOptions flush_opts; kv_it_ = kvmap_.begin(); advance(kv_it_, start_pos_map); - for (int i = 0; kv_it_ != kvmap_.end() && i < num_entries; i++, kv_it_++) { + for (int64_t i = 0; kv_it_ != kvmap_.end() && i < num_entries; + i++, ++kv_it_) { ASSERT_OK(cf == nullptr ? 
db_ttl_->Put(wopts, kv_it_->first, kv_it_->second) : db_ttl_->Put(wopts, cf, kv_it_->first, kv_it_->second)); @@ -191,13 +194,32 @@ class TtlTest { } } + // checks the whole kvmap_ to return correct values using MultiGet + void SimpleMultiGetTest() { + static ReadOptions ropts; + std::vector keys; + std::vector values; + + for (auto& kv : kvmap_) { + keys.emplace_back(kv.first); + } + + auto statuses = db_ttl_->MultiGet(ropts, keys, &values); + size_t i = 0; + for (auto& kv : kvmap_) { + ASSERT_OK(statuses[i]); + ASSERT_EQ(values[i], kv.second); + ++i; + } + } + // Sleeps for slp_tim then runs a manual compaction // Checks span starting from st_pos from kvmap_ in the db and // Gets should return true if check is true and false otherwise // Also checks that value that we got is the same as inserted; and =kNewValue // if test_compaction_change is true - void SleepCompactCheck(int slp_tim, int st_pos, int span, bool check = true, - bool test_compaction_change = false, + void SleepCompactCheck(int slp_tim, int64_t st_pos, int64_t span, + bool check = true, bool test_compaction_change = false, ColumnFamilyHandle* cf = nullptr) { ASSERT_TRUE(db_ttl_); @@ -207,7 +229,7 @@ class TtlTest { kv_it_ = kvmap_.begin(); advance(kv_it_, st_pos); std::string v; - for (int i = 0; kv_it_ != kvmap_.end() && i < span; i++, kv_it_++) { + for (int64_t i = 0; kv_it_ != kvmap_.end() && i < span; i++, ++kv_it_) { Status s = (cf == nullptr) ? 
db_ttl_->Get(ropts, kv_it_->first, &v) : db_ttl_->Get(ropts, cf, kv_it_->first, &v); if (s.ok() != check) { @@ -235,7 +257,8 @@ class TtlTest { } // Similar as SleepCompactCheck but uses TtlIterator to read from db - void SleepCompactCheckIter(int slp, int st_pos, int span, bool check=true) { + void SleepCompactCheckIter(int slp, int st_pos, int64_t span, + bool check = true) { ASSERT_TRUE(db_ttl_); env_->Sleep(slp); ManualCompact(); @@ -250,9 +273,8 @@ class TtlTest { ASSERT_NE(dbiter->value().compare(kv_it_->second), 0); } } else { // dbiter should have found out kvmap_[st_pos] - for (int i = st_pos; - kv_it_ != kvmap_.end() && i < st_pos + span; - i++, kv_it_++) { + for (int64_t i = st_pos; kv_it_ != kvmap_.end() && i < st_pos + span; + i++, ++kv_it_) { ASSERT_TRUE(dbiter->Valid()); ASSERT_EQ(dbiter->value().compare(kv_it_->second), 0); dbiter->Next(); @@ -263,7 +285,7 @@ class TtlTest { class TestFilter : public CompactionFilter { public: - TestFilter(const int64_t kSampleSize, const std::string kNewValue) + TestFilter(const int64_t kSampleSize, const std::string& kNewValue) : kSampleSize_(kSampleSize), kNewValue_(kNewValue) { } @@ -288,7 +310,7 @@ class TtlTest { return false; // Keep keys not matching the format "key" } - int partition = kSampleSize_ / 3; + int64_t partition = kSampleSize_ / 3; if (num_key_end < partition) { return true; } else if (num_key_end < partition * 2) { @@ -311,7 +333,7 @@ class TtlTest { class TestFilterFactory : public CompactionFilterFactory { public: - TestFilterFactory(const int64_t kSampleSize, const std::string kNewValue) + TestFilterFactory(const int64_t kSampleSize, const std::string& kNewValue) : kSampleSize_(kSampleSize), kNewValue_(kNewValue) { } @@ -352,8 +374,8 @@ class TtlTest { // Partitions the sample-size provided into 3 sets over boundary1 and boundary2 TEST(TtlTest, NoEffect) { MakeKVMap(kSampleSize_); - int boundary1 = kSampleSize_ / 3; - int boundary2 = 2 * boundary1; + int64_t boundary1 = kSampleSize_ / 3; + 
int64_t boundary2 = 2 * boundary1; OpenTtl(); PutValues(0, boundary1); //T=0: Set1 never deleted @@ -510,9 +532,9 @@ TEST(TtlTest, CompactionFilter) { OpenTtlWithTestCompaction(3); PutValues(0, kSampleSize_); // T=0:Insert Set1. - int partition = kSampleSize_ / 3; - SleepCompactCheck(1, 0, partition, false); // Part dropped - SleepCompactCheck(0, partition, partition); // Part kept + int64_t partition = kSampleSize_ / 3; + SleepCompactCheck(1, 0, partition, false); // Part dropped + SleepCompactCheck(0, partition, partition); // Part kept SleepCompactCheck(0, 2 * partition, partition, true, true); // Part changed CloseTtl(); } @@ -530,6 +552,17 @@ TEST(TtlTest, KeyMayExist) { CloseTtl(); } +TEST(TtlTest, MultiGetTest) { + MakeKVMap(kSampleSize_); + + OpenTtl(); + PutValues(0, kSampleSize_, false); + + SimpleMultiGetTest(); + + CloseTtl(); +} + TEST(TtlTest, ColumnFamiliesTest) { DB* db; Options options; diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index 68b3d3970..160e7dac7 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -4,13 +4,289 @@ // of patent rights can be found in the PATENTS file in the same directory. 
#include "rocksdb/utilities/write_batch_with_index.h" + +#include + #include "rocksdb/comparator.h" +#include "rocksdb/iterator.h" #include "db/column_family.h" #include "db/skiplist.h" #include "util/arena.h" namespace rocksdb { -namespace { + +// when direction == forward +// * current_at_base_ <=> base_iterator > delta_iterator +// when direction == backwards +// * current_at_base_ <=> base_iterator < delta_iterator +// always: +// * equal_keys_ <=> base_iterator == delta_iterator +class BaseDeltaIterator : public Iterator { + public: + BaseDeltaIterator(Iterator* base_iterator, WBWIIterator* delta_iterator, + const Comparator* comparator) + : forward_(true), + current_at_base_(true), + equal_keys_(false), + status_(Status::OK()), + base_iterator_(base_iterator), + delta_iterator_(delta_iterator), + comparator_(comparator) {} + + virtual ~BaseDeltaIterator() {} + + bool Valid() const override { + return current_at_base_ ? BaseValid() : DeltaValid(); + } + + void SeekToFirst() override { + forward_ = true; + base_iterator_->SeekToFirst(); + delta_iterator_->SeekToFirst(); + UpdateCurrent(); + } + + void SeekToLast() override { + forward_ = false; + base_iterator_->SeekToLast(); + delta_iterator_->SeekToLast(); + UpdateCurrent(); + } + + void Seek(const Slice& k) override { + forward_ = true; + base_iterator_->Seek(k); + delta_iterator_->Seek(k); + UpdateCurrent(); + } + + void Next() override { + if (!Valid()) { + status_ = Status::NotSupported("Next() on invalid iterator"); + } + + if (!forward_) { + // Need to change direction + // if our direction was backward and we're not equal, we have two states: + // * both iterators are valid: we're already in a good state (current + // shows to smaller) + // * only one iterator is valid: we need to advance that iterator + forward_ = true; + equal_keys_ = false; + if (!BaseValid()) { + assert(DeltaValid()); + base_iterator_->SeekToFirst(); + } else if (!DeltaValid()) { + delta_iterator_->SeekToFirst(); + } else if 
(current_at_base_) { + // Change delta from larger than base to smaller + AdvanceDelta(); + } else { + // Change base from larger than delta to smaller + AdvanceBase(); + } + if (DeltaValid() && BaseValid()) { + if (Compare() == 0) { + equal_keys_ = true; + } + } + } + Advance(); + } + + void Prev() override { + if (!Valid()) { + status_ = Status::NotSupported("Prev() on invalid iterator"); + } + + if (forward_) { + // Need to change direction + // if our direction was backward and we're not equal, we have two states: + // * both iterators are valid: we're already in a good state (current + // shows to smaller) + // * only one iterator is valid: we need to advance that iterator + forward_ = false; + equal_keys_ = false; + if (!BaseValid()) { + assert(DeltaValid()); + base_iterator_->SeekToLast(); + } else if (!DeltaValid()) { + delta_iterator_->SeekToLast(); + } else if (current_at_base_) { + // Change delta from less advanced than base to more advanced + AdvanceDelta(); + } else { + // Change base from less advanced than delta to more advanced + AdvanceBase(); + } + if (DeltaValid() && BaseValid()) { + if (Compare() == 0) { + equal_keys_ = true; + } + } + } + + Advance(); + } + + Slice key() const override { + return current_at_base_ ? base_iterator_->key() + : delta_iterator_->Entry().key; + } + + Slice value() const override { + return current_at_base_ ? 
base_iterator_->value() + : delta_iterator_->Entry().value; + } + + Status status() const { + if (!status_.ok()) { + return status_; + } + if (!base_iterator_->status().ok()) { + return base_iterator_->status(); + } + return delta_iterator_->status(); + } + + private: + // -1 -- delta less advanced than base + // 0 -- delta == base + // 1 -- delta more advanced than base + int Compare() const { + assert(delta_iterator_->Valid() && base_iterator_->Valid()); + int cmp = comparator_->Compare(delta_iterator_->Entry().key, + base_iterator_->key()); + if (forward_) { + return cmp; + } else { + return -cmp; + } + } + bool IsDeltaDelete() { + assert(DeltaValid()); + return delta_iterator_->Entry().type == kDeleteRecord; + } + void AssertInvariants() { +#ifndef NDEBUG + if (!Valid()) { + return; + } + if (!BaseValid()) { + assert(!current_at_base_ && delta_iterator_->Valid()); + return; + } + if (!DeltaValid()) { + assert(current_at_base_ && base_iterator_->Valid()); + return; + } + // we don't support those yet + assert(delta_iterator_->Entry().type != kMergeRecord && + delta_iterator_->Entry().type != kLogDataRecord); + int compare = comparator_->Compare(delta_iterator_->Entry().key, + base_iterator_->key()); + if (forward_) { + // current_at_base -> compare < 0 + assert(!current_at_base_ || compare < 0); + // !current_at_base -> compare <= 0 + assert(current_at_base_ && compare >= 0); + } else { + // current_at_base -> compare > 0 + assert(!current_at_base_ || compare > 0); + // !current_at_base -> compare <= 0 + assert(current_at_base_ && compare <= 0); + } + // equal_keys_ <=> compare == 0 + assert((equal_keys_ || compare != 0) && (!equal_keys_ || compare == 0)); +#endif + } + + void Advance() { + if (equal_keys_) { + assert(BaseValid() && DeltaValid()); + AdvanceBase(); + AdvanceDelta(); + } else { + if (current_at_base_) { + assert(BaseValid()); + AdvanceBase(); + } else { + assert(DeltaValid()); + AdvanceDelta(); + } + } + UpdateCurrent(); + } + + void 
AdvanceDelta() { + if (forward_) { + delta_iterator_->Next(); + } else { + delta_iterator_->Prev(); + } + } + void AdvanceBase() { + if (forward_) { + base_iterator_->Next(); + } else { + base_iterator_->Prev(); + } + } + bool BaseValid() const { return base_iterator_->Valid(); } + bool DeltaValid() const { return delta_iterator_->Valid(); } + void UpdateCurrent() { + while (true) { + equal_keys_ = false; + if (!BaseValid()) { + // Base has finished. + if (!DeltaValid()) { + // Finished + return; + } + if (IsDeltaDelete()) { + AdvanceDelta(); + } else { + current_at_base_ = false; + return; + } + } else if (!DeltaValid()) { + // Delta has finished. + current_at_base_ = true; + return; + } else { + int compare = Compare(); + if (compare <= 0) { // delta bigger or equal + if (compare == 0) { + equal_keys_ = true; + } + if (!IsDeltaDelete()) { + current_at_base_ = false; + return; + } + // Delta is less advanced and is delete. + AdvanceDelta(); + if (equal_keys_) { + AdvanceBase(); + } + } else { + current_at_base_ = true; + return; + } + } + } + + AssertInvariants(); + } + + bool forward_; + bool current_at_base_; + bool equal_keys_; + Status status_; + std::unique_ptr base_iterator_; + std::unique_ptr delta_iterator_; + const Comparator* comparator_; // not owned +}; + class ReadableWriteBatch : public WriteBatch { public: explicit ReadableWriteBatch(size_t reserved_bytes = 0) @@ -20,7 +296,6 @@ class ReadableWriteBatch : public WriteBatch { Status GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key, Slice* value, Slice* blob) const; }; -} // namespace // Key used by skip list, as the binary searchable index of WriteBatchWithIndex. 
struct WriteBatchIndexEntry { @@ -29,6 +304,10 @@ struct WriteBatchIndexEntry { WriteBatchIndexEntry(const Slice* sk, uint32_t c) : offset(0), column_family(c), search_key(sk) {} + // If this flag appears in the offset, it indicates a key that is smaller + // than any other entry for the same column family + static const size_t kFlagMin = std::numeric_limits::max(); + size_t offset; // offset of an entry in write batch's string buffer. uint32_t column_family; // column family of the entry const Slice* search_key; // if not null, instead of reading keys from @@ -38,44 +317,33 @@ struct WriteBatchIndexEntry { class WriteBatchEntryComparator { public: - WriteBatchEntryComparator(const Comparator* comparator, + WriteBatchEntryComparator(const Comparator* _default_comparator, const ReadableWriteBatch* write_batch) - : comparator_(comparator), write_batch_(write_batch) {} + : default_comparator_(_default_comparator), write_batch_(write_batch) {} // Compare a and b. Return a negative value if a is less than b, 0 if they // are equal, and a positive value if a is greater than b int operator()(const WriteBatchIndexEntry* entry1, const WriteBatchIndexEntry* entry2) const; + int CompareKey(uint32_t column_family, const Slice& key1, + const Slice& key2) const; + + void SetComparatorForCF(uint32_t column_family_id, + const Comparator* comparator) { + cf_comparator_map_[column_family_id] = comparator; + } + + const Comparator* default_comparator() { return default_comparator_; } + private: - const Comparator* comparator_; + const Comparator* default_comparator_; + std::unordered_map cf_comparator_map_; const ReadableWriteBatch* write_batch_; }; typedef SkipList WriteBatchEntrySkipList; -struct WriteBatchWithIndex::Rep { - Rep(const Comparator* index_comparator, size_t reserved_bytes = 0) - : write_batch(reserved_bytes), - comparator(index_comparator, &write_batch), - skip_list(comparator, &arena) {} - ReadableWriteBatch write_batch; - WriteBatchEntryComparator comparator; - 
Arena arena; - WriteBatchEntrySkipList skip_list; - - WriteBatchIndexEntry* GetEntry(ColumnFamilyHandle* column_family) { - return GetEntryWithCfId(GetColumnFamilyID(column_family)); - } - - WriteBatchIndexEntry* GetEntryWithCfId(uint32_t column_family_id) { - auto* mem = arena.Allocate(sizeof(WriteBatchIndexEntry)); - auto* index_entry = new (mem) - WriteBatchIndexEntry(write_batch.GetDataSize(), column_family_id); - return index_entry; - } -}; - class WBWIIteratorImpl : public WBWIIterator { public: WBWIIteratorImpl(uint32_t column_family_id, @@ -90,6 +358,27 @@ class WBWIIteratorImpl : public WBWIIterator { virtual bool Valid() const override { return valid_; } + virtual void SeekToFirst() { + valid_ = true; + WriteBatchIndexEntry search_entry(WriteBatchIndexEntry::kFlagMin, + column_family_id_); + skip_list_iter_.Seek(&search_entry); + ReadEntry(); + } + + virtual void SeekToLast() { + valid_ = true; + WriteBatchIndexEntry search_entry(WriteBatchIndexEntry::kFlagMin, + column_family_id_ + 1); + skip_list_iter_.Seek(&search_entry); + if (!skip_list_iter_.Valid()) { + skip_list_iter_.SeekToLast(); + } else { + skip_list_iter_.Prev(); + } + ReadEntry(); + } + virtual void Seek(const Slice& key) override { valid_ = true; WriteBatchIndexEntry search_entry(&key, column_family_id_); @@ -102,10 +391,19 @@ class WBWIIteratorImpl : public WBWIIterator { ReadEntry(); } + virtual void Prev() override { + skip_list_iter_.Prev(); + ReadEntry(); + } + virtual const WriteEntry& Entry() const override { return current_; } virtual Status status() const override { return status_; } + const WriteBatchIndexEntry* GetRawEntry() const { + return skip_list_iter_.key(); + } + private: uint32_t column_family_id_; WriteBatchEntrySkipList::Iterator skip_list_iter_; @@ -139,6 +437,92 @@ class WBWIIteratorImpl : public WBWIIterator { } }; +struct WriteBatchWithIndex::Rep { + Rep(const Comparator* index_comparator, size_t reserved_bytes = 0, + bool _overwrite_key = false) + : 
write_batch(reserved_bytes), + comparator(index_comparator, &write_batch), + skip_list(comparator, &arena), + overwrite_key(_overwrite_key), + last_entry_offset(0) {} + ReadableWriteBatch write_batch; + WriteBatchEntryComparator comparator; + Arena arena; + WriteBatchEntrySkipList skip_list; + bool overwrite_key; + size_t last_entry_offset; + + // Remember current offset of internal write batch, which is used as + // the starting offset of the next record. + void SetLastEntryOffset() { last_entry_offset = write_batch.GetDataSize(); } + + // In overwrite mode, find the existing entry for the same key and update it + // to point to the current entry. + // Return true if the key is found and updated. + bool UpdateExistingEntry(ColumnFamilyHandle* column_family, const Slice& key); + bool UpdateExistingEntryWithCfId(uint32_t column_family_id, const Slice& key); + + // Add the recent entry to the update. + // In overwrite mode, if key already exists in the index, update it. + void AddOrUpdateIndex(ColumnFamilyHandle* column_family, const Slice& key); + void AddOrUpdateIndex(const Slice& key); + + // Allocate an index entry pointing to the last entry in the write batch and + // put it to skip list. 
+ void AddNewEntry(uint32_t column_family_id); +}; + +bool WriteBatchWithIndex::Rep::UpdateExistingEntry( + ColumnFamilyHandle* column_family, const Slice& key) { + uint32_t cf_id = GetColumnFamilyID(column_family); + return UpdateExistingEntryWithCfId(cf_id, key); +} + +bool WriteBatchWithIndex::Rep::UpdateExistingEntryWithCfId( + uint32_t column_family_id, const Slice& key) { + if (!overwrite_key) { + return false; + } + + WBWIIteratorImpl iter(column_family_id, &skip_list, &write_batch); + iter.Seek(key); + if (!iter.Valid()) { + return false; + } + if (comparator.CompareKey(column_family_id, key, iter.Entry().key) != 0) { + return false; + } + WriteBatchIndexEntry* non_const_entry = + const_cast(iter.GetRawEntry()); + non_const_entry->offset = last_entry_offset; + return true; +} + +void WriteBatchWithIndex::Rep::AddOrUpdateIndex( + ColumnFamilyHandle* column_family, const Slice& key) { + if (!UpdateExistingEntry(column_family, key)) { + uint32_t cf_id = GetColumnFamilyID(column_family); + const auto* cf_cmp = GetColumnFamilyUserComparator(column_family); + if (cf_cmp != nullptr) { + comparator.SetComparatorForCF(cf_id, cf_cmp); + } + AddNewEntry(cf_id); + } +} + +void WriteBatchWithIndex::Rep::AddOrUpdateIndex(const Slice& key) { + if (!UpdateExistingEntryWithCfId(0, key)) { + AddNewEntry(0); + } +} + +void WriteBatchWithIndex::Rep::AddNewEntry(uint32_t column_family_id) { + auto* mem = arena.Allocate(sizeof(WriteBatchIndexEntry)); + auto* index_entry = + new (mem) WriteBatchIndexEntry(last_entry_offset, column_family_id); + skip_list.Insert(index_entry); + } + Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset, WriteType* type, Slice* Key, Slice* value, @@ -179,9 +563,10 @@ Status ReadableWriteBatch::GetEntryFromDataOffset(size_t data_offset, return Status::OK(); } -WriteBatchWithIndex::WriteBatchWithIndex(const Comparator* index_comparator, - size_t reserved_bytes) - : rep(new Rep(index_comparator, reserved_bytes)) {} 
+WriteBatchWithIndex::WriteBatchWithIndex( + const Comparator* default_index_comparator, size_t reserved_bytes, + bool overwrite_key) + : rep(new Rep(default_index_comparator, reserved_bytes, overwrite_key)) {} WriteBatchWithIndex::~WriteBatchWithIndex() { delete rep; } @@ -197,30 +582,50 @@ WBWIIterator* WriteBatchWithIndex::NewIterator( &(rep->skip_list), &rep->write_batch); } +Iterator* WriteBatchWithIndex::NewIteratorWithBase( + ColumnFamilyHandle* column_family, Iterator* base_iterator) { + if (rep->overwrite_key == false) { + assert(false); + return nullptr; + } + return new BaseDeltaIterator(base_iterator, NewIterator(column_family), + GetColumnFamilyUserComparator(column_family)); +} + +Iterator* WriteBatchWithIndex::NewIteratorWithBase(Iterator* base_iterator) { + if (rep->overwrite_key == false) { + assert(false); + return nullptr; + } + // default column family's comparator + return new BaseDeltaIterator(base_iterator, NewIterator(), + rep->comparator.default_comparator()); +} + void WriteBatchWithIndex::Put(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { - auto* index_entry = rep->GetEntry(column_family); + rep->SetLastEntryOffset(); rep->write_batch.Put(column_family, key, value); - rep->skip_list.Insert(index_entry); + rep->AddOrUpdateIndex(column_family, key); } void WriteBatchWithIndex::Put(const Slice& key, const Slice& value) { - auto* index_entry = rep->GetEntryWithCfId(0); + rep->SetLastEntryOffset(); rep->write_batch.Put(key, value); - rep->skip_list.Insert(index_entry); + rep->AddOrUpdateIndex(key); } void WriteBatchWithIndex::Merge(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { - auto* index_entry = rep->GetEntry(column_family); + rep->SetLastEntryOffset(); rep->write_batch.Merge(column_family, key, value); - rep->skip_list.Insert(index_entry); + rep->AddOrUpdateIndex(column_family, key); } void WriteBatchWithIndex::Merge(const Slice& key, const Slice& value) { - auto* index_entry = 
rep->GetEntryWithCfId(0); + rep->SetLastEntryOffset(); rep->write_batch.Merge(key, value); - rep->skip_list.Insert(index_entry); + rep->AddOrUpdateIndex(key); } void WriteBatchWithIndex::PutLogData(const Slice& blob) { @@ -229,28 +634,15 @@ void WriteBatchWithIndex::PutLogData(const Slice& blob) { void WriteBatchWithIndex::Delete(ColumnFamilyHandle* column_family, const Slice& key) { - auto* index_entry = rep->GetEntry(column_family); + rep->SetLastEntryOffset(); rep->write_batch.Delete(column_family, key); - rep->skip_list.Insert(index_entry); + rep->AddOrUpdateIndex(column_family, key); } void WriteBatchWithIndex::Delete(const Slice& key) { - auto* index_entry = rep->GetEntryWithCfId(0); - rep->write_batch.Delete(key); - rep->skip_list.Insert(index_entry); -} - -void WriteBatchWithIndex::Delete(ColumnFamilyHandle* column_family, - const SliceParts& key) { - auto* index_entry = rep->GetEntry(column_family); - rep->write_batch.Delete(column_family, key); - rep->skip_list.Insert(index_entry); -} - -void WriteBatchWithIndex::Delete(const SliceParts& key) { - auto* index_entry = rep->GetEntryWithCfId(0); + rep->SetLastEntryOffset(); rep->write_batch.Delete(key); - rep->skip_list.Insert(index_entry); + rep->AddOrUpdateIndex(key); } int WriteBatchEntryComparator::operator()( @@ -262,6 +654,12 @@ int WriteBatchEntryComparator::operator()( return -1; } + if (entry1->offset == WriteBatchIndexEntry::kFlagMin) { + return -1; + } else if (entry2->offset == WriteBatchIndexEntry::kFlagMin) { + return 1; + } + Status s; Slice key1, key2; if (entry1->search_key == nullptr) { @@ -287,7 +685,7 @@ int WriteBatchEntryComparator::operator()( key2 = *(entry2->search_key); } - int cmp = comparator_->Compare(key1, key2); + int cmp = CompareKey(entry1->column_family, key1, key2); if (cmp != 0) { return cmp; } else if (entry1->offset > entry2->offset) { @@ -298,4 +696,15 @@ int WriteBatchEntryComparator::operator()( return 0; } +int WriteBatchEntryComparator::CompareKey(uint32_t 
column_family, + const Slice& key1, + const Slice& key2) const { + auto comparator_for_cf = cf_comparator_map_.find(column_family); + if (comparator_for_cf != cf_comparator_map_.end()) { + return comparator_for_cf->second->Compare(key1, key2); + } else { + return default_comparator_->Compare(key1, key2); + } +} + } // namespace rocksdb diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc index fdceed4c4..f5d6a55a3 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_test.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -19,12 +19,16 @@ namespace rocksdb { namespace { class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl { public: - explicit ColumnFamilyHandleImplDummy(int id) - : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {} + explicit ColumnFamilyHandleImplDummy(int id, const Comparator* comparator) + : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), + id_(id), + comparator_(comparator) {} uint32_t GetID() const override { return id_; } + const Comparator* user_comparator() const override { return comparator_; } private: uint32_t id_; + const Comparator* comparator_; }; struct Entry { @@ -90,8 +94,9 @@ TEST(WriteBatchWithIndexTest, TestValueAsSecondaryIndex) { index_map[e.value].push_back(&e); } - WriteBatchWithIndex batch(BytewiseComparator(), 20); - ColumnFamilyHandleImplDummy data(6), index(8); + WriteBatchWithIndex batch(nullptr, 20); + ColumnFamilyHandleImplDummy data(6, BytewiseComparator()); + ColumnFamilyHandleImplDummy index(8, BytewiseComparator()); for (auto& e : entries) { if (e.type == kPutRecord) { batch.Put(&data, e.key, e.value); @@ -115,18 +120,39 @@ TEST(WriteBatchWithIndexTest, TestValueAsSecondaryIndex) { // Iterator all keys { std::unique_ptr iter(batch.NewIterator(&data)); - iter->Seek(""); - for (auto pair : data_map) { - for (auto v : pair.second) { + for (int seek_to_first : {0, 
1}) { + if (seek_to_first) { + iter->SeekToFirst(); + } else { + iter->Seek(""); + } + for (auto pair : data_map) { + for (auto v : pair.second) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + auto& write_entry = iter->Entry(); + ASSERT_EQ(pair.first, write_entry.key.ToString()); + ASSERT_EQ(v->type, write_entry.type); + if (write_entry.type != kDeleteRecord) { + ASSERT_EQ(v->value, write_entry.value.ToString()); + } + iter->Next(); + } + } + ASSERT_TRUE(!iter->Valid()); + } + iter->SeekToLast(); + for (auto pair = data_map.rbegin(); pair != data_map.rend(); ++pair) { + for (auto v = pair->second.rbegin(); v != pair->second.rend(); v++) { ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); auto& write_entry = iter->Entry(); - ASSERT_EQ(pair.first, write_entry.key.ToString()); - ASSERT_EQ(v->type, write_entry.type); + ASSERT_EQ(pair->first, write_entry.key.ToString()); + ASSERT_EQ((*v)->type, write_entry.type); if (write_entry.type != kDeleteRecord) { - ASSERT_EQ(v->value, write_entry.value.ToString()); + ASSERT_EQ((*v)->value, write_entry.value.ToString()); } - iter->Next(); + iter->Prev(); } } ASSERT_TRUE(!iter->Valid()); @@ -135,18 +161,40 @@ TEST(WriteBatchWithIndexTest, TestValueAsSecondaryIndex) { // Iterator all indexes { std::unique_ptr iter(batch.NewIterator(&index)); - iter->Seek(""); - for (auto pair : index_map) { - for (auto v : pair.second) { + for (int seek_to_first : {0, 1}) { + if (seek_to_first) { + iter->SeekToFirst(); + } else { + iter->Seek(""); + } + for (auto pair : index_map) { + for (auto v : pair.second) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + auto& write_entry = iter->Entry(); + ASSERT_EQ(pair.first, write_entry.key.ToString()); + if (v->type != kDeleteRecord) { + ASSERT_EQ(v->key, write_entry.value.ToString()); + ASSERT_EQ(v->value, write_entry.key.ToString()); + } + iter->Next(); + } + } + ASSERT_TRUE(!iter->Valid()); + } + + iter->SeekToLast(); + for (auto pair = index_map.rbegin(); pair != 
index_map.rend(); ++pair) { + for (auto v = pair->second.rbegin(); v != pair->second.rend(); v++) { ASSERT_OK(iter->status()); ASSERT_TRUE(iter->Valid()); auto& write_entry = iter->Entry(); - ASSERT_EQ(pair.first, write_entry.key.ToString()); - if (v->type != kDeleteRecord) { - ASSERT_EQ(v->key, write_entry.value.ToString()); - ASSERT_EQ(v->value, write_entry.key.ToString()); + ASSERT_EQ(pair->first, write_entry.key.ToString()); + if ((*v)->type != kDeleteRecord) { + ASSERT_EQ((*v)->key, write_entry.value.ToString()); + ASSERT_EQ((*v)->value, write_entry.key.ToString()); } - iter->Next(); + iter->Prev(); } } ASSERT_TRUE(!iter->Valid()); @@ -230,6 +278,605 @@ TEST(WriteBatchWithIndexTest, TestValueAsSecondaryIndex) { } } +TEST(WriteBatchWithIndexTest, TestComparatorForCF) { + ColumnFamilyHandleImplDummy cf1(6, nullptr); + ColumnFamilyHandleImplDummy reverse_cf(66, ReverseBytewiseComparator()); + ColumnFamilyHandleImplDummy cf2(88, BytewiseComparator()); + WriteBatchWithIndex batch(BytewiseComparator(), 20); + + batch.Put(&cf1, "ddd", ""); + batch.Put(&cf2, "aaa", ""); + batch.Put(&cf2, "eee", ""); + batch.Put(&cf1, "ccc", ""); + batch.Put(&reverse_cf, "a11", ""); + batch.Put(&cf1, "bbb", ""); + batch.Put(&reverse_cf, "a33", ""); + batch.Put(&reverse_cf, "a22", ""); + + { + std::unique_ptr iter(batch.NewIterator(&cf1)); + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("bbb", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("ccc", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("ddd", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + } + + { + std::unique_ptr iter(batch.NewIterator(&cf2)); + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("aaa", iter->Entry().key.ToString()); + iter->Next(); + 
ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("eee", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + } + + { + std::unique_ptr iter(batch.NewIterator(&reverse_cf)); + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("z"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a33", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a22", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a11", iter->Entry().key.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("a22"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a22", iter->Entry().key.ToString()); + + iter->Seek("a13"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a11", iter->Entry().key.ToString()); + } +} + +TEST(WriteBatchWithIndexTest, TestOverwriteKey) { + ColumnFamilyHandleImplDummy cf1(6, nullptr); + ColumnFamilyHandleImplDummy reverse_cf(66, ReverseBytewiseComparator()); + ColumnFamilyHandleImplDummy cf2(88, BytewiseComparator()); + WriteBatchWithIndex batch(BytewiseComparator(), 20, true); + + batch.Put(&cf1, "ddd", ""); + batch.Merge(&cf1, "ddd", ""); + batch.Delete(&cf1, "ddd"); + batch.Put(&cf2, "aaa", ""); + batch.Delete(&cf2, "aaa"); + batch.Put(&cf2, "aaa", "aaa"); + batch.Put(&cf2, "eee", "eee"); + batch.Put(&cf1, "ccc", ""); + batch.Put(&reverse_cf, "a11", ""); + batch.Delete(&cf1, "ccc"); + batch.Put(&reverse_cf, "a33", "a33"); + batch.Put(&reverse_cf, "a11", "a11"); + batch.Delete(&reverse_cf, "a33"); + + { + std::unique_ptr iter(batch.NewIterator(&cf1)); + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("ccc", iter->Entry().key.ToString()); + ASSERT_TRUE(iter->Entry().type == 
WriteType::kDeleteRecord); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("ddd", iter->Entry().key.ToString()); + ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + } + + { + std::unique_ptr iter(batch.NewIterator(&cf2)); + iter->SeekToLast(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("eee", iter->Entry().key.ToString()); + ASSERT_EQ("eee", iter->Entry().value.ToString()); + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("aaa", iter->Entry().key.ToString()); + ASSERT_EQ("aaa", iter->Entry().value.ToString()); + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->SeekToFirst(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("aaa", iter->Entry().key.ToString()); + ASSERT_EQ("aaa", iter->Entry().value.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("eee", iter->Entry().key.ToString()); + ASSERT_EQ("eee", iter->Entry().value.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + } + + { + std::unique_ptr iter(batch.NewIterator(&reverse_cf)); + iter->Seek(""); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("z"); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a33", iter->Entry().key.ToString()); + ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a11", iter->Entry().key.ToString()); + ASSERT_EQ("a11", iter->Entry().value.ToString()); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->SeekToLast(); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a11", iter->Entry().key.ToString()); + ASSERT_EQ("a11", iter->Entry().value.ToString()); + iter->Prev(); + + 
ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ("a33", iter->Entry().key.ToString()); + ASSERT_TRUE(iter->Entry().type == WriteType::kDeleteRecord); + iter->Prev(); + ASSERT_TRUE(!iter->Valid()); + } +} + +namespace { +typedef std::map KVMap; + +class KVIter : public Iterator { + public: + explicit KVIter(const KVMap* map) : map_(map), iter_(map_->end()) {} + virtual bool Valid() const { return iter_ != map_->end(); } + virtual void SeekToFirst() { iter_ = map_->begin(); } + virtual void SeekToLast() { + if (map_->empty()) { + iter_ = map_->end(); + } else { + iter_ = map_->find(map_->rbegin()->first); + } + } + virtual void Seek(const Slice& k) { iter_ = map_->lower_bound(k.ToString()); } + virtual void Next() { ++iter_; } + virtual void Prev() { + if (iter_ == map_->begin()) { + iter_ = map_->end(); + return; + } + --iter_; + } + + virtual Slice key() const { return iter_->first; } + virtual Slice value() const { return iter_->second; } + virtual Status status() const { return Status::OK(); } + + private: + const KVMap* const map_; + KVMap::const_iterator iter_; +}; + +void AssertIter(Iterator* iter, const std::string& key, + const std::string& value) { + ASSERT_OK(iter->status()); + ASSERT_TRUE(iter->Valid()); + ASSERT_EQ(key, iter->key().ToString()); + ASSERT_EQ(value, iter->value().ToString()); +} + +void AssertItersEqual(Iterator* iter1, Iterator* iter2) { + ASSERT_EQ(iter1->Valid(), iter2->Valid()); + if (iter1->Valid()) { + ASSERT_EQ(iter1->key().ToString(), iter2->key().ToString()); + ASSERT_EQ(iter1->value().ToString(), iter2->value().ToString()); + } +} +} // namespace + +TEST(WriteBatchWithIndexTest, TestRandomIteraratorWithBase) { + std::vector source_strings = {"a", "b", "c", "d", "e", + "f", "g", "h", "i", "j"}; + for (int rand_seed = 301; rand_seed < 366; rand_seed++) { + Random rnd(rand_seed); + + ColumnFamilyHandleImplDummy cf1(6, BytewiseComparator()); + ColumnFamilyHandleImplDummy cf2(2, BytewiseComparator()); + 
ColumnFamilyHandleImplDummy cf3(8, BytewiseComparator()); + + WriteBatchWithIndex batch(BytewiseComparator(), 20, true); + + if (rand_seed % 2 == 0) { + batch.Put(&cf2, "zoo", "bar"); + } + if (rand_seed % 4 == 1) { + batch.Put(&cf3, "zoo", "bar"); + } + + KVMap map; + KVMap merged_map; + for (auto key : source_strings) { + std::string value = key + key; + int type = rnd.Uniform(6); + switch (type) { + case 0: + // only base has it + map[key] = value; + merged_map[key] = value; + break; + case 1: + // only delta has it + batch.Put(&cf1, key, value); + map[key] = value; + merged_map[key] = value; + break; + case 2: + // both has it. Delta should win + batch.Put(&cf1, key, value); + map[key] = "wrong_value"; + merged_map[key] = value; + break; + case 3: + // both has it. Delta is delete + batch.Delete(&cf1, key); + map[key] = "wrong_value"; + break; + case 4: + // only delta has it. Delta is delete + batch.Delete(&cf1, key); + map[key] = "wrong_value"; + break; + default: + // Neither iterator has it. 
+ break; + } + } + + std::unique_ptr iter( + batch.NewIteratorWithBase(&cf1, new KVIter(&map))); + std::unique_ptr result_iter(new KVIter(&merged_map)); + + bool is_valid = false; + for (int i = 0; i < 128; i++) { + // Random walk and make sure iter and result_iter returns the + // same key and value + int type = rnd.Uniform(5); + ASSERT_OK(iter->status()); + switch (type) { + case 0: + // Seek to First + iter->SeekToFirst(); + result_iter->SeekToFirst(); + break; + case 1: + // Seek to last + iter->SeekToLast(); + result_iter->SeekToLast(); + break; + case 2: { + // Seek to random key + auto key_idx = rnd.Uniform(static_cast(source_strings.size())); + auto key = source_strings[key_idx]; + iter->Seek(key); + result_iter->Seek(key); + break; + } + case 3: + // Next + if (is_valid) { + iter->Next(); + result_iter->Next(); + } else { + continue; + } + break; + default: + assert(type == 4); + // Prev + if (is_valid) { + iter->Prev(); + result_iter->Prev(); + } else { + continue; + } + break; + } + AssertItersEqual(iter.get(), result_iter.get()); + is_valid = iter->Valid(); + } + } +} + +TEST(WriteBatchWithIndexTest, TestIteraratorWithBase) { + ColumnFamilyHandleImplDummy cf1(6, BytewiseComparator()); + ColumnFamilyHandleImplDummy cf2(2, BytewiseComparator()); + WriteBatchWithIndex batch(BytewiseComparator(), 20, true); + + { + KVMap map; + map["a"] = "aa"; + map["c"] = "cc"; + map["e"] = "ee"; + std::unique_ptr iter( + batch.NewIteratorWithBase(&cf1, new KVIter(&map))); + + iter->SeekToFirst(); + AssertIter(iter.get(), "a", "aa"); + iter->Next(); + AssertIter(iter.get(), "c", "cc"); + iter->Next(); + AssertIter(iter.get(), "e", "ee"); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->SeekToLast(); + AssertIter(iter.get(), "e", "ee"); + iter->Prev(); + AssertIter(iter.get(), "c", "cc"); + iter->Prev(); + AssertIter(iter.get(), "a", "aa"); + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("b"); 
+ AssertIter(iter.get(), "c", "cc"); + + iter->Prev(); + AssertIter(iter.get(), "a", "aa"); + + iter->Seek("a"); + AssertIter(iter.get(), "a", "aa"); + } + + // Test the case that there is one element in the write batch + batch.Put(&cf2, "zoo", "bar"); + batch.Put(&cf1, "a", "aa"); + { + KVMap empty_map; + std::unique_ptr iter( + batch.NewIteratorWithBase(&cf1, new KVIter(&empty_map))); + + iter->SeekToFirst(); + AssertIter(iter.get(), "a", "aa"); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + } + + batch.Delete(&cf1, "b"); + batch.Put(&cf1, "c", "cc"); + batch.Put(&cf1, "d", "dd"); + batch.Delete(&cf1, "e"); + + { + KVMap map; + map["b"] = ""; + map["cc"] = "cccc"; + map["f"] = "ff"; + std::unique_ptr iter( + batch.NewIteratorWithBase(&cf1, new KVIter(&map))); + + iter->SeekToFirst(); + AssertIter(iter.get(), "a", "aa"); + iter->Next(); + AssertIter(iter.get(), "c", "cc"); + iter->Next(); + AssertIter(iter.get(), "cc", "cccc"); + iter->Next(); + AssertIter(iter.get(), "d", "dd"); + iter->Next(); + AssertIter(iter.get(), "f", "ff"); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->SeekToLast(); + AssertIter(iter.get(), "f", "ff"); + iter->Prev(); + AssertIter(iter.get(), "d", "dd"); + iter->Prev(); + AssertIter(iter.get(), "cc", "cccc"); + iter->Prev(); + AssertIter(iter.get(), "c", "cc"); + iter->Next(); + AssertIter(iter.get(), "cc", "cccc"); + iter->Prev(); + AssertIter(iter.get(), "c", "cc"); + iter->Prev(); + AssertIter(iter.get(), "a", "aa"); + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("c"); + AssertIter(iter.get(), "c", "cc"); + + iter->Seek("cb"); + AssertIter(iter.get(), "cc", "cccc"); + + iter->Seek("cc"); + AssertIter(iter.get(), "cc", "cccc"); + iter->Next(); + AssertIter(iter.get(), "d", "dd"); + + iter->Seek("e"); + AssertIter(iter.get(), "f", "ff"); + + iter->Prev(); + AssertIter(iter.get(), "d", "dd"); + + iter->Next(); + 
AssertIter(iter.get(), "f", "ff"); + } + + { + KVMap empty_map; + std::unique_ptr iter( + batch.NewIteratorWithBase(&cf1, new KVIter(&empty_map))); + + iter->SeekToFirst(); + AssertIter(iter.get(), "a", "aa"); + iter->Next(); + AssertIter(iter.get(), "c", "cc"); + iter->Next(); + AssertIter(iter.get(), "d", "dd"); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->SeekToLast(); + AssertIter(iter.get(), "d", "dd"); + iter->Prev(); + AssertIter(iter.get(), "c", "cc"); + iter->Prev(); + AssertIter(iter.get(), "a", "aa"); + + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("aa"); + AssertIter(iter.get(), "c", "cc"); + iter->Next(); + AssertIter(iter.get(), "d", "dd"); + + iter->Seek("ca"); + AssertIter(iter.get(), "d", "dd"); + + iter->Prev(); + AssertIter(iter.get(), "c", "cc"); + } +} + +TEST(WriteBatchWithIndexTest, TestIteraratorWithBaseReverseCmp) { + ColumnFamilyHandleImplDummy cf1(6, ReverseBytewiseComparator()); + ColumnFamilyHandleImplDummy cf2(2, ReverseBytewiseComparator()); + WriteBatchWithIndex batch(BytewiseComparator(), 20, true); + + // Test the case that there is one element in the write batch + batch.Put(&cf2, "zoo", "bar"); + batch.Put(&cf1, "a", "aa"); + { + KVMap empty_map; + std::unique_ptr iter( + batch.NewIteratorWithBase(&cf1, new KVIter(&empty_map))); + + iter->SeekToFirst(); + AssertIter(iter.get(), "a", "aa"); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + } + + batch.Put(&cf1, "c", "cc"); + { + KVMap map; + std::unique_ptr iter( + batch.NewIteratorWithBase(&cf1, new KVIter(&map))); + + iter->SeekToFirst(); + AssertIter(iter.get(), "c", "cc"); + iter->Next(); + AssertIter(iter.get(), "a", "aa"); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->SeekToLast(); + AssertIter(iter.get(), "a", "aa"); + iter->Prev(); + AssertIter(iter.get(), "c", "cc"); + iter->Prev(); + ASSERT_OK(iter->status()); + 
ASSERT_TRUE(!iter->Valid()); + + iter->Seek("b"); + AssertIter(iter.get(), "a", "aa"); + + iter->Prev(); + AssertIter(iter.get(), "c", "cc"); + + iter->Seek("a"); + AssertIter(iter.get(), "a", "aa"); + } + + // default column family + batch.Put("a", "b"); + { + KVMap map; + map["b"] = ""; + std::unique_ptr iter(batch.NewIteratorWithBase(new KVIter(&map))); + + iter->SeekToFirst(); + AssertIter(iter.get(), "a", "b"); + iter->Next(); + AssertIter(iter.get(), "b", ""); + iter->Next(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->SeekToLast(); + AssertIter(iter.get(), "b", ""); + iter->Prev(); + AssertIter(iter.get(), "a", "b"); + iter->Prev(); + ASSERT_OK(iter->status()); + ASSERT_TRUE(!iter->Valid()); + + iter->Seek("b"); + AssertIter(iter.get(), "b", ""); + + iter->Prev(); + AssertIter(iter.get(), "a", "b"); + + iter->Seek("0"); + AssertIter(iter.get(), "a", "b"); + } +} + } // namespace int main(int argc, char** argv) { return rocksdb::test::RunAllTests(); }