diff --git a/.gitignore b/.gitignore index 6a92b5d53..62a8bf1bc 100644 --- a/.gitignore +++ b/.gitignore @@ -42,6 +42,7 @@ unity.a tags rocksdb_dump rocksdb_undump +db_test2 java/out java/target diff --git a/.travis.yml b/.travis.yml index b6fa63c5d..b045d259e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,7 +34,7 @@ before_script: # as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment. script: - if [[ "${TRAVIS_OS_NAME}" == 'linux' ]]; then OPT=-DTRAVIS CLANG_FORMAT_DIFF=/tmp/clang-format-diff.py make format || true; fi - - OPT=-DTRAVIS V=1 make -j4 check && OPT=-DTRAVIS V=1 make clean jclean rocksdbjava jtest && make clean && OPT="-DTRAVIS -DROCKSDB_LITE" V=1 make -j4 check + - OPT=-DTRAVIS V=1 make -j4 check && OPT=-DTRAVIS V=1 make clean jclean rocksdbjava jtest && make clean && OPT="-DTRAVIS -DROCKSDB_LITE" V=1 make -j4 static_lib notifications: email: diff --git a/CMakeLists.txt b/CMakeLists.txt index a005d26d1..5e36b53ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,8 +13,8 @@ # cd build # 3. Run cmake to generate project files for Windows, add more options to enable required third-party libraries. # See thirdparty.inc for more information. -# sample command: cmake -G "Visual Studio 12 Win64" -DGFLAGS=1 -DSNAPPY=1 -DJEMALLOC=1 .. -# OR for VS Studio 15 cmake -G "Visual Studio 14 Win64" -DGFLAGS=1 -DSNAPPY=1 -DJEMALLOC=1 .. +# sample command: cmake -G "Visual Studio 12 Win64" -DGFLAGS=1 -DSNAPPY=1 -DJEMALLOC=1 -DJNI=1 .. +# OR for VS Studio 15 cmake -G "Visual Studio 14 Win64" -DGFLAGS=1 -DSNAPPY=1 -DJEMALLOC=1 -DJNI=1 .. # 4. 
Then build the project in debug mode (you may want to add /m[:] flag to run msbuild in parallel threads # or simply /m ot use all avail cores) # msbuild rocksdb.sln @@ -30,10 +30,9 @@ cmake_minimum_required(VERSION 2.6) project(rocksdb) include(${CMAKE_CURRENT_SOURCE_DIR}/thirdparty.inc) - -execute_process(COMMAND $ENV{COMSPEC} " /C date /T" OUTPUT_VARIABLE DATE) -execute_process(COMMAND $ENV{COMSPEC} " /C time /T" OUTPUT_VARIABLE TIME) -string(REGEX REPLACE "(..)/(..)/..(..).*" "\\1/\\2/\\3" DATE ${DATE}) +execute_process(COMMAND powershell -Command "Get-Date -format MM_dd_yyyy" OUTPUT_VARIABLE DATE) +execute_process(COMMAND powershell -Command "Get-Date -format HH:mm:ss" OUTPUT_VARIABLE TIME) +string(REGEX REPLACE "(..)_(..)_..(..).*" "\\1/\\2/\\3" DATE ${DATE}) string(REGEX REPLACE "(..):(.....).*" " \\1:\\2" TIME ${TIME}) string(CONCAT GIT_DATE_TIME ${DATE} ${TIME}) @@ -99,6 +98,7 @@ add_subdirectory(third-party/gtest-1.7.0/fused-src/gtest) # Main library source code set(SOURCES + db/auto_roll_logger.cc db/builder.cc db/c.cc db/column_family.cc @@ -114,6 +114,7 @@ set(SOURCES db/db_impl_debug.cc db/db_impl_experimental.cc db/db_impl_readonly.cc + db/db_info_dumper.cc db/db_iter.cc db/event_helpers.cc db/experimental.cc @@ -145,9 +146,12 @@ set(SOURCES db/write_batch_base.cc db/write_controller.cc db/write_thread.cc + db/xfunc_test_points.cc memtable/hash_cuckoo_rep.cc memtable/hash_linklist_rep.cc memtable/hash_skiplist_rep.cc + memtable/skiplistrep.cc + memtable/vectorrep.cc port/stack_trace.cc port/win/env_win.cc port/win/port_win.cc @@ -173,7 +177,6 @@ set(SOURCES table/merger.cc table/sst_file_writer.cc table/meta_blocks.cc - table/mock_table.cc table/plain_table_builder.cc table/plain_table_factory.cc table/plain_table_index.cc @@ -182,9 +185,9 @@ set(SOURCES table/table_properties.cc table/two_level_iterator.cc tools/sst_dump_tool.cc + tools/db_bench_tool.cc tools/dump/db_dump_tool.cc util/arena.cc - util/auto_roll_logger.cc util/bloom.cc 
util/build_version.cc util/cache.cc @@ -193,17 +196,18 @@ set(SOURCES util/comparator.cc util/concurrent_arena.cc util/crc32c.cc - util/db_info_dumper.cc - util/delete_scheduler_impl.cc + util/delete_scheduler.cc util/dynamic_bloom.cc util/env.cc util/env_hdfs.cc util/event_logger.cc util/file_util.cc util/file_reader_writer.cc + util/sst_file_manager_impl.cc util/filter_policy.cc util/hash.cc util/histogram.cc + util/histogram_windowing.cc util/instrumented_mutex.cc util/iostats_context.cc tools/ldb_cmd.cc @@ -211,7 +215,6 @@ set(SOURCES util/logging.cc util/log_buffer.cc util/memenv.cc - util/mock_env.cc util/murmurhash.cc util/mutable_cf_options.cc util/options.cc @@ -223,7 +226,6 @@ set(SOURCES util/perf_level.cc util/random.cc util/rate_limiter.cc - util/skiplistrep.cc util/slice.cc util/statistics.cc util/status.cc @@ -237,11 +239,12 @@ set(SOURCES util/thread_status_updater.cc util/thread_status_util.cc util/thread_status_util_debug.cc - util/vectorrep.cc + util/transaction_test_util.cc util/xfunc.cc util/xxhash.cc utilities/backupable/backupable_db.cc utilities/checkpoint/checkpoint.cc + utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc utilities/document/document_db.cc utilities/document/json_document.cc utilities/document/json_document_builder.cc @@ -275,6 +278,8 @@ set(SOURCES # and linked to tests. Add test only code that is not #ifdefed for Release here. 
set(TESTUTIL_SOURCE db/db_test_util.cc + table/mock_table.cc + util/mock_env.cc util/thread_status_updater_debug.cc ) @@ -287,8 +292,19 @@ set_target_properties(rocksdb${ARTIFACT_SUFFIX} PROPERTIES COMPILE_FLAGS "-DROCK add_dependencies(rocksdb${ARTIFACT_SUFFIX} GenerateBuildVersion) target_link_libraries(rocksdb${ARTIFACT_SUFFIX} ${LIBS}) +if (DEFINED JNI) + if (${JNI} EQUAL 1) + message(STATUS "JNI library is enabled") + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/java) + else() + message(STATUS "JNI library is disabled") + endif() +else() + message(STATUS "JNI library is disabled") +endif() + set(APPS - db/db_bench.cc + tools/db_bench.cc db/memtablerep_bench.cc table/table_reader_bench.cc tools/db_stress.cc @@ -303,6 +319,7 @@ set(APPS set(C_TESTS db/c_test.c) set(TESTS + db/auto_roll_logger_test.cc db/column_family_test.cc db/compact_files_test.cc db/compaction_iterator_test.cc @@ -312,17 +329,20 @@ set(TESTS db/comparator_db_test.cc db/corruption_test.cc db/cuckoo_table_db_test.cc - db/db_iter_test.cc - db/db_test.cc db/db_compaction_filter_test.cc db/db_compaction_test.cc db/db_dynamic_level_test.cc db/db_inplace_update_test.cc + db/db_iter_test.cc db/db_log_iter_test.cc + db/db_properties_test.cc + db/db_table_properties_test.cc + db/db_tailing_iter_test.cc + db/db_test.cc + db/db_test2.cc + db/db_block_cache_test.cc db/db_universal_compaction_test.cc db/db_wal_test.cc - db/db_tailing_iter_test.cc - db/db_table_properties_test.cc db/dbformat_test.cc db/deletefile_test.cc db/fault_injection_test.cc @@ -363,7 +383,6 @@ set(TESTS tools/sst_dump_test.cc util/arena_test.cc util/autovector_test.cc - util/auto_roll_logger_test.cc util/bloom_test.cc util/cache_test.cc util/coding_test.cc @@ -376,6 +395,7 @@ set(TESTS util/file_reader_writer_test.cc util/heap_test.cc util/histogram_test.cc + util/iostats_context_test.cc util/memenv_test.cc util/mock_env_test.cc util/options_test.cc diff --git a/HISTORY.md b/HISTORY.md index 2400411d4..757f14f9a 100644 --- 
a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,26 @@ # Rocksdb Change Log +## 4.6.0 (3/10/2016) +### Public API Changes +* Change default of BlockBasedTableOptions.format_version to 2. It means default DB created by 4.6 or up cannot be opened by RocksDB version 3.9 or earlier. +* Added strict_capacity_limit option to NewLRUCache. If the flag is set to true, insert to cache will fail if no enough capacity can be free. Signiture of Cache::Insert() is updated accordingly. +* Tickers [NUMBER_DB_NEXT, NUMBER_DB_PREV, NUMBER_DB_NEXT_FOUND, NUMBER_DB_PREV_FOUND, ITER_BYTES_READ] are not updated immediately. The are updated when the Iterator is deleted. +* Add monotonically increasing counter (DB property "rocksdb.current-super-version-number") that increments upon any change to the LSM tree. +### New Features +* Add CompactionPri::kMinOverlappingRatio, a compaction picking mode friendly to write amplification. +* Deprecate Iterator::IsKeyPinned() and replace it with Iterator::GetProperty() with prop_name="rocksdb.iterator.is.key.pinned" + +## 4.5.0 (2/5/2016) +### Public API Changes +* Add a new perf context level between kEnableCount and kEnableTime. Level 2 now does not include timers for mutexes. +* Statistics of mutex operation durations will not be measured by default. If you want to have them enabled, you need to set Statistics::stats_level_ to kAll. +* DBOptions::delete_scheduler and NewDeleteScheduler() are removed, please use DBOptions::sst_file_manager and NewSstFileManager() instead + +### New Features +* ldb tool now supports operations to non-default column families. +* Add kPersistedTier to ReadTier. This option allows Get and MultiGet to read only the persited data and skip mem-tables if writes were done with disableWAL = true. +* Add DBOptions::sst_file_manager. Use NewSstFileManager() in include/rocksdb/sst_file_manager.h to create a SstFileManager that can be used to track the total size of SST files and control the SST files deletion rate. 
-## Unreleased +## 4.4.0 (1/14/2016) ### Public API Changes * Change names in CompactionPri and add a new one. * Deprecate options.soft_rate_limit and add options.soft_pending_compaction_bytes_limit. diff --git a/INSTALL.md b/INSTALL.md index bff75155f..3669bf1cf 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -21,7 +21,7 @@ depend on gflags. You will need to have gflags installed to run `make all`. This use binaries compiled by `make all` in production. * By default the binary we produce is optimized for the platform you're compiling on -(-march=native). If you want to build a portable binary, add 'PORTABLE=1' before +(-march=native or the equivalent). If you want to build a portable binary, add 'PORTABLE=1' before your make commands, like this: `PORTABLE=1 make static_lib` ## Dependencies diff --git a/LICENSE b/LICENSE index b13290186..46f685e96 100644 --- a/LICENSE +++ b/LICENSE @@ -2,7 +2,7 @@ BSD License For rocksdb software -Copyright (c) 2014, Facebook, Inc. +Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
--------------------------------------------------------------------- diff --git a/Makefile b/Makefile index 396b8e201..a1e321f83 100644 --- a/Makefile +++ b/Makefile @@ -84,7 +84,8 @@ endif # compile with -O2 if debug level is not 2 ifneq ($(DEBUG_LEVEL), 2) OPT += -O2 -fno-omit-frame-pointer -ifneq ($(MACHINE),ppc64) # ppc64 doesn't support -momit-leaf-frame-pointer +# Skip for archs that don't support -momit-leaf-frame-pointer +ifeq (,$(shell $(CXX) -fsyntax-only -momit-leaf-frame-pointer -xc /dev/null 2>&1)) OPT += -momit-leaf-frame-pointer endif endif @@ -143,6 +144,9 @@ else OPT += -DNDEBUG endif +ifeq ($(PLATFORM), OS_SOLARIS) + PLATFORM_CXXFLAGS += -D _GLIBCXX_USE_C99 +endif ifneq ($(filter -DROCKSDB_LITE,$(OPT)),) # found CFLAGS += -fno-exceptions @@ -237,8 +241,12 @@ VALGRIND_VER := $(join $(VALGRIND_VER),valgrind) VALGRIND_OPTS = --error-exitcode=$(VALGRIND_ERROR) --leak-check=full +BENCHTOOLOBJECTS = $(BENCH_SOURCES:.cc=.o) $(LIBOBJECTS) $(TESTUTIL) + TESTS = \ db_test \ + db_test2 \ + db_block_cache_test \ db_iter_test \ db_log_iter_test \ db_compaction_filter_test \ @@ -248,6 +256,7 @@ TESTS = \ db_tailing_iter_test \ db_universal_compaction_test \ db_wal_test \ + db_properties_test \ db_table_properties_test \ block_hash_index_test \ autovector_test \ @@ -332,7 +341,8 @@ TESTS = \ compact_on_deletion_collector_test \ compaction_job_stats_test \ transaction_test \ - ldb_cmd_test + ldb_cmd_test \ + iostats_context_test SUBSET := $(shell echo $(TESTS) |sed s/^.*$(ROCKSDBTESTS_START)/$(ROCKSDBTESTS_START)/) @@ -460,7 +470,7 @@ test_names = \ -e '/^(\s*)(\S+)/; !$$1 and do {$$p=$$2; break};' \ -e 'print qq! $$p$$2!' -ifeq ($(MAKECMDGOALS),check) +ifneq (,$(filter check parallel_check,$(MAKECMDGOALS)),) # Use /dev/shm if it has the sticky bit set (otherwise, /tmp), # and create a randomly-named rocksdb.XXXX directory therein. # We'll use that directory in the "make check" rules. 
@@ -616,6 +626,46 @@ valgrind_check: $(TESTS) fi; \ done + +ifneq ($(PAR_TEST),) +parloop: + ret_bad=0; \ + for t in $(PAR_TEST); do \ + echo "===== Running $$t in parallel $(NUM_PAR)";\ + if [ $(db_test) -eq 1 ]; then \ + seq $(J) | v="$$t" parallel --gnu 's=$(TMPD)/rdb-{}; export TEST_TMPDIR=$$s;' \ + 'timeout 2m ./db_test --gtest_filter=$$v >> $$s/log-{} 2>1'; \ + else\ + seq $(J) | v="./$$t" parallel --gnu 's=$(TMPD)/rdb-{};' \ + 'export TEST_TMPDIR=$$s; timeout 10m $$v >> $$s/log-{} 2>1'; \ + fi; \ + ret_code=$$?; \ + if [ $$ret_code -ne 0 ]; then \ + ret_bad=$$ret_code; \ + echo $$t exited with $$ret_code; \ + fi; \ + done; \ + exit $$ret_bad; +endif + +parallel_check: $(TESTS) + $(AM_V_GEN)if test "$(J)" > 1 \ + && (parallel --gnu --help 2>/dev/null) | \ + grep -q 'GNU Parallel'; \ + then \ + echo Running in parallel $(J); \ + else \ + echo "Need to have GNU Parallel and J > 1"; exit 1; \ + fi; \ + ret_bad=0; \ + echo $(J);\ + echo Test Dir: $(TMPD); \ + seq $(J) | parallel --gnu 's=$(TMPD)/rdb-{}; rm -rf $$s; mkdir $$s'; \ + $(MAKE) PAR_TEST="$(shell $(test_names))" TMPD=$(TMPD) \ + J=$(J) db_test=1 parloop; \ + $(MAKE) PAR_TEST="$(filter-out db_test, $(TESTS))" \ + TMPD=$(TMPD) J=$(J) db_test=0 parloop; + analyze: clean $(CLANG_SCAN_BUILD) --use-analyzer=$(CLANG_ANALYZER) \ --use-c++=$(CXX) --use-cc=$(CC) --status-bugs \ @@ -652,7 +702,7 @@ clean: tags: ctags * -R - cscope -b `find . -name '*.cc'` `find . -name '*.h'` + cscope -b `find . -name '*.cc'` `find . -name '*.h'` `find . 
-name '*.c'` format: build_tools/format-diff.sh @@ -667,7 +717,7 @@ $(LIBRARY): $(LIBOBJECTS) $(AM_V_AR)rm -f $@ $(AM_V_at)$(AR) $(ARFLAGS) $@ $(LIBOBJECTS) -db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) +db_bench: tools/db_bench.o $(BENCHTOOLOBJECTS) $(AM_LINK) cache_bench: util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) @@ -742,6 +792,12 @@ slice_transform_test: util/slice_transform_test.o $(LIBOBJECTS) $(TESTHARNESS) db_test: db/db_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +db_test2: db/db_test2.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + +db_block_cache_test: db/db_block_cache_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + db_log_iter_test: db/db_log_iter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -769,6 +825,9 @@ db_universal_compaction_test: db/db_universal_compaction_test.o db/db_test_util. db_wal_test: db/db_wal_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +db_properties_test: db/db_properties_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + db_table_properties_test: db/db_table_properties_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) @@ -970,7 +1029,7 @@ manual_compaction_test: db/manual_compaction_test.o $(LIBOBJECTS) $(TESTHARNESS) filelock_test: util/filelock_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -auto_roll_logger_test: util/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) +auto_roll_logger_test: db/auto_roll_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) memtable_list_test: db/memtable_list_test.o $(LIBOBJECTS) $(TESTHARNESS) @@ -994,6 +1053,9 @@ ldb_cmd_test: tools/ldb_cmd_test.o $(LIBOBJECTS) $(TESTHARNESS) ldb: tools/ldb.o $(LIBOBJECTS) $(AM_LINK) +iostats_context_test: util/iostats_context_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_V_CCLD)$(CXX) $^ $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) + #------------------------------------------------- # make install related stuff INSTALL_PATH ?= 
/usr/local @@ -1036,7 +1098,11 @@ install: install-static # --------------------------------------------------------------------------- JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux -ARCH := $(shell getconf LONG_BIT) +ifeq ($(PLATFORM), OS_SOLARIS) + ARCH := $(shell isainfo -b) +else + ARCH := $(shell getconf LONG_BIT) +endif ROCKSDBJNILIB = librocksdbjni-linux$(ARCH).so ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-linux$(ARCH).jar ROCKSDB_JAR_ALL = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH).jar @@ -1044,14 +1110,19 @@ ROCKSDB_JAVADOCS_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PA ROCKSDB_SOURCES_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-sources.jar ifeq ($(PLATFORM), OS_MACOSX) -ROCKSDBJNILIB = librocksdbjni-osx.jnilib -ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar + ROCKSDBJNILIB = librocksdbjni-osx.jnilib + ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-osx.jar ifneq ("$(wildcard $(JAVA_HOME)/include/darwin)","") JAVA_INCLUDE = -I$(JAVA_HOME)/include -I $(JAVA_HOME)/include/darwin else JAVA_INCLUDE = -I/System/Library/Frameworks/JavaVM.framework/Headers/ endif endif +ifeq ($(PLATFORM), OS_SOLARIS) + ROCKSDBJNILIB = librocksdbjni-solaris$(ARCH).so + ROCKSDB_JAR = rocksdbjni-$(ROCKSDB_MAJOR).$(ROCKSDB_MINOR).$(ROCKSDB_PATCH)-solaris$(ARCH).jar + JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/solaris +endif libz.a: -rm -rf zlib-1.2.8 @@ -1145,11 +1216,10 @@ jtest: rocksdbjava jdb_bench: cd java;$(MAKE) db_bench; -commit-prereq: - $(MAKE) clean && $(MAKE) all check; +commit_prereq: build_tools/rocksdb-lego-determinator \ + build_tools/precommit_checker.py + J=$(J) build_tools/precommit_checker.py unit unit_481 clang_unit tsan asan lite $(MAKE) clean && $(MAKE) jclean && $(MAKE) rocksdbjava; - $(MAKE) clean && USE_CLANG=1 $(MAKE) all; - $(MAKE) clean && OPT=-DROCKSDB_LITE 
$(MAKE) static_lib; xfunc: for xftest in $(XFUNC_TESTS); do \ diff --git a/USERS.md b/USERS.md index c76b0221f..c873ca386 100644 --- a/USERS.md +++ b/USERS.md @@ -45,4 +45,14 @@ Airbnb is using RocksDB as a storage engine for their personalized search servic Pinterest's Object Retrieval System uses RocksDB for storage: https://www.youtube.com/watch?v=MtFEVEs_2Vo ## Smyte -[Smyte](https://www.smyte.com/) uses RocksDB as the storage layer for their core key-value storage, high-performance counters and time-windowed HyperLogLog services. \ No newline at end of file +[Smyte](https://www.smyte.com/) uses RocksDB as the storage layer for their core key-value storage, high-performance counters and time-windowed HyperLogLog services. + +## Rakuten Marketing +[Rakuten Marketing](https://marketing.rakuten.com/) uses RocksDB as the disk cache layer for the real-time bidding service in their Performance DSP. + +## VWO, Wingify +[VWO's](https://vwo.com/) Smart Code checker and URL helper uses RocksDB to store all the URLs where VWO's Smart Code is installed. + +## quasardb +[quasardb](https://www.quasardb.net) is a high-performance, distributed, transactional key-value database that integrates well with in-memory analytics engines such as Apache Spark. +quasardb uses a heavily tuned RocksDB as its persistence layer. 
\ No newline at end of file diff --git a/arcanist_util/config/FacebookArcanistConfiguration.php b/arcanist_util/config/FacebookArcanistConfiguration.php index c3454903b..156a6dbfb 100644 --- a/arcanist_util/config/FacebookArcanistConfiguration.php +++ b/arcanist_util/config/FacebookArcanistConfiguration.php @@ -10,13 +10,184 @@ class FacebookArcanistConfiguration extends ArcanistConfiguration { ArcanistBaseWorkflow $workflow, $error_code) { if ($command == 'diff' && !$workflow->isRawDiffSource()) { - $this->maybePushToJenkins($workflow); + $this->startTestsInJenkins($workflow); + $this->startTestsInSandcastle($workflow); } } + ////////////////////////////////////////////////////////////////////// + /* Run tests in sandcastle */ + function postURL($diffID, $url) { + $cmd = 'echo \'{"diff_id": "' . $diffID . '", ' + . '"name":"click here for sandcastle tests for D' . $diffID . '", ' + . '"link":"' . $url . '"}\' | ' + . 'http_proxy=fwdproxy.any.facebook.com:8080 ' + . 'https_proxy=fwdproxy.any.facebook.com:8080 arc call-conduit ' + . 'differential.updateunitresults'; + shell_exec($cmd); + } + + function updateTestCommand($diffID, $test, $status) { + $cmd = 'echo \'{"diff_id": "' . $diffID . '", ' + . '"name":"' . $test . '", ' + . '"result":"' . $status . '"}\' | ' + . 'http_proxy=fwdproxy.any.facebook.com:8080 ' + . 'https_proxy=fwdproxy.any.facebook.com:8080 arc call-conduit ' + . 'differential.updateunitresults'; + return $cmd; + } + + function updateTest($diffID, $test) { + shell_exec($this->updateTestCommand($diffID, $test, "waiting")); + } + + function getSteps($diffID, $username, $test) { + $arcrc_content = exec("cat ~/.arcrc | gzip -f | base64 -w0"); + + // Sandcastle machines don't have arc setup. We copy the user certificate + // and authenticate using that in sandcastle + $setup = array( + "name" => "Setup arcrc", + "shell" => "echo " . $arcrc_content . " | base64 --decode" + . 
" | gzip -d > ~/.arcrc", + "user" => "root" + ); + + // arc demands certain permission on its config + $fix_permission = array( + "name" => "Fix environment", + "shell" => "chmod 600 ~/.arcrc", + "user" => "root" + ); + + // fbcode is a sub-repo. We cannot patch until we add it to ignore otherwise + // git thinks it is uncommited change + $fix_git_ignore = array( + "name" => "Fix git ignore", + "shell" => "echo fbcode >> .git/info/exclude", + "user" => "root" + ); + + // Patch the code (keep your fingures crossed) + $patch = array( + "name" => "Patch " . $diffID, + "shell" => "HTTPS_PROXY=fwdproxy:8080 arc --arcrc-file ~/.arcrc " + . "patch --diff " . $diffID, + "user" => "root" + ); + + // Clean up the user arc config we are using + $cleanup = array( + "name" => "Arc cleanup", + "shell" => "rm -f ~/.arcrc", + "user" => "root" + ); + + // Construct the steps in the order of execution + $steps[] = $setup; + $steps[] = $fix_permission; + $steps[] = $fix_git_ignore; + $steps[] = $patch; + + // Run the actual command + $this->updateTest($diffID, $test); + $cmd = $this->updateTestCommand($diffID, $test, "running") . ";" + . "(./build_tools/precommit_checker.py " . $test + . "&& " + . $this->updateTestCommand($diffID, $test, "pass") . ")" + . "|| " . $this->updateTestCommand($diffID, $test, "fail") + . "; cat /tmp/precommit-check.log" + . "; for f in `ls t/log-*`; do echo \$f; cat \$f; done"; + + $run_test = array( + "name" => "Run " . 
$test, + "shell" => $cmd, + "user" => "root", + ); + + $steps[] = $run_test; + $steps[] = $cleanup; + + return $steps; + } + + function startTestsInSandcastle($workflow) { + // extract information we need from workflow or CLI + $diffID = $workflow->getDiffId(); + $username = exec("whoami"); + + if ($diffID == null || $username == null) { + // there is no diff and we can't extract username + // we cannot schedule sandcasstle job + return; + } + + if (strcmp(getenv("ROCKSDB_CHECK_ALL"), 1) == 0) { + // extract all tests from the CI definition + $output = file_get_contents("build_tools/rocksdb-lego-determinator"); + preg_match_all('/[ ]{2}([a-zA-Z0-9_]+)[\)]{1}/', $output, $matches); + $tests = $matches[1]; + } else { + // manually list of tests we want to run in sandcastle + $tests = array( + "unit", "unit_481", "clang_unit", "tsan", "asan", "lite", "valgrind" + ); + } + + // construct a job definition for each test and add it to the master plan + foreach ($tests as $test) { + $arg[] = array( + "name" => "RocksDB diff " . $diffID . " test " . $test, + "steps" => $this->getSteps($diffID, $username, $test) + ); + } + + // we cannot submit the parallel execution master plan to sandcastle + // we need supply the job plan as a determinator + // so we construct a small job that will spit out the master job plan + // which sandcastle will parse and execute + // Why compress ? Otherwise we run over the max string size. + $cmd = "echo " . base64_encode(json_encode($arg)) + . " | gzip -f | base64 -w0"; + $arg_encoded = shell_exec($cmd); + + $command = array( + "name" => "Run diff " . $diffID . "for user " . $username, + "steps" => array() + ); + + $command["steps"][] = array( + "name" => "Generate determinator", + "shell" => "echo " . $arg_encoded . " | base64 --decode | gzip -d" + . " | base64 --decode", + "determinator" => true, + "user" => "root" + ); + + // submit to sandcastle + $url = 'https://interngraph.intern.facebook.com/sandcastle/generate?' 
+ .'command=SandcastleUniversalCommand' + .'&vcs=rocksdb-git&revision=origin%2Fmaster&type=lego' + .'&user=krad&alias=rocksdb-precommit' + .'&command-args=' . urlencode(json_encode($command)); + + $cmd = 'https_proxy= HTTPS_PROXY= curl -s -k -F app=659387027470559 ' + . '-F token=AeO_3f2Ya3TujjnxGD4 "' . $url . '"'; + + $output = shell_exec($cmd); + + // extract sandcastle URL from the response + preg_match('/url": "(.+)"/', $output, $sandcastle_url); + + echo "\nSandcastle URL: " . $sandcastle_url[1] . "\n"; + + // Ask phabricator to display it on the diff UI + $this->postURL($diffID, $sandcastle_url[1]); + } + ////////////////////////////////////////////////////////////////////// /* Send off builds to jenkins */ - function maybePushToJenkins($workflow) { + function startTestsInJenkins($workflow) { $diffID = $workflow->getDiffID(); if ($diffID === null) { return; @@ -31,5 +202,4 @@ class FacebookArcanistConfiguration extends ArcanistConfiguration { ."buildWithParameters?token=AUTH&DIFF_ID=$diffID"; system("curl --noproxy '*' \"$url\" > /dev/null 2>&1"); } - } diff --git a/arcanist_util/cpp_linter/cpplint.py b/arcanist_util/cpp_linter/cpplint.py index d6201945a..3bb33e17b 100755 --- a/arcanist_util/cpp_linter/cpplint.py +++ b/arcanist_util/cpp_linter/cpplint.py @@ -1,5 +1,5 @@ #!/usr/bin/python -# Copyright (c) 2013, Facebook, Inc. All rights reserved. +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. An additional grant # of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index fc099a540..10a4d497f 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -189,17 +189,19 @@ if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then # Also don't need any compilation tests if compiling on fbcode true else - # Test whether fallocate is available - $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < - #include - int main() { - int fd = open("/dev/null", 0); - fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, 1024); - } + if ! test $ROCKSDB_DISABLE_FALLOCATE; then + # Test whether fallocate is available + $CXX $CFLAGS -x c++ - -o /dev/null 2>/dev/null < + #include + int main() { + int fd = open("/dev/null", 0); + fallocate(fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, 1024); + } EOF - if [ "$?" = 0 ]; then - COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_FALLOCATE_PRESENT" + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_FALLOCATE_PRESENT" + fi fi # Test whether Snappy library is installed @@ -359,7 +361,15 @@ if test "$USE_SSE"; then # if Intel SSE instruction set is supported, set USE_SSE=1 COMMON_FLAGS="$COMMON_FLAGS -msse -msse4.2 " elif test -z "$PORTABLE"; then - COMMON_FLAGS="$COMMON_FLAGS -march=native " + if test -n "`echo $TARGET_ARCHITECTURE | grep ^ppc64`"; then + # Tune for this POWER processor, treating '+' models as base models + POWER=`LD_SHOW_AUXV=1 /bin/true | grep AT_PLATFORM | grep -E -o power[0-9]+` + COMMON_FLAGS="$COMMON_FLAGS -mcpu=$POWER -mtune=$POWER " + elif test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then + COMMON_FLAGS="$COMMON_FLAGS -march=z10 " + else + COMMON_FLAGS="$COMMON_FLAGS -march=native " + fi fi PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS" diff --git a/build_tools/dependencies.sh b/build_tools/dependencies.sh new file mode 100644 index 000000000..f5ef15c3a --- /dev/null +++ b/build_tools/dependencies.sh @@ -0,0 +1,16 @@ 
+GCC_BASE=/mnt/vol/engshare/fbcode/third-party2/gcc/4.9.x/centos6-native/1317bc4/ +CLANG_BASE=/mnt/gvfs/third-party2/clang/fc904e50a9266b9d7b98cae1993afa0c5aae1440/3.7.1/centos6-native/9d9ecb9/ +LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/ea2fd1278810d3af2ea52218d2767e09d786dbd0/4.9.x/gcc-4.9-glibc-2.20/024dbc3 +GLIBC_BASE=/mnt/gvfs/third-party2/glibc/f5484f168c0e4d19823d41df052c5870c6e575a4/2.20/gcc-4.9-glibc-2.20/500e281 +SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/cbf6f1f209e5bd160bdc5d971744e039f36b1566/1.1.3/gcc-4.9-glibc-2.20/e9936bf +ZLIB_BASE=/mnt/gvfs/third-party2/zlib/6d39cb54708049f527e713ad19f2aadb9d3667e8/1.2.8/gcc-4.9-glibc-2.20/e9936bf +BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/2ddd45f0853bfc8bb1c27f0f447236a1a26c338a/1.0.6/gcc-4.9-glibc-2.20/e9936bf +LZ4_BASE=/mnt/gvfs/third-party2/lz4/6858fac689e0f92e584224d91bdb0e39f6c8320d/r131/gcc-4.9-glibc-2.20/e9936bf +ZSTD_BASE=/mnt/gvfs/third-party2/zstd/cb6c4880fcb4fee471574ba6af63a3882155a16a/0.5.1/gcc-4.9-glibc-2.20/e9936bf +GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/c7275a4ceae0aca0929e56964a31dafc53c1ee96/2.1.1/gcc-4.8.1-glibc-2.17/c3f970a +JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/40791a3fef9206a77f2c4bc51f8169e5bf10d68e/master/gcc-4.9-glibc-2.20/a6c5e1e +NUMA_BASE=/mnt/gvfs/third-party2/numa/ae54a5ed22cdabb1c6446dce4e8ffae5b4446d73/2.0.8/gcc-4.9-glibc-2.20/e9936bf +LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/303048f72efc92ae079e62dfc84823401aecfd94/trunk/gcc-4.9-glibc-2.20/12266b1 +KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/1a48835975c66d30e47770ec419758ed3b9ba010/3.10.62-62_fbk17_03959_ge29cc63/gcc-4.9-glibc-2.20/da39a3e +BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/a5b8152b2a15ce8a98808cf954fbccec825a97bc/2.25/centos6-native/da39a3e +VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/af85c56f424cd5edfc2c97588299b44ecdec96bb/3.10.0/gcc-4.9-glibc-2.20/e9936bf diff --git a/build_tools/dependencies_4.8.1.sh b/build_tools/dependencies_4.8.1.sh new file mode 100644 index 000000000..845f765d0 
--- /dev/null +++ b/build_tools/dependencies_4.8.1.sh @@ -0,0 +1,16 @@ +GCC_BASE=/mnt/vol/engshare/fbcode/third-party2/gcc/4.8.1/centos6-native/cc6c9dc/ +CLANG_BASE=/mnt/gvfs/third-party2/clang/fc904e50a9266b9d7b98cae1993afa0c5aae1440/3.7.1/centos6-native/9d9ecb9/ +LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/ea2fd1278810d3af2ea52218d2767e09d786dbd0/4.8.1/gcc-4.8.1-glibc-2.17/8aac7fc +GLIBC_BASE=/mnt/gvfs/third-party2/glibc/f5484f168c0e4d19823d41df052c5870c6e575a4/2.17/gcc-4.8.1-glibc-2.17/99df8fc +SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/cbf6f1f209e5bd160bdc5d971744e039f36b1566/1.1.3/gcc-4.8.1-glibc-2.17/c3f970a +ZLIB_BASE=/mnt/gvfs/third-party2/zlib/6d39cb54708049f527e713ad19f2aadb9d3667e8/1.2.8/gcc-4.8.1-glibc-2.17/c3f970a +BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/2ddd45f0853bfc8bb1c27f0f447236a1a26c338a/1.0.6/gcc-4.8.1-glibc-2.17/c3f970a +LZ4_BASE=/mnt/gvfs/third-party2/lz4/6858fac689e0f92e584224d91bdb0e39f6c8320d/r131/gcc-4.8.1-glibc-2.17/c3f970a +ZSTD_BASE=/mnt/gvfs/third-party2/zstd/cb6c4880fcb4fee471574ba6af63a3882155a16a/0.5.1/gcc-4.8.1-glibc-2.17/c3f970a +GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/c7275a4ceae0aca0929e56964a31dafc53c1ee96/2.1.1/gcc-4.8.1-glibc-2.17/c3f970a +JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/40791a3fef9206a77f2c4bc51f8169e5bf10d68e/master/gcc-4.8.1-glibc-2.17/8d31e51 +NUMA_BASE=/mnt/gvfs/third-party2/numa/ae54a5ed22cdabb1c6446dce4e8ffae5b4446d73/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a +LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/303048f72efc92ae079e62dfc84823401aecfd94/trunk/gcc-4.8.1-glibc-2.17/675d945 +KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/1a48835975c66d30e47770ec419758ed3b9ba010/3.10.62-62_fbk17_03959_ge29cc63/gcc-4.8.1-glibc-2.17/da39a3e +BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/a5b8152b2a15ce8a98808cf954fbccec825a97bc/2.25/centos6-native/da39a3e +VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/af85c56f424cd5edfc2c97588299b44ecdec96bb/3.8.1/gcc-4.8.1-glibc-2.17/c3f970a diff --git 
a/build_tools/fbcode_config.sh b/build_tools/fbcode_config.sh index 1394e4e64..9e1c613ec 100644 --- a/build_tools/fbcode_config.sh +++ b/build_tools/fbcode_config.sh @@ -6,103 +6,104 @@ # Environment variables that change the behavior of this script: # PIC_BUILD -- if true, it will only take pic versions of libraries from fbcode. libraries that don't have pic variant will not be included + +BASEDIR=`dirname $BASH_SOURCE` +source "$BASEDIR/dependencies.sh" + CFLAGS="" -# location of libgcc -LIBGCC_BASE="/mnt/gvfs/third-party2/libgcc/0473c80518a10d6efcbe24c5eeca3fb4ec9b519c/4.9.x/gcc-4.9-glibc-2.20/e1a7e4e" +# libgcc LIBGCC_INCLUDE="$LIBGCC_BASE/include" LIBGCC_LIBS=" -L $LIBGCC_BASE/libs" -# location of glibc -GLIBC_REV=7397bed99280af5d9543439cdb7d018af7542720 -GLIBC_INCLUDE="/mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.20/gcc-4.9-glibc-2.20/99df8fc/include" -GLIBC_LIBS=" -L /mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.20/gcc-4.9-glibc-2.20/99df8fc/lib" - -SNAPPY_INCLUDE=" -I /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/include/" +# glibc +GLIBC_INCLUDE="$GLIBC_BASE/include" +GLIBC_LIBS=" -L $GLIB_BASE/lib" +# snappy +SNAPPY_INCLUDE=" -I $SNAPPY_BASE/include/" if test -z $PIC_BUILD; then - SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/lib/libsnappy.a" + SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy.a" else - SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/b0f269b3ca47770121aa159b99e1d8d2ab260e1f/1.0.3/gcc-4.9-glibc-2.20/c32916f/lib/libsnappy_pic.a" + SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy_pic.a" fi - CFLAGS+=" -DSNAPPY" if test -z $PIC_BUILD; then # location of zlib headers and libraries - ZLIB_INCLUDE=" -I /mnt/gvfs/third-party2/zlib/feb983d9667f4cf5e9da07ce75abc824764b67a1/1.2.8/gcc-4.9-glibc-2.20/4230243/include/" - ZLIB_LIBS=" /mnt/gvfs/third-party2/zlib/feb983d9667f4cf5e9da07ce75abc824764b67a1/1.2.8/gcc-4.9-glibc-2.20/4230243/lib/libz.a" + 
ZLIB_INCLUDE=" -I $ZLIB_BASE/include/" + ZLIB_LIBS=" $ZLIB_BASE/lib/libz.a" CFLAGS+=" -DZLIB" # location of bzip headers and libraries - BZIP_INCLUDE=" -I /mnt/gvfs/third-party2/bzip2/af004cceebb2dfd173ca29933ea5915e727aad2f/1.0.6/gcc-4.9-glibc-2.20/4230243/include/" - BZIP_LIBS=" /mnt/gvfs/third-party2/bzip2/af004cceebb2dfd173ca29933ea5915e727aad2f/1.0.6/gcc-4.9-glibc-2.20/4230243/lib/libbz2.a" + BZIP_INCLUDE=" -I $BZIP2_BASE/include/" + BZIP_LIBS=" $BZIP2_BASE/lib/libbz2.a" CFLAGS+=" -DBZIP2" - LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/6858fac689e0f92e584224d91bdb0e39f6c8320d/r131/gcc-4.9-glibc-2.20/e9936bf/include/" - LZ4_LIBS=" /mnt/gvfs/third-party2/lz4/6858fac689e0f92e584224d91bdb0e39f6c8320d/r131/gcc-4.9-glibc-2.20/e9936bf/lib/liblz4.a" + LZ4_INCLUDE=" -I $LZ4_BASE/include/" + LZ4_LIBS=" $LZ4_BASE/lib/liblz4.a" CFLAGS+=" -DLZ4" - ZSTD_REV=810b81b4705def5243e998b54701f3c504e4009e - ZSTD_INCLUDE=" -I /mnt/gvfs/third-party2/zstd/$ZSTD_REV/0.4.2/gcc-4.8.1-glibc-2.17/c3f970a/include" - ZSTD_LIBS=" /mnt/gvfs/third-party2/zstd/$ZSTD_REV/0.4.2/gcc-4.8.1-glibc-2.17/c3f970a/lib/libzstd.a" + ZSTD_INCLUDE=" -I $ZSTD_BASE/include/" + ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd.a" CFLAGS+=" -DZSTD" fi # location of gflags headers and libraries -GFLAGS_INCLUDE=" -I /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/include/" +GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/" if test -z $PIC_BUILD; then - GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/lib/libgflags.a" + GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags.a" else - GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/0fa60e2b88de3e469db6c482d6e6dac72f5d65f9/1.6/gcc-4.9-glibc-2.20/4230243/lib/libgflags_pic.a" + GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags_pic.a" fi CFLAGS+=" -DGFLAGS=google" # location of jemalloc -JEMALLOC_INCLUDE=" -I 
/mnt/gvfs/third-party2/jemalloc/bcd68e5e419efa4e61b9486d6854564d6d75a0b5/3.6.0/gcc-4.9-glibc-2.20/2aafc78/include/" -JEMALLOC_LIB=" /mnt/gvfs/third-party2/jemalloc/bcd68e5e419efa4e61b9486d6854564d6d75a0b5/3.6.0/gcc-4.9-glibc-2.20/2aafc78/lib/libjemalloc.a" +JEMALLOC_INCLUDE=" -I $JEMALLOC_BASE/include/" +JEMALLOC_LIB=" $JEMALLOC_BASE/lib/libjemalloc.a" if test -z $PIC_BUILD; then # location of numa - NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/bbefc39ecbf31d0ca184168eb613ef8d397790ee/2.0.8/gcc-4.9-glibc-2.20/4230243/include/" - NUMA_LIB=" /mnt/gvfs/third-party2/numa/bbefc39ecbf31d0ca184168eb613ef8d397790ee/2.0.8/gcc-4.9-glibc-2.20/4230243/lib/libnuma.a" + NUMA_INCLUDE=" -I $NUMA_BASE/include/" + NUMA_LIB=" $NUMA_BASE/lib/libnuma.a" CFLAGS+=" -DNUMA" # location of libunwind - LIBUNWIND="/mnt/gvfs/third-party2/libunwind/1de3b75e0afedfe5585b231bbb340ec7a1542335/1.1/gcc-4.9-glibc-2.20/34235e8/lib/libunwind.a" + LIBUNWIND="$LIBUNWIND_BASE/lib/libunwind.a" fi # use Intel SSE support for checksum calculations export USE_SSE=1 -BINUTILS="/mnt/gvfs/third-party2/binutils/0b6ad0c88ddd903333a48ae8bff134efac468e4a/2.25/centos6-native/da39a3e/bin" +BINUTILS="$BINUTILS_BASE/bin" AR="$BINUTILS/ar" DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE" -GCC_BASE="/mnt/gvfs/third-party2/gcc/1c67a0b88f64d4d9ced0382d141c76aaa7d62fba/4.9.x/centos6-native/1317bc4" STDLIBS="-L $GCC_BASE/lib64" -CLANG_BASE="/mnt/gvfs/third-party2/clang/d81444dd214df3d2466734de45bb264a0486acc3/dev" -CLANG_BIN="$CLANG_BASE/centos6-native/af4b1a0/bin" +CLANG_BIN="$CLANG_BASE/bin" +CLANG_LIB="$CLANG_BASE/lib" +CLANG_SRC="$CLANG_BASE/../../src" + CLANG_ANALYZER="$CLANG_BIN/clang++" -CLANG_SCAN_BUILD="$CLANG_BASE/src/clang/tools/scan-build/scan-build" +CLANG_SCAN_BUILD="$CLANG_SRC/clang/tools/scan-build/scan-build" if [ -z "$USE_CLANG" ]; then # gcc CC="$GCC_BASE/bin/gcc" CXX="$GCC_BASE/bin/g++" - + CFLAGS+=" -B$BINUTILS/gold" CFLAGS+=" -isystem 
$GLIBC_INCLUDE" CFLAGS+=" -isystem $LIBGCC_INCLUDE" else - # clang - CLANG_INCLUDE="$CLANG_BASE/gcc-4.9-glibc-2.20/74c386f/lib/clang/dev/include/" + # clang + CLANG_INCLUDE="$CLANG_LIB/clang/*/include" CC="$CLANG_BIN/clang" CXX="$CLANG_BIN/clang++" - KERNEL_HEADERS_INCLUDE="/mnt/gvfs/third-party2/kernel-headers/ffd14f660a43c4b92717986b1bba66722ef089d0/3.2.18_70_fbk11_00129_gc8882d0/gcc-4.9-glibc-2.20/da39a3e/include" + KERNEL_HEADERS_INCLUDE="$KERNEL_HEADERS_BASE/include" CFLAGS+=" -B$BINUTILS/gold -nostdinc -nostdlib" CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.9.x " @@ -128,6 +129,6 @@ PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++" EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS" -VALGRIND_VER="/mnt/gvfs/third-party2/valgrind/6c45ef049cbf11c2df593addb712cd891049e737/3.10.0/gcc-4.9-glibc-2.20/4230243/bin/" +VALGRIND_VER="$VALGRIND_BASE/bin/" export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD diff --git a/build_tools/fbcode_config4.8.1.sh b/build_tools/fbcode_config4.8.1.sh index 0a4adc96f..71428d03b 100644 --- a/build_tools/fbcode_config4.8.1.sh +++ b/build_tools/fbcode_config4.8.1.sh @@ -4,85 +4,77 @@ # fbcode settings. 
It uses the latest g++ compiler and also # uses jemalloc +BASEDIR=`dirname $BASH_SOURCE` +source "$BASEDIR/dependencies_4.8.1.sh" + # location of libgcc -LIBGCC_BASE="/mnt/gvfs/third-party2/libgcc/d00277f4559e261ed0a81f30f23c0ce5564e359e/4.8.1/gcc-4.8.1-glibc-2.17/8aac7fc" LIBGCC_INCLUDE="$LIBGCC_BASE/include" LIBGCC_LIBS=" -L $LIBGCC_BASE/libs" # location of glibc -GLIBC_REV=0600c95b31226b5e535614c590677d87c62d8016 -GLIBC_INCLUDE="/mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.17/gcc-4.8.1-glibc-2.17/99df8fc/include" -GLIBC_LIBS=" -L /mnt/gvfs/third-party2/glibc/$GLIBC_REV/2.17/gcc-4.8.1-glibc-2.17/99df8fc/lib" +GLIBC_INCLUDE="$GLIBC_BASE/include" +GLIBC_LIBS=" -L $GLIBC_BASE/lib" # location of snappy headers and libraries -SNAPPY_REV=cbf6f1f209e5bd160bdc5d971744e039f36b1566 -SNAPPY_INCLUDE=" -I /mnt/gvfs/third-party2/snappy/$SNAPPY_REV/1.1.3/gcc-4.8.1-glibc-2.17/c3f970a/include" -SNAPPY_LIBS=" /mnt/gvfs/third-party2/snappy/$SNAPPY_REV/1.1.3/gcc-4.8.1-glibc-2.17/c3f970a/lib/libsnappy.a" +SNAPPY_INCLUDE=" -I $SNAPPY_BASE/include" +SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy.a" # location of zlib headers and libraries -ZLIB_REV=6d39cb54708049f527e713ad19f2aadb9d3667e8 -ZLIB_INCLUDE=" -I /mnt/gvfs/third-party2/zlib/$ZLIB_REV/1.2.8/gcc-4.8.1-glibc-2.17/c3f970a/include" -ZLIB_LIBS=" /mnt/gvfs/third-party2/zlib/$ZLIB_REV/1.2.8/gcc-4.8.1-glibc-2.17/c3f970a/lib/libz.a" +ZLIB_INCLUDE=" -I $ZLIB_BASE/include" +ZLIB_LIBS=" $ZLIB_BASE/lib/libz.a" # location of bzip headers and libraries -BZIP_REV=d6c789bfc2ec4c51a63d66df2878926b8158cde8 -BZIP_INCLUDE=" -I /mnt/gvfs/third-party2/bzip2/$BZIP_REV/1.0.6/gcc-4.8.1-glibc-2.17/c3f970a/include/" -BZIP_LIBS=" /mnt/gvfs/third-party2/bzip2/$BZIP_REV/1.0.6/gcc-4.8.1-glibc-2.17/c3f970a/lib/libbz2.a" +BZIP2_INCLUDE=" -I $BZIP2_BASE/include/" +BZIP2_LIBS=" $BZIP2_BASE/lib/libbz2.a" -LZ4_REV=6858fac689e0f92e584224d91bdb0e39f6c8320d -LZ4_INCLUDE=" -I /mnt/gvfs/third-party2/lz4/$LZ4_REV/r131/gcc-4.8.1-glibc-2.17/c3f970a/include" -LZ4_LIBS=" 
/mnt/gvfs/third-party2/lz4/$LZ4_REV/r131/gcc-4.8.1-glibc-2.17/c3f970a/lib/liblz4.a" +LZ4_INCLUDE=" -I $LZ4_BASE/include" +LZ4_LIBS=" $LZ4_BASE/lib/liblz4.a" -ZSTD_REV=810b81b4705def5243e998b54701f3c504e4009e -ZSTD_INCLUDE=" -I /mnt/gvfs/third-party2/zstd/$ZSTD_REV/0.4.2/gcc-4.8.1-glibc-2.17/c3f970a/include" -ZSTD_LIBS=" /mnt/gvfs/third-party2/zstd/$ZSTD_REV/0.4.2/gcc-4.8.1-glibc-2.17/c3f970a/lib/libzstd.a" +ZSTD_INCLUDE=" -I $ZSTD_BASE/include" +ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd.a" # location of gflags headers and libraries -GFLAGS_REV=c7275a4ceae0aca0929e56964a31dafc53c1ee96 -GFLAGS_INCLUDE=" -I /mnt/gvfs/third-party2/gflags/$GFLAGS_REV/2.1.1/gcc-4.8.1-glibc-2.17/c3f970a/include/" -GFLAGS_LIBS=" /mnt/gvfs/third-party2/gflags/$GFLAGS_REV/2.1.1/gcc-4.8.1-glibc-2.17/c3f970a/lib/libgflags.a" +GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/" +GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags.a" # location of jemalloc -JEMALLOC_REV=c370265e58c4b6602e798df23335a1e9913dae52 -JEMALLOC_INCLUDE=" -I /mnt/gvfs/third-party2/jemalloc/$JEMALLOC_REV/4.0.3/gcc-4.8.1-glibc-2.17/8d31e51/include" -JEMALLOC_LIB="/mnt/gvfs/third-party2/jemalloc/$JEMALLOC_REV/4.0.3/gcc-4.8.1-glibc-2.17/8d31e51/lib/libjemalloc.a" +JEMALLOC_INCLUDE=" -I $JEMALLOC_BASE/include" +JEMALLOC_LIB="$JEMALLOC_BASE/lib/libjemalloc.a" # location of numa -NUMA_REV=ae54a5ed22cdabb1c6446dce4e8ffae5b4446d73 -NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/include/" -NUMA_LIB=" /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/lib/libnuma.a" +NUMA_INCLUDE=" -I $NUMA_BASE/include/" +NUMA_LIB=" $NUMA_BASE/lib/libnuma.a" # location of libunwind -LIBUNWIND_REV=121f1a75c4414683aea8c70b761bfaf187f7c1a3 -LIBUNWIND="/mnt/gvfs/third-party2/libunwind/$LIBUNWIND_REV/trunk/gcc-4.8.1-glibc-2.17/675d945/lib/libunwind.a" +LIBUNWIND="$LIBUNWIND_BASE/lib/libunwind.a" # use Intel SSE support for checksum calculations export USE_SSE=1 
-BINUTILS="/mnt/gvfs/third-party2/binutils/75670d0d8ef4891fd1ec2a7513ef01cd002c823b/2.25/centos6-native/da39a3e/bin" +BINUTILS="$BINUTILS_BASE/bin" AR="$BINUTILS/ar" -DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE" +DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP2_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE" -GCC_BASE="/mnt/gvfs/third-party2/gcc/c0064002d2609ab649603f769f0bd110bbe48029/4.8.1/centos6-native/cc6c9dc" STDLIBS="-L $GCC_BASE/lib64" if [ -z "$USE_CLANG" ]; then # gcc CC="$GCC_BASE/bin/gcc" CXX="$GCC_BASE/bin/g++" - + CFLAGS="-B$BINUTILS/gold -m64 -mtune=generic" CFLAGS+=" -isystem $GLIBC_INCLUDE" CFLAGS+=" -isystem $LIBGCC_INCLUDE" else - # clang - CLANG_BASE="/mnt/gvfs/third-party2/clang/ab054e9a490a8fd4537c0b6ec56e5c91c0f81c91/3.7" - CLANG_INCLUDE="$CLANG_BASE/gcc-4.8.1-glibc-2.17/ee9b060/lib/clang/3.7/include" - CC="$CLANG_BASE/centos6-native/b2feaee/bin/clang" - CXX="$CLANG_BASE/centos6-native/b2feaee/bin/clang++" + # clang + CLANG_BIN="$CLANG_BASE/bin" + CLANG_LIB="$CLANG_BASE/lib" + CLANG_INCLUDE="$CLANG_LIB/clang/*/include" + CC="$CLANG_BIN/clang" + CXX="$CLANG_BIN/clang++" - KERNEL_HEADERS_INCLUDE="/mnt/gvfs/third-party2/kernel-headers/1a48835975c66d30e47770ec419758ed3b9ba010/3.10.62-62_fbk17_03959_ge29cc63/gcc-4.8.1-glibc-2.17/da39a3e/include/" + KERNEL_HEADERS_INCLUDE="$KERNEL_HEADERS_BASE/include/" CFLAGS="-B$BINUTILS/gold -nostdinc -nostdlib" CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/4.8.1 " @@ -100,16 +92,15 @@ CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PR CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DZSTD -DNUMA" CXXFLAGS+=" $CFLAGS" -EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB" +EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP2_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB" EXEC_LDFLAGS+=" 
-Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so" EXEC_LDFLAGS+=" $LIBUNWIND" EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib" PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++" -EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS" +EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP2_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS" -VALGRIND_REV=af85c56f424cd5edfc2c97588299b44ecdec96bb -VALGRIND_VER="/mnt/gvfs/third-party2/valgrind/$VALGRIND_REV/3.8.1/gcc-4.8.1-glibc-2.17/c3f970a/bin/" +VALGRIND_VER="$VALGRIND_BASE/bin/" export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE diff --git a/build_tools/make_new_version.sh b/build_tools/make_new_version.sh index 409944f83..76a847355 100755 --- a/build_tools/make_new_version.sh +++ b/build_tools/make_new_version.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2013, Facebook, Inc. All rights reserved. +# Copyright (c) 2011-present, Facebook, Inc. All rights reserved. # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. An additional grant # of patent rights can be found in the PATENTS file in the same directory. @@ -10,7 +10,7 @@ then GIT="git" fi -# Print out the colored progress info so that it can be brainlessly +# Print out the colored progress info so that it can be brainlessly # distinguished by users. 
function title() { echo -e "\033[1;32m$*\033[0m" diff --git a/build_tools/precommit_checker.py b/build_tools/precommit_checker.py new file mode 100755 index 000000000..ceb5cb4ab --- /dev/null +++ b/build_tools/precommit_checker.py @@ -0,0 +1,198 @@ +#!/usr/local/fbcode/gcc-4.8.1-glibc-2.17-fb/bin/python2.7 + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +import argparse +import commands +import subprocess +import sys +import re +import os +import time + +# +# Simple logger +# + + +class Log: + + LOG_FILE = "/tmp/precommit-check.log" + + def __init__(self): + self.filename = Log.LOG_FILE + self.f = open(self.filename, 'w+', 0) + + def caption(self, str): + line = "\n##### %s #####\n" % str + if self.f: + self.f.write("%s \n" % line) + else: + print(line) + + def error(self, str): + data = "\n\n##### ERROR ##### %s" % str + if self.f: + self.f.write("%s \n" % data) + else: + print(data) + + def log(self, str): + if self.f: + self.f.write("%s \n" % str) + else: + print(str) + +# +# Shell Environment +# + + +class Env(object): + + def __init__(self, tests): + self.tests = tests + self.log = Log() + + def shell(self, cmd, path=os.getcwd()): + if path: + os.chdir(path) + + self.log.log("==== shell session ===========================") + self.log.log("%s> %s" % (path, cmd)) + status = subprocess.call("cd %s; %s" % (path, cmd), shell=True, + stdout=self.log.f, stderr=self.log.f) + self.log.log("status = %s" % status) + self.log.log("============================================== \n\n") + return status + + def GetOutput(self, cmd, path=os.getcwd()): + if path: + os.chdir(path) + + self.log.log("==== shell session ===========================") + self.log.log("%s> %s" % (path, cmd)) + status, out = commands.getstatusoutput(cmd) + self.log.log("status = %s" % status) + self.log.log("out = %s" % out) + self.log.log("============================================== \n\n") + 
return status, out + +# +# Pre-commit checker +# + + +class PreCommitChecker(Env): + + def __init__(self, tests): + Env.__init__(self, tests) + + # + # Get commands for a given job from the determinator file + # + def get_commands(self, test): + status, out = self.GetOutput( + "build_tools/rocksdb-lego-determinator %s" % test, ".") + return status, out + + # + # Run a specific CI job + # + def run_test(self, test): + self.log.caption("Running test %s locally" % test) + + # get commands for the CI job determinator + status, cmds = self.get_commands(test) + if status != 0: + self.log.error("Error getting commands for test %s" % test) + return False + + # Parse the JSON to extract the commands to run + cmds = re.findall("'shell':'([^\']*)'", cmds) + + if len(cmds) == 0: + self.log.log("No commands found") + return False + + # Run commands + for cmd in cmds: + # Replace J=<..> with the local environment variable + if "J" in os.environ: + cmd = cmd.replace("J=1", "J=%s" % os.environ["J"]) + cmd = cmd.replace("make ", "make -j%s " % os.environ["J"]) + # Run the command + status = self.shell(cmd, ".") + if status != 0: + self.log.error("Error running command %s for test %s" + % (cmd, test)) + return False + + return True + + # + # Run specified CI jobs + # + def run_tests(self): + if not self.tests: + self.log.error("Invalid args. 
Please provide tests") + return False + + self.print_separator() + self.print_row("TEST", "RESULT") + self.print_separator() + + for test in self.tests: + start_time = time.time() + self.print_test(test) + result = self.run_test(test) + elapsed_min = (time.time() - start_time) / 60 + if not result: + self.log.error("Error running test %s" % test) + self.print_result("FAIL (%dm)" % elapsed_min) + return False + self.print_result("PASS (%dm)" % elapsed_min) + + self.print_separator() + return True + + # + # Print a line + # + def print_separator(self): + print("".ljust(60, "-")) + + # + # Print two colums + # + def print_row(self, c0, c1): + print("%s%s" % (c0.ljust(40), c1.ljust(20))) + + def print_test(self, test): + print(test.ljust(40), end="") + sys.stdout.flush() + + def print_result(self, result): + print(result.ljust(20)) + +# +# Main +# +parser = argparse.ArgumentParser(description='RocksDB pre-commit checker.') + +# +parser.add_argument('test', nargs='+', + help='CI test(s) to run. e.g: unit punit asan tsan') + +print("Please follow log %s" % Log.LOG_FILE) + +args = parser.parse_args() +checker = PreCommitChecker(args.test) + +if not checker.run_tests(): + print("Error running tests. 
Please check log file %s" % Log.LOG_FILE) + sys.exit(1) + +sys.exit(0) diff --git a/build_tools/update_dependencies.sh b/build_tools/update_dependencies.sh new file mode 100755 index 000000000..9959700cf --- /dev/null +++ b/build_tools/update_dependencies.sh @@ -0,0 +1,127 @@ +#!/bin/sh +# +# Update dependencies.sh file with the latest avaliable versions + +BASEDIR=$(dirname $0) +OUTPUT="" + +function log_variable() +{ + echo "$1=${!1}" >> "$OUTPUT" +} + + +TP2_LATEST="/mnt/vol/engshare/fbcode/third-party2" +## $1 => lib name +## $2 => lib version (if not provided, will try to pick latest) +## $3 => platform (if not provided, will try to pick latest gcc) +## +## get_lib_base will set a variable named ${LIB_NAME}_BASE to the lib location +function get_lib_base() +{ + local lib_name=$1 + local lib_version=$2 + local lib_platform=$3 + + local result="$TP2_LATEST/$lib_name/" + + # Lib Version + if [ -z "$lib_version" ] || [ "$lib_version" = "LATEST" ]; then + # version is not provided, use latest + result=`ls -dr1v $result/*/ | head -n1` + else + result="$result/$lib_version/" + fi + + # Lib Platform + if [ -z "$lib_platform" ]; then + # platform is not provided, use latest gcc + result=`ls -dr1v $result/gcc-*[^fb]/ | head -n1` + else + result="$result/$lib_platform/" + fi + + result=`ls -1d $result/*/ | head -n1` + + # lib_name => LIB_NAME_BASE + local __res_var=${lib_name^^}"_BASE" + __res_var=`echo $__res_var | tr - _` + # LIB_NAME_BASE=$result + eval $__res_var=`readlink -f $result` + + log_variable $__res_var +} + +########################################################### +# 4.9.x dependencies # +########################################################### + +OUTPUT="$BASEDIR/dependencies.sh" + +rm -f "$OUTPUT" +touch "$OUTPUT" + +echo "Writing dependencies to $OUTPUT" + +# Compilers locations +GCC_BASE=`ls -d1 $TP2_LATEST/gcc/4.9.x/centos6-native/*/ | head -n1` +CLANG_BASE=`ls -d1 
/mnt/gvfs/third-party2/clang/fc904e50a9266b9d7b98cae1993afa0c5aae1440/3.7.1/centos6-native/*/ | head -n1` + +log_variable GCC_BASE +log_variable CLANG_BASE + +# Libraries locations +get_lib_base libgcc 4.9.x +get_lib_base glibc 2.20 +get_lib_base snappy LATEST +get_lib_base zlib LATEST +get_lib_base bzip2 LATEST +get_lib_base lz4 LATEST +get_lib_base zstd LATEST +get_lib_base gflags LATEST +get_lib_base jemalloc LATEST +get_lib_base numa LATEST +get_lib_base libunwind LATEST + +get_lib_base kernel-headers LATEST +get_lib_base binutils LATEST centos6-native +get_lib_base valgrind LATEST + +git diff $OUTPUT + +########################################################### +# 4.8.1 dependencies # +########################################################### + +OUTPUT="$BASEDIR/dependencies_4.8.1.sh" + +rm -f "$OUTPUT" +touch "$OUTPUT" + +echo "Writing 4.8.1 dependencies to $OUTPUT" + +# Compilers locations +GCC_BASE=`ls -d1 $TP2_LATEST/gcc/4.8.1/centos6-native/*/ | head -n1` +CLANG_BASE=`ls -d1 /mnt/gvfs/third-party2/clang/fc904e50a9266b9d7b98cae1993afa0c5aae1440/3.7.1/centos6-native/*/ | head -n1` + +log_variable GCC_BASE +log_variable CLANG_BASE + +# Libraries locations +get_lib_base libgcc 4.8.1 gcc-4.8.1-glibc-2.17 +get_lib_base glibc 2.17 gcc-4.8.1-glibc-2.17 +get_lib_base snappy LATEST gcc-4.8.1-glibc-2.17 +get_lib_base zlib LATEST gcc-4.8.1-glibc-2.17 +get_lib_base bzip2 LATEST gcc-4.8.1-glibc-2.17 +get_lib_base lz4 LATEST gcc-4.8.1-glibc-2.17 +get_lib_base zstd LATEST gcc-4.8.1-glibc-2.17 +get_lib_base gflags LATEST gcc-4.8.1-glibc-2.17 +get_lib_base jemalloc LATEST gcc-4.8.1-glibc-2.17 +get_lib_base numa LATEST gcc-4.8.1-glibc-2.17 +get_lib_base libunwind LATEST gcc-4.8.1-glibc-2.17 + +get_lib_base kernel-headers LATEST gcc-4.8.1-glibc-2.17 +get_lib_base binutils LATEST centos6-native +get_lib_base valgrind 3.8.1 gcc-4.8.1-glibc-2.17 + +git diff $OUTPUT diff --git a/util/auto_roll_logger.cc b/db/auto_roll_logger.cc similarity index 88% rename from 
util/auto_roll_logger.cc rename to db/auto_roll_logger.cc index e9b13d109..8118b2377 100644 --- a/util/auto_roll_logger.cc +++ b/db/auto_roll_logger.cc @@ -1,9 +1,9 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. // -#include "util/auto_roll_logger.h" +#include "db/auto_roll_logger.h" #include "util/mutexlock.h" using namespace std; @@ -12,7 +12,9 @@ namespace rocksdb { // -- AutoRollLogger Status AutoRollLogger::ResetLogger() { + TEST_SYNC_POINT("AutoRollLogger::ResetLogger:BeforeNewLogger"); status_ = env_->NewLogger(log_fname_, &logger_); + TEST_SYNC_POINT("AutoRollLogger::ResetLogger:AfterNewLogger"); if (!status_.ok()) { return status_; @@ -32,8 +34,16 @@ Status AutoRollLogger::ResetLogger() { } void AutoRollLogger::RollLogFile() { - std::string old_fname = OldInfoLogFileName( - dbname_, env_->NowMicros(), db_absolute_path_, db_log_dir_); + // This function is called when log is rotating. Two rotations + // can happen quickly (NowMicro returns same value). To not overwrite + // previous log file we increment by one micro second and try again. + uint64_t now = env_->NowMicros(); + std::string old_fname; + do { + old_fname = OldInfoLogFileName( + dbname_, now, db_absolute_path_, db_log_dir_); + now++; + } while (env_->FileExists(old_fname).ok()); env_->RenameFile(log_fname_, old_fname); } diff --git a/util/auto_roll_logger.h b/db/auto_roll_logger.h similarity index 82% rename from util/auto_roll_logger.h rename to db/auto_roll_logger.h index 1288cdf3a..a43a98a8f 100644 --- a/util/auto_roll_logger.h +++ b/db/auto_roll_logger.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -13,6 +13,8 @@ #include "db/filename.h" #include "port/port.h" #include "port/util_logger.h" +#include "util/sync_point.h" +#include "util/mutexlock.h" namespace rocksdb { @@ -53,11 +55,26 @@ class AutoRollLogger : public Logger { return status_; } - size_t GetLogFileSize() const override { return logger_->GetLogFileSize(); } + size_t GetLogFileSize() const override { + std::shared_ptr logger; + { + MutexLock l(&mutex_); + // pin down the current logger_ instance before releasing the mutex. + logger = logger_; + } + return logger->GetLogFileSize(); + } void Flush() override { - if (logger_) { - logger_->Flush(); + std::shared_ptr logger; + { + MutexLock l(&mutex_); + // pin down the current logger_ instance before releasing the mutex. + logger = logger_; + } + TEST_SYNC_POINT("AutoRollLogger::Flush:PinnedLogger"); + if (logger) { + logger->Flush(); } } @@ -101,7 +118,7 @@ class AutoRollLogger : public Logger { uint64_t ctime_; uint64_t cached_now_access_count; uint64_t call_NowMicros_every_N_records_; - port::Mutex mutex_; + mutable port::Mutex mutex_; }; // Facade to craete logger automatically diff --git a/util/auto_roll_logger_test.cc b/db/auto_roll_logger_test.cc similarity index 84% rename from util/auto_roll_logger_test.cc rename to db/auto_roll_logger_test.cc index c26be2bd2..60c89a186 100644 --- a/util/auto_roll_logger_test.cc +++ b/db/auto_roll_logger_test.cc @@ -1,17 +1,20 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. // #include +#include #include #include #include #include #include #include +#include "db/auto_roll_logger.h" +#include "port/port.h" +#include "util/sync_point.h" #include "util/testharness.h" -#include "util/auto_roll_logger.h" #include "rocksdb/db.h" #include #include @@ -260,7 +263,60 @@ TEST_F(AutoRollLoggerTest, CreateLoggerFromOptions) { auto_roll_logger, options.log_file_time_to_roll, kSampleMessage + ":CreateLoggerFromOptions - both"); } -#endif + +TEST_F(AutoRollLoggerTest, LogFlushWhileRolling) { + DBOptions options; + shared_ptr logger; + + InitTestDb(); + options.max_log_file_size = 1024 * 5; + ASSERT_OK(CreateLoggerFromOptions(kTestDir, options, &logger)); + AutoRollLogger* auto_roll_logger = + dynamic_cast(logger.get()); + ASSERT_TRUE(auto_roll_logger); + std::thread flush_thread; + + rocksdb::SyncPoint::GetInstance()->LoadDependency({ + // Need to pin the old logger before beginning the roll, as rolling grabs + // the mutex, which would prevent us from accessing the old logger. + {"AutoRollLogger::Flush:PinnedLogger", + "AutoRollLoggerTest::LogFlushWhileRolling:PreRollAndPostThreadInit"}, + // Need to finish the flush thread init before this callback because the + // callback accesses flush_thread.get_id() in order to apply certain sync + // points only to the flush thread. + {"AutoRollLoggerTest::LogFlushWhileRolling:PreRollAndPostThreadInit", + "AutoRollLoggerTest::LogFlushWhileRolling:FlushCallbackBegin"}, + // Need to reset logger at this point in Flush() to exercise a race + // condition case, which is executing the flush with the pinned (old) + // logger after the roll has cut over to a new logger. 
+ {"AutoRollLoggerTest::LogFlushWhileRolling:FlushCallback1", + "AutoRollLogger::ResetLogger:BeforeNewLogger"}, + {"AutoRollLogger::ResetLogger:AfterNewLogger", + "AutoRollLoggerTest::LogFlushWhileRolling:FlushCallback2"}, + }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "PosixLogger::Flush:BeginCallback", [&](void* arg) { + TEST_SYNC_POINT( + "AutoRollLoggerTest::LogFlushWhileRolling:FlushCallbackBegin"); + if (std::this_thread::get_id() == flush_thread.get_id()) { + TEST_SYNC_POINT( + "AutoRollLoggerTest::LogFlushWhileRolling:FlushCallback1"); + TEST_SYNC_POINT( + "AutoRollLoggerTest::LogFlushWhileRolling:FlushCallback2"); + } + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + flush_thread = std::thread([&]() { auto_roll_logger->Flush(); }); + TEST_SYNC_POINT( + "AutoRollLoggerTest::LogFlushWhileRolling:PreRollAndPostThreadInit"); + RollLogFileBySizeTest(auto_roll_logger, options.max_log_file_size, + kSampleMessage + ":LogFlushWhileRolling"); + flush_thread.join(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +#endif // OS_WIN TEST_F(AutoRollLoggerTest, InfoLogLevel) { InitTestDb(); diff --git a/db/builder.cc b/db/builder.cc index 52605b27d..317c9b054 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -63,7 +63,7 @@ Status BuildTable( const CompressionType compression, const CompressionOptions& compression_opts, bool paranoid_file_checks, InternalStats* internal_stats, const Env::IOPriority io_priority, - TableProperties* table_properties) { + TableProperties* table_properties, int level) { // Reports the IOStats for flush for every following bytes. 
const size_t kReportFlushIOStatsEvery = 1048576; Status s; @@ -149,7 +149,8 @@ Status BuildTable( ReadOptions(), env_options, internal_comparator, meta->fd, nullptr, (internal_stats == nullptr) ? nullptr : internal_stats->GetFileReadHist(0), - false)); + false /* for_compaction */, nullptr /* arena */, + false /* skip_filter */, level)); s = it->status(); if (s.ok() && paranoid_file_checks) { for (it->SeekToFirst(); it->Valid(); it->Next()) { diff --git a/db/builder.h b/db/builder.h index 9a4d3b60b..1eba6da9c 100644 --- a/db/builder.h +++ b/db/builder.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -61,6 +61,6 @@ extern Status BuildTable( const CompressionOptions& compression_opts, bool paranoid_file_checks, InternalStats* internal_stats, const Env::IOPriority io_priority = Env::IO_HIGH, - TableProperties* table_properties = nullptr); + TableProperties* table_properties = nullptr, int level = -1); } // namespace rocksdb diff --git a/db/c.cc b/db/c.cc index 731a76331..9f49aba23 100644 --- a/db/c.cc +++ b/db/c.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -1288,6 +1288,11 @@ void rocksdb_block_based_options_set_cache_index_and_filter_blocks( options->rep.cache_index_and_filter_blocks = v; } +void rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache( + rocksdb_block_based_table_options_t* options, unsigned char v) { + options->rep.pin_l0_filter_and_index_blocks_in_cache = v; +} + void rocksdb_block_based_options_set_skip_table_builder_flush( rocksdb_block_based_table_options_t* options, unsigned char v) { options->rep.skip_table_builder_flush = v; diff --git a/db/column_family.cc b/db/column_family.cc index 408f53831..cde308d8c 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -135,6 +135,10 @@ Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options) { "Delete filtering (filter_deletes) is not compatible with concurrent " "memtable writes (allow_concurrent_memtable_writes)"); } + if (!cf_options.memtable_factory->IsInsertConcurrentlySupported()) { + return Status::InvalidArgument( + "Memtable doesn't concurrent writes (allow_concurrent_memtable_write)"); + } return Status::OK(); } @@ -143,13 +147,10 @@ ColumnFamilyOptions SanitizeOptions(const DBOptions& db_options, const ColumnFamilyOptions& src) { ColumnFamilyOptions result = src; result.comparator = icmp; -#ifdef OS_MACOSX - // TODO(icanadi) make write_buffer_size uint64_t instead of size_t - ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, ((size_t)1) << 30); -#else - ClipToRange(&result.write_buffer_size, - ((size_t)64) << 10, ((size_t)64) << 30); -#endif + size_t clamp_max = std::conditional< + sizeof(size_t) == 4, std::integral_constant, + std::integral_constant>::type::value; + ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, clamp_max); // if user sets arena_block_size, we trust user to use this value. 
Otherwise, // calculate a proper value from writer_buffer_size; if (result.arena_block_size <= 0) { @@ -239,6 +240,17 @@ ColumnFamilyOptions SanitizeOptions(const DBOptions& db_options, result.level0_slowdown_writes_trigger, result.level0_file_num_compaction_trigger); } + + if (result.soft_pending_compaction_bytes_limit == 0) { + result.soft_pending_compaction_bytes_limit = + result.hard_pending_compaction_bytes_limit; + } else if (result.hard_pending_compaction_bytes_limit > 0 && + result.soft_pending_compaction_bytes_limit > + result.hard_pending_compaction_bytes_limit) { + result.soft_pending_compaction_bytes_limit = + result.hard_pending_compaction_bytes_limit; + } + if (result.level_compaction_dynamic_level_bytes) { if (result.compaction_style != kCompactionStyleLevel || db_options.db_paths.size() > 1U) { @@ -513,6 +525,21 @@ std::unique_ptr SetupDelay( } return write_controller->GetDelayToken(write_rate); } + +int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger, + int level0_slowdown_writes_trigger) { + // SanitizeOptions() ensures it. + assert(level0_file_num_compaction_trigger <= level0_slowdown_writes_trigger); + + // 1/4 of the way between L0 compaction trigger threshold and slowdown + // condition. + // Or twice as compaction trigger, if it is smaller. 
+ return std::min(level0_file_num_compaction_trigger * 2, + level0_file_num_compaction_trigger + + (level0_slowdown_writes_trigger - + level0_file_num_compaction_trigger) / + 4); +} } // namespace void ColumnFamilyData::RecalculateWriteStallConditions( @@ -531,21 +558,6 @@ void ColumnFamilyData::RecalculateWriteStallConditions( "(waiting for flush), max_write_buffer_number is set to %d", name_.c_str(), imm()->NumNotFlushed(), mutable_cf_options.max_write_buffer_number); - } else if (mutable_cf_options.max_write_buffer_number > 3 && - imm()->NumNotFlushed() >= - mutable_cf_options.max_write_buffer_number - 1) { - write_controller_token_ = - SetupDelay(ioptions_.delayed_write_rate, write_controller, - compaction_needed_bytes, prev_compaction_needed_bytes_, - mutable_cf_options.disable_auto_compactions); - internal_stats_->AddCFStats(InternalStats::MEMTABLE_SLOWDOWN, 1); - Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, - "[%s] Stalling writes because we have %d immutable memtables " - "(waiting for flush), max_write_buffer_number is set to %d " - "rate %" PRIu64, - name_.c_str(), imm()->NumNotFlushed(), - mutable_cf_options.max_write_buffer_number, - write_controller->delayed_write_rate()); } else if (vstorage->l0_delay_trigger_count() >= mutable_cf_options.level0_stop_writes_trigger) { write_controller_token_ = write_controller->GetStopToken(); @@ -567,6 +579,21 @@ void ColumnFamilyData::RecalculateWriteStallConditions( "[%s] Stopping writes because of estimated pending compaction " "bytes %" PRIu64, name_.c_str(), compaction_needed_bytes); + } else if (mutable_cf_options.max_write_buffer_number > 3 && + imm()->NumNotFlushed() >= + mutable_cf_options.max_write_buffer_number - 1) { + write_controller_token_ = + SetupDelay(ioptions_.delayed_write_rate, write_controller, + compaction_needed_bytes, prev_compaction_needed_bytes_, + mutable_cf_options.disable_auto_compactions); + internal_stats_->AddCFStats(InternalStats::MEMTABLE_SLOWDOWN, 1); + 
Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, + "[%s] Stalling writes because we have %d immutable memtables " + "(waiting for flush), max_write_buffer_number is set to %d " + "rate %" PRIu64, + name_.c_str(), imm()->NumNotFlushed(), + mutable_cf_options.max_write_buffer_number, + write_controller->delayed_write_rate()); } else if (mutable_cf_options.level0_slowdown_writes_trigger >= 0 && vstorage->l0_delay_trigger_count() >= mutable_cf_options.level0_slowdown_writes_trigger) { @@ -598,6 +625,29 @@ void ColumnFamilyData::RecalculateWriteStallConditions( "bytes %" PRIu64 " rate %" PRIu64, name_.c_str(), vstorage->estimated_compaction_needed_bytes(), write_controller->delayed_write_rate()); + } else if (vstorage->l0_delay_trigger_count() >= + GetL0ThresholdSpeedupCompaction( + mutable_cf_options.level0_file_num_compaction_trigger, + mutable_cf_options.level0_slowdown_writes_trigger)) { + write_controller_token_ = write_controller->GetCompactionPressureToken(); + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, + "[%s] Increasing compaction threads because we have %d level-0 " + "files ", + name_.c_str(), vstorage->l0_delay_trigger_count()); + } else if (vstorage->estimated_compaction_needed_bytes() >= + mutable_cf_options.soft_pending_compaction_bytes_limit / 4) { + // Increase compaction threads if bytes needed for compaction exceeds + // 1/4 of threshold for slowing down. + // If soft pending compaction byte limit is not set, always speed up + // compaction. 
+ write_controller_token_ = write_controller->GetCompactionPressureToken(); + if (mutable_cf_options.soft_pending_compaction_bytes_limit > 0) { + Log(InfoLogLevel::WARN_LEVEL, ioptions_.info_log, + "[%s] Increasing compaction threads because of estimated pending " + "compaction " + "bytes %" PRIu64, + name_.c_str(), vstorage->estimated_compaction_needed_bytes()); + } } else { write_controller_token_.reset(); } diff --git a/db/column_family.h b/db/column_family.h index 6266d40a2..1a4036e60 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -465,6 +465,8 @@ class ColumnFamilySet { // Don't call while iterating over ColumnFamilySet void FreeDeadColumnFamilies(); + Cache* get_table_cache() { return table_cache_; } + private: friend class ColumnFamilyData; // helper function that gets called from cfd destructor diff --git a/db/column_family_test.cc b/db/column_family_test.cc index d86735c2e..6e05a7da9 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -65,6 +65,7 @@ class ColumnFamilyTest : public testing::Test { ~ColumnFamilyTest() { Close(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); Destroy(); delete env_; } @@ -1951,13 +1952,27 @@ TEST_F(ColumnFamilyTest, ReadDroppedColumnFamily) { PutRandomData(1, kKeysNum, 100); PutRandomData(2, kKeysNum, 100); - if (iter == 0) { - // Drop CF two - ASSERT_OK(db_->DropColumnFamily(handles_[2])); - } else { - // delete CF two - delete handles_[2]; - handles_[2] = nullptr; + { + std::unique_ptr iterator( + db_->NewIterator(ReadOptions(), handles_[2])); + iterator->SeekToFirst(); + + if (iter == 0) { + // Drop CF two + ASSERT_OK(db_->DropColumnFamily(handles_[2])); + } else { + // delete CF two + delete handles_[2]; + handles_[2] = nullptr; + } + // Make sure iterator created can still be used. + int count = 0; + for (; iterator->Valid(); iterator->Next()) { + ASSERT_OK(iterator->status()); + ++count; + } + ASSERT_OK(iterator->status()); + ASSERT_EQ(count, kKeysNum); } // Add bunch more data to other CFs @@ -1999,7 +2014,9 @@ TEST_F(ColumnFamilyTest, FlushAndDropRaceCondition) { Reopen({options, options}); rocksdb::SyncPoint::GetInstance()->LoadDependency( - {{"VersionSet::LogAndApply::ColumnFamilyDrop:1", + {{"VersionSet::LogAndApply::ColumnFamilyDrop:0", + "FlushJob::WriteLevel0Table"}, + {"VersionSet::LogAndApply::ColumnFamilyDrop:1", "FlushJob::InstallResults"}, {"FlushJob::InstallResults", "VersionSet::LogAndApply::ColumnFamilyDrop:2"}}); @@ -2045,7 +2062,6 @@ TEST_F(ColumnFamilyTest, FlushAndDropRaceCondition) { Close(); Destroy(); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } #ifndef ROCKSDB_LITE @@ -2123,7 +2139,6 @@ TEST_F(ColumnFamilyTest, CreateAndDropRace) { drop_cf_thread.join(); Close(); Destroy(); - rocksdb::SyncPoint::GetInstance()->DisableProcessing(); for (auto* comparator : comparators) { if (comparator) { delete comparator; @@ -2135,6 +2150,9 @@ TEST_F(ColumnFamilyTest, CreateAndDropRace) { TEST_F(ColumnFamilyTest, 
WriteStallSingleColumnFamily) { const uint64_t kBaseRate = 810000u; db_options_.delayed_write_rate = kBaseRate; + db_options_.base_background_compactions = 2; + db_options_.max_background_compactions = 6; + Open({"default"}); ColumnFamilyData* cfd = static_cast(db_->DefaultColumnFamily())->cfd(); @@ -2160,6 +2178,7 @@ TEST_F(ColumnFamilyTest, WriteStallSingleColumnFamily) { ASSERT_TRUE(!dbfull()->TEST_write_controler().IsStopped()); ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate, dbfull()->TEST_write_controler().delayed_write_rate()); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); vstorage->TEST_set_estimated_compaction_needed_bytes(400); cfd->RecalculateWriteStallConditions(mutable_cf_options); @@ -2167,6 +2186,7 @@ TEST_F(ColumnFamilyTest, WriteStallSingleColumnFamily) { ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.2, dbfull()->TEST_write_controler().delayed_write_rate()); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); vstorage->TEST_set_estimated_compaction_needed_bytes(500); cfd->RecalculateWriteStallConditions(mutable_cf_options); @@ -2222,6 +2242,7 @@ TEST_F(ColumnFamilyTest, WriteStallSingleColumnFamily) { cfd->RecalculateWriteStallConditions(mutable_cf_options); ASSERT_TRUE(dbfull()->TEST_write_controler().IsStopped()); ASSERT_TRUE(!dbfull()->TEST_write_controler().NeedsDelay()); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); vstorage->TEST_set_estimated_compaction_needed_bytes(3001); cfd->RecalculateWriteStallConditions(mutable_cf_options); @@ -2246,6 +2267,7 @@ TEST_F(ColumnFamilyTest, WriteStallSingleColumnFamily) { ASSERT_TRUE(dbfull()->TEST_write_controler().NeedsDelay()); ASSERT_EQ(kBaseRate / 1.2, dbfull()->TEST_write_controler().delayed_write_rate()); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); vstorage->set_l0_delay_trigger_count(101); cfd->RecalculateWriteStallConditions(mutable_cf_options); @@ -2318,6 +2340,73 @@ TEST_F(ColumnFamilyTest, 
WriteStallSingleColumnFamily) { dbfull()->TEST_write_controler().delayed_write_rate()); } +TEST_F(ColumnFamilyTest, CompactionSpeedupSingleColumnFamily) { + db_options_.base_background_compactions = 2; + db_options_.max_background_compactions = 6; + Open({"default"}); + ColumnFamilyData* cfd = + static_cast(db_->DefaultColumnFamily())->cfd(); + + VersionStorageInfo* vstorage = cfd->current()->storage_info(); + + MutableCFOptions mutable_cf_options( + Options(db_options_, column_family_options_), + ImmutableCFOptions(Options(db_options_, column_family_options_))); + + // Speed up threshold = min(4 * 2, 4 + (36 - 4)/4) = 8 + mutable_cf_options.level0_file_num_compaction_trigger = 4; + mutable_cf_options.level0_slowdown_writes_trigger = 36; + mutable_cf_options.level0_stop_writes_trigger = 50; + // Speedup threshold = 200 / 4 = 50 + mutable_cf_options.soft_pending_compaction_bytes_limit = 200; + mutable_cf_options.hard_pending_compaction_bytes_limit = 2000; + + vstorage->TEST_set_estimated_compaction_needed_bytes(40); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(2, dbfull()->BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(50); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(300); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(45); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(2, dbfull()->BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(7); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(2, dbfull()->BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(9); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + 
vstorage->set_l0_delay_trigger_count(6); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(2, dbfull()->BGCompactionsAllowed()); + + // Speed up threshold = min(4 * 2, 4 + (12 - 4)/4) = 6 + mutable_cf_options.level0_file_num_compaction_trigger = 4; + mutable_cf_options.level0_slowdown_writes_trigger = 16; + mutable_cf_options.level0_stop_writes_trigger = 30; + + vstorage->set_l0_delay_trigger_count(5); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(2, dbfull()->BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(7); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(3); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(2, dbfull()->BGCompactionsAllowed()); +} + TEST_F(ColumnFamilyTest, WriteStallTwoColumnFamilies) { const uint64_t kBaseRate = 810000u; db_options_.delayed_write_rate = kBaseRate; @@ -2399,6 +2488,104 @@ TEST_F(ColumnFamilyTest, WriteStallTwoColumnFamilies) { ASSERT_EQ(kBaseRate / 1.2, dbfull()->TEST_write_controler().delayed_write_rate()); } + +TEST_F(ColumnFamilyTest, CompactionSpeedupTwoColumnFamilies) { + db_options_.base_background_compactions = 2; + db_options_.max_background_compactions = 6; + column_family_options_.soft_pending_compaction_bytes_limit = 200; + column_family_options_.hard_pending_compaction_bytes_limit = 2000; + Open(); + CreateColumnFamilies({"one"}); + ColumnFamilyData* cfd = + static_cast(db_->DefaultColumnFamily())->cfd(); + VersionStorageInfo* vstorage = cfd->current()->storage_info(); + + ColumnFamilyData* cfd1 = + static_cast(handles_[1])->cfd(); + VersionStorageInfo* vstorage1 = cfd1->current()->storage_info(); + + MutableCFOptions mutable_cf_options( + Options(db_options_, column_family_options_), + ImmutableCFOptions(Options(db_options_, column_family_options_))); + // Speed up threshold = min(4 * 2, 4 + (36 - 4)/4) = 8 + 
mutable_cf_options.level0_file_num_compaction_trigger = 4; + mutable_cf_options.level0_slowdown_writes_trigger = 36; + mutable_cf_options.level0_stop_writes_trigger = 30; + // Speedup threshold = 200 / 4 = 50 + mutable_cf_options.soft_pending_compaction_bytes_limit = 200; + mutable_cf_options.hard_pending_compaction_bytes_limit = 2000; + + MutableCFOptions mutable_cf_options1 = mutable_cf_options; + mutable_cf_options1.level0_slowdown_writes_trigger = 16; + + vstorage->TEST_set_estimated_compaction_needed_bytes(40); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(2, dbfull()->BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(60); + cfd1->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(2, dbfull()->BGCompactionsAllowed()); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage1->TEST_set_estimated_compaction_needed_bytes(30); + cfd1->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage1->TEST_set_estimated_compaction_needed_bytes(70); + cfd1->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage->TEST_set_estimated_compaction_needed_bytes(20); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage1->TEST_set_estimated_compaction_needed_bytes(3); + cfd1->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(2, dbfull()->BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(9); + cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage1->set_l0_delay_trigger_count(2); + cfd1->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(6, dbfull()->BGCompactionsAllowed()); + + vstorage->set_l0_delay_trigger_count(0); + 
cfd->RecalculateWriteStallConditions(mutable_cf_options); + ASSERT_EQ(2, dbfull()->BGCompactionsAllowed()); +} + +TEST_F(ColumnFamilyTest, LogSyncConflictFlush) { + Open(); + CreateColumnFamiliesAndReopen({"one", "two"}); + + Put(0, "", ""); + Put(1, "foo", "bar"); + + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"DBImpl::SyncWAL:BeforeMarkLogsSynced:1", + "ColumnFamilyTest::LogSyncConflictFlush:1"}, + {"ColumnFamilyTest::LogSyncConflictFlush:2", + "DBImpl::SyncWAL:BeforeMarkLogsSynced:2"}}); + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + std::thread thread([&] { db_->SyncWAL(); }); + + TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:1"); + Flush(1); + Put(1, "foo", "bar"); + Flush(1); + + TEST_SYNC_POINT("ColumnFamilyTest::LogSyncConflictFlush:2"); + + thread.join(); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + Close(); +} } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/compact_files_test.cc b/db/compact_files_test.cc index b2a131ecf..794defb11 100644 --- a/db/compact_files_test.cc +++ b/db/compact_files_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -7,6 +7,7 @@ #include #include +#include #include #include "rocksdb/db.h" @@ -107,6 +108,7 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) { break; } } + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); delete db; } @@ -141,9 +143,6 @@ TEST_F(CompactFilesTest, ObsoleteFiles) { } auto l0_files = collector->GetFlushedFiles(); - CompactionOptions compact_opt; - compact_opt.compression = kNoCompression; - compact_opt.output_file_size_limit = kWriteBufferSize * 5; ASSERT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1)); // verify all compaction input files are deleted @@ -153,6 +152,62 @@ TEST_F(CompactFilesTest, ObsoleteFiles) { delete db; } +TEST_F(CompactFilesTest, CapturingPendingFiles) { + Options options; + options.create_if_missing = true; + // Disable RocksDB background compaction. + options.compaction_style = kCompactionStyleNone; + // Always do full scans for obsolete files (needed to reproduce the issue). + options.delete_obsolete_files_period_micros = 0; + + // Add listener. + FlushedFileCollector* collector = new FlushedFileCollector(); + options.listeners.emplace_back(collector); + + DB* db = nullptr; + DestroyDB(db_name_, options); + Status s = DB::Open(options, db_name_, &db); + assert(s.ok()); + assert(db); + + // Create 5 files. + for (int i = 0; i < 5; ++i) { + db->Put(WriteOptions(), "key" + ToString(i), "value"); + db->Flush(FlushOptions()); + } + + auto l0_files = collector->GetFlushedFiles(); + EXPECT_EQ(5, l0_files.size()); + + rocksdb::SyncPoint::GetInstance()->LoadDependency({ + {"CompactFilesImpl:2", "CompactFilesTest.CapturingPendingFiles:0"}, + {"CompactFilesTest.CapturingPendingFiles:1", "CompactFilesImpl:3"}, + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + // Start compacting files. + std::thread compaction_thread( + [&] { EXPECT_OK(db->CompactFiles(CompactionOptions(), l0_files, 1)); }); + + // In the meantime flush another file. 
+ TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:0"); + db->Put(WriteOptions(), "key5", "value"); + db->Flush(FlushOptions()); + TEST_SYNC_POINT("CompactFilesTest.CapturingPendingFiles:1"); + + compaction_thread.join(); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + + delete db; + + // Make sure we can reopen the DB. + s = DB::Open(options, db_name_, &db); + ASSERT_TRUE(s.ok()); + assert(db); + delete db; +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/compacted_db_impl.cc b/db/compacted_db_impl.cc index 980b34e12..db8daa549 100644 --- a/db/compacted_db_impl.cc +++ b/db/compacted_db_impl.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/compacted_db_impl.h b/db/compacted_db_impl.h index ec2d53762..9c42010a6 100644 --- a/db/compacted_db_impl.h +++ b/db/compacted_db_impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/compaction.cc b/db/compaction.cc index e28cf68ee..21bdcf2a0 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/compaction.h b/db/compaction.h index 19e1e7581..729c4edaf 100644 --- a/db/compaction.h +++ b/db/compaction.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -138,6 +138,8 @@ class Compaction { // Clear all files to indicate that they are not being compacted // Delete this compaction from the list of running compactions. + // + // Requirement: DB mutex held void ReleaseCompactionFiles(Status status); // Returns the summary of the compaction in "output" with maximum "len" diff --git a/db/compaction_iterator.cc b/db/compaction_iterator.cc index 1e3e140f4..20eed4f3d 100644 --- a/db/compaction_iterator.cc +++ b/db/compaction_iterator.cc @@ -1,6 +1,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/compaction_iterator.h b/db/compaction_iterator.h index cc8e4a531..b13aef3ff 100644 --- a/db/compaction_iterator.h +++ b/db/compaction_iterator.h @@ -1,6 +1,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/compaction_iterator_test.cc b/db/compaction_iterator_test.cc index a59f56771..4cbccca55 100644 --- a/db/compaction_iterator_test.cc +++ b/db/compaction_iterator_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 2d0711ff0..b04541adb 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -51,6 +51,7 @@ #include "util/iostats_context_imp.h" #include "util/log_buffer.h" #include "util/logging.h" +#include "util/sst_file_manager_impl.h" #include "util/mutexlock.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" @@ -211,6 +212,7 @@ CompactionJob::CompactionJob( const EnvOptions& env_options, VersionSet* versions, std::atomic* shutting_down, LogBuffer* log_buffer, Directory* db_directory, Directory* output_directory, Statistics* stats, + InstrumentedMutex* db_mutex, Status* db_bg_error, std::vector existing_snapshots, SequenceNumber earliest_write_conflict_snapshot, std::shared_ptr table_cache, EventLogger* event_logger, @@ -230,6 +232,8 @@ CompactionJob::CompactionJob( db_directory_(db_directory), output_directory_(output_directory), stats_(stats), + db_mutex_(db_mutex), + db_bg_error_(db_bg_error), existing_snapshots_(std::move(existing_snapshots)), earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot), table_cache_(std::move(table_cache)), @@ -237,7 +241,9 @@ CompactionJob::CompactionJob( paranoid_file_checks_(paranoid_file_checks), measure_io_stats_(measure_io_stats) { assert(log_buffer_ != nullptr); - ThreadStatusUtil::SetColumnFamily(compact_->compaction->column_family_data()); + const auto* cfd = compact_->compaction->column_family_data(); + ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, + cfd->options()->enable_thread_tracking); ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); ReportStartedCompaction(compaction); } @@ -249,8 +255,9 @@ CompactionJob::~CompactionJob() { void CompactionJob::ReportStartedCompaction( Compaction* compaction) { - ThreadStatusUtil::SetColumnFamily( - compact_->compaction->column_family_data()); + const auto* cfd = compact_->compaction->column_family_data(); + ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, + cfd->options()->enable_thread_tracking); ThreadStatusUtil::SetThreadOperationProperty( ThreadStatus::COMPACTION_JOB_ID, @@ -356,7 +363,7 
@@ void CompactionJob::GenSubcompactionBoundaries() { size_t num_files = flevel->num_files; if (num_files == 0) { - break; + continue; } if (lvl == 0) { @@ -415,12 +422,9 @@ void CompactionJob::GenSubcompactionBoundaries() { // Group the ranges into subcompactions const double min_file_fill_percent = 4.0 / 5; - uint64_t max_output_files = - static_cast( - std::ceil( - sum / min_file_fill_percent / - cfd->GetCurrentMutableCFOptions()->MaxFileSizeForLevel(out_lvl)) - ); + uint64_t max_output_files = static_cast(std::ceil( + sum / min_file_fill_percent / + cfd->GetCurrentMutableCFOptions()->MaxFileSizeForLevel(out_lvl))); uint64_t subcompactions = std::min({static_cast(ranges.size()), static_cast(db_options_.max_subcompactions), @@ -518,18 +522,17 @@ Status CompactionJob::Run() { return status; } -Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options, - InstrumentedMutex* db_mutex) { +Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) { AutoThreadOperationStageUpdater stage_updater( ThreadStatus::STAGE_COMPACTION_INSTALL); - db_mutex->AssertHeld(); + db_mutex_->AssertHeld(); Status status = compact_->status; ColumnFamilyData* cfd = compact_->compaction->column_family_data(); cfd->internal_stats()->AddCompactionStats( compact_->compaction->output_level(), compaction_stats_); if (status.ok()) { - status = InstallCompactionResults(mutable_cf_options, db_mutex); + status = InstallCompactionResults(mutable_cf_options); } VersionStorageInfo::LevelSummaryStorage tmp; auto vstorage = cfd->current()->storage_info(); @@ -855,13 +858,33 @@ Status CompactionJob::FinishCompactionOutputFile( event_logger_, cfd->ioptions()->listeners, meta->fd, info); } } + + // Report new file to SstFileManagerImpl + auto sfm = + static_cast(db_options_.sst_file_manager.get()); + if (sfm && meta->fd.GetPathId() == 0) { + ColumnFamilyData* cfd = sub_compact->compaction->column_family_data(); + auto fn = TableFileName(cfd->ioptions()->db_paths, 
meta->fd.GetNumber(), + meta->fd.GetPathId()); + sfm->OnAddFile(fn); + if (sfm->IsMaxAllowedSpaceReached()) { + InstrumentedMutexLock l(db_mutex_); + if (db_bg_error_->ok()) { + s = Status::IOError("Max allowed space was reached"); + *db_bg_error_ = s; + TEST_SYNC_POINT( + "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached"); + } + } + } + sub_compact->builder.reset(); return s; } Status CompactionJob::InstallCompactionResults( - const MutableCFOptions& mutable_cf_options, InstrumentedMutex* db_mutex) { - db_mutex->AssertHeld(); + const MutableCFOptions& mutable_cf_options) { + db_mutex_->AssertHeld(); auto* compaction = compact_->compaction; // paranoia: verify that the files that we started with @@ -896,7 +919,7 @@ Status CompactionJob::InstallCompactionResults( } return versions_->LogAndApply(compaction->column_family_data(), mutable_cf_options, compaction->edit(), - db_mutex, db_directory_); + db_mutex_, db_directory_); } void CompactionJob::RecordCompactionIOStats() { diff --git a/db/compaction_job.h b/db/compaction_job.h index e4d5244e3..c6edefbe0 100644 --- a/db/compaction_job.h +++ b/db/compaction_job.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -56,7 +56,8 @@ class CompactionJob { const EnvOptions& env_options, VersionSet* versions, std::atomic* shutting_down, LogBuffer* log_buffer, Directory* db_directory, Directory* output_directory, - Statistics* stats, + Statistics* stats, InstrumentedMutex* db_mutex, + Status* db_bg_error, std::vector existing_snapshots, SequenceNumber earliest_write_conflict_snapshot, std::shared_ptr table_cache, EventLogger* event_logger, @@ -77,8 +78,7 @@ class CompactionJob { Status Run(); // REQUIRED: mutex held - Status Install(const MutableCFOptions& mutable_cf_options, - InstrumentedMutex* db_mutex); + Status Install(const MutableCFOptions& mutable_cf_options); private: struct SubcompactionState; @@ -95,8 +95,7 @@ class CompactionJob { Status FinishCompactionOutputFile(const Status& input_status, SubcompactionState* sub_compact); - Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options, - InstrumentedMutex* db_mutex); + Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options); void RecordCompactionIOStats(); Status OpenCompactionOutputFile(SubcompactionState* sub_compact); void CleanupCompaction(); @@ -130,6 +129,8 @@ class CompactionJob { Directory* db_directory_; Directory* output_directory_; Statistics* stats_; + InstrumentedMutex* db_mutex_; + Status* db_bg_error_; // If there were two snapshots with seq numbers s1 and // s2 and s1 < s2, and if we find two instances of a key k1 then lies // entirely within s1 and s2, then the earlier version of k1 can be safely diff --git a/db/compaction_job_stats_test.cc b/db/compaction_job_stats_test.cc index df38bd3b7..4ead93510 100644 --- a/db/compaction_job_stats_test.cc +++ b/db/compaction_job_stats_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/compaction_job_test.cc b/db/compaction_job_test.cc index 0c438296f..f3bc4cca9 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction_job_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -250,9 +250,9 @@ class CompactionJobTest : public testing::Test { EventLogger event_logger(db_options_.info_log.get()); CompactionJob compaction_job( 0, &compaction, db_options_, env_options_, versions_.get(), - &shutting_down_, &log_buffer, nullptr, nullptr, nullptr, snapshots, - earliest_write_conflict_snapshot, table_cache_, &event_logger, false, - false, dbname_, &compaction_job_stats_); + &shutting_down_, &log_buffer, nullptr, nullptr, nullptr, &mutex_, + &bg_error_, snapshots, earliest_write_conflict_snapshot, table_cache_, + &event_logger, false, false, dbname_, &compaction_job_stats_); VerifyInitializationOfCompactionJobStats(compaction_job_stats_); @@ -262,8 +262,7 @@ class CompactionJobTest : public testing::Test { s = compaction_job.Run(); ASSERT_OK(s); mutex_.Lock(); - ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions(), - &mutex_)); + ASSERT_OK(compaction_job.Install(*cfd->GetLatestMutableCFOptions())); mutex_.Unlock(); if (expected_results.size() == 0) { @@ -295,6 +294,7 @@ class CompactionJobTest : public testing::Test { ColumnFamilyData* cfd_; std::unique_ptr compaction_filter_; std::shared_ptr merge_op_; + Status bg_error_; }; TEST_F(CompactionJobTest, Simple) { diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 5bb27907e..91b9a2b5a 100644 --- a/db/compaction_picker.cc +++ 
b/db/compaction_picker.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -141,7 +141,8 @@ CompactionPicker::~CompactionPicker() {} // Delete this compaction from the list of running compactions. void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) { - if (c->start_level() == 0) { + if (c->start_level() == 0 || + ioptions_.compaction_style == kCompactionStyleUniversal) { level0_compactions_in_progress_.erase(c); } if (!status.ok()) { @@ -612,6 +613,17 @@ Compaction* CompactionPicker::CompactRange( if (input_level == 0) { level0_compactions_in_progress_.insert(compaction); } + + // Creating a compaction influences the compaction score because the score + // takes running compactions into account (by skipping files that are already + // being compacted). Since we just changed compaction score, we recalculate it + // here + { // this piece of code recomputes compaction score + CompactionOptionsFIFO dummy_compaction_options_fifo; + vstorage->ComputeCompactionScore(mutable_cf_options, + dummy_compaction_options_fifo); + } + return compaction; } diff --git a/db/compaction_picker.h b/db/compaction_picker.h index c082a9fce..0503c8692 100644 --- a/db/compaction_picker.h +++ b/db/compaction_picker.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -83,6 +83,8 @@ class CompactionPicker { #endif // ROCKSDB_LITE // Free up the files that participated in a compaction + // + // Requirement: DB mutex held void ReleaseCompactionFiles(Compaction* c, Status status); // Returns true if any one of the specified files are being compacted diff --git a/db/compaction_picker_test.cc b/db/compaction_picker_test.cc index 50f97f247..98d80eea8 100644 --- a/db/compaction_picker_test.cc +++ b/db/compaction_picker_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -487,6 +487,87 @@ TEST_F(CompactionPickerTest, NeedsCompactionFIFO) { } #endif // ROCKSDB_LITE +TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.target_file_size_base = 10000000; + mutable_cf_options_.target_file_size_multiplier = 10; + mutable_cf_options_.compaction_pri = kMinOverlappingRatio; + + Add(2, 6U, "150", "179", 50000000U); + Add(2, 7U, "180", "220", 50000000U); + Add(2, 8U, "321", "400", 50000000U); // File not overlapping + Add(2, 9U, "721", "800", 50000000U); + + Add(3, 26U, "150", "170", 260000000U); + Add(3, 27U, "171", "179", 260000000U); + Add(3, 28U, "191", "220", 260000000U); + Add(3, 29U, "221", "300", 260000000U); + Add(3, 30U, "750", "900", 260000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + // Pick file 8 because it overlaps with 0 files on level 3. 
+ ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, CompactionPriMinOverlapping2) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.target_file_size_base = 10000000; + mutable_cf_options_.target_file_size_multiplier = 10; + mutable_cf_options_.compaction_pri = kMinOverlappingRatio; + + Add(2, 6U, "150", "175", + 60000000U); // Overlaps with file 26, 27, total size 521M + Add(2, 7U, "176", "200", 60000000U); // Overlaps with file 27, 28, total size + // 520M, the smalelst overlapping + Add(2, 8U, "201", "300", + 60000000U); // Overlaps with file 28, 29, total size 521M + + Add(3, 26U, "100", "110", 261000000U); + Add(3, 26U, "150", "170", 261000000U); + Add(3, 27U, "171", "179", 260000000U); + Add(3, 28U, "191", "220", 260000000U); + Add(3, 29U, "221", "300", 261000000U); + Add(3, 30U, "321", "400", 261000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + // Picking file 7 because overlapping ratio is the biggest. + ASSERT_EQ(7U, compaction->input(0, 0)->fd.GetNumber()); +} + +TEST_F(CompactionPickerTest, CompactionPriMinOverlapping3) { + NewVersionStorage(6, kCompactionStyleLevel); + mutable_cf_options_.target_file_size_base = 10000000; + mutable_cf_options_.target_file_size_multiplier = 10; + mutable_cf_options_.compaction_pri = kMinOverlappingRatio; + + // file 7 and 8 over lap with the same file, but file 8 is smaller so + // it will be picked. 
+ Add(2, 6U, "150", "175", 60000000U); // Overlaps with file 26, 27 + Add(2, 7U, "176", "200", 60000000U); // Overlaps with file 27 + Add(2, 8U, "201", "300", 61000000U); // Overlaps with file 27 + + Add(3, 26U, "160", "165", 260000000U); + Add(3, 26U, "166", "170", 260000000U); + Add(3, 27U, "180", "400", 260000000U); + Add(3, 28U, "401", "500", 260000000U); + UpdateVersionStorageInfo(); + + std::unique_ptr compaction(level_compaction_picker.PickCompaction( + cf_name_, mutable_cf_options_, vstorage_.get(), &log_buffer_)); + ASSERT_TRUE(compaction.get() != nullptr); + ASSERT_EQ(1U, compaction->num_input_files(0)); + // Picking file 8 because overlapping ratio is the biggest. + ASSERT_EQ(8U, compaction->input(0, 0)->fd.GetNumber()); +} + // This test exhibits the bug where we don't properly reset parent_index in // PickCompaction() TEST_F(CompactionPickerTest, ParentIndexResetBug) { diff --git a/db/comparator_db_test.cc b/db/comparator_db_test.cc index 530c91060..e4e84107e 100644 --- a/db/comparator_db_test.cc +++ b/db/comparator_db_test.cc @@ -1,6 +1,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/convenience.cc b/db/convenience.cc index a9d113ff1..b1042c74d 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/corruption_test.cc b/db/corruption_test.cc index f6e06bec8..85bfe57cb 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/cuckoo_table_db_test.cc b/db/cuckoo_table_db_test.cc index 09a68de92..f48b5b436 100644 --- a/db/cuckoo_table_db_test.cc +++ b/db/cuckoo_table_db_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc new file mode 100644 index 000000000..939cf44e0 --- /dev/null +++ b/db/db_block_cache_test.cc @@ -0,0 +1,240 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include +#include "db/db_test_util.h" +#include "port/stack_trace.h" + +namespace rocksdb { + +static uint64_t TestGetTickerCount(const Options& options, + Tickers ticker_type) { + return options.statistics->getTickerCount(ticker_type); +} + +class DBBlockCacheTest : public DBTestBase { + private: + size_t miss_count_ = 0; + size_t hit_count_ = 0; + size_t insert_count_ = 0; + size_t failure_count_ = 0; + size_t compressed_miss_count_ = 0; + size_t compressed_hit_count_ = 0; + size_t compressed_insert_count_ = 0; + size_t compressed_failure_count_ = 0; + + public: + const size_t kNumBlocks = 10; + const size_t kValueSize = 100; + + DBBlockCacheTest() : DBTestBase("/db_block_cache_test") {} + + BlockBasedTableOptions GetTableOptions() { + BlockBasedTableOptions table_options; + // Set a small enough block size so that each key-value get its own block. + table_options.block_size = 1; + return table_options; + } + + Options GetOptions(const BlockBasedTableOptions& table_options) { + Options options = CurrentOptions(); + options.create_if_missing = true; + // options.compression = kNoCompression; + options.statistics = rocksdb::CreateDBStatistics(); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + return options; + } + + void InitTable(const Options& options) { + std::string value(kValueSize, 'a'); + for (size_t i = 0; i < kNumBlocks; i++) { + ASSERT_OK(Put(ToString(i), value.c_str())); + } + } + + void RecordCacheCounters(const Options& options) { + miss_count_ = TestGetTickerCount(options, BLOCK_CACHE_MISS); + hit_count_ = TestGetTickerCount(options, BLOCK_CACHE_HIT); + insert_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD); + failure_count_ = TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES); + compressed_miss_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS); + compressed_hit_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT); + compressed_insert_count_ = + TestGetTickerCount(options, 
BLOCK_CACHE_COMPRESSED_ADD); + compressed_failure_count_ = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); + } + + void CheckCacheCounters(const Options& options, size_t expected_misses, + size_t expected_hits, size_t expected_inserts, + size_t expected_failures) { + size_t new_miss_count = TestGetTickerCount(options, BLOCK_CACHE_MISS); + size_t new_hit_count = TestGetTickerCount(options, BLOCK_CACHE_HIT); + size_t new_insert_count = TestGetTickerCount(options, BLOCK_CACHE_ADD); + size_t new_failure_count = + TestGetTickerCount(options, BLOCK_CACHE_ADD_FAILURES); + ASSERT_EQ(miss_count_ + expected_misses, new_miss_count); + ASSERT_EQ(hit_count_ + expected_hits, new_hit_count); + ASSERT_EQ(insert_count_ + expected_inserts, new_insert_count); + ASSERT_EQ(failure_count_ + expected_failures, new_failure_count); + miss_count_ = new_miss_count; + hit_count_ = new_hit_count; + insert_count_ = new_insert_count; + failure_count_ = new_failure_count; + } + + void CheckCompressedCacheCounters(const Options& options, + size_t expected_misses, + size_t expected_hits, + size_t expected_inserts, + size_t expected_failures) { + size_t new_miss_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS); + size_t new_hit_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT); + size_t new_insert_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD); + size_t new_failure_count = + TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); + ASSERT_EQ(compressed_miss_count_ + expected_misses, new_miss_count); + ASSERT_EQ(compressed_hit_count_ + expected_hits, new_hit_count); + ASSERT_EQ(compressed_insert_count_ + expected_inserts, new_insert_count); + ASSERT_EQ(compressed_failure_count_ + expected_failures, new_failure_count); + compressed_miss_count_ = new_miss_count; + compressed_hit_count_ = new_hit_count; + compressed_insert_count_ = new_insert_count; + compressed_failure_count_ = new_failure_count; + } +}; + 
+TEST_F(DBBlockCacheTest, TestWithoutCompressedBlockCache) { + ReadOptions read_options; + auto table_options = GetTableOptions(); + auto options = GetOptions(table_options); + InitTable(options); + + std::shared_ptr cache = NewLRUCache(0, 0, false); + table_options.block_cache = cache; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + Reopen(options); + RecordCacheCounters(options); + + std::vector> iterators(kNumBlocks - 1); + Iterator* iter = nullptr; + + // Load blocks into cache. + for (size_t i = 0; i < kNumBlocks - 1; i++) { + iter = db_->NewIterator(read_options); + iter->Seek(ToString(i)); + ASSERT_OK(iter->status()); + CheckCacheCounters(options, 1, 0, 1, 0); + iterators[i].reset(iter); + } + size_t usage = cache->GetUsage(); + ASSERT_LT(0, usage); + cache->SetCapacity(usage); + ASSERT_EQ(usage, cache->GetPinnedUsage()); + + // Test with strict capacity limit. + cache->SetStrictCapacityLimit(true); + iter = db_->NewIterator(read_options); + iter->Seek(ToString(kNumBlocks - 1)); + ASSERT_TRUE(iter->status().IsIncomplete()); + CheckCacheCounters(options, 1, 0, 0, 1); + delete iter; + iter = nullptr; + + // Release interators and access cache again. 
+ for (size_t i = 0; i < kNumBlocks - 1; i++) { + iterators[i].reset(); + CheckCacheCounters(options, 0, 0, 0, 0); + } + ASSERT_EQ(0, cache->GetPinnedUsage()); + for (size_t i = 0; i < kNumBlocks - 1; i++) { + iter = db_->NewIterator(read_options); + iter->Seek(ToString(i)); + ASSERT_OK(iter->status()); + CheckCacheCounters(options, 0, 1, 0, 0); + iterators[i].reset(iter); + } +} + +#ifdef SNAPPY +TEST_F(DBBlockCacheTest, TestWithCompressedBlockCache) { + ReadOptions read_options; + auto table_options = GetTableOptions(); + auto options = GetOptions(table_options); + options.compression = CompressionType::kSnappyCompression; + InitTable(options); + + std::shared_ptr cache = NewLRUCache(0, 0, false); + std::shared_ptr compressed_cache = NewLRUCache(0, 0, false); + table_options.block_cache = cache; + table_options.block_cache_compressed = compressed_cache; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + Reopen(options); + RecordCacheCounters(options); + + std::vector> iterators(kNumBlocks - 1); + Iterator* iter = nullptr; + + // Load blocks into cache. + for (size_t i = 0; i < kNumBlocks - 1; i++) { + iter = db_->NewIterator(read_options); + iter->Seek(ToString(i)); + ASSERT_OK(iter->status()); + CheckCacheCounters(options, 1, 0, 1, 0); + CheckCompressedCacheCounters(options, 1, 0, 1, 0); + iterators[i].reset(iter); + } + size_t usage = cache->GetUsage(); + ASSERT_LT(0, usage); + ASSERT_EQ(usage, cache->GetPinnedUsage()); + size_t compressed_usage = compressed_cache->GetUsage(); + ASSERT_LT(0, compressed_usage); + // Compressed block cache cannot be pinned. + ASSERT_EQ(0, compressed_cache->GetPinnedUsage()); + + // Set strict capacity limit flag. Now block will only load into compressed + // block cache. 
+ cache->SetCapacity(usage); + cache->SetStrictCapacityLimit(true); + ASSERT_EQ(usage, cache->GetPinnedUsage()); + // compressed_cache->SetCapacity(compressed_usage); + compressed_cache->SetCapacity(0); + // compressed_cache->SetStrictCapacityLimit(true); + iter = db_->NewIterator(read_options); + iter->Seek(ToString(kNumBlocks - 1)); + ASSERT_TRUE(iter->status().IsIncomplete()); + CheckCacheCounters(options, 1, 0, 0, 1); + CheckCompressedCacheCounters(options, 1, 0, 1, 0); + delete iter; + iter = nullptr; + + // Clear strict capacity limit flag. This time we shall hit compressed block + // cache. + cache->SetStrictCapacityLimit(false); + iter = db_->NewIterator(read_options); + iter->Seek(ToString(kNumBlocks - 1)); + ASSERT_OK(iter->status()); + CheckCacheCounters(options, 1, 0, 1, 0); + CheckCompressedCacheCounters(options, 0, 1, 0, 0); + delete iter; + iter = nullptr; +} +#endif + +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/db_compaction_filter_test.cc b/db/db_compaction_filter_test.cc index 6fde1fe06..88738ec36 100644 --- a/db/db_compaction_filter_test.cc +++ b/db/db_compaction_filter_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index e2925ba07..6dedccd82 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -533,6 +533,104 @@ TEST_P(DBCompactionTestWithParam, CompactionTrigger) { ASSERT_EQ(NumTableFilesAtLevel(1, 1), 1); } +TEST_F(DBCompactionTest, BGCompactionsAllowed) { + // Create several column families. Make compaction triggers in all of them + // and see number of compactions scheduled to be less than allowed. + const int kNumKeysPerFile = 100; + + Options options; + options.write_buffer_size = 110 << 10; // 110KB + options.arena_block_size = 4 << 10; + options.num_levels = 3; + // Should speed up compaction when there are 4 files. + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 20; + options.soft_pending_compaction_bytes_limit = 1 << 30; // Infinitely large + options.base_background_compactions = 1; + options.max_background_compactions = 3; + options.memtable_factory.reset(new SpecialSkipListFactory(kNumKeysPerFile)); + options = CurrentOptions(options); + + // Block all threads in thread pool. 
+ const size_t kTotalTasks = 4; + env_->SetBackgroundThreads(4, Env::LOW); + test::SleepingBackgroundTask sleeping_tasks[kTotalTasks]; + for (size_t i = 0; i < kTotalTasks; i++) { + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_tasks[i], Env::Priority::LOW); + sleeping_tasks[i].WaitUntilSleeping(); + } + + CreateAndReopenWithCF({"one", "two", "three"}, options); + + Random rnd(301); + for (int cf = 0; cf < 4; cf++) { + for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(cf, Key(i), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(cf, "", "")); + dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1); + } + } + + // Now all column families qualify compaction but only one should be + // scheduled, because no column family hits speed up condition. + ASSERT_EQ(1, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + + // Create two more files for one column family, which triggers speed up + // condition, three compactions will be scheduled. + for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(2, Key(i), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(2, "", "")); + dbfull()->TEST_WaitForFlushMemTable(handles_[2]); + ASSERT_EQ(options.level0_file_num_compaction_trigger + num + 1, + NumTableFilesAtLevel(0, 2)); + } + ASSERT_EQ(3, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + + // Unblock all threads to unblock all compactions. + for (size_t i = 0; i < kTotalTasks; i++) { + sleeping_tasks[i].WakeUp(); + sleeping_tasks[i].WaitUntilDone(); + } + dbfull()->TEST_WaitForCompact(); + + // Verify number of compactions allowed will come back to 1. 
+ + for (size_t i = 0; i < kTotalTasks; i++) { + sleeping_tasks[i].Reset(); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_tasks[i], Env::Priority::LOW); + sleeping_tasks[i].WaitUntilSleeping(); + } + for (int cf = 0; cf < 4; cf++) { + for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { + for (int i = 0; i < kNumKeysPerFile; i++) { + ASSERT_OK(Put(cf, Key(i), "")); + } + // put extra key to trigger flush + ASSERT_OK(Put(cf, "", "")); + dbfull()->TEST_WaitForFlushMemTable(handles_[cf]); + ASSERT_EQ(NumTableFilesAtLevel(0, cf), num + 1); + } + } + + // Now all column families qualify compaction but only one should be + // scheduled, because no column family hits speed up condition. + ASSERT_EQ(1, env_->GetThreadPoolQueueLen(Env::Priority::LOW)); + + for (size_t i = 0; i < kTotalTasks; i++) { + sleeping_tasks[i].WakeUp(); + sleeping_tasks[i].WaitUntilDone(); + } +} + TEST_P(DBCompactionTestWithParam, CompactionsGenerateMultipleFiles) { Options options; options.write_buffer_size = 100000000; // Large write buffer @@ -1898,7 +1996,7 @@ TEST_P(DBCompactionTestWithParam, DISABLED_CompactFilesOnLevelCompaction) { std::set overlapping_file_names; std::vector compaction_input_file_names; for (int f = 0; f < file_picked; ++f) { - int level; + int level = 0; auto file_meta = PickFileRandomly(cf_meta, &rnd, &level); compaction_input_file_names.push_back(file_meta->name); GetOverlappingFileNumbersForLevelCompaction( @@ -2198,6 +2296,25 @@ TEST_P(DBCompactionTestWithParam, CompressLevelCompaction) { Destroy(options); } +TEST_F(DBCompactionTest, SanitizeCompactionOptionsTest) { + Options options = CurrentOptions(); + options.max_background_compactions = 5; + options.soft_pending_compaction_bytes_limit = 0; + options.hard_pending_compaction_bytes_limit = 100; + options.create_if_missing = true; + DestroyAndReopen(options); + ASSERT_EQ(5, db_->GetOptions().base_background_compactions); + ASSERT_EQ(100, 
db_->GetOptions().soft_pending_compaction_bytes_limit); + + options.base_background_compactions = 4; + options.max_background_compactions = 3; + options.soft_pending_compaction_bytes_limit = 200; + options.hard_pending_compaction_bytes_limit = 150; + DestroyAndReopen(options); + ASSERT_EQ(3, db_->GetOptions().base_background_compactions); + ASSERT_EQ(150, db_->GetOptions().soft_pending_compaction_bytes_limit); +} + // This tests for a bug that could cause two level0 compactions running // concurrently // TODO(aekmekji): Make sure that the reason this fails when run with @@ -2390,8 +2507,12 @@ TEST_P(CompactionPriTest, Test) { } } -INSTANTIATE_TEST_CASE_P(CompactionPriTest, CompactionPriTest, - ::testing::Values(0, 1, 2)); +INSTANTIATE_TEST_CASE_P( + CompactionPriTest, CompactionPriTest, + ::testing::Values(CompactionPri::kByCompensatedSize, + CompactionPri::kOldestLargestSeqFirst, + CompactionPri::kOldestSmallestSeqFirst, + CompactionPri::kMinOverlappingRatio)); #endif // !defined(ROCKSDB_LITE) } // namespace rocksdb diff --git a/db/db_dynamic_level_test.cc b/db/db_dynamic_level_test.cc index f29985e05..7b3a15e56 100644 --- a/db/db_dynamic_level_test.cc +++ b/db/db_dynamic_level_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index e39ccf496..89ebb6650 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/db_impl.cc b/db/db_impl.cc index fa3713f44..338b31ba0 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -15,6 +15,9 @@ #include #include +#ifdef OS_SOLARIS +#include +#endif #include #include @@ -28,8 +31,10 @@ #include #include +#include "db/auto_roll_logger.h" #include "db/builder.h" #include "db/compaction_job.h" +#include "db/db_info_dumper.h" #include "db/db_iter.h" #include "db/dbformat.h" #include "db/event_helpers.h" @@ -51,6 +56,7 @@ #include "db/write_batch_internal.h" #include "db/write_callback.h" #include "db/writebuffer.h" +#include "db/xfunc_test_points.h" #include "memtable/hash_linklist_rep.h" #include "memtable/hash_skiplist_rep.h" #include "port/likely.h" @@ -58,7 +64,6 @@ #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/db.h" -#include "rocksdb/delete_scheduler.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/sst_file_writer.h" @@ -72,19 +77,18 @@ #include "table/merger.h" #include "table/table_builder.h" #include "table/two_level_iterator.h" -#include "util/auto_roll_logger.h" #include "util/autovector.h" #include "util/build_version.h" #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" -#include "util/db_info_dumper.h" #include "util/file_reader_writer.h" #include "util/file_util.h" #include "util/iostats_context_imp.h" #include 
"util/log_buffer.h" #include "util/logging.h" #include "util/mutexlock.h" +#include "util/sst_file_manager_impl.h" #include "util/options_helper.h" #include "util/options_parser.h" #include "util/perf_context_imp.h" @@ -142,6 +146,12 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { result.info_log = nullptr; } } + if (result.base_background_compactions == -1) { + result.base_background_compactions = result.max_background_compactions; + } + if (result.base_background_compactions > result.max_background_compactions) { + result.base_background_compactions = result.max_background_compactions; + } result.env->IncBackgroundThreadsIfNeeded(src.max_background_compactions, Env::Priority::LOW); result.env->IncBackgroundThreadsIfNeeded(src.max_background_flushes, @@ -265,13 +275,14 @@ DBImpl::DBImpl(const DBOptions& options, const std::string& dbname) db_options_.delete_obsolete_files_period_micros), last_stats_dump_time_microsec_(0), next_job_id_(1), - flush_on_destroy_(false), + has_unpersisted_data_(false), env_options_(db_options_), #ifndef ROCKSDB_LITE wal_manager_(db_options_, env_options_), #endif // ROCKSDB_LITE event_logger_(db_options_.info_log.get()), bg_work_paused_(0), + bg_compaction_paused_(0), refitting_level_(false), opened_successfully_(false) { env_->GetAbsolutePath(dbname, &db_absolute_path_); @@ -312,7 +323,8 @@ void DBImpl::CancelAllBackgroundWork(bool wait) { DBImpl::~DBImpl() { mutex_.Lock(); - if (!shutting_down_.load(std::memory_order_acquire) && flush_on_destroy_) { + if (!shutting_down_.load(std::memory_order_acquire) && + has_unpersisted_data_) { for (auto cfd : *versions_->GetColumnFamilySet()) { if (!cfd->IsDropped() && !cfd->mem()->IsEmpty()) { cfd->Ref(); @@ -484,23 +496,22 @@ void DBImpl::MaybeDumpStats() { last_stats_dump_time_microsec_ = now_micros; #ifndef ROCKSDB_LITE - bool tmp1 = false; - bool tmp2 = false; - DBPropertyType cf_property_type = - GetPropertyType(DB::Properties::kCFStats, &tmp1, &tmp2); - 
DBPropertyType db_property_type = - GetPropertyType(DB::Properties::kDBStats, &tmp1, &tmp2); + const DBPropertyInfo* cf_property_info = + GetPropertyInfo(DB::Properties::kCFStats); + assert(cf_property_info != nullptr); + const DBPropertyInfo* db_property_info = + GetPropertyInfo(DB::Properties::kDBStats); + assert(db_property_info != nullptr); + std::string stats; { InstrumentedMutexLock l(&mutex_); for (auto cfd : *versions_->GetColumnFamilySet()) { - cfd->internal_stats()->GetStringProperty(cf_property_type, - DB::Properties::kCFStats, - &stats); + cfd->internal_stats()->GetStringProperty( + *cf_property_info, DB::Properties::kCFStats, &stats); } - default_cf_internal_stats_->GetStringProperty(db_property_type, - DB::Properties::kDBStats, - &stats); + default_cf_internal_stats_->GetStringProperty( + *db_property_info, DB::Properties::kDBStats, &stats); } Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "------- DUMPING STATS -------"); @@ -561,6 +572,7 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, // Get obsolete files. This function will also update the list of // pending files in VersionSet(). versions_->GetObsoleteFiles(&job_context->sst_delete_files, + &job_context->manifest_delete_files, job_context->min_pending_output); // store the current filenum, lognum, etc @@ -678,9 +690,9 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state) { } auto candidate_files = state.full_scan_candidate_files; - candidate_files.reserve(candidate_files.size() + - state.sst_delete_files.size() + - state.log_delete_files.size()); + candidate_files.reserve( + candidate_files.size() + state.sst_delete_files.size() + + state.log_delete_files.size() + state.manifest_delete_files.size()); // We may ignore the dbname when generating the file names. 
const char* kDumbDbName = ""; for (auto file : state.sst_delete_files) { @@ -696,6 +708,9 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state) { 0); } } + for (const auto& filename : state.manifest_delete_files) { + candidate_files.emplace_back(filename, 0); + } // dedup state.candidate_files so we don't try to delete the same // file twice @@ -782,8 +797,8 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state) { } #endif // !ROCKSDB_LITE Status file_deletion_status; - if (type == kTableFile && path_id == 0) { - file_deletion_status = DeleteOrMoveToTrash(&db_options_, fname); + if (type == kTableFile) { + file_deletion_status = DeleteSSTFile(&db_options_, fname, path_id); } else { file_deletion_status = env_->DeleteFile(fname); } @@ -814,7 +829,8 @@ void DBImpl::PurgeObsoleteFiles(const JobContext& state) { // Delete old info log files. size_t old_info_log_file_count = old_info_log_files.size(); - if (old_info_log_file_count >= db_options_.keep_log_file_num) { + if (old_info_log_file_count != 0 && + old_info_log_file_count >= db_options_.keep_log_file_num) { std::sort(old_info_log_files.begin(), old_info_log_files.end()); size_t end = old_info_log_file_count - db_options_.keep_log_file_num; for (unsigned int i = 0; i <= end; i++) { @@ -1393,9 +1409,9 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, mutex_.AssertHeld(); const uint64_t start_micros = env_->NowMicros(); FileMetaData meta; - meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); auto pending_outputs_inserted_elem = CaptureCurrentFileNumberInPendingOutputs(); + meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); ReadOptions ro; ro.total_order_seek = true; Arena arena; @@ -1517,13 +1533,26 @@ Status DBImpl::FlushMemTableToOutputFile( bg_error_ = s; } RecordFlushIOStats(); -#ifndef ROCKSDB_LITE if (s.ok()) { +#ifndef ROCKSDB_LITE // may temporarily unlock and lock the mutex. 
NotifyOnFlushCompleted(cfd, &file_meta, mutable_cf_options, job_context->job_id, flush_job.GetTableProperties()); - } #endif // ROCKSDB_LITE + auto sfm = + static_cast(db_options_.sst_file_manager.get()); + if (sfm) { + // Notify sst_file_manager that a new file was added + std::string file_path = MakeTableFileName(db_options_.db_paths[0].path, + file_meta.fd.GetNumber()); + sfm->OnAddFile(file_path); + if (sfm->IsMaxAllowedSpaceReached() && bg_error_.ok()) { + bg_error_ = Status::IOError("Max allowed space was reached"); + TEST_SYNC_POINT( + "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached"); + } + } + } return s; } @@ -1813,13 +1842,16 @@ Status DBImpl::CompactFilesImpl( std::vector snapshot_seqs = snapshots_.GetAll(&earliest_write_conflict_snapshot); + auto pending_outputs_inserted_elem = + CaptureCurrentFileNumberInPendingOutputs(); + assert(is_snapshot_supported_ || snapshots_.empty()); CompactionJob compaction_job( job_context->job_id, c.get(), db_options_, env_options_, versions_.get(), &shutting_down_, log_buffer, directories_.GetDbDir(), - directories_.GetDataDir(c->output_path_id()), stats_, snapshot_seqs, - earliest_write_conflict_snapshot, table_cache_, &event_logger_, - c->mutable_cf_options()->paranoid_file_checks, + directories_.GetDataDir(c->output_path_id()), stats_, &mutex_, &bg_error_, + snapshot_seqs, earliest_write_conflict_snapshot, table_cache_, + &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->compaction_measure_io_stats, dbname_, nullptr); // Here we pass a nullptr for CompactionJobStats because // CompactFiles does not trigger OnCompactionCompleted(), @@ -1834,21 +1866,35 @@ Status DBImpl::CompactFilesImpl( // support for CompactFiles, we should have CompactFiles API // pass a pointer of CompactionJobStats as the out-value // instead of using EventListener. 
+ + // Creating a compaction influences the compaction score because the score + // takes running compactions into account (by skipping files that are already + // being compacted). Since we just changed compaction score, we recalculate it + // here. + { + CompactionOptionsFIFO dummy_compaction_options_fifo; + version->storage_info()->ComputeCompactionScore( + *c->mutable_cf_options(), dummy_compaction_options_fifo); + } + compaction_job.Prepare(); mutex_.Unlock(); TEST_SYNC_POINT("CompactFilesImpl:0"); TEST_SYNC_POINT("CompactFilesImpl:1"); compaction_job.Run(); + TEST_SYNC_POINT("CompactFilesImpl:2"); + TEST_SYNC_POINT("CompactFilesImpl:3"); mutex_.Lock(); - Status status = compaction_job.Install(*c->mutable_cf_options(), &mutex_); + Status status = compaction_job.Install(*c->mutable_cf_options()); if (status.ok()) { InstallSuperVersionAndScheduleWorkWrapper( c->column_family_data(), job_context, *c->mutable_cf_options()); } c->ReleaseCompactionFiles(s); - c.reset(); + + ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); if (status.ok()) { // Done @@ -1864,6 +1910,8 @@ Status DBImpl::CompactFilesImpl( } } + c.reset(); + bg_compaction_scheduled_--; if (bg_compaction_scheduled_ == 0) { bg_cv_.SignalAll(); @@ -1875,10 +1923,11 @@ Status DBImpl::CompactFilesImpl( Status DBImpl::PauseBackgroundWork() { InstrumentedMutexLock guard_lock(&mutex_); - bg_work_paused_++; + bg_compaction_paused_++; while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_ > 0) { bg_cv_.Wait(); } + bg_work_paused_++; return Status::OK(); } @@ -1888,7 +1937,11 @@ Status DBImpl::ContinueBackgroundWork() { return Status::InvalidArgument(); } assert(bg_work_paused_ > 0); + assert(bg_compaction_paused_ > 0); + bg_compaction_paused_--; bg_work_paused_--; + // It's sufficient to check just bg_work_paused_ here since + // bg_work_paused_ is always no greater than bg_compaction_paused_ if (bg_work_paused_ == 0) { MaybeScheduleFlushOrCompaction(); } @@ -2188,6 +2241,9 @@ Status 
DBImpl::SyncWAL() { status = directories_.GetWalDir()->Fsync(); } + TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:1"); + TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2"); + { InstrumentedMutexLock l(&mutex_); MarkLogsSynced(current_log_number, need_log_dir_sync, status); @@ -2215,7 +2271,8 @@ void DBImpl::MarkLogsSynced( ++it; } } - assert(logs_.empty() || (logs_.size() == 1 && !logs_[0].getting_synced)); + assert(logs_.empty() || logs_[0].number > up_to || + (logs_.size() == 1 && !logs_[0].getting_synced)); log_sync_cv_.SignalAll(); } @@ -2453,25 +2510,32 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH, this); } + auto bg_compactions_allowed = BGCompactionsAllowed(); + // special case -- if max_background_flushes == 0, then schedule flush on a // compaction thread if (db_options_.max_background_flushes == 0) { while (unscheduled_flushes_ > 0 && bg_flush_scheduled_ + bg_compaction_scheduled_ < - db_options_.max_background_compactions) { + bg_compactions_allowed) { unscheduled_flushes_--; bg_flush_scheduled_++; env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::LOW, this); } } + if (bg_compaction_paused_ > 0) { + // we paused the background compaction + return; + } + if (HasExclusiveManualCompaction()) { // only manual compactions are allowed to run. 
don't schedule automatic // compactions return; } - while (bg_compaction_scheduled_ < db_options_.max_background_compactions && + while (bg_compaction_scheduled_ < bg_compactions_allowed && unscheduled_compactions_ > 0) { CompactionArg* ca = new CompactionArg; ca->db = this; @@ -2483,6 +2547,14 @@ void DBImpl::MaybeScheduleFlushOrCompaction() { } } +int DBImpl::BGCompactionsAllowed() const { + if (write_controller_.NeedSpeedupCompaction()) { + return db_options_.max_background_compactions; + } else { + return db_options_.base_background_compactions; + } +} + void DBImpl::AddToCompactionQueue(ColumnFamilyData* cfd) { assert(!cfd->pending_compaction()); cfd->Ref(); @@ -2595,10 +2667,10 @@ Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context, LogToBuffer( log_buffer, "Calling FlushMemTableToOutputFile with column " - "family [%s], flush slots available %d, compaction slots available %d", - cfd->GetName().c_str(), - db_options_.max_background_flushes - bg_flush_scheduled_, - db_options_.max_background_compactions - bg_compaction_scheduled_); + "family [%s], flush slots available %d, compaction slots allowed %d, " + "compaction slots scheduled %d", + cfd->GetName().c_str(), db_options_.max_background_flushes, + bg_flush_scheduled_, BGCompactionsAllowed() - bg_compaction_scheduled_); status = FlushMemTableToOutputFile(cfd, mutable_cf_options, made_progress, job_context, log_buffer); if (cfd->Unref()) { @@ -2911,7 +2983,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, TEST_SYNC_POINT("DBImpl::BackgroundCompaction:TrivialMove"); // Instrument for event update // TODO(yhchiang): add op details for showing trivial-move. 
- ThreadStatusUtil::SetColumnFamily(c->column_family_data()); + ThreadStatusUtil::SetColumnFamily( + c->column_family_data(), c->column_family_data()->ioptions()->env, + c->column_family_data()->options()->enable_thread_tracking); ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); compaction_job_stats.num_input_files = c->num_input_files(0); @@ -2980,8 +3054,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, CompactionJob compaction_job( job_context->job_id, c.get(), db_options_, env_options_, versions_.get(), &shutting_down_, log_buffer, directories_.GetDbDir(), - directories_.GetDataDir(c->output_path_id()), stats_, snapshot_seqs, - earliest_write_conflict_snapshot, table_cache_, &event_logger_, + directories_.GetDataDir(c->output_path_id()), stats_, &mutex_, + &bg_error_, snapshot_seqs, earliest_write_conflict_snapshot, + table_cache_, &event_logger_, c->mutable_cf_options()->paranoid_file_checks, c->mutable_cf_options()->compaction_measure_io_stats, dbname_, &compaction_job_stats); @@ -2992,7 +3067,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, TEST_SYNC_POINT("DBImpl::BackgroundCompaction:NonTrivial:AfterRun"); mutex_.Lock(); - status = compaction_job.Install(*c->mutable_cf_options(), &mutex_); + status = compaction_job.Install(*c->mutable_cf_options()); if (status.ok()) { InstallSuperVersionAndScheduleWorkWrapper( c->column_family_data(), job_context, *c->mutable_cf_options()); @@ -3294,13 +3369,19 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, LookupKey lkey(key, snapshot); PERF_TIMER_STOP(get_snapshot_time); - if (sv->mem->Get(lkey, value, &s, &merge_context)) { - // Done - RecordTick(stats_, MEMTABLE_HIT); - } else if (sv->imm->Get(lkey, value, &s, &merge_context)) { - // Done - RecordTick(stats_, MEMTABLE_HIT); - } else { + bool skip_memtable = + (read_options.read_tier == kPersistedTier && has_unpersisted_data_); + bool done = false; + if (!skip_memtable) { + if (sv->mem->Get(lkey, value, &s, 
&merge_context)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } else if (sv->imm->Get(lkey, value, &s, &merge_context)) { + done = true; + RecordTick(stats_, MEMTABLE_HIT); + } + } + if (!done) { PERF_TIMER_GUARD(get_from_output_files_time); sv->current->Get(read_options, lkey, value, &s, &merge_context, value_found); @@ -3314,6 +3395,7 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, RecordTick(stats_, NUMBER_KEYS_READ); RecordTick(stats_, BYTES_READ, value->size()); + MeasureTime(stats_, BYTES_PER_READ, value->size()); } return s; } @@ -3384,14 +3466,23 @@ std::vector DBImpl::MultiGet( assert(mgd_iter != multiget_cf_data.end()); auto mgd = mgd_iter->second; auto super_version = mgd->super_version; - if (super_version->mem->Get(lkey, value, &s, &merge_context)) { - // Done - } else if (super_version->imm->Get(lkey, value, &s, &merge_context)) { - // Done - } else { + bool skip_memtable = + (read_options.read_tier == kPersistedTier && has_unpersisted_data_); + bool done = false; + if (!skip_memtable) { + if (super_version->mem->Get(lkey, value, &s, &merge_context)) { + done = true; + // TODO(?): RecordTick(stats_, MEMTABLE_HIT)? + } else if (super_version->imm->Get(lkey, value, &s, &merge_context)) { + done = true; + // TODO(?): RecordTick(stats_, MEMTABLE_HIT)? + } + } + if (!done) { PERF_TIMER_GUARD(get_from_output_files_time); super_version->current->Get(read_options, lkey, value, &s, &merge_context); + // TODO(?): RecordTick(stats_, MEMTABLE_MISS)? 
} if (s.ok()) { @@ -3424,6 +3515,7 @@ std::vector DBImpl::MultiGet( RecordTick(stats_, NUMBER_MULTIGET_CALLS); RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys); RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read); + MeasureTime(stats_, BYTES_PER_MULTIGET, bytes_read); PERF_TIMER_STOP(get_post_process_time); return stat_list; @@ -3516,6 +3608,9 @@ Status DBImpl::AddFile(ColumnFamilyHandle* column_family, auto cfh = reinterpret_cast(column_family); ColumnFamilyData* cfd = cfh->cfd(); + if (file_info->num_entries == 0) { + return Status::InvalidArgument("File contain no entries"); + } if (file_info->version != 1) { return Status::InvalidArgument("Generated table version is not supported"); } @@ -3536,8 +3631,16 @@ Status DBImpl::AddFile(ColumnFamilyHandle* column_family, return Status::InvalidArgument( "Non zero sequence numbers are not supported"); } + // Generate a location for the new table - meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, file_info->file_size); + std::list::iterator pending_outputs_inserted_elem; + { + InstrumentedMutexLock l(&mutex_); + pending_outputs_inserted_elem = CaptureCurrentFileNumberInPendingOutputs(); + meta.fd = + FileDescriptor(versions_->NewFileNumber(), 0, file_info->file_size); + } + std::string db_fname = TableFileName( db_options_.db_paths, meta.fd.GetNumber(), meta.fd.GetPathId()); @@ -3550,6 +3653,7 @@ Status DBImpl::AddFile(ColumnFamilyHandle* column_family, } else { status = CopyFile(env_, file_info->file_path, db_fname, 0); } + TEST_SYNC_POINT("DBImpl::AddFile:FileCopied"); if (!status.ok()) { return status; } @@ -3613,6 +3717,7 @@ Status DBImpl::AddFile(ColumnFamilyHandle* column_family, delete InstallSuperVersionAndScheduleWork(cfd, nullptr, mutable_cf_options); } + ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem); } if (!status.ok()) { @@ -3826,6 +3931,10 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options, Iterator* DBImpl::NewIterator(const ReadOptions& read_options, 
ColumnFamilyHandle* column_family) { + if (read_options.read_tier == kPersistedTier) { + return NewErrorIterator(Status::NotSupported( + "ReadTier::kPersistedData is not yet supported in iterators.")); + } auto cfh = reinterpret_cast(column_family); auto cfd = cfh->cfd(); @@ -3857,8 +3966,8 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options, env_, *cfd->ioptions(), cfd->user_comparator(), iter, kMaxSequenceNumber, sv->mutable_cf_options.max_sequential_skip_in_iterations, - read_options.iterate_upper_bound, read_options.prefix_same_as_start, - read_options.pin_data); + sv->version_number, read_options.iterate_upper_bound, + read_options.prefix_same_as_start, read_options.pin_data); #endif } else { SequenceNumber latest_snapshot = versions_->LastSequence(); @@ -3915,8 +4024,8 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options, ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( env_, *cfd->ioptions(), cfd->user_comparator(), snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations, - read_options.iterate_upper_bound, read_options.prefix_same_as_start, - read_options.pin_data); + sv->version_number, read_options.iterate_upper_bound, + read_options.prefix_same_as_start, read_options.pin_data); InternalIterator* internal_iter = NewInternalIterator(read_options, cfd, sv, db_iter->GetArena()); @@ -3932,6 +4041,10 @@ Status DBImpl::NewIterators( const ReadOptions& read_options, const std::vector& column_families, std::vector* iterators) { + if (read_options.read_tier == kPersistedTier) { + return Status::NotSupported( + "ReadTier::kPersistedData is not yet supported in iterators."); + } iterators->clear(); iterators->reserve(column_families.size()); XFUNC_TEST("", "managed_new", managed_new1, xf_manage_new, @@ -3965,8 +4078,8 @@ Status DBImpl::NewIterators( iterators->push_back(NewDBIterator( env_, *cfd->ioptions(), cfd->user_comparator(), iter, kMaxSequenceNumber, - sv->mutable_cf_options.max_sequential_skip_in_iterations, 
nullptr, - false, read_options.pin_data)); + sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number, nullptr, false, read_options.pin_data)); } #endif } else { @@ -3985,8 +4098,8 @@ Status DBImpl::NewIterators( ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator( env_, *cfd->ioptions(), cfd->user_comparator(), snapshot, - sv->mutable_cf_options.max_sequential_skip_in_iterations, nullptr, - false, read_options.pin_data); + sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number, nullptr, false, read_options.pin_data); InternalIterator* internal_iter = NewInternalIterator(read_options, cfd, sv, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); @@ -4078,7 +4191,6 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, } Status status; - bool callback_failed = false; bool xfunc_attempted_write = false; XFUNC_TEST("transaction", "transaction_xftest_write_impl", @@ -4096,7 +4208,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, w.sync = write_options.sync; w.disableWAL = write_options.disableWAL; w.in_batch_group = false; - w.has_callback = (callback != nullptr) ? 
true : false; + w.callback = callback; if (!write_options.disableWAL) { RecordTick(stats_, WRITE_WITH_WAL); @@ -4109,28 +4221,32 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // we are a non-leader in a parallel group PERF_TIMER_GUARD(write_memtable_time); - ColumnFamilyMemTablesImpl column_family_memtables( - versions_->GetColumnFamilySet()); - WriteBatchInternal::SetSequence(w.batch, w.sequence); - w.status = WriteBatchInternal::InsertInto( - w.batch, &column_family_memtables, &flush_scheduler_, - write_options.ignore_missing_column_families, 0 /*log_number*/, this, - true /*dont_filter_deletes*/, true /*concurrent_memtable_writes*/); + if (!w.CallbackFailed()) { + ColumnFamilyMemTablesImpl column_family_memtables( + versions_->GetColumnFamilySet()); + WriteBatchInternal::SetSequence(w.batch, w.sequence); + w.status = WriteBatchInternal::InsertInto( + w.batch, &column_family_memtables, &flush_scheduler_, + write_options.ignore_missing_column_families, 0 /*log_number*/, this, + true /*dont_filter_deletes*/, true /*concurrent_memtable_writes*/); + } if (write_thread_.CompleteParallelWorker(&w)) { // we're responsible for early exit - auto last_sequence = w.parallel_group->last_writer->sequence; + auto last_sequence = w.parallel_group->last_sequence; SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence); versions_->SetLastSequence(last_sequence); write_thread_.EarlyExitParallelGroup(&w); } assert(w.state == WriteThread::STATE_COMPLETED); // STATE_COMPLETED conditional below handles exit + + status = w.FinalStatus(); } if (w.state == WriteThread::STATE_COMPLETED) { // write is complete and leader has updated sequence RecordTick(stats_, WRITE_DONE_BY_OTHER); - return w.status; + return w.FinalStatus(); } // else we are the leader of the write batch group assert(w.state == WriteThread::STATE_GROUP_LEADER); @@ -4236,7 +4352,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, uint64_t last_sequence = versions_->LastSequence(); 
WriteThread::Writer* last_writer = &w; - autovector write_batch_group; + autovector write_group; bool need_log_sync = !write_options.disableWAL && write_options.sync; bool need_log_dir_sync = need_log_sync && !log_dir_synced_; @@ -4255,24 +4371,15 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // during this phase since &w is currently responsible for logging // and protects against concurrent loggers and concurrent writes // into memtables - - mutex_.Unlock(); - - if (callback != nullptr) { - // If this write has a validation callback, check to see if this write - // is able to be written. Must be called on the write thread. - status = callback->Callback(this); - callback_failed = true; - } - } else { - mutex_.Unlock(); } + mutex_.Unlock(); + // At this point the mutex is unlocked bool exit_completed_early = false; - last_batch_group_size_ = write_thread_.EnterAsBatchGroupLeader( - &w, &last_writer, &write_batch_group); + last_batch_group_size_ = + write_thread_.EnterAsBatchGroupLeader(&w, &last_writer, &write_group); if (status.ok()) { // Rules for when we can update the memtable concurrently @@ -4288,15 +4395,17 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // assumed to be true. Rule 4 is checked for each batch. We could // relax rules 2 and 3 if we could prevent write batches from referring // more than once to a particular key. 
- bool parallel = db_options_.allow_concurrent_memtable_write && - write_batch_group.size() > 1; + bool parallel = + db_options_.allow_concurrent_memtable_write && write_group.size() > 1; int total_count = 0; uint64_t total_byte_size = 0; - for (auto b : write_batch_group) { - total_count += WriteBatchInternal::Count(b); - total_byte_size = WriteBatchInternal::AppendedByteSize( - total_byte_size, WriteBatchInternal::ByteSize(b)); - parallel = parallel && !b->HasMerge(); + for (auto writer : write_group) { + if (writer->CheckCallback(this)) { + total_count += WriteBatchInternal::Count(writer->batch); + total_byte_size = WriteBatchInternal::AppendedByteSize( + total_byte_size, WriteBatchInternal::ByteSize(writer->batch)); + parallel = parallel && !writer->batch->HasMerge(); + } } const SequenceNumber current_sequence = last_sequence + 1; @@ -4305,10 +4414,11 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // Record statistics RecordTick(stats_, NUMBER_KEYS_WRITTEN, total_count); RecordTick(stats_, BYTES_WRITTEN, total_byte_size); + MeasureTime(stats_, BYTES_PER_WRITE, total_byte_size); PERF_TIMER_STOP(write_pre_and_post_process_time); if (write_options.disableWAL) { - flush_on_destroy_ = true; + has_unpersisted_data_ = true; } uint64_t log_size = 0; @@ -4316,21 +4426,22 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, PERF_TIMER_GUARD(write_wal_time); WriteBatch* merged_batch = nullptr; - if (write_batch_group.size() == 1) { - merged_batch = write_batch_group[0]; + if (write_group.size() == 1 && !write_group[0]->CallbackFailed()) { + merged_batch = write_group[0]->batch; } else { // WAL needs all of the batches flattened into a single batch. 
// We could avoid copying here with an iov-like AddRecord // interface merged_batch = &tmp_batch_; - for (auto b : write_batch_group) { - WriteBatchInternal::Append(merged_batch, b); + for (auto writer : write_group) { + if (!writer->CallbackFailed()) { + WriteBatchInternal::Append(merged_batch, writer->batch); + } } } WriteBatchInternal::SetSequence(merged_batch, current_sequence); assert(WriteBatchInternal::Count(merged_batch) == total_count); - assert(WriteBatchInternal::ByteSize(merged_batch) == total_byte_size); Slice log_entry = WriteBatchInternal::Contents(merged_batch); status = logs_.back().writer->AddRecord(log_entry); @@ -4385,7 +4496,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, } stats->AddDBStats(InternalStats::WAL_FILE_BYTES, log_size); } - uint64_t for_other = write_batch_group.size() - 1; + uint64_t for_other = write_group.size() - 1; if (for_other > 0) { stats->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER, for_other); if (!write_options.disableWAL) { @@ -4396,41 +4507,50 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, if (!parallel) { status = WriteBatchInternal::InsertInto( - write_batch_group, current_sequence, column_family_memtables_.get(), + write_group, current_sequence, column_family_memtables_.get(), &flush_scheduler_, write_options.ignore_missing_column_families, 0 /*log_number*/, this, false /*dont_filter_deletes*/); + + if (status.ok()) { + // There were no write failures. Set leader's status + // in case the write callback returned a non-ok status. 
+ status = w.FinalStatus(); + } + } else { WriteThread::ParallelGroup pg; pg.leader = &w; pg.last_writer = last_writer; + pg.last_sequence = last_sequence; pg.early_exit_allowed = !need_log_sync; - pg.running.store(static_cast(write_batch_group.size()), + pg.running.store(static_cast(write_group.size()), std::memory_order_relaxed); write_thread_.LaunchParallelFollowers(&pg, current_sequence); - ColumnFamilyMemTablesImpl column_family_memtables( - versions_->GetColumnFamilySet()); - assert(w.sequence == current_sequence); - WriteBatchInternal::SetSequence(w.batch, w.sequence); - w.status = WriteBatchInternal::InsertInto( - w.batch, &column_family_memtables, &flush_scheduler_, - write_options.ignore_missing_column_families, 0 /*log_number*/, - this, true /*dont_filter_deletes*/, - true /*concurrent_memtable_writes*/); - - assert(last_writer->sequence == last_sequence); + if (!w.CallbackFailed()) { + // do leader write + ColumnFamilyMemTablesImpl column_family_memtables( + versions_->GetColumnFamilySet()); + assert(w.sequence == current_sequence); + WriteBatchInternal::SetSequence(w.batch, w.sequence); + w.status = WriteBatchInternal::InsertInto( + w.batch, &column_family_memtables, &flush_scheduler_, + write_options.ignore_missing_column_families, 0 /*log_number*/, + this, true /*dont_filter_deletes*/, + true /*concurrent_memtable_writes*/); + } + // CompleteParallelWorker returns true if this thread should // handle exit, false means somebody else did exit_completed_early = !write_thread_.CompleteParallelWorker(&w); - status = w.status; - assert(status.ok() || !exit_completed_early); + status = w.FinalStatus(); } - if (status.ok() && !exit_completed_early) { + if (!exit_completed_early && w.status.ok()) { SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence); versions_->SetLastSequence(last_sequence); if (!need_log_sync) { - write_thread_.ExitAsBatchGroupLeader(&w, last_writer, status); + write_thread_.ExitAsBatchGroupLeader(&w, last_writer, w.status); 
exit_completed_early = true; } } @@ -4443,14 +4563,14 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, // // Is setting bg_error_ enough here? This will at least stop // compaction and fail any further writes. - if (!status.ok() && bg_error_.ok()) { + if (!status.ok() && bg_error_.ok() && !w.CallbackFailed()) { bg_error_ = status; } } } PERF_TIMER_START(write_pre_and_post_process_time); - if (db_options_.paranoid_checks && !status.ok() && !callback_failed && + if (db_options_.paranoid_checks && !status.ok() && !w.CallbackFailed() && !status.IsBusy()) { mutex_.Lock(); if (bg_error_.ok()) { @@ -4466,7 +4586,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, } if (!exit_completed_early) { - write_thread_.ExitAsBatchGroupLeader(&w, last_writer, status); + write_thread_.ExitAsBatchGroupLeader(&w, last_writer, w.status); } return status; @@ -4678,53 +4798,51 @@ const DBOptions& DBImpl::GetDBOptions() const { return db_options_; } bool DBImpl::GetProperty(ColumnFamilyHandle* column_family, const Slice& property, std::string* value) { - bool is_int_property = false; - bool need_out_of_mutex = false; - DBPropertyType property_type = - GetPropertyType(property, &is_int_property, &need_out_of_mutex); - + const DBPropertyInfo* property_info = GetPropertyInfo(property); value->clear(); auto cfd = reinterpret_cast(column_family)->cfd(); - if (is_int_property) { + if (property_info == nullptr) { + return false; + } else if (property_info->handle_int) { uint64_t int_value; - bool ret_value = GetIntPropertyInternal( - cfd, property_type, need_out_of_mutex, false, &int_value); + bool ret_value = + GetIntPropertyInternal(cfd, *property_info, false, &int_value); if (ret_value) { *value = ToString(int_value); } return ret_value; - } else { + } else if (property_info->handle_string) { InstrumentedMutexLock l(&mutex_); - return cfd->internal_stats()->GetStringProperty(property_type, property, + return cfd->internal_stats()->GetStringProperty(*property_info, 
property, value); } + // Shouldn't reach here since exactly one of handle_string and handle_int + // should be non-nullptr. + assert(false); + return false; } bool DBImpl::GetIntProperty(ColumnFamilyHandle* column_family, const Slice& property, uint64_t* value) { - bool is_int_property = false; - bool need_out_of_mutex = false; - DBPropertyType property_type = - GetPropertyType(property, &is_int_property, &need_out_of_mutex); - if (!is_int_property) { + const DBPropertyInfo* property_info = GetPropertyInfo(property); + if (property_info == nullptr || property_info->handle_int == nullptr) { return false; } auto cfd = reinterpret_cast(column_family)->cfd(); - return GetIntPropertyInternal(cfd, property_type, need_out_of_mutex, false, - value); + return GetIntPropertyInternal(cfd, *property_info, false, value); } bool DBImpl::GetIntPropertyInternal(ColumnFamilyData* cfd, - DBPropertyType property_type, - bool need_out_of_mutex, bool is_locked, - uint64_t* value) { - if (!need_out_of_mutex) { + const DBPropertyInfo& property_info, + bool is_locked, uint64_t* value) { + assert(property_info.handle_int != nullptr); + if (!property_info.need_out_of_mutex) { if (is_locked) { mutex_.AssertHeld(); - return cfd->internal_stats()->GetIntProperty(property_type, value, this); + return cfd->internal_stats()->GetIntProperty(property_info, value, this); } else { InstrumentedMutexLock l(&mutex_); - return cfd->internal_stats()->GetIntProperty(property_type, value, this); + return cfd->internal_stats()->GetIntProperty(property_info, value, this); } } else { SuperVersion* sv = nullptr; @@ -4735,7 +4853,7 @@ bool DBImpl::GetIntPropertyInternal(ColumnFamilyData* cfd, } bool ret = cfd->internal_stats()->GetIntPropertyOutOfMutex( - property_type, sv->current, value); + property_info, sv->current, value); if (!is_locked) { ReturnAndCleanupSuperVersion(cfd, sv); @@ -4747,11 +4865,8 @@ bool DBImpl::GetIntPropertyInternal(ColumnFamilyData* cfd, bool DBImpl::GetAggregatedIntProperty(const 
Slice& property, uint64_t* aggregated_value) { - bool need_out_of_mutex; - bool is_int_property; - DBPropertyType property_type = - GetPropertyType(property, &is_int_property, &need_out_of_mutex); - if (!is_int_property) { + const DBPropertyInfo* property_info = GetPropertyInfo(property); + if (property_info == nullptr || property_info->handle_int == nullptr) { return false; } @@ -4761,8 +4876,7 @@ bool DBImpl::GetAggregatedIntProperty(const Slice& property, InstrumentedMutexLock l(&mutex_); uint64_t value; for (auto* cfd : *versions_->GetColumnFamilySet()) { - if (GetIntPropertyInternal(cfd, property_type, need_out_of_mutex, true, - &value)) { + if (GetIntPropertyInternal(cfd, *property_info, true, &value)) { sum += value; } else { return false; @@ -5414,6 +5528,25 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, } impl->mutex_.Unlock(); + auto sfm = static_cast( + impl->db_options_.sst_file_manager.get()); + if (s.ok() && sfm) { + // Notify SstFileManager about all sst files that already exist in + // db_paths[0] when the DB is opened. 
+ auto& db_path = impl->db_options_.db_paths[0]; + std::vector existing_files; + impl->db_options_.env->GetChildren(db_path.path, &existing_files); + for (auto& file_name : existing_files) { + uint64_t file_number; + FileType file_type; + std::string file_path = db_path.path + "/" + file_name; + if (ParseFileName(file_name, &file_number, &file_type) && + file_type == kTableFile) { + sfm->OnAddFile(file_path); + } + } + } + if (s.ok()) { Log(InfoLogLevel::INFO_LEVEL, impl->db_options_.info_log, "DB pointer %p", impl); @@ -5473,7 +5606,7 @@ Status DestroyDB(const std::string& dbname, const Options& options) { if (type == kMetaDatabase) { del = DestroyDB(path_to_delete, options); } else if (type == kTableFile) { - del = DeleteOrMoveToTrash(&options, path_to_delete); + del = DeleteSSTFile(&options, path_to_delete, 0); } else { del = env->DeleteFile(path_to_delete); } @@ -5489,13 +5622,9 @@ Status DestroyDB(const std::string& dbname, const Options& options) { for (size_t i = 0; i < filenames.size(); i++) { if (ParseFileName(filenames[i], &number, &type) && type == kTableFile) { // Lock file will be deleted at end - Status del; std::string table_path = db_path.path + "/" + filenames[i]; - if (path_id == 0) { - del = DeleteOrMoveToTrash(&options, table_path); - } else { - del = env->DeleteFile(table_path); - } + Status del = DeleteSSTFile(&options, table_path, + static_cast(path_id)); if (result.ok() && !del.ok()) { result = del; } @@ -5650,7 +5779,8 @@ Status DBImpl::RenameTempFileToOptionsFile(const std::string& file_name) { void DBImpl::NewThreadStatusCfInfo( ColumnFamilyData* cfd) const { if (db_options_.enable_thread_tracking) { - ThreadStatusUtil::NewColumnFamilyInfo(this, cfd); + ThreadStatusUtil::NewColumnFamilyInfo(this, cfd, cfd->GetName(), + cfd->ioptions()->env); } } diff --git a/db/db_impl.h b/db/db_impl.h index 683fd49dc..b9efb775b 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -347,6 +347,10 @@ class DBImpl : public DB { #endif // NDEBUG + // Return maximum background compaction allowed to be scheduled based on + // compaction status. + int BGCompactionsAllowed() const; + // Returns the list of live files in 'live' and the list // of all files in the filesystem in 'candidate_files'. // If force == false and the last call was less than @@ -490,10 +494,11 @@ class DBImpl : public DB { // Background process needs to call // auto x = CaptureCurrentFileNumberInPendingOutputs() + // auto file_num = versions_->NewFileNumber(); // <do something> // ReleaseFileNumberFromPendingOutputs(x) - // This will protect any temporary files created while <do something> is - // executing from being deleted. + // This will protect any file with number `file_num` or greater from being + // deleted while <do something> is running. + // ----------- + // This function will capture current file number and append it to + // pending_outputs_. This will prevent any background process to delete any @@ -818,7 +823,10 @@ class DBImpl : public DB { // they're unique std::atomic next_job_id_; - bool flush_on_destroy_; // Used when disableWAL is true. + // A flag indicating whether the current rocksdb database has any + // data that is not yet persisted into either WAL or SST file. + // Used when disableWAL is true.
+ bool has_unpersisted_data_; static const int KEEP_LOG_FILE_NUM = 1000; // MSVC version 1800 still does not have constexpr for ::max() @@ -836,9 +844,12 @@ class DBImpl : public DB { // Unified interface for logging events EventLogger event_logger_; - // A value of >0 temporarily disables scheduling of background work + // A value of > 0 temporarily disables scheduling of background work int bg_work_paused_; + // A value of > 0 temporarily disables scheduling of background compaction + int bg_compaction_paused_; + // Guard against multiple concurrent refitting bool refitting_level_; @@ -889,9 +900,8 @@ class DBImpl : public DB { bool* value_found = nullptr); bool GetIntPropertyInternal(ColumnFamilyData* cfd, - DBPropertyType property_type, - bool need_out_of_mutex, bool is_locked, - uint64_t* value); + const DBPropertyInfo& property_info, + bool is_locked, uint64_t* value); bool HasPendingManualCompaction(); bool HasExclusiveManualCompaction(); diff --git a/db/db_impl_debug.cc b/db/db_impl_debug.cc index e494c4ee5..af4553f89 100644 --- a/db/db_impl_debug.cc +++ b/db/db_impl_debug.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/db_impl_experimental.cc b/db/db_impl_experimental.cc index 6bf0ba6a1..af3663e60 100644 --- a/db/db_impl_experimental.cc +++ b/db/db_impl_experimental.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index 618ade8c9..57c14df14 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -54,10 +54,11 @@ Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options, auto db_iter = NewArenaWrappedDbIterator( env_, *cfd->ioptions(), cfd->user_comparator(), (read_options.snapshot != nullptr - ? reinterpret_cast( - read_options.snapshot)->number_ + ? reinterpret_cast(read_options.snapshot) + ->number_ : latest_snapshot), - super_version->mutable_cf_options.max_sequential_skip_in_iterations); + super_version->mutable_cf_options.max_sequential_skip_in_iterations, + super_version->version_number); auto internal_iter = NewInternalIterator( read_options, cfd, super_version, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); @@ -81,10 +82,11 @@ Status DBImplReadOnly::NewIterators( auto* db_iter = NewArenaWrappedDbIterator( env_, *cfd->ioptions(), cfd->user_comparator(), (read_options.snapshot != nullptr - ? reinterpret_cast( - read_options.snapshot)->number_ - : latest_snapshot), - sv->mutable_cf_options.max_sequential_skip_in_iterations); + ? 
reinterpret_cast(read_options.snapshot) + ->number_ + : latest_snapshot), + sv->mutable_cf_options.max_sequential_skip_in_iterations, + sv->version_number); auto* internal_iter = NewInternalIterator( read_options, cfd, sv, db_iter->GetArena()); db_iter->SetIterUnderDBIter(internal_iter); diff --git a/db/db_impl_readonly.h b/db/db_impl_readonly.h index 8f3103aac..a410a4e32 100644 --- a/db/db_impl_readonly.h +++ b/db/db_impl_readonly.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/db_info_dumper.cc b/db/db_info_dumper.cc similarity index 97% rename from util/db_info_dumper.cc rename to db/db_info_dumper.cc index 6cb978fbb..56cf3e288 100644 --- a/util/db_info_dumper.cc +++ b/db/db_info_dumper.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -7,6 +7,8 @@ #define __STDC_FORMAT_MACROS #endif +#include "db/db_info_dumper.h" + #include #include #include @@ -16,7 +18,6 @@ #include "db/filename.h" #include "rocksdb/options.h" #include "rocksdb/env.h" -#include "util/db_info_dumper.h" namespace rocksdb { diff --git a/util/db_info_dumper.h b/db/db_info_dumper.h similarity index 85% rename from util/db_info_dumper.h rename to db/db_info_dumper.h index ed0a63ded..470b6224f 100644 --- a/util/db_info_dumper.h +++ b/db/db_info_dumper.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/db_inplace_update_test.cc b/db/db_inplace_update_test.cc index 046ddead4..2c15a5f14 100644 --- a/db/db_inplace_update_test.cc +++ b/db/db_inplace_update_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/db_iter.cc b/db/db_iter.cc index 10e9658cc..256b65447 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -13,18 +13,19 @@ #include #include -#include "db/filename.h" #include "db/dbformat.h" +#include "db/filename.h" #include "port/port.h" #include "rocksdb/env.h" -#include "rocksdb/options.h" #include "rocksdb/iterator.h" #include "rocksdb/merge_operator.h" +#include "rocksdb/options.h" #include "table/internal_iterator.h" #include "util/arena.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/perf_context_imp.h" +#include "util/string_util.h" namespace rocksdb { @@ -59,9 +60,47 @@ class DBIter: public Iterator { kReverse }; + // LocalStatistics contain Statistics counters that will be aggregated per + // each iterator instance and then will be sent to the global statistics when + // the iterator is destroyed. + // + // The purpose of this approach is to avoid perf regression happening + // when multiple threads bump the atomic counters from a DBIter::Next(). + struct LocalStatistics { + explicit LocalStatistics() { ResetCounters(); } + + void ResetCounters() { + next_count_ = 0; + next_found_count_ = 0; + prev_count_ = 0; + prev_found_count_ = 0; + bytes_read_ = 0; + } + + void BumpGlobalStatistics(Statistics* global_statistics) { + RecordTick(global_statistics, NUMBER_DB_NEXT, next_count_); + RecordTick(global_statistics, NUMBER_DB_NEXT_FOUND, next_found_count_); + RecordTick(global_statistics, NUMBER_DB_PREV, prev_count_); + RecordTick(global_statistics, NUMBER_DB_PREV_FOUND, prev_found_count_); + RecordTick(global_statistics, ITER_BYTES_READ, bytes_read_); + ResetCounters(); + } + + // Map to Tickers::NUMBER_DB_NEXT + uint64_t next_count_; + // Map to Tickers::NUMBER_DB_NEXT_FOUND + uint64_t next_found_count_; + // Map to Tickers::NUMBER_DB_PREV + uint64_t prev_count_; + // Map to Tickers::NUMBER_DB_PREV_FOUND + uint64_t prev_found_count_; + // Map to Tickers::ITER_BYTES_READ + uint64_t bytes_read_; + }; + DBIter(Env* env, const ImmutableCFOptions& ioptions, const Comparator* cmp, InternalIterator* iter, SequenceNumber s, bool arena_mode, 
- uint64_t max_sequential_skip_in_iterations, + uint64_t max_sequential_skip_in_iterations, uint64_t version_number, const Slice* iterate_upper_bound = nullptr, bool prefix_same_as_start = false) : arena_mode_(arena_mode), @@ -75,6 +114,7 @@ class DBIter: public Iterator { valid_(false), current_entry_is_merged_(false), statistics_(ioptions.statistics), + version_number_(version_number), iterate_upper_bound_(iterate_upper_bound), prefix_same_as_start_(prefix_same_as_start), iter_pinned_(false) { @@ -84,6 +124,7 @@ class DBIter: public Iterator { } virtual ~DBIter() { RecordTick(statistics_, NO_ITERATORS, -1); + local_stats_.BumpGlobalStatistics(statistics_); if (!arena_mode_) { delete iter_; } else { @@ -136,9 +177,27 @@ class DBIter: public Iterator { } return s; } - virtual bool IsKeyPinned() const override { - assert(valid_); - return iter_pinned_ && saved_key_.IsKeyPinned(); + + virtual Status GetProperty(std::string prop_name, + std::string* prop) override { + if (prop == nullptr) { + return Status::InvalidArgument("prop is nullptr"); + } + if (prop_name == "rocksdb.iterator.super-version-number") { + // First try to pass the value returned from inner iterator. + if (!iter_->GetProperty(prop_name, prop).ok()) { + *prop = ToString(version_number_); + } + return Status::OK(); + } else if (prop_name == "rocksdb.iterator.is-key-pinned") { + if (valid_) { + *prop = (iter_pinned_ && saved_key_.IsKeyPinned()) ? "1" : "0"; + } else { + *prop = "Iterator is not valid."; + } + return Status::OK(); + } + return Status::InvalidArgument("Undentified property."); } virtual void Next() override; @@ -186,12 +245,14 @@ class DBIter: public Iterator { bool current_entry_is_merged_; Statistics* statistics_; uint64_t max_skip_; + uint64_t version_number_; const Slice* iterate_upper_bound_; IterKey prefix_start_; bool prefix_same_as_start_; bool iter_pinned_; // List of operands for merge operator. 
std::deque merge_operands_; + LocalStatistics local_stats_; // No copying allowed DBIter(const DBIter&); @@ -229,6 +290,9 @@ void DBIter::Next() { PERF_COUNTER_ADD(internal_key_skipped_count, 1); } + if (statistics_ != nullptr) { + local_stats_.next_count_++; + } // Now we point to the next internal position, for both of merge and // not merge cases. if (!iter_->Valid()) { @@ -236,18 +300,15 @@ void DBIter::Next() { return; } FindNextUserEntry(true /* skipping the current user key */); - if (statistics_ != nullptr) { - RecordTick(statistics_, NUMBER_DB_NEXT); - if (valid_) { - RecordTick(statistics_, NUMBER_DB_NEXT_FOUND); - RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); - } - } if (valid_ && prefix_extractor_ && prefix_same_as_start_ && prefix_extractor_->Transform(saved_key_.GetKey()) .compare(prefix_start_.GetKey()) != 0) { valid_ = false; } + if (statistics_ != nullptr && valid_) { + local_stats_.next_found_count_++; + local_stats_.bytes_read_ += (key().size() + value().size()); + } } // PRE: saved_key_ has the current user key if skipping @@ -275,7 +336,7 @@ void DBIter::FindNextUserEntryInternal(bool skipping) { if (ParseKey(&ikey)) { if (iterate_upper_bound_ != nullptr && - ikey.user_key.compare(*iterate_upper_bound_) >= 0) { + user_comparator_->Compare(ikey.user_key, *iterate_upper_bound_) >= 0) { break; } @@ -415,10 +476,10 @@ void DBIter::Prev() { } PrevInternal(); if (statistics_ != nullptr) { - RecordTick(statistics_, NUMBER_DB_PREV); + local_stats_.prev_count_++; if (valid_) { - RecordTick(statistics_, NUMBER_DB_PREV_FOUND); - RecordTick(statistics_, ITER_BYTES_READ, key().size() + value().size()); + local_stats_.prev_found_count_++; + local_stats_.bytes_read_ += (key().size() + value().size()); } } if (valid_ && prefix_extractor_ && prefix_same_as_start_ && @@ -818,12 +879,13 @@ Iterator* NewDBIterator(Env* env, const ImmutableCFOptions& ioptions, InternalIterator* internal_iter, const SequenceNumber& sequence, uint64_t 
max_sequential_skip_in_iterations, + uint64_t version_number, const Slice* iterate_upper_bound, bool prefix_same_as_start, bool pin_data) { DBIter* db_iter = new DBIter(env, ioptions, user_key_comparator, internal_iter, sequence, - false, max_sequential_skip_in_iterations, iterate_upper_bound, - prefix_same_as_start); + false, max_sequential_skip_in_iterations, version_number, + iterate_upper_bound, prefix_same_as_start); if (pin_data) { db_iter->PinData(); } @@ -850,12 +912,13 @@ inline Slice ArenaWrappedDBIter::key() const { return db_iter_->key(); } inline Slice ArenaWrappedDBIter::value() const { return db_iter_->value(); } inline Status ArenaWrappedDBIter::status() const { return db_iter_->status(); } inline Status ArenaWrappedDBIter::PinData() { return db_iter_->PinData(); } +inline Status ArenaWrappedDBIter::GetProperty(std::string prop_name, + std::string* prop) { + return db_iter_->GetProperty(prop_name, prop); +} inline Status ArenaWrappedDBIter::ReleasePinnedData() { return db_iter_->ReleasePinnedData(); } -inline bool ArenaWrappedDBIter::IsKeyPinned() const { - return db_iter_->IsKeyPinned(); -} void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1, void* arg2) { db_iter_->RegisterCleanup(function, arg1, arg2); @@ -864,7 +927,7 @@ void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1, ArenaWrappedDBIter* NewArenaWrappedDbIterator( Env* env, const ImmutableCFOptions& ioptions, const Comparator* user_key_comparator, const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, + uint64_t max_sequential_skip_in_iterations, uint64_t version_number, const Slice* iterate_upper_bound, bool prefix_same_as_start, bool pin_data) { ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); @@ -872,7 +935,7 @@ ArenaWrappedDBIter* NewArenaWrappedDbIterator( auto mem = arena->AllocateAligned(sizeof(DBIter)); DBIter* db_iter = new (mem) DBIter(env, ioptions, user_key_comparator, nullptr, sequence, - true, 
max_sequential_skip_in_iterations, + true, max_sequential_skip_in_iterations, version_number, iterate_upper_bound, prefix_same_as_start); iter->SetDBIter(db_iter); diff --git a/db/db_iter.h b/db/db_iter.h index ba52c8cf9..f239d2984 100644 --- a/db/db_iter.h +++ b/db/db_iter.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -9,6 +9,7 @@ #pragma once #include +#include #include "rocksdb/db.h" #include "rocksdb/iterator.h" #include "db/dbformat.h" @@ -24,14 +25,12 @@ class InternalIterator; // Return a new iterator that converts internal keys (yielded by // "*internal_iter") that were live at the specified "sequence" number // into appropriate user keys. -extern Iterator* NewDBIterator(Env* env, const ImmutableCFOptions& options, - const Comparator* user_key_comparator, - InternalIterator* internal_iter, - const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, - const Slice* iterate_upper_bound = nullptr, - bool prefix_same_as_start = false, - bool pin_data = false); +extern Iterator* NewDBIterator( + Env* env, const ImmutableCFOptions& options, + const Comparator* user_key_comparator, InternalIterator* internal_iter, + const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations, + uint64_t version_number, const Slice* iterate_upper_bound = nullptr, + bool prefix_same_as_start = false, bool pin_data = false); // A wrapper iterator which wraps DB Iterator and the arena, with which the DB // iterator is supposed be allocated. 
This class is used as an entry point of @@ -66,7 +65,7 @@ class ArenaWrappedDBIter : public Iterator { void RegisterCleanup(CleanupFunction function, void* arg1, void* arg2); virtual Status PinData(); virtual Status ReleasePinnedData(); - virtual bool IsKeyPinned() const override; + virtual Status GetProperty(std::string prop_name, std::string* prop) override; private: DBIter* db_iter_; @@ -77,7 +76,7 @@ class ArenaWrappedDBIter : public Iterator { extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( Env* env, const ImmutableCFOptions& options, const Comparator* user_key_comparator, const SequenceNumber& sequence, - uint64_t max_sequential_skip_in_iterations, + uint64_t max_sequential_skip_in_iterations, uint64_t version_number, const Slice* iterate_upper_bound = nullptr, bool prefix_same_as_start = false, bool pin_data = false); diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index f1e3324d8..3b3030110 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -181,10 +181,9 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { internal_iter->AddPut("b", "val_b"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 10, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -215,7 +214,7 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + 10, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -241,7 +240,8 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound)); + 10, options.max_sequential_skip_in_iterations, 0, + ro.iterate_upper_bound)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -273,7 +273,8 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound)); + 10, options.max_sequential_skip_in_iterations, 0, + ro.iterate_upper_bound)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -308,7 +309,8 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound)); + 10, options.max_sequential_skip_in_iterations, 0, + ro.iterate_upper_bound)); db_iter->SeekToLast(); 
ASSERT_TRUE(!db_iter->Valid()); @@ -337,7 +339,8 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 7, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound)); + 7, options.max_sequential_skip_in_iterations, 0, + ro.iterate_upper_bound)); SetPerfLevel(kEnableCount); ASSERT_TRUE(GetPerfLevel() == kEnableCount); @@ -374,7 +377,8 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 4, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound)); + 4, options.max_sequential_skip_in_iterations, 0, + ro.iterate_upper_bound)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -399,7 +403,8 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound)); + 10, options.max_sequential_skip_in_iterations, 0, + ro.iterate_upper_bound)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); @@ -421,7 +426,8 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound)); + 10, options.max_sequential_skip_in_iterations, 0, + ro.iterate_upper_bound)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -456,7 +462,8 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 7, options.max_sequential_skip_in_iterations, ro.iterate_upper_bound)); + 7, options.max_sequential_skip_in_iterations, 0, + ro.iterate_upper_bound)); SetPerfLevel(kEnableCount); ASSERT_TRUE(GetPerfLevel() == kEnableCount); @@ 
-482,10 +489,9 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { internal_iter->AddPut("b", "val_b"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 10, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); @@ -524,10 +530,9 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { internal_iter->AddPut("b", "val_b"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 2, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -555,10 +560,9 @@ TEST_F(DBIteratorTest, DBIteratorPrevNext) { internal_iter->AddPut("c", "val_c"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 10, - options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 10, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -583,10 +587,9 @@ TEST_F(DBIteratorTest, DBIteratorEmpty) { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 0, - options.max_sequential_skip_in_iterations)); + std::unique_ptr 
db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 0, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); } @@ -595,10 +598,9 @@ TEST_F(DBIteratorTest, DBIteratorEmpty) { TestIterator* internal_iter = new TestIterator(BytewiseComparator()); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 0, - options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 0, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToFirst(); ASSERT_TRUE(!db_iter->Valid()); } @@ -617,10 +619,9 @@ TEST_F(DBIteratorTest, DBIteratorUseSkipCountSkips) { } internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 2, + options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -659,9 +660,8 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { options.statistics = rocksdb::CreateDBStatistics(); std::unique_ptr db_iter(NewDBIterator( - env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, i + 2, - options.max_sequential_skip_in_iterations)); + env_, ImmutableCFOptions(options), BytewiseComparator(), + internal_iter, i + 2, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -695,9 +695,8 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, i + 2, - 
options.max_sequential_skip_in_iterations)); + env_, ImmutableCFOptions(options), BytewiseComparator(), + internal_iter, i + 2, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -724,9 +723,8 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 202, - options.max_sequential_skip_in_iterations)); + env_, ImmutableCFOptions(options), BytewiseComparator(), + internal_iter, 202, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -756,10 +754,9 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { } internal_iter->AddPut("c", "200"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, i, - options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), + internal_iter, i, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); @@ -773,10 +770,9 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { } internal_iter->AddPut("c", "200"); internal_iter->Finish(); - std::unique_ptr db_iter( - NewDBIterator(env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, 200, - options.max_sequential_skip_in_iterations)); + std::unique_ptr db_iter(NewDBIterator( + env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, + 200, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -809,9 +805,8 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, i + 2, - 
options.max_sequential_skip_in_iterations)); + env_, ImmutableCFOptions(options), BytewiseComparator(), + internal_iter, i + 2, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -844,9 +839,8 @@ TEST_F(DBIteratorTest, DBIteratorUseSkip) { internal_iter->Finish(); std::unique_ptr db_iter(NewDBIterator( - env_, ImmutableCFOptions(options), - BytewiseComparator(), internal_iter, i + 2, - options.max_sequential_skip_in_iterations)); + env_, ImmutableCFOptions(options), BytewiseComparator(), + internal_iter, i + 2, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -887,7 +881,7 @@ TEST_F(DBIteratorTest, DBIterator1) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 1, - options.max_sequential_skip_in_iterations)); + options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -913,7 +907,7 @@ TEST_F(DBIteratorTest, DBIterator2) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 0, - options.max_sequential_skip_in_iterations)); + options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -936,7 +930,7 @@ TEST_F(DBIteratorTest, DBIterator3) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 2, - options.max_sequential_skip_in_iterations)); + options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -958,7 +952,7 @@ TEST_F(DBIteratorTest, DBIterator4) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 4, - options.max_sequential_skip_in_iterations)); + 
options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -987,7 +981,7 @@ TEST_F(DBIteratorTest, DBIterator5) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 0, options.max_sequential_skip_in_iterations)); + 0, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1009,7 +1003,7 @@ TEST_F(DBIteratorTest, DBIterator5) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 1, options.max_sequential_skip_in_iterations)); + 1, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1031,7 +1025,7 @@ TEST_F(DBIteratorTest, DBIterator5) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 2, options.max_sequential_skip_in_iterations)); + 2, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1053,7 +1047,7 @@ TEST_F(DBIteratorTest, DBIterator5) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 3, options.max_sequential_skip_in_iterations)); + 3, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1075,7 +1069,7 @@ TEST_F(DBIteratorTest, DBIterator5) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 4, options.max_sequential_skip_in_iterations)); + 4, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); 
@@ -1097,7 +1091,7 @@ TEST_F(DBIteratorTest, DBIterator5) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 5, options.max_sequential_skip_in_iterations)); + 5, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1119,7 +1113,7 @@ TEST_F(DBIteratorTest, DBIterator5) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 6, options.max_sequential_skip_in_iterations)); + 6, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1145,7 +1139,7 @@ TEST_F(DBIteratorTest, DBIterator6) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 0, options.max_sequential_skip_in_iterations)); + 0, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1167,7 +1161,7 @@ TEST_F(DBIteratorTest, DBIterator6) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 1, options.max_sequential_skip_in_iterations)); + 1, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1189,7 +1183,7 @@ TEST_F(DBIteratorTest, DBIterator6) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 2, options.max_sequential_skip_in_iterations)); + 2, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1211,7 +1205,7 @@ TEST_F(DBIteratorTest, DBIterator6) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), 
BytewiseComparator(), internal_iter, - 3, options.max_sequential_skip_in_iterations)); + 3, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(!db_iter->Valid()); } @@ -1229,7 +1223,7 @@ TEST_F(DBIteratorTest, DBIterator6) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 4, options.max_sequential_skip_in_iterations)); + 4, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1251,7 +1245,7 @@ TEST_F(DBIteratorTest, DBIterator6) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 5, options.max_sequential_skip_in_iterations)); + 5, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1273,7 +1267,7 @@ TEST_F(DBIteratorTest, DBIterator6) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 6, options.max_sequential_skip_in_iterations)); + 6, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1311,7 +1305,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 0, options.max_sequential_skip_in_iterations)); + 0, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1345,7 +1339,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 2, options.max_sequential_skip_in_iterations)); + 2, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); 
ASSERT_TRUE(db_iter->Valid()); @@ -1385,7 +1379,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 4, options.max_sequential_skip_in_iterations)); + 4, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1425,7 +1419,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 5, options.max_sequential_skip_in_iterations)); + 5, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1470,7 +1464,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 6, options.max_sequential_skip_in_iterations)); + 6, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1516,7 +1510,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 7, options.max_sequential_skip_in_iterations)); + 7, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1556,7 +1550,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 9, options.max_sequential_skip_in_iterations)); + 9, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1602,7 +1596,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 13, options.max_sequential_skip_in_iterations)); + 13, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); 
ASSERT_TRUE(db_iter->Valid()); @@ -1649,7 +1643,7 @@ TEST_F(DBIteratorTest, DBIterator7) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 14, options.max_sequential_skip_in_iterations)); + 14, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1678,7 +1672,7 @@ TEST_F(DBIteratorTest, DBIterator8) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + 10, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "b"); @@ -1707,7 +1701,7 @@ TEST_F(DBIteratorTest, DBIterator9) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + 10, options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); @@ -1754,7 +1748,7 @@ TEST_F(DBIteratorTest, DBIterator10) { std::unique_ptr db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 10, options.max_sequential_skip_in_iterations)); + 10, options.max_sequential_skip_in_iterations, 0)); db_iter->Seek("c"); ASSERT_TRUE(db_iter->Valid()); @@ -1778,9 +1772,9 @@ TEST_F(DBIteratorTest, SeekToLastOccurrenceSeq0) { internal_iter->AddPut("b", "2"); internal_iter->Finish(); - std::unique_ptr db_iter(NewDBIterator( - env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, - 10, 0 /* force seek */)); + std::unique_ptr db_iter( + NewDBIterator(env_, ImmutableCFOptions(options), BytewiseComparator(), + internal_iter, 10, 0 /* force seek */, 0)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1807,7 +1801,7 @@ TEST_F(DBIteratorTest, DBIterator11) { std::unique_ptr 
db_iter(NewDBIterator( env_, ImmutableCFOptions(options), BytewiseComparator(), internal_iter, 1, - options.max_sequential_skip_in_iterations)); + options.max_sequential_skip_in_iterations, 0)); db_iter->SeekToFirst(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "a"); @@ -1832,7 +1826,7 @@ TEST_F(DBIteratorTest, DBIterator12) { std::unique_ptr db_iter( NewDBIterator(env_, ImmutableCFOptions(options), BytewiseComparator(), - internal_iter, 10, 0)); + internal_iter, 10, 0, 0)); db_iter->SeekToLast(); ASSERT_TRUE(db_iter->Valid()); ASSERT_EQ(db_iter->key().ToString(), "c"); @@ -1874,7 +1868,7 @@ class DBIterWithMergeIterTest : public testing::Test { db_iter_.reset(NewDBIterator(env_, ImmutableCFOptions(options_), BytewiseComparator(), merge_iter, 8 /* read data earlier than seqId 8 */, - 3 /* max iterators before reseek */)); + 3 /* max iterators before reseek */, 0)); } Env* env_; diff --git a/db/db_log_iter_test.cc b/db/db_log_iter_test.cc index 33b5e4ef9..956f601a7 100644 --- a/db/db_log_iter_test.cc +++ b/db/db_log_iter_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc new file mode 100644 index 000000000..60e04cfad --- /dev/null +++ b/db/db_properties_test.cc @@ -0,0 +1,1206 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include + +#include +#include + +#include "db/db_test_util.h" +#include "port/stack_trace.h" +#include "rocksdb/options.h" +#include "rocksdb/perf_context.h" +#include "rocksdb/perf_level.h" +#include "rocksdb/table.h" +#include "util/random.h" + +namespace rocksdb { + +class DBPropertiesTest : public DBTestBase { + public: + DBPropertiesTest() : DBTestBase("/db_properties_test") {} +}; + +#ifndef ROCKSDB_LITE +TEST_F(DBPropertiesTest, Empty) { + do { + Options options; + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + options = CurrentOptions(options); + CreateAndReopenWithCF({"pikachu"}, options); + + std::string num; + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ("0", num); + + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ("1", num); + + // Block sync calls + env_->delay_sstable_sync_.store(true, std::memory_order_release); + Put(1, "k1", std::string(100000, 'x')); // Fill memtable + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ("2", num); + + Put(1, "k2", std::string(100000, 'y')); // Trigger compaction + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ("1", num); + + ASSERT_EQ("v1", Get(1, "foo")); + // Release sync calls + env_->delay_sstable_sync_.store(false, std::memory_order_release); + + ASSERT_OK(db_->DisableFileDeletions()); + ASSERT_TRUE( + dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); + ASSERT_EQ("1", num); + + ASSERT_OK(db_->DisableFileDeletions()); + ASSERT_TRUE( + dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", 
&num)); + ASSERT_EQ("2", num); + + ASSERT_OK(db_->DisableFileDeletions()); + ASSERT_TRUE( + dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); + ASSERT_EQ("3", num); + + ASSERT_OK(db_->EnableFileDeletions(false)); + ASSERT_TRUE( + dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); + ASSERT_EQ("2", num); + + ASSERT_OK(db_->EnableFileDeletions()); + ASSERT_TRUE( + dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); + ASSERT_EQ("0", num); + } while (ChangeOptions()); +} + +TEST_F(DBPropertiesTest, CurrentVersionNumber) { + uint64_t v1, v2, v3; + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v1)); + Put("12345678", ""); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v2)); + Flush(); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.current-super-version-number", &v3)); + + ASSERT_EQ(v1, v2); + ASSERT_GT(v3, v2); +} + +TEST_F(DBPropertiesTest, GetAggregatedIntPropertyTest) { + const int kKeySize = 100; + const int kValueSize = 500; + const int kKeyNum = 100; + + Options options; + options.env = env_; + options.create_if_missing = true; + options.write_buffer_size = (kKeySize + kValueSize) * kKeyNum / 10; + // Make them never flush + options.min_write_buffer_number_to_merge = 1000; + options.max_write_buffer_number = 1000; + options = CurrentOptions(options); + CreateAndReopenWithCF({"one", "two", "three", "four"}, options); + + Random rnd(301); + for (auto* handle : handles_) { + for (int i = 0; i < kKeyNum; ++i) { + db_->Put(WriteOptions(), handle, RandomString(&rnd, kKeySize), + RandomString(&rnd, kValueSize)); + } + } + + uint64_t manual_sum = 0; + uint64_t api_sum = 0; + uint64_t value = 0; + for (auto* handle : handles_) { + ASSERT_TRUE( + db_->GetIntProperty(handle, DB::Properties::kSizeAllMemTables, &value)); + manual_sum += value; + } + ASSERT_TRUE(db_->GetAggregatedIntProperty(DB::Properties::kSizeAllMemTables, + &api_sum)); + 
ASSERT_GT(manual_sum, 0); + ASSERT_EQ(manual_sum, api_sum); + + ASSERT_FALSE(db_->GetAggregatedIntProperty(DB::Properties::kDBStats, &value)); + + uint64_t before_flush_trm; + uint64_t after_flush_trm; + for (auto* handle : handles_) { + ASSERT_TRUE(db_->GetAggregatedIntProperty( + DB::Properties::kEstimateTableReadersMem, &before_flush_trm)); + + // Issue flush and expect larger memory usage of table readers. + db_->Flush(FlushOptions(), handle); + + ASSERT_TRUE(db_->GetAggregatedIntProperty( + DB::Properties::kEstimateTableReadersMem, &after_flush_trm)); + ASSERT_GT(after_flush_trm, before_flush_trm); + } +} + +namespace { +void ResetTableProperties(TableProperties* tp) { + tp->data_size = 0; + tp->index_size = 0; + tp->filter_size = 0; + tp->raw_key_size = 0; + tp->raw_value_size = 0; + tp->num_data_blocks = 0; + tp->num_entries = 0; +} + +void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) { + double dummy_double; + std::replace(tp_string.begin(), tp_string.end(), ';', ' '); + std::replace(tp_string.begin(), tp_string.end(), '=', ' '); + ResetTableProperties(tp); + + sscanf(tp_string.c_str(), + "# data blocks %" SCNu64 " # entries %" SCNu64 " raw key size %" SCNu64 + " raw average key size %lf " + " raw value size %" SCNu64 + " raw average value size %lf " + " data block size %" SCNu64 " index block size %" SCNu64 + " filter block size %" SCNu64, + &tp->num_data_blocks, &tp->num_entries, &tp->raw_key_size, + &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size, + &tp->index_size, &tp->filter_size); +} + +void VerifySimilar(uint64_t a, uint64_t b, double bias) { + ASSERT_EQ(a == 0U, b == 0U); + if (a == 0) { + return; + } + double dbl_a = static_cast(a); + double dbl_b = static_cast(b); + if (dbl_a > dbl_b) { + ASSERT_LT(static_cast(dbl_a - dbl_b) / (dbl_a + dbl_b), bias); + } else { + ASSERT_LT(static_cast(dbl_b - dbl_a) / (dbl_a + dbl_b), bias); + } +} + +void VerifyTableProperties(const TableProperties& base_tp, + const 
TableProperties& new_tp, + double filter_size_bias = 0.1, + double index_size_bias = 0.1, + double data_size_bias = 0.1, + double num_data_blocks_bias = 0.05) { + VerifySimilar(base_tp.data_size, new_tp.data_size, data_size_bias); + VerifySimilar(base_tp.index_size, new_tp.index_size, index_size_bias); + VerifySimilar(base_tp.filter_size, new_tp.filter_size, filter_size_bias); + VerifySimilar(base_tp.num_data_blocks, new_tp.num_data_blocks, + num_data_blocks_bias); + ASSERT_EQ(base_tp.raw_key_size, new_tp.raw_key_size); + ASSERT_EQ(base_tp.raw_value_size, new_tp.raw_value_size); + ASSERT_EQ(base_tp.num_entries, new_tp.num_entries); +} + +void GetExpectedTableProperties(TableProperties* expected_tp, + const int kKeySize, const int kValueSize, + const int kKeysPerTable, const int kTableCount, + const int kBloomBitsPerKey, + const size_t kBlockSize) { + const int kKeyCount = kTableCount * kKeysPerTable; + const int kAvgSuccessorSize = kKeySize / 2; + const int kEncodingSavePerKey = kKeySize / 4; + expected_tp->raw_key_size = kKeyCount * (kKeySize + 8); + expected_tp->raw_value_size = kKeyCount * kValueSize; + expected_tp->num_entries = kKeyCount; + expected_tp->num_data_blocks = + kTableCount * + (kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) / + kBlockSize; + expected_tp->data_size = + kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize)); + expected_tp->index_size = + expected_tp->num_data_blocks * (kAvgSuccessorSize + 12); + expected_tp->filter_size = + kTableCount * (kKeysPerTable * kBloomBitsPerKey / 8); +} +} // anonymous namespace + +TEST_F(DBPropertiesTest, ValidatePropertyInfo) { + for (const auto& ppt_name_and_info : InternalStats::ppt_name_to_info) { + // If C++ gets a std::string_literal, this would be better to check at + // compile-time using static_assert. 
+ ASSERT_TRUE(ppt_name_and_info.first.empty() || + !isdigit(ppt_name_and_info.first.back())); + + ASSERT_TRUE((ppt_name_and_info.second.handle_string == nullptr) != + (ppt_name_and_info.second.handle_int == nullptr)); + } +} + +TEST_F(DBPropertiesTest, AggregatedTableProperties) { + for (int kTableCount = 40; kTableCount <= 100; kTableCount += 30) { + const int kKeysPerTable = 100; + const int kKeySize = 80; + const int kValueSize = 200; + const int kBloomBitsPerKey = 20; + + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 8; + options.compression = kNoCompression; + options.create_if_missing = true; + + BlockBasedTableOptions table_options; + table_options.filter_policy.reset( + NewBloomFilterPolicy(kBloomBitsPerKey, false)); + table_options.block_size = 1024; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + + Random rnd(5632); + for (int table = 1; table <= kTableCount; ++table) { + for (int i = 0; i < kKeysPerTable; ++i) { + db_->Put(WriteOptions(), RandomString(&rnd, kKeySize), + RandomString(&rnd, kValueSize)); + } + db_->Flush(FlushOptions()); + } + std::string property; + db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property); + + TableProperties expected_tp; + GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, + kKeysPerTable, kTableCount, kBloomBitsPerKey, + table_options.block_size); + + TableProperties output_tp; + ParseTablePropertiesString(property, &output_tp); + + VerifyTableProperties(expected_tp, output_tp); + } +} + +TEST_F(DBPropertiesTest, ReadLatencyHistogramByLevel) { + Options options = CurrentOptions(); + options.write_buffer_size = 110 << 10; + options.level0_file_num_compaction_trigger = 6; + options.num_levels = 4; + options.compression = kNoCompression; + options.max_bytes_for_level_base = 4500 << 10; + options.target_file_size_base = 98 << 10; + options.max_write_buffer_number = 2; + options.statistics = 
rocksdb::CreateDBStatistics(); + options.max_open_files = 100; + + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + + DestroyAndReopen(options); + int key_index = 0; + Random rnd(301); + for (int num = 0; num < 8; num++) { + Put("foo", "bar"); + GenerateNewFile(&rnd, &key_index); + dbfull()->TEST_WaitForCompact(); + } + dbfull()->TEST_WaitForCompact(); + + std::string prop; + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); + + // Get() after flushes, See latency histogram tracked. + for (int key = 0; key < key_index; key++) { + Get(Key(key)); + } + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); + ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); + ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram")); + ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); + + // Reopen and issue Get(). See thee latency tracked + Reopen(options); + dbfull()->TEST_WaitForCompact(); + for (int key = 0; key < key_index; key++) { + Get(Key(key)); + } + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); + ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); + ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram")); + ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); + + // Reopen and issue iterating. 
See thee latency tracked + Reopen(options); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); + ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram")); + ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram")); + ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); + { + unique_ptr iter(db_->NewIterator(ReadOptions())); + for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) { + } + } + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); + ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); + ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram")); + ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); + + // options.max_open_files preloads table readers. + options.max_open_files = -1; + Reopen(options); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); + ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); + ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram")); + ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); + for (int key = 0; key < key_index; key++) { + Get(Key(key)); + } + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); + ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); + ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram")); + ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); +} + +TEST_F(DBPropertiesTest, AggregatedTablePropertiesAtLevel) { + const int kTableCount = 100; + const int kKeysPerTable = 10; + const int kKeySize = 50; + const int kValueSize = 400; + const int kMaxLevel = 7; + const int kBloomBitsPerKey = 20; + Random rnd(301); + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 8; + options.compression = kNoCompression; + options.create_if_missing = true; + 
options.level0_file_num_compaction_trigger = 2; + options.target_file_size_base = 8192; + options.max_bytes_for_level_base = 10000; + options.max_bytes_for_level_multiplier = 2; + // This ensures there no compaction happening when we call GetProperty(). + options.disable_auto_compactions = true; + + BlockBasedTableOptions table_options; + table_options.filter_policy.reset( + NewBloomFilterPolicy(kBloomBitsPerKey, false)); + table_options.block_size = 1024; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + + DestroyAndReopen(options); + + std::string level_tp_strings[kMaxLevel]; + std::string tp_string; + TableProperties level_tps[kMaxLevel]; + TableProperties tp, sum_tp, expected_tp; + for (int table = 1; table <= kTableCount; ++table) { + for (int i = 0; i < kKeysPerTable; ++i) { + db_->Put(WriteOptions(), RandomString(&rnd, kKeySize), + RandomString(&rnd, kValueSize)); + } + db_->Flush(FlushOptions()); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + ResetTableProperties(&sum_tp); + for (int level = 0; level < kMaxLevel; ++level) { + db_->GetProperty( + DB::Properties::kAggregatedTablePropertiesAtLevel + ToString(level), + &level_tp_strings[level]); + ParseTablePropertiesString(level_tp_strings[level], &level_tps[level]); + sum_tp.data_size += level_tps[level].data_size; + sum_tp.index_size += level_tps[level].index_size; + sum_tp.filter_size += level_tps[level].filter_size; + sum_tp.raw_key_size += level_tps[level].raw_key_size; + sum_tp.raw_value_size += level_tps[level].raw_value_size; + sum_tp.num_data_blocks += level_tps[level].num_data_blocks; + sum_tp.num_entries += level_tps[level].num_entries; + } + db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string); + ParseTablePropertiesString(tp_string, &tp); + ASSERT_EQ(sum_tp.data_size, tp.data_size); + ASSERT_EQ(sum_tp.index_size, tp.index_size); + ASSERT_EQ(sum_tp.filter_size, tp.filter_size); + ASSERT_EQ(sum_tp.raw_key_size, tp.raw_key_size); + 
ASSERT_EQ(sum_tp.raw_value_size, tp.raw_value_size); + ASSERT_EQ(sum_tp.num_data_blocks, tp.num_data_blocks); + ASSERT_EQ(sum_tp.num_entries, tp.num_entries); + if (table > 3) { + GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, + kKeysPerTable, table, kBloomBitsPerKey, + table_options.block_size); + // Gives larger bias here as index block size, filter block size, + // and data block size become much harder to estimate in this test. + VerifyTableProperties(tp, expected_tp, 0.5, 0.4, 0.4, 0.25); + } + } +} + +TEST_F(DBPropertiesTest, NumImmutableMemTable) { + do { + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.max_write_buffer_number_to_maintain = 4; + options.write_buffer_size = 1000000; + CreateAndReopenWithCF({"pikachu"}, options); + + std::string big_value(1000000 * 2, 'x'); + std::string num; + SetPerfLevel(kEnableTime); + ASSERT_TRUE(GetPerfLevel() == kEnableTime); + + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k1", big_value)); + ASSERT_TRUE(dbfull()->GetProperty(handles_[1], + "rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], DB::Properties::kNumImmutableMemTableFlushed, &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ(num, "1"); + perf_context.Reset(); + Get(1, "k1"); + ASSERT_EQ(1, static_cast(perf_context.get_from_memtable_count)); + + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value)); + ASSERT_TRUE(dbfull()->GetProperty(handles_[1], + "rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "1"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ(num, "1"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], 
"rocksdb.num-entries-imm-mem-tables", &num)); + ASSERT_EQ(num, "1"); + + perf_context.Reset(); + Get(1, "k1"); + ASSERT_EQ(2, static_cast(perf_context.get_from_memtable_count)); + perf_context.Reset(); + Get(1, "k2"); + ASSERT_EQ(1, static_cast(perf_context.get_from_memtable_count)); + + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", big_value)); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.cur-size-active-mem-table", &num)); + ASSERT_TRUE(dbfull()->GetProperty(handles_[1], + "rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "2"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &num)); + ASSERT_EQ(num, "1"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.num-entries-imm-mem-tables", &num)); + ASSERT_EQ(num, "2"); + perf_context.Reset(); + Get(1, "k2"); + ASSERT_EQ(2, static_cast(perf_context.get_from_memtable_count)); + perf_context.Reset(); + Get(1, "k3"); + ASSERT_EQ(1, static_cast(perf_context.get_from_memtable_count)); + perf_context.Reset(); + Get(1, "k1"); + ASSERT_EQ(3, static_cast(perf_context.get_from_memtable_count)); + + ASSERT_OK(Flush(1)); + ASSERT_TRUE(dbfull()->GetProperty(handles_[1], + "rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], DB::Properties::kNumImmutableMemTableFlushed, &num)); + ASSERT_EQ(num, "3"); + ASSERT_TRUE(dbfull()->GetProperty( + handles_[1], "rocksdb.cur-size-active-mem-table", &num)); + // "192" is the size of the metadata of an empty skiplist, this would + // break if we change the default skiplist implementation + ASSERT_EQ(num, "192"); + + uint64_t int_num; + uint64_t base_total_size; + ASSERT_TRUE(dbfull()->GetIntProperty( + handles_[1], "rocksdb.estimate-num-keys", &base_total_size)); + + ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k2")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", "")); + ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k3")); + 
ASSERT_TRUE(dbfull()->GetIntProperty( + handles_[1], "rocksdb.num-deletes-active-mem-table", &int_num)); + ASSERT_EQ(int_num, 2U); + ASSERT_TRUE(dbfull()->GetIntProperty( + handles_[1], "rocksdb.num-entries-active-mem-table", &int_num)); + ASSERT_EQ(int_num, 3U); + + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value)); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value)); + ASSERT_TRUE(dbfull()->GetIntProperty( + handles_[1], "rocksdb.num-entries-imm-mem-tables", &int_num)); + ASSERT_EQ(int_num, 4U); + ASSERT_TRUE(dbfull()->GetIntProperty( + handles_[1], "rocksdb.num-deletes-imm-mem-tables", &int_num)); + ASSERT_EQ(int_num, 2U); + + ASSERT_TRUE(dbfull()->GetIntProperty( + handles_[1], "rocksdb.estimate-num-keys", &int_num)); + ASSERT_EQ(int_num, base_total_size + 1); + + SetPerfLevel(kDisable); + ASSERT_TRUE(GetPerfLevel() == kDisable); + } while (ChangeCompactOptions()); +} + +TEST_F(DBPropertiesTest, GetProperty) { + // Set sizes to both background thread pool to be 1 and block them. 
+ env_->SetBackgroundThreads(1, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + test::SleepingBackgroundTask sleeping_task_high; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_task_high, Env::Priority::HIGH); + + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.compaction_style = kCompactionStyleUniversal; + options.level0_file_num_compaction_trigger = 1; + options.compaction_options_universal.size_ratio = 50; + options.max_background_compactions = 1; + options.max_background_flushes = 1; + options.max_write_buffer_number = 10; + options.min_write_buffer_number_to_merge = 1; + options.max_write_buffer_number_to_maintain = 0; + options.write_buffer_size = 1000000; + Reopen(options); + + std::string big_value(1000000 * 2, 'x'); + std::string num; + uint64_t int_num; + SetPerfLevel(kEnableTime); + + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_EQ(int_num, 0U); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.estimate-live-data-size", &int_num)); + ASSERT_EQ(int_num, 0U); + + ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num)); + ASSERT_EQ(num, "1"); + perf_context.Reset(); + + ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "1"); + ASSERT_OK(dbfull()->Delete(writeOpt, "k-non-existing")); + 
ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); + ASSERT_EQ(num, "2"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num)); + ASSERT_EQ(num, "1"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num)); + ASSERT_EQ(num, "2"); + // Verify the same set of properties through GetIntProperty + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.num-immutable-mem-table", &int_num)); + ASSERT_EQ(int_num, 2U); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.mem-table-flush-pending", &int_num)); + ASSERT_EQ(int_num, 1U); + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.compaction-pending", &int_num)); + ASSERT_EQ(int_num, 0U); + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num)); + ASSERT_EQ(int_num, 2U); + + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_EQ(int_num, 0U); + + sleeping_task_high.WakeUp(); + sleeping_task_high.WaitUntilDone(); + dbfull()->TEST_WaitForFlushMemTable(); + + ASSERT_OK(dbfull()->Put(writeOpt, "k4", big_value)); + ASSERT_OK(dbfull()->Put(writeOpt, "k5", big_value)); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num)); + ASSERT_EQ(num, "0"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); + ASSERT_EQ(num, "1"); + ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num)); + ASSERT_EQ(num, "4"); + + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_GT(int_num, 0U); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + + // Wait for compaction to be done. 
This is important because otherwise RocksDB + // might schedule a compaction when reopening the database, failing assertion + // (A) as a result. + dbfull()->TEST_WaitForCompact(); + options.max_open_files = 10; + Reopen(options); + // After reopening, no table reader is loaded, so no memory for table readers + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_EQ(int_num, 0U); // (A) + ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num)); + ASSERT_GT(int_num, 0U); + + // After reading a key, at least one table reader is loaded. + Get("k5"); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); + ASSERT_GT(int_num, 0U); + + // Test rocksdb.num-live-versions + { + options.level0_file_num_compaction_trigger = 20; + Reopen(options); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); + ASSERT_EQ(int_num, 1U); + + // Use an iterator to hold current version + std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); + + ASSERT_OK(dbfull()->Put(writeOpt, "k6", big_value)); + Flush(); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); + ASSERT_EQ(int_num, 2U); + + // Use an iterator to hold current version + std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); + + ASSERT_OK(dbfull()->Put(writeOpt, "k7", big_value)); + Flush(); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); + ASSERT_EQ(int_num, 3U); + + iter2.reset(); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); + ASSERT_EQ(int_num, 2U); + + iter1.reset(); + ASSERT_TRUE( + dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); + ASSERT_EQ(int_num, 1U); + } +} + +TEST_F(DBPropertiesTest, ApproximateMemoryUsage) { + const int kNumRounds = 10; + // TODO(noetzli) kFlushesPerRound does not really correlate with how many + // flushes happen. 
+ const int kFlushesPerRound = 10; + const int kWritesPerFlush = 10; + const int kKeySize = 100; + const int kValueSize = 1000; + Options options; + options.write_buffer_size = 1000; // small write buffer + options.min_write_buffer_number_to_merge = 4; + options.compression = kNoCompression; + options.create_if_missing = true; + options = CurrentOptions(options); + DestroyAndReopen(options); + + Random rnd(301); + + std::vector iters; + + uint64_t active_mem; + uint64_t unflushed_mem; + uint64_t all_mem; + uint64_t prev_all_mem; + + // Phase 0. Verify that the initial values of all these properties are the same + // as we have no mem-tables. + dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem); + dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); + dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); + ASSERT_EQ(all_mem, active_mem); + ASSERT_EQ(all_mem, unflushed_mem); + + // Phase 1. Simply issue Put() and expect "cur-size-all-mem-tables" equals to + // "size-all-mem-tables" + for (int r = 0; r < kNumRounds; ++r) { + for (int f = 0; f < kFlushesPerRound; ++f) { + for (int w = 0; w < kWritesPerFlush; ++w) { + Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize)); + } + } + // Make sure that there is no flush between getting the two properties. + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); + dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); + // in the no-iterator case, these two numbers should be the same. + ASSERT_EQ(unflushed_mem, all_mem); + } + prev_all_mem = all_mem; + + // Phase 2. Keep issuing Put() but also create new iterators. This time we + // expect "size-all-mem-tables" > "cur-size-all-mem-tables". 
+ for (int r = 0; r < kNumRounds; ++r) { + iters.push_back(db_->NewIterator(ReadOptions())); + for (int f = 0; f < kFlushesPerRound; ++f) { + for (int w = 0; w < kWritesPerFlush; ++w) { + Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize)); + } + } + // Force flush to prevent flush from happening between getting the + // properties or after getting the properties and before the new round. + Flush(); + + // In the second round, add iterators. + dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem); + dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); + dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); + ASSERT_GT(all_mem, active_mem); + ASSERT_GT(all_mem, unflushed_mem); + ASSERT_GT(all_mem, prev_all_mem); + prev_all_mem = all_mem; + } + + // Phase 3. Delete iterators and expect "size-all-mem-tables" shrinks + // whenever we release an iterator. + for (auto* iter : iters) { + delete iter; + dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); + // Expect the size shrinking + ASSERT_LT(all_mem, prev_all_mem); + prev_all_mem = all_mem; + } + + // Expect all these three counters to be the same. + dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem); + dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); + dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); + ASSERT_EQ(active_mem, unflushed_mem); + ASSERT_EQ(unflushed_mem, all_mem); + + // Phase 4. Reopen, and expect all these three counters to be the same again. 
+ Reopen(options); + dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem); + dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); + dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); + ASSERT_EQ(active_mem, unflushed_mem); + ASSERT_EQ(unflushed_mem, all_mem); +} + +TEST_F(DBPropertiesTest, EstimatePendingCompBytes) { + // Set sizes to both background thread pool to be 1 and block them. + env_->SetBackgroundThreads(1, Env::HIGH); + env_->SetBackgroundThreads(1, Env::LOW); + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.compaction_style = kCompactionStyleLevel; + options.level0_file_num_compaction_trigger = 2; + options.max_background_compactions = 1; + options.max_background_flushes = 1; + options.max_write_buffer_number = 10; + options.min_write_buffer_number_to_merge = 1; + options.max_write_buffer_number_to_maintain = 0; + options.write_buffer_size = 1000000; + Reopen(options); + + std::string big_value(1000000 * 2, 'x'); + std::string num; + uint64_t int_num; + + ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); + Flush(); + ASSERT_TRUE(dbfull()->GetIntProperty( + "rocksdb.estimate-pending-compaction-bytes", &int_num)); + ASSERT_EQ(int_num, 0U); + + ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); + Flush(); + ASSERT_TRUE(dbfull()->GetIntProperty( + "rocksdb.estimate-pending-compaction-bytes", &int_num)); + ASSERT_EQ(int_num, 0U); + + ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); + Flush(); + ASSERT_TRUE(dbfull()->GetIntProperty( + "rocksdb.estimate-pending-compaction-bytes", &int_num)); + ASSERT_GT(int_num, 0U); + + sleeping_task_low.WakeUp(); + sleeping_task_low.WaitUntilDone(); + + dbfull()->TEST_WaitForCompact(); + ASSERT_TRUE(dbfull()->GetIntProperty( + 
"rocksdb.estimate-pending-compaction-bytes", &int_num)); + ASSERT_EQ(int_num, 0U); +} +#endif // ROCKSDB_LITE + +class CountingUserTblPropCollector : public TablePropertiesCollector { + public: + const char* Name() const override { return "CountingUserTblPropCollector"; } + + Status Finish(UserCollectedProperties* properties) override { + std::string encoded; + PutVarint32(&encoded, count_); + *properties = UserCollectedProperties{ + {"CountingUserTblPropCollector", message_}, {"Count", encoded}, + }; + return Status::OK(); + } + + Status AddUserKey(const Slice& user_key, const Slice& value, EntryType type, + SequenceNumber seq, uint64_t file_size) override { + ++count_; + return Status::OK(); + } + + virtual UserCollectedProperties GetReadableProperties() const override { + return UserCollectedProperties{}; + } + + private: + std::string message_ = "Rocksdb"; + uint32_t count_ = 0; +}; + +class CountingUserTblPropCollectorFactory + : public TablePropertiesCollectorFactory { + public: + explicit CountingUserTblPropCollectorFactory( + uint32_t expected_column_family_id) + : expected_column_family_id_(expected_column_family_id), + num_created_(0) {} + virtual TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context context) override { + EXPECT_EQ(expected_column_family_id_, context.column_family_id); + num_created_++; + return new CountingUserTblPropCollector(); + } + const char* Name() const override { + return "CountingUserTblPropCollectorFactory"; + } + void set_expected_column_family_id(uint32_t v) { + expected_column_family_id_ = v; + } + uint32_t expected_column_family_id_; + uint32_t num_created_; +}; + +class CountingDeleteTabPropCollector : public TablePropertiesCollector { + public: + const char* Name() const override { return "CountingDeleteTabPropCollector"; } + + Status AddUserKey(const Slice& user_key, const Slice& value, EntryType type, + SequenceNumber seq, uint64_t file_size) override { + if (type == 
kEntryDelete) { + num_deletes_++; + } + return Status::OK(); + } + + bool NeedCompact() const override { return num_deletes_ > 10; } + + UserCollectedProperties GetReadableProperties() const override { + return UserCollectedProperties{}; + } + + Status Finish(UserCollectedProperties* properties) override { + *properties = + UserCollectedProperties{{"num_delete", ToString(num_deletes_)}}; + return Status::OK(); + } + + private: + uint32_t num_deletes_ = 0; +}; + +class CountingDeleteTabPropCollectorFactory + : public TablePropertiesCollectorFactory { + public: + virtual TablePropertiesCollector* CreateTablePropertiesCollector( + TablePropertiesCollectorFactory::Context context) override { + return new CountingDeleteTabPropCollector(); + } + const char* Name() const override { + return "CountingDeleteTabPropCollectorFactory"; + } +}; + +#ifndef ROCKSDB_LITE +TEST_F(DBPropertiesTest, GetUserDefinedTableProperties) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = (1 << 30); + options.max_background_flushes = 0; + options.table_properties_collector_factories.resize(1); + std::shared_ptr collector_factory = + std::make_shared(0); + options.table_properties_collector_factories[0] = collector_factory; + Reopen(options); + // Create 4 tables + for (int table = 0; table < 4; ++table) { + for (int i = 0; i < 10 + table; ++i) { + db_->Put(WriteOptions(), ToString(table * 100 + i), "val"); + } + db_->Flush(FlushOptions()); + } + + TablePropertiesCollection props; + ASSERT_OK(db_->GetPropertiesOfAllTables(&props)); + ASSERT_EQ(4U, props.size()); + uint32_t sum = 0; + for (const auto& item : props) { + auto& user_collected = item.second->user_collected_properties; + ASSERT_TRUE(user_collected.find("CountingUserTblPropCollector") != + user_collected.end()); + ASSERT_EQ(user_collected.at("CountingUserTblPropCollector"), "Rocksdb"); + ASSERT_TRUE(user_collected.find("Count") != user_collected.end()); + Slice key(user_collected.at("Count")); + 
uint32_t count; + ASSERT_TRUE(GetVarint32(&key, &count)); + sum += count; + } + ASSERT_EQ(10u + 11u + 12u + 13u, sum); + + ASSERT_GT(collector_factory->num_created_, 0U); + collector_factory->num_created_ = 0; + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_GT(collector_factory->num_created_, 0U); +} +#endif // ROCKSDB_LITE + +TEST_F(DBPropertiesTest, UserDefinedTablePropertiesContext) { + Options options = CurrentOptions(); + options.level0_file_num_compaction_trigger = 3; + options.max_background_flushes = 0; + options.table_properties_collector_factories.resize(1); + std::shared_ptr collector_factory = + std::make_shared(1); + options.table_properties_collector_factories[0] = collector_factory, + CreateAndReopenWithCF({"pikachu"}, options); + // Create 2 files + for (int table = 0; table < 2; ++table) { + for (int i = 0; i < 10 + table; ++i) { + Put(1, ToString(table * 100 + i), "val"); + } + Flush(1); + } + ASSERT_GT(collector_factory->num_created_, 0U); + + collector_factory->num_created_ = 0; + // Trigger automatic compactions. + for (int table = 0; table < 3; ++table) { + for (int i = 0; i < 10 + table; ++i) { + Put(1, ToString(table * 100 + i), "val"); + } + Flush(1); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_GT(collector_factory->num_created_, 0U); + + collector_factory->num_created_ = 0; + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + ASSERT_GT(collector_factory->num_created_, 0U); + + // Come back to write to default column family + collector_factory->num_created_ = 0; + collector_factory->set_expected_column_family_id(0); // default CF + // Create 4 tables in default column family + for (int table = 0; table < 2; ++table) { + for (int i = 0; i < 10 + table; ++i) { + Put(ToString(table * 100 + i), "val"); + } + Flush(); + } + ASSERT_GT(collector_factory->num_created_, 0U); + + collector_factory->num_created_ = 0; + // Trigger automatic compactions. 
+ for (int table = 0; table < 3; ++table) { + for (int i = 0; i < 10 + table; ++i) { + Put(ToString(table * 100 + i), "val"); + } + Flush(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_GT(collector_factory->num_created_, 0U); + + collector_factory->num_created_ = 0; + dbfull()->TEST_CompactRange(0, nullptr, nullptr); + ASSERT_GT(collector_factory->num_created_, 0U); +} + +#ifndef ROCKSDB_LITE +TEST_F(DBPropertiesTest, TablePropertiesNeedCompactTest) { + Random rnd(301); + + Options options; + options.create_if_missing = true; + options.write_buffer_size = 4096; + options.max_write_buffer_number = 8; + options.level0_file_num_compaction_trigger = 2; + options.level0_slowdown_writes_trigger = 2; + options.level0_stop_writes_trigger = 4; + options.target_file_size_base = 2048; + options.max_bytes_for_level_base = 10240; + options.max_bytes_for_level_multiplier = 4; + options.soft_pending_compaction_bytes_limit = 1024 * 1024; + options.num_levels = 8; + + std::shared_ptr collector_factory = + std::make_shared(); + options.table_properties_collector_factories.resize(1); + options.table_properties_collector_factories[0] = collector_factory; + + DestroyAndReopen(options); + + const int kMaxKey = 1000; + for (int i = 0; i < kMaxKey; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 102))); + ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102))); + } + Flush(); + dbfull()->TEST_WaitForCompact(); + if (NumTableFilesAtLevel(0) == 1) { + // Clear Level 0 so that when later flush a file with deletions, + // we don't trigger an organic compaction. 
+ ASSERT_OK(Put(Key(0), "")); + ASSERT_OK(Put(Key(kMaxKey * 2), "")); + Flush(); + dbfull()->TEST_WaitForCompact(); + } + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + + { + int c = 0; + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + iter->Seek(Key(kMaxKey - 100)); + while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) { + iter->Next(); + ++c; + } + ASSERT_EQ(c, 200); + } + + Delete(Key(0)); + for (int i = kMaxKey - 100; i < kMaxKey + 100; i++) { + Delete(Key(i)); + } + Delete(Key(kMaxKey * 2)); + + Flush(); + dbfull()->TEST_WaitForCompact(); + + { + SetPerfLevel(kEnableCount); + perf_context.Reset(); + int c = 0; + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + iter->Seek(Key(kMaxKey - 100)); + while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) { + iter->Next(); + } + ASSERT_EQ(c, 0); + ASSERT_LT(perf_context.internal_delete_skipped_count, 30u); + ASSERT_LT(perf_context.internal_key_skipped_count, 30u); + SetPerfLevel(kDisable); + } +} + +TEST_F(DBPropertiesTest, NeedCompactHintPersistentTest) { + Random rnd(301); + + Options options; + options.create_if_missing = true; + options.max_write_buffer_number = 8; + options.level0_file_num_compaction_trigger = 10; + options.level0_slowdown_writes_trigger = 10; + options.level0_stop_writes_trigger = 10; + options.disable_auto_compactions = true; + + std::shared_ptr collector_factory = + std::make_shared(); + options.table_properties_collector_factories.resize(1); + options.table_properties_collector_factories[0] = collector_factory; + + DestroyAndReopen(options); + + const int kMaxKey = 100; + for (int i = 0; i < kMaxKey; i++) { + ASSERT_OK(Put(Key(i), "")); + } + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + + for (int i = 1; i < kMaxKey - 1; i++) { + Delete(Key(i)); + } + Flush(); + dbfull()->TEST_WaitForFlushMemTable(); + ASSERT_EQ(NumTableFilesAtLevel(0), 2); + + // Restart the DB. 
Although number of files didn't reach + // options.level0_file_num_compaction_trigger, compaction should + // still be triggered because of the need-compaction hint. + options.disable_auto_compactions = false; + Reopen(options); + dbfull()->TEST_WaitForCompact(); + ASSERT_EQ(NumTableFilesAtLevel(0), 0); + { + SetPerfLevel(kEnableCount); + perf_context.Reset(); + int c = 0; + std::unique_ptr iter(db_->NewIterator(ReadOptions())); + for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) { + c++; + } + ASSERT_EQ(c, 2); + ASSERT_EQ(perf_context.internal_delete_skipped_count, 0); + // We iterate every key twice. Is it a bug? + ASSERT_LE(perf_context.internal_key_skipped_count, 2); + SetPerfLevel(kDisable); + } +} +#endif // ROCKSDB_LITE +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc index becf76e6f..87ba13d86 100644 --- a/db/db_table_properties_test.cc +++ b/db/db_table_properties_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/db_tailing_iter_test.cc b/db/db_tailing_iter_test.cc index 75f69e622..bfb62926e 100644 --- a/db/db_tailing_iter_test.cc +++ b/db/db_tailing_iter_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -654,6 +654,51 @@ TEST_F(DBTestTailingIterator, ManagedTailingIteratorSeekToSame) { ASSERT_EQ(found, iter->key().ToString()); } +TEST_F(DBTestTailingIterator, ForwardIteratorVersionProperty) { + Options options = CurrentOptions(); + options.write_buffer_size = 1000; + + ReadOptions read_options; + read_options.tailing = true; + + Put("foo", "bar"); + + uint64_t v1, v2, v3, v4; + { + std::unique_ptr iter(db_->NewIterator(read_options)); + iter->Seek("foo"); + std::string prop_value; + ASSERT_OK(iter->GetProperty("rocksdb.iterator.super-version-number", + &prop_value)); + v1 = static_cast(std::atoi(prop_value.c_str())); + + Put("foo1", "bar1"); + Flush(); + + ASSERT_OK(iter->GetProperty("rocksdb.iterator.super-version-number", + &prop_value)); + v2 = static_cast(std::atoi(prop_value.c_str())); + + iter->Seek("f"); + + ASSERT_OK(iter->GetProperty("rocksdb.iterator.super-version-number", + &prop_value)); + v3 = static_cast(std::atoi(prop_value.c_str())); + + ASSERT_EQ(v1, v2); + ASSERT_GT(v3, v2); + } + + { + std::unique_ptr iter(db_->NewIterator(read_options)); + iter->Seek("foo"); + std::string prop_value; + ASSERT_OK(iter->GetProperty("rocksdb.iterator.super-version-number", + &prop_value)); + v4 = static_cast(std::atoi(prop_value.c_str())); + } + ASSERT_EQ(v3, v4); +} } // namespace rocksdb #endif // !defined(ROCKSDB_LITE) diff --git a/db/db_test.cc b/db/db_test.cc index 35bea7345..bdb081f93 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -20,6 +20,9 @@ #ifndef OS_WIN #include #endif +#ifdef OS_SOLARIS +#include +#endif #include "db/filename.h" #include "db/dbformat.h" @@ -34,9 +37,9 @@ #include "rocksdb/compaction_filter.h" #include "rocksdb/convenience.h" #include "rocksdb/db.h" -#include "rocksdb/delete_scheduler.h" #include "rocksdb/env.h" #include "rocksdb/experimental.h" +#include "rocksdb/sst_file_manager.h" #include "rocksdb/filter_policy.h" #include "rocksdb/options.h" #include "rocksdb/perf_context.h" @@ -62,9 +65,10 @@ #include "util/compression.h" #include "util/mutexlock.h" #include "util/rate_limiter.h" +#include "util/sst_file_manager_impl.h" #include "util/statistics.h" -#include "util/testharness.h" #include "util/sync_point.h" +#include "util/testharness.h" #include "util/testutil.h" #include "util/mock_env.h" #include "util/string_util.h" @@ -118,67 +122,108 @@ class DBTestWithParam uint32_t max_subcompactions_; bool exclusive_manual_compaction_; }; -#ifndef ROCKSDB_LITE -TEST_F(DBTest, Empty) { - do { - Options options; - options.env = env_; - options.write_buffer_size = 100000; // Small write buffer - options = CurrentOptions(options); - CreateAndReopenWithCF({"pikachu"}, options); - std::string num; - ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], "rocksdb.num-entries-active-mem-table", &num)); - ASSERT_EQ("0", num); +TEST_F(DBTest, MockEnvTest) { + unique_ptr env{new MockEnv(Env::Default())}; + Options options; + options.create_if_missing = true; + options.env = env.get(); + DB* db; - ASSERT_OK(Put(1, "foo", "v1")); - ASSERT_EQ("v1", Get(1, "foo")); - ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], "rocksdb.num-entries-active-mem-table", &num)); - ASSERT_EQ("1", num); + const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")}; + const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")}; - // Block sync calls - env_->delay_sstable_sync_.store(true, std::memory_order_release); - Put(1, "k1", std::string(100000, 'x')); // Fill memtable - 
ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], "rocksdb.num-entries-active-mem-table", &num)); - ASSERT_EQ("2", num); + ASSERT_OK(DB::Open(options, "/dir/db", &db)); + for (size_t i = 0; i < 3; ++i) { + ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i])); + } - Put(1, "k2", std::string(100000, 'y')); // Trigger compaction - ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], "rocksdb.num-entries-active-mem-table", &num)); - ASSERT_EQ("1", num); + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } - ASSERT_EQ("v1", Get(1, "foo")); - // Release sync calls - env_->delay_sstable_sync_.store(false, std::memory_order_release); + Iterator* iterator = db->NewIterator(ReadOptions()); + iterator->SeekToFirst(); + for (size_t i = 0; i < 3; ++i) { + ASSERT_TRUE(iterator->Valid()); + ASSERT_TRUE(keys[i] == iterator->key()); + ASSERT_TRUE(vals[i] == iterator->value()); + iterator->Next(); + } + ASSERT_TRUE(!iterator->Valid()); + delete iterator; - ASSERT_OK(db_->DisableFileDeletions()); - ASSERT_TRUE( - dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); - ASSERT_EQ("1", num); + // TEST_FlushMemTable() is not supported in ROCKSDB_LITE + #ifndef ROCKSDB_LITE + DBImpl* dbi = reinterpret_cast(db); + ASSERT_OK(dbi->TEST_FlushMemTable()); - ASSERT_OK(db_->DisableFileDeletions()); - ASSERT_TRUE( - dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); - ASSERT_EQ("2", num); + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + #endif // ROCKSDB_LITE - ASSERT_OK(db_->DisableFileDeletions()); - ASSERT_TRUE( - dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); - ASSERT_EQ("3", num); + delete db; +} - ASSERT_OK(db_->EnableFileDeletions(false)); - ASSERT_TRUE( - dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); - ASSERT_EQ("2", num); +// NewMemEnv returns 
nullptr in ROCKSDB_LITE since class InMemoryEnv isn't +// defined. +#ifndef ROCKSDB_LITE +TEST_F(DBTest, MemEnvTest) { + unique_ptr env{NewMemEnv(Env::Default())}; + Options options; + options.create_if_missing = true; + options.env = env.get(); + DB* db; - ASSERT_OK(db_->EnableFileDeletions()); - ASSERT_TRUE( - dbfull()->GetProperty("rocksdb.is-file-deletions-enabled", &num)); - ASSERT_EQ("0", num); - } while (ChangeOptions()); + const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")}; + const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")}; + + ASSERT_OK(DB::Open(options, "/dir/db", &db)); + for (size_t i = 0; i < 3; ++i) { + ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i])); + } + + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + + Iterator* iterator = db->NewIterator(ReadOptions()); + iterator->SeekToFirst(); + for (size_t i = 0; i < 3; ++i) { + ASSERT_TRUE(iterator->Valid()); + ASSERT_TRUE(keys[i] == iterator->key()); + ASSERT_TRUE(vals[i] == iterator->value()); + iterator->Next(); + } + ASSERT_TRUE(!iterator->Valid()); + delete iterator; + + DBImpl* dbi = reinterpret_cast(db); + ASSERT_OK(dbi->TEST_FlushMemTable()); + + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + + delete db; + + options.create_if_missing = false; + ASSERT_OK(DB::Open(options, "/dir/db", &db)); + for (size_t i = 0; i < 3; ++i) { + std::string res; + ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); + ASSERT_TRUE(res == vals[i]); + } + delete db; } #endif // ROCKSDB_LITE @@ -190,13 +235,11 @@ TEST_F(DBTest, WriteEmptyBatch) { CreateAndReopenWithCF({"pikachu"}, options); ASSERT_OK(Put(1, "foo", "bar")); - env_->sync_counter_.store(0); WriteOptions wo; wo.sync = true; wo.disableWAL = false; WriteBatch empty_batch; ASSERT_OK(dbfull()->Write(wo, &empty_batch)); - 
ASSERT_GE(env_->sync_counter_.load(), 1); // make sure we can re-open it. ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); @@ -204,59 +247,6 @@ TEST_F(DBTest, WriteEmptyBatch) { } #ifndef ROCKSDB_LITE -TEST_F(DBTest, GetAggregatedIntPropertyTest) { - const int kKeySize = 100; - const int kValueSize = 500; - const int kKeyNum = 100; - - Options options; - options.env = env_; - options.create_if_missing = true; - options.write_buffer_size = (kKeySize + kValueSize) * kKeyNum / 10; - // Make them never flush - options.min_write_buffer_number_to_merge = 1000; - options.max_write_buffer_number = 1000; - options = CurrentOptions(options); - CreateAndReopenWithCF({"one", "two", "three", "four"}, options); - - Random rnd(301); - for (auto* handle : handles_) { - for (int i = 0; i < kKeyNum; ++i) { - db_->Put(WriteOptions(), handle, RandomString(&rnd, kKeySize), - RandomString(&rnd, kValueSize)); - } - } - - uint64_t manual_sum = 0; - uint64_t api_sum = 0; - uint64_t value = 0; - for (auto* handle : handles_) { - ASSERT_TRUE( - db_->GetIntProperty(handle, DB::Properties::kSizeAllMemTables, &value)); - manual_sum += value; - } - ASSERT_TRUE(db_->GetAggregatedIntProperty(DB::Properties::kSizeAllMemTables, - &api_sum)); - ASSERT_GT(manual_sum, 0); - ASSERT_EQ(manual_sum, api_sum); - - ASSERT_FALSE(db_->GetAggregatedIntProperty(DB::Properties::kDBStats, &value)); - - uint64_t before_flush_trm; - uint64_t after_flush_trm; - for (auto* handle : handles_) { - ASSERT_TRUE(db_->GetAggregatedIntProperty( - DB::Properties::kEstimateTableReadersMem, &before_flush_trm)); - - // Issue flush and expect larger memory usage of table readers. 
- db_->Flush(FlushOptions(), handle); - - ASSERT_TRUE(db_->GetAggregatedIntProperty( - DB::Properties::kEstimateTableReadersMem, &after_flush_trm)); - ASSERT_GT(after_flush_trm, before_flush_trm); - } -} - TEST_F(DBTest, ReadOnlyDB) { ASSERT_OK(Put("foo", "v1")); ASSERT_OK(Put("bar", "v2")); @@ -434,6 +424,97 @@ TEST_F(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); } +TEST_F(DBTest, IndexAndFilterBlocksOfNewTableAddedToCacheWithPinning) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.pin_l0_filter_and_index_blocks_in_cache = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(20)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu"}, options); + + ASSERT_OK(Put(1, "key", "val")); + // Create a new table. + ASSERT_OK(Flush(1)); + + // index/filter blocks added to block cache right after table creation. + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + + // only index/filter were added + ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_ADD)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS)); + + std::string value; + // Miss and hit count should remain the same, they're all pinned. 
+ db_->KeyMayExist(ReadOptions(), handles_[1], "key", &value); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + + // Miss and hit count should remain the same, they're all pinned. + value = Get(1, "key"); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); +} + +TEST_F(DBTest, MultiLevelIndexAndFilterBlocksCachedWithPinning) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.pin_l0_filter_and_index_blocks_in_cache = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(20)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu"}, options); + + Put(1, "a", "begin"); + Put(1, "z", "end"); + ASSERT_OK(Flush(1)); + // move this table to L1 + dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); + + // reset block cache + table_options.block_cache = NewLRUCache(64 * 1024); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + TryReopenWithColumnFamilies({"default", "pikachu"}, options); + // create new table at L0 + Put(1, "a2", "begin2"); + Put(1, "z2", "end2"); + ASSERT_OK(Flush(1)); + + // get base cache values + uint64_t fm = TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS); + uint64_t fh = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); + uint64_t im = TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS); + uint64_t ih = 
TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT); + + std::string value; + // this should be read from L0 + // so cache values don't change + value = Get(1, "a2"); + ASSERT_EQ(fm, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(fh, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(im, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(ih, TestGetTickerCount(options, BLOCK_CACHE_INDEX_HIT)); + + // this should be read from L1 + // the file is opened, prefetching results in a cache filter miss + // the block is loaded and added to the cache, + // then the get results in a cache hit for L1 + value = Get(1, "a"); + ASSERT_EQ(fm + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(fh + 1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); +} + TEST_F(DBTest, ParanoidFileChecks) { Options options = CurrentOptions(); options.create_if_missing = true; @@ -478,442 +559,6 @@ TEST_F(DBTest, ParanoidFileChecks) { TestGetTickerCount(options, BLOCK_CACHE_ADD)); } -namespace { -void ResetTableProperties(TableProperties* tp) { - tp->data_size = 0; - tp->index_size = 0; - tp->filter_size = 0; - tp->raw_key_size = 0; - tp->raw_value_size = 0; - tp->num_data_blocks = 0; - tp->num_entries = 0; -} - -void ParseTablePropertiesString(std::string tp_string, TableProperties* tp) { - double dummy_double; - std::replace(tp_string.begin(), tp_string.end(), ';', ' '); - std::replace(tp_string.begin(), tp_string.end(), '=', ' '); - ResetTableProperties(tp); - - sscanf(tp_string.c_str(), "# data blocks %" SCNu64 - " # entries %" SCNu64 - " raw key size %" SCNu64 - " raw average key size %lf " - " raw value size %" SCNu64 - " raw average value size %lf " - " data block size %" SCNu64 - " index block size %" SCNu64 - " filter block size %" SCNu64, - &tp->num_data_blocks, &tp->num_entries, &tp->raw_key_size, - &dummy_double, &tp->raw_value_size, &dummy_double, &tp->data_size, - &tp->index_size, &tp->filter_size); -} - -void 
VerifySimilar(uint64_t a, uint64_t b, double bias) { - ASSERT_EQ(a == 0U, b == 0U); - if (a == 0) { - return; - } - double dbl_a = static_cast(a); - double dbl_b = static_cast(b); - if (dbl_a > dbl_b) { - ASSERT_LT(static_cast(dbl_a - dbl_b) / (dbl_a + dbl_b), bias); - } else { - ASSERT_LT(static_cast(dbl_b - dbl_a) / (dbl_a + dbl_b), bias); - } -} - -void VerifyTableProperties(const TableProperties& base_tp, - const TableProperties& new_tp, - double filter_size_bias = 0.1, - double index_size_bias = 0.1, - double data_size_bias = 0.1, - double num_data_blocks_bias = 0.05) { - VerifySimilar(base_tp.data_size, new_tp.data_size, data_size_bias); - VerifySimilar(base_tp.index_size, new_tp.index_size, index_size_bias); - VerifySimilar(base_tp.filter_size, new_tp.filter_size, filter_size_bias); - VerifySimilar(base_tp.num_data_blocks, new_tp.num_data_blocks, - num_data_blocks_bias); - ASSERT_EQ(base_tp.raw_key_size, new_tp.raw_key_size); - ASSERT_EQ(base_tp.raw_value_size, new_tp.raw_value_size); - ASSERT_EQ(base_tp.num_entries, new_tp.num_entries); -} - -void GetExpectedTableProperties(TableProperties* expected_tp, - const int kKeySize, const int kValueSize, - const int kKeysPerTable, const int kTableCount, - const int kBloomBitsPerKey, - const size_t kBlockSize) { - const int kKeyCount = kTableCount * kKeysPerTable; - const int kAvgSuccessorSize = kKeySize / 2; - const int kEncodingSavePerKey = kKeySize / 4; - expected_tp->raw_key_size = kKeyCount * (kKeySize + 8); - expected_tp->raw_value_size = kKeyCount * kValueSize; - expected_tp->num_entries = kKeyCount; - expected_tp->num_data_blocks = - kTableCount * - (kKeysPerTable * (kKeySize - kEncodingSavePerKey + kValueSize)) / - kBlockSize; - expected_tp->data_size = - kTableCount * (kKeysPerTable * (kKeySize + 8 + kValueSize)); - expected_tp->index_size = - expected_tp->num_data_blocks * (kAvgSuccessorSize + 12); - expected_tp->filter_size = - kTableCount * (kKeysPerTable * kBloomBitsPerKey / 8); -} -} // namespace - 
-TEST_F(DBTest, AggregatedTableProperties) { - for (int kTableCount = 40; kTableCount <= 100; kTableCount += 30) { - const int kKeysPerTable = 100; - const int kKeySize = 80; - const int kValueSize = 200; - const int kBloomBitsPerKey = 20; - - Options options = CurrentOptions(); - options.level0_file_num_compaction_trigger = 8; - options.compression = kNoCompression; - options.create_if_missing = true; - - BlockBasedTableOptions table_options; - table_options.filter_policy.reset( - NewBloomFilterPolicy(kBloomBitsPerKey, false)); - table_options.block_size = 1024; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); - - DestroyAndReopen(options); - - Random rnd(5632); - for (int table = 1; table <= kTableCount; ++table) { - for (int i = 0; i < kKeysPerTable; ++i) { - db_->Put(WriteOptions(), RandomString(&rnd, kKeySize), - RandomString(&rnd, kValueSize)); - } - db_->Flush(FlushOptions()); - } - std::string property; - db_->GetProperty(DB::Properties::kAggregatedTableProperties, &property); - - TableProperties expected_tp; - GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, - kKeysPerTable, kTableCount, kBloomBitsPerKey, - table_options.block_size); - - TableProperties output_tp; - ParseTablePropertiesString(property, &output_tp); - - VerifyTableProperties(expected_tp, output_tp); - } -} - -TEST_F(DBTest, ReadLatencyHistogramByLevel) { - Options options = CurrentOptions(); - options.write_buffer_size = 110 << 10; - options.level0_file_num_compaction_trigger = 6; - options.num_levels = 4; - options.compression = kNoCompression; - options.max_bytes_for_level_base = 4500 << 10; - options.target_file_size_base = 98 << 10; - options.max_write_buffer_number = 2; - options.statistics = rocksdb::CreateDBStatistics(); - options.max_open_files = 100; - - BlockBasedTableOptions table_options; - table_options.no_block_cache = true; - - DestroyAndReopen(options); - int key_index = 0; - Random rnd(301); - for (int num = 0; num < 8; num++) { - 
Put("foo", "bar"); - GenerateNewFile(&rnd, &key_index); - dbfull()->TEST_WaitForCompact(); - } - dbfull()->TEST_WaitForCompact(); - - std::string prop; - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); - - // Get() after flushes, See latency histogram tracked. - for (int key = 0; key < key_index; key++) { - Get(Key(key)); - } - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); - ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); - ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram")); - ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); - - // Reopen and issue Get(). See thee latency tracked - Reopen(options); - dbfull()->TEST_WaitForCompact(); - for (int key = 0; key < key_index; key++) { - Get(Key(key)); - } - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); - ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); - ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram")); - ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); - - // Reopen and issue iterating. See thee latency tracked - Reopen(options); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); - ASSERT_EQ(std::string::npos, prop.find("** Level 0 read latency histogram")); - ASSERT_EQ(std::string::npos, prop.find("** Level 1 read latency histogram")); - ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); - { - unique_ptr iter(db_->NewIterator(ReadOptions())); - for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) { - } - } - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); - ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); - ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram")); - ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); - - // options.max_open_files preloads table readers. 
- options.max_open_files = -1; - Reopen(options); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); - ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); - ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram")); - ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); - for (int key = 0; key < key_index; key++) { - Get(Key(key)); - } - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop)); - ASSERT_NE(std::string::npos, prop.find("** Level 0 read latency histogram")); - ASSERT_NE(std::string::npos, prop.find("** Level 1 read latency histogram")); - ASSERT_EQ(std::string::npos, prop.find("** Level 2 read latency histogram")); -} - -TEST_F(DBTest, AggregatedTablePropertiesAtLevel) { - const int kTableCount = 100; - const int kKeysPerTable = 10; - const int kKeySize = 50; - const int kValueSize = 400; - const int kMaxLevel = 7; - const int kBloomBitsPerKey = 20; - Random rnd(301); - Options options = CurrentOptions(); - options.level0_file_num_compaction_trigger = 8; - options.compression = kNoCompression; - options.create_if_missing = true; - options.level0_file_num_compaction_trigger = 2; - options.target_file_size_base = 8192; - options.max_bytes_for_level_base = 10000; - options.max_bytes_for_level_multiplier = 2; - // This ensures there no compaction happening when we call GetProperty(). 
- options.disable_auto_compactions = true; - - BlockBasedTableOptions table_options; - table_options.filter_policy.reset( - NewBloomFilterPolicy(kBloomBitsPerKey, false)); - table_options.block_size = 1024; - options.table_factory.reset(new BlockBasedTableFactory(table_options)); - - DestroyAndReopen(options); - - std::string level_tp_strings[kMaxLevel]; - std::string tp_string; - TableProperties level_tps[kMaxLevel]; - TableProperties tp, sum_tp, expected_tp; - for (int table = 1; table <= kTableCount; ++table) { - for (int i = 0; i < kKeysPerTable; ++i) { - db_->Put(WriteOptions(), RandomString(&rnd, kKeySize), - RandomString(&rnd, kValueSize)); - } - db_->Flush(FlushOptions()); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); - ResetTableProperties(&sum_tp); - for (int level = 0; level < kMaxLevel; ++level) { - db_->GetProperty( - DB::Properties::kAggregatedTablePropertiesAtLevel + ToString(level), - &level_tp_strings[level]); - ParseTablePropertiesString(level_tp_strings[level], &level_tps[level]); - sum_tp.data_size += level_tps[level].data_size; - sum_tp.index_size += level_tps[level].index_size; - sum_tp.filter_size += level_tps[level].filter_size; - sum_tp.raw_key_size += level_tps[level].raw_key_size; - sum_tp.raw_value_size += level_tps[level].raw_value_size; - sum_tp.num_data_blocks += level_tps[level].num_data_blocks; - sum_tp.num_entries += level_tps[level].num_entries; - } - db_->GetProperty(DB::Properties::kAggregatedTableProperties, &tp_string); - ParseTablePropertiesString(tp_string, &tp); - ASSERT_EQ(sum_tp.data_size, tp.data_size); - ASSERT_EQ(sum_tp.index_size, tp.index_size); - ASSERT_EQ(sum_tp.filter_size, tp.filter_size); - ASSERT_EQ(sum_tp.raw_key_size, tp.raw_key_size); - ASSERT_EQ(sum_tp.raw_value_size, tp.raw_value_size); - ASSERT_EQ(sum_tp.num_data_blocks, tp.num_data_blocks); - ASSERT_EQ(sum_tp.num_entries, tp.num_entries); - if (table > 3) { - GetExpectedTableProperties(&expected_tp, kKeySize, kValueSize, - kKeysPerTable, 
table, kBloomBitsPerKey, - table_options.block_size); - // Gives larger bias here as index block size, filter block size, - // and data block size become much harder to estimate in this test. - VerifyTableProperties(tp, expected_tp, 0.5, 0.4, 0.4, 0.25); - } - } -} -#endif // ROCKSDB_LITE - -class CoutingUserTblPropCollector : public TablePropertiesCollector { - public: - const char* Name() const override { return "CoutingUserTblPropCollector"; } - - Status Finish(UserCollectedProperties* properties) override { - std::string encoded; - PutVarint32(&encoded, count_); - *properties = UserCollectedProperties{ - {"CoutingUserTblPropCollector", message_}, {"Count", encoded}, - }; - return Status::OK(); - } - - Status AddUserKey(const Slice& user_key, const Slice& value, EntryType type, - SequenceNumber seq, uint64_t file_size) override { - ++count_; - return Status::OK(); - } - - virtual UserCollectedProperties GetReadableProperties() const override { - return UserCollectedProperties{}; - } - - private: - std::string message_ = "Rocksdb"; - uint32_t count_ = 0; -}; - -class CoutingUserTblPropCollectorFactory - : public TablePropertiesCollectorFactory { - public: - explicit CoutingUserTblPropCollectorFactory( - uint32_t expected_column_family_id) - : expected_column_family_id_(expected_column_family_id), - num_created_(0) {} - virtual TablePropertiesCollector* CreateTablePropertiesCollector( - TablePropertiesCollectorFactory::Context context) override { - EXPECT_EQ(expected_column_family_id_, context.column_family_id); - num_created_++; - return new CoutingUserTblPropCollector(); - } - const char* Name() const override { - return "CoutingUserTblPropCollectorFactory"; - } - void set_expected_column_family_id(uint32_t v) { - expected_column_family_id_ = v; - } - uint32_t expected_column_family_id_; - uint32_t num_created_; -}; - -#ifndef ROCKSDB_LITE -TEST_F(DBTest, GetUserDefinedTableProperties) { - Options options = CurrentOptions(); - 
options.level0_file_num_compaction_trigger = (1<<30); - options.max_background_flushes = 0; - options.table_properties_collector_factories.resize(1); - std::shared_ptr collector_factory = - std::make_shared(0); - options.table_properties_collector_factories[0] = collector_factory; - Reopen(options); - // Create 4 tables - for (int table = 0; table < 4; ++table) { - for (int i = 0; i < 10 + table; ++i) { - db_->Put(WriteOptions(), ToString(table * 100 + i), "val"); - } - db_->Flush(FlushOptions()); - } - - TablePropertiesCollection props; - ASSERT_OK(db_->GetPropertiesOfAllTables(&props)); - ASSERT_EQ(4U, props.size()); - uint32_t sum = 0; - for (const auto& item : props) { - auto& user_collected = item.second->user_collected_properties; - ASSERT_TRUE(user_collected.find("CoutingUserTblPropCollector") != - user_collected.end()); - ASSERT_EQ(user_collected.at("CoutingUserTblPropCollector"), "Rocksdb"); - ASSERT_TRUE(user_collected.find("Count") != user_collected.end()); - Slice key(user_collected.at("Count")); - uint32_t count; - ASSERT_TRUE(GetVarint32(&key, &count)); - sum += count; - } - ASSERT_EQ(10u + 11u + 12u + 13u, sum); - - ASSERT_GT(collector_factory->num_created_, 0U); - collector_factory->num_created_ = 0; - dbfull()->TEST_CompactRange(0, nullptr, nullptr); - ASSERT_GT(collector_factory->num_created_, 0U); -} -#endif // ROCKSDB_LITE - -TEST_F(DBTest, UserDefinedTablePropertiesContext) { - Options options = CurrentOptions(); - options.level0_file_num_compaction_trigger = 3; - options.max_background_flushes = 0; - options.table_properties_collector_factories.resize(1); - std::shared_ptr collector_factory = - std::make_shared(1); - options.table_properties_collector_factories[0] = collector_factory, - CreateAndReopenWithCF({"pikachu"}, options); - // Create 2 files - for (int table = 0; table < 2; ++table) { - for (int i = 0; i < 10 + table; ++i) { - Put(1, ToString(table * 100 + i), "val"); - } - Flush(1); - } - ASSERT_GT(collector_factory->num_created_, 
0U); - - collector_factory->num_created_ = 0; - // Trigger automatic compactions. - for (int table = 0; table < 3; ++table) { - for (int i = 0; i < 10 + table; ++i) { - Put(1, ToString(table * 100 + i), "val"); - } - Flush(1); - dbfull()->TEST_WaitForCompact(); - } - ASSERT_GT(collector_factory->num_created_, 0U); - - collector_factory->num_created_ = 0; - dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]); - ASSERT_GT(collector_factory->num_created_, 0U); - - // Come back to write to default column family - collector_factory->num_created_ = 0; - collector_factory->set_expected_column_family_id(0); // default CF - // Create 4 tables in default column family - for (int table = 0; table < 2; ++table) { - for (int i = 0; i < 10 + table; ++i) { - Put(ToString(table * 100 + i), "val"); - } - Flush(); - } - ASSERT_GT(collector_factory->num_created_, 0U); - - collector_factory->num_created_ = 0; - // Trigger automatic compactions. - for (int table = 0; table < 3; ++table) { - for (int i = 0; i < 10 + table; ++i) { - Put(ToString(table * 100 + i), "val"); - } - Flush(); - dbfull()->TEST_WaitForCompact(); - } - ASSERT_GT(collector_factory->num_created_, 0U); - - collector_factory->num_created_ = 0; - dbfull()->TEST_CompactRange(0, nullptr, nullptr); - ASSERT_GT(collector_factory->num_created_, 0U); -} - -#ifndef ROCKSDB_LITE TEST_F(DBTest, LevelLimitReopen) { Options options = CurrentOptions(); CreateAndReopenWithCF({"pikachu"}, options); @@ -935,34 +580,184 @@ TEST_F(DBTest, LevelLimitReopen) { options.max_bytes_for_level_multiplier_additional.resize(10, 1); ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options)); } -#endif // ROCKSDB_LITE +#endif // ROCKSDB_LITE + +TEST_F(DBTest, PutDeleteGet) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_OK(Put(1, "foo", "v2")); + ASSERT_EQ("v2", Get(1, "foo")); + ASSERT_OK(Delete(1, "foo")); + ASSERT_EQ("NOT_FOUND", 
Get(1, "foo")); + } while (ChangeOptions()); } + +TEST_F(DBTest, PutSingleDeleteGet) { + do { + CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); + ASSERT_OK(Put(1, "foo", "v1")); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_OK(Put(1, "foo2", "v2")); + ASSERT_EQ("v2", Get(1, "foo2")); + ASSERT_OK(SingleDelete(1, "foo")); + ASSERT_EQ("NOT_FOUND", Get(1, "foo")); + // Skip HashCuckooRep as it does not support single delete. FIFO and + // universal compaction do not apply to the test case. Skip MergePut + // because single delete does not get removed when it encounters a merge. + } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction | + kSkipUniversalCompaction | kSkipMergePut)); +} + +TEST_F(DBTest, ReadFromPersistedTier) { + do { + Random rnd(301); + Options options = CurrentOptions(); + for (int disableWAL = 0; disableWAL <= 1; ++disableWAL) { + CreateAndReopenWithCF({"pikachu"}, options); + WriteOptions wopt; + wopt.disableWAL = (disableWAL == 1); + // 1st round: put but not flush + ASSERT_OK(db_->Put(wopt, handles_[1], "foo", "first")); + ASSERT_OK(db_->Put(wopt, handles_[1], "bar", "one")); + ASSERT_EQ("first", Get(1, "foo")); + ASSERT_EQ("one", Get(1, "bar")); + + // Read directly from persisted data. + ReadOptions ropt; + ropt.read_tier = kPersistedTier; + std::string value; + if (wopt.disableWAL) { + // as data has not yet been flushed, we expect not found. 
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound()); + ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound()); + } else { + ASSERT_OK(db_->Get(ropt, handles_[1], "foo", &value)); + ASSERT_OK(db_->Get(ropt, handles_[1], "bar", &value)); + } + + // Multiget + std::vector multiget_cfs; + multiget_cfs.push_back(handles_[1]); + multiget_cfs.push_back(handles_[1]); + std::vector multiget_keys; + multiget_keys.push_back("foo"); + multiget_keys.push_back("bar"); + std::vector multiget_values; + auto statuses = + db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values); + if (wopt.disableWAL) { + ASSERT_TRUE(statuses[0].IsNotFound()); + ASSERT_TRUE(statuses[1].IsNotFound()); + } else { + ASSERT_OK(statuses[0]); + ASSERT_OK(statuses[1]); + } + + // 2nd round: flush and put a new value in memtable. + ASSERT_OK(Flush(1)); + ASSERT_OK(db_->Put(wopt, handles_[1], "rocksdb", "hello")); + + // once the data has been flushed, we are able to get the + // data when kPersistedTier is used. 
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).ok()); + ASSERT_EQ(value, "first"); + ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok()); + ASSERT_EQ(value, "one"); + if (wopt.disableWAL) { + ASSERT_TRUE( + db_->Get(ropt, handles_[1], "rocksdb", &value).IsNotFound()); + } else { + ASSERT_OK(db_->Get(ropt, handles_[1], "rocksdb", &value)); + ASSERT_EQ(value, "hello"); + } + + // Expect same result in multiget + multiget_cfs.push_back(handles_[1]); + multiget_keys.push_back("rocksdb"); + statuses = + db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values); + ASSERT_TRUE(statuses[0].ok()); + ASSERT_EQ("first", multiget_values[0]); + ASSERT_TRUE(statuses[1].ok()); + ASSERT_EQ("one", multiget_values[1]); + if (wopt.disableWAL) { + ASSERT_TRUE(statuses[2].IsNotFound()); + } else { + ASSERT_OK(statuses[2]); + } + + // 3rd round: delete and flush + ASSERT_OK(db_->Delete(wopt, handles_[1], "foo")); + Flush(1); + ASSERT_OK(db_->Delete(wopt, handles_[1], "bar")); + + ASSERT_TRUE(db_->Get(ropt, handles_[1], "foo", &value).IsNotFound()); + if (wopt.disableWAL) { + // Still expect finding the value as its delete has not yet been + // flushed. 
+ ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).ok()); + ASSERT_EQ(value, "one"); + } else { + ASSERT_TRUE(db_->Get(ropt, handles_[1], "bar", &value).IsNotFound()); + } + ASSERT_TRUE(db_->Get(ropt, handles_[1], "rocksdb", &value).ok()); + ASSERT_EQ(value, "hello"); + + statuses = + db_->MultiGet(ropt, multiget_cfs, multiget_keys, &multiget_values); + ASSERT_TRUE(statuses[0].IsNotFound()); + if (wopt.disableWAL) { + ASSERT_TRUE(statuses[1].ok()); + ASSERT_EQ("one", multiget_values[1]); + } else { + ASSERT_TRUE(statuses[1].IsNotFound()); + } + ASSERT_TRUE(statuses[2].ok()); + ASSERT_EQ("hello", multiget_values[2]); + if (wopt.disableWAL == 0) { + DestroyAndReopen(options); + } + } + } while (ChangeOptions(kSkipHashCuckoo)); +} + +TEST_F(DBTest, IteratorProperty) { + // The test needs to be changed if kPersistedTier is supported in iterator. + Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + Put(1, "1", "2"); + ReadOptions ropt; + ropt.pin_data = false; + { + unique_ptr iter(db_->NewIterator(ropt, handles_[1])); + iter->SeekToFirst(); + std::string prop_value; + ASSERT_NOK(iter->GetProperty("non_existing.value", &prop_value)); + ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("0", prop_value); + iter->Next(); + ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("Iterator is not valid.", prop_value); + } + Close(); +} -TEST_F(DBTest, PutDeleteGet) { - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - ASSERT_OK(Put(1, "foo", "v1")); - ASSERT_EQ("v1", Get(1, "foo")); - ASSERT_OK(Put(1, "foo", "v2")); - ASSERT_EQ("v2", Get(1, "foo")); - ASSERT_OK(Delete(1, "foo")); - ASSERT_EQ("NOT_FOUND", Get(1, "foo")); - } while (ChangeOptions()); -} +TEST_F(DBTest, PersistedTierOnIterator) { + // The test needs to be changed if kPersistedTier is supported in iterator. 
+ Options options = CurrentOptions(); + CreateAndReopenWithCF({"pikachu"}, options); + ReadOptions ropt; + ropt.read_tier = kPersistedTier; -TEST_F(DBTest, PutSingleDeleteGet) { - do { - CreateAndReopenWithCF({"pikachu"}, CurrentOptions()); - ASSERT_OK(Put(1, "foo", "v1")); - ASSERT_EQ("v1", Get(1, "foo")); - ASSERT_OK(Put(1, "foo2", "v2")); - ASSERT_EQ("v2", Get(1, "foo2")); - ASSERT_OK(SingleDelete(1, "foo")); - ASSERT_EQ("NOT_FOUND", Get(1, "foo")); - // Skip HashCuckooRep as it does not support single delete. FIFO and - // universal compaction do not apply to the test case. Skip MergePut - // because single delete does not get removed when it encounters a merge. - } while (ChangeOptions(kSkipHashCuckoo | kSkipFIFOCompaction | - kSkipUniversalCompaction | kSkipMergePut)); + auto* iter = db_->NewIterator(ropt, handles_[1]); + ASSERT_TRUE(iter->status().IsNotSupported()); + delete iter; + + std::vector iters; + ASSERT_TRUE(db_->NewIterators(ropt, {handles_[1]}, &iters).IsNotSupported()); + Close(); } TEST_F(DBTest, SingleDeleteFlush) { @@ -1050,6 +845,61 @@ TEST_F(DBTest, EmptyFlush) { kSkipUniversalCompaction | kSkipMergePut)); } +// Disable because not all platform can run it. +// It requires more than 9GB memory to run it, With single allocation +// of more than 3GB. 
+TEST_F(DBTest, DISABLED_VeryLargeValue) { + const size_t kValueSize = 3221225472u; // 3GB value + const size_t kKeySize = 8388608u; // 8MB key + std::string raw(kValueSize, 'v'); + std::string key1(kKeySize, 'c'); + std::string key2(kKeySize, 'd'); + + Options options; + options.env = env_; + options.write_buffer_size = 100000; // Small write buffer + options.paranoid_checks = true; + options = CurrentOptions(options); + DestroyAndReopen(options); + + ASSERT_OK(Put("boo", "v1")); + ASSERT_OK(Put("foo", "v1")); + ASSERT_OK(Put(key1, raw)); + raw[0] = 'w'; + ASSERT_OK(Put(key2, raw)); + dbfull()->TEST_WaitForFlushMemTable(); + + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + + std::string value; + Status s = db_->Get(ReadOptions(), key1, &value); + ASSERT_OK(s); + ASSERT_EQ(kValueSize, value.size()); + ASSERT_EQ('v', value[0]); + + s = db_->Get(ReadOptions(), key2, &value); + ASSERT_OK(s); + ASSERT_EQ(kValueSize, value.size()); + ASSERT_EQ('w', value[0]); + + // Compact all files. + Flush(); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + + // Check DB is not in read-only state. 
+ ASSERT_OK(Put("boo", "v1")); + + s = db_->Get(ReadOptions(), key1, &value); + ASSERT_OK(s); + ASSERT_EQ(kValueSize, value.size()); + ASSERT_EQ('v', value[0]); + + s = db_->Get(ReadOptions(), key2, &value); + ASSERT_OK(s); + ASSERT_EQ(kValueSize, value.size()); + ASSERT_EQ('w', value[0]); +} + TEST_F(DBTest, GetFromImmutableLayer) { do { Options options; @@ -2282,499 +2132,79 @@ TEST_F(DBTest, IgnoreRecoveredLog) { } while (ChangeOptions(kSkipHashCuckoo)); } -TEST_F(DBTest, CheckLock) { - do { - DB* localdb; - Options options = CurrentOptions(); - ASSERT_OK(TryReopen(options)); - - // second open should fail - ASSERT_TRUE(!(DB::Open(options, dbname_, &localdb)).ok()); - } while (ChangeCompactOptions()); -} - -TEST_F(DBTest, FlushMultipleMemtable) { - do { - Options options = CurrentOptions(); - WriteOptions writeOpt = WriteOptions(); - writeOpt.disableWAL = true; - options.max_write_buffer_number = 4; - options.min_write_buffer_number_to_merge = 3; - options.max_write_buffer_number_to_maintain = -1; - CreateAndReopenWithCF({"pikachu"}, options); - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); - ASSERT_OK(Flush(1)); - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); - - ASSERT_EQ("v1", Get(1, "foo")); - ASSERT_EQ("v1", Get(1, "bar")); - ASSERT_OK(Flush(1)); - } while (ChangeCompactOptions()); -} - -#ifndef ROCKSDB_LITE -TEST_F(DBTest, NumImmutableMemTable) { - do { - Options options = CurrentOptions(); - WriteOptions writeOpt = WriteOptions(); - writeOpt.disableWAL = true; - options.max_write_buffer_number = 4; - options.min_write_buffer_number_to_merge = 3; - options.max_write_buffer_number_to_maintain = 0; - options.write_buffer_size = 1000000; - CreateAndReopenWithCF({"pikachu"}, options); - - std::string big_value(1000000 * 2, 'x'); - std::string num; - SetPerfLevel(kEnableTime);; - ASSERT_TRUE(GetPerfLevel() == kEnableTime); - - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k1", big_value)); - 
ASSERT_TRUE(dbfull()->GetProperty(handles_[1], - "rocksdb.num-immutable-mem-table", &num)); - ASSERT_EQ(num, "0"); - ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], "rocksdb.num-entries-active-mem-table", &num)); - ASSERT_EQ(num, "1"); - perf_context.Reset(); - Get(1, "k1"); - ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); - - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value)); - ASSERT_TRUE(dbfull()->GetProperty(handles_[1], - "rocksdb.num-immutable-mem-table", &num)); - ASSERT_EQ(num, "1"); - ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], "rocksdb.num-entries-active-mem-table", &num)); - ASSERT_EQ(num, "1"); - ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], "rocksdb.num-entries-imm-mem-tables", &num)); - ASSERT_EQ(num, "1"); - - perf_context.Reset(); - Get(1, "k1"); - ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); - perf_context.Reset(); - Get(1, "k2"); - ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); - - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", big_value)); - ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], "rocksdb.cur-size-active-mem-table", &num)); - ASSERT_TRUE(dbfull()->GetProperty(handles_[1], - "rocksdb.num-immutable-mem-table", &num)); - ASSERT_EQ(num, "2"); - ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], "rocksdb.num-entries-active-mem-table", &num)); - ASSERT_EQ(num, "1"); - ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], "rocksdb.num-entries-imm-mem-tables", &num)); - ASSERT_EQ(num, "2"); - perf_context.Reset(); - Get(1, "k2"); - ASSERT_EQ(2, (int) perf_context.get_from_memtable_count); - perf_context.Reset(); - Get(1, "k3"); - ASSERT_EQ(1, (int) perf_context.get_from_memtable_count); - perf_context.Reset(); - Get(1, "k1"); - ASSERT_EQ(3, (int) perf_context.get_from_memtable_count); - - ASSERT_OK(Flush(1)); - ASSERT_TRUE(dbfull()->GetProperty(handles_[1], - "rocksdb.num-immutable-mem-table", &num)); - ASSERT_EQ(num, "0"); - ASSERT_TRUE(dbfull()->GetProperty( - handles_[1], 
"rocksdb.cur-size-active-mem-table", &num)); - // "192" is the size of the metadata of an empty skiplist, this would - // break if we change the default skiplist implementation - ASSERT_EQ(num, "192"); - - uint64_t int_num; - uint64_t base_total_size; - ASSERT_TRUE(dbfull()->GetIntProperty( - handles_[1], "rocksdb.estimate-num-keys", &base_total_size)); - - ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k2")); - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k3", "")); - ASSERT_OK(dbfull()->Delete(writeOpt, handles_[1], "k3")); - ASSERT_TRUE(dbfull()->GetIntProperty( - handles_[1], "rocksdb.num-deletes-active-mem-table", &int_num)); - ASSERT_EQ(int_num, 2U); - ASSERT_TRUE(dbfull()->GetIntProperty( - handles_[1], "rocksdb.num-entries-active-mem-table", &int_num)); - ASSERT_EQ(int_num, 3U); - - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value)); - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "k2", big_value)); - ASSERT_TRUE(dbfull()->GetIntProperty( - handles_[1], "rocksdb.num-entries-imm-mem-tables", &int_num)); - ASSERT_EQ(int_num, 4U); - ASSERT_TRUE(dbfull()->GetIntProperty( - handles_[1], "rocksdb.num-deletes-imm-mem-tables", &int_num)); - ASSERT_EQ(int_num, 2U); - - ASSERT_TRUE(dbfull()->GetIntProperty( - handles_[1], "rocksdb.estimate-num-keys", &int_num)); - ASSERT_EQ(int_num, base_total_size + 1); - - SetPerfLevel(kDisable); - ASSERT_TRUE(GetPerfLevel() == kDisable); - } while (ChangeCompactOptions()); -} -#endif // ROCKSDB_LITE - -TEST_F(DBTest, FlushEmptyColumnFamily) { - // Block flush thread and disable compaction thread - env_->SetBackgroundThreads(1, Env::HIGH); - env_->SetBackgroundThreads(1, Env::LOW); - test::SleepingBackgroundTask sleeping_task_low; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - test::SleepingBackgroundTask sleeping_task_high; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, - &sleeping_task_high, Env::Priority::HIGH); - - Options options = 
CurrentOptions(); - // disable compaction - options.disable_auto_compactions = true; - WriteOptions writeOpt = WriteOptions(); - writeOpt.disableWAL = true; - options.max_write_buffer_number = 2; - options.min_write_buffer_number_to_merge = 1; - options.max_write_buffer_number_to_maintain = 1; - CreateAndReopenWithCF({"pikachu"}, options); - - // Compaction can still go through even if no thread can flush the - // mem table. - ASSERT_OK(Flush(0)); - ASSERT_OK(Flush(1)); - - // Insert can go through - ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1")); - ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); - - ASSERT_EQ("v1", Get(0, "foo")); - ASSERT_EQ("v1", Get(1, "bar")); - - sleeping_task_high.WakeUp(); - sleeping_task_high.WaitUntilDone(); - - // Flush can still go through. - ASSERT_OK(Flush(0)); - ASSERT_OK(Flush(1)); - - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilDone(); -} - -#ifndef ROCKSDB_LITE -TEST_F(DBTest, GetProperty) { - // Set sizes to both background thread pool to be 1 and block them. 
- env_->SetBackgroundThreads(1, Env::HIGH); - env_->SetBackgroundThreads(1, Env::LOW); - test::SleepingBackgroundTask sleeping_task_low; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, - Env::Priority::LOW); - test::SleepingBackgroundTask sleeping_task_high; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, - &sleeping_task_high, Env::Priority::HIGH); - - Options options = CurrentOptions(); - WriteOptions writeOpt = WriteOptions(); - writeOpt.disableWAL = true; - options.compaction_style = kCompactionStyleUniversal; - options.level0_file_num_compaction_trigger = 1; - options.compaction_options_universal.size_ratio = 50; - options.max_background_compactions = 1; - options.max_background_flushes = 1; - options.max_write_buffer_number = 10; - options.min_write_buffer_number_to_merge = 1; - options.max_write_buffer_number_to_maintain = 0; - options.write_buffer_size = 1000000; - Reopen(options); - - std::string big_value(1000000 * 2, 'x'); - std::string num; - uint64_t int_num; - SetPerfLevel(kEnableTime); - - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); - ASSERT_EQ(int_num, 0U); - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.estimate-live-data-size", &int_num)); - ASSERT_EQ(int_num, 0U); - - ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); - ASSERT_EQ(num, "0"); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num)); - ASSERT_EQ(num, "0"); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); - ASSERT_EQ(num, "0"); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num)); - ASSERT_EQ(num, "1"); - perf_context.Reset(); - - ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); - ASSERT_EQ(num, "1"); - ASSERT_OK(dbfull()->Delete(writeOpt, "k-non-existing")); - 
ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.num-immutable-mem-table", &num)); - ASSERT_EQ(num, "2"); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num)); - ASSERT_EQ(num, "1"); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); - ASSERT_EQ(num, "0"); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num)); - ASSERT_EQ(num, "2"); - // Verify the same set of properties through GetIntProperty - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.num-immutable-mem-table", &int_num)); - ASSERT_EQ(int_num, 2U); - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.mem-table-flush-pending", &int_num)); - ASSERT_EQ(int_num, 1U); - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.compaction-pending", &int_num)); - ASSERT_EQ(int_num, 0U); - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num)); - ASSERT_EQ(int_num, 2U); - - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); - ASSERT_EQ(int_num, 0U); - - sleeping_task_high.WakeUp(); - sleeping_task_high.WaitUntilDone(); - dbfull()->TEST_WaitForFlushMemTable(); - - ASSERT_OK(dbfull()->Put(writeOpt, "k4", big_value)); - ASSERT_OK(dbfull()->Put(writeOpt, "k5", big_value)); - dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.mem-table-flush-pending", &num)); - ASSERT_EQ(num, "0"); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.compaction-pending", &num)); - ASSERT_EQ(num, "1"); - ASSERT_TRUE(dbfull()->GetProperty("rocksdb.estimate-num-keys", &num)); - ASSERT_EQ(num, "4"); - - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); - ASSERT_GT(int_num, 0U); - - sleeping_task_low.WakeUp(); - sleeping_task_low.WaitUntilDone(); - - // Wait for compaction to be done. 
This is important because otherwise RocksDB - // might schedule a compaction when reopening the database, failing assertion - // (A) as a result. - dbfull()->TEST_WaitForCompact(); - options.max_open_files = 10; - Reopen(options); - // After reopening, no table reader is loaded, so no memory for table readers - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); - ASSERT_EQ(int_num, 0U); // (A) - ASSERT_TRUE(dbfull()->GetIntProperty("rocksdb.estimate-num-keys", &int_num)); - ASSERT_GT(int_num, 0U); - - // After reading a key, at least one table reader is loaded. - Get("k5"); - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.estimate-table-readers-mem", &int_num)); - ASSERT_GT(int_num, 0U); - - // Test rocksdb.num-live-versions - { - options.level0_file_num_compaction_trigger = 20; - Reopen(options); - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); - ASSERT_EQ(int_num, 1U); - - // Use an iterator to hold current version - std::unique_ptr iter1(dbfull()->NewIterator(ReadOptions())); - - ASSERT_OK(dbfull()->Put(writeOpt, "k6", big_value)); - Flush(); - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); - ASSERT_EQ(int_num, 2U); - - // Use an iterator to hold current version - std::unique_ptr iter2(dbfull()->NewIterator(ReadOptions())); - - ASSERT_OK(dbfull()->Put(writeOpt, "k7", big_value)); - Flush(); - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); - ASSERT_EQ(int_num, 3U); - - iter2.reset(); - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); - ASSERT_EQ(int_num, 2U); - - iter1.reset(); - ASSERT_TRUE( - dbfull()->GetIntProperty("rocksdb.num-live-versions", &int_num)); - ASSERT_EQ(int_num, 1U); - } -} - -TEST_F(DBTest, ApproximateMemoryUsage) { - const int kNumRounds = 10; - // TODO(noetzli) kFlushesPerRound does not really correlate with how many - // flushes happen. 
- const int kFlushesPerRound = 10; - const int kWritesPerFlush = 10; - const int kKeySize = 100; - const int kValueSize = 1000; - Options options; - options.write_buffer_size = 1000; // small write buffer - options.min_write_buffer_number_to_merge = 4; - options.compression = kNoCompression; - options.create_if_missing = true; - options = CurrentOptions(options); - DestroyAndReopen(options); - - Random rnd(301); - - std::vector iters; - - uint64_t active_mem; - uint64_t unflushed_mem; - uint64_t all_mem; - uint64_t prev_all_mem; - - // Phase 0. The verify the initial value of all these properties are the same - // as we have no mem-tables. - dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem); - dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); - dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); - ASSERT_EQ(all_mem, active_mem); - ASSERT_EQ(all_mem, unflushed_mem); - - // Phase 1. Simply issue Put() and expect "cur-size-all-mem-tables" equals to - // "size-all-mem-tables" - for (int r = 0; r < kNumRounds; ++r) { - for (int f = 0; f < kFlushesPerRound; ++f) { - for (int w = 0; w < kWritesPerFlush; ++w) { - Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize)); - } - } - // Make sure that there is no flush between getting the two properties. - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); - dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); - // in no iterator case, these two number should be the same. - ASSERT_EQ(unflushed_mem, all_mem); - } - prev_all_mem = all_mem; - - // Phase 2. Keep issuing Put() but also create new iterators. This time we - // expect "size-all-mem-tables" > "cur-size-all-mem-tables". 
- for (int r = 0; r < kNumRounds; ++r) { - iters.push_back(db_->NewIterator(ReadOptions())); - for (int f = 0; f < kFlushesPerRound; ++f) { - for (int w = 0; w < kWritesPerFlush; ++w) { - Put(RandomString(&rnd, kKeySize), RandomString(&rnd, kValueSize)); - } - } - // Force flush to prevent flush from happening between getting the - // properties or after getting the properties and before the new round. - Flush(); - - // In the second round, add iterators. - dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem); - dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); - dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); - ASSERT_GT(all_mem, active_mem); - ASSERT_GT(all_mem, unflushed_mem); - ASSERT_GT(all_mem, prev_all_mem); - prev_all_mem = all_mem; - } +TEST_F(DBTest, CheckLock) { + do { + DB* localdb; + Options options = CurrentOptions(); + ASSERT_OK(TryReopen(options)); - // Phase 3. Delete iterators and expect "size-all-mem-tables" shrinks - // whenever we release an iterator. - for (auto* iter : iters) { - delete iter; - dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); - // Expect the size shrinking - ASSERT_LT(all_mem, prev_all_mem); - prev_all_mem = all_mem; - } + // second open should fail + ASSERT_TRUE(!(DB::Open(options, dbname_, &localdb)).ok()); + } while (ChangeCompactOptions()); +} - // Expect all these three counters to be the same. 
- dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem); - dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); - dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); - ASSERT_EQ(active_mem, unflushed_mem); - ASSERT_EQ(unflushed_mem, all_mem); +TEST_F(DBTest, FlushMultipleMemtable) { + do { + Options options = CurrentOptions(); + WriteOptions writeOpt = WriteOptions(); + writeOpt.disableWAL = true; + options.max_write_buffer_number = 4; + options.min_write_buffer_number_to_merge = 3; + options.max_write_buffer_number_to_maintain = -1; + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "foo", "v1")); + ASSERT_OK(Flush(1)); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); - // Phase 5. Reopen, and expect all these three counters to be the same again. - Reopen(options); - dbfull()->GetIntProperty("rocksdb.cur-size-active-mem-table", &active_mem); - dbfull()->GetIntProperty("rocksdb.cur-size-all-mem-tables", &unflushed_mem); - dbfull()->GetIntProperty("rocksdb.size-all-mem-tables", &all_mem); - ASSERT_EQ(active_mem, unflushed_mem); - ASSERT_EQ(unflushed_mem, all_mem); + ASSERT_EQ("v1", Get(1, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); + ASSERT_OK(Flush(1)); + } while (ChangeCompactOptions()); } -TEST_F(DBTest, EstimatePendingCompBytes) { - // Set sizes to both background thread pool to be 1 and block them. 
+TEST_F(DBTest, FlushEmptyColumnFamily) { + // Block flush thread and disable compaction thread env_->SetBackgroundThreads(1, Env::HIGH); env_->SetBackgroundThreads(1, Env::LOW); test::SleepingBackgroundTask sleeping_task_low; env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, Env::Priority::LOW); + test::SleepingBackgroundTask sleeping_task_high; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_task_high, Env::Priority::HIGH); Options options = CurrentOptions(); + // disable compaction + options.disable_auto_compactions = true; WriteOptions writeOpt = WriteOptions(); writeOpt.disableWAL = true; - options.compaction_style = kCompactionStyleLevel; - options.level0_file_num_compaction_trigger = 2; - options.max_background_compactions = 1; - options.max_background_flushes = 1; - options.max_write_buffer_number = 10; + options.max_write_buffer_number = 2; options.min_write_buffer_number_to_merge = 1; - options.max_write_buffer_number_to_maintain = 0; - options.write_buffer_size = 1000000; - Reopen(options); + options.max_write_buffer_number_to_maintain = 1; + CreateAndReopenWithCF({"pikachu"}, options); - std::string big_value(1000000 * 2, 'x'); - std::string num; - uint64_t int_num; + // Compaction can still go through even if no thread can flush the + // mem table. 
+ ASSERT_OK(Flush(0)); + ASSERT_OK(Flush(1)); - ASSERT_OK(dbfull()->Put(writeOpt, "k1", big_value)); - Flush(); - ASSERT_TRUE(dbfull()->GetIntProperty( - "rocksdb.estimate-pending-compaction-bytes", &int_num)); - ASSERT_EQ(int_num, 0U); + // Insert can go through + ASSERT_OK(dbfull()->Put(writeOpt, handles_[0], "foo", "v1")); + ASSERT_OK(dbfull()->Put(writeOpt, handles_[1], "bar", "v1")); - ASSERT_OK(dbfull()->Put(writeOpt, "k2", big_value)); - Flush(); - ASSERT_TRUE(dbfull()->GetIntProperty( - "rocksdb.estimate-pending-compaction-bytes", &int_num)); - ASSERT_EQ(int_num, 0U); + ASSERT_EQ("v1", Get(0, "foo")); + ASSERT_EQ("v1", Get(1, "bar")); - ASSERT_OK(dbfull()->Put(writeOpt, "k3", big_value)); - Flush(); - ASSERT_TRUE(dbfull()->GetIntProperty( - "rocksdb.estimate-pending-compaction-bytes", &int_num)); - ASSERT_GT(int_num, 0U); + sleeping_task_high.WakeUp(); + sleeping_task_high.WaitUntilDone(); + + // Flush can still go through. + ASSERT_OK(Flush(0)); + ASSERT_OK(Flush(1)); sleeping_task_low.WakeUp(); sleeping_task_low.WaitUntilDone(); - - dbfull()->TEST_WaitForCompact(); - ASSERT_TRUE(dbfull()->GetIntProperty( - "rocksdb.estimate-pending-compaction-bytes", &int_num)); - ASSERT_EQ(int_num, 0U); } -#endif // ROCKSDB_LITE TEST_F(DBTest, FLUSH) { do { @@ -4258,6 +3688,7 @@ TEST_F(DBTest, ManifestWriteError) { options.env = env_; options.create_if_missing = true; options.error_if_exists = false; + options.paranoid_checks = true; DestroyAndReopen(options); ASSERT_OK(Put("foo", "bar")); ASSERT_EQ("bar", Get("foo")); @@ -4274,10 +3705,33 @@ TEST_F(DBTest, ManifestWriteError) { dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail ASSERT_EQ("bar", Get("foo")); + error_type->store(false, std::memory_order_release); + + // Since paranoid_checks=true, writes should fail + ASSERT_NOK(Put("foo2", "bar2")); + + // Recovery: should not lose data + ASSERT_EQ("bar", Get("foo")); + + // Try again with paranoid_checks=false + Close(); + options.paranoid_checks = 
false; + Reopen(options); + + // Merging compaction (will fail) + error_type->store(true, std::memory_order_release); + dbfull()->TEST_CompactRange(last, nullptr, nullptr); // Should fail + ASSERT_EQ("bar", Get("foo")); + // Recovery: should not lose data error_type->store(false, std::memory_order_release); Reopen(options); ASSERT_EQ("bar", Get("foo")); + + // Since paranoid_checks=false, writes should succeed + ASSERT_OK(Put("foo2", "bar2")); + ASSERT_EQ("bar", Get("foo")); + ASSERT_EQ("bar2", Get("foo2")); } } #endif // ROCKSDB_LITE @@ -6042,12 +5496,38 @@ static bool CompareIterators(int step, return ok; } -TEST_F(DBTest, Randomized) { +class DBTestRandomized : public DBTest, + public ::testing::WithParamInterface { + public: + virtual void SetUp() override { option_config_ = GetParam(); } + + static std::vector GenerateOptionConfigs() { + std::vector option_configs; + // skip cuckoo hash as it does not support snapshot. + for (int option_config = kDefault; option_config < kEnd; ++option_config) { + if (!ShouldSkipOptions(option_config, kSkipDeletesFilterFirst | + kSkipNoSeekToLast | + kSkipHashCuckoo)) { + option_configs.push_back(option_config); + } + } + option_configs.push_back(kBlockBasedTableWithIndexRestartInterval); + return option_configs; + } +}; + +INSTANTIATE_TEST_CASE_P( + DBTestRandomized, DBTestRandomized, + ::testing::ValuesIn(DBTestRandomized::GenerateOptionConfigs())); + +TEST_P(DBTestRandomized, Randomized) { anon::OptionsOverride options_override; options_override.skip_policy = kSkipNoSnapshot; - Random rnd(test::RandomSeed()); - do { - ModelDB model(CurrentOptions(options_override)); + Options options = CurrentOptions(options_override); + DestroyAndReopen(options); + + Random rnd(test::RandomSeed() + GetParam()); + ModelDB model(options); const int N = 10000; const Snapshot* model_snap = nullptr; const Snapshot* db_snap = nullptr; @@ -6072,13 +5552,10 @@ TEST_F(DBTest, Randomized) { : rnd.Uniform(8)); ASSERT_OK(model.Put(WriteOptions(), k, 
v)); ASSERT_OK(db_->Put(WriteOptions(), k, v)); - } else if (p < 90) { // Delete k = RandomKey(&rnd, minimum); ASSERT_OK(model.Delete(WriteOptions(), k)); ASSERT_OK(db_->Delete(WriteOptions(), k)); - - } else { // Multi-element batch WriteBatch b; const int num = rnd.Uniform(8); @@ -6116,26 +5593,15 @@ TEST_F(DBTest, Randomized) { if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); - - auto options = CurrentOptions(options_override); Reopen(options); ASSERT_TRUE(CompareIterators(step, &model, db_, nullptr, nullptr)); model_snap = model.GetSnapshot(); db_snap = db_->GetSnapshot(); } - - if ((step % 2000) == 0) { - fprintf(stderr, - "DBTest.Randomized, option ID: %d, step: %d out of %d\n", - option_config_, step, N); - } } if (model_snap != nullptr) model.ReleaseSnapshot(model_snap); if (db_snap != nullptr) db_->ReleaseSnapshot(db_snap); - // skip cuckoo hash as it does not support snapshot. - } while (ChangeOptions(kSkipDeletesFilterFirst | kSkipNoSeekToLast | - kSkipHashCuckoo)); } TEST_F(DBTest, MultiGetSimple) { @@ -6511,6 +5977,28 @@ TEST_F(DBTest, TableOptionsSanitizeTest) { options.prefix_extractor.reset(NewFixedPrefixTransform(1)); ASSERT_OK(TryReopen(options)); } + +TEST_F(DBTest, ConcurrentMemtableNotSupported) { + Options options = CurrentOptions(); + options.allow_concurrent_memtable_write = true; + options.soft_pending_compaction_bytes_limit = 0; + options.hard_pending_compaction_bytes_limit = 100; + options.create_if_missing = true; + + DestroyDB(dbname_, options); + options.memtable_factory.reset(NewHashLinkListRepFactory(4, 0, 3, true, 4)); + ASSERT_NOK(TryReopen(options)); + + options.memtable_factory.reset(new SkipListFactory); + ASSERT_OK(TryReopen(options)); + + ColumnFamilyOptions cf_options(options); + cf_options.memtable_factory.reset( + NewHashLinkListRepFactory(4, 0, 3, true, 4)); + ColumnFamilyHandle* handle; + ASSERT_NOK(db_->CreateColumnFamily(cf_options, "name", &handle)); 
+} + #endif // ROCKSDB_LITE TEST_F(DBTest, SanitizeNumThreads) { @@ -8235,10 +7723,24 @@ TEST_F(DBTest, EncodeDecompressedBlockSizeTest) { } } +TEST_F(DBTest, MutexWaitStatsDisabledByDefault) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + CreateAndReopenWithCF({"pikachu"}, options); + const uint64_t kMutexWaitDelay = 100; + ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, + kMutexWaitDelay); + ASSERT_OK(Put("hello", "rocksdb")); + ASSERT_EQ(TestGetTickerCount(options, DB_MUTEX_WAIT_MICROS), 0); + ThreadStatusUtil::TEST_SetStateDelay(ThreadStatus::STATE_MUTEX_WAIT, 0); +} + TEST_F(DBTest, MutexWaitStats) { Options options = CurrentOptions(); options.create_if_missing = true; options.statistics = rocksdb::CreateDBStatistics(); + options.statistics->stats_level_ = StatsLevel::kAll; CreateAndReopenWithCF({"pikachu"}, options); const uint64_t kMutexWaitDelay = 100; ThreadStatusUtil::TEST_SetStateDelay( @@ -8486,245 +7988,74 @@ TEST_P(DBTestWithParam, MergeCompactionTimeTest) { ASSERT_OK(db_->Merge(WriteOptions(), "foo", "TEST")); ASSERT_OK(Flush()); } - dbfull()->TEST_WaitForFlushMemTable(); - dbfull()->TEST_WaitForCompact(); - - ASSERT_NE(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0); -} - -TEST_P(DBTestWithParam, FilterCompactionTimeTest) { - Options options; - options.compaction_filter_factory = - std::make_shared(this); - options.disable_auto_compactions = true; - options.create_if_missing = true; - options.statistics = rocksdb::CreateDBStatistics(); - options.max_subcompactions = max_subcompactions_; - options = CurrentOptions(options); - DestroyAndReopen(options); - - // put some data - for (int table = 0; table < 4; ++table) { - for (int i = 0; i < 10 + table; ++i) { - Put(ToString(table * 100 + i), "val"); - } - Flush(); - } - - CompactRangeOptions cro; - cro.exclusive_manual_compaction = exclusive_manual_compaction_; - 
ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); - ASSERT_EQ(0U, CountLiveFiles()); - - Reopen(options); - - Iterator* itr = db_->NewIterator(ReadOptions()); - itr->SeekToFirst(); - ASSERT_NE(TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME), 0); - delete itr; -} -#endif // ROCKSDB_LITE - -TEST_F(DBTest, TestLogCleanup) { - Options options = CurrentOptions(); - options.write_buffer_size = 64 * 1024; // very small - // only two memtables allowed ==> only two log files - options.max_write_buffer_number = 2; - Reopen(options); - - for (int i = 0; i < 100000; ++i) { - Put(Key(i), "val"); - // only 2 memtables will be alive, so logs_to_free needs to always be below - // 2 - ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast(3)); - } -} - -#ifndef ROCKSDB_LITE -TEST_F(DBTest, EmptyCompactedDB) { - Options options; - options.max_open_files = -1; - options = CurrentOptions(options); - Close(); - ASSERT_OK(ReadOnlyReopen(options)); - Status s = Put("new", "value"); - ASSERT_TRUE(s.IsNotSupported()); - Close(); -} -#endif // ROCKSDB_LITE - -class CountingDeleteTabPropCollector : public TablePropertiesCollector { - public: - const char* Name() const override { return "CountingDeleteTabPropCollector"; } - - Status AddUserKey(const Slice& user_key, const Slice& value, EntryType type, - SequenceNumber seq, uint64_t file_size) override { - if (type == kEntryDelete) { - num_deletes_++; - } - return Status::OK(); - } - - bool NeedCompact() const override { return num_deletes_ > 10; } - - UserCollectedProperties GetReadableProperties() const override { - return UserCollectedProperties{}; - } - - Status Finish(UserCollectedProperties* properties) override { - *properties = - UserCollectedProperties{{"num_delete", ToString(num_deletes_)}}; - return Status::OK(); - } - - private: - uint32_t num_deletes_ = 0; -}; - -class CountingDeleteTabPropCollectorFactory - : public TablePropertiesCollectorFactory { - public: - virtual TablePropertiesCollector* 
CreateTablePropertiesCollector( - TablePropertiesCollectorFactory::Context context) override { - return new CountingDeleteTabPropCollector(); - } - const char* Name() const override { - return "CountingDeleteTabPropCollectorFactory"; - } -}; - -#ifndef ROCKSDB_LITE -TEST_F(DBTest, TablePropertiesNeedCompactTest) { - Random rnd(301); - - Options options; - options.create_if_missing = true; - options.write_buffer_size = 4096; - options.max_write_buffer_number = 8; - options.level0_file_num_compaction_trigger = 2; - options.level0_slowdown_writes_trigger = 2; - options.level0_stop_writes_trigger = 4; - options.target_file_size_base = 2048; - options.max_bytes_for_level_base = 10240; - options.max_bytes_for_level_multiplier = 4; - options.soft_pending_compaction_bytes_limit = 1024 * 1024; - options.num_levels = 8; - - std::shared_ptr collector_factory = - std::make_shared(); - options.table_properties_collector_factories.resize(1); - options.table_properties_collector_factories[0] = collector_factory; - - DestroyAndReopen(options); - - const int kMaxKey = 1000; - for (int i = 0; i < kMaxKey; i++) { - ASSERT_OK(Put(Key(i), RandomString(&rnd, 102))); - ASSERT_OK(Put(Key(kMaxKey + i), RandomString(&rnd, 102))); - } - Flush(); - dbfull()->TEST_WaitForCompact(); - if (NumTableFilesAtLevel(0) == 1) { - // Clear Level 0 so that when later flush a file with deletions, - // we don't trigger an organic compaction. 
- ASSERT_OK(Put(Key(0), "")); - ASSERT_OK(Put(Key(kMaxKey * 2), "")); - Flush(); - dbfull()->TEST_WaitForCompact(); - } - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - - { - int c = 0; - std::unique_ptr iter(db_->NewIterator(ReadOptions())); - iter->Seek(Key(kMaxKey - 100)); - while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) { - iter->Next(); - ++c; - } - ASSERT_EQ(c, 200); - } - - Delete(Key(0)); - for (int i = kMaxKey - 100; i < kMaxKey + 100; i++) { - Delete(Key(i)); - } - Delete(Key(kMaxKey * 2)); - - Flush(); - dbfull()->TEST_WaitForCompact(); - - { - SetPerfLevel(kEnableCount); - perf_context.Reset(); - int c = 0; - std::unique_ptr iter(db_->NewIterator(ReadOptions())); - iter->Seek(Key(kMaxKey - 100)); - while (iter->Valid() && iter->key().compare(Key(kMaxKey + 100)) < 0) { - iter->Next(); - } - ASSERT_EQ(c, 0); - ASSERT_LT(perf_context.internal_delete_skipped_count, 30u); - ASSERT_LT(perf_context.internal_key_skipped_count, 30u); - SetPerfLevel(kDisable); - } -} + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); -TEST_F(DBTest, NeedCompactHintPersistentTest) { - Random rnd(301); + ASSERT_NE(TestGetTickerCount(options, MERGE_OPERATION_TOTAL_TIME), 0); +} +TEST_P(DBTestWithParam, FilterCompactionTimeTest) { Options options; - options.create_if_missing = true; - options.max_write_buffer_number = 8; - options.level0_file_num_compaction_trigger = 10; - options.level0_slowdown_writes_trigger = 10; - options.level0_stop_writes_trigger = 10; + options.compaction_filter_factory = + std::make_shared(this); options.disable_auto_compactions = true; - - std::shared_ptr collector_factory = - std::make_shared(); - options.table_properties_collector_factories.resize(1); - options.table_properties_collector_factories[0] = collector_factory; - + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + options.max_subcompactions = max_subcompactions_; + options = CurrentOptions(options); 
DestroyAndReopen(options); - const int kMaxKey = 100; - for (int i = 0; i < kMaxKey; i++) { - ASSERT_OK(Put(Key(i), "")); + // put some data + for (int table = 0; table < 4; ++table) { + for (int i = 0; i < 10 + table; ++i) { + Put(ToString(table * 100 + i), "val"); + } + Flush(); } - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); - for (int i = 1; i < kMaxKey - 1; i++) { - Delete(Key(i)); - } - Flush(); - dbfull()->TEST_WaitForFlushMemTable(); - ASSERT_EQ(NumTableFilesAtLevel(0), 2); + CompactRangeOptions cro; + cro.exclusive_manual_compaction = exclusive_manual_compaction_; + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + ASSERT_EQ(0U, CountLiveFiles()); - // Restart the DB. Although number of files didn't reach - // options.level0_file_num_compaction_trigger, compaction should - // still be triggered because of the need-compaction hint. - options.disable_auto_compactions = false; Reopen(options); - dbfull()->TEST_WaitForCompact(); - ASSERT_EQ(NumTableFilesAtLevel(0), 0); - { - SetPerfLevel(kEnableCount); - perf_context.Reset(); - int c = 0; - std::unique_ptr iter(db_->NewIterator(ReadOptions())); - for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) { - c++; - } - ASSERT_EQ(c, 2); - ASSERT_EQ(perf_context.internal_delete_skipped_count, 0); - // We iterate every key twice. Is it a bug? 
- ASSERT_LE(perf_context.internal_key_skipped_count, 2); - SetPerfLevel(kDisable); + + Iterator* itr = db_->NewIterator(ReadOptions()); + itr->SeekToFirst(); + ASSERT_NE(TestGetTickerCount(options, FILTER_OPERATION_TOTAL_TIME), 0); + delete itr; +} +#endif // ROCKSDB_LITE + +TEST_F(DBTest, TestLogCleanup) { + Options options = CurrentOptions(); + options.write_buffer_size = 64 * 1024; // very small + // only two memtables allowed ==> only two log files + options.max_write_buffer_number = 2; + Reopen(options); + + for (int i = 0; i < 100000; ++i) { + Put(Key(i), "val"); + // only 2 memtables will be alive, so logs_to_free needs to always be below + // 2 + ASSERT_LT(dbfull()->TEST_LogsToFreeSize(), static_cast(3)); } } +#ifndef ROCKSDB_LITE +TEST_F(DBTest, EmptyCompactedDB) { + Options options; + options.max_open_files = -1; + options = CurrentOptions(options); + Close(); + ASSERT_OK(ReadOnlyReopen(options)); + Status s = Put("new", "value"); + ASSERT_TRUE(s.IsNotSupported()); + Close(); +} +#endif // ROCKSDB_LITE + +#ifndef ROCKSDB_LITE TEST_F(DBTest, SuggestCompactRangeTest) { class CompactionFilterFactoryGetContext : public CompactionFilterFactory { public: @@ -9417,16 +8748,79 @@ TEST_F(DBTest, DeletingOldWalAfterDrop) { EXPECT_GT(lognum2, lognum1); } +TEST_F(DBTest, DBWithSstFileManager) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + + int files_added = 0; + int files_deleted = 0; + int files_moved = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnAddFile", [&](void* arg) { files_added++; }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnDeleteFile", [&](void* arg) { files_deleted++; }); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "SstFileManagerImpl::OnMoveFile", [&](void* arg) { files_moved++; }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + Options options = CurrentOptions(); + options.sst_file_manager = 
sst_file_manager; + DestroyAndReopen(options); + + Random rnd(301); + for (int i = 0; i < 25; i++) { + GenerateNewRandomFile(&rnd); + ASSERT_OK(Flush()); + dbfull()->TEST_WaitForFlushMemTable(); + dbfull()->TEST_WaitForCompact(); + // Verify that we are tracking all sst files in dbname_ + ASSERT_EQ(sfm->GetTrackedFiles(), GetAllSSTFiles()); + } + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + auto files_in_db = GetAllSSTFiles(); + // Verify that we are tracking all sst files in dbname_ + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + // Verify the total files size + uint64_t total_files_size = 0; + for (auto& file_to_size : files_in_db) { + total_files_size += file_to_size.second; + } + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + // We flushed at least 25 files + ASSERT_GE(files_added, 25); + // Compaction must have deleted some files + ASSERT_GT(files_deleted, 0); + // No files were moved + ASSERT_EQ(files_moved, 0); + + Close(); + Reopen(options); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + // Verify that we track all the files again after the DB is closed and opened + Close(); + sst_file_manager.reset(NewSstFileManager(env_)); + options.sst_file_manager = sst_file_manager; + sfm = static_cast(sst_file_manager.get()); + + Reopen(options); + ASSERT_EQ(sfm->GetTrackedFiles(), files_in_db); + ASSERT_EQ(sfm->GetTotalSize(), total_files_size); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + #ifndef ROCKSDB_LITE TEST_F(DBTest, RateLimitedDelete) { rocksdb::SyncPoint::GetInstance()->LoadDependency({ - {"DBTest::RateLimitedDelete:1", - "DeleteSchedulerImpl::BackgroundEmptyTrash"}, + {"DBTest::RateLimitedDelete:1", "DeleteScheduler::BackgroundEmptyTrash"}, }); std::vector penalties; rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DeleteSchedulerImpl::BackgroundEmptyTrash:Wait", + "DeleteScheduler::BackgroundEmptyTrash:Wait", [&](void* arg) { 
penalties.push_back(*(static_cast(arg))); }); rocksdb::SyncPoint::GetInstance()->DisableProcessing(); @@ -9437,9 +8831,10 @@ TEST_F(DBTest, RateLimitedDelete) { std::string trash_dir = test::TmpDir(env_) + "/trash"; int64_t rate_bytes_per_sec = 1024 * 10; // 10 Kbs / Sec Status s; - options.delete_scheduler.reset(NewDeleteScheduler( - env_, trash_dir, rate_bytes_per_sec, nullptr, false, &s)); + options.sst_file_manager.reset(NewSstFileManager( + env_, nullptr, trash_dir, rate_bytes_per_sec, false, &s)); ASSERT_OK(s); + auto sfm = static_cast(options.sst_file_manager.get()); Destroy(last_options_); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); @@ -9466,7 +8861,7 @@ TEST_F(DBTest, RateLimitedDelete) { uint64_t delete_start_time = env_->NowMicros(); // Hold BackgroundEmptyTrash TEST_SYNC_POINT("DBTest::RateLimitedDelete:1"); - options.delete_scheduler->WaitForEmptyTrash(); + sfm->WaitForEmptyTrash(); uint64_t time_spent_deleting = env_->NowMicros() - delete_start_time; uint64_t total_files_size = 0; @@ -9489,7 +8884,7 @@ TEST_F(DBTest, RateLimitedDelete) { TEST_F(DBTest, DeleteSchedulerMultipleDBPaths) { int bg_delete_file = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DeleteSchedulerImpl::DeleteTrashFile:DeleteFile", + "DeleteScheduler::DeleteTrashFile:DeleteFile", [&](void* arg) { bg_delete_file++; }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); @@ -9502,9 +8897,10 @@ TEST_F(DBTest, DeleteSchedulerMultipleDBPaths) { std::string trash_dir = test::TmpDir(env_) + "/trash"; int64_t rate_bytes_per_sec = 1024 * 1024; // 1 Mb / Sec Status s; - options.delete_scheduler.reset(NewDeleteScheduler( - env_, trash_dir, rate_bytes_per_sec, nullptr, false, &s)); + options.sst_file_manager.reset(NewSstFileManager( + env_, nullptr, trash_dir, rate_bytes_per_sec, false, &s)); ASSERT_OK(s); + auto sfm = static_cast(options.sst_file_manager.get()); DestroyAndReopen(options); @@ -9538,7 +8934,7 @@ TEST_F(DBTest, DeleteSchedulerMultipleDBPaths) { 
ASSERT_OK(db_->CompactRange(compact_options, &begin, &end)); ASSERT_EQ("0,2", FilesPerLevel(0)); - options.delete_scheduler->WaitForEmptyTrash(); + sfm->WaitForEmptyTrash(); ASSERT_EQ(bg_delete_file, 8); compact_options.bottommost_level_compaction = @@ -9546,7 +8942,7 @@ TEST_F(DBTest, DeleteSchedulerMultipleDBPaths) { ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); ASSERT_EQ("0,1", FilesPerLevel(0)); - options.delete_scheduler->WaitForEmptyTrash(); + sfm->WaitForEmptyTrash(); ASSERT_EQ(bg_delete_file, 8); rocksdb::SyncPoint::GetInstance()->DisableProcessing(); @@ -9555,7 +8951,7 @@ TEST_F(DBTest, DeleteSchedulerMultipleDBPaths) { TEST_F(DBTest, DestroyDBWithRateLimitedDelete) { int bg_delete_file = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DeleteSchedulerImpl::DeleteTrashFile:DeleteFile", + "DeleteScheduler::DeleteTrashFile:DeleteFile", [&](void* arg) { bg_delete_file++; }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); @@ -9577,17 +8973,114 @@ TEST_F(DBTest, DestroyDBWithRateLimitedDelete) { std::string trash_dir = test::TmpDir(env_) + "/trash"; int64_t rate_bytes_per_sec = 1024 * 1024; // 1 Mb / Sec Status s; - options.delete_scheduler.reset(NewDeleteScheduler( - env_, trash_dir, rate_bytes_per_sec, nullptr, false, &s)); + options.sst_file_manager.reset(NewSstFileManager( + env_, nullptr, trash_dir, rate_bytes_per_sec, false, &s)); ASSERT_OK(s); ASSERT_OK(DestroyDB(dbname_, options)); - options.delete_scheduler->WaitForEmptyTrash(); + auto sfm = static_cast(options.sst_file_manager.get()); + sfm->WaitForEmptyTrash(); // We have deleted the 4 sst files in the delete_scheduler ASSERT_EQ(bg_delete_file, 4); } #endif // ROCKSDB_LITE +TEST_F(DBTest, DBWithMaxSpaceAllowed) { + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.disable_auto_compactions = true; + 
DestroyAndReopen(options); + + Random rnd(301); + + // Generate a file containing 100 keys. + for (int i = 0; i < 100; i++) { + ASSERT_OK(Put(Key(i), RandomString(&rnd, 50))); + } + ASSERT_OK(Flush()); + + uint64_t first_file_size = 0; + auto files_in_db = GetAllSSTFiles(&first_file_size); + ASSERT_EQ(sfm->GetTotalSize(), first_file_size); + + // Set the maximum allowed space usage to the current total size + sfm->SetMaxAllowedSpaceUsage(first_file_size + 1); + + ASSERT_OK(Put("key1", "val1")); + // This flush will cause bg_error_ and will fail + ASSERT_NOK(Flush()); +} + +TEST_F(DBTest, DBWithMaxSpaceAllowedRandomized) { + // This test will set a maximum allowed space for the DB, then it will + // keep filling the DB until the limit is reached and bg_error_ is set. + // When bg_error_ is set we will verify that the DB size is greater + // than the limit. + + std::vector max_space_limits_mbs = {1, 2, 4, 8, 10}; + + bool bg_error_set = false; + uint64_t total_sst_files_size = 0; + + int reached_max_space_on_flush = 0; + int reached_max_space_on_compaction = 0; + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::FlushMemTableToOutputFile:MaxAllowedSpaceReached", + [&](void* arg) { + bg_error_set = true; + GetAllSSTFiles(&total_sst_files_size); + reached_max_space_on_flush++; + }); + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "CompactionJob::FinishCompactionOutputFile:MaxAllowedSpaceReached", + [&](void* arg) { + bg_error_set = true; + GetAllSSTFiles(&total_sst_files_size); + reached_max_space_on_compaction++; + }); + + for (auto limit_mb : max_space_limits_mbs) { + bg_error_set = false; + total_sst_files_size = 0; + rocksdb::SyncPoint::GetInstance()->ClearTrace(); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + std::shared_ptr sst_file_manager(NewSstFileManager(env_)); + auto sfm = static_cast(sst_file_manager.get()); + + Options options = CurrentOptions(); + options.sst_file_manager = sst_file_manager; + options.write_buffer_size = 1024 
* 512; // 512 Kb + DestroyAndReopen(options); + Random rnd(301); + + sfm->SetMaxAllowedSpaceUsage(limit_mb * 1024 * 1024); + + int keys_written = 0; + uint64_t estimated_db_size = 0; + while (true) { + auto s = Put(RandomString(&rnd, 10), RandomString(&rnd, 50)); + if (!s.ok()) { + break; + } + keys_written++; + // Check the estimated db size vs the db limit just to make sure we + // dont run into an infinite loop + estimated_db_size = keys_written * 60; // ~60 bytes per key + ASSERT_LT(estimated_db_size, limit_mb * 1024 * 1024 * 2); + } + ASSERT_TRUE(bg_error_set); + ASSERT_GE(total_sst_files_size, limit_mb * 1024 * 1024); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + } + + ASSERT_GT(reached_max_space_on_flush, 0); + ASSERT_GT(reached_max_space_on_compaction, 0); +} + TEST_F(DBTest, UnsupportedManualSync) { DestroyAndReopen(CurrentOptions()); env_->is_wal_sync_thread_safe_.store(false); @@ -9917,6 +9410,12 @@ TEST_F(DBTest, AddExternalSstFile) { ASSERT_EQ(file5_info.smallest_key, Key(400)); ASSERT_EQ(file5_info.largest_key, Key(499)); + // Cannot create an empty sst file + std::string file_empty = sst_files_folder + "file_empty.sst"; + ExternalSstFileInfo file_empty_info; + s = sst_file_writer.Finish(&file_empty_info); + ASSERT_NOK(s); + DestroyAndReopen(options); // Add file using file path s = db_->AddFile(file1); @@ -10001,6 +9500,58 @@ TEST_F(DBTest, AddExternalSstFile) { kSkipFIFOCompaction)); } +// This test reporduce a bug that can happen in some cases if the DB started +// purging obsolete files when we are adding an external sst file. +// This situation may result in deleting the file while it's being added. 
+TEST_F(DBTest, AddExternalSstFilePurgeObsoleteFilesBug) { + std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; + env_->CreateDir(sst_files_folder); + Options options = CurrentOptions(); + options.env = env_; + const ImmutableCFOptions ioptions(options); + + SstFileWriter sst_file_writer(EnvOptions(), ioptions, options.comparator); + + // file1.sst (0 => 500) + std::string sst_file_path = sst_files_folder + "file1.sst"; + Status s = sst_file_writer.Open(sst_file_path); + ASSERT_OK(s); + for (int i = 0; i < 500; i++) { + std::string k = Key(i); + s = sst_file_writer.Add(k, k + "_val"); + ASSERT_OK(s); + } + + ExternalSstFileInfo sst_file_info; + s = sst_file_writer.Finish(&sst_file_info); + ASSERT_OK(s); + + options.delete_obsolete_files_period_micros = 0; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "DBImpl::AddFile:FileCopied", [&](void* arg) { + ASSERT_OK(Put("aaa", "bbb")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("aaa", "xxx")); + ASSERT_OK(Flush()); + db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + s = db_->AddFile(sst_file_path); + ASSERT_OK(s); + + for (int i = 0; i < 500; i++) { + std::string k = Key(i); + std::string v = k + "_val"; + ASSERT_EQ(Get(k), v); + } + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + TEST_F(DBTest, AddExternalSstFileNoCopy) { std::string sst_files_folder = test::TmpDir(env_) + "/sst_files/"; env_->CreateDir(sst_files_folder); @@ -10402,7 +9953,10 @@ TEST_F(DBTest, PinnedDataIteratorRandomized) { ASSERT_EQ(true_data.lower_bound(k), true_data.end()); continue; } - ASSERT_TRUE(iter->IsKeyPinned()); + std::string prop_value; + ASSERT_OK( + iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("1", prop_value); keys_slices.push_back(iter->key()); true_keys.push_back(true_data.lower_bound(k)->first); } @@ -10417,7 +9971,10 @@ 
TEST_F(DBTest, PinnedDataIteratorRandomized) { printf("Testing iterating forward on all keys\n"); std::vector all_keys; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ASSERT_TRUE(iter->IsKeyPinned()); + std::string prop_value; + ASSERT_OK( + iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("1", prop_value); all_keys.push_back(iter->key()); } ASSERT_EQ(all_keys.size(), true_data.size()); @@ -10435,7 +9992,10 @@ TEST_F(DBTest, PinnedDataIteratorRandomized) { printf("Testing iterating backward on all keys\n"); std::vector all_keys; for (iter->SeekToLast(); iter->Valid(); iter->Prev()) { - ASSERT_TRUE(iter->IsKeyPinned()); + std::string prop_value; + ASSERT_OK( + iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("1", prop_value); all_keys.push_back(iter->key()); } ASSERT_EQ(all_keys.size(), true_data.size()); @@ -10506,7 +10066,9 @@ TEST_F(DBTest, PinnedDataIteratorMultipleFiles) { std::vector> results; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ASSERT_TRUE(iter->IsKeyPinned()); + std::string prop_value; + ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("1", prop_value); results.emplace_back(iter->key(), iter->value().ToString()); } @@ -10559,7 +10121,9 @@ TEST_F(DBTest, PinnedDataIteratorMergeOperator) { std::vector> results; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ASSERT_TRUE(iter->IsKeyPinned()); + std::string prop_value; + ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("1", prop_value); results.emplace_back(iter->key(), iter->value().ToString()); } @@ -10614,7 +10178,9 @@ TEST_F(DBTest, PinnedDataIteratorReadAfterUpdate) { std::vector> results; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { - ASSERT_TRUE(iter->IsKeyPinned()); + std::string prop_value; + ASSERT_OK(iter->GetProperty("rocksdb.iterator.is-key-pinned", &prop_value)); + ASSERT_EQ("1", prop_value); 
results.emplace_back(iter->key(), iter->value().ToString()); } @@ -11223,6 +10789,168 @@ TEST_F(DBTest, WalFilterTestWithColumnFamilies) { } #endif // ROCKSDB_LITE +class SliceTransformLimitedDomain : public SliceTransform { + const char* Name() const override { return "SliceTransformLimitedDomain"; } + + Slice Transform(const Slice& src) const override { + return Slice(src.data(), 5); + } + + bool InDomain(const Slice& src) const override { + // prefix will be x???? + return src.size() >= 5 && src[0] == 'x'; + } + + bool InRange(const Slice& dst) const override { + // prefix will be x???? + return dst.size() == 5 && dst[0] == 'x'; + } +}; + +TEST_F(DBTest, PrefixExtractorFullFilter) { + BlockBasedTableOptions bbto; + // Full Filter Block + bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = false; + + Options options = CurrentOptions(); + options.prefix_extractor = std::make_shared(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + DestroyAndReopen(options); + + ASSERT_OK(Put("x1111_AAAA", "val1")); + ASSERT_OK(Put("x1112_AAAA", "val2")); + ASSERT_OK(Put("x1113_AAAA", "val3")); + ASSERT_OK(Put("x1114_AAAA", "val4")); + // Not in domain, wont be added to filter + ASSERT_OK(Put("zzzzz_AAAA", "val5")); + + ASSERT_OK(Flush()); + + ASSERT_EQ(Get("x1111_AAAA"), "val1"); + ASSERT_EQ(Get("x1112_AAAA"), "val2"); + ASSERT_EQ(Get("x1113_AAAA"), "val3"); + ASSERT_EQ(Get("x1114_AAAA"), "val4"); + // Was not added to filter but rocksdb will try to read it from the filter + ASSERT_EQ(Get("zzzzz_AAAA"), "val5"); +} + +TEST_F(DBTest, PrefixExtractorBlockFilter) { + BlockBasedTableOptions bbto; + // Block Filter Block + bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10, true)); + + Options options = CurrentOptions(); + options.prefix_extractor = std::make_shared(); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + DestroyAndReopen(options); + + ASSERT_OK(Put("x1113_AAAA", "val3")); + 
ASSERT_OK(Put("x1114_AAAA", "val4")); + // Not in domain, wont be added to filter + ASSERT_OK(Put("zzzzz_AAAA", "val1")); + ASSERT_OK(Put("zzzzz_AAAB", "val2")); + ASSERT_OK(Put("zzzzz_AAAC", "val3")); + ASSERT_OK(Put("zzzzz_AAAD", "val4")); + + ASSERT_OK(Flush()); + + std::vector iter_res; + auto iter = db_->NewIterator(ReadOptions()); + // Seek to a key that was not in Domain + for (iter->Seek("zzzzz_AAAA"); iter->Valid(); iter->Next()) { + iter_res.emplace_back(iter->value().ToString()); + } + + std::vector expected_res = {"val1", "val2", "val3", "val4"}; + ASSERT_EQ(iter_res, expected_res); + delete iter; +} + +TEST_F(DBTest, IteratorWithLocalStatistics) { + Options options = CurrentOptions(); + options.statistics = rocksdb::CreateDBStatistics(); + DestroyAndReopen(options); + + Random rnd(301); + for (int i = 0; i < 1000; i++) { + // Key 10 bytes / Value 10 bytes + ASSERT_OK(Put(RandomString(&rnd, 10), RandomString(&rnd, 10))); + } + + std::atomic total_next(0); + std::atomic total_next_found(0); + std::atomic total_prev(0); + std::atomic total_prev_found(0); + std::atomic total_bytes(0); + + std::vector threads; + std::function reader_func_next = [&]() { + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToFirst(); + // Seek will bump ITER_BYTES_READ + total_bytes += iter->key().size(); + total_bytes += iter->value().size(); + while (true) { + iter->Next(); + total_next++; + + if (!iter->Valid()) { + break; + } + total_next_found++; + total_bytes += iter->key().size(); + total_bytes += iter->value().size(); + } + + delete iter; + }; + + std::function reader_func_prev = [&]() { + Iterator* iter = db_->NewIterator(ReadOptions()); + + iter->SeekToLast(); + // Seek will bump ITER_BYTES_READ + total_bytes += iter->key().size(); + total_bytes += iter->value().size(); + while (true) { + iter->Prev(); + total_prev++; + + if (!iter->Valid()) { + break; + } + total_prev_found++; + total_bytes += iter->key().size(); + total_bytes += iter->value().size(); 
+ } + + delete iter; + }; + + for (int i = 0; i < 10; i++) { + threads.emplace_back(reader_func_next); + } + for (int i = 0; i < 15; i++) { + threads.emplace_back(reader_func_prev); + } + + for (auto& t : threads) { + t.join(); + } + + ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT), total_next); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_NEXT_FOUND), + total_next_found); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV), total_prev); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_DB_PREV_FOUND), + total_prev_found); + ASSERT_EQ(TestGetTickerCount(options, ITER_BYTES_READ), total_bytes); +} + #ifndef ROCKSDB_LITE class BloomStatsTestWithParam : public DBTest, diff --git a/db/db_test2.cc b/db/db_test2.cc new file mode 100644 index 000000000..3d9820b65 --- /dev/null +++ b/db/db_test2.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+#include +#include "db/db_test_util.h" +#include "port/stack_trace.h" + +namespace rocksdb { + +class DBTest2 : public DBTestBase { + public: + DBTest2() : DBTestBase("/db_test2") {} +}; + +TEST_F(DBTest2, IteratorPropertyVersionNumber) { + Put("", ""); + Iterator* iter1 = db_->NewIterator(ReadOptions()); + std::string prop_value; + ASSERT_OK( + iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value)); + uint64_t version_number1 = + static_cast(std::atoi(prop_value.c_str())); + + Put("", ""); + Flush(); + + Iterator* iter2 = db_->NewIterator(ReadOptions()); + ASSERT_OK( + iter2->GetProperty("rocksdb.iterator.super-version-number", &prop_value)); + uint64_t version_number2 = + static_cast(std::atoi(prop_value.c_str())); + + ASSERT_GT(version_number2, version_number1); + + Put("", ""); + + Iterator* iter3 = db_->NewIterator(ReadOptions()); + ASSERT_OK( + iter3->GetProperty("rocksdb.iterator.super-version-number", &prop_value)); + uint64_t version_number3 = + static_cast(std::atoi(prop_value.c_str())); + + ASSERT_EQ(version_number2, version_number3); + + iter1->SeekToFirst(); + ASSERT_OK( + iter1->GetProperty("rocksdb.iterator.super-version-number", &prop_value)); + uint64_t version_number1_new = + static_cast(std::atoi(prop_value.c_str())); + ASSERT_EQ(version_number1, version_number1_new); + + delete iter1; + delete iter2; + delete iter3; +} + +TEST_F(DBTest2, CacheIndexAndFilterWithDBRestart) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); + BlockBasedTableOptions table_options; + table_options.cache_index_and_filter_blocks = true; + table_options.filter_policy.reset(NewBloomFilterPolicy(20)); + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + CreateAndReopenWithCF({"pikachu"}, options); + + Put(1, "a", "begin"); + Put(1, "z", "end"); + ASSERT_OK(Flush(1)); + TryReopenWithColumnFamilies({"default", "pikachu"}, options); + + std::string value; 
+ value = Get(1, "a"); +} +} // namespace rocksdb + +int main(int argc, char** argv) { + rocksdb::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/db/db_test_util.cc b/db/db_test_util.cc index 0d342cc52..950941817 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -73,63 +73,69 @@ DBTestBase::~DBTestBase() { delete env_; } -// Switch to a fresh database with the next option configuration to -// test. Return false if there are no more configurations to test. -bool DBTestBase::ChangeOptions(int skip_mask) { - for (option_config_++; option_config_ < kEnd; option_config_++) { +bool DBTestBase::ShouldSkipOptions(int option_config, int skip_mask) { #ifdef ROCKSDB_LITE // These options are not supported in ROCKSDB_LITE - if (option_config_ == kHashSkipList || - option_config_ == kPlainTableFirstBytePrefix || - option_config_ == kPlainTableCappedPrefix || - option_config_ == kPlainTableCappedPrefixNonMmap || - option_config_ == kPlainTableAllBytesPrefix || - option_config_ == kVectorRep || option_config_ == kHashLinkList || - option_config_ == kHashCuckoo || - option_config_ == kUniversalCompaction || - option_config_ == kUniversalCompactionMultiLevel || - option_config_ == kUniversalSubcompactions || - option_config_ == kFIFOCompaction) { - continue; + if (option_config == kHashSkipList || + option_config == kPlainTableFirstBytePrefix || + option_config == kPlainTableCappedPrefix || + option_config == kPlainTableCappedPrefixNonMmap || + option_config == kPlainTableAllBytesPrefix || + option_config == kVectorRep || option_config == 
kHashLinkList || + option_config == kHashCuckoo || option_config == kUniversalCompaction || + option_config == kUniversalCompactionMultiLevel || + option_config == kUniversalSubcompactions || + option_config == kFIFOCompaction || + option_config == kConcurrentSkipList) { + return true; } #endif if ((skip_mask & kSkipDeletesFilterFirst) && - option_config_ == kDeletesFilterFirst) { - continue; + option_config == kDeletesFilterFirst) { + return true; } if ((skip_mask & kSkipUniversalCompaction) && - (option_config_ == kUniversalCompaction || - option_config_ == kUniversalCompactionMultiLevel)) { - continue; + (option_config == kUniversalCompaction || + option_config == kUniversalCompactionMultiLevel)) { + return true; } - if ((skip_mask & kSkipMergePut) && option_config_ == kMergePut) { - continue; + if ((skip_mask & kSkipMergePut) && option_config == kMergePut) { + return true; } if ((skip_mask & kSkipNoSeekToLast) && - (option_config_ == kHashLinkList || option_config_ == kHashSkipList)) { - continue; + (option_config == kHashLinkList || option_config == kHashSkipList)) { + return true; } if ((skip_mask & kSkipPlainTable) && - (option_config_ == kPlainTableAllBytesPrefix || - option_config_ == kPlainTableFirstBytePrefix || - option_config_ == kPlainTableCappedPrefix || - option_config_ == kPlainTableCappedPrefixNonMmap)) { - continue; + (option_config == kPlainTableAllBytesPrefix || + option_config == kPlainTableFirstBytePrefix || + option_config == kPlainTableCappedPrefix || + option_config == kPlainTableCappedPrefixNonMmap)) { + return true; } if ((skip_mask & kSkipHashIndex) && - (option_config_ == kBlockBasedTableWithPrefixHashIndex || - option_config_ == kBlockBasedTableWithWholeKeyHashIndex)) { - continue; + (option_config == kBlockBasedTableWithPrefixHashIndex || + option_config == kBlockBasedTableWithWholeKeyHashIndex)) { + return true; } - if ((skip_mask & kSkipHashCuckoo) && (option_config_ == kHashCuckoo)) { - continue; + if ((skip_mask & 
kSkipHashCuckoo) && (option_config == kHashCuckoo)) { + return true; } - if ((skip_mask & kSkipFIFOCompaction) && - option_config_ == kFIFOCompaction) { - continue; + if ((skip_mask & kSkipFIFOCompaction) && option_config == kFIFOCompaction) { + return true; } - if ((skip_mask & kSkipMmapReads) && option_config_ == kWalDirAndMmapReads) { + if ((skip_mask & kSkipMmapReads) && option_config == kWalDirAndMmapReads) { + return true; + } + return false; +} + +// Switch to a fresh database with the next option configuration to +// test. Return false if there are no more configurations to test. +bool DBTestBase::ChangeOptions(int skip_mask) { + for (option_config_++; option_config_ < kEnd; option_config_++) { + if (ShouldSkipOptions(option_config_, skip_mask)) { continue; } break; @@ -333,6 +339,10 @@ Options DBTestBase::CurrentOptions( options.prefix_extractor.reset(NewNoopTransform()); break; } + case kBlockBasedTableWithIndexRestartInterval: { + table_options.index_block_restart_interval = 8; + break; + } case kOptimizeFiltersForHits: { options.optimize_filters_for_hits = true; set_block_based_table_factory = true; @@ -356,6 +366,11 @@ Options DBTestBase::CurrentOptions( options.max_subcompactions = 4; break; } + case kConcurrentSkipList: { + options.allow_concurrent_memtable_write = true; + options.enable_write_thread_adaptive_yield = true; + break; + } default: break; @@ -1000,4 +1015,29 @@ void DBTestBase::CopyFile(const std::string& source, ASSERT_OK(destfile->Close()); } +std::unordered_map DBTestBase::GetAllSSTFiles( + uint64_t* total_size) { + std::unordered_map res; + + if (total_size) { + *total_size = 0; + } + std::vector files; + env_->GetChildren(dbname_, &files); + for (auto& file_name : files) { + uint64_t number; + FileType type; + std::string file_path = dbname_ + "/" + file_name; + if (ParseFileName(file_name, &number, &type) && type == kTableFile) { + uint64_t file_size = 0; + env_->GetFileSize(file_path, &file_size); + res[file_path] = file_size; + 
if (total_size) { + *total_size += file_size; + } + } + } + return res; +} + } // namespace rocksdb diff --git a/db/db_test_util.h b/db/db_test_util.h index c13ebbda2..ca2b466e9 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -19,6 +19,7 @@ #endif #include +#include #include #include #include @@ -27,7 +28,6 @@ #include #include "db/db_impl.h" -#include "db/db_test_util.h" #include "db/dbformat.h" #include "db/filename.h" #include "memtable/hash_linklist_rep.h" @@ -525,9 +525,11 @@ class DBTestBase : public testing::Test { kOptimizeFiltersForHits = 27, kRowCache = 28, kRecycleLogFiles = 29, - kLevelSubcompactions = 30, - kUniversalSubcompactions = 31, - kEnd = 30 + kConcurrentSkipList = 30, + kEnd = 31, + kLevelSubcompactions = 31, + kUniversalSubcompactions = 32, + kBlockBasedTableWithIndexRestartInterval = 33, }; int option_config_; @@ -573,6 +575,8 @@ class DBTestBase : public testing::Test { return std::string(buf); } + static bool ShouldSkipOptions(int option_config, int skip_mask = kNoSkip); + // Switch to a fresh database with the next option configuration to // test. Return false if there are no more configurations to test. 
bool ChangeOptions(int skip_mask = kNoSkip); @@ -749,6 +753,9 @@ class DBTestBase : public testing::Test { void CopyFile(const std::string& source, const std::string& destination, uint64_t size = 0); + + std::unordered_map GetAllSSTFiles( + uint64_t* total_size = nullptr); }; } // namespace rocksdb diff --git a/db/db_universal_compaction_test.cc b/db/db_universal_compaction_test.cc index 9efcf4ae5..82f11502b 100644 --- a/db/db_universal_compaction_test.cc +++ b/db/db_universal_compaction_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -14,6 +14,11 @@ namespace rocksdb { +static uint64_t TestGetTickerCount(const Options& options, + Tickers ticker_type) { + return options.statistics->getTickerCount(ticker_type); +} + static std::string CompressibleString(Random* rnd, int len) { std::string r; test::CompressibleString(rnd, 0.8, len, &r); @@ -154,6 +159,72 @@ TEST_P(DBTestUniversalCompaction, UniversalCompactionSingleSortedRun) { } } +TEST_P(DBTestUniversalCompaction, OptimizeFiltersForHits) { + Options options; + options = CurrentOptions(options); + options.compaction_style = kCompactionStyleUniversal; + options.compaction_options_universal.size_ratio = 5; + options.num_levels = num_levels_; + options.write_buffer_size = 105 << 10; // 105KB + options.arena_block_size = 4 << 10; + options.target_file_size_base = 32 << 10; // 32KB + // trigger compaction if there are >= 4 files + options.level0_file_num_compaction_trigger = 4; + BlockBasedTableOptions bbto; + bbto.cache_index_and_filter_blocks = true; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + bbto.whole_key_filtering = true; + 
options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + options.optimize_filters_for_hits = true; + options.statistics = rocksdb::CreateDBStatistics(); + options.memtable_factory.reset(new SpecialSkipListFactory(3)); + + DestroyAndReopen(options); + + // block compaction from happening + env_->SetBackgroundThreads(1, Env::LOW); + test::SleepingBackgroundTask sleeping_task_low; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_task_low, + Env::Priority::LOW); + + for (int num = 0; num < options.level0_file_num_compaction_trigger; num++) { + Put(Key(num * 10), "val"); + if (num) { + dbfull()->TEST_WaitForFlushMemTable(); + } + Put(Key(30 + num * 10), "val"); + Put(Key(60 + num * 10), "val"); + } + Put("", ""); + dbfull()->TEST_WaitForFlushMemTable(); + + // Query set of non existing keys + for (int i = 5; i < 90; i += 10) { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + + // Make sure bloom filter is used at least once. + ASSERT_GT(TestGetTickerCount(options, BLOOM_FILTER_USEFUL), 0); + auto prev_counter = TestGetTickerCount(options, BLOOM_FILTER_USEFUL); + + // Make sure bloom filter is used for all but the last L0 file when looking + // up a non-existent key that's in the range of all L0 files. + ASSERT_EQ(Get(Key(35)), "NOT_FOUND"); + ASSERT_EQ(prev_counter + NumTableFilesAtLevel(0) - 1, + TestGetTickerCount(options, BLOOM_FILTER_USEFUL)); + prev_counter = TestGetTickerCount(options, BLOOM_FILTER_USEFUL); + + // Unblock compaction and wait it for happening. + sleeping_task_low.WakeUp(); + dbfull()->TEST_WaitForCompact(); + + // The same queries will not trigger bloom filter + for (int i = 5; i < 90; i += 10) { + ASSERT_EQ(Get(Key(i)), "NOT_FOUND"); + } + ASSERT_EQ(prev_counter, TestGetTickerCount(options, BLOOM_FILTER_USEFUL)); +} + // TODO(kailiu) The tests on UniversalCompaction has some issues: // 1. A lot of magic numbers ("11" or "12"). // 2. 
Made assumption on the memtable flush conditions, which may change from @@ -1032,16 +1103,11 @@ TEST_P(DBTestUniversalCompaction, IncreaseUniversalCompactionNumLevels) { for (int i = 0; i <= max_key1; i++) { // each value is 10K ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); } ASSERT_OK(Flush(1)); dbfull()->TEST_WaitForCompact(); - int non_level0_num_files = 0; - for (int i = 1; i < options.num_levels; i++) { - non_level0_num_files += NumTableFilesAtLevel(i, 1); - } - ASSERT_EQ(non_level0_num_files, 0); - // Stage 2: reopen with universal compaction, num_levels=4 options.compaction_style = kCompactionStyleUniversal; options.num_levels = 4; @@ -1054,6 +1120,7 @@ TEST_P(DBTestUniversalCompaction, IncreaseUniversalCompactionNumLevels) { for (int i = max_key1 + 1; i <= max_key2; i++) { // each value is 10K ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); } ASSERT_OK(Flush(1)); dbfull()->TEST_WaitForCompact(); @@ -1084,6 +1151,7 @@ TEST_P(DBTestUniversalCompaction, IncreaseUniversalCompactionNumLevels) { for (int i = max_key2 + 1; i <= max_key3; i++) { // each value is 10K ASSERT_OK(Put(1, Key(i), RandomString(&rnd, 10000))); + dbfull()->TEST_WaitForFlushMemTable(handles_[1]); } ASSERT_OK(Flush(1)); dbfull()->TEST_WaitForCompact(); diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 9e8a19dce..14b9e2ffd 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/db/dbformat.cc b/db/dbformat.cc index eb19a7b17..d840aea86 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/dbformat.h b/db/dbformat.h index 1e1169639..3a9682d1d 100644 --- a/db/dbformat.h +++ b/db/dbformat.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/dbformat_test.cc b/db/dbformat_test.cc index 0273dd062..e79dbc683 100644 --- a/db/dbformat_test.cc +++ b/db/dbformat_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index a4cb296d9..57fafa5e7 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -37,6 +37,7 @@ class DeleteFileTest : public testing::Test { DeleteFileTest() { db_ = nullptr; env_ = Env::Default(); + options_.delete_obsolete_files_period_micros = 0; // always do full purge options_.enable_thread_tracking = true; options_.write_buffer_size = 1024*1024*1000; options_.target_file_size_base = 1024*1024*1000; diff --git a/db/event_helpers.cc b/db/event_helpers.cc index 834ad9b1b..1a591dc91 100644 --- a/db/event_helpers.cc +++ b/db/event_helpers.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/event_helpers.h b/db/event_helpers.h index a60bc9a9e..a36010e16 100644 --- a/db/event_helpers.h +++ b/db/event_helpers.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/experimental.cc b/db/experimental.cc index 0b5018aef..26b2113d2 100644 --- a/db/experimental.cc +++ b/db/experimental.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 4c4f38d4a..6e8363516 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/file_indexer.cc b/db/file_indexer.cc index 222cca9c0..9b31c2bd6 100644 --- a/db/file_indexer.cc +++ b/db/file_indexer.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/file_indexer.h b/db/file_indexer.h index 418ae0f68..5eb10bc4d 100644 --- a/db/file_indexer.h +++ b/db/file_indexer.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/file_indexer_test.cc b/db/file_indexer_test.cc index 98fea47fe..9b3cdd4d6 100644 --- a/db/file_indexer_test.cc +++ b/db/file_indexer_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/filename.cc b/db/filename.cc index 32cd8758a..d1f0958bb 100644 --- a/db/filename.cc +++ b/db/filename.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/filename.h b/db/filename.h index f7196c9f2..9a0a1eee3 100644 --- a/db/filename.h +++ b/db/filename.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/filename_test.cc b/db/filename_test.cc index 2eafd5230..0f8e37e7f 100644 --- a/db/filename_test.cc +++ b/db/filename_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/flush_job.cc b/db/flush_job.cc index 9da7d9546..b4e5b307f 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -94,7 +94,8 @@ FlushJob::~FlushJob() { } void FlushJob::ReportStartedFlush() { - ThreadStatusUtil::SetColumnFamily(cfd_); + ThreadStatusUtil::SetColumnFamily(cfd_, cfd_->ioptions()->env, + cfd_->options()->enable_thread_tracking); ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_FLUSH); ThreadStatusUtil::SetThreadOperationProperty( ThreadStatus::COMPACTION_JOB_ID, @@ -233,14 +234,14 @@ Status FlushJob::WriteLevel0Table(const autovector& mems, TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression", &output_compression_); - s = BuildTable(dbname_, db_options_.env, *cfd_->ioptions(), env_options_, - cfd_->table_cache(), iter.get(), meta, - cfd_->internal_comparator(), - cfd_->int_tbl_prop_collector_factories(), cfd_->GetID(), - existing_snapshots_, earliest_write_conflict_snapshot_, - output_compression_, cfd_->ioptions()->compression_opts, - mutable_cf_options_.paranoid_file_checks, - cfd_->internal_stats(), Env::IO_HIGH, &table_properties_); + s = BuildTable( + dbname_, db_options_.env, *cfd_->ioptions(), env_options_, + cfd_->table_cache(), iter.get(), meta, cfd_->internal_comparator(), + cfd_->int_tbl_prop_collector_factories(), cfd_->GetID(), + existing_snapshots_, earliest_write_conflict_snapshot_, + output_compression_, cfd_->ioptions()->compression_opts, + mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(), + Env::IO_HIGH, &table_properties_, 0 /* level */); info.table_properties = table_properties_; LogFlush(db_options_.info_log); } @@ -270,6 +271,7 @@ Status FlushJob::WriteLevel0Table(const autovector& mems, if (!db_options_.disableDataSync && output_file_directory_ != nullptr) { output_file_directory_->Fsync(); } + TEST_SYNC_POINT("FlushJob::WriteLevel0Table"); db_mutex_->Lock(); } base->Unref(); diff --git a/db/flush_job.h b/db/flush_job.h index d12da141e..4d088b58e 100644 --- a/db/flush_job.h +++ b/db/flush_job.h @@ -1,4 
+1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index f7071c1ee..3bba6337b 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/flush_scheduler.cc b/db/flush_scheduler.cc index f970f1ca8..60db59dd4 100644 --- a/db/flush_scheduler.cc +++ b/db/flush_scheduler.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/flush_scheduler.h b/db/flush_scheduler.h index dd439e410..820bd7b71 100644 --- a/db/flush_scheduler.h +++ b/db/flush_scheduler.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 15110fec3..bbca88f9c 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -10,15 +10,16 @@ #include #include -#include "db/job_context.h" +#include "db/column_family.h" #include "db/db_impl.h" #include "db/db_iter.h" -#include "db/column_family.h" +#include "db/dbformat.h" +#include "db/job_context.h" #include "rocksdb/env.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" #include "table/merger.h" -#include "db/dbformat.h" +#include "util/string_util.h" #include "util/sync_point.h" namespace rocksdb { @@ -471,6 +472,15 @@ Status ForwardIterator::status() const { return immutable_status_; } +Status ForwardIterator::GetProperty(std::string prop_name, std::string* prop) { + assert(prop != nullptr); + if (prop_name == "rocksdb.iterator.super-version-number") { + *prop = ToString(sv_->version_number); + return Status::OK(); + } + return Status::InvalidArgument(); +} + void ForwardIterator::RebuildIterators(bool refresh_sv) { // Clean up Cleanup(refresh_sv); diff --git a/db/forward_iterator.h b/db/forward_iterator.h index 1c4d4975e..b5beeceef 100644 --- a/db/forward_iterator.h +++ b/db/forward_iterator.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -71,6 +71,8 @@ class ForwardIterator : public InternalIterator { virtual Slice key() const override; virtual Slice value() const override; virtual Status status() const override; + virtual Status GetProperty(std::string prop_name, std::string* prop) override; + bool TEST_CheckDeletedIters(int* deleted_iters, int* num_iters); private: diff --git a/db/forward_iterator_bench.cc b/db/forward_iterator_bench.cc index 69833a4af..0f44a9e44 100644 --- a/db/forward_iterator_bench.cc +++ b/db/forward_iterator_bench.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/inlineskiplist.h b/db/inlineskiplist.h index 201580b10..cfd47f39f 100644 --- a/db/inlineskiplist.h +++ b/db/inlineskiplist.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional // grant of patent rights can be found in the PATENTS file in the same @@ -147,8 +147,9 @@ class InlineSkipList { // values are ok. std::atomic max_height_; // Height of the entire list - // Used for optimizing sequential insert patterns. Tricky. prev_[i] for - // i up to max_height_ - 1 (inclusive) is the predecessor of prev_[0]. + // Used for optimizing sequential insert patterns. Tricky. prev_height_ + // of zero means prev_ is undefined. Otherwise: prev_[i] for i up + // to max_height_ - 1 (inclusive) is the predecessor of prev_[0], and // prev_height_ is the height of prev_[0]. prev_[0] can only be equal // to head when max_height_ and prev_height_ are both 1. 
Node** prev_; @@ -510,11 +511,10 @@ InlineSkipList::AllocateNode(size_t key_size, int height) { template void InlineSkipList::Insert(const char* key) { - // InsertConcurrently can't maintain the prev_ invariants when it needs - // to increase max_height_. In that case it sets prev_height_ to zero, - // letting us know that we should ignore it. A relaxed load suffices - // here because write thread synchronization separates Insert calls - // from InsertConcurrently calls. + // InsertConcurrently often can't maintain the prev_ invariants, so + // it just sets prev_height_ to zero, letting us know that we should + // ignore it. A relaxed load suffices here because write thread + // synchronization separates Insert calls from InsertConcurrently calls. auto prev_height = prev_height_.load(std::memory_order_relaxed); // fast path for sequential insertion @@ -595,15 +595,24 @@ void InlineSkipList::InsertConcurrently(const char* key) { int height = x->UnstashHeight(); assert(height >= 1 && height <= kMaxHeight_); + // We don't have a lock-free algorithm for updating prev_, but we do have + // the option of invalidating the entire sequential-insertion cache. + // prev_'s invariant is that prev_[i] (i > 0) is the predecessor of + // prev_[0] at that level. We're only going to violate that if height + // > 1 and key lands after prev_[height - 1] but before prev_[0]. + // Comparisons are pretty expensive, so an easier version is to just + // clear the cache if height > 1. We only write to prev_height_ if the + // nobody else has, to avoid invalidating the root of the skip list in + // all of the other CPU caches. 
+ if (height > 1 && prev_height_.load(std::memory_order_relaxed) != 0) { + prev_height_.store(0, std::memory_order_relaxed); + } + int max_height = max_height_.load(std::memory_order_relaxed); while (height > max_height) { if (max_height_.compare_exchange_strong(max_height, height)) { // successfully updated it max_height = height; - - // we dont have a lock-free algorithm for fixing up prev_, so just - // mark it invalid - prev_height_.store(0, std::memory_order_relaxed); break; } // else retry, possibly exiting the loop because somebody else diff --git a/db/inlineskiplist_test.cc b/db/inlineskiplist_test.cc index 5c2dd6fa5..5743bacec 100644 --- a/db/inlineskiplist_test.cc +++ b/db/inlineskiplist_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 74aac3649..a554f0b85 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "db/column_family.h" @@ -81,7 +82,21 @@ void PrintLevelStats(char* buf, size_t len, const std::string& name, stats.count == 0 ? 0 : stats.micros / kMicrosInSec / stats.count, num_input_records.c_str(), num_dropped_records.c_str()); } + +// Assumes that trailing numbers represent an optional argument. This requires +// property names to not end with numbers. 
+std::pair GetPropertyNameAndArg(const Slice& property) { + Slice name = property, arg = property; + size_t sfx_len = 0; + while (sfx_len < property.size() && + isdigit(property[property.size() - sfx_len - 1])) { + ++sfx_len; + } + name.remove_suffix(sfx_len); + arg.remove_prefix(property.size() - sfx_len); + return {name, arg}; } +} // anonymous namespace static const std::string rocksdb_prefix = "rocksdb."; @@ -99,9 +114,8 @@ static const std::string compaction_pending = "compaction-pending"; static const std::string background_errors = "background-errors"; static const std::string cur_size_active_mem_table = "cur-size-active-mem-table"; -static const std::string cur_size_unflushed_mem_tables = - "cur-size-all-mem-tables"; -static const std::string cur_size_all_mem_tables = "size-all-mem-tables"; +static const std::string cur_size_all_mem_tables = "cur-size-all-mem-tables"; +static const std::string size_all_mem_tables = "size-all-mem-tables"; static const std::string num_entries_active_mem_table = "num-entries-active-mem-table"; static const std::string num_entries_imm_mem_tables = @@ -118,6 +132,8 @@ static const std::string is_file_deletions_enabled = static const std::string num_snapshots = "num-snapshots"; static const std::string oldest_snapshot_time = "oldest-snapshot-time"; static const std::string num_live_versions = "num-live-versions"; +static const std::string current_version_number = + "current-super-version-number"; static const std::string estimate_live_data_size = "estimate-live-data-size"; static const std::string base_level = "base-level"; static const std::string total_sst_files_size = "total-sst-files-size"; @@ -136,8 +152,11 @@ const std::string DB::Properties::kStats = rocksdb_prefix + allstats; const std::string DB::Properties::kSSTables = rocksdb_prefix + sstables; const std::string DB::Properties::kCFStats = rocksdb_prefix + cfstats; const std::string DB::Properties::kDBStats = rocksdb_prefix + dbstats; +const std::string 
DB::Properties::kLevelStats = rocksdb_prefix + levelstats; const std::string DB::Properties::kNumImmutableMemTable = rocksdb_prefix + num_immutable_mem_table; +const std::string DB::Properties::kNumImmutableMemTableFlushed = + rocksdb_prefix + num_immutable_mem_table_flushed; const std::string DB::Properties::kMemTableFlushPending = rocksdb_prefix + mem_table_flush_pending; const std::string DB::Properties::kCompactionPending = @@ -151,9 +170,9 @@ const std::string DB::Properties::kBackgroundErrors = const std::string DB::Properties::kCurSizeActiveMemTable = rocksdb_prefix + cur_size_active_mem_table; const std::string DB::Properties::kCurSizeAllMemTables = - rocksdb_prefix + cur_size_unflushed_mem_tables; -const std::string DB::Properties::kSizeAllMemTables = rocksdb_prefix + cur_size_all_mem_tables; +const std::string DB::Properties::kSizeAllMemTables = + rocksdb_prefix + size_all_mem_tables; const std::string DB::Properties::kNumEntriesActiveMemTable = rocksdb_prefix + num_entries_active_mem_table; const std::string DB::Properties::kNumEntriesImmMemTables = @@ -174,10 +193,13 @@ const std::string DB::Properties::kOldestSnapshotTime = rocksdb_prefix + oldest_snapshot_time; const std::string DB::Properties::kNumLiveVersions = rocksdb_prefix + num_live_versions; +const std::string DB::Properties::kCurrentSuperVersionNumber = + rocksdb_prefix + current_version_number; const std::string DB::Properties::kEstimateLiveDataSize = rocksdb_prefix + estimate_live_data_size; const std::string DB::Properties::kTotalSstFilesSize = rocksdb_prefix + total_sst_files_size; +const std::string DB::Properties::kBaseLevel = rocksdb_prefix + base_level; const std::string DB::Properties::kEstimatePendingCompactionBytes = rocksdb_prefix + estimate_pending_comp_bytes; const std::string DB::Properties::kAggregatedTableProperties = @@ -185,294 +207,368 @@ const std::string DB::Properties::kAggregatedTableProperties = const std::string DB::Properties::kAggregatedTablePropertiesAtLevel = 
rocksdb_prefix + aggregated_table_properties_at_level; -DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property, - bool* need_out_of_mutex) { - assert(is_int_property != nullptr); - assert(need_out_of_mutex != nullptr); - Slice in = property; - Slice prefix(rocksdb_prefix); - *need_out_of_mutex = false; - *is_int_property = false; - if (!in.starts_with(prefix)) { - return kUnknown; - } - in.remove_prefix(prefix.size()); - - if (in.starts_with(num_files_at_level_prefix)) { - return kNumFilesAtLevel; - } else if (in == levelstats) { - return kLevelStats; - } else if (in == allstats) { - return kStats; - } else if (in == cfstats) { - return kCFStats; - } else if (in == dbstats) { - return kDBStats; - } else if (in == sstables) { - return kSsTables; - } else if (in == aggregated_table_properties) { - return kAggregatedTableProperties; - } else if (in.starts_with(aggregated_table_properties_at_level)) { - return kAggregatedTablePropertiesAtLevel; +const std::unordered_map InternalStats::ppt_name_to_info = { + {DB::Properties::kNumFilesAtLevelPrefix, + {false, &InternalStats::HandleNumFilesAtLevel, nullptr}}, + {DB::Properties::kLevelStats, + {false, &InternalStats::HandleLevelStats, nullptr}}, + {DB::Properties::kStats, {false, &InternalStats::HandleStats, nullptr}}, + {DB::Properties::kCFStats, {false, &InternalStats::HandleCFStats, nullptr}}, + {DB::Properties::kDBStats, {false, &InternalStats::HandleDBStats, nullptr}}, + {DB::Properties::kSSTables, + {false, &InternalStats::HandleSsTables, nullptr}}, + {DB::Properties::kAggregatedTableProperties, + {false, &InternalStats::HandleAggregatedTableProperties, nullptr}}, + {DB::Properties::kAggregatedTablePropertiesAtLevel, + {false, &InternalStats::HandleAggregatedTablePropertiesAtLevel, nullptr}}, + {DB::Properties::kNumImmutableMemTable, + {false, nullptr, &InternalStats::HandleNumImmutableMemTable}}, + {DB::Properties::kNumImmutableMemTableFlushed, + {false, nullptr, 
&InternalStats::HandleNumImmutableMemTableFlushed}}, + {DB::Properties::kMemTableFlushPending, + {false, nullptr, &InternalStats::HandleMemTableFlushPending}}, + {DB::Properties::kCompactionPending, + {false, nullptr, &InternalStats::HandleCompactionPending}}, + {DB::Properties::kBackgroundErrors, + {false, nullptr, &InternalStats::HandleBackgroundErrors}}, + {DB::Properties::kCurSizeActiveMemTable, + {false, nullptr, &InternalStats::HandleCurSizeActiveMemTable}}, + {DB::Properties::kCurSizeAllMemTables, + {false, nullptr, &InternalStats::HandleCurSizeAllMemTables}}, + {DB::Properties::kSizeAllMemTables, + {false, nullptr, &InternalStats::HandleSizeAllMemTables}}, + {DB::Properties::kNumEntriesActiveMemTable, + {false, nullptr, &InternalStats::HandleNumEntriesActiveMemTable}}, + {DB::Properties::kNumEntriesImmMemTables, + {false, nullptr, &InternalStats::HandleNumEntriesImmMemTables}}, + {DB::Properties::kNumDeletesActiveMemTable, + {false, nullptr, &InternalStats::HandleNumDeletesActiveMemTable}}, + {DB::Properties::kNumDeletesImmMemTables, + {false, nullptr, &InternalStats::HandleNumDeletesImmMemTables}}, + {DB::Properties::kEstimateNumKeys, + {false, nullptr, &InternalStats::HandleEstimateNumKeys}}, + {DB::Properties::kEstimateTableReadersMem, + {true, nullptr, &InternalStats::HandleEstimateTableReadersMem}}, + {DB::Properties::kIsFileDeletionsEnabled, + {false, nullptr, &InternalStats::HandleIsFileDeletionsEnabled}}, + {DB::Properties::kNumSnapshots, + {false, nullptr, &InternalStats::HandleNumSnapshots}}, + {DB::Properties::kOldestSnapshotTime, + {false, nullptr, &InternalStats::HandleOldestSnapshotTime}}, + {DB::Properties::kNumLiveVersions, + {false, nullptr, &InternalStats::HandleNumLiveVersions}}, + {DB::Properties::kCurrentSuperVersionNumber, + {false, nullptr, &InternalStats::HandleCurrentSuperVersionNumber}}, + {DB::Properties::kEstimateLiveDataSize, + {true, nullptr, &InternalStats::HandleEstimateLiveDataSize}}, + {DB::Properties::kBaseLevel, + {false, 
nullptr, &InternalStats::HandleBaseLevel}}, + {DB::Properties::kTotalSstFilesSize, + {false, nullptr, &InternalStats::HandleTotalSstFilesSize}}, + {DB::Properties::kEstimatePendingCompactionBytes, + {false, nullptr, &InternalStats::HandleEstimatePendingCompactionBytes}}, + {DB::Properties::kNumRunningFlushes, + {false, nullptr, &InternalStats::HandleNumRunningFlushes}}, + {DB::Properties::kNumRunningCompactions, + {false, nullptr, &InternalStats::HandleNumRunningCompactions}}, +}; + +const DBPropertyInfo* GetPropertyInfo(const Slice& property) { + std::string ppt_name = GetPropertyNameAndArg(property).first.ToString(); + auto ppt_info_iter = InternalStats::ppt_name_to_info.find(ppt_name); + if (ppt_info_iter == InternalStats::ppt_name_to_info.end()) { + return nullptr; } + return &ppt_info_iter->second; +} - *is_int_property = true; - if (in == num_immutable_mem_table) { - return kNumImmutableMemTable; - } else if (in == num_immutable_mem_table_flushed) { - return kNumImmutableMemTableFlushed; - } else if (in == mem_table_flush_pending) { - return kMemtableFlushPending; - } else if (in == compaction_pending) { - return kCompactionPending; - } else if (in == background_errors) { - return kBackgroundErrors; - } else if (in == cur_size_active_mem_table) { - return kCurSizeActiveMemTable; - } else if (in == cur_size_unflushed_mem_tables) { - return kCurSizeAllMemTables; - } else if (in == cur_size_all_mem_tables) { - return kSizeAllMemTables; - } else if (in == num_entries_active_mem_table) { - return kNumEntriesInMutableMemtable; - } else if (in == num_entries_imm_mem_tables) { - return kNumEntriesInImmutableMemtable; - } else if (in == num_deletes_active_mem_table) { - return kNumDeletesInMutableMemtable; - } else if (in == num_deletes_imm_mem_tables) { - return kNumDeletesInImmutableMemtable; - } else if (in == estimate_num_keys) { - return kEstimatedNumKeys; - } else if (in == estimate_table_readers_mem) { - *need_out_of_mutex = true; - return 
kEstimatedUsageByTableReaders; - } else if (in == is_file_deletions_enabled) { - return kIsFileDeletionEnabled; - } else if (in == num_snapshots) { - return kNumSnapshots; - } else if (in == oldest_snapshot_time) { - return kOldestSnapshotTime; - } else if (in == num_live_versions) { - return kNumLiveVersions; - } else if (in == estimate_live_data_size) { - *need_out_of_mutex = true; - return kEstimateLiveDataSize; - } else if (in == base_level) { - return kBaseLevel; - } else if (in == total_sst_files_size) { - return kTotalSstFilesSize; - } else if (in == estimate_pending_comp_bytes) { - return kEstimatePendingCompactionBytes; - } else if (in == num_running_flushes) { - return kNumRunningFlushes; - } else if (in == num_running_compactions) { - return kNumRunningCompactions; - } - return kUnknown; +bool InternalStats::GetStringProperty(const DBPropertyInfo& property_info, + const Slice& property, + std::string* value) { + assert(value != nullptr); + assert(property_info.handle_string != nullptr); + Slice arg = GetPropertyNameAndArg(property).second; + return (this->*(property_info.handle_string))(value, arg); +} + +bool InternalStats::GetIntProperty(const DBPropertyInfo& property_info, + uint64_t* value, DBImpl* db) { + assert(value != nullptr); + assert(property_info.handle_int != nullptr && + !property_info.need_out_of_mutex); + db->mutex_.AssertHeld(); + return (this->*(property_info.handle_int))(value, db, nullptr /* version */); } -bool InternalStats::GetIntPropertyOutOfMutex(DBPropertyType property_type, - Version* version, - uint64_t* value) const { +bool InternalStats::GetIntPropertyOutOfMutex( + const DBPropertyInfo& property_info, Version* version, uint64_t* value) { assert(value != nullptr); + assert(property_info.handle_int != nullptr && + property_info.need_out_of_mutex); + return (this->*(property_info.handle_int))(value, nullptr /* db */, version); +} + +bool InternalStats::HandleNumFilesAtLevel(std::string* value, Slice suffix) { + uint64_t level; 
+ const auto* vstorage = cfd_->current()->storage_info(); + bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty(); + if (!ok || static_cast(level) >= number_levels_) { + return false; + } else { + char buf[100]; + snprintf(buf, sizeof(buf), "%d", + vstorage->NumLevelFiles(static_cast(level))); + *value = buf; + return true; + } +} + +bool InternalStats::HandleLevelStats(std::string* value, Slice suffix) { + char buf[1000]; const auto* vstorage = cfd_->current()->storage_info(); + snprintf(buf, sizeof(buf), + "Level Files Size(MB)\n" + "--------------------\n"); + value->append(buf); - switch (property_type) { - case kEstimatedUsageByTableReaders: - *value = (version == nullptr) ? - 0 : version->GetMemoryUsageByTableReaders(); - return true; - case kEstimateLiveDataSize: - *value = vstorage->EstimateLiveDataSize(); - return true; - default: - return false; + for (int level = 0; level < number_levels_; level++) { + snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level, + vstorage->NumLevelFiles(level), + vstorage->NumLevelBytes(level) / kMB); + value->append(buf); } + return true; } -bool InternalStats::GetStringProperty(DBPropertyType property_type, - const Slice& property, - std::string* value) { - assert(value != nullptr); +bool InternalStats::HandleStats(std::string* value, Slice suffix) { + if (!HandleCFStats(value, suffix)) { + return false; + } + if (!HandleDBStats(value, suffix)) { + return false; + } + return true; +} + +bool InternalStats::HandleCFStats(std::string* value, Slice suffix) { + DumpCFStats(value); + return true; +} + +bool InternalStats::HandleDBStats(std::string* value, Slice suffix) { + DumpDBStats(value); + return true; +} + +bool InternalStats::HandleSsTables(std::string* value, Slice suffix) { auto* current = cfd_->current(); - const auto* vstorage = current->storage_info(); - Slice in = property; - - switch (property_type) { - case kNumFilesAtLevel: { - in.remove_prefix(strlen("rocksdb.num-files-at-level")); - uint64_t level; - 
bool ok = ConsumeDecimalNumber(&in, &level) && in.empty(); - if (!ok || (int)level >= number_levels_) { - return false; - } else { - char buf[100]; - snprintf(buf, sizeof(buf), "%d", - vstorage->NumLevelFiles(static_cast(level))); - *value = buf; - return true; - } - } - case kLevelStats: { - char buf[1000]; - snprintf(buf, sizeof(buf), - "Level Files Size(MB)\n" - "--------------------\n"); - value->append(buf); + *value = current->DebugString(); + return true; +} - for (int level = 0; level < number_levels_; level++) { - snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level, - vstorage->NumLevelFiles(level), - vstorage->NumLevelBytes(level) / kMB); - value->append(buf); - } - return true; - } - case kStats: { - if (!GetStringProperty(kCFStats, DB::Properties::kCFStats, value)) { - return false; - } - if (!GetStringProperty(kDBStats, DB::Properties::kDBStats, value)) { - return false; - } - return true; - } - case kCFStats: { - DumpCFStats(value); - return true; - } - case kDBStats: { - DumpDBStats(value); - return true; - } - case kSsTables: - *value = current->DebugString(); - return true; - case kAggregatedTableProperties: { - std::shared_ptr tp; - auto s = cfd_->current()->GetAggregatedTableProperties(&tp); - if (!s.ok()) { - return false; - } - *value = tp->ToString(); - return true; - } - case kAggregatedTablePropertiesAtLevel: { - in.remove_prefix( - DB::Properties::kAggregatedTablePropertiesAtLevel.length()); - uint64_t level; - bool ok = ConsumeDecimalNumber(&in, &level) && in.empty(); - if (!ok || static_cast(level) >= number_levels_) { - return false; - } - std::shared_ptr tp; - auto s = cfd_->current()->GetAggregatedTableProperties( - &tp, static_cast(level)); - if (!s.ok()) { - return false; - } - *value = tp->ToString(); - return true; - } - default: - return false; +bool InternalStats::HandleAggregatedTableProperties(std::string* value, + Slice suffix) { + std::shared_ptr tp; + auto s = cfd_->current()->GetAggregatedTableProperties(&tp); + if 
(!s.ok()) { + return false; } + *value = tp->ToString(); + return true; } -bool InternalStats::GetIntProperty(DBPropertyType property_type, - uint64_t* value, DBImpl* db) const { - db->mutex_.AssertHeld(); +bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* value, + Slice suffix) { + uint64_t level; + bool ok = ConsumeDecimalNumber(&suffix, &level) && suffix.empty(); + if (!ok || static_cast(level) >= number_levels_) { + return false; + } + std::shared_ptr tp; + auto s = cfd_->current()->GetAggregatedTableProperties( + &tp, static_cast(level)); + if (!s.ok()) { + return false; + } + *value = tp->ToString(); + return true; +} + +bool InternalStats::HandleNumImmutableMemTable(uint64_t* value, DBImpl* db, + Version* version) { + *value = cfd_->imm()->NumNotFlushed(); + return true; +} + +bool InternalStats::HandleNumImmutableMemTableFlushed(uint64_t* value, + DBImpl* db, + Version* version) { + *value = cfd_->imm()->NumFlushed(); + return true; +} + +bool InternalStats::HandleMemTableFlushPending(uint64_t* value, DBImpl* db, + Version* version) { + // Return number of mem tables that are ready to flush (made immutable) + *value = (cfd_->imm()->IsFlushPending() ? 1 : 0); + return true; +} + +bool InternalStats::HandleNumRunningFlushes(uint64_t* value, DBImpl* db, + Version* version) { + *value = db->num_running_flushes(); + return true; +} + +bool InternalStats::HandleCompactionPending(uint64_t* value, DBImpl* db, + Version* version) { + // 1 if the system already determines at least one compaction is needed. + // 0 otherwise, const auto* vstorage = cfd_->current()->storage_info(); + *value = (cfd_->compaction_picker()->NeedsCompaction(vstorage) ? 
1 : 0); + return true; +} - switch (property_type) { - case kNumImmutableMemTable: - *value = cfd_->imm()->NumNotFlushed(); - return true; - case kNumImmutableMemTableFlushed: - *value = cfd_->imm()->NumFlushed(); - return true; - case kMemtableFlushPending: - // Return number of mem tables that are ready to flush (made immutable) - *value = (cfd_->imm()->IsFlushPending() ? 1 : 0); - return true; - case kNumRunningFlushes: - *value = db->num_running_flushes(); - return true; - case kCompactionPending: - // 1 if the system already determines at least one compaction is needed. - // 0 otherwise, - *value = (cfd_->compaction_picker()->NeedsCompaction(vstorage) ? 1 : 0); - return true; - case kNumRunningCompactions: - *value = db->num_running_compactions_; - return true; - case kBackgroundErrors: - // Accumulated number of errors in background flushes or compactions. - *value = GetBackgroundErrorCount(); - return true; - case kCurSizeActiveMemTable: - // Current size of the active memtable - *value = cfd_->mem()->ApproximateMemoryUsage(); - return true; - case kCurSizeAllMemTables: - // Current size of the active memtable + immutable memtables - *value = cfd_->mem()->ApproximateMemoryUsage() + - cfd_->imm()->ApproximateUnflushedMemTablesMemoryUsage(); - return true; - case kSizeAllMemTables: - *value = cfd_->mem()->ApproximateMemoryUsage() + - cfd_->imm()->ApproximateMemoryUsage(); - return true; - case kNumEntriesInMutableMemtable: - // Current number of entires in the active memtable - *value = cfd_->mem()->num_entries(); - return true; - case kNumEntriesInImmutableMemtable: - // Current number of entries in the immutable memtables - *value = cfd_->imm()->current()->GetTotalNumEntries(); - return true; - case kNumDeletesInMutableMemtable: - // Current number of entires in the active memtable - *value = cfd_->mem()->num_deletes(); - return true; - case kNumDeletesInImmutableMemtable: - // Current number of entries in the immutable memtables - *value = 
cfd_->imm()->current()->GetTotalNumDeletes(); - return true; - case kEstimatedNumKeys: - // Estimate number of entries in the column family: - // Use estimated entries in tables + total entries in memtables. - *value = cfd_->mem()->num_entries() + - cfd_->imm()->current()->GetTotalNumEntries() - - (cfd_->mem()->num_deletes() + - cfd_->imm()->current()->GetTotalNumDeletes()) * - 2 + - vstorage->GetEstimatedActiveKeys(); - return true; - case kNumSnapshots: - *value = db->snapshots().count(); - return true; - case kOldestSnapshotTime: - *value = static_cast(db->snapshots().GetOldestSnapshotTime()); - return true; - case kNumLiveVersions: - *value = cfd_->GetNumLiveVersions(); - return true; - case kIsFileDeletionEnabled: - *value = db->IsFileDeletionsEnabled(); - return true; - case kBaseLevel: - *value = vstorage->base_level(); - return true; - case kTotalSstFilesSize: - *value = cfd_->GetTotalSstFilesSize(); - return true; - case kEstimatePendingCompactionBytes: - *value = vstorage->estimated_compaction_needed_bytes(); - return true; - default: - return false; - } +bool InternalStats::HandleNumRunningCompactions(uint64_t* value, DBImpl* db, + Version* version) { + *value = db->num_running_compactions_; + return true; +} + +bool InternalStats::HandleBackgroundErrors(uint64_t* value, DBImpl* db, + Version* version) { + // Accumulated number of errors in background flushes or compactions. 
+ *value = GetBackgroundErrorCount(); + return true; +} + +bool InternalStats::HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* db, + Version* version) { + // Current size of the active memtable + *value = cfd_->mem()->ApproximateMemoryUsage(); + return true; +} + +bool InternalStats::HandleCurSizeAllMemTables(uint64_t* value, DBImpl* db, + Version* version) { + // Current size of the active memtable + immutable memtables + *value = cfd_->mem()->ApproximateMemoryUsage() + + cfd_->imm()->ApproximateUnflushedMemTablesMemoryUsage(); + return true; +} + +bool InternalStats::HandleSizeAllMemTables(uint64_t* value, DBImpl* db, + Version* version) { + *value = cfd_->mem()->ApproximateMemoryUsage() + + cfd_->imm()->ApproximateMemoryUsage(); + return true; +} + +bool InternalStats::HandleNumEntriesActiveMemTable(uint64_t* value, DBImpl* db, + Version* version) { + // Current number of entires in the active memtable + *value = cfd_->mem()->num_entries(); + return true; +} + +bool InternalStats::HandleNumEntriesImmMemTables(uint64_t* value, DBImpl* db, + Version* version) { + // Current number of entries in the immutable memtables + *value = cfd_->imm()->current()->GetTotalNumEntries(); + return true; +} + +bool InternalStats::HandleNumDeletesActiveMemTable(uint64_t* value, DBImpl* db, + Version* version) { + // Current number of entires in the active memtable + *value = cfd_->mem()->num_deletes(); + return true; +} + +bool InternalStats::HandleNumDeletesImmMemTables(uint64_t* value, DBImpl* db, + Version* version) { + // Current number of entries in the immutable memtables + *value = cfd_->imm()->current()->GetTotalNumDeletes(); + return true; +} + +bool InternalStats::HandleEstimateNumKeys(uint64_t* value, DBImpl* db, + Version* version) { + // Estimate number of entries in the column family: + // Use estimated entries in tables + total entries in memtables. 
+ const auto* vstorage = cfd_->current()->storage_info(); + *value = cfd_->mem()->num_entries() + + cfd_->imm()->current()->GetTotalNumEntries() - + (cfd_->mem()->num_deletes() + + cfd_->imm()->current()->GetTotalNumDeletes()) * + 2 + + vstorage->GetEstimatedActiveKeys(); + return true; +} + +bool InternalStats::HandleNumSnapshots(uint64_t* value, DBImpl* db, + Version* version) { + *value = db->snapshots().count(); + return true; +} + +bool InternalStats::HandleOldestSnapshotTime(uint64_t* value, DBImpl* db, + Version* version) { + *value = static_cast(db->snapshots().GetOldestSnapshotTime()); + return true; +} + +bool InternalStats::HandleNumLiveVersions(uint64_t* value, DBImpl* db, + Version* version) { + *value = cfd_->GetNumLiveVersions(); + return true; +} + +bool InternalStats::HandleCurrentSuperVersionNumber(uint64_t* value, DBImpl* db, + Version* version) { + *value = cfd_->GetSuperVersionNumber(); + return true; +} + +bool InternalStats::HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db, + Version* version) { + *value = db->IsFileDeletionsEnabled(); + return true; +} + +bool InternalStats::HandleBaseLevel(uint64_t* value, DBImpl* db, + Version* version) { + const auto* vstorage = cfd_->current()->storage_info(); + *value = vstorage->base_level(); + return true; +} + +bool InternalStats::HandleTotalSstFilesSize(uint64_t* value, DBImpl* db, + Version* version) { + *value = cfd_->GetTotalSstFilesSize(); + return true; +} + +bool InternalStats::HandleEstimatePendingCompactionBytes(uint64_t* value, + DBImpl* db, + Version* version) { + const auto* vstorage = cfd_->current()->storage_info(); + *value = vstorage->estimated_compaction_needed_bytes(); + return true; +} + +bool InternalStats::HandleEstimateTableReadersMem(uint64_t* value, DBImpl* db, + Version* version) { + *value = (version == nullptr) ? 
0 : version->GetMemoryUsageByTableReaders(); + return true; +} + +bool InternalStats::HandleEstimateLiveDataSize(uint64_t* value, DBImpl* db, + Version* version) { + const auto* vstorage = cfd_->current()->storage_info(); + *value = vstorage->EstimateLiveDataSize(); + return true; } void InternalStats::DumpDBStats(std::string* value) { @@ -757,10 +853,7 @@ void InternalStats::DumpCFStats(std::string* value) { #else -DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property, - bool* need_out_of_mutex) { - return kUnknown; -} +const DBPropertyInfo* GetPropertyInfo(const Slice& property) { return nullptr; } #endif // !ROCKSDB_LITE diff --git a/db/internal_stats.h b/db/internal_stats.h index 9c4414ef1..03b2bd882 100644 --- a/db/internal_stats.h +++ b/db/internal_stats.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -21,63 +21,29 @@ namespace rocksdb { class MemTableList; class DBImpl; -// IMPORTANT: If you add a new property here, also add it to the list in -// include/rocksdb/db.h -enum DBPropertyType : uint32_t { - kUnknown, - kNumFilesAtLevel, // Number of files at a specific level - kLevelStats, // Return number of files and total sizes of each level - kCFStats, // Return general statitistics of CF - kDBStats, // Return general statitistics of DB - kStats, // Return general statitistics of both DB and CF - kSsTables, // Return a human readable string of current SST files - kStartIntTypes, // ---- Dummy value to indicate the start of integer values - kNumImmutableMemTable, // Return number of immutable mem tables that - // have not been flushed. 
- kNumImmutableMemTableFlushed, // Return number of immutable mem tables - // in memory that have already been flushed - kMemtableFlushPending, // Return 1 if mem table flushing is pending, - // otherwise 0. - kNumRunningFlushes, // Return the number of currently running flushes. - kCompactionPending, // Return 1 if a compaction is pending. Otherwise 0. - kNumRunningCompactions, // Return the number of currently running - // compactions. - kBackgroundErrors, // Return accumulated background errors encountered. - kCurSizeActiveMemTable, // Return current size of the active memtable - kCurSizeAllMemTables, // Return current size of unflushed - // (active + immutable) memtables - kSizeAllMemTables, // Return current size of all (active + immutable - // + pinned) memtables - kNumEntriesInMutableMemtable, // Return number of deletes in the mutable - // memtable. - kNumEntriesInImmutableMemtable, // Return sum of number of entries in all - // the immutable mem tables. - kNumDeletesInMutableMemtable, // Return number of deletion entries in the - // mutable memtable. - kNumDeletesInImmutableMemtable, // Return the total number of deletion - // entries in all the immutable mem tables. - kEstimatedNumKeys, // Estimated total number of keys in the database. - kEstimatedUsageByTableReaders, // Estimated memory by table readers. - kIsFileDeletionEnabled, // Equals disable_delete_obsolete_files_, - // 0 means file deletions enabled - kNumSnapshots, // Number of snapshots in the system - kOldestSnapshotTime, // Unix timestamp of the first snapshot - kNumLiveVersions, - kEstimateLiveDataSize, // Estimated amount of live data in bytes - kTotalSstFilesSize, // Total size of all sst files. - kBaseLevel, // The level that L0 data is compacted to - kEstimatePendingCompactionBytes, // Estimated bytes to compaction - kAggregatedTableProperties, // Return a string that contains the aggregated - // table properties. 
- kAggregatedTablePropertiesAtLevel, // Return a string that contains the - // aggregated - // table properties at the specified level. +// Config for retrieving a property's value. +struct DBPropertyInfo { + bool need_out_of_mutex; + + // gcc had an internal error for initializing union of pointer-to-member- + // functions. Workaround is to populate exactly one of the following function + // pointers with a non-nullptr value. + + // @param value Value-result argument for storing the property's string value + // @param suffix Argument portion of the property. For example, suffix would + // be "5" for the property "rocksdb.num-files-at-level5". So far, only + // certain string properties take an argument. + bool (InternalStats::*handle_string)(std::string* value, Slice suffix); + + // @param value Value-result argument for storing the property's uint64 value + // @param db Many of the int properties rely on DBImpl methods. + // @param version Version is needed in case the property is retrieved without + // holding db mutex, which is only supported for int properties. 
+ bool (InternalStats::*handle_int)(uint64_t* value, DBImpl* db, + Version* version); }; -extern DBPropertyType GetPropertyType(const Slice& property, - bool* is_int_property, - bool* need_out_of_mutex); - +extern const DBPropertyInfo* GetPropertyInfo(const Slice& property); #ifndef ROCKSDB_LITE class InternalStats { @@ -248,14 +214,18 @@ class InternalStats { uint64_t BumpAndGetBackgroundErrorCount() { return ++bg_error_count_; } - bool GetStringProperty(DBPropertyType property_type, const Slice& property, - std::string* value); + bool GetStringProperty(const DBPropertyInfo& property_info, + const Slice& property, std::string* value); + + bool GetIntProperty(const DBPropertyInfo& property_info, uint64_t* value, + DBImpl* db); - bool GetIntProperty(DBPropertyType property_type, uint64_t* value, - DBImpl* db) const; + bool GetIntPropertyOutOfMutex(const DBPropertyInfo& property_info, + Version* version, uint64_t* value); - bool GetIntPropertyOutOfMutex(DBPropertyType property_type, Version* version, - uint64_t* value) const; + // Store a mapping from the user-facing DB::Properties string to our + // DBPropertyInfo struct used internally for retrieving properties. + static const std::unordered_map ppt_name_to_info; private: void DumpDBStats(std::string* value); @@ -321,6 +291,56 @@ class InternalStats { seconds_up(0) {} } db_stats_snapshot_; + // Handler functions for getting property values. They use "value" as a value- + // result argument, and return true upon successfully setting "value". 
+ bool HandleNumFilesAtLevel(std::string* value, Slice suffix); + bool HandleLevelStats(std::string* value, Slice suffix); + bool HandleStats(std::string* value, Slice suffix); + bool HandleCFStats(std::string* value, Slice suffix); + bool HandleDBStats(std::string* value, Slice suffix); + bool HandleSsTables(std::string* value, Slice suffix); + bool HandleAggregatedTableProperties(std::string* value, Slice suffix); + bool HandleAggregatedTablePropertiesAtLevel(std::string* value, Slice suffix); + bool HandleNumImmutableMemTable(uint64_t* value, DBImpl* db, + Version* version); + bool HandleNumImmutableMemTableFlushed(uint64_t* value, DBImpl* db, + Version* version); + bool HandleMemTableFlushPending(uint64_t* value, DBImpl* db, + Version* version); + bool HandleNumRunningFlushes(uint64_t* value, DBImpl* db, Version* version); + bool HandleCompactionPending(uint64_t* value, DBImpl* db, Version* version); + bool HandleNumRunningCompactions(uint64_t* value, DBImpl* db, + Version* version); + bool HandleBackgroundErrors(uint64_t* value, DBImpl* db, Version* version); + bool HandleCurSizeActiveMemTable(uint64_t* value, DBImpl* db, + Version* version); + bool HandleCurSizeAllMemTables(uint64_t* value, DBImpl* db, Version* version); + bool HandleSizeAllMemTables(uint64_t* value, DBImpl* db, Version* version); + bool HandleNumEntriesActiveMemTable(uint64_t* value, DBImpl* db, + Version* version); + bool HandleNumEntriesImmMemTables(uint64_t* value, DBImpl* db, + Version* version); + bool HandleNumDeletesActiveMemTable(uint64_t* value, DBImpl* db, + Version* version); + bool HandleNumDeletesImmMemTables(uint64_t* value, DBImpl* db, + Version* version); + bool HandleEstimateNumKeys(uint64_t* value, DBImpl* db, Version* version); + bool HandleNumSnapshots(uint64_t* value, DBImpl* db, Version* version); + bool HandleOldestSnapshotTime(uint64_t* value, DBImpl* db, Version* version); + bool HandleNumLiveVersions(uint64_t* value, DBImpl* db, Version* version); + bool 
HandleCurrentSuperVersionNumber(uint64_t* value, DBImpl* db, + Version* version); + bool HandleIsFileDeletionsEnabled(uint64_t* value, DBImpl* db, + Version* version); + bool HandleBaseLevel(uint64_t* value, DBImpl* db, Version* version); + bool HandleTotalSstFilesSize(uint64_t* value, DBImpl* db, Version* version); + bool HandleEstimatePendingCompactionBytes(uint64_t* value, DBImpl* db, + Version* version); + bool HandleEstimateTableReadersMem(uint64_t* value, DBImpl* db, + Version* version); + bool HandleEstimateLiveDataSize(uint64_t* value, DBImpl* db, + Version* version); + // Total number of background errors encountered. Every time a flush task // or compaction task fails, this counter is incremented. The failure can // be caused by any possible reason, including file system errors, out of @@ -402,14 +422,20 @@ class InternalStats { uint64_t BumpAndGetBackgroundErrorCount() { return 0; } - bool GetStringProperty(DBPropertyType property_type, const Slice& property, - std::string* value) { return false; } + bool GetStringProperty(const DBPropertyInfo& property_info, + const Slice& property, std::string* value) { + return false; + } - bool GetIntProperty(DBPropertyType property_type, uint64_t* value, - DBImpl* db) const { return false; } + bool GetIntProperty(const DBPropertyInfo& property_info, uint64_t* value, + DBImpl* db) const { + return false; + } - bool GetIntPropertyOutOfMutex(DBPropertyType property_type, Version* version, - uint64_t* value) const { return false; } + bool GetIntPropertyOutOfMutex(const DBPropertyInfo& property_info, + Version* version, uint64_t* value) const { + return false; + } }; #endif // !ROCKSDB_LITE diff --git a/db/job_context.h b/db/job_context.h index 5a54e2d85..0f24136a3 100644 --- a/db/job_context.h +++ b/db/job_context.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -22,9 +22,9 @@ class MemTable; struct JobContext { inline bool HaveSomethingToDelete() const { return full_scan_candidate_files.size() || sst_delete_files.size() || - log_delete_files.size() || new_superversion != nullptr || - superversions_to_free.size() > 0 || memtables_to_free.size() > 0 || - logs_to_free.size() > 0; + log_delete_files.size() || manifest_delete_files.size() || + new_superversion != nullptr || superversions_to_free.size() > 0 || + memtables_to_free.size() > 0 || logs_to_free.size() > 0; } // Structure to store information for candidate files to delete. @@ -56,6 +56,9 @@ struct JobContext { // a list of log files that we need to delete std::vector log_delete_files; + // a list of manifest files that we need to delete + std::vector manifest_delete_files; + // a list of memtables to be free autovector memtables_to_free; diff --git a/db/listener_test.cc b/db/listener_test.cc index f194a113a..fe5547705 100644 --- a/db/listener_test.cc +++ b/db/listener_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/log_format.h b/db/log_format.h index 97eb13393..cf48a202f 100644 --- a/db/log_format.h +++ b/db/log_format.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/log_reader.cc b/db/log_reader.cc index 512dd08d3..c33c43c53 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/log_reader.h b/db/log_reader.h index 28f0a2c1e..ab9dbab7e 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/log_test.cc b/db/log_test.cc index 41f4c8223..427a31a2b 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/log_writer.cc b/db/log_writer.cc index 84780d87f..3277088be 100644 --- a/db/log_writer.cc +++ b/db/log_writer.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/log_writer.h b/db/log_writer.h index 876e4c56f..23d896746 100644 --- a/db/log_writer.h +++ b/db/log_writer.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/managed_iterator.cc b/db/managed_iterator.cc index 45faeba4e..1d47f933d 100644 --- a/db/managed_iterator.cc +++ b/db/managed_iterator.cc @@ -1,10 +1,12 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
#ifndef ROCKSDB_LITE +#include "db/managed_iterator.h" + #include #include #include @@ -13,7 +15,7 @@ #include "db/db_impl.h" #include "db/db_iter.h" #include "db/dbformat.h" -#include "db/managed_iterator.h" +#include "db/xfunc_test_points.h" #include "rocksdb/env.h" #include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" @@ -77,7 +79,7 @@ ManagedIterator::ManagedIterator(DBImpl* db, const ReadOptions& read_options, release_supported_(true) { read_options_.managed = false; if ((!read_options_.tailing) && (read_options_.snapshot == nullptr)) { - assert(read_options_.snapshot = db_->GetSnapshot()); + assert(nullptr != (read_options_.snapshot = db_->GetSnapshot())); snapshot_created_ = true; } cfh_.SetCFD(cfd); @@ -208,7 +210,8 @@ void ManagedIterator::RebuildIterator() { void ManagedIterator::UpdateCurrent() { assert(mutable_iter_ != nullptr); - if (!(valid_ = mutable_iter_->Valid())) { + valid_ = mutable_iter_->Valid(); + if (!valid_) { status_ = mutable_iter_->status(); return; } diff --git a/db/managed_iterator.h b/db/managed_iterator.h index 00f56aea4..d9a87596e 100644 --- a/db/managed_iterator.h +++ b/db/managed_iterator.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/manual_compaction_test.cc b/db/manual_compaction_test.cc index 8613b7b36..0ff52d184 100644 --- a/db/manual_compaction_test.cc +++ b/db/manual_compaction_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/memtable.cc b/db/memtable.cc index a8f869261..f34acb319 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/memtable.h b/db/memtable.h index 110985620..a01a598f0 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/memtable_allocator.cc b/db/memtable_allocator.cc index 1ed2019b6..f9b2fbd73 100644 --- a/db/memtable_allocator.cc +++ b/db/memtable_allocator.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/memtable_allocator.h b/db/memtable_allocator.h index c2cf130cc..d8bd4c808 100644 --- a/db/memtable_allocator.h +++ b/db/memtable_allocator.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 1734eda03..9c1d3632b 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -345,8 +345,8 @@ Status MemTableList::InstallMemtableFlushResults( imm_flush_needed.store(true, std::memory_order_release); } ++mem_id; - } while (!current_->memlist_.empty() && (m = current_->memlist_.back()) && - m->file_number_ == file_number); + } while (!current_->memlist_.empty() && (nullptr != (m = current_->memlist_.back())) && + (m->file_number_ == file_number)); } commit_in_progress_ = false; return s; diff --git a/db/memtable_list.h b/db/memtable_list.h index 117b4a506..37f5c7784 100644 --- a/db/memtable_list.h +++ b/db/memtable_list.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc index 7bb8b3b21..50f96b9f8 100644 --- a/db/memtable_list_test.cc +++ b/db/memtable_list_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/memtablerep_bench.cc b/db/memtablerep_bench.cc index 42edfdfc7..a897adeab 100644 --- a/db/memtablerep_bench.cc +++ b/db/memtablerep_bench.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/merge_context.h b/db/merge_context.h index f8609da75..74264c4c9 100644 --- a/db/merge_context.h +++ b/db/merge_context.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/merge_helper.cc b/db/merge_helper.cc index c443ca2d9..145a72b0d 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/merge_helper.h b/db/merge_helper.h index 488c7ac2b..7128b1a09 100644 --- a/db/merge_helper.h +++ b/db/merge_helper.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/merge_helper_test.cc b/db/merge_helper_test.cc index 2ef0d39e4..b21f56078 100644 --- a/db/merge_helper_test.cc +++ b/db/merge_helper_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/merge_operator.cc b/db/merge_operator.cc index c6645a910..5c5d04008 100644 --- a/db/merge_operator.cc +++ b/db/merge_operator.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/merge_test.cc b/db/merge_test.cc index 50f0e7c93..020f33ba6 100644 --- a/db/merge_test.cc +++ b/db/merge_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/db/options_file_test.cc b/db/options_file_test.cc index 86a98899a..fbbc8c552 100644 --- a/db/options_file_test.cc +++ b/db/options_file_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index 00065dc92..adab6d78e 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -13,6 +13,7 @@ #include "rocksdb/slice_transform.h" #include "rocksdb/memtablerep.h" #include "util/histogram.h" +#include "util/instrumented_mutex.h" #include "util/stop_watch.h" #include "util/testharness.h" #include "util/thread_status_util.h" @@ -543,28 +544,31 @@ TEST_F(PerfContextTest, SeekKeyComparison) { } TEST_F(PerfContextTest, DBMutexLockCounter) { - SetPerfLevel(kEnableTime); int stats_code[] = {0, static_cast(DB_MUTEX_WAIT_MICROS)}; - for (int c = 0; c < 2; ++c) { + for (PerfLevel perf_level : + {PerfLevel::kEnableTimeExceptForMutex, PerfLevel::kEnableTime}) { + for (int c = 0; c < 2; ++c) { InstrumentedMutex mutex(nullptr, Env::Default(), stats_code[c]); mutex.Lock(); std::thread child_thread([&] { - SetPerfLevel(kEnableTime); + SetPerfLevel(perf_level); perf_context.Reset(); ASSERT_EQ(perf_context.db_mutex_lock_nanos, 0); mutex.Lock(); mutex.Unlock(); - if (stats_code[c] == DB_MUTEX_WAIT_MICROS) { + if (perf_level == PerfLevel::kEnableTimeExceptForMutex || + stats_code[c] != DB_MUTEX_WAIT_MICROS) { + ASSERT_EQ(perf_context.db_mutex_lock_nanos, 0); + } else { // increment the counter only when it's a DB Mutex ASSERT_GT(perf_context.db_mutex_lock_nanos, 0); - } else { - ASSERT_EQ(perf_context.db_mutex_lock_nanos, 0); } }); Env::Default()->SleepForMicroseconds(100); mutex.Unlock(); child_thread.join(); } + } } TEST_F(PerfContextTest, FalseDBMutexWait) { @@ -585,6 +589,19 @@ TEST_F(PerfContextTest, FalseDBMutexWait) { } } } + +TEST_F(PerfContextTest, ToString) { + perf_context.Reset(); + perf_context.block_read_count = 12345; + + std::string zero_included = perf_context.ToString(); + ASSERT_NE(std::string::npos, zero_included.find("= 0")); + ASSERT_NE(std::string::npos, zero_included.find("= 12345")); + + std::string zero_excluded = perf_context.ToString(true); + ASSERT_EQ(std::string::npos, zero_excluded.find("= 0")); + ASSERT_NE(std::string::npos, zero_excluded.find("= 12345")); +} } int main(int argc, char** argv) { diff --git 
a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index e3d9fc402..8cb070ac6 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -1,6 +1,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/prefix_test.cc b/db/prefix_test.cc index a210e4d65..eccce06d5 100644 --- a/db/prefix_test.cc +++ b/db/prefix_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/repair.cc b/db/repair.cc index f4758d0cd..6aa72f792 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/skiplist.h b/db/skiplist.h index b80ecf210..3fdbd8f54 100644 --- a/db/skiplist.h +++ b/db/skiplist.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/skiplist_test.cc b/db/skiplist_test.cc index 3d1418625..b4f98e34c 100644 --- a/db/skiplist_test.cc +++ b/db/skiplist_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/slice.cc b/db/slice.cc index 7e7245d79..10b0ca592 100644 --- a/db/slice.cc +++ b/db/slice.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/snapshot_impl.cc b/db/snapshot_impl.cc index d901b61d2..5c4f6abaa 100644 --- a/db/snapshot_impl.cc +++ b/db/snapshot_impl.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/snapshot_impl.h b/db/snapshot_impl.h index 277cf3a20..aaac7a0e3 100644 --- a/db/snapshot_impl.h +++ b/db/snapshot_impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/table_cache.cc b/db/table_cache.cc index 663315840..be6b5c324 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -88,7 +88,7 @@ Status TableCache::GetTableReader( const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, bool sequential_mode, bool record_read_stats, HistogramImpl* file_read_hist, - unique_ptr* table_reader, bool skip_filters) { + unique_ptr* table_reader, bool skip_filters, int level) { std::string fname = TableFileName(ioptions_.db_paths, fd.GetNumber(), fd.GetPathId()); unique_ptr file; @@ -109,7 +109,7 @@ Status TableCache::GetTableReader( file_read_hist)); s = ioptions_.table_factory->NewTableReader( TableReaderOptions(ioptions_, env_options, internal_comparator, - skip_filters), + skip_filters, level), std::move(file_reader), fd.GetFileSize(), table_reader); TEST_SYNC_POINT("TableCache::GetTableReader:0"); } @@ -120,7 +120,8 @@ Status TableCache::FindTable(const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, Cache::Handle** handle, const bool no_io, bool record_read_stats, - HistogramImpl* file_read_hist, bool skip_filters) { + HistogramImpl* file_read_hist, bool skip_filters, + int level) { PERF_TIMER_GUARD(find_table_nanos); Status s; uint64_t number = fd.GetNumber(); @@ -136,15 +137,19 @@ Status 
TableCache::FindTable(const EnvOptions& env_options, unique_ptr table_reader; s = GetTableReader(env_options, internal_comparator, fd, false /* sequential mode */, record_read_stats, - file_read_hist, &table_reader, skip_filters); + file_read_hist, &table_reader, skip_filters, level); if (!s.ok()) { assert(table_reader == nullptr); RecordTick(ioptions_.statistics, NO_FILE_ERRORS); // We do not cache error results so that if the error is transient, // or somebody repairs the file, we recover automatically. } else { - *handle = cache_->Insert(key, table_reader.release(), 1, - &DeleteEntry); + s = cache_->Insert(key, table_reader.get(), 1, &DeleteEntry, + handle); + if (s.ok()) { + // Release ownership of table reader. + table_reader.release(); + } } } return s; @@ -154,7 +159,7 @@ InternalIterator* TableCache::NewIterator( const ReadOptions& options, const EnvOptions& env_options, const InternalKeyComparator& icomparator, const FileDescriptor& fd, TableReader** table_reader_ptr, HistogramImpl* file_read_hist, - bool for_compaction, Arena* arena, bool skip_filters) { + bool for_compaction, Arena* arena, bool skip_filters, int level) { PERF_TIMER_GUARD(new_table_iterator_nanos); if (table_reader_ptr != nullptr) { @@ -169,7 +174,8 @@ InternalIterator* TableCache::NewIterator( unique_ptr table_reader_unique_ptr; Status s = GetTableReader( env_options, icomparator, fd, /* sequential mode */ true, - /* record stats */ false, nullptr, &table_reader_unique_ptr); + /* record stats */ false, nullptr, &table_reader_unique_ptr, + false /* skip_filters */, level); if (!s.ok()) { return NewErrorInternalIterator(s, arena); } @@ -180,7 +186,7 @@ InternalIterator* TableCache::NewIterator( Status s = FindTable(env_options, icomparator, fd, &handle, options.read_tier == kBlockCacheTier /* no_io */, !for_compaction /* record read_stats */, - file_read_hist, skip_filters); + file_read_hist, skip_filters, level); if (!s.ok()) { return NewErrorInternalIterator(s, arena); } @@ -212,7 +218,7 
@@ Status TableCache::Get(const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, const Slice& k, GetContext* get_context, HistogramImpl* file_read_hist, - bool skip_filters) { + bool skip_filters, int level) { TableReader* t = fd.table_reader; Status s; Cache::Handle* handle = nullptr; @@ -261,7 +267,8 @@ Status TableCache::Get(const ReadOptions& options, if (!t) { s = FindTable(env_options_, internal_comparator, fd, &handle, options.read_tier == kBlockCacheTier /* no_io */, - true /* record_read_stats */, file_read_hist, skip_filters); + true /* record_read_stats */, file_read_hist, skip_filters, + level); if (s.ok()) { t = GetTableReaderFromHandle(handle); } @@ -273,7 +280,7 @@ Status TableCache::Get(const ReadOptions& options, if (handle != nullptr) { ReleaseHandle(handle); } - } else if (options.read_tier && s.IsIncomplete()) { + } else if (options.read_tier == kBlockCacheTier && s.IsIncomplete()) { // Couldn't find Table in cache but treat as kFound if no_io set get_context->MarkKeyMayExist(); return Status::OK(); @@ -285,9 +292,8 @@ Status TableCache::Get(const ReadOptions& options, size_t charge = row_cache_key.Size() + row_cache_entry->size() + sizeof(std::string); void* row_ptr = new std::string(std::move(*row_cache_entry)); - auto row_handle = ioptions_.row_cache->Insert( - row_cache_key.GetKey(), row_ptr, charge, &DeleteEntry); - ioptions_.row_cache->Release(row_handle); + ioptions_.row_cache->Insert(row_cache_key.GetKey(), row_ptr, charge, + &DeleteEntry); } #endif // ROCKSDB_LITE diff --git a/db/table_cache.h b/db/table_cache.h index 44246fbf4..499b9dbe5 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -45,34 +45,37 @@ class TableCache { // the cache and should not be deleted, and is valid for as long as the // returned iterator is live. // @param skip_filters Disables loading/accessing the filter block + // @param level The level this table is at, -1 for "not set / don't know" InternalIterator* NewIterator( const ReadOptions& options, const EnvOptions& toptions, const InternalKeyComparator& internal_comparator, const FileDescriptor& file_fd, TableReader** table_reader_ptr = nullptr, HistogramImpl* file_read_hist = nullptr, bool for_compaction = false, - Arena* arena = nullptr, bool skip_filters = false); + Arena* arena = nullptr, bool skip_filters = false, int level = -1); // If a seek to internal key "k" in specified file finds an entry, // call (*handle_result)(arg, found_key, found_value) repeatedly until // it returns false. // @param skip_filters Disables loading/accessing the filter block + // @param level The level this table is at, -1 for "not set / don't know" Status Get(const ReadOptions& options, const InternalKeyComparator& internal_comparator, const FileDescriptor& file_fd, const Slice& k, GetContext* get_context, HistogramImpl* file_read_hist = nullptr, - bool skip_filters = false); + bool skip_filters = false, int level = -1); // Evict any entry for the specified file number static void Evict(Cache* cache, uint64_t file_number); // Find table reader // @param skip_filters Disables loading/accessing the filter block + // @param level == -1 means not specified Status FindTable(const EnvOptions& toptions, const InternalKeyComparator& internal_comparator, const FileDescriptor& file_fd, Cache::Handle**, const bool no_io = false, bool record_read_stats = true, HistogramImpl* file_read_hist = nullptr, - bool skip_filters = false); + bool skip_filters = false, int level = -1); // Get TableReader from a cache handle. 
TableReader* GetTableReaderFromHandle(Cache::Handle* handle); @@ -106,7 +109,7 @@ class TableCache { const FileDescriptor& fd, bool sequential_mode, bool record_read_stats, HistogramImpl* file_read_hist, unique_ptr* table_reader, - bool skip_filters = false); + bool skip_filters = false, int level = -1); const ImmutableCFOptions& ioptions_; const EnvOptions& env_options_; diff --git a/db/table_properties_collector.cc b/db/table_properties_collector.cc index c14ecec11..204f42895 100644 --- a/db/table_properties_collector.cc +++ b/db/table_properties_collector.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/table_properties_collector.h b/db/table_properties_collector.h index 0e5f4e347..2b0310b0d 100644 --- a/db/table_properties_collector.h +++ b/db/table_properties_collector.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 7cea86fdd..d096e6c79 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc index 28c4490f5..624a3af99 100644 --- a/db/transaction_log_impl.cc +++ b/db/transaction_log_impl.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/transaction_log_impl.h b/db/transaction_log_impl.h index f89cc3207..d4a2468e7 100644 --- a/db/transaction_log_impl.h +++ b/db/transaction_log_impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/version_builder.cc b/db/version_builder.cc index adc7b82b6..d0e7640fd 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/version_builder.h b/db/version_builder.h index 143da9905..c09815217 100644 --- a/db/version_builder.h +++ b/db/version_builder.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 66230eef4..2a87dc238 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/version_edit.cc b/db/version_edit.cc index 23df641af..4cbf61f51 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/version_edit.h b/db/version_edit.h index 65213ed1a..ec5df00a4 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 629f904b0..ab109be60 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. 
All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/version_set.cc b/db/version_set.cc index ba62177a7..167586d71 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -84,24 +84,22 @@ int FindFileInRange(const InternalKeyComparator& icmp, // are MergeInProgress). class FilePicker { public: - FilePicker( - std::vector* files, - const Slice& user_key, - const Slice& ikey, - autovector* file_levels, - unsigned int num_levels, - FileIndexer* file_indexer, - const Comparator* user_comparator, - const InternalKeyComparator* internal_comparator) + FilePicker(std::vector* files, const Slice& user_key, + const Slice& ikey, autovector* file_levels, + unsigned int num_levels, FileIndexer* file_indexer, + const Comparator* user_comparator, + const InternalKeyComparator* internal_comparator) : num_levels_(num_levels), - curr_level_(-1), - hit_file_level_(-1), + curr_level_(static_cast(-1)), + returned_file_level_(static_cast(-1)), + hit_file_level_(static_cast(-1)), search_left_bound_(0), search_right_bound_(FileIndexer::kLevelMaxIndex), #ifndef NDEBUG files_(files), #endif level_files_brief_(file_levels), + is_hit_file_last_in_level_(false), user_key_(user_key), ikey_(ikey), file_indexer_(file_indexer), @@ -120,12 +118,16 @@ class FilePicker { } } + int GetCurrentLevel() { return returned_file_level_; } + FdWithKeyRange* 
GetNextFile() { while (!search_ended_) { // Loops over different levels. while (curr_index_in_curr_level_ < curr_file_level_->num_files) { // Loops over all files in current level. FdWithKeyRange* f = &curr_file_level_->files[curr_index_in_curr_level_]; hit_file_level_ = curr_level_; + is_hit_file_last_in_level_ = + curr_index_in_curr_level_ == curr_file_level_->num_files - 1; int cmp_largest = -1; // Do key range filtering of files or/and fractional cascading if: @@ -190,6 +192,7 @@ class FilePicker { } prev_file_ = f; #endif + returned_file_level_ = curr_level_; if (curr_level_ > 0 && cmp_largest < 0) { // No more files to search in this level. search_ended_ = !PrepareNextLevel(); @@ -209,9 +212,14 @@ class FilePicker { // for GET_HIT_L0, GET_HIT_L1 & GET_HIT_L2_AND_UP counts unsigned int GetHitFileLevel() { return hit_file_level_; } + // Returns true if the most recent "hit file" (i.e., one returned by + // GetNextFile()) is at the last index in its level. + bool IsHitFileLastInLevel() { return is_hit_file_last_in_level_; } + private: unsigned int num_levels_; unsigned int curr_level_; + unsigned int returned_file_level_; unsigned int hit_file_level_; int32_t search_left_bound_; int32_t search_right_bound_; @@ -220,6 +228,7 @@ class FilePicker { #endif autovector* level_files_brief_; bool search_ended_; + bool is_hit_file_last_in_level_; LevelFilesBrief* curr_file_level_; unsigned int curr_index_in_curr_level_; unsigned int start_index_in_curr_level_; @@ -481,7 +490,7 @@ class LevelFileIteratorState : public TwoLevelIteratorState { const EnvOptions& env_options, const InternalKeyComparator& icomparator, HistogramImpl* file_read_hist, bool for_compaction, - bool prefix_enabled, bool skip_filters) + bool prefix_enabled, bool skip_filters, int level) : TwoLevelIteratorState(prefix_enabled), table_cache_(table_cache), read_options_(read_options), @@ -489,7 +498,8 @@ class LevelFileIteratorState : public TwoLevelIteratorState { icomparator_(icomparator), 
file_read_hist_(file_read_hist), for_compaction_(for_compaction), - skip_filters_(skip_filters) {} + skip_filters_(skip_filters), + level_(level) {} InternalIterator* NewSecondaryIterator(const Slice& meta_handle) override { if (meta_handle.size() != sizeof(FileDescriptor)) { @@ -501,7 +511,7 @@ class LevelFileIteratorState : public TwoLevelIteratorState { return table_cache_->NewIterator( read_options_, env_options_, icomparator_, *fd, nullptr /* don't need reference to table*/, file_read_hist_, - for_compaction_, nullptr /* arena */, skip_filters_); + for_compaction_, nullptr /* arena */, skip_filters_, level_); } } @@ -517,6 +527,7 @@ class LevelFileIteratorState : public TwoLevelIteratorState { HistogramImpl* file_read_hist_; bool for_compaction_; bool skip_filters_; + int level_; }; // A wrapper of version builder which references the current version in @@ -758,7 +769,7 @@ uint64_t VersionStorageInfo::GetEstimatedActiveKeys() const { if (current_num_samples_ < file_count) { // casting to avoid overflowing - return + return static_cast( (est * static_cast(file_count) / current_num_samples_) ); @@ -784,7 +795,8 @@ void Version::AddIterators(const ReadOptions& read_options, const auto& file = storage_info_.LevelFilesBrief(0).files[i]; merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator( read_options, soptions, cfd_->internal_comparator(), file.fd, nullptr, - cfd_->internal_stats()->GetFileReadHist(0), false, arena)); + cfd_->internal_stats()->GetFileReadHist(0), false, arena, + false /* skip_filters */, 0 /* level */)); } // For levels > 0, we can use a concatenating iterator that sequentially @@ -799,7 +811,7 @@ void Version::AddIterators(const ReadOptions& read_options, cfd_->internal_stats()->GetFileReadHist(level), false /* for_compaction */, cfd_->ioptions()->prefix_extractor != nullptr, - IsFilterSkipped(level)); + IsFilterSkipped(level), level); mem = arena->AllocateAligned(sizeof(LevelFileNumIterator)); auto* first_level_iter = new (mem) 
LevelFileNumIterator( cfd_->internal_comparator(), &storage_info_.LevelFilesBrief(level)); @@ -903,7 +915,9 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, *status = table_cache_->Get( read_options, *internal_comparator(), f->fd, ikey, &get_context, cfd_->internal_stats()->GetFileReadHist(fp.GetHitFileLevel()), - IsFilterSkipped(static_cast(fp.GetHitFileLevel()))); + IsFilterSkipped(static_cast(fp.GetHitFileLevel()), + fp.IsHitFileLastInLevel()), + fp.GetCurrentLevel()); // TODO: examine the behavior for corrupted key if (!status->ok()) { return; @@ -960,10 +974,11 @@ void Version::Get(const ReadOptions& read_options, const LookupKey& k, } } -bool Version::IsFilterSkipped(int level) { +bool Version::IsFilterSkipped(int level, bool is_file_last_in_level) { // Reaching the bottom level implies misses at all upper levels, so we'll // skip checking the filters when we predict a hit. return cfd_->ioptions()->optimize_filters_for_hits && + (level > 0 || is_file_last_in_level) && level == storage_info_.num_non_empty_levels() - 1; } @@ -1371,6 +1386,47 @@ void VersionStorageInfo::UpdateNumNonEmptyLevels() { } } +namespace { +// Sort `temp` based on ratio of overlapping size over file size +void SortFileByOverlappingRatio( + const InternalKeyComparator& icmp, const std::vector& files, + const std::vector& next_level_files, + std::vector* temp) { + std::unordered_map file_to_order; + auto next_level_it = next_level_files.begin(); + + for (auto& file : files) { + uint64_t overlapping_bytes = 0; + // Skip files in next level that is smaller than current file + while (next_level_it != next_level_files.end() && + icmp.Compare((*next_level_it)->largest, file->smallest) < 0) { + next_level_it++; + } + + while (next_level_it != next_level_files.end() && + icmp.Compare((*next_level_it)->smallest, file->largest) < 0) { + overlapping_bytes += (*next_level_it)->fd.file_size; + + if (icmp.Compare((*next_level_it)->largest, file->largest) > 0) { + // next level 
file cross large boundary of current file. + break; + } + next_level_it++; + } + + assert(file->fd.file_size != 0); + file_to_order[file->fd.GetNumber()] = + overlapping_bytes * 1024u / file->fd.file_size; + } + + std::sort(temp->begin(), temp->end(), + [&](const Fsize& f1, const Fsize& f2) -> bool { + return file_to_order[f1.file->fd.GetNumber()] < + file_to_order[f2.file->fd.GetNumber()]; + }); +} +} // namespace + void VersionStorageInfo::UpdateFilesByCompactionPri( const MutableCFOptions& mutable_cf_options) { if (compaction_style_ == kCompactionStyleFIFO || @@ -1413,6 +1469,10 @@ void VersionStorageInfo::UpdateFilesByCompactionPri( return f1.file->smallest_seqno < f2.file->smallest_seqno; }); break; + case kMinOverlappingRatio: + SortFileByOverlappingRatio(*internal_comparator_, files_[level], + files_[level + 1], &temp); + break; default: assert(false); } @@ -2003,9 +2063,16 @@ VersionSet::VersionSet(const std::string& dbname, const DBOptions* db_options, env_options_(storage_options), env_options_compactions_(env_options_) {} +void CloseTables(void* ptr, size_t) { + TableReader* table_reader = reinterpret_cast(ptr); + table_reader->Close(); +} + VersionSet::~VersionSet() { // we need to delete column_family_set_ because its destructor depends on // VersionSet + column_family_set_->get_table_cache()->ApplyToAllCacheEntries(&CloseTables, + false); column_family_set_.reset(); for (auto file : obsolete_files_) { delete file; @@ -2193,27 +2260,6 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, if (!s.ok()) { Log(InfoLogLevel::ERROR_LEVEL, db_options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str()); - bool all_records_in = true; - for (auto& e : batch_edits) { - std::string record; - if (!e->EncodeTo(&record)) { - s = Status::Corruption( - "Unable to Encode VersionEdit:" + e->DebugString(true)); - all_records_in = false; - break; - } - if (!ManifestContains(pending_manifest_file_number_, record)) { - all_records_in = false; - break; 
- } - } - if (all_records_in) { - Log(InfoLogLevel::WARN_LEVEL, db_options_->info_log, - "MANIFEST contains log record despite error; advancing to new " - "version to prevent mismatch between in-memory and logged state" - " If paranoid is set, then the db is now in readonly mode."); - s = Status::OK(); - } } } @@ -2222,15 +2268,10 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, if (s.ok() && new_descriptor_log) { s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_, db_options_->disableDataSync ? nullptr : db_directory); - if (s.ok() && pending_manifest_file_number_ > manifest_file_number_) { - // delete old manifest file - Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, - "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n", - manifest_file_number_, pending_manifest_file_number_); - // we don't care about an error here, PurgeObsoleteFiles will take care - // of it later - env_->DeleteFile(DescriptorFileName(dbname_, manifest_file_number_)); - } + // Leave the old file behind since PurgeObsoleteFiles will take care of it + // later. It's unsafe to delete now since file deletion may be disabled. + obsolete_manifests_.emplace_back( + DescriptorFileName("", manifest_file_number_)); } if (s.ok()) { @@ -2239,11 +2280,13 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, } if (edit->is_column_family_drop_) { + TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:0"); TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:1"); TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:2"); } LogFlush(db_options_->info_log); + TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifestDone"); mu->Lock(); } @@ -2282,7 +2325,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, Log(InfoLogLevel::ERROR_LEVEL, db_options_->info_log, "Error in committing version %lu to [%s]", (unsigned long)v->GetVersionNumber(), - column_family_data->GetName().c_str()); + column_family_data ? 
column_family_data->GetName().c_str() + : ""); delete v; if (new_descriptor_log) { Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, @@ -3066,45 +3110,6 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { return Status::OK(); } -// Opens the mainfest file and reads all records -// till it finds the record we are looking for. -bool VersionSet::ManifestContains(uint64_t manifest_file_num, - const std::string& record) const { - std::string fname = DescriptorFileName(dbname_, manifest_file_num); - Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, - "ManifestContains: checking %s\n", fname.c_str()); - - unique_ptr file_reader; - Status s; - { - unique_ptr file; - s = env_->NewSequentialFile(fname, &file, env_options_); - if (!s.ok()) { - Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, - "ManifestContains: %s\n", s.ToString().c_str()); - Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, - "ManifestContains: is unable to reopen the manifest file %s", - fname.c_str()); - return false; - } - file_reader.reset(new SequentialFileReader(std::move(file))); - } - log::Reader reader(NULL, std::move(file_reader), nullptr, - true /*checksum*/, 0, 0); - Slice r; - std::string scratch; - bool result = false; - while (reader.ReadRecord(&r, &scratch)) { - if (r == Slice(record)) { - result = true; - break; - } - } - Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, - "ManifestContains: result = %d\n", result ? 1 : 0); - return result; -} - // TODO(aekmekji): in CompactionJob::GenSubcompactionBoundaries(), this // function is called repeatedly with consecutive pairs of slices. 
For example // if the slice list is [a, b, c, d] this function is called with arguments @@ -3278,7 +3283,8 @@ InternalIterator* VersionSet::MakeInputIterator(Compaction* c) { read_options, env_options_compactions_, cfd->internal_comparator(), flevel->files[i].fd, nullptr, nullptr, /* no per level latency histogram*/ - true /* for compaction */); + true /* for_compaction */, nullptr /* arena */, + false /* skip_filters */, (int)which /* level */); } } else { // Create concatenating iterator for the files from this level @@ -3288,7 +3294,7 @@ InternalIterator* VersionSet::MakeInputIterator(Compaction* c) { cfd->internal_comparator(), nullptr /* no per level latency histogram */, true /* for_compaction */, false /* prefix enabled */, - false /* skip_filters */), + false /* skip_filters */, (int)which /* level */), new LevelFileNumIterator(cfd->internal_comparator(), c->input_levels(which))); } @@ -3401,7 +3407,10 @@ void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { } void VersionSet::GetObsoleteFiles(std::vector* files, + std::vector* manifest_filenames, uint64_t min_pending_output) { + assert(manifest_filenames->empty()); + obsolete_manifests_.swap(*manifest_filenames); std::vector pending_files; for (auto f : obsolete_files_) { if (f->fd.GetNumber() < min_pending_output) { diff --git a/db/version_set.h b/db/version_set.h index 2d9d93f6f..d9ff91732 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -530,7 +531,7 @@ class Version { // checked during read operations. 
In certain cases (trivial move or preload), // the filter block may already be cached, but we still do not access it such // that it eventually expires from the cache. - bool IsFilterSkipped(int level); + bool IsFilterSkipped(int level, bool is_file_last_in_level = false); // The helper function of UpdateAccumulatedStats, which may fill the missing // fields of file_mata from its associated TableProperties. @@ -697,6 +698,7 @@ class VersionSet { void GetLiveFilesMetaData(std::vector *metadata); void GetObsoleteFiles(std::vector* files, + std::vector* manifest_filenames, uint64_t min_pending_output); ColumnFamilySet* GetColumnFamilySet() { return column_family_set_.get(); } @@ -731,9 +733,6 @@ class VersionSet { void AppendVersion(ColumnFamilyData* column_family_data, Version* v); - bool ManifestContains(uint64_t manifest_file_number, - const std::string& record) const; - ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, VersionEdit* edit); @@ -761,6 +760,7 @@ class VersionSet { uint64_t manifest_file_size_; std::vector obsolete_files_; + std::vector obsolete_manifests_; // env options for all reads and writes except compactions const EnvOptions& env_options_; diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 9dc6e95d6..98b20a110 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/wal_manager.cc b/db/wal_manager.cc index adeb6b96f..e1d911e6e 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/wal_manager.h b/db/wal_manager.h index fc04863b2..a3079ed48 100644 --- a/db/wal_manager.h +++ b/db/wal_manager.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index 764706d33..4d3f5b6e2 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/write_batch.cc b/db/write_batch.cc index 0565c0599..3742ae694 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -798,18 +798,23 @@ class MemTableInserter : public WriteBatch::Handler { // 3) During Write(), in a concurrent context where memtables has been cloned // The reason is that it calls memtables->Seek(), which has a stateful cache Status WriteBatchInternal::InsertInto( - const autovector& batches, SequenceNumber sequence, + const autovector& writers, SequenceNumber sequence, ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, bool ignore_missing_column_families, uint64_t log_number, DB* db, const bool dont_filter_deletes, bool concurrent_memtable_writes) { MemTableInserter inserter(sequence, memtables, flush_scheduler, ignore_missing_column_families, log_number, db, dont_filter_deletes, concurrent_memtable_writes); - Status rv = Status::OK(); - for (size_t i = 0; i < batches.size() && rv.ok(); ++i) { - rv = batches[i]->Iterate(&inserter); + + for (size_t i = 0; i < writers.size(); i++) { + if (!writers[i]->CallbackFailed()) { + writers[i]->status = writers[i]->batch->Iterate(&inserter); + if (!writers[i]->status.ok()) { + return writers[i]->status; + } + } } - return rv; + return Status::OK(); } Status WriteBatchInternal::InsertInto(const WriteBatch* batch, diff --git a/db/write_batch_base.cc b/db/write_batch_base.cc index 9f7f00d2c..3936fbd92 100644 --- a/db/write_batch_base.cc +++ b/db/write_batch_base.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index d75d2ef65..1e7f61e69 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -9,6 +9,7 @@ #pragma once #include +#include "db/write_thread.h" #include "rocksdb/types.h" #include "rocksdb/write_batch.h" #include "rocksdb/db.h" @@ -134,7 +135,7 @@ class WriteBatchInternal { // // Under concurrent use, the caller is responsible for making sure that // the memtables object itself is thread-local. - static Status InsertInto(const autovector& batches, + static Status InsertInto(const autovector& batches, SequenceNumber sequence, ColumnFamilyMemTables* memtables, FlushScheduler* flush_scheduler, diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 5d008b3a4..58c7273c3 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -308,6 +308,120 @@ TEST_F(WriteBatchTest, Blob) { handler.seen); } +// It requires more than 30GB of memory to run the test. With single memory +// allocation of more than 30GB. +// Not all platform can run it. Also it runs a long time. So disable it. +TEST_F(WriteBatchTest, DISABLED_ManyUpdates) { + // Insert key and value of 3GB and push total batch size to 12GB. 
+ static const size_t kKeyValueSize = 4u; + static const uint32_t kNumUpdates = 3 << 30; + std::string raw(kKeyValueSize, 'A'); + WriteBatch batch(kNumUpdates * (4 + kKeyValueSize * 2) + 1024u); + char c = 'A'; + for (uint32_t i = 0; i < kNumUpdates; i++) { + if (c > 'Z') { + c = 'A'; + } + raw[0] = c; + raw[raw.length() - 1] = c; + c++; + batch.Put(raw, raw); + } + + ASSERT_EQ(kNumUpdates, batch.Count()); + + struct NoopHandler : public WriteBatch::Handler { + uint32_t num_seen = 0; + char expected_char = 'A'; + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + EXPECT_EQ(kKeyValueSize, key.size()); + EXPECT_EQ(kKeyValueSize, value.size()); + EXPECT_EQ(expected_char, key[0]); + EXPECT_EQ(expected_char, value[0]); + EXPECT_EQ(expected_char, key[kKeyValueSize - 1]); + EXPECT_EQ(expected_char, value[kKeyValueSize - 1]); + expected_char++; + if (expected_char > 'Z') { + expected_char = 'A'; + } + ++num_seen; + return Status::OK(); + } + virtual Status DeleteCF(uint32_t column_family_id, + const Slice& key) override { + EXPECT_TRUE(false); + return Status::OK(); + } + virtual Status SingleDeleteCF(uint32_t column_family_id, + const Slice& key) override { + EXPECT_TRUE(false); + return Status::OK(); + } + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + EXPECT_TRUE(false); + return Status::OK(); + } + virtual void LogData(const Slice& blob) override { EXPECT_TRUE(false); } + virtual bool Continue() override { return num_seen < kNumUpdates; } + } handler; + + batch.Iterate(&handler); + ASSERT_EQ(kNumUpdates, handler.num_seen); +} + +// The test requires more than 18GB memory to run it, with single memory +// allocation of more than 12GB. Not all the platform can run it. So disable it. +TEST_F(WriteBatchTest, DISABLED_LargeKeyValue) { + // Insert key and value of 3GB and push total batch size to 12GB. 
+ static const size_t kKeyValueSize = 3221225472u; + std::string raw(kKeyValueSize, 'A'); + WriteBatch batch(12884901888u + 1024u); + for (char i = 0; i < 2; i++) { + raw[0] = 'A' + i; + raw[raw.length() - 1] = 'A' - i; + batch.Put(raw, raw); + } + + ASSERT_EQ(2, batch.Count()); + + struct NoopHandler : public WriteBatch::Handler { + int num_seen = 0; + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + EXPECT_EQ(kKeyValueSize, key.size()); + EXPECT_EQ(kKeyValueSize, value.size()); + EXPECT_EQ('A' + num_seen, key[0]); + EXPECT_EQ('A' + num_seen, value[0]); + EXPECT_EQ('A' - num_seen, key[kKeyValueSize - 1]); + EXPECT_EQ('A' - num_seen, value[kKeyValueSize - 1]); + ++num_seen; + return Status::OK(); + } + virtual Status DeleteCF(uint32_t column_family_id, + const Slice& key) override { + EXPECT_TRUE(false); + return Status::OK(); + } + virtual Status SingleDeleteCF(uint32_t column_family_id, + const Slice& key) override { + EXPECT_TRUE(false); + return Status::OK(); + } + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + EXPECT_TRUE(false); + return Status::OK(); + } + virtual void LogData(const Slice& blob) override { EXPECT_TRUE(false); } + virtual bool Continue() override { return num_seen < 2; } + } handler; + + batch.Iterate(&handler); + ASSERT_EQ(2, handler.num_seen); +} + TEST_F(WriteBatchTest, Continue) { WriteBatch batch; diff --git a/db/write_callback.h b/db/write_callback.h index 7dcca96fe..93c80d651 100644 --- a/db/write_callback.h +++ b/db/write_callback.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -19,6 +19,9 @@ class WriteCallback { // this function returns a non-OK status, the write will be aborted and this // status will be returned to the caller of DB::Write(). virtual Status Callback(DB* db) = 0; + + // return true if writes with this callback can be batched with other writes + virtual bool AllowWriteBatching() = 0; }; } // namespace rocksdb diff --git a/db/write_callback_test.cc b/db/write_callback_test.cc index 47b7cf72a..8acd60df8 100644 --- a/db/write_callback_test.cc +++ b/db/write_callback_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -6,12 +6,15 @@ #ifndef ROCKSDB_LITE #include +#include +#include #include "db/db_impl.h" #include "db/write_callback.h" #include "rocksdb/db.h" #include "rocksdb/write_batch.h" #include "util/logging.h" +#include "util/sync_point.h" #include "util/testharness.h" using std::string; @@ -42,6 +45,8 @@ class WriteCallbackTestWriteCallback1 : public WriteCallback { return Status::OK(); } + + bool AllowWriteBatching() override { return true; } }; class WriteCallbackTestWriteCallback2 : public WriteCallback { @@ -49,8 +54,226 @@ class WriteCallbackTestWriteCallback2 : public WriteCallback { Status Callback(DB *db) override { return Status::Busy(); } + bool AllowWriteBatching() override { return true; } +}; + +class MockWriteCallback : public WriteCallback { + public: + bool should_fail_ = false; + bool was_called_ = false; + bool allow_batching_ = false; + + Status Callback(DB* db) override { + was_called_ = true; + if (should_fail_) { + return Status::Busy(); + } else { + return Status::OK(); + } + } + + bool AllowWriteBatching() override { return allow_batching_; } }; 
+TEST_F(WriteCallbackTest, WriteWithCallbackTest) { + struct WriteOP { + WriteOP(bool should_fail = false) { callback_.should_fail_ = should_fail; } + + void Put(const string& key, const string& val) { + kvs_.push_back(std::make_pair(key, val)); + write_batch_.Put(key, val); + } + + void Clear() { + kvs_.clear(); + write_batch_.Clear(); + callback_.was_called_ = false; + } + + MockWriteCallback callback_; + WriteBatch write_batch_; + std::vector> kvs_; + }; + + std::vector> write_scenarios = { + {true}, + {false}, + {false, false}, + {true, true}, + {true, false}, + {false, true}, + {false, false, false}, + {true, true, true}, + {false, true, false}, + {true, false, true}, + {true, false, false, false, false}, + {false, false, false, false, true}, + {false, false, true, false, true}, + }; + + for (auto& allow_parallel : {true, false}) { + for (auto& allow_batching : {true, false}) { + for (auto& enable_WAL : {true, false}) { + for (auto& write_group : write_scenarios) { + Options options; + options.create_if_missing = true; + options.allow_concurrent_memtable_write = allow_parallel; + + ReadOptions read_options; + DB* db; + DBImpl* db_impl; + + ASSERT_OK(DB::Open(options, dbname, &db)); + + db_impl = dynamic_cast(db); + ASSERT_TRUE(db_impl); + + std::atomic threads_waiting(0); + std::atomic seq(db_impl->GetLatestSequenceNumber()); + ASSERT_EQ(db_impl->GetLatestSequenceNumber(), 0); + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::JoinBatchGroup:Wait", [&](void* arg) { + uint64_t cur_threads_waiting = 0; + bool is_leader = false; + bool is_last = false; + + // who am i + do { + cur_threads_waiting = threads_waiting.load(); + is_leader = (cur_threads_waiting == 0); + is_last = (cur_threads_waiting == write_group.size() - 1); + } while (!threads_waiting.compare_exchange_strong( + cur_threads_waiting, cur_threads_waiting + 1)); + + // check my state + auto* writer = reinterpret_cast(arg); + + if (is_leader) { + ASSERT_TRUE(writer->state == + 
WriteThread::State::STATE_GROUP_LEADER); + } else { + ASSERT_TRUE(writer->state == WriteThread::State::STATE_INIT); + } + + // (meta test) the first WriteOP should indeed be the first + // and the last should be the last (all others can be out of + // order) + if (is_leader) { + ASSERT_TRUE(writer->callback->Callback(nullptr).ok() == + !write_group.front().callback_.should_fail_); + } else if (is_last) { + ASSERT_TRUE(writer->callback->Callback(nullptr).ok() == + !write_group.back().callback_.should_fail_); + } + + // wait for friends + while (threads_waiting.load() < write_group.size()) { + } + }); + + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "WriteThread::JoinBatchGroup:DoneWaiting", [&](void* arg) { + // check my state + auto* writer = reinterpret_cast(arg); + + if (!allow_batching) { + // no batching so everyone should be a leader + ASSERT_TRUE(writer->state == + WriteThread::State::STATE_GROUP_LEADER); + } else if (!allow_parallel) { + ASSERT_TRUE(writer->state == + WriteThread::State::STATE_COMPLETED); + } + }); + + std::atomic thread_num(0); + std::atomic dummy_key(0); + std::function write_with_callback_func = [&]() { + uint32_t i = thread_num.fetch_add(1); + Random rnd(i); + + // leaders gotta lead + while (i > 0 && threads_waiting.load() < 1) { + } + + // loser has to lose + while (i == write_group.size() - 1 && + threads_waiting.load() < write_group.size() - 1) { + } + + auto& write_op = write_group.at(i); + write_op.Clear(); + write_op.callback_.allow_batching_ = allow_batching; + + // insert some keys + for (uint32_t j = 0; j < rnd.Next() % 50; j++) { + // grab unique key + char my_key = 0; + do { + my_key = dummy_key.load(); + } while (!dummy_key.compare_exchange_strong(my_key, my_key + 1)); + + string skey(5, my_key); + string sval(10, my_key); + write_op.Put(skey, sval); + + if (!write_op.callback_.should_fail_) { + seq.fetch_add(1); + } + } + + WriteOptions woptions; + woptions.disableWAL = !enable_WAL; + woptions.sync = enable_WAL; + 
Status s = db_impl->WriteWithCallback( + woptions, &write_op.write_batch_, &write_op.callback_); + + if (write_op.callback_.should_fail_) { + ASSERT_TRUE(s.IsBusy()); + } else { + ASSERT_OK(s); + } + }; + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + // do all the writes + std::vector threads; + for (uint32_t i = 0; i < write_group.size(); i++) { + threads.emplace_back(write_with_callback_func); + } + for (auto& t : threads) { + t.join(); + } + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + + // check for keys + string value; + for (auto& w : write_group) { + ASSERT_TRUE(w.callback_.was_called_); + for (auto& kvp : w.kvs_) { + if (w.callback_.should_fail_) { + ASSERT_TRUE( + db->Get(read_options, kvp.first, &value).IsNotFound()); + } else { + ASSERT_OK(db->Get(read_options, kvp.first, &value)); + ASSERT_EQ(value, kvp.second); + } + } + } + + ASSERT_EQ(seq.load(), db_impl->GetLatestSequenceNumber()); + + delete db; + DestroyDB(dbname, options); + } + } + } + } +} + TEST_F(WriteCallbackTest, WriteCallBackTest) { Options options; WriteOptions write_options; diff --git a/db/write_controller.cc b/db/write_controller.cc index 7a933ec42..d46d8d3dd 100644 --- a/db/write_controller.cc +++ b/db/write_controller.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -26,6 +26,13 @@ std::unique_ptr WriteController::GetDelayToken( return std::unique_ptr(new DelayWriteToken(this)); } +std::unique_ptr +WriteController::GetCompactionPressureToken() { + ++total_compaction_pressure_; + return std::unique_ptr( + new CompactionPressureToken(this)); +} + bool WriteController::IsStopped() const { return total_stopped_ > 0; } // This is inside DB mutex, so we can't sleep and need to minimize // frequency to get time. @@ -106,4 +113,9 @@ DelayWriteToken::~DelayWriteToken() { assert(controller_->total_delayed_ >= 0); } +CompactionPressureToken::~CompactionPressureToken() { + controller_->total_compaction_pressure_--; + assert(controller_->total_compaction_pressure_ >= 0); +} + } // namespace rocksdb diff --git a/db/write_controller.h b/db/write_controller.h index a5d498c3a..052047177 100644 --- a/db/write_controller.h +++ b/db/write_controller.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -23,6 +23,7 @@ class WriteController { explicit WriteController(uint64_t _delayed_write_rate = 1024u * 1024u * 32u) : total_stopped_(0), total_delayed_(0), + total_compaction_pressure_(0), bytes_left_(0), last_refill_time_(0) { set_delayed_write_rate(_delayed_write_rate); @@ -38,10 +39,16 @@ class WriteController { // which returns number of microseconds to sleep. 
std::unique_ptr GetDelayToken( uint64_t delayed_write_rate); + // When an actor (column family) requests a moderate token, compaction + // threads will be increased + std::unique_ptr GetCompactionPressureToken(); - // these two metods are querying the state of the WriteController + // these three metods are querying the state of the WriteController bool IsStopped() const; bool NeedsDelay() const { return total_delayed_ > 0; } + bool NeedSpeedupCompaction() const { + return IsStopped() || NeedsDelay() || total_compaction_pressure_ > 0; + } // return how many microseconds the caller needs to sleep after the call // num_bytes: how many number of bytes to put into the DB. // Prerequisite: DB mutex held. @@ -59,9 +66,11 @@ class WriteController { friend class WriteControllerToken; friend class StopWriteToken; friend class DelayWriteToken; + friend class CompactionPressureToken; int total_stopped_; int total_delayed_; + int total_compaction_pressure_; uint64_t bytes_left_; uint64_t last_refill_time_; uint64_t delayed_write_rate_; @@ -96,4 +105,11 @@ class DelayWriteToken : public WriteControllerToken { virtual ~DelayWriteToken(); }; +class CompactionPressureToken : public WriteControllerToken { + public: + explicit CompactionPressureToken(WriteController* controller) + : WriteControllerToken(controller) {} + virtual ~CompactionPressureToken(); +}; + } // namespace rocksdb diff --git a/db/write_controller_test.cc b/db/write_controller_test.cc index dc5614855..db9a9db1b 100644 --- a/db/write_controller_test.cc +++ b/db/write_controller_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/db/write_thread.cc b/db/write_thread.cc index e153f319b..531da55df 100644 --- a/db/write_thread.cc +++ b/db/write_thread.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -218,21 +218,25 @@ void WriteThread::JoinBatchGroup(Writer* w) { assert(w->batch != nullptr); bool linked_as_leader; LinkOne(w, &linked_as_leader); + + TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:Wait", w); + if (!linked_as_leader) { AwaitState(w, STATE_GROUP_LEADER | STATE_PARALLEL_FOLLOWER | STATE_COMPLETED, &ctx); + TEST_SYNC_POINT_CALLBACK("WriteThread::JoinBatchGroup:DoneWaiting", w); } } size_t WriteThread::EnterAsBatchGroupLeader( Writer* leader, WriteThread::Writer** last_writer, - autovector* write_batch_group) { + autovector* write_batch_group) { assert(leader->link_older == nullptr); assert(leader->batch != nullptr); size_t size = WriteBatchInternal::ByteSize(leader->batch); - write_batch_group->push_back(leader->batch); + write_batch_group->push_back(leader); // Allow the group to grow up to a maximum size, but if the // original write is small, limit the growth so we do not slow @@ -244,12 +248,6 @@ size_t WriteThread::EnterAsBatchGroupLeader( *last_writer = leader; - if (leader->has_callback) { - // TODO(agiardullo:) Batching not currently supported as this write may - // fail if the callback function decides to abort this write. - return size; - } - Writer* newest_writer = newest_writer_.load(std::memory_order_acquire); // This is safe regardless of any db mutex status of the caller. 
Previous @@ -276,18 +274,17 @@ size_t WriteThread::EnterAsBatchGroupLeader( break; } - if (w->has_callback) { - // Do not include writes which may be aborted if the callback does not - // succeed. - break; - } - if (w->batch == nullptr) { // Do not include those writes with nullptr batch. Those are not writes, // those are something else. They want to be alone break; } + if (w->callback != nullptr && !w->callback->AllowWriteBatching()) { + // dont batch writes that don't want to be batched + break; + } + auto batch_size = WriteBatchInternal::ByteSize(w->batch); if (size + batch_size > max_size) { // Do not make batch too big @@ -295,7 +292,7 @@ size_t WriteThread::EnterAsBatchGroupLeader( } size += batch_size; - write_batch_group->push_back(w->batch); + write_batch_group->push_back(w); w->in_batch_group = true; *last_writer = w; } @@ -313,7 +310,10 @@ void WriteThread::LaunchParallelFollowers(ParallelGroup* pg, w->sequence = sequence; while (w != pg->last_writer) { - sequence += WriteBatchInternal::Count(w->batch); + // Writers that won't write don't get sequence allotment + if (!w->CallbackFailed()) { + sequence += WriteBatchInternal::Count(w->batch); + } w = w->link_newer; w->sequence = sequence; @@ -330,6 +330,7 @@ bool WriteThread::CompleteParallelWorker(Writer* w) { std::lock_guard guard(w->StateMutex()); pg->status = w->status; } + auto leader = pg->leader; auto early_exit_allowed = pg->early_exit_allowed; @@ -364,8 +365,8 @@ void WriteThread::EarlyExitParallelGroup(Writer* w) { assert(w->state == STATE_PARALLEL_FOLLOWER); assert(pg->status.ok()); ExitAsBatchGroupLeader(pg->leader, pg->last_writer, pg->status); - assert(w->state == STATE_COMPLETED); assert(w->status.ok()); + assert(w->state == STATE_COMPLETED); SetState(pg->leader, STATE_COMPLETED); } @@ -407,7 +408,6 @@ void WriteThread::ExitAsBatchGroupLeader(Writer* leader, Writer* last_writer, while (last_writer != leader) { last_writer->status = status; - // we need to read link_older before calling 
SetState, because as soon // as it is marked committed the other thread's Await may return and // deallocate the Writer. diff --git a/db/write_thread.h b/db/write_thread.h index e31904ed1..c3cb5cc0e 100644 --- a/db/write_thread.h +++ b/db/write_thread.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -13,8 +13,10 @@ #include #include #include -#include "db/write_batch_internal.h" +#include "db/write_callback.h" +#include "rocksdb/types.h" #include "rocksdb/status.h" +#include "rocksdb/write_batch.h" #include "util/autovector.h" #include "util/instrumented_mutex.h" @@ -65,6 +67,7 @@ class WriteThread { struct ParallelGroup { Writer* leader; Writer* last_writer; + SequenceNumber last_sequence; bool early_exit_allowed; // before running goes to zero, status needs leader->StateMutex() Status status; @@ -77,12 +80,13 @@ class WriteThread { bool sync; bool disableWAL; bool in_batch_group; - bool has_callback; + WriteCallback* callback; bool made_waitable; // records lazy construction of mutex and cv std::atomic state; // write under StateMutex() or pre-link ParallelGroup* parallel_group; SequenceNumber sequence; // the sequence number to use - Status status; + Status status; // status of memtable inserter + Status callback_status; // status returned by callback->Callback() std::aligned_storage::type state_mutex_bytes; std::aligned_storage::type state_cv_bytes; Writer* link_older; // read/write only before linking, or as leader @@ -93,9 +97,10 @@ class WriteThread { sync(false), disableWAL(false), in_batch_group(false), - has_callback(false), + callback(nullptr), made_waitable(false), state(STATE_INIT), + parallel_group(nullptr), link_older(nullptr), 
link_newer(nullptr) {} @@ -106,6 +111,13 @@ } } + bool CheckCallback(DB* db) { + if (callback != nullptr) { + callback_status = callback->Callback(db); + } + return callback_status.ok(); + } + void CreateMutex() { if (!made_waitable) { // Note that made_waitable is tracked separately from state @@ -117,6 +129,30 @@ } } + // returns the aggregate status of this Writer + Status FinalStatus() { + if (!status.ok()) { + // a non-ok memtable write status takes precedence + assert(callback == nullptr || callback_status.ok()); + return status; + } else if (!callback_status.ok()) { + // if the callback failed then that is the status we want + // because a memtable insert should not have been attempted + assert(callback != nullptr); + assert(status.ok()); + return callback_status; + } else { + // if there is no callback then we only care about + // the memtable insert status + assert(callback == nullptr || callback_status.ok()); + return status; + } + } + + bool CallbackFailed() { + return (callback != nullptr) && !callback_status.ok(); + } + // No other mutexes may be acquired while holding StateMutex(), it is // always last in the order std::mutex& StateMutex() { @@ -160,8 +196,9 @@ // Writer** last_writer: Out-param that identifies the last follower // autovector* write_batch_group: Out-param of group members // returns: Total batch group byte size - size_t EnterAsBatchGroupLeader(Writer* leader, Writer** last_writer, - autovector* write_batch_group); + size_t EnterAsBatchGroupLeader( + Writer* leader, Writer** last_writer, + autovector* write_batch_group); // Causes JoinBatchGroup to return STATE_PARALLEL_FOLLOWER for all of the // non-leader members of this write batch group. Sets Writer::sequence diff --git a/db/writebuffer.h b/db/writebuffer.h index 4fe51d8a7..19d51d925 100644 --- a/db/writebuffer.h +++ b/db/writebuffer.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/db/xfunc_test_points.cc b/db/xfunc_test_points.cc new file mode 100644 index 000000000..67e96dd05 --- /dev/null +++ b/db/xfunc_test_points.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "db/xfunc_test_points.h" +#include "util/xfunc.h" + +namespace rocksdb { + +#ifdef XFUNC + +void xf_manage_release(ManagedIterator* iter) { + if (!(XFuncPoint::GetSkip() & kSkipNoPrefix)) { + iter->ReleaseIter(false); + } +} + +void xf_manage_create(ManagedIterator* iter) { iter->SetDropOld(false); } + +void xf_manage_new(DBImpl* db, ReadOptions* read_options, + bool is_snapshot_supported) { + if ((!XFuncPoint::Check("managed_xftest_dropold") && + (!XFuncPoint::Check("managed_xftest_release"))) || + (!read_options->managed)) { + return; + } + if ((!read_options->tailing) && (read_options->snapshot == nullptr) && + (!is_snapshot_supported)) { + read_options->managed = false; + return; + } + if (db->GetOptions().prefix_extractor != nullptr) { + if (strcmp(db->GetOptions().table_factory.get()->Name(), "PlainTable")) { + if (!(XFuncPoint::GetSkip() & kSkipNoPrefix)) { + read_options->total_order_seek = true; + } + } else { + read_options->managed = false; + } + } +} + +class XFTransactionWriteHandler : public WriteBatch::Handler { + public: + Transaction* txn_; + DBImpl* db_impl_; + + XFTransactionWriteHandler(Transaction* txn, DBImpl* db_impl) + : txn_(txn), db_impl_(db_impl) {} + + virtual Status 
PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + InstrumentedMutexLock l(&db_impl_->mutex_); + + ColumnFamilyHandle* cfh = db_impl_->GetColumnFamilyHandle(column_family_id); + if (cfh == nullptr) { + return Status::InvalidArgument( + "XFUNC test could not find column family " + "handle for id ", + ToString(column_family_id)); + } + + txn_->Put(cfh, key, value); + + return Status::OK(); + } + + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + InstrumentedMutexLock l(&db_impl_->mutex_); + + ColumnFamilyHandle* cfh = db_impl_->GetColumnFamilyHandle(column_family_id); + if (cfh == nullptr) { + return Status::InvalidArgument( + "XFUNC test could not find column family " + "handle for id ", + ToString(column_family_id)); + } + + txn_->Merge(cfh, key, value); + + return Status::OK(); + } + + virtual Status DeleteCF(uint32_t column_family_id, + const Slice& key) override { + InstrumentedMutexLock l(&db_impl_->mutex_); + + ColumnFamilyHandle* cfh = db_impl_->GetColumnFamilyHandle(column_family_id); + if (cfh == nullptr) { + return Status::InvalidArgument( + "XFUNC test could not find column family " + "handle for id ", + ToString(column_family_id)); + } + + txn_->Delete(cfh, key); + + return Status::OK(); + } + + virtual void LogData(const Slice& blob) override { txn_->PutLogData(blob); } +}; + +// Whenever DBImpl::Write is called, create a transaction and do the write via +// the transaction. 
+void xf_transaction_write(const WriteOptions& write_options, + const DBOptions& db_options, WriteBatch* my_batch, + WriteCallback* callback, DBImpl* db_impl, Status* s, + bool* write_attempted) { + if (callback != nullptr) { + // We may already be in a transaction, don't force a transaction + *write_attempted = false; + return; + } + + OptimisticTransactionDB* txn_db = new OptimisticTransactionDB(db_impl); + Transaction* txn = Transaction::BeginTransaction(txn_db, write_options); + + XFTransactionWriteHandler handler(txn, db_impl); + *s = my_batch->Iterate(&handler); + + if (!s->ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options.info_log, + "XFUNC test could not iterate batch. status: %s\n", + s->ToString().c_str()); + } + + *s = txn->Commit(); + + if (!s->ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options.info_log, + "XFUNC test could not commit transaction. status: %s\n", + s->ToString().c_str()); + } + + *write_attempted = true; + delete txn; + delete txn_db; +} + +#endif // XFUNC + +} // namespace rocksdb diff --git a/db/xfunc_test_points.h b/db/xfunc_test_points.h new file mode 100644 index 000000000..8ed9f2c73 --- /dev/null +++ b/db/xfunc_test_points.h @@ -0,0 +1,33 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include "db/db_impl.h" +#include "db/managed_iterator.h" +#include "db/write_callback.h" +#include "rocksdb/options.h" +#include "rocksdb/write_batch.h" +#include "util/xfunc.h" + +namespace rocksdb { + +#ifdef XFUNC + +// DB-specific test points for the cross-functional test framework (see +// util/xfunc.h). 
+void xf_manage_release(ManagedIterator* iter); +void xf_manage_create(ManagedIterator* iter); +void xf_manage_new(DBImpl* db, ReadOptions* readoptions, + bool is_snapshot_supported); +void xf_transaction_write(const WriteOptions& write_options, + const DBOptions& db_options, + class WriteBatch* my_batch, + class WriteCallback* callback, DBImpl* db_impl, + Status* success, bool* write_attempted); + +#endif // XFUNC + +} // namespace rocksdb diff --git a/examples/column_families_example.cc b/examples/column_families_example.cc index 3ffac064d..f2dec691e 100644 --- a/examples/column_families_example.cc +++ b/examples/column_families_example.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/examples/compact_files_example.cc b/examples/compact_files_example.cc index 6c0456675..023ae403b 100644 --- a/examples/compact_files_example.cc +++ b/examples/compact_files_example.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/examples/compaction_filter_example.cc b/examples/compaction_filter_example.cc index 050f4611a..6b0feb149 100644 --- a/examples/compaction_filter_example.cc +++ b/examples/compaction_filter_example.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/examples/optimistic_transaction_example.cc b/examples/optimistic_transaction_example.cc index e9ab0e5ee..d28a305b3 100644 --- a/examples/optimistic_transaction_example.cc +++ b/examples/optimistic_transaction_example.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/examples/options_file_example.cc b/examples/options_file_example.cc index 916ff02f3..360ccddf2 100644 --- a/examples/options_file_example.cc +++ b/examples/options_file_example.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/examples/rocksdb_option_file_example.ini b/examples/rocksdb_option_file_example.ini index 838afe8eb..7dc070429 100644 --- a/examples/rocksdb_option_file_example.ini +++ b/examples/rocksdb_option_file_example.ini @@ -138,6 +138,7 @@ block_size=8192 block_restart_interval=16 cache_index_and_filter_blocks=false + pin_l0_filter_and_index_blocks_in_cache=false index_type=kBinarySearch hash_index_allow_collision=true flush_block_policy_factory=FlushBlockBySizePolicyFactory diff --git a/examples/simple_example.cc b/examples/simple_example.cc index 28a7c9e8b..453443479 100644 --- a/examples/simple_example.cc +++ b/examples/simple_example.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/examples/transaction_example.cc b/examples/transaction_example.cc index a7d506129..914f1bc30 100644 --- a/examples/transaction_example.cc +++ b/examples/transaction_example.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/hdfs/env_hdfs.h b/hdfs/env_hdfs.h index d2afbfb3c..ab27e0440 100644 --- a/hdfs/env_hdfs.h +++ b/hdfs/env_hdfs.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 5cea81d15..6e52d20af 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2013, Facebook, Inc. All rights reserved. +/* Copyright (c) 2011-present, Facebook, Inc. All rights reserved. This source code is licensed under the BSD-style license found in the LICENSE file in the root directory of this source tree. An additional grant of patent rights can be found in the PATENTS file in the same directory. @@ -451,6 +451,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_cache_index_and_filter_blocks( rocksdb_block_based_table_options_t*, unsigned char); extern ROCKSDB_LIBRARY_API void +rocksdb_block_based_options_set_pin_l0_filter_and_index_blocks_in_cache( + rocksdb_block_based_table_options_t*, unsigned char); +extern ROCKSDB_LIBRARY_API void rocksdb_block_based_options_set_skip_table_builder_flush( rocksdb_block_based_table_options_t* options, unsigned char); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_block_based_table_factory( diff --git a/include/rocksdb/cache.h b/include/rocksdb/cache.h index 47657b90a..327270e34 100644 --- a/include/rocksdb/cache.h +++ b/include/rocksdb/cache.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -25,6 +25,7 @@ #include #include #include "rocksdb/slice.h" +#include "rocksdb/status.h" namespace rocksdb { @@ -33,12 +34,15 @@ using std::shared_ptr; class Cache; // Create a new cache with a fixed size capacity. The cache is sharded -// to 2^numShardBits shards, by hash of the key. 
The total capacity +// to 2^num_shard_bits shards, by hash of the key. The total capacity // is divided and evenly assigned to each shard. // -// The functions without parameter numShardBits uses default value, which is 4 +// The parameter num_shard_bits defaults to 4, and strict_capacity_limit +// defaults to false. extern shared_ptr NewLRUCache(size_t capacity); -extern shared_ptr NewLRUCache(size_t capacity, int numShardBits); +extern shared_ptr NewLRUCache(size_t capacity, int num_shard_bits); +extern shared_ptr NewLRUCache(size_t capacity, int num_shard_bits, + bool strict_capacity_limit); class Cache { public: @@ -55,15 +59,22 @@ class Cache { // Insert a mapping from key->value into the cache and assign it // the specified charge against the total cache capacity. + // If strict_capacity_limit is true and cache reaches its full capacity, + // return Status::Incomplete. // - // Returns a handle that corresponds to the mapping. The caller - // must call this->Release(handle) when the returned mapping is no - // longer needed. + // If handle is not nullptr, returns a handle that corresponds to the + // mapping. The caller must call this->Release(handle) when the returned + // mapping is no longer needed. In case of error caller is responsible to + // cleanup the value (i.e. calling "deleter"). + // + // If handle is nullptr, it is as if Release is called immediately after + // insert. In case of error value will be cleanup. // // When the inserted entry is no longer needed, the key and // value will be passed to "deleter". - virtual Handle* Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value)) = 0; + virtual Status Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), + Handle** handle = nullptr) = 0; // If the cache has no mapping for "key", returns nullptr. 
// @@ -100,6 +111,14 @@ class Cache { // purge the released entries from the cache in order to lower the usage virtual void SetCapacity(size_t capacity) = 0; + // Set whether to return error on insertion when cache reaches its full + // capacity. + virtual void SetStrictCapacityLimit(bool strict_capacity_limit) = 0; + + // Set whether to return error on insertion when cache reaches its full + // capacity. + virtual bool HasStrictCapacityLimit() const = 0; + // returns the maximum configured capacity of the cache virtual size_t GetCapacity() const = 0; diff --git a/include/rocksdb/compaction_filter.h b/include/rocksdb/compaction_filter.h index 1286840fe..acdc3aa1b 100644 --- a/include/rocksdb/compaction_filter.h +++ b/include/rocksdb/compaction_filter.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/compaction_job_stats.h b/include/rocksdb/compaction_job_stats.h index 533190015..d06fbe403 100644 --- a/include/rocksdb/compaction_job_stats.h +++ b/include/rocksdb/compaction_job_stats.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/comparator.h b/include/rocksdb/comparator.h index 8fc2710aa..1c67b0d4e 100644 --- a/include/rocksdb/comparator.h +++ b/include/rocksdb/comparator.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/convenience.h b/include/rocksdb/convenience.h index f9111b4e3..b4935ce6e 100644 --- a/include/rocksdb/convenience.h +++ b/include/rocksdb/convenience.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 9bc0993c5..d05c04ea1 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -327,92 +327,163 @@ class DB { // use "snapshot" after this call. virtual void ReleaseSnapshot(const Snapshot* snapshot) = 0; - // DB implementations can export properties about their state - // via this method. If "property" is a valid property understood by this - // DB implementation, fills "*value" with its current value and returns - // true. Otherwise returns false. - // - // - // Valid property names include: - // - // "rocksdb.num-files-at-level" - return the number of files at level , - // where is an ASCII representation of a level number (e.g. "0"). 
- // "rocksdb.stats" - returns a multi-line string that describes statistics - // about the internal operation of the DB. - // "rocksdb.sstables" - returns a multi-line string that describes all - // of the sstables that make up the db contents. - // "rocksdb.cfstats" - // "rocksdb.dbstats" - // "rocksdb.num-immutable-mem-table" - // "rocksdb.mem-table-flush-pending" - // "rocksdb.compaction-pending" - 1 if at least one compaction is pending - // "rocksdb.background-errors" - accumulated number of background errors - // "rocksdb.cur-size-active-mem-table" - // "rocksdb.size-all-mem-tables" - // "rocksdb.num-entries-active-mem-table" - // "rocksdb.num-entries-imm-mem-tables" - // "rocksdb.num-deletes-active-mem-table" - // "rocksdb.num-deletes-imm-mem-tables" - // "rocksdb.estimate-num-keys" - estimated keys in the column family - // "rocksdb.estimate-table-readers-mem" - estimated memory used for reding - // SST tables, that is not counted as a part of block cache. - // "rocksdb.is-file-deletions-enabled" - // "rocksdb.num-snapshots" - // "rocksdb.oldest-snapshot-time" - // "rocksdb.num-live-versions" - `version` is an internal data structure. - // See version_set.h for details. More live versions often mean more SST - // files are held from being deleted, by iterators or unfinished - // compactions. - // "rocksdb.estimate-live-data-size" - // "rocksdb.total-sst-files-size" - total size of all used sst files, this - // may slow down online queries if there are too many files. - // "rocksdb.base-level" - // "rocksdb.estimate-pending-compaction-bytes" - estimated total number of - // bytes compaction needs to rewrite the data to get all levels down - // to under target size. Not valid for other compactions than - // level-based. - // "rocksdb.aggregated-table-properties" - returns a string representation - // of the aggregated table properties of the target column family. 
- // "rocksdb.aggregated-table-properties-at-level", same as the previous - // one but only returns the aggregated table properties of the specified - // level "N" at the target column family. - // "rocksdb.num-running-compactions" - the number of currently running - // compacitons. - // "rocksdb.num-running-flushes" - the number of currently running flushes. #ifndef ROCKSDB_LITE + // Contains all valid property arguments for GetProperty(). + // + // NOTE: Property names cannot end in numbers since those are interpreted as + // arguments, e.g., see kNumFilesAtLevelPrefix. struct Properties { + // "rocksdb.num-files-at-level" - returns string containing the number + // of files at level , where is an ASCII representation of a + // level number (e.g., "0"). static const std::string kNumFilesAtLevelPrefix; + + // "rocksdb.stats" - returns a multi-line string containing the data + // described by kCFStats followed by the data described by kDBStats. static const std::string kStats; + + // "rocksdb.sstables" - returns a multi-line string summarizing current + // SST files. static const std::string kSSTables; + + // "rocksdb.cfstats" - returns a multi-line string with general column + // family stats per-level over db's lifetime ("L"), aggregated over + // db's lifetime ("Sum"), and aggregated over the interval since the + // last retrieval ("Int"). static const std::string kCFStats; + + // "rocksdb.dbstats" - returns a multi-line string with general database + // stats, both cumulative (over the db's lifetime) and interval (since + // the last retrieval of kDBStats). static const std::string kDBStats; + + // "rocksdb.levelstats" - returns multi-line string containing the number + // of files per level and total size of each level (MB). + static const std::string kLevelStats; + + // "rocksdb.num-immutable-mem-table" - returns number of immutable + // memtables that have not yet been flushed. 
static const std::string kNumImmutableMemTable; + + // "rocksdb.num-immutable-mem-table-flushed" - returns number of immutable + // memtables that have already been flushed. + static const std::string kNumImmutableMemTableFlushed; + + // "rocksdb.mem-table-flush-pending" - returns 1 if a memtable flush is + // pending; otherwise, returns 0. static const std::string kMemTableFlushPending; + + // "rocksdb.num-running-flushes" - returns the number of currently running + // flushes. static const std::string kNumRunningFlushes; + + // "rocksdb.compaction-pending" - returns 1 if at least one compaction is + // pending; otherwise, returns 0. static const std::string kCompactionPending; + + // "rocksdb.num-running-compactions" - returns the number of currently + // running compactions. static const std::string kNumRunningCompactions; + + // "rocksdb.background-errors" - returns accumulated number of background + // errors. static const std::string kBackgroundErrors; + + // "rocksdb.cur-size-active-mem-table" - returns approximate size of active + // memtable (bytes). static const std::string kCurSizeActiveMemTable; + + // "rocksdb.cur-size-all-mem-tables" - returns approximate size of active + // and unflushed immutable memtables (bytes). static const std::string kCurSizeAllMemTables; + + // "rocksdb.size-all-mem-tables" - returns approximate size of active, + // unflushed immutable, and pinned immutable memtables (bytes). static const std::string kSizeAllMemTables; + + // "rocksdb.num-entries-active-mem-table" - returns total number of entries + // in the active memtable. static const std::string kNumEntriesActiveMemTable; + + // "rocksdb.num-entries-imm-mem-tables" - returns total number of entries + // in the unflushed immutable memtables. static const std::string kNumEntriesImmMemTables; + + // "rocksdb.num-deletes-active-mem-table" - returns total number of delete + // entries in the active memtable. 
static const std::string kNumDeletesActiveMemTable; + + // "rocksdb.num-deletes-imm-mem-tables" - returns total number of delete + // entries in the unflushed immutable memtables. static const std::string kNumDeletesImmMemTables; + + // "rocksdb.estimate-num-keys" - returns estimated number of total keys in + // the active and unflushed immutable memtables. static const std::string kEstimateNumKeys; + + // "rocksdb.estimate-table-readers-mem" - returns estimated memory used for + // reading SST tables, excluding memory used in block cache (e.g., + // filter and index blocks). static const std::string kEstimateTableReadersMem; + + // "rocksdb.is-file-deletions-enabled" - returns 0 if deletion of obsolete + // files is enabled; otherwise, returns a non-zero number. static const std::string kIsFileDeletionsEnabled; + + // "rocksdb.num-snapshots" - returns number of unreleased snapshots of the + // database. static const std::string kNumSnapshots; + + // "rocksdb.oldest-snapshot-time" - returns number representing unix + // timestamp of oldest unreleased snapshot. static const std::string kOldestSnapshotTime; + + // "rocksdb.num-live-versions" - returns number of live versions. `Version` + // is an internal data structure. See version_set.h for details. More + // live versions often mean more SST files are held from being deleted, + // by iterators or unfinished compactions. static const std::string kNumLiveVersions; + + // "rocksdb.current-super-version-number" - returns number of current LSM + // version. It is a uint64_t integer number, incremented after there is + // any change to the LSM tree. The number is not preserved after restarting + // the DB. After DB restart, it will start from 0 again. + static const std::string kCurrentSuperVersionNumber; + + // "rocksdb.estimate-live-data-size" - returns an estimate of the amount of + // live data in bytes. 
static const std::string kEstimateLiveDataSize; + + // "rocksdb.total-sst-files-size" - returns total size (bytes) of all SST + // files. + // WARNING: may slow down online queries if there are too many files. static const std::string kTotalSstFilesSize; + + // "rocksdb.base-level" - returns number of level to which L0 data will be + // compacted. + static const std::string kBaseLevel; + + // "rocksdb.estimate-pending-compaction-bytes" - returns estimated total + // number of bytes compaction needs to rewrite to get all levels down + // to under target size. Not valid for other compactions than level- + // based. static const std::string kEstimatePendingCompactionBytes; + + // "rocksdb.aggregated-table-properties" - returns a string representation + // of the aggregated table properties of the target column family. static const std::string kAggregatedTableProperties; + + // "rocksdb.aggregated-table-properties-at-level", same as the previous + // one but only returns the aggregated table properties of the + // specified level "N" at the target column family. static const std::string kAggregatedTablePropertiesAtLevel; }; #endif /* ROCKSDB_LITE */ + // DB implementations can export properties about their state via this method. + // If "property" is a valid property understood by this DB implementation (see + // Properties struct above for valid options), fills "*value" with its current + // value and returns true. Otherwise, returns false. 
virtual bool GetProperty(ColumnFamilyHandle* column_family, const Slice& property, std::string* value) = 0; virtual bool GetProperty(const Slice& property, std::string* value) { @@ -439,6 +510,7 @@ class DB { // "rocksdb.num-snapshots" // "rocksdb.oldest-snapshot-time" // "rocksdb.num-live-versions" + // "rocksdb.current-super-version-number" // "rocksdb.estimate-live-data-size" // "rocksdb.total-sst-files-size" // "rocksdb.base-level" diff --git a/include/rocksdb/db_bench_tool.h b/include/rocksdb/db_bench_tool.h new file mode 100644 index 000000000..0e33ae96e --- /dev/null +++ b/include/rocksdb/db_bench_tool.h @@ -0,0 +1,9 @@ +// Copyright (c) 2013-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#pragma once + +namespace rocksdb { +int db_bench_tool(int argc, char** argv); +} // namespace rocksdb diff --git a/include/rocksdb/db_dump_tool.h b/include/rocksdb/db_dump_tool.h index 67575a94b..1acc63176 100644 --- a/include/rocksdb/db_dump_tool.h +++ b/include/rocksdb/db_dump_tool.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/delete_scheduler.h b/include/rocksdb/delete_scheduler.h deleted file mode 100644 index 7c3eaee77..000000000 --- a/include/rocksdb/delete_scheduler.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. 
An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#pragma once - -#include -#include -#include - -#include "rocksdb/status.h" - -namespace rocksdb { - -class Env; -class Logger; - -// DeleteScheduler allow the DB to enforce a rate limit on file deletion, -// Instead of deleteing files immediately, files are moved to trash_dir -// and deleted in a background thread that apply sleep penlty between deletes -// if they are happening in a rate faster than rate_bytes_per_sec, -// -// Rate limiting can be turned off by setting rate_bytes_per_sec = 0, In this -// case DeleteScheduler will delete files immediately. -class DeleteScheduler { - public: - virtual ~DeleteScheduler() {} - - // Return delete rate limit in bytes per second - virtual int64_t GetRateBytesPerSecond() = 0; - - // Move file to trash directory and schedule it's deletion - virtual Status DeleteFile(const std::string& fname) = 0; - - // Return a map containing errors that happened in the background thread - // file_path => error status - virtual std::map GetBackgroundErrors() = 0; - - // Wait for all files being deleteing in the background to finish or for - // destructor to be called. - virtual void WaitForEmptyTrash() = 0; -}; - -// Create a new DeleteScheduler that can be shared among multiple RocksDB -// instances to control the file deletion rate. -// -// @env: Pointer to Env object, please see "rocksdb/env.h". -// @trash_dir: Path to the directory where deleted files will be moved into -// to be deleted in a background thread while applying rate limiting. If this -// directory dont exist, it will be created. This directory should not be -// used by any other process or any other DeleteScheduler. 
-// @rate_bytes_per_sec: How many bytes should be deleted per second, If this -// value is set to 1024 (1 Kb / sec) and we deleted a file of size 4 Kb -// in 1 second, we will wait for another 3 seconds before we delete other -// files, Set to 0 to disable rate limiting. -// @info_log: If not nullptr, info_log will be used to log errors. -// @delete_exisitng_trash: If set to true, the newly created DeleteScheduler -// will delete files that already exist in trash_dir. -// @status: If not nullptr, status will contain any errors that happened during -// creating the missing trash_dir or deleting existing files in trash. -extern DeleteScheduler* NewDeleteScheduler( - Env* env, const std::string& trash_dir, int64_t rate_bytes_per_sec, - std::shared_ptr info_log = nullptr, - bool delete_exisitng_trash = true, Status* status = nullptr); - -} // namespace rocksdb diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index e5f892a75..7bdb6ee61 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -103,6 +103,14 @@ struct EnvOptions { class Env { public: + struct FileAttributes { + // File name + std::string name; + + // Size of file in bytes + uint64_t size_bytes; + }; + Env() : thread_status_updater_(nullptr) {} virtual ~Env(); @@ -177,6 +185,15 @@ class Env { virtual Status GetChildren(const std::string& dir, std::vector* result) = 0; + // Store in *result the attributes of the children of the specified directory. + // In case the implementation lists the directory prior to iterating the files + // and files are concurrently deleted, the deleted files will be omitted from + // result. 
+ // The name attributes are relative to "dir". + // Original contents of *results are dropped. + virtual Status GetChildrenFileAttributes(const std::string& dir, + std::vector* result); + // Delete the named file. virtual Status DeleteFile(const std::string& fname) = 0; @@ -789,6 +806,10 @@ class EnvWrapper : public Env { std::vector* r) override { return target_->GetChildren(dir, r); } + Status GetChildrenFileAttributes( + const std::string& dir, std::vector* result) override { + return target_->GetChildrenFileAttributes(dir, result); + } Status DeleteFile(const std::string& f) override { return target_->DeleteFile(f); } diff --git a/include/rocksdb/experimental.h b/include/rocksdb/experimental.h index 1d02e0238..70ad0b914 100644 --- a/include/rocksdb/experimental.h +++ b/include/rocksdb/experimental.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/filter_policy.h b/include/rocksdb/filter_policy.h index 90aefb388..2c1588a23 100644 --- a/include/rocksdb/filter_policy.h +++ b/include/rocksdb/filter_policy.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/include/rocksdb/flush_block_policy.h b/include/rocksdb/flush_block_policy.h index 939725cf4..022e0be4a 100644 --- a/include/rocksdb/flush_block_policy.h +++ b/include/rocksdb/flush_block_policy.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/immutable_options.h b/include/rocksdb/immutable_options.h index 52978691b..5a1500826 100644 --- a/include/rocksdb/immutable_options.h +++ b/include/rocksdb/immutable_options.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/iostats_context.h b/include/rocksdb/iostats_context.h index e81092b52..632fe44c8 100644 --- a/include/rocksdb/iostats_context.h +++ b/include/rocksdb/iostats_context.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -18,7 +18,7 @@ struct IOStatsContext { // reset all io-stats counter to zero void Reset(); - std::string ToString() const; + std::string ToString(bool exclude_zero_counters = false) const; // the thread pool id uint64_t thread_pool_id; diff --git a/include/rocksdb/iterator.h b/include/rocksdb/iterator.h index 885232db6..7da37ec33 100644 --- a/include/rocksdb/iterator.h +++ b/include/rocksdb/iterator.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -19,6 +19,7 @@ #ifndef STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_ #define STORAGE_ROCKSDB_INCLUDE_ITERATOR_H_ +#include #include "rocksdb/slice.h" #include "rocksdb/status.h" @@ -95,14 +96,19 @@ class Iterator : public Cleanable { // satisfied without doing some IO, then this returns Status::Incomplete(). virtual Status status() const = 0; - // If true, this means that the Slice returned by key() is valid as long - // as the iterator is not deleted and ReleasePinnedData() is not called. - // - // IsKeyPinned() is guaranteed to always return true if - // - Iterator created with ReadOptions::pin_data = true - // - DB tables were created with BlockBasedTableOptions::use_delta_encoding - // set to false. - virtual bool IsKeyPinned() const { return false; } + // Property "rocksdb.iterator.is-key-pinned": + // If returning "1", this means that the Slice returned by key() is valid + // as long as the iterator is not deleted and ReleasePinnedData() is not + // called. + // It is guaranteed to always return "1" if + // - Iterator created with ReadOptions::pin_data = true + // - DB tables were created with + // BlockBasedTableOptions::use_delta_encoding + // set to false. 
+ // Property "rocksdb.iterator.super-version-number": + // LSM version used by the iterator. The same format as DB Property + // kCurrentSuperVersionNumber. See its comment for more information. + virtual Status GetProperty(std::string prop_name, std::string* prop); private: // No copying allowed diff --git a/include/rocksdb/ldb_tool.h b/include/rocksdb/ldb_tool.h index 1b1c64b06..8a6918ba4 100644 --- a/include/rocksdb/ldb_tool.h +++ b/include/rocksdb/ldb_tool.h @@ -1,10 +1,12 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. #ifndef ROCKSDB_LITE #pragma once #include +#include +#include "rocksdb/db.h" #include "rocksdb/options.h" namespace rocksdb { @@ -28,8 +30,10 @@ struct LDBOptions { class LDBTool { public: - void Run(int argc, char** argv, Options db_options= Options(), - const LDBOptions& ldb_options = LDBOptions()); + void Run( + int argc, char** argv, Options db_options = Options(), + const LDBOptions& ldb_options = LDBOptions(), + const std::vector* column_families = nullptr); }; } // namespace rocksdb diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index 6cd92d823..f6f030946 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -188,10 +188,6 @@ class MemTableRep { // Default: true virtual bool IsSnapshotSupported() const { return true; } - // Return true if the current MemTableRep supports concurrent inserts - // Default: false - virtual bool IsInsertConcurrentlySupported() const { return false; } - protected: // When *key is an internal key concatenated with the value, returns the // user key. @@ -210,6 +206,10 @@ class MemTableRepFactory { const SliceTransform*, Logger* logger) = 0; virtual const char* Name() const = 0; + + // Return true if the current MemTableRep supports concurrent inserts + // Default: false + virtual bool IsInsertConcurrentlySupported() const { return false; } }; // This uses a skip list to store keys. It is the default. @@ -229,6 +229,8 @@ class SkipListFactory : public MemTableRepFactory { Logger* logger) override; virtual const char* Name() const override { return "SkipListFactory"; } + bool IsInsertConcurrentlySupported() const override { return true; } + private: const size_t lookahead_; }; diff --git a/include/rocksdb/merge_operator.h b/include/rocksdb/merge_operator.h index 05b66f202..09b9d7dd6 100644 --- a/include/rocksdb/merge_operator.h +++ b/include/rocksdb/merge_operator.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h index 7cdf4a1a9..5425146d7 100644 --- a/include/rocksdb/metadata.h +++ b/include/rocksdb/metadata.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 5a18027e1..60f9cd121 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -33,6 +33,7 @@ class CompactionFilterFactory; class Comparator; class Env; enum InfoLogLevel : unsigned char; +class SstFileManager; class FilterPolicy; class Logger; class MergeOperator; @@ -41,7 +42,6 @@ class TableFactory; class MemTableRepFactory; class TablePropertiesCollectorFactory; class RateLimiter; -class DeleteScheduler; class Slice; class SliceTransform; class Statistics; @@ -80,6 +80,9 @@ enum CompactionStyle : char { kCompactionStyleNone = 0x3, }; +// In level-based compaction, it determines which file from a level to be +// picked to merge to the next level. We suggest people try +// kMinOverlappingRatio first when you tune your database. enum CompactionPri : char { // Slightly prioritize larger files by size compensated by #deletes kByCompensatedSize = 0x0, @@ -90,6 +93,10 @@ enum CompactionPri : char { // for the longest. If your updates are random across the key space, // write amplification is slightly better with this option. kOldestSmallestSeqFirst = 0x2, + // First compact files whose ratio between overlapping size in next level + // and its size is the smallest. In many cases it can optimize write + // amplification. 
+ kMinOverlappingRatio = 0x3, }; enum class WALRecoveryMode : char { @@ -830,12 +837,12 @@ struct DBOptions { // Default: nullptr std::shared_ptr rate_limiter; - // Use to control files deletion rate, can be used among multiple - // RocksDB instances. delete_scheduler is only used to delete table files that - // need to be deleted from the first db_path (db_name if db_paths is empty), - // other files types and other db_paths wont be affected by delete_scheduler. - // Default: nullptr (disabled) - std::shared_ptr delete_scheduler; + // Use to track SST files and control their file deletion rate, can be used + // among multiple RocksDB instances, sst_file_manager only track and throttle + // deletes of SST files in first db_path (db_name if db_paths is empty), other + // files and other db_paths wont be tracked or affected by sst_file_manager. + // Default: nullptr + std::shared_ptr sst_file_manager; // Any internal progress/error information generated by the db will // be written to info_log if it is non-nullptr, or to a file stored @@ -933,8 +940,19 @@ struct DBOptions { // regardless of this setting uint64_t delete_obsolete_files_period_micros; + // Suggested number of concurrent background compaction jobs, submitted to + // the default LOW priority thread pool. + // + // Default: max_background_compactions + int base_background_compactions; + // Maximum number of concurrent background compaction jobs, submitted to // the default LOW priority thread pool. + // We first try to schedule compactions based on + // `base_background_compactions`. If the compaction cannot catch up , we + // will increase number of compaction threads up to + // `max_background_compactions`. + // // If you're increasing this, also consider increasing number of threads in // LOW priority thread pool. For more information, see // Env::SetBackgroundThreads @@ -1023,7 +1041,23 @@ struct DBOptions { // large amounts of data (such as xfs's allocsize option). 
size_t manifest_preallocation_size; - // Data being read from file storage may be buffered in the OS + // Hint the OS that it should not buffer disk I/O. Enabling this + // parameter may improve performance but increases pressure on the + // system cache. + // + // The exact behavior of this parameter is platform dependent. + // + // On POSIX systems, after RocksDB reads data from disk it will + // mark the pages as "unneeded". The operating system may - or may not + // - evict these pages from memory, reducing pressure on the system + // cache. If the disk block is requested again this can result in + // additional disk I/O. + // + // On WINDOWS system, files will be opened in "unbuffered I/O" mode + // which means that data read from the disk will not be cached or + // bufferized. The hardware buffer of the devices may however still + // be used. Memory mapped files are not impacted by this parameter. + // // Default: true bool allow_os_buffer; @@ -1110,6 +1144,9 @@ struct DBOptions { // This option is currently honored only on Windows // // Default: 1 Mb + // + // Special value: 0 - means do not maintain per instance buffer. Allocate + // per request buffer and avoid locking. size_t random_access_max_buffer_size; // This is the maximum buffer size that is used by WritableFileWriter. @@ -1280,8 +1317,12 @@ struct Options : public DBOptions, public ColumnFamilyOptions { // the block cache. It will not page in data from the OS cache or data that // resides in storage. enum ReadTier { - kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage - kBlockCacheTier = 0x1 // data in memtable or block cache + kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage + kBlockCacheTier = 0x1, // data in memtable or block cache + kPersistedTier = 0x2 // persisted data. When WAL is disabled, this option + // will skip data in memtable. + // Note that this ReadTier currently only supports + // Get and MultiGet and does not support iterators. 
}; // Options that control read operations @@ -1374,8 +1415,9 @@ struct ReadOptions { // Keep the blocks loaded by the iterator pinned in memory as long as the // iterator is not deleted, If used when reading from tables created with - // BlockBasedTableOptions::use_delta_encoding = false, Iterator::IsKeyPinned() - // is guaranteed to return true. + // BlockBasedTableOptions::use_delta_encoding = false, + // Iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to + // return 1. // Default: false bool pin_data; diff --git a/include/rocksdb/perf_context.h b/include/rocksdb/perf_context.h index c2af729e3..7cae30aee 100644 --- a/include/rocksdb/perf_context.h +++ b/include/rocksdb/perf_context.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -21,7 +21,7 @@ struct PerfContext { void Reset(); // reset all performance counters to zero - std::string ToString() const; + std::string ToString(bool exclude_zero_counters = false) const; uint64_t user_key_comparison_count; // total number of user key comparisons uint64_t block_cache_hit_count; // total number of block cache hits diff --git a/include/rocksdb/perf_level.h b/include/rocksdb/perf_level.h index fee8ce1c4..61970cf54 100644 --- a/include/rocksdb/perf_level.h +++ b/include/rocksdb/perf_level.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -14,9 +14,11 @@ namespace rocksdb { // How much perf stats to collect. Affects perf_context and iostats_context. enum PerfLevel { - kDisable = 0, // disable perf stats - kEnableCount = 1, // enable only count stats - kEnableTime = 2 // enable time stats too + kDisable = 0, // disable perf stats + kEnableCount = 1, // enable only count stats + kEnableTimeExceptForMutex = 2, // Other than count stats, also enable time + // stats except for mutexes + kEnableTime = 3 // enable count and time stats }; // set the perf stats level for current thread diff --git a/include/rocksdb/rate_limiter.h b/include/rocksdb/rate_limiter.h index ae3ab8f84..b1bf3f427 100644 --- a/include/rocksdb/rate_limiter.h +++ b/include/rocksdb/rate_limiter.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/slice.h b/include/rocksdb/slice.h index 3d39f3a04..3663716dc 100644 --- a/include/rocksdb/slice.h +++ b/include/rocksdb/slice.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/slice_transform.h b/include/rocksdb/slice_transform.h index 3694c5802..d12325812 100644 --- a/include/rocksdb/slice_transform.h +++ b/include/rocksdb/slice_transform.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/snapshot.h b/include/rocksdb/snapshot.h index 95822d297..d8d999dc2 100644 --- a/include/rocksdb/snapshot.h +++ b/include/rocksdb/snapshot.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/sst_dump_tool.h b/include/rocksdb/sst_dump_tool.h index 39bfb519b..0dd94caba 100644 --- a/include/rocksdb/sst_dump_tool.h +++ b/include/rocksdb/sst_dump_tool.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/sst_file_manager.h b/include/rocksdb/sst_file_manager.h new file mode 100644 index 000000000..bee243e4a --- /dev/null +++ b/include/rocksdb/sst_file_manager.h @@ -0,0 +1,80 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
 + +#pragma once + +#include +#include +#include + +#include "rocksdb/status.h" + +namespace rocksdb { + +class Env; +class Logger; + +// SstFileManager is used to track SST files in the DB and control their +// deletion rate. +// All SstFileManager public functions are thread-safe. +class SstFileManager { + public: + virtual ~SstFileManager() {} + + // Update the maximum allowed space that should be used by RocksDB, if + // the total size of the SST files exceeds max_allowed_space, writes to + // RocksDB will fail. + // + // Setting max_allowed_space to 0 will disable this feature, maximum allowed + // space will be infinite (Default value). + // + // thread-safe. + virtual void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) = 0; + + // Return true if the total size of SST files exceeded the maximum allowed + // space usage. + // + // thread-safe. + virtual bool IsMaxAllowedSpaceReached() = 0; + + // Return the total size of all tracked files. + // thread-safe + virtual uint64_t GetTotalSize() = 0; + + // Return a map containing all tracked files and their corresponding sizes. + // thread-safe + virtual std::unordered_map GetTrackedFiles() = 0; + + // Return delete rate limit in bytes per second. + // thread-safe + virtual int64_t GetDeleteRateBytesPerSecond() = 0; +}; + +// Create a new SstFileManager that can be shared among multiple RocksDB +// instances to track SST files and control their deletion rate. +// +// @param env: Pointer to Env object, please see "rocksdb/env.h". +// @param info_log: If not nullptr, info_log will be used to log errors. +// +// == Deletion rate limiting specific arguments == +// @param trash_dir: Path to the directory where deleted files will be moved +// to be deleted in a background thread while applying rate limiting. If this +// directory doesn't exist, it will be created. This directory should not be +// used by any other process or any other SstFileManager, Set to "" to +// disable deletion rate limiting. 
+// @param rate_bytes_per_sec: How many bytes should be deleted per second, If +// this value is set to 1024 (1 Kb / sec) and we deleted a file of size 4 Kb +// in 1 second, we will wait for another 3 seconds before we delete other +// files, Set to 0 to disable deletion rate limiting. +// @param delete_exisitng_trash: If set to true, the newly created +// SstFileManager will delete files that already exist in trash_dir. +// @param status: If not nullptr, status will contain any errors that happened +// during creating the missing trash_dir or deleting existing files in trash. +extern SstFileManager* NewSstFileManager( + Env* env, std::shared_ptr info_log = nullptr, + std::string trash_dir = "", int64_t rate_bytes_per_sec = 0, + bool delete_exisitng_trash = true, Status* status = nullptr); + +} // namespace rocksdb diff --git a/include/rocksdb/sst_file_writer.h b/include/rocksdb/sst_file_writer.h index eb2f89491..fb01feb1f 100644 --- a/include/rocksdb/sst_file_writer.h +++ b/include/rocksdb/sst_file_writer.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 15c49439c..c832516da 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -33,6 +33,8 @@ enum Tickers : uint32_t { BLOCK_CACHE_HIT, // # of blocks added to block cache. BLOCK_CACHE_ADD, + // # of failures when adding blocks to block cache. + BLOCK_CACHE_ADD_FAILURES, // # of times cache miss when accessing index block from block cache. BLOCK_CACHE_INDEX_MISS, // # of times cache hit when accessing index block from block cache. @@ -109,6 +111,7 @@ enum Tickers : uint32_t { // Writer has to wait for compaction or flush to finish. STALL_MICROS, // The wait time for db mutex. + // Disabled by default. To enable it set stats level to kAll DB_MUTEX_WAIT_MICROS, RATE_LIMIT_DELAY_MILLIS, NO_ITERATORS, // number of iterators currently open @@ -139,8 +142,12 @@ enum Tickers : uint32_t { GET_UPDATES_SINCE_CALLS, BLOCK_CACHE_COMPRESSED_MISS, // miss in the compressed block cache BLOCK_CACHE_COMPRESSED_HIT, // hit in the compressed block cache - WAL_FILE_SYNCED, // Number of times WAL sync is done - WAL_FILE_BYTES, // Number of bytes written to WAL + // Number of blocks added to compressed block cache + BLOCK_CACHE_COMPRESSED_ADD, + // Number of failures when adding blocks to compressed block cache + BLOCK_CACHE_COMPRESSED_ADD_FAILURES, + WAL_FILE_SYNCED, // Number of times WAL sync is done + WAL_FILE_BYTES, // Number of bytes written to WAL // Writes can be processed by requesting thread or by the thread at the // head of the writers queue.
@@ -175,6 +182,7 @@ const std::vector> TickersNameMap = { {BLOCK_CACHE_MISS, "rocksdb.block.cache.miss"}, {BLOCK_CACHE_HIT, "rocksdb.block.cache.hit"}, {BLOCK_CACHE_ADD, "rocksdb.block.cache.add"}, + {BLOCK_CACHE_ADD_FAILURES, "rocksdb.block.cache.add.failures"}, {BLOCK_CACHE_INDEX_MISS, "rocksdb.block.cache.index.miss"}, {BLOCK_CACHE_INDEX_HIT, "rocksdb.block.cache.index.hit"}, {BLOCK_CACHE_FILTER_MISS, "rocksdb.block.cache.filter.miss"}, @@ -226,6 +234,9 @@ const std::vector> TickersNameMap = { {GET_UPDATES_SINCE_CALLS, "rocksdb.getupdatessince.calls"}, {BLOCK_CACHE_COMPRESSED_MISS, "rocksdb.block.cachecompressed.miss"}, {BLOCK_CACHE_COMPRESSED_HIT, "rocksdb.block.cachecompressed.hit"}, + {BLOCK_CACHE_COMPRESSED_ADD, "rocksdb.block.cachecompressed.add"}, + {BLOCK_CACHE_COMPRESSED_ADD_FAILURES, + "rocksdb.block.cachecompressed.add.failures"}, {WAL_FILE_SYNCED, "rocksdb.wal.synced"}, {WAL_FILE_BYTES, "rocksdb.wal.bytes"}, {WRITE_DONE_BY_SELF, "rocksdb.write.self"}, @@ -279,6 +290,10 @@ enum Histograms : uint32_t { SST_READ_MICROS, // The number of subcompactions actually scheduled during a compaction NUM_SUBCOMPACTIONS_SCHEDULED, + // Value size distribution in each operation + BYTES_PER_READ, + BYTES_PER_WRITE, + BYTES_PER_MULTIGET, HISTOGRAM_ENUM_MAX, // TODO(ldemailly): enforce HistogramsNameMap match }; @@ -306,6 +321,9 @@ const std::vector> HistogramsNameMap = { {WRITE_STALL, "rocksdb.db.write.stall"}, {SST_READ_MICROS, "rocksdb.sst.read.micros"}, {NUM_SUBCOMPACTIONS_SCHEDULED, "rocksdb.num.subcompactions.scheduled"}, + {BYTES_PER_READ, "rocksdb.bytes.per.read"}, + {BYTES_PER_WRITE, "rocksdb.bytes.per.write"}, + {BYTES_PER_MULTIGET, "rocksdb.bytes.per.multiget"}, }; struct HistogramData { @@ -316,6 +334,16 @@ struct HistogramData { double standard_deviation; }; +enum StatsLevel { + // Collect all stats except the counters requiring to get time inside the + // mutex lock. 
+ kExceptTimeForMutex, + // Collect all stats, including measuring duration of mutex operations. + // If getting time is expensive on the platform to run, it can + // reduce scalability to more threads, especially for writes. + kAll, +}; + // Analyze the performance of a db class Statistics { public: @@ -339,6 +367,8 @@ class Statistics { virtual bool HistEnabledForType(uint32_t type) const { return type < HISTOGRAM_ENUM_MAX; } + + StatsLevel stats_level_ = kExceptTimeForMutex; }; // Create a concrete DBStatistics object diff --git a/include/rocksdb/status.h b/include/rocksdb/status.h index 511f6661f..bff15ee0f 100644 --- a/include/rocksdb/status.h +++ b/include/rocksdb/status.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -244,7 +244,7 @@ noexcept *this = std::move(s); } -inline Status& Status::operator=(Status&& s) +inline Status& Status::operator=(Status&& s) #if !(defined _MSC_VER) || ((defined _MSC_VER) && (_MSC_VER >= 1900)) noexcept #endif diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 2e1a91de9..8aba3a153 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -64,6 +64,12 @@ struct BlockBasedTableOptions { // block during table initialization. bool cache_index_and_filter_blocks = false; + // if cache_index_and_filter_blocks is true and the below is true, then + // filter and index blocks are stored in the cache, but a reference is + // held in the "table reader" object so the blocks are pinned and only + // evicted from cache when the table reader is freed. + bool pin_l0_filter_and_index_blocks_in_cache = false; + // The index type that will be used for this table.
enum IndexType : char { // A space efficient index block that is optimized for @@ -120,6 +126,9 @@ struct BlockBasedTableOptions { // value will be silently overwritten with 1. int block_restart_interval = 16; + // Same as block_restart_interval but used for the index block. + int index_block_restart_interval = 1; + // Use delta encoding to compress keys in blocks. // Iterator::PinData() requires this option to be disabled. // @@ -163,7 +172,7 @@ struct BlockBasedTableOptions { // this. // This option only affects newly written tables. When reading exising tables, // the information about version is read from the footer. - uint32_t format_version = 0; + uint32_t format_version = 2; }; // Table Properties that are specific to block-based table properties. diff --git a/include/rocksdb/thread_status.h b/include/rocksdb/thread_status.h index d8a61b490..0cdea2b51 100644 --- a/include/rocksdb/thread_status.h +++ b/include/rocksdb/thread_status.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/transaction_log.h b/include/rocksdb/transaction_log.h index 1b80b9a0c..1fb93ace1 100644 --- a/include/rocksdb/transaction_log.h +++ b/include/rocksdb/transaction_log.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/include/rocksdb/types.h b/include/rocksdb/types.h index f20bf8277..6a477cab8 100644 --- a/include/rocksdb/types.h +++ b/include/rocksdb/types.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/universal_compaction.h b/include/rocksdb/universal_compaction.h index e0f9f830f..11490e413 100644 --- a/include/rocksdb/universal_compaction.h +++ b/include/rocksdb/universal_compaction.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/utilities/backupable_db.h b/include/rocksdb/utilities/backupable_db.h index f5f394c22..06caa5bb0 100644 --- a/include/rocksdb/utilities/backupable_db.h +++ b/include/rocksdb/utilities/backupable_db.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -88,14 +88,6 @@ struct BackupableDBOptions { // *turn it on only if you know what you're doing* bool share_files_with_checksum; - // Try to use the file size in file name instead of getting size from HDFS, - // if the file is generated with options.share_files_with_checksum = true. - // This is a temporary solution to reduce the backupable Db open latency when - // There are too many sst files. Will remove the option after we have a - // permanent solution. - // Default: false - bool use_file_size_in_file_name; - // Up to this many background threads will copy files for CreateNewBackup() // and RestoreDBFromBackup() // Default: 1 @@ -125,7 +117,6 @@ struct BackupableDBOptions { backup_rate_limit(_backup_rate_limit), restore_rate_limit(_restore_rate_limit), share_files_with_checksum(false), - use_file_size_in_file_name(false), max_background_operations(_max_background_operations), callback_trigger_interval_size(_callback_trigger_interval_size) { assert(share_table_files || !share_files_with_checksum); diff --git a/include/rocksdb/utilities/checkpoint.h b/include/rocksdb/utilities/checkpoint.h index b2d5458e5..b4523c25e 100644 --- a/include/rocksdb/utilities/checkpoint.h +++ b/include/rocksdb/utilities/checkpoint.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/utilities/convenience.h b/include/rocksdb/utilities/convenience.h index fae420b77..b0ac15c6d 100644 --- a/include/rocksdb/utilities/convenience.h +++ b/include/rocksdb/utilities/convenience.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/utilities/db_ttl.h b/include/rocksdb/utilities/db_ttl.h index 4534e1ff7..09107c50c 100644 --- a/include/rocksdb/utilities/db_ttl.h +++ b/include/rocksdb/utilities/db_ttl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/utilities/document_db.h b/include/rocksdb/utilities/document_db.h index 7fde5ec9f..52f225705 100644 --- a/include/rocksdb/utilities/document_db.h +++ b/include/rocksdb/utilities/document_db.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/utilities/flashcache.h b/include/rocksdb/utilities/flashcache.h index 7bb760924..b54d245f0 100644 --- a/include/rocksdb/utilities/flashcache.h +++ b/include/rocksdb/utilities/flashcache.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/include/rocksdb/utilities/geo_db.h b/include/rocksdb/utilities/geo_db.h index d603c5770..37e5ebdc7 100644 --- a/include/rocksdb/utilities/geo_db.h +++ b/include/rocksdb/utilities/geo_db.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/utilities/info_log_finder.h b/include/rocksdb/utilities/info_log_finder.h index 916c54c28..4b7530c28 100644 --- a/include/rocksdb/utilities/info_log_finder.h +++ b/include/rocksdb/utilities/info_log_finder.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/utilities/json_document.h b/include/rocksdb/utilities/json_document.h index a5e3ab256..9473258c8 100644 --- a/include/rocksdb/utilities/json_document.h +++ b/include/rocksdb/utilities/json_document.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/include/rocksdb/utilities/leveldb_options.h b/include/rocksdb/utilities/leveldb_options.h index 8e2c3a1d5..ea5063459 100644 --- a/include/rocksdb/utilities/leveldb_options.h +++ b/include/rocksdb/utilities/leveldb_options.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/utilities/memory_util.h b/include/rocksdb/utilities/memory_util.h index 323a8a127..d89bb6adc 100644 --- a/include/rocksdb/utilities/memory_util.h +++ b/include/rocksdb/utilities/memory_util.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/utilities/optimistic_transaction_db.h b/include/rocksdb/utilities/optimistic_transaction_db.h index 772e64549..b2c2f99a8 100644 --- a/include/rocksdb/utilities/optimistic_transaction_db.h +++ b/include/rocksdb/utilities/optimistic_transaction_db.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -43,15 +43,19 @@ class OptimisticTransactionDB { virtual ~OptimisticTransactionDB() {} - // Starts a new Transaction. 
Passing set_snapshot=true has the same effect - // as calling SetSnapshot(). + // Starts a new Transaction. // - // Caller should delete the returned transaction after calling - // Commit() or Rollback(). + // Caller is responsible for deleting the returned transaction when no + // longer needed. + // + // If old_txn is not null, BeginTransaction will reuse this Transaction + // handle instead of allocating a new one. This is an optimization to avoid + // extra allocations when repeatedly creating transactions. virtual Transaction* BeginTransaction( const WriteOptions& write_options, - const OptimisticTransactionOptions& - txn_options = OptimisticTransactionOptions()) = 0; + const OptimisticTransactionOptions& txn_options = + OptimisticTransactionOptions(), + Transaction* old_txn = nullptr) = 0; // Return the underlying Database that was opened virtual DB* GetBaseDB() = 0; diff --git a/include/rocksdb/utilities/options_util.h b/include/rocksdb/utilities/options_util.h index a2c09fe31..1d961a2bb 100644 --- a/include/rocksdb/utilities/options_util.h +++ b/include/rocksdb/utilities/options_util.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -44,6 +44,12 @@ namespace rocksdb { // examples/options_file_example.cc demonstrates how to use this function // to open a RocksDB instance. // +// @return the function returns an OK status when it went successfully. If +// the specified "dbpath" does not contain any option file, then a +// Status::NotFound will be returned. A return value other than +// Status::OK or Status::NotFound indicates there're some error related +// to the options file itself. 
+// // @see LoadOptionsFromFile Status LoadLatestOptions(const std::string& dbpath, Env* env, DBOptions* db_options, diff --git a/include/rocksdb/utilities/spatial_db.h b/include/rocksdb/utilities/spatial_db.h index 50abbf446..108915fd7 100644 --- a/include/rocksdb/utilities/spatial_db.h +++ b/include/rocksdb/utilities/spatial_db.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/utilities/table_properties_collectors.h b/include/rocksdb/utilities/table_properties_collectors.h index d31baf9a0..68a88e718 100644 --- a/include/rocksdb/utilities/table_properties_collectors.h +++ b/include/rocksdb/utilities/table_properties_collectors.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/utilities/transaction.h b/include/rocksdb/utilities/transaction.h index 8e9ead11c..4ccbb7fb9 100644 --- a/include/rocksdb/utilities/transaction.h +++ b/include/rocksdb/utilities/transaction.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -356,6 +356,28 @@ class Transaction { // Reset the WriteOptions that will be used during Commit(). virtual void SetWriteOptions(const WriteOptions& write_options) = 0; + // If this key was previously fetched in this transaction using + // GetForUpdate/MultigetForUpdate(), calling UndoGetForUpdate will tell + // the transaction that it no longer needs to do any conflict checking + // for this key. + // + // If a key has been fetched N times via GetForUpdate/MultigetForUpdate(), + // then UndoGetForUpdate will only have an effect if it is also called N + // times. If this key has been written to in this transaction, + // UndoGetForUpdate() will have no effect. + // + // If SetSavePoint() has been called after the GetForUpdate(), + // UndoGetForUpdate() will not have any effect. + // + // If this Transaction was created by an OptimisticTransactionDB, + // calling UndoGetForUpdate can affect whether this key is conflict checked + // at commit time. + // If this Transaction was created by a TransactionDB, + // calling UndoGetForUpdate may release any held locks for this key. + virtual void UndoGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) = 0; + virtual void UndoGetForUpdate(const Slice& key) = 0; + protected: explicit Transaction(const TransactionDB* db) {} Transaction() {} diff --git a/include/rocksdb/utilities/transaction_db.h b/include/rocksdb/utilities/transaction_db.h index f9023fc21..ff29bc57a 100644 --- a/include/rocksdb/utilities/transaction_db.h +++ b/include/rocksdb/utilities/transaction_db.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -92,8 +92,6 @@ struct TransactionOptions { // will never relinquish any locks it holds. This could prevent keys from // being // written by other writers. - // - // TODO(agiardullo): Improve performance of checking expiration time. int64_t expiration = -1; }; @@ -113,14 +111,18 @@ class TransactionDB : public StackableDB { virtual ~TransactionDB() {} - // Starts a new Transaction. Passing set_snapshot=true has the same effect - // as calling Transaction::SetSnapshot(). + // Starts a new Transaction. + // + // Caller is responsible for deleting the returned transaction when no + // longer needed. // - // Caller should delete the returned transaction after calling - // Transaction::Commit() or Transaction::Rollback(). + // If old_txn is not null, BeginTransaction will reuse this Transaction + // handle instead of allocating a new one. This is an optimization to avoid + // extra allocations when repeatedly creating transactions. virtual Transaction* BeginTransaction( const WriteOptions& write_options, - const TransactionOptions& txn_options = TransactionOptions()) = 0; + const TransactionOptions& txn_options = TransactionOptions(), + Transaction* old_txn = nullptr) = 0; protected: // To Create an TransactionDB, call Open() diff --git a/include/rocksdb/utilities/transaction_db_mutex.h b/include/rocksdb/utilities/transaction_db_mutex.h index d9274df50..cedf54295 100644 --- a/include/rocksdb/utilities/transaction_db_mutex.h +++ b/include/rocksdb/utilities/transaction_db_mutex.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/include/rocksdb/utilities/write_batch_with_index.h b/include/rocksdb/utilities/write_batch_with_index.h index 1e41e7869..aab12ba02 100644 --- a/include/rocksdb/utilities/write_batch_with_index.h +++ b/include/rocksdb/utilities/write_batch_with_index.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index d8e93db42..8f6b899e1 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -1,11 +1,11 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. #pragma once #define ROCKSDB_MAJOR 4 -#define ROCKSDB_MINOR 4 +#define ROCKSDB_MINOR 6 #define ROCKSDB_PATCH 0 // Do not use these. We made the mistake of declaring macros starting with diff --git a/include/rocksdb/wal_filter.h b/include/rocksdb/wal_filter.h index c164e4d47..8b032bb9d 100644 --- a/include/rocksdb/wal_filter.h +++ b/include/rocksdb/wal_filter.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index f4a7ac06e..e9bd72b58 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/include/rocksdb/write_batch_base.h b/include/rocksdb/write_batch_base.h index c4083754d..86ccbaa18 100644 --- a/include/rocksdb/write_batch_base.h +++ b/include/rocksdb/write_batch_base.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/java/CMakeLists.txt b/java/CMakeLists.txt new file mode 100644 index 000000000..d4a707b3c --- /dev/null +++ b/java/CMakeLists.txt @@ -0,0 +1,155 @@ +set(JNI_NATIVE_SOURCES + rocksjni/backupenginejni.cc + rocksjni/backupablejni.cc + rocksjni/checkpoint.cc + rocksjni/columnfamilyhandle.cc + rocksjni/compaction_filter.cc + rocksjni/comparator.cc + rocksjni/comparatorjnicallback.cc + rocksjni/env.cc + rocksjni/filter.cc + rocksjni/iterator.cc + rocksjni/loggerjnicallback.cc + rocksjni/memtablejni.cc + rocksjni/merge_operator.cc + rocksjni/options.cc + rocksjni/ratelimiterjni.cc + rocksjni/remove_emptyvalue_compactionfilterjni.cc + rocksjni/restorejni.cc + rocksjni/rocksjni.cc + rocksjni/slice.cc + rocksjni/snapshot.cc + rocksjni/statistics.cc + rocksjni/table.cc + rocksjni/transaction_log.cc + rocksjni/ttl.cc + rocksjni/write_batch.cc + rocksjni/writebatchhandlerjnicallback.cc + rocksjni/write_batch_with_index.cc + rocksjni/write_batch_test.cc +) + +set(NATIVE_JAVA_CLASSES + org.rocksdb.AbstractCompactionFilter + org.rocksdb.AbstractComparator + org.rocksdb.AbstractSlice + org.rocksdb.BackupEngine + org.rocksdb.BackupableDB + org.rocksdb.BackupableDBOptions + org.rocksdb.BlockBasedTableConfig + org.rocksdb.BloomFilter + org.rocksdb.Checkpoint + org.rocksdb.ColumnFamilyHandle + org.rocksdb.ColumnFamilyOptions + org.rocksdb.Comparator + org.rocksdb.ComparatorOptions + org.rocksdb.DBOptions + org.rocksdb.DirectComparator + org.rocksdb.DirectSlice + org.rocksdb.Env + org.rocksdb.FlushOptions + org.rocksdb.Filter + org.rocksdb.GenericRateLimiterConfig + org.rocksdb.HashLinkedListMemTableConfig + org.rocksdb.HashSkipListMemTableConfig + org.rocksdb.Logger + org.rocksdb.MergeOperator + org.rocksdb.Options + org.rocksdb.PlainTableConfig + org.rocksdb.ReadOptions + org.rocksdb.RemoveEmptyValueCompactionFilter + org.rocksdb.RestoreBackupableDB + org.rocksdb.RestoreOptions + org.rocksdb.RocksDB + org.rocksdb.RocksEnv + org.rocksdb.RocksIterator + 
org.rocksdb.RocksMemEnv + org.rocksdb.SkipListMemTableConfig + org.rocksdb.Slice + org.rocksdb.Statistics + org.rocksdb.TransactionLogIterator + org.rocksdb.TtlDB + org.rocksdb.VectorMemTableConfig + org.rocksdb.Snapshot + org.rocksdb.StringAppendOperator + org.rocksdb.WriteBatch + org.rocksdb.WriteBatch.Handler + org.rocksdb.WriteOptions + org.rocksdb.WriteBatchWithIndex + org.rocksdb.WBWIRocksIterator + org.rocksdb.WriteBatchTest + org.rocksdb.WriteBatchTestInternalHelper +) + +include_directories($ENV{JAVA_HOME}/include) +include_directories($ENV{JAVA_HOME}/include/win32) +include_directories(${PROJECT_SOURCE_DIR}/java) + +set(JAVA_TEST_LIBDIR ${PROJECT_SOURCE_DIR}/java/test-libs) +set(JAVA_TMP_JAR ${JAVA_TEST_LIBDIR}/tmp.jar) +set(JAVA_JUNIT_JAR ${JAVA_TEST_LIBDIR}/junit-4.12.jar) +set(JAVA_HAMCR_JAR ${JAVA_TEST_LIBDIR}/hamcrest-core-1.3.jar) +set(JAVA_MOCKITO_JAR ${JAVA_TEST_LIBDIR}/mockito-all-1.10.19.jar) +set(JAVA_CGLIB_JAR ${JAVA_TEST_LIBDIR}/cglib-2.2.2.jar) +set(JAVA_ASSERTJ_JAR ${JAVA_TEST_LIBDIR}/assertj-core-1.7.1.jar) +set(JAVA_TESTCLASSPATH "${JAVA_JUNIT_JAR}\;${JAVA_HAMCR_JAR}\;${JAVA_MOCKITO_JAR}\;${JAVA_CGLIB_JAR}\;${JAVA_ASSERTJ_JAR}") + +if(NOT EXISTS ${PROJECT_SOURCE_DIR}/java/classes) + execute_process(COMMAND mkdir ${PROJECT_SOURCE_DIR}/java/classes) +endif() + +if(NOT EXISTS ${JAVA_TEST_LIBDIR}) + execute_process(COMMAND mkdir ${JAVA_TEST_LIBDIR}) +endif() + +if(NOT EXISTS ${JAVA_JUNIT_JAR}) + message("Downloading ${JAVA_JUNIT_JAR}") + file(DOWNLOAD http://search.maven.org/remotecontent?filepath=junit/junit/4.12/junit-4.12.jar ${JAVA_TMP_JAR} STATUS downloadStatus) + list(GET downloadStatus 0 error_code) + if(NOT error_code EQUAL 0) + message(FATAL_ERROR "Failed downloading ${JAVA_JUNIT_JAR}") + endif() + file(RENAME ${JAVA_TMP_JAR} ${JAVA_JUNIT_JAR}) +endif() +if(NOT EXISTS ${JAVA_HAMCR_JAR}) + message("Downloading ${JAVA_HAMCR_JAR}") + file(DOWNLOAD 
http://search.maven.org/remotecontent?filepath=org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar ${JAVA_TMP_JAR} STATUS downloadStatus) + list(GET downloadStatus 0 error_code) + if(NOT error_code EQUAL 0) + message(FATAL_ERROR "Failed downloading ${JAVA_HAMCR_JAR}") + endif() + file(RENAME ${JAVA_TMP_JAR} ${JAVA_HAMCR_JAR}) +endif() +if(NOT EXISTS ${JAVA_MOCKITO_JAR}) + message("Downloading ${JAVA_MOCKITO_JAR}") + file(DOWNLOAD http://search.maven.org/remotecontent?filepath=org/mockito/mockito-all/1.10.19/mockito-all-1.10.19.jar ${JAVA_TMP_JAR} STATUS downloadStatus) + list(GET downloadStatus 0 error_code) + if(NOT error_code EQUAL 0) + message(FATAL_ERROR "Failed downloading ${JAVA_MOCKITO_JAR}") + endif() + file(RENAME ${JAVA_TMP_JAR} ${JAVA_MOCKITO_JAR}) +endif() +if(NOT EXISTS ${JAVA_CGLIB_JAR}) + message("Downloading ${JAVA_CGLIB_JAR}") + file(DOWNLOAD http://search.maven.org/remotecontent?filepath=cglib/cglib/2.2.2/cglib-2.2.2.jar ${JAVA_TMP_JAR} STATUS downloadStatus) + list(GET downloadStatus 0 error_code) + if(NOT error_code EQUAL 0) + message(FATAL_ERROR "Failed downloading ${JAVA_CGLIB_JAR}") + endif() + file(RENAME ${JAVA_TMP_JAR} ${JAVA_CGLIB_JAR}) +endif() +if(NOT EXISTS ${JAVA_ASSERTJ_JAR}) + message("Downloading ${JAVA_ASSERTJ_JAR}") + file(DOWNLOAD http://central.maven.org/maven2/org/assertj/assertj-core/1.7.1/assertj-core-1.7.1.jar ${JAVA_TMP_JAR} STATUS downloadStatus) + list(GET downloadStatus 0 error_code) + if(NOT error_code EQUAL 0) + message(FATAL_ERROR "Failed downloading ${JAVA_ASSERTJ_JAR}") + endif() + file(RENAME ${JAVA_TMP_JAR} ${JAVA_ASSERTJ_JAR}) +endif() + +execute_process(COMMAND javac -cp ${JAVA_TESTCLASSPATH} -d ${PROJECT_SOURCE_DIR}/java/classes ${PROJECT_SOURCE_DIR}/java/src/main/java/org/rocksdb/util/*.java ${PROJECT_SOURCE_DIR}/java/src/main/java/org/rocksdb/*.java ${PROJECT_SOURCE_DIR}/java/src/test/java/org/rocksdb/*.java) +execute_process(COMMAND javah -cp ${PROJECT_SOURCE_DIR}/java/classes -d 
${PROJECT_SOURCE_DIR}/java/include -jni ${NATIVE_JAVA_CLASSES}) +add_library(rocksdbjni${ARTIFACT_SUFFIX} SHARED ${JNI_NATIVE_SOURCES}) +set_target_properties(rocksdbjni${ARTIFACT_SUFFIX} PROPERTIES COMPILE_FLAGS "/Fd${CMAKE_CFG_INTDIR}/rocksdbjni${ARTIFACT_SUFFIX}.pdb") +target_link_libraries(rocksdbjni${ARTIFACT_SUFFIX} rocksdblib${ARTIFACT_SUFFIX} ${LIBS}) diff --git a/java/Makefile b/java/Makefile index abc8f73ee..bffca4b27 100644 --- a/java/Makefile +++ b/java/Makefile @@ -99,6 +99,7 @@ JAVA_TESTS = org.rocksdb.BackupableDBOptionsTest\ org.rocksdb.StatisticsCollectorTest\ org.rocksdb.WriteBatchHandlerTest\ org.rocksdb.WriteBatchTest\ + org.rocksdb.WriteBatchThreadedTest\ org.rocksdb.WriteOptionsTest\ org.rocksdb.WriteBatchWithIndexTest diff --git a/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java b/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java index 14eea09e9..fde2824b7 100644 --- a/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java +++ b/java/benchmark/src/main/java/org/rocksdb/benchmark/DbBenchmark.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/rocksjni/backupablejni.cc b/java/rocksjni/backupablejni.cc index d26e46e88..f2304dadb 100644 --- a/java/rocksjni/backupablejni.cc +++ b/java/rocksjni/backupablejni.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -103,20 +103,14 @@ jintArray Java_org_rocksdb_BackupableDB_getCorruptedBackups( reinterpret_cast(jhandle)-> GetCorruptedBackups(&backup_ids); // store backupids in int array - const std::vector::size_type - kIdSize = backup_ids.size(); - int int_backup_ids[kIdSize]; - for (std::vector::size_type i = 0; - i != kIdSize; i++) { - int_backup_ids[i] = backup_ids[i]; - } + std::vector int_backup_ids(backup_ids.begin(), backup_ids.end()); // Store ints in java array jintArray ret_backup_ids; // Its ok to loose precision here (64->32) - jsize ret_backup_ids_size = static_cast(kIdSize); + jsize ret_backup_ids_size = static_cast(backup_ids.size()); ret_backup_ids = env->NewIntArray(ret_backup_ids_size); env->SetIntArrayRegion(ret_backup_ids, 0, ret_backup_ids_size, - int_backup_ids); + int_backup_ids.data()); return ret_backup_ids; } diff --git a/java/rocksjni/backupenginejni.cc b/java/rocksjni/backupenginejni.cc index 750ab965a..a42399873 100644 --- a/java/rocksjni/backupenginejni.cc +++ b/java/rocksjni/backupenginejni.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -81,20 +81,14 @@ jintArray Java_org_rocksdb_BackupEngine_getCorruptedBackups( std::vector backup_ids; backup_engine->GetCorruptedBackups(&backup_ids); // store backupids in int array - const std::vector::size_type - kIdSize = backup_ids.size(); - int int_backup_ids[kIdSize]; - for (std::vector::size_type i = 0; - i != kIdSize; i++) { - int_backup_ids[i] = backup_ids[i]; - } + std::vector int_backup_ids(backup_ids.begin(), backup_ids.end()); // Store ints in java array jintArray ret_backup_ids; // Its ok to loose precision here (64->32) - jsize ret_backup_ids_size = static_cast(kIdSize); + jsize ret_backup_ids_size = static_cast(backup_ids.size()); ret_backup_ids = env->NewIntArray(ret_backup_ids_size); env->SetIntArrayRegion(ret_backup_ids, 0, ret_backup_ids_size, - int_backup_ids); + int_backup_ids.data()); return ret_backup_ids; } diff --git a/java/rocksjni/checkpoint.cc b/java/rocksjni/checkpoint.cc index 72a40be00..45f0fde6b 100644 --- a/java/rocksjni/checkpoint.cc +++ b/java/rocksjni/checkpoint.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/rocksjni/columnfamilyhandle.cc b/java/rocksjni/columnfamilyhandle.cc index be3b4c82f..2a874b1d9 100644 --- a/java/rocksjni/columnfamilyhandle.cc +++ b/java/rocksjni/columnfamilyhandle.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/java/rocksjni/compaction_filter.cc b/java/rocksjni/compaction_filter.cc index 5fa52c0dc..20b36a412 100644 --- a/java/rocksjni/compaction_filter.cc +++ b/java/rocksjni/compaction_filter.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/rocksjni/comparator.cc b/java/rocksjni/comparator.cc index 196376235..8765daa34 100644 --- a/java/rocksjni/comparator.cc +++ b/java/rocksjni/comparator.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/rocksjni/comparatorjnicallback.cc b/java/rocksjni/comparatorjnicallback.cc index a85b45085..1c0317003 100644 --- a/java/rocksjni/comparatorjnicallback.cc +++ b/java/rocksjni/comparatorjnicallback.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/rocksjni/comparatorjnicallback.h b/java/rocksjni/comparatorjnicallback.h index 65b986ca4..821a91e45 100644 --- a/java/rocksjni/comparatorjnicallback.h +++ b/java/rocksjni/comparatorjnicallback.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/rocksjni/env.cc b/java/rocksjni/env.cc index b50d5ae30..a58f54ea7 100644 --- a/java/rocksjni/env.cc +++ b/java/rocksjni/env.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/rocksjni/filter.cc b/java/rocksjni/filter.cc index 2ce17d499..2b662d03f 100644 --- a/java/rocksjni/filter.cc +++ b/java/rocksjni/filter.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/rocksjni/iterator.cc b/java/rocksjni/iterator.cc index e9eb0bb37..c5e64adfb 100644 --- a/java/rocksjni/iterator.cc +++ b/java/rocksjni/iterator.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/java/rocksjni/loggerjnicallback.cc b/java/rocksjni/loggerjnicallback.cc index 71e50b9a9..56857b750 100644 --- a/java/rocksjni/loggerjnicallback.cc +++ b/java/rocksjni/loggerjnicallback.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/rocksjni/loggerjnicallback.h b/java/rocksjni/loggerjnicallback.h index 3936252bc..2355a3985 100644 --- a/java/rocksjni/loggerjnicallback.h +++ b/java/rocksjni/loggerjnicallback.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/rocksjni/memtablejni.cc b/java/rocksjni/memtablejni.cc index ce27f9769..ead038d50 100644 --- a/java/rocksjni/memtablejni.cc +++ b/java/rocksjni/memtablejni.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index de3df942c..9cb466538 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -8,7 +8,6 @@ #include #include #include -#include #include #include "include/org_rocksdb_Options.h" @@ -1180,7 +1179,7 @@ jbyte Java_org_rocksdb_Options_compactionStyle( void Java_org_rocksdb_Options_setMaxTableFilesSizeFIFO( JNIEnv* env, jobject jobj, jlong jhandle, jlong jmax_table_files_size) { reinterpret_cast(jhandle)->compaction_options_fifo.max_table_files_size = - static_cast(jmax_table_files_size); + static_cast(jmax_table_files_size); } /* @@ -2339,7 +2338,7 @@ jbyte Java_org_rocksdb_ColumnFamilyOptions_compactionStyle( void Java_org_rocksdb_ColumnFamilyOptions_setMaxTableFilesSizeFIFO( JNIEnv* env, jobject jobj, jlong jhandle, jlong jmax_table_files_size) { reinterpret_cast(jhandle)->compaction_options_fifo.max_table_files_size = - static_cast(jmax_table_files_size); + static_cast(jmax_table_files_size); } /* diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index 804bbc68a..0c5a9245f 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -24,6 +24,11 @@ #include "rocksjni/loggerjnicallback.h" #include "rocksjni/writebatchhandlerjnicallback.h" +// Remove macro on windows +#ifdef DELETE +#undef DELETE +#endif + namespace rocksdb { // Detect if jlong overflows size_t diff --git a/java/rocksjni/ratelimiterjni.cc b/java/rocksjni/ratelimiterjni.cc index ab6160e0d..7b4bc1f22 100644 --- a/java/rocksjni/ratelimiterjni.cc +++ b/java/rocksjni/ratelimiterjni.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/rocksjni/remove_emptyvalue_compactionfilterjni.cc b/java/rocksjni/remove_emptyvalue_compactionfilterjni.cc index e442d8daf..3cf7b3a03 100644 --- a/java/rocksjni/remove_emptyvalue_compactionfilterjni.cc +++ b/java/rocksjni/remove_emptyvalue_compactionfilterjni.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/rocksjni/restorejni.cc b/java/rocksjni/restorejni.cc index a2341632b..40b13dac5 100644 --- a/java/rocksjni/restorejni.cc +++ b/java/rocksjni/restorejni.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -156,21 +156,14 @@ jintArray Java_org_rocksdb_RestoreBackupableDB_getCorruptedBackups( reinterpret_cast(jhandle)-> GetCorruptedBackups(&backup_ids); // store backupids in int array - const std::vector::size_type - kIdSize = backup_ids.size(); - - int int_backup_ids[kIdSize]; - for (std::vector::size_type i = 0; - i != kIdSize; i++) { - int_backup_ids[i] = backup_ids[i]; - } + std::vector int_backup_ids(backup_ids.begin(), backup_ids.end()); // Store ints in java array jintArray ret_backup_ids; // Its ok to loose precision here (64->32) - jsize ret_backup_ids_size = static_cast(kIdSize); + jsize ret_backup_ids_size = static_cast(backup_ids.size()); ret_backup_ids = env->NewIntArray(ret_backup_ids_size); env->SetIntArrayRegion(ret_backup_ids, 0, ret_backup_ids_size, - int_backup_ids); + int_backup_ids.data()); return ret_backup_ids; } diff --git a/java/rocksjni/rocksjni.cc b/java/rocksjni/rocksjni.cc index 221e7fff2..d9c0c6147 100644 --- a/java/rocksjni/rocksjni.cc +++ b/java/rocksjni/rocksjni.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -12,6 +12,7 @@ #include #include #include +#include #include "include/org_rocksdb_RocksDB.h" #include "rocksdb/db.h" @@ -19,6 +20,10 @@ #include "rocksdb/types.h" #include "rocksjni/portal.h" +#ifdef min +#undef min +#endif + ////////////////////////////////////////////////////////////////////////////// // rocksdb::DB::Open @@ -688,8 +693,8 @@ jint rocksdb_get_helper( return kStatusError; } - int cvalue_len = static_cast(cvalue.size()); - int length = std::min(jentry_value_len, cvalue_len); + jint cvalue_len = static_cast(cvalue.size()); + jint length = std::min(jentry_value_len, cvalue_len); env->SetByteArrayRegion( jentry_value, 0, length, diff --git a/java/rocksjni/slice.cc b/java/rocksjni/slice.cc index 811117397..5e05e46f7 100644 --- a/java/rocksjni/slice.cc +++ b/java/rocksjni/slice.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/rocksjni/snapshot.cc b/java/rocksjni/snapshot.cc index cd10c97c8..fa8ede7ab 100644 --- a/java/rocksjni/snapshot.cc +++ b/java/rocksjni/snapshot.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/rocksjni/statistics.cc b/java/rocksjni/statistics.cc index bf170c6de..6d1ef8db0 100644 --- a/java/rocksjni/statistics.cc +++ b/java/rocksjni/statistics.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -20,7 +20,7 @@ * Signature: (IJ)J */ jlong Java_org_rocksdb_Statistics_getTickerCount0( - JNIEnv* env, jobject jobj, int tickerType, jlong handle) { + JNIEnv* env, jobject jobj, jint tickerType, jlong handle) { auto st = reinterpret_cast(handle); assert(st != nullptr); @@ -33,7 +33,7 @@ jlong Java_org_rocksdb_Statistics_getTickerCount0( * Signature: (IJ)Lorg/rocksdb/HistogramData; */ jobject Java_org_rocksdb_Statistics_geHistogramData0( - JNIEnv* env, jobject jobj, int histogramType, jlong handle) { + JNIEnv* env, jobject jobj, jint histogramType, jlong handle) { auto st = reinterpret_cast(handle); assert(st != nullptr); diff --git a/java/rocksjni/table.cc b/java/rocksjni/table.cc index e78e7e0d7..204d1ba38 100644 --- a/java/rocksjni/table.cc +++ b/java/rocksjni/table.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -38,13 +38,14 @@ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle( /* * Class: org_rocksdb_BlockBasedTableConfig * Method: newTableFactoryHandle - * Signature: (ZJIJIIZIZZJIBBI)J + * Signature: (ZJIJIIZIZZZJIBBI)J */ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( JNIEnv* env, jobject jobj, jboolean no_block_cache, jlong block_cache_size, jint block_cache_num_shardbits, jlong block_size, jint block_size_deviation, jint block_restart_interval, jboolean whole_key_filtering, jlong jfilterPolicy, jboolean cache_index_and_filter_blocks, + jboolean pin_l0_filter_and_index_blocks_in_cache, jboolean hash_index_allow_collision, jlong block_cache_compressed_size, jint block_cache_compressd_num_shard_bits, jbyte jchecksum_type, jbyte jindex_type, jint jformat_version) { @@ -70,6 +71,8 @@ jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle( options.filter_policy = *pFilterPolicy; } options.cache_index_and_filter_blocks = cache_index_and_filter_blocks; + options.pin_l0_filter_and_index_blocks_in_cache = + pin_l0_filter_and_index_blocks_in_cache; options.hash_index_allow_collision = hash_index_allow_collision; if (block_cache_compressed_size > 0) { if (block_cache_compressd_num_shard_bits > 0) { diff --git a/java/rocksjni/transaction_log.cc b/java/rocksjni/transaction_log.cc index 1d3d7c100..eed8d84b5 100644 --- a/java/rocksjni/transaction_log.cc +++ b/java/rocksjni/transaction_log.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/java/rocksjni/ttl.cc b/java/rocksjni/ttl.cc index ec5b419f1..219e6c4db 100644 --- a/java/rocksjni/ttl.cc +++ b/java/rocksjni/ttl.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/rocksjni/write_batch.cc b/java/rocksjni/write_batch.cc index dc3f6d2c6..83d2e6dfe 100644 --- a/java/rocksjni/write_batch.cc +++ b/java/rocksjni/write_batch.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/rocksjni/write_batch_test.cc b/java/rocksjni/write_batch_test.cc index 2690f619e..9b4c7fd61 100644 --- a/java/rocksjni/write_batch_test.cc +++ b/java/rocksjni/write_batch_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -60,7 +60,8 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents( for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { rocksdb::ParsedInternalKey ikey; memset(reinterpret_cast(&ikey), 0, sizeof(ikey)); - assert(rocksdb::ParseInternalKey(iter->key(), &ikey)); + bool parsed = rocksdb::ParseInternalKey(iter->key(), &ikey); + assert(parsed); switch (ikey.type) { case rocksdb::kTypeValue: state.append("Put("); diff --git a/java/rocksjni/write_batch_with_index.cc b/java/rocksjni/write_batch_with_index.cc index 7c57a0e06..51296427e 100644 --- a/java/rocksjni/write_batch_with_index.cc +++ b/java/rocksjni/write_batch_with_index.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/rocksjni/writebatchhandlerjnicallback.cc b/java/rocksjni/writebatchhandlerjnicallback.cc index b12e35544..b25236518 100644 --- a/java/rocksjni/writebatchhandlerjnicallback.cc +++ b/java/rocksjni/writebatchhandlerjnicallback.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/rocksjni/writebatchhandlerjnicallback.h b/java/rocksjni/writebatchhandlerjnicallback.h index 9a2a47e80..1c421db03 100644 --- a/java/rocksjni/writebatchhandlerjnicallback.h +++ b/java/rocksjni/writebatchhandlerjnicallback.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/samples/src/main/java/RocksDBColumnFamilySample.java b/java/samples/src/main/java/RocksDBColumnFamilySample.java index da9f4d28b..8d682928c 100644 --- a/java/samples/src/main/java/RocksDBColumnFamilySample.java +++ b/java/samples/src/main/java/RocksDBColumnFamilySample.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/samples/src/main/java/RocksDBSample.java b/java/samples/src/main/java/RocksDBSample.java index 402fd8f89..3ac17777d 100644 --- a/java/samples/src/main/java/RocksDBSample.java +++ b/java/samples/src/main/java/RocksDBSample.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java b/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java index 2b78deddb..1ecedf156 100644 --- a/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java +++ b/java/src/main/java/org/rocksdb/AbstractCompactionFilter.java @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/AbstractComparator.java b/java/src/main/java/org/rocksdb/AbstractComparator.java index c2412d7f2..04a26bfba 100644 --- a/java/src/main/java/org/rocksdb/AbstractComparator.java +++ b/java/src/main/java/org/rocksdb/AbstractComparator.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/AbstractRocksIterator.java b/java/src/main/java/org/rocksdb/AbstractRocksIterator.java index f3f89a671..b7419cba9 100644 --- a/java/src/main/java/org/rocksdb/AbstractRocksIterator.java +++ b/java/src/main/java/org/rocksdb/AbstractRocksIterator.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/AbstractSlice.java b/java/src/main/java/org/rocksdb/AbstractSlice.java index a37bd023e..ea77f5384 100644 --- a/java/src/main/java/org/rocksdb/AbstractSlice.java +++ b/java/src/main/java/org/rocksdb/AbstractSlice.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/AbstractWriteBatch.java b/java/src/main/java/org/rocksdb/AbstractWriteBatch.java index b380c5d8a..984e400ab 100644 --- a/java/src/main/java/org/rocksdb/AbstractWriteBatch.java +++ b/java/src/main/java/org/rocksdb/AbstractWriteBatch.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/BackupEngine.java b/java/src/main/java/org/rocksdb/BackupEngine.java index 2f944e5fb..4791719aa 100644 --- a/java/src/main/java/org/rocksdb/BackupEngine.java +++ b/java/src/main/java/org/rocksdb/BackupEngine.java @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/BackupInfo.java b/java/src/main/java/org/rocksdb/BackupInfo.java index 48a52a789..4f3a62845 100644 --- a/java/src/main/java/org/rocksdb/BackupInfo.java +++ b/java/src/main/java/org/rocksdb/BackupInfo.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/BackupableDB.java b/java/src/main/java/org/rocksdb/BackupableDB.java index f2646d22a..6de20736f 100644 --- a/java/src/main/java/org/rocksdb/BackupableDB.java +++ b/java/src/main/java/org/rocksdb/BackupableDB.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/BackupableDBOptions.java b/java/src/main/java/org/rocksdb/BackupableDBOptions.java index 17a0afc28..d32f2db8c 100644 --- a/java/src/main/java/org/rocksdb/BackupableDBOptions.java +++ b/java/src/main/java/org/rocksdb/BackupableDBOptions.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java index c3c6309b3..f569e6f42 100644 --- a/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java +++ b/java/src/main/java/org/rocksdb/BlockBasedTableConfig.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/BloomFilter.java b/java/src/main/java/org/rocksdb/BloomFilter.java index 67c45d717..2c9585f71 100644 --- a/java/src/main/java/org/rocksdb/BloomFilter.java +++ b/java/src/main/java/org/rocksdb/BloomFilter.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/BuiltinComparator.java b/java/src/main/java/org/rocksdb/BuiltinComparator.java index ee92e8dd9..436cb513f 100644 --- a/java/src/main/java/org/rocksdb/BuiltinComparator.java +++ b/java/src/main/java/org/rocksdb/BuiltinComparator.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/Checkpoint.java b/java/src/main/java/org/rocksdb/Checkpoint.java index 816eceacf..9faa355e1 100644 --- a/java/src/main/java/org/rocksdb/Checkpoint.java +++ b/java/src/main/java/org/rocksdb/Checkpoint.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/ChecksumType.java b/java/src/main/java/org/rocksdb/ChecksumType.java index e685376bf..7f560170c 100644 --- a/java/src/main/java/org/rocksdb/ChecksumType.java +++ b/java/src/main/java/org/rocksdb/ChecksumType.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java b/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java index 8def05e74..84581f465 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyDescriptor.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java b/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java index 613cb892c..d414ee587 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyHandle.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java index 4304f589a..612efbe7f 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyOptions.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java b/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java index c4d7245a1..9856ec686 100644 --- a/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/ColumnFamilyOptionsInterface.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/CompactionStyle.java b/java/src/main/java/org/rocksdb/CompactionStyle.java index 76064395c..22dc7dcf5 100644 --- a/java/src/main/java/org/rocksdb/CompactionStyle.java +++ b/java/src/main/java/org/rocksdb/CompactionStyle.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/Comparator.java b/java/src/main/java/org/rocksdb/Comparator.java index c8e050bca..41f7fbc93 100644 --- a/java/src/main/java/org/rocksdb/Comparator.java +++ b/java/src/main/java/org/rocksdb/Comparator.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/CompressionType.java b/java/src/main/java/org/rocksdb/CompressionType.java index ec0c42f4d..b4d86166e 100644 --- a/java/src/main/java/org/rocksdb/CompressionType.java +++ b/java/src/main/java/org/rocksdb/CompressionType.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/DBOptions.java b/java/src/main/java/org/rocksdb/DBOptions.java index 85aad1e72..d2e1bf94c 100644 --- a/java/src/main/java/org/rocksdb/DBOptions.java +++ b/java/src/main/java/org/rocksdb/DBOptions.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/DBOptionsInterface.java b/java/src/main/java/org/rocksdb/DBOptionsInterface.java index 0c230e436..917e26ab0 100644 --- a/java/src/main/java/org/rocksdb/DBOptionsInterface.java +++ b/java/src/main/java/org/rocksdb/DBOptionsInterface.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/DirectComparator.java b/java/src/main/java/org/rocksdb/DirectComparator.java index 47f4d7256..68ad11f6c 100644 --- a/java/src/main/java/org/rocksdb/DirectComparator.java +++ b/java/src/main/java/org/rocksdb/DirectComparator.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/DirectSlice.java b/java/src/main/java/org/rocksdb/DirectSlice.java index 765b01586..7a59a3d82 100644 --- a/java/src/main/java/org/rocksdb/DirectSlice.java +++ b/java/src/main/java/org/rocksdb/DirectSlice.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/EncodingType.java b/java/src/main/java/org/rocksdb/EncodingType.java index d639542aa..e27a9853f 100644 --- a/java/src/main/java/org/rocksdb/EncodingType.java +++ b/java/src/main/java/org/rocksdb/EncodingType.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/Env.java b/java/src/main/java/org/rocksdb/Env.java index 929a394c3..74088fd86 100644 --- a/java/src/main/java/org/rocksdb/Env.java +++ b/java/src/main/java/org/rocksdb/Env.java @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/Filter.java b/java/src/main/java/org/rocksdb/Filter.java index ce5c41f26..1cc0ccd4c 100644 --- a/java/src/main/java/org/rocksdb/Filter.java +++ b/java/src/main/java/org/rocksdb/Filter.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/GenericRateLimiterConfig.java b/java/src/main/java/org/rocksdb/GenericRateLimiterConfig.java index 89951c5d1..cc00c6f0a 100644 --- a/java/src/main/java/org/rocksdb/GenericRateLimiterConfig.java +++ b/java/src/main/java/org/rocksdb/GenericRateLimiterConfig.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/HistogramData.java b/java/src/main/java/org/rocksdb/HistogramData.java index 020a9c9a5..a920f4b4e 100644 --- a/java/src/main/java/org/rocksdb/HistogramData.java +++ b/java/src/main/java/org/rocksdb/HistogramData.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/HistogramType.java b/java/src/main/java/org/rocksdb/HistogramType.java index 9b4548108..a4459eecc 100644 --- a/java/src/main/java/org/rocksdb/HistogramType.java +++ b/java/src/main/java/org/rocksdb/HistogramType.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/IndexType.java b/java/src/main/java/org/rocksdb/IndexType.java index f3c104566..db24a6f68 100644 --- a/java/src/main/java/org/rocksdb/IndexType.java +++ b/java/src/main/java/org/rocksdb/IndexType.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/Logger.java b/java/src/main/java/org/rocksdb/Logger.java index 05c53b56e..26359ff2e 100644 --- a/java/src/main/java/org/rocksdb/Logger.java +++ b/java/src/main/java/org/rocksdb/Logger.java @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/MemTableConfig.java b/java/src/main/java/org/rocksdb/MemTableConfig.java index 7c34826e1..8b854917f 100644 --- a/java/src/main/java/org/rocksdb/MemTableConfig.java +++ b/java/src/main/java/org/rocksdb/MemTableConfig.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/java/src/main/java/org/rocksdb/NativeLibraryLoader.java b/java/src/main/java/org/rocksdb/NativeLibraryLoader.java index dca9b3119..49d8f7110 100644 --- a/java/src/main/java/org/rocksdb/NativeLibraryLoader.java +++ b/java/src/main/java/org/rocksdb/NativeLibraryLoader.java @@ -19,7 +19,7 @@ public class NativeLibraryLoader { private static final String jniLibraryName = Environment.getJniLibraryName("rocksdb"); private static final String jniLibraryFileName = Environment.getJniLibraryFileName("rocksdb"); private static final String tempFilePrefix = "librocksdbjni"; - private static final String tempFileSuffix = "." + Environment.getJniLibraryExtension(); + private static final String tempFileSuffix = Environment.getJniLibraryExtension(); /** * Get a reference to the NativeLibraryLoader @@ -75,37 +75,43 @@ public class NativeLibraryLoader { void loadLibraryFromJar(final String tmpDir) throws IOException { if (!initialized) { - final File temp; - if (tmpDir == null || tmpDir.equals("")) { - temp = File.createTempFile(tempFilePrefix, tempFileSuffix); - } else { - temp = new File(tmpDir, jniLibraryFileName); - if (!temp.createNewFile()) { - throw new RuntimeException("File: " + temp.getAbsolutePath() - + " could not be created."); - } - } + System.load(loadLibraryFromJarToTemp(tmpDir).getAbsolutePath()); + initialized = true; + } + } - if (!temp.exists()) { - throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist."); - } else { - temp.deleteOnExit(); + File loadLibraryFromJarToTemp(final String tmpDir) + throws IOException { + final File temp; + if (tmpDir == null || tmpDir.isEmpty()) { + temp = File.createTempFile(tempFilePrefix, tempFileSuffix); + } else { + temp = new File(tmpDir, jniLibraryFileName); + if (!temp.createNewFile()) { + throw new RuntimeException("File: " + temp.getAbsolutePath() + + " could not be created."); } + } - // attempt to copy the library from the Jar file to the temp destination - try (final InputStream is = 
getClass().getClassLoader(). - getResourceAsStream(jniLibraryFileName)) { - if (is == null) { - throw new RuntimeException(jniLibraryFileName + " was not found inside JAR."); - } else { - Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); - } - } + if (!temp.exists()) { + throw new RuntimeException("File " + temp.getAbsolutePath() + " does not exist."); + } else { + temp.deleteOnExit(); + } - System.load(temp.getAbsolutePath()); - initialized = true; + // attempt to copy the library from the Jar file to the temp destination + try (final InputStream is = getClass().getClassLoader(). + getResourceAsStream(jniLibraryFileName)) { + if (is == null) { + throw new RuntimeException(jniLibraryFileName + " was not found inside JAR."); + } else { + Files.copy(is, temp.toPath(), StandardCopyOption.REPLACE_EXISTING); + } } + + return temp; } + /** * Private constructor to disallow instantiation */ diff --git a/java/src/main/java/org/rocksdb/Options.java b/java/src/main/java/org/rocksdb/Options.java index 771de0ac6..dfce746bf 100644 --- a/java/src/main/java/org/rocksdb/Options.java +++ b/java/src/main/java/org/rocksdb/Options.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/PlainTableConfig.java b/java/src/main/java/org/rocksdb/PlainTableConfig.java index 3a41bea84..044c18d80 100644 --- a/java/src/main/java/org/rocksdb/PlainTableConfig.java +++ b/java/src/main/java/org/rocksdb/PlainTableConfig.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/RateLimiterConfig.java b/java/src/main/java/org/rocksdb/RateLimiterConfig.java index 09d1c7a04..d2e7459e3 100644 --- a/java/src/main/java/org/rocksdb/RateLimiterConfig.java +++ b/java/src/main/java/org/rocksdb/RateLimiterConfig.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/ReadOptions.java b/java/src/main/java/org/rocksdb/ReadOptions.java index a72a6e0d8..3baf8e808 100644 --- a/java/src/main/java/org/rocksdb/ReadOptions.java +++ b/java/src/main/java/org/rocksdb/ReadOptions.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java b/java/src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java index 61c46131b..2f54cdf45 100644 --- a/java/src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java +++ b/java/src/main/java/org/rocksdb/RemoveEmptyValueCompactionFilter.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/RestoreBackupableDB.java b/java/src/main/java/org/rocksdb/RestoreBackupableDB.java index 5a3b2fc9a..90592e845 100644 --- a/java/src/main/java/org/rocksdb/RestoreBackupableDB.java +++ b/java/src/main/java/org/rocksdb/RestoreBackupableDB.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/RestoreOptions.java b/java/src/main/java/org/rocksdb/RestoreOptions.java index d98167aeb..8cfe56640 100644 --- a/java/src/main/java/org/rocksdb/RestoreOptions.java +++ b/java/src/main/java/org/rocksdb/RestoreOptions.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/RocksDB.java b/java/src/main/java/org/rocksdb/RocksDB.java index 2af55c420..786335745 100644 --- a/java/src/main/java/org/rocksdb/RocksDB.java +++ b/java/src/main/java/org/rocksdb/RocksDB.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/RocksDBException.java b/java/src/main/java/org/rocksdb/RocksDBException.java index a65d40124..ee869f20f 100644 --- a/java/src/main/java/org/rocksdb/RocksDBException.java +++ b/java/src/main/java/org/rocksdb/RocksDBException.java @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/RocksEnv.java b/java/src/main/java/org/rocksdb/RocksEnv.java index 4c399eafa..4c34a9f4b 100644 --- a/java/src/main/java/org/rocksdb/RocksEnv.java +++ b/java/src/main/java/org/rocksdb/RocksEnv.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/RocksIterator.java b/java/src/main/java/org/rocksdb/RocksIterator.java index bb9a6e697..d93a96197 100644 --- a/java/src/main/java/org/rocksdb/RocksIterator.java +++ b/java/src/main/java/org/rocksdb/RocksIterator.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/RocksIteratorInterface.java b/java/src/main/java/org/rocksdb/RocksIteratorInterface.java index fce8fe314..3ac74a90a 100644 --- a/java/src/main/java/org/rocksdb/RocksIteratorInterface.java +++ b/java/src/main/java/org/rocksdb/RocksIteratorInterface.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/RocksMemEnv.java b/java/src/main/java/org/rocksdb/RocksMemEnv.java index 54c9f9981..4517577be 100644 --- a/java/src/main/java/org/rocksdb/RocksMemEnv.java +++ b/java/src/main/java/org/rocksdb/RocksMemEnv.java @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/RocksObject.java b/java/src/main/java/org/rocksdb/RocksObject.java index 6e24a1385..2d645805a 100644 --- a/java/src/main/java/org/rocksdb/RocksObject.java +++ b/java/src/main/java/org/rocksdb/RocksObject.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/Slice.java b/java/src/main/java/org/rocksdb/Slice.java index d26490e5f..2a1ae6fae 100644 --- a/java/src/main/java/org/rocksdb/Slice.java +++ b/java/src/main/java/org/rocksdb/Slice.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/Snapshot.java b/java/src/main/java/org/rocksdb/Snapshot.java index 7ef5c383d..c71eac937 100644 --- a/java/src/main/java/org/rocksdb/Snapshot.java +++ b/java/src/main/java/org/rocksdb/Snapshot.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/Statistics.java b/java/src/main/java/org/rocksdb/Statistics.java index a099444f4..7e014ce99 100644 --- a/java/src/main/java/org/rocksdb/Statistics.java +++ b/java/src/main/java/org/rocksdb/Statistics.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/StatisticsCollector.java b/java/src/main/java/org/rocksdb/StatisticsCollector.java index 4f1577ca7..f435b514f 100644 --- a/java/src/main/java/org/rocksdb/StatisticsCollector.java +++ b/java/src/main/java/org/rocksdb/StatisticsCollector.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java b/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java index 2ce92c5ee..18f81790e 100644 --- a/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java +++ b/java/src/main/java/org/rocksdb/StatisticsCollectorCallback.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/StatsCollectorInput.java b/java/src/main/java/org/rocksdb/StatsCollectorInput.java index 0e842c256..a3acede3f 100644 --- a/java/src/main/java/org/rocksdb/StatsCollectorInput.java +++ b/java/src/main/java/org/rocksdb/StatsCollectorInput.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/TableFormatConfig.java b/java/src/main/java/org/rocksdb/TableFormatConfig.java index 58a533b22..29cd262c2 100644 --- a/java/src/main/java/org/rocksdb/TableFormatConfig.java +++ b/java/src/main/java/org/rocksdb/TableFormatConfig.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/TickerType.java b/java/src/main/java/org/rocksdb/TickerType.java index 180fbf4a6..9ff819a20 100644 --- a/java/src/main/java/org/rocksdb/TickerType.java +++ b/java/src/main/java/org/rocksdb/TickerType.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/TtlDB.java b/java/src/main/java/org/rocksdb/TtlDB.java index de6dea9a5..351ab5c07 100644 --- a/java/src/main/java/org/rocksdb/TtlDB.java +++ b/java/src/main/java/org/rocksdb/TtlDB.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/WBWIRocksIterator.java b/java/src/main/java/org/rocksdb/WBWIRocksIterator.java index f42f5498b..b807810dc 100644 --- a/java/src/main/java/org/rocksdb/WBWIRocksIterator.java +++ b/java/src/main/java/org/rocksdb/WBWIRocksIterator.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/WriteBatch.java b/java/src/main/java/org/rocksdb/WriteBatch.java index 960d122e2..65223bb99 100644 --- a/java/src/main/java/org/rocksdb/WriteBatch.java +++ b/java/src/main/java/org/rocksdb/WriteBatch.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/WriteBatchInterface.java b/java/src/main/java/org/rocksdb/WriteBatchInterface.java index d5c24ec3a..885f1213d 100644 --- a/java/src/main/java/org/rocksdb/WriteBatchInterface.java +++ b/java/src/main/java/org/rocksdb/WriteBatchInterface.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java b/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java index bde037bc3..cec3d0393 100644 --- a/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java +++ b/java/src/main/java/org/rocksdb/WriteBatchWithIndex.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/main/java/org/rocksdb/WriteOptions.java b/java/src/main/java/org/rocksdb/WriteOptions.java index c27dc9b3c..d6a32fb4f 100644 --- a/java/src/main/java/org/rocksdb/WriteOptions.java +++ b/java/src/main/java/org/rocksdb/WriteOptions.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/java/src/main/java/org/rocksdb/util/Environment.java b/java/src/main/java/org/rocksdb/util/Environment.java index f65b92a0e..6fccc43bb 100644 --- a/java/src/main/java/org/rocksdb/util/Environment.java +++ b/java/src/main/java/org/rocksdb/util/Environment.java @@ -18,6 +18,10 @@ public class Environment { OS.contains("aix")); } + public static boolean isSolaris() { + return OS.contains("sunos"); + } + public static boolean is64Bit() { return (ARCH.indexOf("64") > 0); } @@ -36,6 +40,10 @@ public class Environment { return String.format("%sjni-linux%s", name, arch); } else if (isMac()) { return String.format("%sjni-osx", name); + } else if (isSolaris()) { + return String.format("%sjni-solaris%d", name, is64Bit() ? 64 : 32); + } else if (isWindows() && is64Bit()) { + return String.format("%sjni-win64", name); } throw new UnsupportedOperationException(); } @@ -45,15 +53,20 @@ public class Environment { } private static String appendLibOsSuffix(final String libraryFileName, final boolean shared) { - if (isUnix()) { + if (isUnix() || isSolaris()) { return libraryFileName + ".so"; } else if (isMac()) { return libraryFileName + (shared ? ".dylib" : ".jnilib"); + } else if (isWindows()) { + return libraryFileName + ".dll"; } throw new UnsupportedOperationException(); } public static String getJniLibraryExtension() { + if (isWindows()) { + return ".dll"; + } return (isMac()) ? ".jnilib" : ".so"; } } diff --git a/java/src/main/java/org/rocksdb/util/SizeUnit.java b/java/src/main/java/org/rocksdb/util/SizeUnit.java index 8d50cd10e..e66fc371c 100644 --- a/java/src/main/java/org/rocksdb/util/SizeUnit.java +++ b/java/src/main/java/org/rocksdb/util/SizeUnit.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/AbstractComparatorTest.java b/java/src/test/java/org/rocksdb/AbstractComparatorTest.java index a776351c0..bf8b3c0f7 100644 --- a/java/src/test/java/org/rocksdb/AbstractComparatorTest.java +++ b/java/src/test/java/org/rocksdb/AbstractComparatorTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/BackupEngineTest.java b/java/src/test/java/org/rocksdb/BackupEngineTest.java index 48dff19e1..f010ff3ac 100644 --- a/java/src/test/java/org/rocksdb/BackupEngineTest.java +++ b/java/src/test/java/org/rocksdb/BackupEngineTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java b/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java index 6fe3bd2f0..44dc5b578 100644 --- a/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java +++ b/java/src/test/java/org/rocksdb/BackupableDBOptionsTest.java @@ -1,22 +1,22 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. package org.rocksdb; +import static org.assertj.core.api.Assertions.assertThat; + +import java.util.Random; + import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.ExpectedException; -import java.util.Random; - -import static org.assertj.core.api.Assertions.assertThat; - public class BackupableDBOptionsTest { - private final static String ARBITRARY_PATH = "/tmp"; + private final static String ARBITRARY_PATH = System.getProperty("java.io.tmpdir"); @ClassRule public static final RocksMemoryResource rocksMemoryResource = diff --git a/java/src/test/java/org/rocksdb/BackupableDBTest.java b/java/src/test/java/org/rocksdb/BackupableDBTest.java index 3f358bdb7..b5e2f129c 100644 --- a/java/src/test/java/org/rocksdb/BackupableDBTest.java +++ b/java/src/test/java/org/rocksdb/BackupableDBTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java index aacf44054..2b1ce5ffa 100644 --- a/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java +++ b/java/src/test/java/org/rocksdb/BlockBasedTableConfigTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java b/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java index af7216128..e0ebd67ac 100644 --- a/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java +++ b/java/src/test/java/org/rocksdb/ColumnFamilyOptionsTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/ColumnFamilyTest.java b/java/src/test/java/org/rocksdb/ColumnFamilyTest.java index decdbbcb2..5c62cca73 100644 --- a/java/src/test/java/org/rocksdb/ColumnFamilyTest.java +++ b/java/src/test/java/org/rocksdb/ColumnFamilyTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/ComparatorOptionsTest.java b/java/src/test/java/org/rocksdb/ComparatorOptionsTest.java index 4f8a7d1a6..2a86515e3 100644 --- a/java/src/test/java/org/rocksdb/ComparatorOptionsTest.java +++ b/java/src/test/java/org/rocksdb/ComparatorOptionsTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/java/src/test/java/org/rocksdb/ComparatorTest.java b/java/src/test/java/org/rocksdb/ComparatorTest.java index e689a9cf5..d4cea0cb8 100644 --- a/java/src/test/java/org/rocksdb/ComparatorTest.java +++ b/java/src/test/java/org/rocksdb/ComparatorTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/CompressionOptionsTest.java b/java/src/test/java/org/rocksdb/CompressionOptionsTest.java index bff4d5f6c..2e2633524 100644 --- a/java/src/test/java/org/rocksdb/CompressionOptionsTest.java +++ b/java/src/test/java/org/rocksdb/CompressionOptionsTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/DBOptionsTest.java b/java/src/test/java/org/rocksdb/DBOptionsTest.java index 98ba4ce38..7cb29a4a5 100644 --- a/java/src/test/java/org/rocksdb/DBOptionsTest.java +++ b/java/src/test/java/org/rocksdb/DBOptionsTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/java/src/test/java/org/rocksdb/DirectComparatorTest.java b/java/src/test/java/org/rocksdb/DirectComparatorTest.java index be84d6647..abdbeada9 100644 --- a/java/src/test/java/org/rocksdb/DirectComparatorTest.java +++ b/java/src/test/java/org/rocksdb/DirectComparatorTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/DirectSliceTest.java b/java/src/test/java/org/rocksdb/DirectSliceTest.java index 123eed2e7..615adab38 100644 --- a/java/src/test/java/org/rocksdb/DirectSliceTest.java +++ b/java/src/test/java/org/rocksdb/DirectSliceTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/FilterTest.java b/java/src/test/java/org/rocksdb/FilterTest.java index 36ce37970..d5a1830b3 100644 --- a/java/src/test/java/org/rocksdb/FilterTest.java +++ b/java/src/test/java/org/rocksdb/FilterTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/java/src/test/java/org/rocksdb/FlushTest.java b/java/src/test/java/org/rocksdb/FlushTest.java index 94a32d383..094910f27 100644 --- a/java/src/test/java/org/rocksdb/FlushTest.java +++ b/java/src/test/java/org/rocksdb/FlushTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/InfoLogLevelTest.java b/java/src/test/java/org/rocksdb/InfoLogLevelTest.java index 630666b90..71a032a0b 100644 --- a/java/src/test/java/org/rocksdb/InfoLogLevelTest.java +++ b/java/src/test/java/org/rocksdb/InfoLogLevelTest.java @@ -4,6 +4,7 @@ import org.junit.ClassRule; import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; +import org.rocksdb.util.Environment; import java.io.IOException; @@ -113,7 +114,7 @@ public class InfoLogLevelTest { * @throws IOException if file is not found. */ private String getLogContentsWithoutHeader() throws IOException { - final String separator = System.getProperty("line.separator"); + final String separator = Environment.isWindows() ? "\n" : System.getProperty("line.separator"); final String[] lines = new String(readAllBytes(get( dbFolder.getRoot().getAbsolutePath()+ "/LOG"))).split(separator); diff --git a/java/src/test/java/org/rocksdb/KeyMayExistTest.java b/java/src/test/java/org/rocksdb/KeyMayExistTest.java index b670caddc..a39ddbb21 100644 --- a/java/src/test/java/org/rocksdb/KeyMayExistTest.java +++ b/java/src/test/java/org/rocksdb/KeyMayExistTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/MemTableTest.java b/java/src/test/java/org/rocksdb/MemTableTest.java index bfc898c42..b54f583d0 100644 --- a/java/src/test/java/org/rocksdb/MemTableTest.java +++ b/java/src/test/java/org/rocksdb/MemTableTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/MergeTest.java b/java/src/test/java/org/rocksdb/MergeTest.java index a5f8e1fe9..9eec4e1eb 100644 --- a/java/src/test/java/org/rocksdb/MergeTest.java +++ b/java/src/test/java/org/rocksdb/MergeTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/MixedOptionsTest.java b/java/src/test/java/org/rocksdb/MixedOptionsTest.java index f095e99d8..a3090a1b1 100644 --- a/java/src/test/java/org/rocksdb/MixedOptionsTest.java +++ b/java/src/test/java/org/rocksdb/MixedOptionsTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java b/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java index 7d9322a53..4e9ad27a2 100644 --- a/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java +++ b/java/src/test/java/org/rocksdb/NativeLibraryLoaderTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -21,7 +21,7 @@ public class NativeLibraryLoaderTest { @Test public void tempFolder() throws IOException { - NativeLibraryLoader.getInstance().loadLibraryFromJar( + NativeLibraryLoader.getInstance().loadLibraryFromJarToTemp( temporaryFolder.getRoot().getAbsolutePath()); Path path = Paths.get(temporaryFolder.getRoot().getAbsolutePath(), Environment.getJniLibraryFileName("rocksdb")); diff --git a/java/src/test/java/org/rocksdb/OptionsTest.java b/java/src/test/java/org/rocksdb/OptionsTest.java index 1c1dfc63a..6d11e6fa7 100644 --- a/java/src/test/java/org/rocksdb/OptionsTest.java +++ b/java/src/test/java/org/rocksdb/OptionsTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/java/src/test/java/org/rocksdb/PlainTableConfigTest.java b/java/src/test/java/org/rocksdb/PlainTableConfigTest.java index 850b050a0..b815cd058 100644 --- a/java/src/test/java/org/rocksdb/PlainTableConfigTest.java +++ b/java/src/test/java/org/rocksdb/PlainTableConfigTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/PlatformRandomHelper.java b/java/src/test/java/org/rocksdb/PlatformRandomHelper.java index 0155ce263..e88a8951d 100644 --- a/java/src/test/java/org/rocksdb/PlatformRandomHelper.java +++ b/java/src/test/java/org/rocksdb/PlatformRandomHelper.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/ReadOnlyTest.java b/java/src/test/java/org/rocksdb/ReadOnlyTest.java index 70ea75d15..5cf2b32d4 100644 --- a/java/src/test/java/org/rocksdb/ReadOnlyTest.java +++ b/java/src/test/java/org/rocksdb/ReadOnlyTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/java/src/test/java/org/rocksdb/ReadOptionsTest.java b/java/src/test/java/org/rocksdb/ReadOptionsTest.java index af88ce351..df42cf0cd 100644 --- a/java/src/test/java/org/rocksdb/ReadOptionsTest.java +++ b/java/src/test/java/org/rocksdb/ReadOptionsTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/RocksDBTest.java b/java/src/test/java/org/rocksdb/RocksDBTest.java index 31d2c5238..c8e59a5b3 100644 --- a/java/src/test/java/org/rocksdb/RocksDBTest.java +++ b/java/src/test/java/org/rocksdb/RocksDBTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/RocksEnvTest.java b/java/src/test/java/org/rocksdb/RocksEnvTest.java index 5914e6e29..a051a3562 100644 --- a/java/src/test/java/org/rocksdb/RocksEnvTest.java +++ b/java/src/test/java/org/rocksdb/RocksEnvTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/java/src/test/java/org/rocksdb/RocksIteratorTest.java b/java/src/test/java/org/rocksdb/RocksIteratorTest.java index 170170f5c..eb841d3e6 100644 --- a/java/src/test/java/org/rocksdb/RocksIteratorTest.java +++ b/java/src/test/java/org/rocksdb/RocksIteratorTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/RocksMemEnvTest.java b/java/src/test/java/org/rocksdb/RocksMemEnvTest.java index d2791c93e..7530e51b1 100644 --- a/java/src/test/java/org/rocksdb/RocksMemEnvTest.java +++ b/java/src/test/java/org/rocksdb/RocksMemEnvTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/SliceTest.java b/java/src/test/java/org/rocksdb/SliceTest.java index fbd602b14..51f542fa5 100644 --- a/java/src/test/java/org/rocksdb/SliceTest.java +++ b/java/src/test/java/org/rocksdb/SliceTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/java/src/test/java/org/rocksdb/SnapshotTest.java b/java/src/test/java/org/rocksdb/SnapshotTest.java index 87ccdbcb5..19e4c5021 100644 --- a/java/src/test/java/org/rocksdb/SnapshotTest.java +++ b/java/src/test/java/org/rocksdb/SnapshotTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/StatisticsCollectorTest.java b/java/src/test/java/org/rocksdb/StatisticsCollectorTest.java index 927826d71..0feaa4237 100644 --- a/java/src/test/java/org/rocksdb/StatisticsCollectorTest.java +++ b/java/src/test/java/org/rocksdb/StatisticsCollectorTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/StatsCallbackMock.java b/java/src/test/java/org/rocksdb/StatsCallbackMock.java index 3c5800e42..2e28f28ef 100644 --- a/java/src/test/java/org/rocksdb/StatsCallbackMock.java +++ b/java/src/test/java/org/rocksdb/StatsCallbackMock.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/java/src/test/java/org/rocksdb/TtlDBTest.java b/java/src/test/java/org/rocksdb/TtlDBTest.java index c60b1d512..934363a87 100644 --- a/java/src/test/java/org/rocksdb/TtlDBTest.java +++ b/java/src/test/java/org/rocksdb/TtlDBTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/Types.java b/java/src/test/java/org/rocksdb/Types.java index 5ad35f463..ca5feb4cb 100644 --- a/java/src/test/java/org/rocksdb/Types.java +++ b/java/src/test/java/org/rocksdb/Types.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java b/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java index b09cc9259..257ef6438 100644 --- a/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java +++ b/java/src/test/java/org/rocksdb/WriteBatchHandlerTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/java/src/test/java/org/rocksdb/WriteBatchTest.java b/java/src/test/java/org/rocksdb/WriteBatchTest.java index 89a9d5405..0cdfb7b1d 100644 --- a/java/src/test/java/org/rocksdb/WriteBatchTest.java +++ b/java/src/test/java/org/rocksdb/WriteBatchTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/WriteBatchThreadedTest.java b/java/src/test/java/org/rocksdb/WriteBatchThreadedTest.java new file mode 100644 index 000000000..66e1c8966 --- /dev/null +++ b/java/src/test/java/org/rocksdb/WriteBatchThreadedTest.java @@ -0,0 +1,104 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+package org.rocksdb; + +import org.junit.After; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameter; +import org.junit.runners.Parameterized.Parameters; + +import java.nio.ByteBuffer; +import java.util.*; +import java.util.concurrent.*; + +@RunWith(Parameterized.class) +public class WriteBatchThreadedTest { + + @Parameters(name = "WriteBatchThreadedTest(threadCount={0})") + public static Iterable data() { + return Arrays.asList(new Integer[]{1, 10, 50, 100}); + } + + @Parameter + public int threadCount; + + @Rule + public TemporaryFolder dbFolder = new TemporaryFolder(); + + RocksDB db; + + @Before + public void setUp() throws Exception { + RocksDB.loadLibrary(); + final Options options = new Options() + .setCreateIfMissing(true) + .setIncreaseParallelism(32); + db = RocksDB.open(options, dbFolder.getRoot().getAbsolutePath()); + assert (db != null); + } + + @After + public void tearDown() throws Exception { + if (db != null) { + db.close(); + } + } + + @Test + public void threadedWrites() throws InterruptedException, ExecutionException { + final List> callables = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + final int offset = i * 100; + callables.add(new Callable() { + @Override + public Void call() throws RocksDBException { + final WriteBatch wb = new WriteBatch(); + for (int i = offset; i < offset + 100; i++) { + wb.put(ByteBuffer.allocate(4).putInt(i).array(), + "parallel rocks test".getBytes()); + } + db.write(new WriteOptions(), wb); + + return null; + } + }); + } + + //submit the callables + final ExecutorService executorService = + Executors.newFixedThreadPool(threadCount); + try { + final ExecutorCompletionService completionService = + new ExecutorCompletionService<>(executorService); + final Set> futures = new HashSet<>(); + for (final Callable callable : 
callables) { + futures.add(completionService.submit(callable)); + } + + while (futures.size() > 0) { + final Future future = completionService.take(); + futures.remove(future); + + try { + future.get(); + } catch (final ExecutionException e) { + for (final Future f : futures) { + f.cancel(true); + } + + throw e; + } + } + } finally { + executorService.shutdown(); + executorService.awaitTermination(10, TimeUnit.SECONDS); + } + } +} diff --git a/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java b/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java index b0c729a58..837610d29 100644 --- a/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java +++ b/java/src/test/java/org/rocksdb/WriteBatchWithIndexTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/WriteOptionsTest.java b/java/src/test/java/org/rocksdb/WriteOptionsTest.java index 4d8e6d97e..333a76194 100644 --- a/java/src/test/java/org/rocksdb/WriteOptionsTest.java +++ b/java/src/test/java/org/rocksdb/WriteOptionsTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java b/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java index c800574f5..044f96b94 100644 --- a/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java +++ b/java/src/test/java/org/rocksdb/test/RocksJunitRunner.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/java/src/test/java/org/rocksdb/util/EnvironmentTest.java b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java index c7160deb6..2de1c45f7 100644 --- a/java/src/test/java/org/rocksdb/util/EnvironmentTest.java +++ b/java/src/test/java/org/rocksdb/util/EnvironmentTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -117,16 +117,22 @@ public class EnvironmentTest { assertThat(Environment.isWindows()).isTrue(); } - @Test(expected = UnsupportedOperationException.class) - public void failWinJniLibraryName(){ + @Test + public void win64() { setEnvironmentClassFields("win", "x64"); - Environment.getJniLibraryFileName("rocksdb"); + assertThat(Environment.isWindows()).isTrue(); + assertThat(Environment.getJniLibraryExtension()). + isEqualTo(".dll"); + assertThat(Environment.getJniLibraryFileName("rocksdb")). + isEqualTo("librocksdbjni-win64.dll"); + assertThat(Environment.getSharedLibraryFileName("rocksdb")). 
+ isEqualTo("librocksdbjni.dll"); } @Test(expected = UnsupportedOperationException.class) - public void failWinSharedLibrary(){ - setEnvironmentClassFields("win", "x64"); - Environment.getSharedLibraryFileName("rocksdb"); + public void win32(){ + setEnvironmentClassFields("win", "32"); + Environment.getJniLibraryFileName("rocksdb"); } private void setEnvironmentClassFields(String osName, diff --git a/java/src/test/java/org/rocksdb/util/SizeUnitTest.java b/java/src/test/java/org/rocksdb/util/SizeUnitTest.java index 517e1b2b5..e74c04103 100644 --- a/java/src/test/java/org/rocksdb/util/SizeUnitTest.java +++ b/java/src/test/java/org/rocksdb/util/SizeUnitTest.java @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/memtable/hash_cuckoo_rep.cc b/memtable/hash_cuckoo_rep.cc index 6f3cdbf67..6ae3e098b 100644 --- a/memtable/hash_cuckoo_rep.cc +++ b/memtable/hash_cuckoo_rep.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/memtable/hash_cuckoo_rep.h b/memtable/hash_cuckoo_rep.h index 6de4baa20..173a907b4 100644 --- a/memtable/hash_cuckoo_rep.h +++ b/memtable/hash_cuckoo_rep.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/memtable/hash_linklist_rep.cc b/memtable/hash_linklist_rep.cc index 2e761ce15..902c30e8a 100644 --- a/memtable/hash_linklist_rep.cc +++ b/memtable/hash_linklist_rep.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/memtable/hash_linklist_rep.h b/memtable/hash_linklist_rep.h index 1bab441ed..5197e7cfb 100644 --- a/memtable/hash_linklist_rep.h +++ b/memtable/hash_linklist_rep.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/memtable/hash_skiplist_rep.cc b/memtable/hash_skiplist_rep.cc index cbdd88993..73a917607 100644 --- a/memtable/hash_skiplist_rep.cc +++ b/memtable/hash_skiplist_rep.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/memtable/hash_skiplist_rep.h b/memtable/hash_skiplist_rep.h index a6544ff04..56a289c4b 100644 --- a/memtable/hash_skiplist_rep.h +++ b/memtable/hash_skiplist_rep.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/skiplistrep.cc b/memtable/skiplistrep.cc similarity index 98% rename from util/skiplistrep.cc rename to memtable/skiplistrep.cc index 7108008a8..b8c90c6d6 100644 --- a/util/skiplistrep.cc +++ b/memtable/skiplistrep.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -25,8 +25,6 @@ public: transform_(transform), lookahead_(lookahead) { } - virtual bool IsInsertConcurrentlySupported() const override { return true; } - virtual KeyHandle Allocate(const size_t len, char** buf) override { *buf = skip_list_.AllocateKey(len); return static_cast(*buf); diff --git a/memtable/stl_wrappers.h b/memtable/stl_wrappers.h index cef8301fa..a43133017 100644 --- a/memtable/stl_wrappers.h +++ b/memtable/stl_wrappers.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/util/vectorrep.cc b/memtable/vectorrep.cc similarity index 99% rename from util/vectorrep.cc rename to memtable/vectorrep.cc index 324439a1d..b9d9ebe0a 100644 --- a/util/vectorrep.cc +++ b/memtable/vectorrep.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/port/dirent.h b/port/dirent.h index ee4ded143..f927db7e2 100644 --- a/port/dirent.h +++ b/port/dirent.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/port/likely.h b/port/likely.h index ede0df5a1..d6e6295cc 100644 --- a/port/likely.h +++ b/port/likely.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/port/port.h b/port/port.h index 670006e82..5f45dbb42 100644 --- a/port/port.h +++ b/port/port.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/port/port_example.h b/port/port_example.h index ba14618fa..e4bcb329b 100644 --- a/port/port_example.h +++ b/port/port_example.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/port/port_posix.cc b/port/port_posix.cc index 73ad3caf1..ca1909bf6 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/port/port_posix.h b/port/port_posix.h index efcd1aa8e..454d6c1c3 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -16,6 +16,8 @@ // in fact, we could use that one #define ROCKSDB_PRIszt "zu" +#define __declspec(S) + #define ROCKSDB_NOEXCEPT noexcept #undef PLATFORM_IS_LITTLE_ENDIAN diff --git a/port/stack_trace.cc b/port/stack_trace.cc index e2211e987..debeb5a46 100644 --- a/port/stack_trace.cc +++ b/port/stack_trace.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. 
All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/port/stack_trace.h b/port/stack_trace.h index 8bc6c7d2e..3108b4d2e 100644 --- a/port/stack_trace.h +++ b/port/stack_trace.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/port/sys_time.h b/port/sys_time.h index 6c23d8e50..53e646e69 100644 --- a/port/sys_time.h +++ b/port/sys_time.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/port/util_logger.h b/port/util_logger.h index dbb67173f..05782b0c5 100644 --- a/port/util_logger.h +++ b/port/util_logger.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/port/win/env_win.cc b/port/win/env_win.cc index 977c80b88..f0825651b 100644 --- a/port/win/env_win.cc +++ b/port/win/env_win.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -766,6 +766,18 @@ class WinRandomAccessFile : public RandomAccessFile { return read; } + void CalculateReadParameters(uint64_t offset, size_t bytes_requested, + size_t& actual_bytes_toread, + uint64_t& first_page_start) const { + + const size_t alignment = buffer_.Alignment(); + + first_page_start = TruncateToPageBoundary(alignment, offset); + const uint64_t last_page_start = + TruncateToPageBoundary(alignment, offset + bytes_requested - 1); + actual_bytes_toread = (last_page_start - first_page_start) + alignment; + } + public: WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment, const EnvOptions& options) @@ -797,66 +809,87 @@ class WinRandomAccessFile : public RandomAccessFile { virtual Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const override { + Status s; SSIZE_T r = -1; size_t left = n; char* dest = scratch; + if (n == 0) { + *result = Slice(scratch, 0); + return s; + } + // When in unbuffered mode we need to do the following changes: // - use our own aligned buffer // - always read at the offset of that is a multiple of alignment if (!use_os_buffer_) { - std::unique_lock lock(buffer_mut_); - // Let's see if at least some of the requested data is already - // in the buffer - if (offset >= buffered_start_ && - offset < (buffered_start_ + buffer_.CurrentSize())) { - size_t buffer_offset = offset - buffered_start_; - r = buffer_.Read(dest, buffer_offset, left); - assert(r >= 0); + uint64_t first_page_start 
= 0; + size_t actual_bytes_toread = 0; + size_t bytes_requested = left; - left -= size_t(r); - offset += r; - dest += r; - } + if (!read_ahead_ && random_access_max_buffer_size_ == 0) { + CalculateReadParameters(offset, bytes_requested, actual_bytes_toread, + first_page_start); - // Still some left or none was buffered - if (left > 0) { - // Figure out the start/end offset for reading and amount to read - const size_t alignment = buffer_.Alignment(); - const size_t first_page_start = - TruncateToPageBoundary(alignment, offset); + assert(actual_bytes_toread > 0); + + r = ReadIntoOneShotBuffer(offset, first_page_start, + actual_bytes_toread, left, dest); + } else { + + std::unique_lock lock(buffer_mut_); + + // Let's see if at least some of the requested data is already + // in the buffer + if (offset >= buffered_start_ && + offset < (buffered_start_ + buffer_.CurrentSize())) { + size_t buffer_offset = offset - buffered_start_; + r = buffer_.Read(dest, buffer_offset, left); + assert(r >= 0); - size_t bytes_requested = left; - if (read_ahead_ && bytes_requested < compaction_readahead_size_) { - bytes_requested = compaction_readahead_size_; + left -= size_t(r); + offset += r; + dest += r; } - const size_t last_page_start = - TruncateToPageBoundary(alignment, offset + bytes_requested - 1); - const size_t actual_bytes_toread = - (last_page_start - first_page_start) + alignment; + // Still some left or none was buffered + if (left > 0) { + // Figure out the start/end offset for reading and amount to read + bytes_requested = left; - if (buffer_.Capacity() < actual_bytes_toread) { - // If we are in read-ahead mode or the requested size - // exceeds max buffer size then use one-shot - // big buffer otherwise reallocate main buffer - if (read_ahead_ || + if (read_ahead_ && bytes_requested < compaction_readahead_size_) { + bytes_requested = compaction_readahead_size_; + } + + CalculateReadParameters(offset, bytes_requested, actual_bytes_toread, + first_page_start); + + 
assert(actual_bytes_toread > 0); + + if (buffer_.Capacity() < actual_bytes_toread) { + // If we are in read-ahead mode or the requested size + // exceeds max buffer size then use one-shot + // big buffer otherwise reallocate main buffer + if (read_ahead_ || (actual_bytes_toread > random_access_max_buffer_size_)) { - // Unlock the mutex since we are not using instance buffer - lock.unlock(); - r = ReadIntoOneShotBuffer(offset, first_page_start, - actual_bytes_toread, left, dest); - } else { - buffer_.AllocateNewBuffer(actual_bytes_toread); + // Unlock the mutex since we are not using instance buffer + lock.unlock(); + r = ReadIntoOneShotBuffer(offset, first_page_start, + actual_bytes_toread, left, dest); + } + else { + buffer_.AllocateNewBuffer(actual_bytes_toread); + r = ReadIntoInstanceBuffer(offset, first_page_start, + actual_bytes_toread, left, dest); + } + } + else { + buffer_.Clear(); r = ReadIntoInstanceBuffer(offset, first_page_start, - actual_bytes_toread, left, dest); + actual_bytes_toread, left, dest); } - } else { - buffer_.Clear(); - r = ReadIntoInstanceBuffer(offset, first_page_start, - actual_bytes_toread, left, dest); } } } else { @@ -1105,6 +1138,8 @@ void WinthreadCall(const char* label, std::error_code result) { } } +typedef VOID(WINAPI * FnGetSystemTimePreciseAsFileTime)(LPFILETIME); + class WinEnv : public Env { public: WinEnv(); @@ -1643,25 +1678,29 @@ class WinEnv : public Env { } virtual uint64_t NowMicros() override { - // all std::chrono clocks on windows proved to return - // values that may repeat that is not good enough for some uses. 
- const int64_t c_UnixEpochStartTicks = 116444736000000000i64; - const int64_t c_FtToMicroSec = 10; - - // This interface needs to return system time and not - // just any microseconds because it is often used as an argument - // to TimedWait() on condition variable - FILETIME ftSystemTime; - GetSystemTimePreciseAsFileTime(&ftSystemTime); - - LARGE_INTEGER li; - li.LowPart = ftSystemTime.dwLowDateTime; - li.HighPart = ftSystemTime.dwHighDateTime; - // Subtract unix epoch start - li.QuadPart -= c_UnixEpochStartTicks; - // Convert to microsecs - li.QuadPart /= c_FtToMicroSec; - return li.QuadPart; + if (GetSystemTimePreciseAsFileTime_ != NULL) { + // all std::chrono clocks on windows proved to return + // values that may repeat that is not good enough for some uses. + const int64_t c_UnixEpochStartTicks = 116444736000000000i64; + const int64_t c_FtToMicroSec = 10; + + // This interface needs to return system time and not + // just any microseconds because it is often used as an argument + // to TimedWait() on condition variable + FILETIME ftSystemTime; + GetSystemTimePreciseAsFileTime_(&ftSystemTime); + + LARGE_INTEGER li; + li.LowPart = ftSystemTime.dwLowDateTime; + li.HighPart = ftSystemTime.dwHighDateTime; + // Subtract unix epoch start + li.QuadPart -= c_UnixEpochStartTicks; + // Convert to microsecs + li.QuadPart /= c_FtToMicroSec; + return li.QuadPart; + } + using namespace std::chrono; + return duration_cast(system_clock::now().time_since_epoch()).count(); } virtual uint64_t NowNanos() override { @@ -1684,9 +1723,8 @@ class WinEnv : public Env { virtual Status GetHostName(char* name, uint64_t len) override { Status s; - DWORD nSize = - static_cast(std::min(len, - std::numeric_limits::max())); + DWORD nSize = static_cast( + std::min(len, std::numeric_limits::max())); if (!::GetComputerNameA(name, &nSize)) { auto lastError = GetLastError(); @@ -2071,6 +2109,7 @@ class WinEnv : public Env { std::vector thread_pools_; mutable std::mutex mu_; std::vector 
threads_to_join_; + FnGetSystemTimePreciseAsFileTime GetSystemTimePreciseAsFileTime_; }; WinEnv::WinEnv() @@ -2079,7 +2118,15 @@ WinEnv::WinEnv() page_size_(4 * 1012), allocation_granularity_(page_size_), perf_counter_frequency_(0), - thread_pools_(Priority::TOTAL) { + thread_pools_(Priority::TOTAL), + GetSystemTimePreciseAsFileTime_(NULL) { + + HMODULE module = GetModuleHandle("kernel32.dll"); + if (module != NULL) { + GetSystemTimePreciseAsFileTime_ = (FnGetSystemTimePreciseAsFileTime)GetProcAddress( + module, "GetSystemTimePreciseAsFileTime"); + } + SYSTEM_INFO sinfo; GetSystemInfo(&sinfo); diff --git a/port/win/port_win.cc b/port/win/port_win.cc index e08f0ec22..dd87c3577 100644 --- a/port/win/port_win.cc +++ b/port/win/port_win.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -40,7 +40,7 @@ void gettimeofday(struct timeval* tv, struct timezone* /* tz */) { seconds secNow(duration_cast(usNow)); tv->tv_sec = static_cast(secNow.count()); - tv->tv_usec = static_cast(usNow.count() - + tv->tv_usec = static_cast(usNow.count() - duration_cast(secNow).count()); } @@ -233,6 +233,8 @@ int GetMaxOpenFiles() { return -1; } #include "jemalloc/jemalloc.h" +#ifndef JEMALLOC_NON_INIT + namespace rocksdb { namespace port { @@ -278,6 +280,8 @@ JEMALLOC_SECTION(".CRT$XCT") JEMALLOC_ATTR(used) static const void( } // extern "C" +#endif // JEMALLOC_NON_INIT + // Global operators to be replaced by a linker void* operator new(size_t size) { diff --git a/port/win/port_win.h b/port/win/port_win.h index 9ee7d96be..1c3e94ef1 100644 --- a/port/win/port_win.h +++ b/port/win/port_win.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -245,7 +245,7 @@ extern void InitOnce(OnceType* once, void (*initializer)()); static inline void AsmVolatilePause() { #if defined(_M_IX86) || defined(_M_X64) - ::_mm_pause(); + YieldProcessor(); #endif // it would be nice to get "wfe" on ARM here } diff --git a/port/win/win_logger.cc b/port/win/win_logger.cc index 9d6203cc4..764d75325 100644 --- a/port/win/win_logger.cc +++ b/port/win/win_logger.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/port/win/win_logger.h b/port/win/win_logger.h index 1460ba8b6..87c4dfe46 100644 --- a/port/win/win_logger.h +++ b/port/win/win_logger.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/src.mk b/src.mk index 369890258..9f3671305 100644 --- a/src.mk +++ b/src.mk @@ -1,5 +1,6 @@ # These are the sources from which librocksdb.a is built: LIB_SOURCES = \ + db/auto_roll_logger.cc \ db/builder.cc \ db/c.cc \ db/column_family.cc \ @@ -15,6 +16,7 @@ LIB_SOURCES = \ db/db_impl_debug.cc \ db/db_impl_readonly.cc \ db/db_impl_experimental.cc \ + db/db_info_dumper.cc \ db/db_iter.cc \ db/experimental.cc \ db/event_helpers.cc \ @@ -46,9 +48,12 @@ LIB_SOURCES = \ db/write_batch_base.cc \ db/write_controller.cc \ db/write_thread.cc \ + db/xfunc_test_points.cc \ memtable/hash_cuckoo_rep.cc \ memtable/hash_linklist_rep.cc \ memtable/hash_skiplist_rep.cc \ + memtable/skiplistrep.cc \ + memtable/vectorrep.cc \ port/stack_trace.cc \ port/port_posix.cc \ table/adaptive_table_factory.cc \ @@ -81,7 +86,6 @@ LIB_SOURCES = \ table/two_level_iterator.cc \ tools/dump/db_dump_tool.cc \ util/arena.cc \ - util/auto_roll_logger.cc \ util/bloom.cc \ util/build_version.cc \ util/cache.cc \ @@ -90,19 +94,21 @@ LIB_SOURCES = \ util/compaction_job_stats_impl.cc \ util/concurrent_arena.cc \ util/crc32c.cc \ - util/db_info_dumper.cc \ - util/delete_scheduler_impl.cc \ + util/delete_scheduler.cc \ util/dynamic_bloom.cc \ util/env.cc \ util/env_hdfs.cc \ util/env_posix.cc \ util/io_posix.cc \ util/thread_posix.cc \ + util/transaction_test_util.cc \ + util/sst_file_manager_impl.cc \ util/file_util.cc \ util/file_reader_writer.cc \ util/filter_policy.cc \ util/hash.cc \ util/histogram.cc \ + util/histogram_windowing.cc \ util/instrumented_mutex.cc \ util/iostats_context.cc \ utilities/backupable/backupable_db.cc \ @@ -151,7 +157,6 @@ LIB_SOURCES = \ util/perf_level.cc \ util/random.cc \ util/rate_limiter.cc \ - util/skiplistrep.cc \ util/slice.cc \ util/statistics.cc \ util/status.cc \ @@ -164,7 +169,6 @@ LIB_SOURCES = \ util/thread_status_updater_debug.cc \ util/thread_status_util.cc \ util/thread_status_util_debug.cc \ - util/vectorrep.cc \ util/xfunc.cc \ util/xxhash.cc \ @@ 
-177,8 +181,12 @@ MOCK_SOURCES = \ table/mock_table.cc \ util/mock_env.cc +BENCH_SOURCES = \ + tools/db_bench_tool.cc + TEST_BENCH_SOURCES = \ third-party/gtest-1.7.0/fused-src/gtest/gtest-all.cc \ + db/auto_roll_logger_test.cc \ db/column_family_test.cc \ db/compaction_job_test.cc \ db/compaction_job_stats_test.cc \ @@ -186,7 +194,7 @@ TEST_BENCH_SOURCES = \ db/comparator_db_test.cc \ db/corruption_test.cc \ db/cuckoo_table_db_test.cc \ - db/db_bench.cc \ + tools/db_bench_tool.cc \ db/dbformat_test.cc \ db/db_iter_test.cc \ db/db_test.cc \ @@ -206,7 +214,6 @@ TEST_BENCH_SOURCES = \ db/flush_job_test.cc \ db/inlineskiplist_test.cc \ db/listener_test.cc \ - db/log_and_apply_bench.cc \ db/log_test.cc \ db/manual_compaction_test.cc \ db/memtablerep_bench.cc \ @@ -239,10 +246,7 @@ TEST_BENCH_SOURCES = \ tools/reduce_levels_test.cc \ tools/sst_dump_test.cc \ util/arena_test.cc \ - util/auto_roll_logger_test.cc \ util/autovector_test.cc \ - util/benchharness.cc \ - util/benchharness_test.cc \ util/bloom_test.cc \ util/cache_bench.cc \ util/cache_test.cc \ @@ -259,7 +263,7 @@ TEST_BENCH_SOURCES = \ utilities/geodb/geodb_test.cc \ utilities/memory/memory_test.cc \ utilities/merge_operators/string_append/stringappend_test.cc \ - utilities/options_util_test.cc \ + utilities/options/options_util_test.cc \ utilities/redis/redis_lists_test.cc \ utilities/spatialdb/spatial_db_test.cc \ utilities/table_properties_collectors/compact_on_deletion_collector_test.cc \ @@ -267,6 +271,7 @@ TEST_BENCH_SOURCES = \ utilities/transactions/transaction_test.cc \ utilities/ttl/ttl_test.cc \ utilities/write_batch_with_index/write_batch_with_index_test.cc \ + util/iostats_context_test.cc \ util/log_write_bench.cc \ util/memenv_test.cc \ util/mock_env_test.cc \ diff --git a/table/block.cc b/table/block.cc index c84dc173d..6e6cae576 100644 --- a/table/block.cc +++ b/table/block.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/block.h b/table/block.h index 4fe63add6..c3a26ef1f 100644 --- a/table/block.h +++ b/table/block.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/block_based_filter_block.cc b/table/block_based_filter_block.cc index 9992e9bd0..e65ee280d 100644 --- a/table/block_based_filter_block.cc +++ b/table/block_based_filter_block.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -19,18 +19,6 @@ namespace rocksdb { namespace { -bool SamePrefix(const SliceTransform* prefix_extractor, const Slice& key1, - const Slice& key2) { - if (!prefix_extractor->InDomain(key1) && !prefix_extractor->InDomain(key2)) { - return true; - } else if (!prefix_extractor->InDomain(key1) || - !prefix_extractor->InDomain(key2)) { - return false; - } else { - return (prefix_extractor->Transform(key1) == - prefix_extractor->Transform(key2)); - } -} void AppendItem(std::string* props, const std::string& key, const std::string& value) { @@ -77,7 +65,9 @@ BlockBasedFilterBlockBuilder::BlockBasedFilterBlockBuilder( const BlockBasedTableOptions& table_opt) : policy_(table_opt.filter_policy.get()), prefix_extractor_(prefix_extractor), - whole_key_filtering_(table_opt.whole_key_filtering) { + whole_key_filtering_(table_opt.whole_key_filtering), + prev_prefix_start_(0), + prev_prefix_size_(0) { assert(policy_); } @@ -90,14 +80,13 @@ void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) { } void BlockBasedFilterBlockBuilder::Add(const Slice& key) { - added_to_start_ = 0; - if (whole_key_filtering_) { - AddKey(key); - added_to_start_ = 1; - } if (prefix_extractor_ && prefix_extractor_->InDomain(key)) { AddPrefix(key); } + + if (whole_key_filtering_) { + AddKey(key); + } } // Add key to filter if needed @@ -110,19 +99,16 @@ inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) { inline void BlockBasedFilterBlockBuilder::AddPrefix(const Slice& key) { // get slice for most recently added entry Slice prev; - if (start_.size() > added_to_start_) { - size_t prev_start = start_[start_.size() - 1 - added_to_start_]; - const char* base = entries_.data() + prev_start; - size_t length = entries_.size() - prev_start; - prev = Slice(base, length); + if (prev_prefix_size_ > 0) { + prev = Slice(entries_.data() + prev_prefix_start_, prev_prefix_size_); } - // this assumes prefix(prefix(key)) == prefix(key), as the last - // entry in entries_ may be either a key 
or prefix, and we use - // prefix(last entry) to get the prefix of the last key. - if (prev.size() == 0 || !SamePrefix(prefix_extractor_, key, prev)) { - Slice prefix = prefix_extractor_->Transform(key); + Slice prefix = prefix_extractor_->Transform(key); + // insert prefix only when it's different from the previous prefix. + if (prev.size() == 0 || prefix != prev) { start_.push_back(entries_.size()); + prev_prefix_start_ = entries_.size(); + prev_prefix_size_ = prefix.size(); entries_.append(prefix.data(), prefix.size()); } } @@ -168,6 +154,8 @@ void BlockBasedFilterBlockBuilder::GenerateFilter() { tmp_entries_.clear(); entries_.clear(); start_.clear(); + prev_prefix_start_ = 0; + prev_prefix_size_ = 0; } BlockBasedFilterBlockReader::BlockBasedFilterBlockReader( diff --git a/table/block_based_filter_block.h b/table/block_based_filter_block.h index d339ac68a..a97309f2e 100644 --- a/table/block_based_filter_block.h +++ b/table/block_based_filter_block.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -55,9 +55,12 @@ class BlockBasedFilterBlockBuilder : public FilterBlockBuilder { const SliceTransform* prefix_extractor_; bool whole_key_filtering_; + size_t prev_prefix_start_; // the position of the last appended prefix + // to "entries_". + size_t prev_prefix_size_; // the length of the last appended prefix to + // "entries_". 
std::string entries_; // Flattened entry contents std::vector start_; // Starting index in entries_ of each entry - uint32_t added_to_start_; // To indicate if key is added std::string result_; // Filter data computed so far std::vector tmp_entries_; // policy_->CreateFilter() argument std::vector filter_offsets_; diff --git a/table/block_based_filter_block_test.cc b/table/block_based_filter_block_test.cc index 017de5906..d77def3d9 100644 --- a/table/block_based_filter_block_test.cc +++ b/table/block_based_filter_block_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 006908eaa..47d74bc5f 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -113,15 +113,17 @@ class IndexBuilder { // // Optimizations: // 1. Made block's `block_restart_interval` to be 1, which will avoid linear -// search when doing index lookup. +// search when doing index lookup (can be disabled by setting +// index_block_restart_interval). // 2. Shorten the key length for index block. Other than honestly using the // last key in the data block as the index key, we instead find a shortest // substitute key that serves the same function. 
class ShortenedIndexBuilder : public IndexBuilder { public: - explicit ShortenedIndexBuilder(const Comparator* comparator) + explicit ShortenedIndexBuilder(const Comparator* comparator, + int index_block_restart_interval) : IndexBuilder(comparator), - index_block_builder_(1 /* block_restart_interval == 1 */) {} + index_block_builder_(index_block_restart_interval) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, const Slice* first_key_in_next_block, @@ -178,9 +180,10 @@ class ShortenedIndexBuilder : public IndexBuilder { class HashIndexBuilder : public IndexBuilder { public: explicit HashIndexBuilder(const Comparator* comparator, - const SliceTransform* hash_key_extractor) + const SliceTransform* hash_key_extractor, + int index_block_restart_interval) : IndexBuilder(comparator), - primary_index_builder_(comparator), + primary_index_builder_(comparator, index_block_restart_interval), hash_key_extractor_(hash_key_extractor) {} virtual void AddIndexEntry(std::string* last_key_in_current_block, @@ -266,13 +269,16 @@ namespace { // Create a index builder based on its type. 
IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator, - const SliceTransform* prefix_extractor) { + const SliceTransform* prefix_extractor, + int index_block_restart_interval) { switch (type) { case BlockBasedTableOptions::kBinarySearch: { - return new ShortenedIndexBuilder(comparator); + return new ShortenedIndexBuilder(comparator, + index_block_restart_interval); } case BlockBasedTableOptions::kHashSearch: { - return new HashIndexBuilder(comparator, prefix_extractor); + return new HashIndexBuilder(comparator, prefix_extractor, + index_block_restart_interval); } default: { assert(!"Do not recognize the index type "); @@ -484,9 +490,10 @@ struct BlockBasedTableBuilder::Rep { data_block(table_options.block_restart_interval, table_options.use_delta_encoding), internal_prefix_transform(_ioptions.prefix_extractor), - index_builder(CreateIndexBuilder(table_options.index_type, - &internal_comparator, - &this->internal_prefix_transform)), + index_builder( + CreateIndexBuilder(table_options.index_type, &internal_comparator, + &this->internal_prefix_transform, + table_options.index_block_restart_interval)), compression_type(_compression_type), compression_opts(_compression_opts), filter_block(skip_filters ? nullptr : CreateFilterBlockBuilder( @@ -696,7 +703,6 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, if (type != kNoCompression && block_cache_compressed != nullptr) { - Cache::Handle* cache_handle = nullptr; size_t size = block_contents.size(); std::unique_ptr ubuf(new char[size + 1]); @@ -716,9 +722,8 @@ Status BlockBasedTableBuilder::InsertBlockInCache(const Slice& block_contents, (end - r->compressed_cache_key_prefix)); // Insert into compressed block cache. 
- cache_handle = block_cache_compressed->Insert( - key, block, block->usable_size(), &DeleteCachedBlock); - block_cache_compressed->Release(cache_handle); + block_cache_compressed->Insert(key, block, block->usable_size(), + &DeleteCachedBlock); // Invalidate OS cache. r->file->InvalidateCache(static_cast(r->offset), size); diff --git a/table/block_based_table_builder.h b/table/block_based_table_builder.h index 7dc93b754..49fe8dbf4 100644 --- a/table/block_based_table_builder.h +++ b/table/block_based_table_builder.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index a6484c4ee..c2617b168 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -42,6 +42,9 @@ BlockBasedTableFactory::BlockBasedTableFactory( if (table_options_.block_restart_interval < 1) { table_options_.block_restart_interval = 1; } + if (table_options_.index_block_restart_interval < 1) { + table_options_.index_block_restart_interval = 1; + } } Status BlockBasedTableFactory::NewTableReader( @@ -61,7 +64,7 @@ Status BlockBasedTableFactory::NewTableReader( table_reader_options.ioptions, table_reader_options.env_options, table_options_, table_reader_options.internal_comparator, std::move(file), file_size, table_reader, prefetch_enabled, - table_reader_options.skip_filters); + table_reader_options.skip_filters, table_reader_options.level); } TableBuilder* BlockBasedTableFactory::NewTableBuilder( @@ -91,6 +94,12 @@ Status BlockBasedTableFactory::SanitizeOptions( return Status::InvalidArgument("Enable cache_index_and_filter_blocks, " ", but block cache is disabled"); } + if (table_options_.pin_l0_filter_and_index_blocks_in_cache && + table_options_.no_block_cache) { + return Status::InvalidArgument( + "Enable pin_l0_filter_and_index_blocks_in_cache, " + ", but block cache is disabled"); + } if (!BlockBasedTableSupportedVersion(table_options_.format_version)) { return Status::InvalidArgument( "Unsupported BlockBasedTable format_version. 
Please check " @@ -107,11 +116,15 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const { snprintf(buffer, kBufferSize, " flush_block_policy_factory: %s (%p)\n", table_options_.flush_block_policy_factory->Name(), - table_options_.flush_block_policy_factory.get()); + static_cast(table_options_.flush_block_policy_factory.get())); ret.append(buffer); snprintf(buffer, kBufferSize, " cache_index_and_filter_blocks: %d\n", table_options_.cache_index_and_filter_blocks); ret.append(buffer); + snprintf(buffer, kBufferSize, + " pin_l0_filter_and_index_blocks_in_cache: %d\n", + table_options_.pin_l0_filter_and_index_blocks_in_cache); + ret.append(buffer); snprintf(buffer, kBufferSize, " index_type: %d\n", table_options_.index_type); ret.append(buffer); @@ -125,7 +138,7 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const { table_options_.no_block_cache); ret.append(buffer); snprintf(buffer, kBufferSize, " block_cache: %p\n", - table_options_.block_cache.get()); + static_cast(table_options_.block_cache.get())); ret.append(buffer); if (table_options_.block_cache) { snprintf(buffer, kBufferSize, " block_cache_size: %" ROCKSDB_PRIszt "\n", @@ -133,7 +146,7 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const { ret.append(buffer); } snprintf(buffer, kBufferSize, " block_cache_compressed: %p\n", - table_options_.block_cache_compressed.get()); + static_cast(table_options_.block_cache_compressed.get())); ret.append(buffer); if (table_options_.block_cache_compressed) { snprintf(buffer, kBufferSize, @@ -150,6 +163,9 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const { snprintf(buffer, kBufferSize, " block_restart_interval: %d\n", table_options_.block_restart_interval); ret.append(buffer); + snprintf(buffer, kBufferSize, " index_block_restart_interval: %d\n", + table_options_.index_block_restart_interval); + ret.append(buffer); snprintf(buffer, kBufferSize, " filter_policy: %s\n", table_options_.filter_policy == 
nullptr ? "nullptr" : table_options_.filter_policy->Name()); diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index 714a4f82a..6b4e563e2 100644 --- a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 4a358d361..0f9cf185c 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -98,17 +98,23 @@ void ReleaseCachedEntry(void* arg, void* h) { cache->Release(handle); } -Slice GetCacheKey(const char* cache_key_prefix, size_t cache_key_prefix_size, - const BlockHandle& handle, char* cache_key) { +Slice GetCacheKeyFromOffset(const char* cache_key_prefix, + size_t cache_key_prefix_size, uint64_t offset, + char* cache_key) { assert(cache_key != nullptr); assert(cache_key_prefix_size != 0); assert(cache_key_prefix_size <= kMaxCacheKeyPrefixSize); memcpy(cache_key, cache_key_prefix, cache_key_prefix_size); - char* end = - EncodeVarint64(cache_key + cache_key_prefix_size, handle.offset()); + char* end = EncodeVarint64(cache_key + cache_key_prefix_size, offset); return Slice(cache_key, static_cast(end - cache_key)); } +Slice GetCacheKey(const char* cache_key_prefix, size_t cache_key_prefix_size, + const BlockHandle& handle, char* cache_key) { + return GetCacheKeyFromOffset(cache_key_prefix, cache_key_prefix_size, + handle.offset(), cache_key); +} + Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key, Tickers block_cache_miss_ticker, Tickers block_cache_hit_ticker, @@ -334,6 +340,28 @@ class HashIndexReader : public IndexReader { BlockContents prefixes_contents_; }; +// CachableEntry represents the entries that *may* be fetched from block cache. +// field `value` is the item we want to get. +// field `cache_handle` is the cache handle to the block cache. If the value +// was not read from cache, `cache_handle` will be nullptr. +template +struct BlockBasedTable::CachableEntry { + CachableEntry(TValue* _value, Cache::Handle* _cache_handle) + : value(_value), cache_handle(_cache_handle) {} + CachableEntry() : CachableEntry(nullptr, nullptr) {} + void Release(Cache* cache) { + if (cache_handle) { + cache->Release(cache_handle); + value = nullptr; + cache_handle = nullptr; + } + } + bool IsSet() const { return cache_handle != nullptr; } + + TValue* value = nullptr; + // if the entry is from the cache, cache_handle will be populated. 
+ Cache::Handle* cache_handle = nullptr; +}; struct BlockBasedTable::Rep { Rep(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options, @@ -359,6 +387,8 @@ struct BlockBasedTable::Rep { size_t cache_key_prefix_size = 0; char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize]; size_t compressed_cache_key_prefix_size = 0; + uint64_t dummy_index_reader_offset = + 0; // ID that is unique for the block cache. // Footer contains the fixed table information Footer footer; @@ -386,42 +416,32 @@ struct BlockBasedTable::Rep { // and compatible with existing code, we introduce a wrapper that allows // block to extract prefix without knowing if a key is internal or not. unique_ptr internal_prefix_transform; + + // only used in level 0 files: + // when pin_l0_filter_and_index_blocks_in_cache is true, we do use the + // LRU cache, but we always keep the filter & idndex block's handle checked + // out here (=we don't call Release()), plus the parsed out objects + // the LRU cache will never push flush them out, hence they're pinned + CachableEntry filter_entry; + CachableEntry index_entry; }; BlockBasedTable::~BlockBasedTable() { + Close(); delete rep_; } -// CachableEntry represents the entries that *may* be fetched from block cache. -// field `value` is the item we want to get. -// field `cache_handle` is the cache handle to the block cache. If the value -// was not read from cache, `cache_handle` will be nullptr. -template -struct BlockBasedTable::CachableEntry { - CachableEntry(TValue* _value, Cache::Handle* _cache_handle) - : value(_value), cache_handle(_cache_handle) {} - CachableEntry() : CachableEntry(nullptr, nullptr) {} - void Release(Cache* cache) { - if (cache_handle) { - cache->Release(cache_handle); - value = nullptr; - cache_handle = nullptr; - } - } - - TValue* value = nullptr; - // if the entry is from the cache, cache_handle will be populated. 
- Cache::Handle* cache_handle = nullptr; -}; - // Helper function to setup the cache key's prefix for the Table. -void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) { +void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep, uint64_t file_size) { assert(kMaxCacheKeyPrefixSize >= 10); rep->cache_key_prefix_size = 0; rep->compressed_cache_key_prefix_size = 0; if (rep->table_options.block_cache != nullptr) { GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(), &rep->cache_key_prefix[0], &rep->cache_key_prefix_size); + // Create dummy offset of index reader which is beyond the file size. + rep->dummy_index_reader_offset = + file_size + rep->table_options.block_cache->NewId(); } if (rep->table_options.block_cache_compressed != nullptr) { GenerateCachePrefix(rep->table_options.block_cache_compressed.get(), @@ -487,7 +507,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, uint64_t file_size, unique_ptr* table_reader, const bool prefetch_index_and_filter, - const bool skip_filters) { + const bool skip_filters, const int level) { table_reader->reset(); Footer footer; @@ -510,7 +530,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, rep->footer = footer; rep->index_type = table_options.index_type; rep->hash_index_allow_collision = table_options.hash_index_allow_collision; - SetupCacheKeyPrefix(rep); + SetupCacheKeyPrefix(rep, file_size); unique_ptr new_table(new BlockBasedTable(rep)); // Read meta index @@ -583,14 +603,33 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, assert(table_options.block_cache != nullptr); // Hack: Call NewIndexIterator() to implicitly add index to the // block_cache + + // if pin_l0_filter_and_index_blocks_in_cache is true and this is + // a level0 file, then we will pass in this pointer to rep->index + // to NewIndexIterator(), which will save the index block in there + // else it's a nullptr and nothing special happens + CachableEntry* index_entry = nullptr; + if 
(rep->table_options.pin_l0_filter_and_index_blocks_in_cache && + level == 0) { + index_entry = &rep->index_entry; + } unique_ptr iter( - new_table->NewIndexIterator(ReadOptions())); + new_table->NewIndexIterator(ReadOptions(), nullptr, index_entry)); s = iter->status(); if (s.ok()) { // Hack: Call GetFilter() to implicitly add filter to the block_cache auto filter_entry = new_table->GetFilter(); - filter_entry.Release(table_options.block_cache.get()); + // if pin_l0_filter_and_index_blocks_in_cache is true, and this is + // a level0 file, then save it in rep_->filter_entry; it will be + // released in the destructor only, hence it will be pinned in the + // cache until this reader is alive + if (rep->table_options.pin_l0_filter_and_index_blocks_in_cache && + level == 0) { + rep->filter_entry = filter_entry; + } else { + filter_entry.Release(table_options.block_cache.get()); + } } } else { // If we don't use block cache for index/filter blocks access, we'll @@ -740,11 +779,16 @@ Status BlockBasedTable::GetDataBlockFromCache( assert(block->value->compression_type() == kNoCompression); if (block_cache != nullptr && block->value->cachable() && read_options.fill_cache) { - block->cache_handle = block_cache->Insert(block_cache_key, block->value, - block->value->usable_size(), - &DeleteCachedEntry); - assert(reinterpret_cast( - block_cache->Value(block->cache_handle)) == block->value); + s = block_cache->Insert( + block_cache_key, block->value, block->value->usable_size(), + &DeleteCachedEntry, &(block->cache_handle)); + if (s.ok()) { + RecordTick(statistics, BLOCK_CACHE_ADD); + } else { + RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); + delete block->value; + block->value = nullptr; + } } } @@ -784,27 +828,37 @@ Status BlockBasedTable::PutDataBlockToCache( // Release the hold on the compressed cache entry immediately. 
if (block_cache_compressed != nullptr && raw_block != nullptr && raw_block->cachable()) { - auto cache_handle = block_cache_compressed->Insert( - compressed_block_cache_key, raw_block, raw_block->usable_size(), - &DeleteCachedEntry); - block_cache_compressed->Release(cache_handle); - RecordTick(statistics, BLOCK_CACHE_COMPRESSED_MISS); - // Avoid the following code to delete this cached block. - raw_block = nullptr; + s = block_cache_compressed->Insert(compressed_block_cache_key, raw_block, + raw_block->usable_size(), + &DeleteCachedEntry); + if (s.ok()) { + // Avoid the following code to delete this cached block. + raw_block = nullptr; + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD); + } else { + RecordTick(statistics, BLOCK_CACHE_COMPRESSED_ADD_FAILURES); + } } delete raw_block; // insert into uncompressed block cache assert((block->value->compression_type() == kNoCompression)); if (block_cache != nullptr && block->value->cachable()) { - block->cache_handle = block_cache->Insert(block_cache_key, block->value, - block->value->usable_size(), - &DeleteCachedEntry); - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, - block->value->usable_size()); - assert(reinterpret_cast(block_cache->Value(block->cache_handle)) == - block->value); + s = block_cache->Insert(block_cache_key, block->value, + block->value->usable_size(), + &DeleteCachedEntry, &(block->cache_handle)); + if (s.ok()) { + assert(block->cache_handle != nullptr); + RecordTick(statistics, BLOCK_CACHE_ADD); + RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, + block->value->usable_size()); + assert(reinterpret_cast( + block_cache->Value(block->cache_handle)) == block->value); + } else { + RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); + delete block->value; + block->value = nullptr; + } } return s; @@ -860,6 +914,11 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( return {rep_->filter.get(), nullptr /* cache handle */}; } + // we have a pinned filter 
block + if (rep_->filter_entry.IsSet()) { + return rep_->filter_entry; + } + PERF_TIMER_GUARD(read_filter_block_nanos); Cache* block_cache = rep_->table_options.block_cache.get(); @@ -891,10 +950,17 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( filter = ReadFilter(rep_, &filter_size); if (filter != nullptr) { assert(filter_size > 0); - cache_handle = block_cache->Insert(key, filter, filter_size, - &DeleteCachedEntry); - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, filter_size); + Status s = block_cache->Insert(key, filter, filter_size, + &DeleteCachedEntry, + &cache_handle); + if (s.ok()) { + RecordTick(statistics, BLOCK_CACHE_ADD); + RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, filter_size); + } else { + RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); + delete filter; + return CachableEntry(); + } } } @@ -902,19 +968,27 @@ BlockBasedTable::CachableEntry BlockBasedTable::GetFilter( } InternalIterator* BlockBasedTable::NewIndexIterator( - const ReadOptions& read_options, BlockIter* input_iter) { + const ReadOptions& read_options, BlockIter* input_iter, + CachableEntry* index_entry) { // index reader has already been pre-populated. 
if (rep_->index_reader) { return rep_->index_reader->NewIterator( input_iter, read_options.total_order_seek); } + // we have a pinned index block + if (rep_->index_entry.IsSet()) { + return rep_->index_entry.value->NewIterator(input_iter, + read_options.total_order_seek); + } + PERF_TIMER_GUARD(read_index_block_nanos); bool no_io = read_options.read_tier == kBlockCacheTier; Cache* block_cache = rep_->table_options.block_cache.get(); char cache_key[kMaxCacheKeyPrefixSize + kMaxVarint64Length]; - auto key = GetCacheKey(rep_->cache_key_prefix, rep_->cache_key_prefix_size, - rep_->footer.index_handle(), cache_key); + auto key = + GetCacheKeyFromOffset(rep_->cache_key_prefix, rep_->cache_key_prefix_size, + rep_->dummy_index_reader_offset, cache_key); Statistics* statistics = rep_->ioptions.statistics; auto cache_handle = GetEntryFromCache(block_cache, key, BLOCK_CACHE_INDEX_MISS, @@ -937,10 +1011,18 @@ InternalIterator* BlockBasedTable::NewIndexIterator( // Create index reader and put it in the cache. Status s; s = CreateIndexReader(&index_reader); + if (s.ok()) { + s = block_cache->Insert(key, index_reader, index_reader->usable_size(), + &DeleteCachedEntry, &cache_handle); + } - if (!s.ok()) { + if (s.ok()) { + RecordTick(statistics, BLOCK_CACHE_ADD); + RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, + index_reader->usable_size()); + } else { + RecordTick(statistics, BLOCK_CACHE_ADD_FAILURES); // make sure if something goes wrong, index_reader shall remain intact. 
- assert(index_reader == nullptr); if (input_iter != nullptr) { input_iter->SetStatus(s); return input_iter; @@ -949,18 +1031,20 @@ InternalIterator* BlockBasedTable::NewIndexIterator( } } - cache_handle = - block_cache->Insert(key, index_reader, index_reader->usable_size(), - &DeleteCachedEntry); - RecordTick(statistics, BLOCK_CACHE_ADD); - RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, - index_reader->usable_size()); } assert(cache_handle); auto* iter = index_reader->NewIterator( input_iter, read_options.total_order_seek); - iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle); + + // the caller would like to take ownership of the index block + // don't call RegisterCleanup() in this case, the caller will take care of it + if (index_entry != nullptr) { + *index_entry = {index_reader, cache_handle}; + } else { + iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, cache_handle); + } + return iter; } @@ -1036,7 +1120,7 @@ InternalIterator* BlockBasedTable::NewDataBlockIterator( } // Didn't get any data from block caches. 
- if (block.value == nullptr) { + if (s.ok() && block.value == nullptr) { if (no_io) { // Could not read from block_cache and can't do IO if (input_iter != nullptr) { @@ -1055,7 +1139,7 @@ InternalIterator* BlockBasedTable::NewDataBlockIterator( } InternalIterator* iter; - if (block.value != nullptr) { + if (s.ok() && block.value != nullptr) { iter = block.value->NewIterator(&rep->internal_comparator, input_iter); if (block.cache_handle != nullptr) { iter->RegisterCleanup(&ReleaseCachedEntry, block_cache, @@ -1120,8 +1204,11 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { } assert(rep_->ioptions.prefix_extractor != nullptr); - auto prefix = rep_->ioptions.prefix_extractor->Transform( - ExtractUserKey(internal_key)); + auto user_key = ExtractUserKey(internal_key); + if (!rep_->ioptions.prefix_extractor->InDomain(user_key)) { + return true; + } + auto prefix = rep_->ioptions.prefix_extractor->Transform(user_key); InternalKey internal_key_prefix(prefix, kMaxSequenceNumber, kTypeValue); auto internal_prefix = internal_key_prefix.Encode(); @@ -1185,7 +1272,13 @@ bool BlockBasedTable::PrefixMayMatch(const Slice& internal_key) { RecordTick(statistics, BLOOM_FILTER_PREFIX_USEFUL); } - filter_entry.Release(rep_->table_options.block_cache.get()); + // if rep_->filter_entry is not set, we should call Release(); otherwise + // don't call, in this case we have a local copy in rep_->filter_entry, + // it's pinned to the cache and will be released in the destructor + if (!rep_->filter_entry.IsSet()) { + filter_entry.Release(rep_->table_options.block_cache.get()); + } + return may_match; } @@ -1207,6 +1300,7 @@ bool BlockBasedTable::FullFilterKeyMayMatch(FilterBlockReader* filter, return false; } if (rep_->ioptions.prefix_extractor && + rep_->ioptions.prefix_extractor->InDomain(user_key) && !filter->PrefixMayMatch( rep_->ioptions.prefix_extractor->Transform(user_key))) { return false; @@ -1251,7 +1345,8 @@ Status BlockBasedTable::Get(const ReadOptions& 
read_options, const Slice& key, BlockIter biter; NewDataBlockIterator(rep_, read_options, iiter.value(), &biter); - if (read_options.read_tier && biter.status().IsIncomplete()) { + if (read_options.read_tier == kBlockCacheTier && + biter.status().IsIncomplete()) { // couldn't get block from block_cache // Update Saver.state to Found because we are only looking for whether // we can guarantee the key is not there when "no_io" is set @@ -1283,7 +1378,12 @@ Status BlockBasedTable::Get(const ReadOptions& read_options, const Slice& key, } } - filter_entry.Release(rep_->table_options.block_cache.get()); + // if rep_->filter_entry is not set, we should call Release(); otherwise + // don't call, in this case we have a local copy in rep_->filter_entry, + // it's pinned to the cache and will be released in the destructor + if (!rep_->filter_entry.IsSet()) { + filter_entry.Release(rep_->table_options.block_cache.get()); + } return s; } @@ -1571,6 +1671,11 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { return s; } +void BlockBasedTable::Close() { + rep_->filter_entry.Release(rep_->table_options.block_cache.get()); + rep_->index_entry.Release(rep_->table_options.block_cache.get()); +} + Status BlockBasedTable::DumpIndexBlock(WritableFile* out_file) { out_file->Append( "Index Details:\n" diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index f8d0649e0..6a88d9d9a 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -76,7 +76,7 @@ class BlockBasedTable : public TableReader { unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader, bool prefetch_index_and_filter = true, - bool skip_filters = false); + bool skip_filters = false, int level = -1); bool PrefixMayMatch(const Slice& internal_key); @@ -119,6 +119,8 @@ class BlockBasedTable : public TableReader { // convert SST file to a human readable form Status DumpTable(WritableFile* out_file) override; + void Close() override; + ~BlockBasedTable(); bool TEST_filter_block_preloaded() const; @@ -155,8 +157,9 @@ class BlockBasedTable : public TableReader { // 2. index is not present in block cache. // 3. We disallowed any io to be performed, that is, read_options == // kBlockCacheTier - InternalIterator* NewIndexIterator(const ReadOptions& read_options, - BlockIter* input_iter = nullptr); + InternalIterator* NewIndexIterator( + const ReadOptions& read_options, BlockIter* input_iter = nullptr, + CachableEntry* index_entry = nullptr); // Read block cache from block caches (if set): block_cache and // block_cache_compressed. @@ -207,7 +210,7 @@ class BlockBasedTable : public TableReader { // Create the filter from the filter block. static FilterBlockReader* ReadFilter(Rep* rep, size_t* filter_size = nullptr); - static void SetupCacheKeyPrefix(Rep* rep); + static void SetupCacheKeyPrefix(Rep* rep, uint64_t file_size); explicit BlockBasedTable(Rep* rep) : rep_(rep), compaction_optimized_(false) {} diff --git a/table/block_builder.cc b/table/block_builder.cc index 846d62369..aa9d46669 100644 --- a/table/block_builder.cc +++ b/table/block_builder.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/table/block_builder.h b/table/block_builder.h index 9eec4ce33..f9ced8ad4 100644 --- a/table/block_builder.h +++ b/table/block_builder.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/block_hash_index.cc b/table/block_hash_index.cc index b38cc8a57..9b73ff2af 100644 --- a/table/block_hash_index.cc +++ b/table/block_hash_index.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/block_hash_index.h b/table/block_hash_index.h index fc110d54a..ceaed626f 100644 --- a/table/block_hash_index.h +++ b/table/block_hash_index.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/block_hash_index_test.cc b/table/block_hash_index_test.cc index ffca663d1..b51d3ef25 100644 --- a/table/block_hash_index_test.cc +++ b/table/block_hash_index_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/block_prefix_index.cc b/table/block_prefix_index.cc index 147bcf56e..bc6465a32 100644 --- a/table/block_prefix_index.cc +++ b/table/block_prefix_index.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/block_prefix_index.h b/table/block_prefix_index.h index bc36c48f6..d9c3b97e0 100644 --- a/table/block_prefix_index.h +++ b/table/block_prefix_index.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/block_test.cc b/table/block_test.cc index e9c0179c1..7c36cf1ca 100644 --- a/table/block_test.cc +++ b/table/block_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/table/bloom_block.cc b/table/bloom_block.cc index cfea8a2c5..7eef9cc05 100644 --- a/table/bloom_block.cc +++ b/table/bloom_block.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/bloom_block.h b/table/bloom_block.h index 5b60d2bca..5ba74601f 100644 --- a/table/bloom_block.h +++ b/table/bloom_block.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index 475055fcb..f2f71b78e 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -246,7 +246,7 @@ Status CuckooTableBuilder::Finish() { if (num_entries_ > 0) { // Calculate the real hash size if module hash is enabled. 
if (use_module_hash_) { - hash_table_size_ = + hash_table_size_ = static_cast(num_entries_ / max_hash_table_ratio_); } s = MakeHashTable(&buckets); diff --git a/table/cuckoo_table_builder.h b/table/cuckoo_table_builder.h index 093e1c245..8e6873e88 100644 --- a/table/cuckoo_table_builder.h +++ b/table/cuckoo_table_builder.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo_table_builder_test.cc index a3cd21224..fef7bc3a5 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo_table_builder_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/cuckoo_table_factory.cc b/table/cuckoo_table_factory.cc index 2b9407f2f..bf1561a87 100644 --- a/table/cuckoo_table_factory.cc +++ b/table/cuckoo_table_factory.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/table/cuckoo_table_factory.h b/table/cuckoo_table_factory.h index 3f89ca86d..82aa57150 100644 --- a/table/cuckoo_table_factory.h +++ b/table/cuckoo_table_factory.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index b1f910181..fb1aef2e8 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/cuckoo_table_reader.h b/table/cuckoo_table_reader.h index b936e70c8..5e3e5528a 100644 --- a/table/cuckoo_table_reader.h +++ b/table/cuckoo_table_reader.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index c398b1383..aa4a93d40 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/filter_block.h b/table/filter_block.h index 855a23169..e326018f7 100644 --- a/table/filter_block.h +++ b/table/filter_block.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/flush_block_policy.cc b/table/flush_block_policy.cc index 4c12b30bb..fa4c3e331 100644 --- a/table/flush_block_policy.cc +++ b/table/flush_block_policy.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/format.cc b/table/format.cc index a58bbee24..bb028c99a 100644 --- a/table/format.cc +++ b/table/format.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/format.h b/table/format.h index 74ec808c6..48bcf6785 100644 --- a/table/format.h +++ b/table/format.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/full_filter_block.cc b/table/full_filter_block.cc index 3744d417f..11c8a016c 100644 --- a/table/full_filter_block.cc +++ b/table/full_filter_block.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/full_filter_block.h b/table/full_filter_block.h index 1ecc07a01..27e10eba1 100644 --- a/table/full_filter_block.h +++ b/table/full_filter_block.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/full_filter_block_test.cc b/table/full_filter_block_test.cc index 0275a6ca6..5840cb035 100644 --- a/table/full_filter_block_test.cc +++ b/table/full_filter_block_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/table/get_context.cc b/table/get_context.cc index 59d44f3a5..39b841e25 100644 --- a/table/get_context.cc +++ b/table/get_context.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/get_context.h b/table/get_context.h index c06c3c8d4..283df90c8 100644 --- a/table/get_context.h +++ b/table/get_context.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/internal_iterator.h b/table/internal_iterator.h index cc2430ca0..d487b3124 100644 --- a/table/internal_iterator.h +++ b/table/internal_iterator.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -6,6 +6,7 @@ #pragma once +#include #include "rocksdb/iterator.h" #include "rocksdb/status.h" @@ -78,6 +79,10 @@ class InternalIterator : public Cleanable { // set to false. 
virtual bool IsKeyPinned() const { return false; } + virtual Status GetProperty(std::string prop_name, std::string* prop) { + return Status::NotSupported(""); + } + private: // No copying allowed InternalIterator(const InternalIterator&) = delete; diff --git a/table/iter_heap.h b/table/iter_heap.h index 5343175c3..642383345 100644 --- a/table/iter_heap.h +++ b/table/iter_heap.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/iterator.cc b/table/iterator.cc index 2db321edd..09f7f8e68 100644 --- a/table/iterator.cc +++ b/table/iterator.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -46,6 +46,17 @@ void Cleanable::RegisterCleanup(CleanupFunction func, void* arg1, void* arg2) { c->arg2 = arg2; } +Status Iterator::GetProperty(std::string prop_name, std::string* prop) { + if (prop == nullptr) { + return Status::InvalidArgument("prop is nullptr"); + } + if (prop_name == "rocksdb.iterator.is-key-pinned") { + *prop = "0"; + return Status::OK(); + } + return Status::InvalidArgument("Undentified property."); +} + namespace { class EmptyIterator : public Iterator { public: diff --git a/table/iterator_wrapper.h b/table/iterator_wrapper.h index eef300499..0a0a819d7 100644 --- a/table/iterator_wrapper.h +++ b/table/iterator_wrapper.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/merger.cc b/table/merger.cc index 81eb9608c..1cc80ec8d 100644 --- a/table/merger.cc +++ b/table/merger.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/merger.h b/table/merger.h index 5ea624648..7291a0378 100644 --- a/table/merger.h +++ b/table/merger.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/merger_test.cc b/table/merger_test.cc index e9397dc1d..97979af7c 100644 --- a/table/merger_test.cc +++ b/table/merger_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 505dbacd0..b94d0b6ea 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/meta_blocks.h b/table/meta_blocks.h index 085ae308e..ab4f7e127 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/mock_table.cc b/table/mock_table.cc index 4525994d3..7d5cefa78 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -1,6 +1,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/mock_table.h b/table/mock_table.h index 1b822d783..4352a2c7a 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -1,6 +1,6 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc index 2306a7efb..efaf47b07 100644 --- a/table/plain_table_builder.cc +++ b/table/plain_table_builder.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h index 5c0cad977..02fb8d87a 100644 --- a/table/plain_table_builder.h +++ b/table/plain_table_builder.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/plain_table_index.cc b/table/plain_table_index.cc index 4f4ebabf1..c8081c006 100644 --- a/table/plain_table_index.cc +++ b/table/plain_table_index.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -117,7 +117,7 @@ void PlainTableIndexBuilder::AllocateIndex() { index_size_ = 1; } else { double hash_table_size_multipier = 1.0 / hash_table_ratio_; - index_size_ = + index_size_ = static_cast(num_prefixes_ * hash_table_size_multipier) + 1; assert(index_size_ > 0); } diff --git a/table/plain_table_index.h b/table/plain_table_index.h index be8ad1639..ab2be3d1e 100644 --- a/table/plain_table_index.h +++ b/table/plain_table_index.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/plain_table_key_coding.cc b/table/plain_table_key_coding.cc index b1aa14d8d..8442f1129 100644 --- a/table/plain_table_key_coding.cc +++ b/table/plain_table_key_coding.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/plain_table_key_coding.h b/table/plain_table_key_coding.h index e2dc7dff4..ed4ce5d38 100644 --- a/table/plain_table_key_coding.h +++ b/table/plain_table_key_coding.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/table/scoped_arena_iterator.h b/table/scoped_arena_iterator.h index 0372b5691..5629ba5aa 100644 --- a/table/scoped_arena_iterator.h +++ b/table/scoped_arena_iterator.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/sst_file_writer.cc b/table/sst_file_writer.cc index 1c21a25f7..58ca0e84a 100644 --- a/table/sst_file_writer.cc +++ b/table/sst_file_writer.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -163,6 +163,9 @@ Status SstFileWriter::Finish(ExternalSstFileInfo* file_info) { if (!r->builder) { return Status::InvalidArgument("File is not opened"); } + if (r->file_info.num_entries == 0) { + return Status::InvalidArgument("Cannot create sst file with no entries"); + } Status s = r->builder->Finish(); if (s.ok()) { diff --git a/table/table_builder.h b/table/table_builder.h index 930c99f10..274245f08 100644 --- a/table/table_builder.h +++ b/table/table_builder.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -29,17 +29,20 @@ struct TableReaderOptions { TableReaderOptions(const ImmutableCFOptions& _ioptions, const EnvOptions& _env_options, const InternalKeyComparator& _internal_comparator, - bool _skip_filters = false) + bool _skip_filters = false, int _level = -1) : ioptions(_ioptions), env_options(_env_options), internal_comparator(_internal_comparator), - skip_filters(_skip_filters) {} + skip_filters(_skip_filters), + level(_level) {} const ImmutableCFOptions& ioptions; const EnvOptions& env_options; const InternalKeyComparator& internal_comparator; // This is only used for BlockBasedTable (reader) bool skip_filters; + // what level this table/file is on, -1 for "not set, don't know" + int level; }; struct TableBuilderOptions { diff --git a/table/table_properties.cc b/table/table_properties.cc index 7a51779fe..5bf3e0103 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/table_properties_internal.h b/table/table_properties_internal.h index 10f38cdf2..77042acbb 100644 --- a/table/table_properties_internal.h +++ b/table/table_properties_internal.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/table/table_reader.h b/table/table_reader.h index 2fef5df30..c047bf8cb 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -91,6 +91,8 @@ class TableReader { virtual Status DumpTable(WritableFile* out_file) { return Status::NotSupported("DumpTable() not supported"); } + + virtual void Close() {} }; } // namespace rocksdb diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index cee0d10e3..1a7d7b5a7 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/table/table_test.cc b/table/table_test.cc index 0a84f2750..424ca005e 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -637,6 +637,7 @@ class HarnessTest : public testing::Test { new FlushBlockBySizePolicyFactory()); table_options_.block_size = 256; table_options_.block_restart_interval = args.restart_interval; + table_options_.index_block_restart_interval = args.restart_interval; table_options_.format_version = args.format_version; options_.table_factory.reset( new BlockBasedTableFactory(table_options_)); @@ -1714,7 +1715,7 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) { ImmutableCFOptions ioptions3(options); // Generate table without filter policy c3.Finish(options, ioptions3, table_options, - GetPlainInternalComparator(options.comparator), &keys, &kvmap); + GetPlainInternalComparator(options.comparator), &keys, &kvmap); // Open table with filter policy table_options.filter_policy.reset(NewBloomFilterPolicy(1)); options.table_factory.reset(new BlockBasedTableFactory(table_options)); @@ -2282,6 +2283,149 @@ TEST_F(HarnessTest, FooterTests) { } } +class IndexBlockRestartIntervalTest + : public BlockBasedTableTest, + public ::testing::WithParamInterface { + public: + static std::vector GetRestartValues() { return {-1, 0, 1, 8, 16, 32}; } +}; + +INSTANTIATE_TEST_CASE_P( + IndexBlockRestartIntervalTest, IndexBlockRestartIntervalTest, + ::testing::ValuesIn(IndexBlockRestartIntervalTest::GetRestartValues())); + +TEST_P(IndexBlockRestartIntervalTest, IndexBlockRestartInterval) { + const int kKeysInTable = 10000; + const int kKeySize = 100; + const int kValSize = 500; + + int index_block_restart_interval = GetParam(); + + Options options; + BlockBasedTableOptions table_options; + table_options.block_size = 64; // small block size to get big index block + table_options.index_block_restart_interval = index_block_restart_interval; + options.table_factory.reset(new BlockBasedTableFactory(table_options)); + + TableConstructor c(BytewiseComparator()); + static Random rnd(301); + for (int i = 0; i < kKeysInTable; i++) { + InternalKey k(RandomString(&rnd, kKeySize), 0, kTypeValue); + 
c.Add(k.Encode().ToString(), RandomString(&rnd, kValSize)); + } + + std::vector keys; + stl_wrappers::KVMap kvmap; + std::unique_ptr comparator( + new InternalKeyComparator(BytewiseComparator())); + const ImmutableCFOptions ioptions(options); + c.Finish(options, ioptions, table_options, *comparator, &keys, &kvmap); + auto reader = c.GetTableReader(); + + std::unique_ptr db_iter(reader->NewIterator(ReadOptions())); + + // Test point lookup + for (auto& kv : kvmap) { + db_iter->Seek(kv.first); + + ASSERT_TRUE(db_iter->Valid()); + ASSERT_OK(db_iter->status()); + ASSERT_EQ(db_iter->key(), kv.first); + ASSERT_EQ(db_iter->value(), kv.second); + } + + // Test iterating + auto kv_iter = kvmap.begin(); + for (db_iter->SeekToFirst(); db_iter->Valid(); db_iter->Next()) { + ASSERT_EQ(db_iter->key(), kv_iter->first); + ASSERT_EQ(db_iter->value(), kv_iter->second); + kv_iter++; + } + ASSERT_EQ(kv_iter, kvmap.end()); +} + +class PrefixTest : public testing::Test { + public: + PrefixTest() : testing::Test() {} + ~PrefixTest() {} +}; + +namespace { +// A simple PrefixExtractor that only works for test PrefixAndWholeKeyTest +class TestPrefixExtractor : public rocksdb::SliceTransform { + public: + ~TestPrefixExtractor() override{}; + const char* Name() const override { return "TestPrefixExtractor"; } + + rocksdb::Slice Transform(const rocksdb::Slice& src) const override { + assert(IsValid(src)); + return rocksdb::Slice(src.data(), 3); + } + + bool InDomain(const rocksdb::Slice& src) const override { + assert(IsValid(src)); + return true; + } + + bool InRange(const rocksdb::Slice& dst) const override { return true; } + + bool IsValid(const rocksdb::Slice& src) const { + if (src.size() != 4) { + return false; + } + if (src[0] != '[') { + return false; + } + if (src[1] < '0' || src[1] > '9') { + return false; + } + if (src[2] != ']') { + return false; + } + if (src[3] < '0' || src[3] > '9') { + return false; + } + return true; + } +}; +} // namespace + +TEST_F(PrefixTest, 
PrefixAndWholeKeyTest) { + rocksdb::Options options; + options.compaction_style = rocksdb::kCompactionStyleUniversal; + options.num_levels = 20; + options.create_if_missing = true; + options.optimize_filters_for_hits = false; + options.target_file_size_base = 268435456; + options.prefix_extractor = std::make_shared(); + rocksdb::BlockBasedTableOptions bbto; + bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(10)); + bbto.block_size = 262144; + + bbto.whole_key_filtering = true; + + const std::string kDBPath = test::TmpDir() + "/prefix_test"; + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + DestroyDB(kDBPath, options); + rocksdb::DB* db; + ASSERT_OK(rocksdb::DB::Open(options, kDBPath, &db)); + + // Create a bunch of keys with 10 filters. + for (int i = 0; i < 10; i++) { + std::string prefix = "[" + std::to_string(i) + "]"; + for (int j = 0; j < 10; j++) { + std::string key = prefix + std::to_string(j); + db->Put(rocksdb::WriteOptions(), key, "1"); + } + } + + // Trigger compaction. + db->CompactRange(CompactRangeOptions(), nullptr, nullptr); + delete db; + // In the second round, turn whole_key_filtering off and expect + // rocksdb still works. +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/table/two_level_iterator.cc b/table/two_level_iterator.cc index a01c12007..2656b317a 100644 --- a/table/two_level_iterator.cc +++ b/table/two_level_iterator.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/table/two_level_iterator.h b/table/two_level_iterator.h index ed5380bd4..d210132cb 100644 --- a/table/two_level_iterator.h +++ b/table/two_level_iterator.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/third-party/fbson/FbsonDocument.h b/third-party/fbson/FbsonDocument.h index c70f9ecb2..9a00e2471 100644 --- a/third-party/fbson/FbsonDocument.h +++ b/third-party/fbson/FbsonDocument.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014, Facebook, Inc. + * Copyright (c) 2011-present, Facebook, Inc. * All rights reserved. * * This source code is licensed under the BSD-style license found in the diff --git a/third-party/fbson/FbsonJsonParser.h b/third-party/fbson/FbsonJsonParser.h index 1c9c8ed6e..73c1febfa 100644 --- a/third-party/fbson/FbsonJsonParser.h +++ b/third-party/fbson/FbsonJsonParser.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014, Facebook, Inc. + * Copyright (c) 2011-present, Facebook, Inc. * All rights reserved. * * This source code is licensed under the BSD-style license found in the diff --git a/third-party/fbson/FbsonStream.h b/third-party/fbson/FbsonStream.h index 22851240d..5f70221db 100644 --- a/third-party/fbson/FbsonStream.h +++ b/third-party/fbson/FbsonStream.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014, Facebook, Inc. + * Copyright (c) 2011-present, Facebook, Inc. * All rights reserved. * * This source code is licensed under the BSD-style license found in the diff --git a/third-party/fbson/FbsonUtil.h b/third-party/fbson/FbsonUtil.h index ab965630d..2c4154769 100644 --- a/third-party/fbson/FbsonUtil.h +++ b/third-party/fbson/FbsonUtil.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014, Facebook, Inc. 
+ * Copyright (c) 2011-present, Facebook, Inc. * All rights reserved. * * This source code is licensed under the BSD-style license found in the diff --git a/third-party/fbson/FbsonWriter.h b/third-party/fbson/FbsonWriter.h index 21bd6f232..4efaf817c 100644 --- a/third-party/fbson/FbsonWriter.h +++ b/third-party/fbson/FbsonWriter.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014, Facebook, Inc. + * Copyright (c) 2011-present, Facebook, Inc. * All rights reserved. * * This source code is licensed under the BSD-style license found in the diff --git a/third-party/gtest-1.7.0/fused-src/gtest/gtest.h b/third-party/gtest-1.7.0/fused-src/gtest/gtest.h index 2756b47d5..e3f0cfb95 100644 --- a/third-party/gtest-1.7.0/fused-src/gtest/gtest.h +++ b/third-party/gtest-1.7.0/fused-src/gtest/gtest.h @@ -7682,7 +7682,7 @@ namespace edit_distance { // Returns the optimal edits to go from 'left' to 'right'. // All edits cost the same, with replace having lower priority than // add/remove. -// Simple implementation of the Wagner–Fischer algorithm. +// Simple implementation of the Wagner-Fischer algorithm. 
// See http://en.wikipedia.org/wiki/Wagner-Fischer_algorithm enum EditType { kMatch, kAdd, kRemove, kReplace }; GTEST_API_ std::vector CalculateOptimalEdits( @@ -17586,7 +17586,7 @@ internal::CartesianProductHolder10()); \ return 0; \ } \ - static int gtest_registering_dummy_; \ + static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \ GTEST_DISALLOW_COPY_AND_ASSIGN_(\ GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \ }; \ diff --git a/thirdparty.inc b/thirdparty.inc index 46da30c52..e10bdaa4e 100644 --- a/thirdparty.inc +++ b/thirdparty.inc @@ -8,6 +8,7 @@ set(USE_SNAPPY_DEFAULT 0) # SNAPPY is disabled by default, enable with -D set(USE_LZ4_DEFAULT 0) # LZ4 is disabled by default, enable with -DLZ4=1 cmake command line agrument set(USE_ZLIB_DEFAULT 0) # ZLIB is disabled by default, enable with -DZLIB=1 cmake command line agrument set(USE_JEMALLOC_DEFAULT 0) # JEMALLOC is disabled by default, enable with -DJEMALLOC=1 cmake command line agrument +set(USE_JENONINIT_DEFAULT 1) # Default is enabled do not call je_init/je_uninit as the newer versions do not have it disable with -DJENONINIT=0 # # This example assumes all the libraries locate in directories under THIRDPARTY_HOME environment variable @@ -208,7 +209,7 @@ endif () if (${USE_JEMALLOC} EQUAL 1) message(STATUS "JEMALLOC library is enabled") - set(JEMALLOC_CXX_FLAGS -DJEMALLOC) + set(JEMALLOC_CXX_FLAGS "-DJEMALLOC -DJEMALLOC_EXPORT= ") if(DEFINED ENV{JEMALLOC_INCLUDE}) set(JEMALLOC_INCLUDE $ENV{JEMALLOC_INCLUDE}) @@ -228,6 +229,18 @@ if (${USE_JEMALLOC} EQUAL 1) include_directories(${JEMALLOC_INCLUDE}) set (THIRDPARTY_LIBS ${THIRDPARTY_LIBS} ${JEMALLOC_LIBS}) set (ARTIFACT_SUFFIX "_je") + + set(USE_JENONINIT USE_JENONINIT_DEFAULT) + + if(JENONINIT) + set(USE_JENONINIT ${JENONINIT}) + endif() + + if(${USE_JENONINIT} EQUAL 1) + add_definitions(-DJEMALLOC_NON_INIT) + message(STATUS "JEMALLOC NONINIT version") + endif() + else () set (ARTIFACT_SUFFIX "") message(STATUS "JEMALLOC library is disabled") diff 
--git a/tools/benchmark.sh b/tools/benchmark.sh index b0d1babd9..d28aeb271 100755 --- a/tools/benchmark.sh +++ b/tools/benchmark.sh @@ -4,7 +4,15 @@ if [ $# -ne 1 ]; then echo -n "./benchmark.sh [bulkload/fillseq/overwrite/filluniquerandom/" echo "readrandom/readwhilewriting/readwhilemerging/updaterandom/" - echo "mergerandom/randomtransaction]" + echo "mergerandom/randomtransaction/compact]" + exit 0 +fi + +# Make it easier to run only the compaction test. Getting valid data requires +# a number of iterations and having an ability to run the test separately from +# rest of the benchmarks helps. +if [ "$COMPACTION_TEST" == "1" -a "$1" != "universal_compaction" ]; then + echo "Skipping $1 because it's not a compaction test." exit 0 fi @@ -66,12 +74,12 @@ const_params=" --level_compaction_dynamic_level_bytes=true \ --bytes_per_sync=$((8 * M)) \ --cache_index_and_filter_blocks=0 \ + --pin_l0_filter_and_index_blocks_in_cache=1 \ --benchmark_write_rate_limit=$(( 1024 * 1024 * $mb_written_per_sec )) \ \ --hard_rate_limit=3 \ --rate_limit_delay_max_milliseconds=1000000 \ --write_buffer_size=$((128 * M)) \ - --max_write_buffer_number=8 \ --target_file_size_base=$((128 * M)) \ --max_bytes_for_level_base=$((1 * G)) \ \ @@ -98,17 +106,48 @@ if [ $duration -gt 0 ]; then const_params="$const_params --duration=$duration" fi -params_w="$const_params $l0_config --max_background_compactions=16 --max_background_flushes=7" -params_bulkload="$const_params --max_background_compactions=16 --max_background_flushes=7 \ +params_w="$const_params \ + $l0_config \ + --max_background_compactions=16 \ + --max_write_buffer_number=8 \ + --max_background_flushes=7" + +params_bulkload="$const_params \ + --max_background_compactions=16 \ + --max_write_buffer_number=8 \ + --max_background_flushes=7 \ --level0_file_num_compaction_trigger=$((10 * M)) \ --level0_slowdown_writes_trigger=$((10 * M)) \ --level0_stop_writes_trigger=$((10 * M))" +# +# Tune values for level and universal compaction. 
+# For universal compaction, these level0_* options mean total sorted of runs in +# LSM. In level-based compaction, it means number of L0 files. +# +params_level_compact="$const_params \ + --max_background_flushes=4 \ + --max_write_buffer_number=4 \ + --level0_file_num_compaction_trigger=4 \ + --level0_slowdown_writes_trigger=16 \ + --level0_stop_writes_trigger=20" + +params_univ_compact="$const_params \ + --max_background_flushes=4 \ + --max_write_buffer_number=4 \ + --level0_file_num_compaction_trigger=8 \ + --level0_slowdown_writes_trigger=16 \ + --level0_stop_writes_trigger=20" + function summarize_result { test_out=$1 test_name=$2 bench_name=$3 + # Note that this function assumes that the benchmark executes long enough so + # that "Compaction Stats" is written to stdout at least once. If it won't + # happen then empty output from grep when searching for "Sum" will cause + # syntax errors. uptime=$( grep ^Uptime\(secs $test_out | tail -1 | awk '{ printf "%.0f", $2 }' ) stall_time=$( grep "^Cumulative stall" $test_out | tail -1 | awk '{ print $3 }' ) stall_pct=$( grep "^Cumulative stall" $test_out| tail -1 | awk '{ print $5 }' ) @@ -158,9 +197,116 @@ function run_bulkload { eval $cmd } +# +# Parameter description: +# +# $1 - 1 if I/O statistics should be collected. +# $2 - compaction type to use (level=0, universal=1). +# $3 - number of subcompactions. +# $4 - number of maximum background compactions. +# +function run_manual_compaction_worker { + # This runs with a vector memtable and the WAL disabled to load faster. + # It is still crash safe and the client can discover where to restart a + # load after a crash. I think this is a good way to load. + echo "Bulk loading $num_keys random keys for manual compaction." 
+ + fillrandom_output_file=$output_dir/benchmark_man_compact_fillrandom_$3.log + man_compact_output_log=$output_dir/benchmark_man_compact_$3.log + + if [ "$2" == "1" ]; then + extra_params=$params_univ_compact + else + extra_params=$params_level_compact + fi + + # Make sure that fillrandom uses the same compaction options as compact. + cmd="./db_bench --benchmarks=fillrandom \ + --use_existing_db=0 \ + --disable_auto_compactions=0 \ + --sync=0 \ + $extra_params \ + --threads=$num_threads \ + --compaction_measure_io_stats=$1 \ + --compaction_style=$2 \ + --subcompactions=$3 \ + --memtablerep=vector \ + --disable_wal=1 \ + --max_background_compactions=$4 \ + --seed=$( date +%s ) \ + 2>&1 | tee -a $fillrandom_output_file" + + echo $cmd | tee $fillrandom_output_file + eval $cmd + + summarize_result $fillrandom_output_file man_compact_fillrandom_$3 fillrandom + + echo "Compacting with $3 subcompactions specified ..." + + # This is the part we're really interested in. Given that compact benchmark + # doesn't output regular statistics then we'll just use the time command to + # measure how long this step takes. + cmd="{ \ + time ./db_bench --benchmarks=compact \ + --use_existing_db=1 \ + --disable_auto_compactions=0 \ + --sync=0 \ + $extra_params \ + --threads=$num_threads \ + --compaction_measure_io_stats=$1 \ + --compaction_style=$2 \ + --subcompactions=$3 \ + --max_background_compactions=$4 \ + ;} + 2>&1 | tee -a $man_compact_output_log" + + echo $cmd | tee $man_compact_output_log + eval $cmd + + # Can't use summarize_result here. One way to analyze the results is to run + # "grep real" on the resulting log files. +} + +function run_univ_compaction { + # Always ask for I/O statistics to be measured. + io_stats=1 + + # Values: kCompactionStyleLevel = 0x0, kCompactionStyleUniversal = 0x1. + compaction_style=1 + + # Define a set of benchmarks. 
+ subcompactions=(1 2 4 8 16) + max_background_compactions=(16 16 8 4 2) + + i=0 + total=${#subcompactions[@]} + + # Execute a set of benchmarks to cover variety of scenarios. + while [ "$i" -lt "$total" ] + do + run_manual_compaction_worker $io_stats $compaction_style ${subcompactions[$i]} \ + ${max_background_compactions[$i]} + ((i++)) + done +} + function run_fillseq { - # This runs with a vector memtable and the WAL disabled to load faster. It is still crash safe and the - # client can discover where to restart a load after a crash. I think this is a good way to load. + # This runs with a vector memtable. WAL can be either disabled or enabled + # depending on the input parameter (1 for disabled, 0 for enabled). The main + # benefit behind disabling WAL is to make loading faster. It is still crash + # safe and the client can discover where to restart a load after a crash. I + # think this is a good way to load. + + # Make sure that we'll have unique names for all the files so that data won't + # be overwritten. + if [ $1 == 1 ]; then + log_file_name=$output_dir/benchmark_fillseq.wal_disabled.v${value_size}.log + test_name=fillseq.wal_disabled.v${value_size} + else + log_file_name=$output_dir/benchmark_fillseq.wal_enabled.v${value_size}.log + test_name=fillseq.wal_enabled.v${value_size} + fi + echo "Loading $num_keys keys sequentially" cmd="./db_bench --benchmarks=fillseq \ --use_existing_db=0 \ @@ -169,12 +315,14 @@ function run_fillseq { --min_level_to_compress=0 \ --threads=1 \ --memtablerep=vector \ - --disable_wal=1 \ + --disable_wal=$1 \ --seed=$( date +%s ) \ - 2>&1 | tee -a $output_dir/benchmark_fillseq.v${value_size}.log" - echo $cmd | tee $output_dir/benchmark_fillseq.v${value_size}.log + 2>&1 | tee -a $log_file_name" + echo $cmd | tee $log_file_name eval $cmd - summarize_result $output_dir/benchmark_fillseq.v${value_size}.log fillseq.v${value_size} fillseq + + # The constant "fillseq" which we pass to db_bench is the benchmark name. 
+ summarize_result $log_file_name $test_name fillseq } function run_change { @@ -310,8 +458,10 @@ for job in ${jobs[@]}; do start=$(now) if [ $job = bulkload ]; then run_bulkload - elif [ $job = fillseq ]; then - run_fillseq + elif [ $job = fillseq_disable_wal ]; then + run_fillseq 1 + elif [ $job = fillseq_enable_wal ]; then + run_fillseq 0 elif [ $job = overwrite ]; then run_change overwrite elif [ $job = updaterandom ]; then @@ -340,6 +490,8 @@ for job in ${jobs[@]}; do run_rangewhile merging $job true elif [ $job = randomtransaction ]; then run_randomtransaction + elif [ $job = universal_compaction ]; then + run_univ_compaction elif [ $job = debug ]; then num_keys=1000; # debug echo "Setting num_keys to $num_keys" diff --git a/tools/db_bench.cc b/tools/db_bench.cc new file mode 100644 index 000000000..692ff1d23 --- /dev/null +++ b/tools/db_bench.cc @@ -0,0 +1,23 @@ +// Copyright (c) 2013-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#ifndef GFLAGS +#include +int main() { + fprintf(stderr, "Please install gflags to run rocksdb tools\n"); + return 1; +} +#else +#include +int main(int argc, char** argv) { rocksdb::db_bench_tool(argc, argv); } +#endif // GFLAGS diff --git a/db/db_bench.cc b/tools/db_bench_tool.cc similarity index 96% rename from db/db_bench.cc rename to tools/db_bench_tool.cc index a07798b41..2e1a83237 100644 --- a/db/db_bench.cc +++ b/tools/db_bench_tool.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -11,14 +11,7 @@ #define __STDC_FORMAT_MACROS #endif -#ifndef GFLAGS -#include -int main() { - fprintf(stderr, "Please install gflags to run rocksdb tools\n"); - return 1; -} -#else - +#ifdef GFLAGS #ifdef NUMA #include #include @@ -43,39 +36,41 @@ int main() { #include "db/db_impl.h" #include "db/version_set.h" -#include "rocksdb/options.h" +#include "hdfs/env_hdfs.h" +#include "port/port.h" +#include "port/stack_trace.h" #include "rocksdb/cache.h" #include "rocksdb/db.h" #include "rocksdb/env.h" -#include "rocksdb/memtablerep.h" -#include "rocksdb/write_batch.h" -#include "rocksdb/slice.h" #include "rocksdb/filter_policy.h" +#include "rocksdb/memtablerep.h" +#include "rocksdb/options.h" +#include "rocksdb/perf_context.h" #include "rocksdb/rate_limiter.h" +#include "rocksdb/slice.h" #include "rocksdb/slice_transform.h" -#include "rocksdb/perf_context.h" #include "rocksdb/utilities/flashcache.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/transaction_db.h" -#include "rocksdb/utilities/optimistic_transaction_db.h" -#include "port/port.h" -#include "port/stack_trace.h" -#include "util/crc32c.h" +#include "rocksdb/write_batch.h" #include "util/compression.h" +#include "util/crc32c.h" #include "util/histogram.h" #include "util/mutexlock.h" #include "util/random.h" -#include "util/string_util.h" #include "util/statistics.h" +#include "util/string_util.h" #include "util/testutil.h" +#include "util/transaction_test_util.h" #include "util/xxhash.h" -#include "hdfs/env_hdfs.h" #include "utilities/merge_operators.h" #ifdef OS_WIN #include // open/close #endif +namespace { using 
GFLAGS::ParseCommandLineFlags; using GFLAGS::RegisterFlagValidator; using GFLAGS::SetUsageMessage; @@ -345,6 +340,9 @@ DEFINE_int64(cache_size, -1, "Number of bytes to use as a cache of uncompressed" DEFINE_bool(cache_index_and_filter_blocks, false, "Cache index/filter blocks in block cache."); +DEFINE_bool(pin_l0_filter_and_index_blocks_in_cache, false, + "Pin index/filter blocks of L0 files in block cache."); + DEFINE_int32(block_size, static_cast(rocksdb::BlockBasedTableOptions().block_size), "Number of bytes in a block."); @@ -521,7 +519,6 @@ DEFINE_uint64(transaction_lock_timeout, 100, DEFINE_bool(compaction_measure_io_stats, false, "Measure times spents on I/Os while in compactions. "); -namespace { enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { assert(ctype); @@ -541,7 +538,7 @@ enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { return rocksdb::kZSTDNotFinalCompression; fprintf(stdout, "Cannot parse compression type '%s'\n", ctype); - return rocksdb::kSnappyCompression; //default value + return rocksdb::kSnappyCompression; // default value } std::string ColumnFamilyName(size_t i) { @@ -553,7 +550,6 @@ std::string ColumnFamilyName(size_t i) { return std::string(name); } } -} // namespace DEFINE_string(compression_type, "snappy", "Algorithm to use to compress the database"); @@ -764,7 +760,6 @@ enum RepFactory { kCuckoo }; -namespace { enum RepFactory StringToRepFactory(const char* ctype) { assert(ctype); @@ -782,7 +777,6 @@ enum RepFactory StringToRepFactory(const char* ctype) { fprintf(stdout, "Cannot parse memreptable %s\n", ctype); return kSkipList; } -} // namespace static enum RepFactory FLAGS_rep_factory; DEFINE_string(memtablerep, "skip_list", ""); @@ -834,6 +828,7 @@ static const bool FLAGS_deletepercent_dummy __attribute__((unused)) = static const bool FLAGS_table_cache_numshardbits_dummy __attribute__((unused)) = RegisterFlagValidator(&FLAGS_table_cache_numshardbits, 
&ValidateTableCacheNumshardbits); +} // namespace namespace rocksdb { @@ -1214,7 +1209,7 @@ class Stats { uint64_t bytes_; uint64_t last_op_finish_; uint64_t last_report_finish_; - std::unordered_map, std::hash> hist_; std::string message_; bool exclude_from_merge_; @@ -1251,7 +1246,7 @@ class Stats { for (auto it = other.hist_.begin(); it != other.hist_.end(); ++it) { auto this_it = hist_.find(it->first); if (this_it != hist_.end()) { - this_it->second.Merge(other.hist_.at(it->first)); + this_it->second->Merge(*(other.hist_.at(it->first))); } else { hist_.insert({ it->first, it->second }); } @@ -1325,10 +1320,10 @@ class Stats { if (hist_.find(op_type) == hist_.end()) { - HistogramImpl hist_temp; - hist_.insert({op_type, hist_temp}); + auto hist_temp = std::make_shared(); + hist_.insert({op_type, std::move(hist_temp)}); } - hist_[op_type].Add(micros); + hist_[op_type]->Add(micros); if (micros > 20000 && !FLAGS_stats_interval) { fprintf(stderr, "long op: %" PRIu64 " micros%30s\r", micros, ""); @@ -1461,7 +1456,7 @@ class Stats { for (auto it = hist_.begin(); it != hist_.end(); ++it) { fprintf(stdout, "Microseconds per %s:\n%s\n", OperationTypeString[it->first].c_str(), - it->second.ToString().c_str()); + it->second->ToString().c_str()); } } if (FLAGS_report_file_operations) { @@ -2249,7 +2244,7 @@ class Benchmark { count++; thread->stats.FinishedOps(nullptr, nullptr, 1, kOthers); } - if (ptr == nullptr) exit(1); // Disable unused variable warning. + if (ptr == nullptr) exit(1); // Disable unused variable warning. 
} void Compress(ThreadState *thread) { @@ -2262,6 +2257,7 @@ class Benchmark { // Compress 1G while (ok && bytes < int64_t(1) << 30) { + compressed.clear(); ok = CompressSlice(input, &compressed); produced += compressed.size(); bytes += input.size(); @@ -2518,6 +2514,8 @@ class Benchmark { } block_based_options.cache_index_and_filter_blocks = FLAGS_cache_index_and_filter_blocks; + block_based_options.pin_l0_filter_and_index_blocks_in_cache = + FLAGS_pin_l0_filter_and_index_blocks_in_cache; block_based_options.block_cache = cache_; block_based_options.block_cache_compressed = compressed_cache_; block_based_options.block_size = FLAGS_block_size; @@ -3771,18 +3769,22 @@ class Benchmark { ReadOptions options(FLAGS_verify_checksum, true); Duration duration(FLAGS_duration, readwrites_); ReadOptions read_options(FLAGS_verify_checksum, true); - std::string value; - DB* db = db_.db; + uint16_t num_prefix_ranges = static_cast(FLAGS_transaction_sets); uint64_t transactions_done = 0; - uint64_t transactions_aborted = 0; - Status s; - uint64_t num_prefix_ranges = FLAGS_transaction_sets; if (num_prefix_ranges == 0 || num_prefix_ranges > 9999) { fprintf(stderr, "invalid value for transaction_sets\n"); abort(); } + TransactionOptions txn_options; + txn_options.lock_timeout = FLAGS_transaction_lock_timeout; + txn_options.set_snapshot = FLAGS_transaction_set_snapshot; + + RandomTransactionInserter inserter(&thread->rand, write_options_, + read_options, FLAGS_num, + num_prefix_ranges); + if (FLAGS_num_multi_db > 1) { fprintf(stderr, "Cannot run RandomTransaction benchmark with " @@ -3791,126 +3793,26 @@ class Benchmark { } while (!duration.Done(1)) { - Transaction* txn = nullptr; - WriteBatch* batch = nullptr; + bool success; + // RandomTransactionInserter will attempt to insert a key for each + // # of FLAGS_transaction_sets if (FLAGS_optimistic_transaction_db) { - txn = db_.opt_txn_db->BeginTransaction(write_options_); - assert(txn); + success = 
inserter.OptimisticTransactionDBInsert(db_.opt_txn_db); } else if (FLAGS_transaction_db) { TransactionDB* txn_db = reinterpret_cast(db_.db); - - TransactionOptions txn_options; - txn_options.lock_timeout = FLAGS_transaction_lock_timeout; - - txn = txn_db->BeginTransaction(write_options_, txn_options); - assert(txn); + success = inserter.TransactionDBInsert(txn_db, txn_options); } else { - batch = new WriteBatch(); + success = inserter.DBInsert(db_.db); } - if (txn && FLAGS_transaction_set_snapshot) { - txn->SetSnapshot(); - } - - // pick a random number to use to increment a key in each set - uint64_t incr = (thread->rand.Next() % 100) + 1; - - bool failed = false; - // For each set, pick a key at random and increment it - for (uint8_t i = 0; i < num_prefix_ranges; i++) { - uint64_t int_value; - char prefix_buf[5]; - - // key format: [SET#][random#] - std::string rand_key = ToString(thread->rand.Next() % FLAGS_num); - Slice base_key(rand_key); - - // Pad prefix appropriately so we can iterate over each set - snprintf(prefix_buf, sizeof(prefix_buf), "%04d", i + 1); - std::string full_key = std::string(prefix_buf) + base_key.ToString(); - Slice key(full_key); - - if (txn) { - s = txn->GetForUpdate(read_options, key, &value); - } else { - s = db->Get(read_options, key, &value); - } - - if (s.ok()) { - int_value = std::stoull(value); - - if (int_value == 0 || int_value == ULONG_MAX) { - fprintf(stderr, "Get returned unexpected value: %s\n", - value.c_str()); - abort(); - } - } else if (s.IsNotFound()) { - int_value = 0; - } else if (!(s.IsBusy() || s.IsTimedOut() || s.IsTryAgain())) { - fprintf(stderr, "Get returned an unexpected error: %s\n", - s.ToString().c_str()); - abort(); - } else { - failed = true; - break; - } - - if (FLAGS_transaction_sleep > 0) { - FLAGS_env->SleepForMicroseconds(thread->rand.Next() % - FLAGS_transaction_sleep); - } - - std::string sum = ToString(int_value + incr); - if (txn) { - s = txn->Put(key, sum); - if (!s.ok()) { - // Since we did a 
GetForUpdate, Put should not fail. - fprintf(stderr, "Put returned an unexpected error: %s\n", - s.ToString().c_str()); - abort(); - } - } else { - batch->Put(key, sum); - } - } - - if (txn) { - if (failed) { - transactions_aborted++; - txn->Rollback(); - s = Status::OK(); - } else { - s = txn->Commit(); - } - } else { - s = db->Write(write_options_, batch); - } - - if (!s.ok()) { - failed = true; - - // Ideally, we'd want to run this stress test with enough concurrency - // on a small enough set of keys that we get some failed transactions - // due to conflicts. - if (FLAGS_optimistic_transaction_db && - (s.IsBusy() || s.IsTimedOut() || s.IsTryAgain())) { - transactions_aborted++; - } else if (FLAGS_transaction_db && s.IsExpired()) { - transactions_aborted++; - } else { - fprintf(stderr, "Unexpected write error: %s\n", s.ToString().c_str()); - abort(); - } - } - - delete txn; - delete batch; - - if (!failed) { - thread->stats.FinishedOps(nullptr, db, 1, kOthers); + if (!success) { + fprintf(stderr, "Unexpected error: %s\n", + inserter.GetLastStatus().ToString().c_str()); + abort(); } + thread->stats.FinishedOps(nullptr, db_.db, 1, kOthers); transactions_done++; } @@ -3918,7 +3820,7 @@ class Benchmark { if (FLAGS_optimistic_transaction_db || FLAGS_transaction_db) { snprintf(msg, sizeof(msg), "( transactions:%" PRIu64 " aborts:%" PRIu64 ")", - transactions_done, transactions_aborted); + transactions_done, inserter.GetFailureCount()); } else { snprintf(msg, sizeof(msg), "( batches:%" PRIu64 " )", transactions_done); } @@ -3938,50 +3840,15 @@ class Benchmark { return; } - uint64_t prev_total = 0; + Status s = + RandomTransactionInserter::Verify(db_.db, + static_cast(FLAGS_transaction_sets)); - // For each set of keys with the same prefix, sum all the values - for (uint32_t i = 0; i < FLAGS_transaction_sets; i++) { - char prefix_buf[5]; - snprintf(prefix_buf, sizeof(prefix_buf), "%04u", i + 1); - uint64_t total = 0; - - Iterator* iter = 
db_.db->NewIterator(ReadOptions()); - - for (iter->Seek(Slice(prefix_buf, 4)); iter->Valid(); iter->Next()) { - Slice key = iter->key(); - - // stop when we reach a different prefix - if (key.ToString().compare(0, 4, prefix_buf) != 0) { - break; - } - - Slice value = iter->value(); - uint64_t int_value = std::stoull(value.ToString()); - if (int_value == 0 || int_value == ULONG_MAX) { - fprintf(stderr, "Iter returned unexpected value: %s\n", - value.ToString().c_str()); - abort(); - } - - total += int_value; - } - delete iter; - - if (i > 0) { - if (total != prev_total) { - fprintf(stderr, - "RandomTransactionVerify found inconsistent totals. " - "Set[%" PRIu32 "]: %" PRIu64 ", Set[%" PRIu32 "]: %" PRIu64 - " \n", - i - 1, prev_total, i, total); - abort(); - } - } - prev_total = total; + if (s.ok()) { + fprintf(stdout, "RandomTransactionVerify Success.\n"); + } else { + fprintf(stdout, "RandomTransactionVerify FAILED!!\n"); } - - fprintf(stdout, "RandomTransactionVerify Success!\n"); } #endif // ROCKSDB_LITE @@ -4071,9 +3938,7 @@ class Benchmark { } }; -} // namespace rocksdb - -int main(int argc, char** argv) { +int db_bench_tool(int argc, char** argv) { rocksdb::port::InstallStackTraceHandler(); SetUsageMessage(std::string("\nUSAGE:\n") + std::string(argv[0]) + " [OPTIONS]..."); @@ -4142,5 +4007,5 @@ int main(int argc, char** argv) { benchmark.Run(); return 0; } - -#endif // GFLAGS +} // namespace rocksdb +#endif diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 348ed1215..3fab88d88 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -24,6 +24,7 @@ default_params = { "disable_data_sync": 0, "disable_wal": 0, "filter_deletes": lambda: random.randint(0, 1), + "allow_concurrent_memtable_write": 0, "iterpercent": 10, "max_background_compactions": 20, "max_bytes_for_level_base": 10485760, @@ -85,6 +86,7 @@ simple_default_params = { "disable_data_sync": 0, "disable_wal": 0, "filter_deletes": lambda: random.randint(0, 1), + 
"allow_concurrent_memtable_write": lambda: random.randint(0, 1), "iterpercent": 10, "max_background_compactions": 1, "max_bytes_for_level_base": 67108864, @@ -126,6 +128,16 @@ whitebox_simple_default_params = { } +def finalize_and_sanitize(src_params): + dest_params = dict([(k, v() if callable(v) else v) + for (k, v) in src_params.items()]) + # --allow_concurrent_memtable_write with --filter_deletes is not supported. + if dest_params.get("allow_concurrent_memtable_write", 1) == 1: + dest_params["filter_deletes"] = 0 + dest_params["memtablerep"] = "skip_list" + return dest_params + + def gen_cmd_params(args): params = {} @@ -151,8 +163,8 @@ def gen_cmd_params(args): def gen_cmd(params): cmd = './db_stress ' + ' '.join( - '--{0}={1}'.format(k, v() if callable(v) else v) - for k, v in params.items() + '--{0}={1}'.format(k, v) + for k, v in finalize_and_sanitize(params).items() if k not in set(['test_type', 'simple', 'duration', 'interval']) and v is not None) return cmd @@ -236,7 +248,7 @@ def whitebox_crash_main(args): total_check_mode = 4 check_mode = 0 - kill_random_test = 97 + kill_random_test = 888887 kill_mode = 0 while time.time() < exit_time: @@ -255,13 +267,13 @@ def whitebox_crash_main(args): }) elif kill_mode == 1: additional_opts.update({ - "kill_random_test": (kill_random_test / 2 + 1), + "kill_random_test": (kill_random_test / 10 + 1), "kill_prefix_blacklist": "WritableFileWriter::Append," + "WritableFileWriter::WriteBuffered", }) elif kill_mode == 2: additional_opts.update({ - "kill_random_test": (kill_random_test / 4 + 1), + "kill_random_test": (kill_random_test / 5000 + 1), "kill_prefix_blacklist": "WritableFileWriter::Append," "WritableFileWriter::WriteBuffered," "PosixMmapFile::Allocate,WritableFileWriter::Flush", diff --git a/tools/db_repl_stress.cc b/tools/db_repl_stress.cc index 0fca5d506..a01909069 100644 --- a/tools/db_repl_stress.cc +++ b/tools/db_repl_stress.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/tools/db_sanity_test.cc b/tools/db_sanity_test.cc index b7176f41c..773acff6b 100644 --- a/tools/db_sanity_test.cc +++ b/tools/db_sanity_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 102803862..f77dc445f 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -230,6 +230,13 @@ DEFINE_int64(cache_size, 2LL * KB * KB * KB, DEFINE_uint64(subcompactions, 1, "Maximum number of subcompactions to divide L0-L1 compactions " "into."); + +DEFINE_bool(allow_concurrent_memtable_write, true, + "Allow multi-writers to update mem tables in parallel."); + +DEFINE_bool(enable_write_thread_adaptive_yield, true, + "Use a yielding spin loop for brief writer thread waits."); + static const bool FLAGS_subcompactions_dummy __attribute__((unused)) = RegisterFlagValidator(&FLAGS_subcompactions, &ValidateUint32Range); @@ -1997,11 +2004,21 @@ class StressTest { options_.filter_deletes = FLAGS_filter_deletes; options_.inplace_update_support = FLAGS_in_place_update; options_.max_subcompactions = static_cast(FLAGS_subcompactions); - if ((FLAGS_prefix_size == 0) == (FLAGS_rep_factory == kHashSkipList)) { + options_.allow_concurrent_memtable_write = + FLAGS_allow_concurrent_memtable_write; + options_.enable_write_thread_adaptive_yield = + FLAGS_enable_write_thread_adaptive_yield; + + if (FLAGS_prefix_size == 0 && FLAGS_rep_factory == kHashSkipList) { fprintf(stderr, - "prefix_size should be non-zero iff memtablerep == prefix_hash\n"); + "prefix_size cannot be zero if memtablerep == prefix_hash\n"); exit(1); } + if (FLAGS_prefix_size != 0 && FLAGS_rep_factory != kHashSkipList) { + fprintf(stderr, + "WARNING: prefix_size is non-zero but " + "memtablerep != prefix_hash\n"); + } switch (FLAGS_rep_factory) { case kSkipList: // no need to do anything diff --git a/tools/dump/db_dump_tool.cc b/tools/dump/db_dump_tool.cc index 389e65dba..1db793f73 100644 --- a/tools/dump/db_dump_tool.cc +++ b/tools/dump/db_dump_tool.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree.
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/tools/dump/rocksdb_dump.cc b/tools/dump/rocksdb_dump.cc index 2bfc6cee3..fa5fcf5ed 100644 --- a/tools/dump/rocksdb_dump.cc +++ b/tools/dump/rocksdb_dump.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/tools/dump/rocksdb_undump.cc b/tools/dump/rocksdb_undump.cc index 81034f0ce..7da002be3 100644 --- a/tools/dump/rocksdb_undump.cc +++ b/tools/dump/rocksdb_undump.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/tools/ldb.cc b/tools/ldb.cc index cb5ef5204..ec2559a5d 100644 --- a/tools/ldb.cc +++ b/tools/ldb.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index 7ec4690d0..be743955d 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -44,6 +44,7 @@ const string LDBCommand::ARG_PATH = "path"; const string LDBCommand::ARG_HEX = "hex"; const string LDBCommand::ARG_KEY_HEX = "key_hex"; const string LDBCommand::ARG_VALUE_HEX = "value_hex"; +const string LDBCommand::ARG_CF_NAME = "column_family"; const string LDBCommand::ARG_TTL = "ttl"; const string LDBCommand::ARG_TTL_START = "start_time"; const string LDBCommand::ARG_TTL_END = "end_time"; @@ -60,6 +61,7 @@ const string LDBCommand::ARG_DB_WRITE_BUFFER_SIZE = "db_write_buffer_size"; const string LDBCommand::ARG_WRITE_BUFFER_SIZE = "write_buffer_size"; const string LDBCommand::ARG_FILE_SIZE = "file_size"; const string LDBCommand::ARG_CREATE_IF_MISSING = "create_if_missing"; +const string LDBCommand::ARG_NO_VALUE = "no_value"; const char* LDBCommand::DELIM = " ==> "; @@ -72,16 +74,14 @@ void DumpSstFile(std::string filename, bool output_hex, bool show_properties); }; LDBCommand* LDBCommand::InitFromCmdLineArgs( - int argc, - char** argv, - const Options& options, - const LDBOptions& ldb_options -) { + int argc, char** argv, const Options& options, + const LDBOptions& ldb_options, + const std::vector* column_families) { vector args; for (int i = 1; i < argc; i++) { args.push_back(argv[i]); } - return InitFromCmdLineArgs(args, options, ldb_options); + return InitFromCmdLineArgs(args, options, ldb_options, column_families); } /** @@ -95,10 +95,9 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( * Returns nullptr if the command-line cannot be parsed. 
*/ LDBCommand* LDBCommand::InitFromCmdLineArgs( - const vector& args, - const Options& options, - const LDBOptions& ldb_options -) { + const vector& args, const Options& options, + const LDBOptions& ldb_options, + const std::vector* column_families) { // --x=y command line arguments are added as x->y map entries. map option_map; @@ -184,12 +183,16 @@ LDBCommand* LDBCommand::SelectCommand( return new ManifestDumpCommand(cmdParams, option_map, flags); } else if (cmd == ListColumnFamiliesCommand::Name()) { return new ListColumnFamiliesCommand(cmdParams, option_map, flags); + } else if (cmd == CreateColumnFamilyCommand::Name()) { + return new CreateColumnFamilyCommand(cmdParams, option_map, flags); } else if (cmd == DBFileDumperCommand::Name()) { return new DBFileDumperCommand(cmdParams, option_map, flags); } else if (cmd == InternalDumpCommand::Name()) { return new InternalDumpCommand(cmdParams, option_map, flags); } else if (cmd == CheckConsistencyCommand::Name()) { return new CheckConsistencyCommand(cmdParams, option_map, flags); + } else if (cmd == RepairCommand::Name()) { + return new RepairCommand(cmdParams, option_map, flags); } return nullptr; } @@ -450,6 +453,10 @@ void CompactorCommand::Help(string& ret) { } void CompactorCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } Slice* begin = nullptr; Slice* end = nullptr; @@ -513,6 +520,7 @@ Options DBLoaderCommand::PrepareOptionsForOpenDB() { void DBLoaderCommand::DoCommand() { if (!db_) { + assert(GetExecuteState().IsFailed()); return; } @@ -527,7 +535,7 @@ void DBLoaderCommand::DoCommand() { string key; string value; if (ParseKeyValue(line, &key, &value, is_key_hex_, is_value_hex_)) { - db_->Put(write_options, Slice(key), Slice(value)); + db_->Put(write_options, GetCfHandle(), Slice(key), Slice(value)); } else if (0 == line.find("Keys in range:")) { // ignore this line } else if (0 == line.find("Created bg thread 0x")) { @@ -541,7 +549,7 @@ void 
DBLoaderCommand::DoCommand() { cout << "Warning: " << bad_lines << " bad lines ignored." << endl; } if (compact_) { - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + db_->CompactRange(CompactRangeOptions(), GetCfHandle(), nullptr, nullptr); } } @@ -696,6 +704,38 @@ void ListColumnFamiliesCommand::DoCommand() { } } +void CreateColumnFamilyCommand::Help(string& ret) { + ret.append(" "); + ret.append(CreateColumnFamilyCommand::Name()); + ret.append(" --db= "); + ret.append("\n"); +} + +CreateColumnFamilyCommand::CreateColumnFamilyCommand( + const vector& params, const map& options, + const vector& flags) + : LDBCommand(options, flags, true, {ARG_DB}) { + if (params.size() != 1) { + exec_state_ = LDBCommandExecuteResult::Failed( + "new column family name must be specified"); + } else { + new_cf_name_ = params[0]; + } +} + +void CreateColumnFamilyCommand::DoCommand() { + ColumnFamilyHandle* new_cf_handle; + Status st = db_->CreateColumnFamily(options_, new_cf_name_, &new_cf_handle); + if (st.ok()) { + fprintf(stdout, "OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::Failed( + "Fail to create new column family: " + st.ToString()); + } + delete new_cf_handle; + CloseDB(); +} + // ---------------------------------------------------------------------------- namespace { @@ -800,12 +840,13 @@ void InternalDumpCommand::Help(string& ret) { void InternalDumpCommand::DoCommand() { if (!db_) { + assert(GetExecuteState().IsFailed()); return; } if (print_stats_) { string stats; - if (db_->GetProperty("rocksdb.stats", &stats)) { + if (db_->GetProperty(GetCfHandle(), "rocksdb.stats", &stats)) { fprintf(stdout, "%s\n", stats.c_str()); } } @@ -1050,7 +1091,7 @@ void DBDumperCommand::DoDumpCommand() { } // Setup key iterator - Iterator* iter = db_->NewIterator(ReadOptions()); + Iterator* iter = db_->NewIterator(ReadOptions(), GetCfHandle()); Status st = iter->status(); if (!st.ok()) { exec_state_ = @@ -1285,7 +1326,7 @@ void ReduceDBLevelsCommand::DoCommand() { } 
// Compact the whole DB to put all files to the highest level. fprintf(stdout, "Compacting the db...\n"); - db_->CompactRange(CompactRangeOptions(), nullptr, nullptr); + db_->CompactRange(CompactRangeOptions(), GetCfHandle(), nullptr, nullptr); CloseDB(); EnvOptions soptions; @@ -1377,8 +1418,9 @@ void ChangeCompactionStyleCommand::DoCommand() { // print db stats before we have made any change std::string property; std::string files_per_level; - for (int i = 0; i < db_->NumberLevels(); i++) { - db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(i), + for (int i = 0; i < db_->NumberLevels(GetCfHandle()); i++) { + db_->GetProperty(GetCfHandle(), + "rocksdb.num-files-at-level" + NumberToString(i), &property); // format print string @@ -1393,13 +1435,14 @@ void ChangeCompactionStyleCommand::DoCommand() { CompactRangeOptions compact_options; compact_options.change_level = true; compact_options.target_level = 0; - db_->CompactRange(compact_options, nullptr, nullptr); + db_->CompactRange(compact_options, GetCfHandle(), nullptr, nullptr); // verify compaction result files_per_level = ""; int num_files = 0; for (int i = 0; i < db_->NumberLevels(); i++) { - db_->GetProperty("rocksdb.num-files-at-level" + NumberToString(i), + db_->GetProperty(GetCfHandle(), + "rocksdb.num-files-at-level" + NumberToString(i), &property); // format print string @@ -1622,8 +1665,12 @@ void GetCommand::Help(string& ret) { } void GetCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } string value; - Status st = db_->Get(ReadOptions(), key_, &value); + Status st = db_->Get(ReadOptions(), GetCfHandle(), key_, &value); if (st.ok()) { fprintf(stdout, "%s\n", (is_value_hex_ ? 
StringToHex(value) : value).c_str()); @@ -1670,11 +1717,14 @@ void ApproxSizeCommand::Help(string& ret) { } void ApproxSizeCommand::DoCommand() { - + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } Range ranges[1]; ranges[0] = Range(start_key_, end_key_); uint64_t sizes[1]; - db_->GetApproximateSizes(ranges, 1, sizes); + db_->GetApproximateSizes(GetCfHandle(), ranges, 1, sizes); fprintf(stdout, "%lu\n", (unsigned long)sizes[0]); /* Weird that GetApproximateSizes() returns void, although documentation * says that it returns a Status object. @@ -1718,11 +1768,15 @@ void BatchPutCommand::Help(string& ret) { } void BatchPutCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } WriteBatch batch; for (vector>::const_iterator itr = key_values_.begin(); itr != key_values_.end(); ++itr) { - batch.Put(itr->first, itr->second); + batch.Put(GetCfHandle(), itr->first, itr->second); } Status st = db_->Write(WriteOptions(), &batch); if (st.ok()) { @@ -1741,14 +1795,17 @@ Options BatchPutCommand::PrepareOptionsForOpenDB() { // ---------------------------------------------------------------------------- ScanCommand::ScanCommand(const vector& params, - const map& options, const vector& flags) : - LDBCommand(options, flags, true, - BuildCmdLineOptions({ARG_TTL, ARG_HEX, ARG_KEY_HEX, ARG_TO, - ARG_VALUE_HEX, ARG_FROM, ARG_TIMESTAMP, - ARG_MAX_KEYS, ARG_TTL_START, ARG_TTL_END})), - start_key_specified_(false), - end_key_specified_(false), - max_keys_scanned_(-1) { + const map& options, + const vector& flags) + : LDBCommand(options, flags, true, + BuildCmdLineOptions( + {ARG_TTL, ARG_NO_VALUE, ARG_HEX, ARG_KEY_HEX, + ARG_TO, ARG_VALUE_HEX, ARG_FROM, ARG_TIMESTAMP, + ARG_MAX_KEYS, ARG_TTL_START, ARG_TTL_END})), + start_key_specified_(false), + end_key_specified_(false), + max_keys_scanned_(-1), + no_value_(false) { map::const_iterator itr = options.find(ARG_FROM); if (itr != options.end()) { @@ -1767,6 +1824,12 @@ 
ScanCommand::ScanCommand(const vector& params, end_key_specified_ = true; } + vector::const_iterator vitr = + std::find(flags.begin(), flags.end(), ARG_NO_VALUE); + if (vitr != flags.end()) { + no_value_ = true; + } + itr = options.find(ARG_MAX_KEYS); if (itr != options.end()) { try { @@ -1794,13 +1857,18 @@ void ScanCommand::Help(string& ret) { ret.append(" [--" + ARG_MAX_KEYS + "=q] "); ret.append(" [--" + ARG_TTL_START + "=:- is inclusive]"); ret.append(" [--" + ARG_TTL_END + "=:- is exclusive]"); + ret.append(" [--" + ARG_NO_VALUE + "]"); ret.append("\n"); } void ScanCommand::DoCommand() { + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } int num_keys_scanned = 0; - Iterator* it = db_->NewIterator(ReadOptions()); + Iterator* it = db_->NewIterator(ReadOptions(), GetCfHandle()); if (start_key_specified_) { it->Seek(start_key_); } else { @@ -1839,7 +1907,6 @@ void ScanCommand::DoCommand() { } Slice key_slice = it->key(); - Slice val_slice = it->value(); std::string formatted_key; if (is_key_hex_) { @@ -1850,16 +1917,21 @@ void ScanCommand::DoCommand() { key_slice = formatted_key; } - std::string formatted_value; - if (is_value_hex_) { - formatted_value = "0x" + val_slice.ToString(true /* hex */); - val_slice = formatted_value; + if (no_value_) { + fprintf(stdout, "%.*s\n", static_cast(key_slice.size()), + key_slice.data()); + } else { + Slice val_slice = it->value(); + std::string formatted_value; + if (is_value_hex_) { + formatted_value = "0x" + val_slice.ToString(true /* hex */); + val_slice = formatted_value; + } + fprintf(stdout, "%.*s : %.*s\n", static_cast(key_slice.size()), + key_slice.data(), static_cast(val_slice.size()), + val_slice.data()); } - fprintf(stdout, "%.*s : %.*s\n", - static_cast(key_slice.size()), key_slice.data(), - static_cast(val_slice.size()), val_slice.data()); - num_keys_scanned++; if (max_keys_scanned_ >= 0 && num_keys_scanned >= max_keys_scanned_) { break; @@ -1896,7 +1968,11 @@ void DeleteCommand::Help(string& ret) 
{ } void DeleteCommand::DoCommand() { - Status st = db_->Delete(WriteOptions(), key_); + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + Status st = db_->Delete(WriteOptions(), GetCfHandle(), key_); if (st.ok()) { fprintf(stdout, "OK\n"); } else { @@ -1937,7 +2013,11 @@ void PutCommand::Help(string& ret) { } void PutCommand::DoCommand() { - Status st = db_->Put(WriteOptions(), key_, value_); + if (!db_) { + assert(GetExecuteState().IsFailed()); + return; + } + Status st = db_->Put(WriteOptions(), GetCfHandle(), key_, value_); if (st.ok()) { fprintf(stdout, "OK\n"); } else { @@ -1978,6 +2058,7 @@ void DBQuerierCommand::Help(string& ret) { void DBQuerierCommand::DoCommand() { if (!db_) { + assert(GetExecuteState().IsFailed()); return; } @@ -2011,17 +2092,17 @@ void DBQuerierCommand::DoCommand() { "delete \n"); } else if (cmd == DELETE_CMD && tokens.size() == 2) { key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]); - db_->Delete(write_options, Slice(key)); + db_->Delete(write_options, GetCfHandle(), Slice(key)); fprintf(stdout, "Successfully deleted %s\n", tokens[1].c_str()); } else if (cmd == PUT_CMD && tokens.size() == 3) { key = (is_key_hex_ ? HexToString(tokens[1]) : tokens[1]); value = (is_value_hex_ ? HexToString(tokens[2]) : tokens[2]); - db_->Put(write_options, Slice(key), Slice(value)); + db_->Put(write_options, GetCfHandle(), Slice(key), Slice(value)); fprintf(stdout, "Successfully put %s %s\n", tokens[1].c_str(), tokens[2].c_str()); } else if (cmd == GET_CMD && tokens.size() == 2) { key = (is_key_hex_ ? 
HexToString(tokens[1]) : tokens[1]); - if (db_->Get(read_options, Slice(key), &value).ok()) { + if (db_->Get(read_options, GetCfHandle(), Slice(key), &value).ok()) { fprintf(stdout, "%s\n", PrintKeyValue(key, value, is_key_hex_, is_value_hex_).c_str()); } else { @@ -2065,6 +2146,29 @@ void CheckConsistencyCommand::DoCommand() { // ---------------------------------------------------------------------------- +RepairCommand::RepairCommand(const vector& params, + const map& options, + const vector& flags) + : LDBCommand(options, flags, false, BuildCmdLineOptions({})) {} + +void RepairCommand::Help(string& ret) { + ret.append(" "); + ret.append(RepairCommand::Name()); + ret.append("\n"); +} + +void RepairCommand::DoCommand() { + Options options = PrepareOptionsForOpenDB(); + Status status = RepairDB(db_path_, options); + if (status.ok()) { + printf("OK\n"); + } else { + exec_state_ = LDBCommandExecuteResult::Failed(status.ToString()); + } +} + +// ---------------------------------------------------------------------------- + namespace { void DumpSstFile(std::string filename, bool output_hex, bool show_properties) { @@ -2125,6 +2229,7 @@ void DBFileDumperCommand::Help(string& ret) { void DBFileDumperCommand::DoCommand() { if (!db_) { + assert(GetExecuteState().IsFailed()); return; } Status s; diff --git a/tools/ldb_cmd.h b/tools/ldb_cmd.h index 0c048e794..cc3814c2f 100644 --- a/tools/ldb_cmd.h +++ b/tools/ldb_cmd.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -44,6 +44,7 @@ public: static const string ARG_HEX; static const string ARG_KEY_HEX; static const string ARG_VALUE_HEX; + static const string ARG_CF_NAME; static const string ARG_TTL; static const string ARG_TTL_START; static const string ARG_TTL_END; @@ -60,19 +61,17 @@ public: static const string ARG_WRITE_BUFFER_SIZE; static const string ARG_FILE_SIZE; static const string ARG_CREATE_IF_MISSING; + static const string ARG_NO_VALUE; static LDBCommand* InitFromCmdLineArgs( - const vector& args, - const Options& options, - const LDBOptions& ldb_options - ); + const vector& args, const Options& options, + const LDBOptions& ldb_options, + const std::vector* column_families); static LDBCommand* InitFromCmdLineArgs( - int argc, - char** argv, - const Options& options, - const LDBOptions& ldb_options - ); + int argc, char** argv, const Options& options, + const LDBOptions& ldb_options, + const std::vector* column_families); bool ValidateCmdLineOptions(); @@ -82,6 +81,15 @@ public: options_ = options; } + virtual void SetColumnFamilies( + const std::vector* column_families) { + if (column_families != nullptr) { + column_families_ = *column_families; + } else { + column_families_.clear(); + } + } + void SetLDBOptions(const LDBOptions& ldb_options) { ldb_options_ = ldb_options; } @@ -90,10 +98,7 @@ public: return false; } - virtual ~LDBCommand() { - delete db_; - db_ = nullptr; - } + virtual ~LDBCommand() { CloseDB(); } /* Run the command, and return the execute result. 
*/ void Run() { @@ -181,8 +186,10 @@ protected: LDBCommandExecuteResult exec_state_; string db_path_; + string column_family_name_; DB* db_; DBWithTTL* db_ttl_; + std::map cf_handles_; /** * true implies that this command can work if the db is opened in read-only @@ -235,6 +242,13 @@ protected: db_path_ = itr->second; } + itr = options.find(ARG_CF_NAME); + if (itr != options.end()) { + column_family_name_ = itr->second; + } else { + column_family_name_ = kDefaultColumnFamilyName; + } + is_key_hex_ = IsKeyHex(options, flags); is_value_hex_ = IsValueHex(options, flags); is_db_ttl_ = IsFlagPresent(flags, ARG_TTL); @@ -248,21 +262,75 @@ protected: } // Open the DB. Status st; + std::vector handles_opened; if (is_db_ttl_) { + // ldb doesn't yet support TTL DB with multiple column families + if (!column_family_name_.empty() || !column_families_.empty()) { + exec_state_ = LDBCommandExecuteResult::Failed( + "ldb doesn't support TTL DB with multiple column families"); + } if (is_read_only_) { st = DBWithTTL::Open(opt, db_path_, &db_ttl_, 0, true); } else { st = DBWithTTL::Open(opt, db_path_, &db_ttl_); } db_ = db_ttl_; - } else if (is_read_only_) { - st = DB::OpenForReadOnly(opt, db_path_, &db_); } else { - st = DB::Open(opt, db_path_, &db_); + if (column_families_.empty()) { + // Try to figure out column family lists + std::vector cf_list; + st = DB::ListColumnFamilies(DBOptions(), db_path_, &cf_list); + // There is possible the DB doesn't exist yet, for "create if not + // "existing case". The failure is ignored here. We rely on DB::Open() + // to give us the correct error message for problem with opening + // existing DB. + if (st.ok() && cf_list.size() > 1) { + // Ignore single column family DB. 
+ for (auto cf_name : cf_list) { + column_families_.emplace_back(cf_name, opt); + } + } + } + if (is_read_only_) { + if (column_families_.empty()) { + st = DB::OpenForReadOnly(opt, db_path_, &db_); + } else { + st = DB::OpenForReadOnly(opt, db_path_, column_families_, + &handles_opened, &db_); + } + } else { + if (column_families_.empty()) { + st = DB::Open(opt, db_path_, &db_); + } else { + st = DB::Open(opt, db_path_, column_families_, &handles_opened, &db_); + } + } } if (!st.ok()) { string msg = st.ToString(); exec_state_ = LDBCommandExecuteResult::Failed(msg); + } else if (!handles_opened.empty()) { + assert(handles_opened.size() == column_families_.size()); + bool found_cf_name = false; + for (size_t i = 0; i < handles_opened.size(); i++) { + cf_handles_[column_families_[i].name] = handles_opened[i]; + if (column_family_name_ == column_families_[i].name) { + found_cf_name = true; + } + } + if (!found_cf_name) { + exec_state_ = LDBCommandExecuteResult::Failed( + "Non-existing column family " + column_family_name_); + CloseDB(); + } + } else { + // We successfully opened DB in single column family mode. 
+ assert(column_families_.empty()); + if (column_family_name_ != kDefaultColumnFamilyName) { + exec_state_ = LDBCommandExecuteResult::Failed( + "Non-existing column family " + column_family_name_); + CloseDB(); + } } options_ = opt; @@ -270,11 +338,27 @@ protected: void CloseDB () { if (db_ != nullptr) { + for (auto& pair : cf_handles_) { + delete pair.second; + } delete db_; db_ = nullptr; } } + ColumnFamilyHandle* GetCfHandle() { + if (!cf_handles_.empty()) { + auto it = cf_handles_.find(column_family_name_); + if (it == cf_handles_.end()) { + exec_state_ = LDBCommandExecuteResult::Failed( + "Cannot find column family " + column_family_name_); + } else { + return it->second; + } + } + return db_->DefaultColumnFamily(); + } + static string PrintKeyValue(const string& key, const string& value, bool is_key_hex, bool is_value_hex) { string result; @@ -310,10 +394,10 @@ protected: * passed in. */ static vector BuildCmdLineOptions(vector options) { - vector ret = {ARG_DB, ARG_BLOOM_BITS, - ARG_BLOCK_SIZE, ARG_AUTO_COMPACTION, - ARG_COMPRESSION_TYPE, ARG_WRITE_BUFFER_SIZE, - ARG_FILE_SIZE, ARG_FIX_PREFIX_LEN}; + vector ret = {ARG_DB, ARG_BLOOM_BITS, ARG_BLOCK_SIZE, + ARG_AUTO_COMPACTION, ARG_COMPRESSION_TYPE, + ARG_WRITE_BUFFER_SIZE, ARG_FILE_SIZE, + ARG_FIX_PREFIX_LEN, ARG_CF_NAME}; ret.insert(ret.end(), options.begin(), options.end()); return ret; } @@ -325,6 +409,7 @@ protected: const string& option, string* value); Options options_; + std::vector column_families_; LDBOptions ldb_options_; private: @@ -377,7 +462,7 @@ private: */ bool StringToBool(string val) { std::transform(val.begin(), val.end(), val.begin(), - [](char ch) -> char { return ::tolower(ch); }); + [](char ch)->char { return (char)::tolower(ch); }); if (val == "true") { return true; @@ -568,6 +653,23 @@ class ListColumnFamiliesCommand : public LDBCommand { string dbname_; }; +class CreateColumnFamilyCommand : public LDBCommand { + public: + static string Name() { return "create_column_family"; } + + 
CreateColumnFamilyCommand(const vector& params, + const map& options, + const vector& flags); + + static void Help(string& ret); + virtual void DoCommand() override; + + virtual bool NoDBOpen() override { return false; } + + private: + string new_cf_name_; +}; + class ReduceDBLevelsCommand : public LDBCommand { public: static string Name() { return "reduce_levels"; } @@ -709,6 +811,7 @@ private: bool start_key_specified_; bool end_key_specified_; int max_keys_scanned_; + bool no_value_; }; class DeleteCommand : public LDBCommand { @@ -780,6 +883,21 @@ public: static void Help(string& ret); }; +class RepairCommand : public LDBCommand { + public: + static string Name() { return "repair"; } + + RepairCommand(const vector& params, + const map& options, + const vector& flags); + + virtual void DoCommand() override; + + virtual bool NoDBOpen() override { return true; } + + static void Help(string& ret); +}; + } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/tools/ldb_cmd_execute_result.h b/tools/ldb_cmd_execute_result.h index 29ebfc240..94f271c86 100644 --- a/tools/ldb_cmd_execute_result.h +++ b/tools/ldb_cmd_execute_result.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/tools/ldb_cmd_test.cc b/tools/ldb_cmd_test.cc index edb6a2106..892f5843c 100644 --- a/tools/ldb_cmd_test.cc +++ b/tools/ldb_cmd_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/tools/ldb_test.py b/tools/ldb_test.py index 471232419..f4899587d 100644 --- a/tools/ldb_test.py +++ b/tools/ldb_test.py @@ -503,7 +503,36 @@ class LDBTestCase(unittest.TestCase): # Test on empty path. self.assertRunFAILFull(cmd % "") - + def testColumnFamilies(self): + print "Running testColumnFamilies..." + dbPath = os.path.join(self.TMP_DIR, self.DB_NAME) + self.assertRunOK("put cf1_1 1 --create_if_missing", "OK") + self.assertRunOK("put cf1_2 2 --create_if_missing", "OK") + self.assertRunOK("put cf1_3 3", "OK") + # Given non-default column family to single CF DB. + self.assertRunFAIL("get cf1_1 --column_family=two") + self.assertRunOK("create_column_family two", "OK") + self.assertRunOK("put cf2_1 1 --create_if_missing --column_family=two", + "OK") + self.assertRunOK("put cf2_2 2 --create_if_missing --column_family=two", + "OK") + self.assertRunOK("delete cf1_2", "OK") + self.assertRunOK("create_column_family three", "OK") + self.assertRunOK("delete cf2_2 --column_family=two", "OK") + self.assertRunOK( + "put cf3_1 3 --create_if_missing --column_family=three", + "OK") + self.assertRunOK("get cf1_1 --column_family=default", "1") + self.assertRunOK("dump --column_family=two", + "cf2_1 ==> 1\nKeys in range: 1") + self.assertRunOK("dump", + "cf1_1 ==> 1\ncf1_3 ==> 3\nKeys in range: 2") + self.assertRunOK("get cf2_1 --column_family=two", + "1") + self.assertRunOK("get cf3_1 --column_family=three", + "3") + # non-existing column family. + self.assertRunFAIL("get cf3_1 --column_family=four") if __name__ == "__main__": unittest.main() diff --git a/tools/ldb_tool.cc b/tools/ldb_tool.cc index c1b23ebac..7ec56f115 100644 --- a/tools/ldb_tool.cc +++ b/tools/ldb_tool.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -30,6 +30,10 @@ public: " : Values are input/output as hex\n"); ret.append(" --" + LDBCommand::ARG_HEX + " : Both keys and values are input/output as hex\n"); + ret.append( + " --" + LDBCommand::ARG_CF_NAME + + " : name of the column family to operate on. default: default column " + "family\n"); ret.append("\n"); ret.append("The following optional parameters control the database " @@ -73,19 +77,21 @@ public: ListColumnFamiliesCommand::Help(ret); DBFileDumperCommand::Help(ret); InternalDumpCommand::Help(ret); + RepairCommand::Help(ret); fprintf(stderr, "%s\n", ret.c_str()); } - static void RunCommand(int argc, char** argv, Options options, - const LDBOptions& ldb_options) { + static void RunCommand( + int argc, char** argv, Options options, const LDBOptions& ldb_options, + const std::vector* column_families) { if (argc <= 2) { PrintHelp(argv[0]); exit(1); } - LDBCommand* cmdObj = LDBCommand::InitFromCmdLineArgs(argc, argv, options, - ldb_options); + LDBCommand* cmdObj = LDBCommand::InitFromCmdLineArgs( + argc, argv, options, ldb_options, column_families); if (cmdObj == nullptr) { fprintf(stderr, "Unknown command\n"); PrintHelp(argv[0]); @@ -106,10 +112,11 @@ public: }; - void LDBTool::Run(int argc, char** argv, Options options, - const LDBOptions& ldb_options) { - LDBCommandRunner::RunCommand(argc, argv, options, ldb_options); + const LDBOptions& ldb_options, + const std::vector* column_families) { + LDBCommandRunner::RunCommand(argc, argv, options, ldb_options, + column_families); } } // namespace rocksdb diff --git a/tools/reduce_levels_test.cc b/tools/reduce_levels_test.cc index 863d8607e..2f009cb84 100644 --- a/tools/reduce_levels_test.cc +++ b/tools/reduce_levels_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. 
All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -92,8 +92,8 @@ Status ReduceLevelTest::OpenDB(bool create_if_missing, int num_levels) { bool ReduceLevelTest::ReduceLevels(int target_level) { std::vector args = rocksdb::ReduceDBLevelsCommand::PrepareArgs( dbname_, target_level, false); - LDBCommand* level_reducer = LDBCommand::InitFromCmdLineArgs( - args, Options(), LDBOptions()); + LDBCommand* level_reducer = + LDBCommand::InitFromCmdLineArgs(args, Options(), LDBOptions(), nullptr); level_reducer->Run(); bool is_succeed = level_reducer->GetExecuteState().IsSucceed(); delete level_reducer; diff --git a/tools/run_flash_bench.sh b/tools/run_flash_bench.sh index 1f59a5ada..fc2c9470f 100755 --- a/tools/run_flash_bench.sh +++ b/tools/run_flash_bench.sh @@ -22,6 +22,7 @@ # test and the tests are listed below. # # The environment variables are also optional. The variables are: +# # NKEYS - number of key/value pairs to load # BG_MBWRITEPERSEC - write rate limit in MB/second for tests in which # there is one thread doing writes and stats are @@ -54,6 +55,10 @@ # SAVE_SETUP - saves a copy of the database at the end of step 1 to # $DATA_DIR.bak. When LOG_DIR != DATA_DIR then it is copied # to $LOG_DIR.bak. +# SKIP_LOW_PRI_TESTS - skip some of the tests which aren't crucial for getting +# actionable benchmarking data (look for keywords "bulkload", +# "sync=1", and "while merging"). +# # Size constants K=1024 @@ -89,6 +94,14 @@ wal_dir=${LOG_DIR:-"/tmp/rocksdb/"} do_setup=${DO_SETUP:-1} save_setup=${SAVE_SETUP:-0} +# By default we'll run all the tests. Set this to skip a set of tests which +# aren't critical for getting key metrics. 
+skip_low_pri_tests=${SKIP_LOW_PRI_TESTS:-0} + +if [[ $skip_low_pri_tests == 1 ]]; then + echo "Skipping some non-critical tests because SKIP_LOW_PRI_TESTS is set." +fi + output_dir="/tmp/output" ARGS="\ @@ -116,16 +129,25 @@ echo -e "ops/sec\tmb/sec\tSize-GB\tL0_GB\tSum_GB\tW-Amp\tW-MB/s\tusec/op\tp50\tp if [[ $do_setup != 0 ]]; then echo Doing setup - # Test 1: bulk load - env $ARGS ./tools/benchmark.sh bulkload + if [[ $skip_low_pri_tests != 1 ]]; then + # Test 1: bulk load + env $ARGS ./tools/benchmark.sh bulkload + fi # Test 2a: sequential fill with large values to get peak ingest # adjust NUM_KEYS given the use of larger values env $ARGS BLOCK_SIZE=$((1 * M)) VALUE_SIZE=$((32 * K)) NUM_KEYS=$(( num_keys / 64 )) \ - ./tools/benchmark.sh fillseq + ./tools/benchmark.sh fillseq_disable_wal # Test 2b: sequential fill with the configured value size - env $ARGS ./tools/benchmark.sh fillseq + env $ARGS ./tools/benchmark.sh fillseq_disable_wal + + # Test 2c: same as 2a, but with WAL being enabled. + env $ARGS BLOCK_SIZE=$((1 * M)) VALUE_SIZE=$((32 * K)) NUM_KEYS=$(( num_keys / 64 )) \ + ./tools/benchmark.sh fillseq_enable_wal + + # Test 2d: same as 2b, but with WAL being enabled. 
+ env $ARGS ./tools/benchmark.sh fillseq_enable_wal # Test 3: single-threaded overwrite env $ARGS NUM_THREADS=1 DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh overwrite @@ -188,16 +210,20 @@ for num_thr in "${nthreads[@]}" ; do env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$fg_mbwps \ DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh overwrite - # Test 8: overwrite with sync=1 - env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$fg_mbwps \ - ./tools/benchmark.sh overwrite + if [[ $skip_low_pri_tests != 1 ]]; then + # Test 8: overwrite with sync=1 + env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$fg_mbwps \ + ./tools/benchmark.sh overwrite + fi # Test 9: random update with sync=0 env $ARGS DURATION=$duration NUM_THREADS=$num_thr DB_BENCH_NO_SYNC=1 \ ./tools/benchmark.sh updaterandom - # Test 10: random update with sync=1 - env $ARGS DURATION=$duration NUM_THREADS=$num_thr ./tools/benchmark.sh updaterandom + if [[ $skip_low_pri_tests != 1 ]]; then + # Test 10: random update with sync=1 + env $ARGS DURATION=$duration NUM_THREADS=$num_thr ./tools/benchmark.sh updaterandom + fi # Test 11: random read while writing env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \ @@ -219,73 +245,114 @@ for num_thr in "${nthreads[@]}" ; do env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$fg_mbwps \ DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh mergerandom - # Test 15: random merge with sync=1 - env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$fg_mbwps \ - ./tools/benchmark.sh mergerandom + if [[ $skip_low_pri_tests != 1 ]]; then + # Test 15: random merge with sync=1 + env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$fg_mbwps \ + ./tools/benchmark.sh mergerandom - # Test 16: random read while merging - env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \ - DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh readwhilemerging + # Test 16: random read while 
merging + env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \ + DB_BENCH_NO_SYNC=1 ./tools/benchmark.sh readwhilemerging - # Test 17: range scan while merging - env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \ - DB_BENCH_NO_SYNC=1 NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh fwdrangewhilemerging + # Test 17: range scan while merging + env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \ + DB_BENCH_NO_SYNC=1 NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh fwdrangewhilemerging - # Test 18: reverse range scan while merging - env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \ - DB_BENCH_NO_SYNC=1 NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh revrangewhilemerging + # Test 18: reverse range scan while merging + env $ARGS DURATION=$duration NUM_THREADS=$num_thr MB_WRITE_PER_SEC=$bg_mbwps \ + DB_BENCH_NO_SYNC=1 NUM_NEXTS_PER_SEEK=$nps ./tools/benchmark.sh revrangewhilemerging + fi done -echo bulkload > $output_dir/report2.txt +###### Universal compaction tests. + +# Use a single thread to reduce the variability in the benchmark. 
+env $ARGS COMPACTION_TEST=1 NUM_THREADS=1 ./tools/benchmark.sh universal_compaction + +if [[ $skip_low_pri_tests != 1 ]]; then + echo bulkload > $output_dir/report2.txt + head -1 $output_dir/report.txt >> $output_dir/report2.txt + grep bulkload $output_dir/report.txt >> $output_dir/report2.txt +fi + +echo fillseq_wal_disabled >> $output_dir/report2.txt head -1 $output_dir/report.txt >> $output_dir/report2.txt -grep bulkload $output_dir/report.txt >> $output_dir/report2.txt -echo fillseq >> $output_dir/report2.txt +grep fillseq.wal_disabled $output_dir/report.txt >> $output_dir/report2.txt + +echo fillseq_wal_enabled >> $output_dir/report2.txt head -1 $output_dir/report.txt >> $output_dir/report2.txt -grep fillseq $output_dir/report.txt >> $output_dir/report2.txt +grep fillseq.wal_enabled $output_dir/report.txt >> $output_dir/report2.txt + echo overwrite sync=0 >> $output_dir/report2.txt head -1 $output_dir/report.txt >> $output_dir/report2.txt grep overwrite $output_dir/report.txt | grep \.s0 >> $output_dir/report2.txt -echo overwrite sync=1 >> $output_dir/report2.txt -head -1 $output_dir/report.txt >> $output_dir/report2.txt -grep overwrite $output_dir/report.txt | grep \.s1 >> $output_dir/report2.txt + +if [[ $skip_low_pri_tests != 1 ]]; then + echo overwrite sync=1 >> $output_dir/report2.txt + head -1 $output_dir/report.txt >> $output_dir/report2.txt + grep overwrite $output_dir/report.txt | grep \.s1 >> $output_dir/report2.txt +fi + echo updaterandom sync=0 >> $output_dir/report2.txt head -1 $output_dir/report.txt >> $output_dir/report2.txt grep updaterandom $output_dir/report.txt | grep \.s0 >> $output_dir/report2.txt -echo updaterandom sync=1 >> $output_dir/report2.txt -head -1 $output_dir/report.txt >> $output_dir/report2.txt -grep updaterandom $output_dir/report.txt | grep \.s1 >> $output_dir/report2.txt + +if [[ $skip_low_pri_tests != 1 ]]; then + echo updaterandom sync=1 >> $output_dir/report2.txt + head -1 $output_dir/report.txt >> 
$output_dir/report2.txt + grep updaterandom $output_dir/report.txt | grep \.s1 >> $output_dir/report2.txt +fi + echo mergerandom sync=0 >> $output_dir/report2.txt head -1 $output_dir/report.txt >> $output_dir/report2.txt grep mergerandom $output_dir/report.txt | grep \.s0 >> $output_dir/report2.txt -echo mergerandom sync=1 >> $output_dir/report2.txt -head -1 $output_dir/report.txt >> $output_dir/report2.txt -grep mergerandom $output_dir/report.txt | grep \.s1 >> $output_dir/report2.txt + +if [[ $skip_low_pri_tests != 1 ]]; then + echo mergerandom sync=1 >> $output_dir/report2.txt + head -1 $output_dir/report.txt >> $output_dir/report2.txt + grep mergerandom $output_dir/report.txt | grep \.s1 >> $output_dir/report2.txt +fi + echo readrandom >> $output_dir/report2.txt head -1 $output_dir/report.txt >> $output_dir/report2.txt grep readrandom $output_dir/report.txt >> $output_dir/report2.txt + echo fwdrange >> $output_dir/report2.txt head -1 $output_dir/report.txt >> $output_dir/report2.txt grep fwdrange\.t $output_dir/report.txt >> $output_dir/report2.txt + echo revrange >> $output_dir/report2.txt head -1 $output_dir/report.txt >> $output_dir/report2.txt grep revrange\.t $output_dir/report.txt >> $output_dir/report2.txt + echo readwhile >> $output_dir/report2.txt >> $output_dir/report2.txt head -1 $output_dir/report.txt >> $output_dir/report2.txt grep readwhilewriting $output_dir/report.txt >> $output_dir/report2.txt -echo readwhile >> $output_dir/report2.txt -head -1 $output_dir/report.txt >> $output_dir/report2.txt -grep readwhilemerging $output_dir/report.txt >> $output_dir/report2.txt + +if [[ $skip_low_pri_tests != 1 ]]; then + echo readwhile >> $output_dir/report2.txt + head -1 $output_dir/report.txt >> $output_dir/report2.txt + grep readwhilemerging $output_dir/report.txt >> $output_dir/report2.txt +fi + echo fwdreadwhilewriting >> $output_dir/report2.txt head -1 $output_dir/report.txt >> $output_dir/report2.txt grep fwdrangewhilewriting $output_dir/report.txt 
>> $output_dir/report2.txt -echo fwdreadwhilemerging >> $output_dir/report2.txt -head -1 $output_dir/report.txt >> $output_dir/report2.txt -grep fwdrangewhilemerg $output_dir/report.txt >> $output_dir/report2.txt + +if [[ $skip_low_pri_tests != 1 ]]; then + echo fwdreadwhilemerging >> $output_dir/report2.txt + head -1 $output_dir/report.txt >> $output_dir/report2.txt + grep fwdrangewhilemerg $output_dir/report.txt >> $output_dir/report2.txt +fi + echo revreadwhilewriting >> $output_dir/report2.txt head -1 $output_dir/report.txt >> $output_dir/report2.txt grep revrangewhilewriting $output_dir/report.txt >> $output_dir/report2.txt -echo revreadwhilemerging >> $output_dir/report2.txt -head -1 $output_dir/report.txt >> $output_dir/report2.txt -grep revrangewhilemerg $output_dir/report.txt >> $output_dir/report2.txt + +if [[ $skip_low_pri_tests != 1 ]]; then + echo revreadwhilemerging >> $output_dir/report2.txt + head -1 $output_dir/report.txt >> $output_dir/report2.txt + grep revrangewhilemerg $output_dir/report.txt >> $output_dir/report2.txt +fi cat $output_dir/report2.txt diff --git a/tools/sst_dump.cc b/tools/sst_dump.cc index 403893779..a0b253e9b 100644 --- a/tools/sst_dump.cc +++ b/tools/sst_dump.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/tools/sst_dump_test.cc b/tools/sst_dump_test.cc index b40a3346c..84edaea37 100644 --- a/tools/sst_dump_test.cc +++ b/tools/sst_dump_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc index 23a33fc1a..316ee4045 100644 --- a/tools/sst_dump_tool.cc +++ b/tools/sst_dump_tool.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -12,6 +12,29 @@ #endif #include +#include +#include +#include + +#include "db/memtable.h" +#include "db/write_batch_internal.h" +#include "rocksdb/db.h" +#include "rocksdb/env.h" +#include "rocksdb/immutable_options.h" +#include "rocksdb/iterator.h" +#include "rocksdb/slice_transform.h" +#include "rocksdb/status.h" +#include "rocksdb/table_properties.h" +#include "table/block.h" +#include "table/block_based_table_builder.h" +#include "table/block_based_table_factory.h" +#include "table/block_builder.h" +#include "table/format.h" +#include "table/meta_blocks.h" +#include "table/plain_table_factory.h" +#include "tools/ldb_cmd.h" +#include "util/random.h" + #include "port/port.h" namespace rocksdb { diff --git a/tools/sst_dump_tool_imp.h b/tools/sst_dump_tool_imp.h index dd65d3b10..6bbc4d676 100644 --- a/tools/sst_dump_tool_imp.h +++ b/tools/sst_dump_tool_imp.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -7,33 +7,10 @@ #include "rocksdb/sst_dump_tool.h" -#include -#include +#include #include -#include - #include "db/dbformat.h" -#include "db/memtable.h" -#include "db/write_batch_internal.h" -#include "rocksdb/db.h" -#include "rocksdb/env.h" -#include "rocksdb/immutable_options.h" -#include "rocksdb/iterator.h" -#include "rocksdb/slice_transform.h" -#include "rocksdb/status.h" -#include "rocksdb/table_properties.h" -#include "table/block.h" -#include "table/block_based_table_builder.h" -#include "table/block_based_table_factory.h" -#include "table/block_builder.h" -#include "table/format.h" -#include "table/meta_blocks.h" -#include "table/plain_table_factory.h" -#include "tools/ldb_cmd.h" #include "util/file_reader_writer.h" -#include "util/random.h" -#include "util/testharness.h" -#include "util/testutil.h" namespace rocksdb { diff --git a/tools/write_stress.cc b/tools/write_stress.cc index 05321fece..c2cbec4f4 100644 --- a/tools/write_stress.cc +++ b/tools/write_stress.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/aligned_buffer.h b/util/aligned_buffer.h index 2244316fe..2f79f12f7 100644 --- a/util/aligned_buffer.h +++ b/util/aligned_buffer.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/util/allocator.h b/util/allocator.h index 58bf0da31..ee253528a 100644 --- a/util/allocator.h +++ b/util/allocator.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/arena.cc b/util/arena.cc index 1d292ec01..77eec10e7 100644 --- a/util/arena.cc +++ b/util/arena.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -26,7 +26,7 @@ const size_t Arena::kInlineSize; #endif const size_t Arena::kMinBlockSize = 4096; -const size_t Arena::kMaxBlockSize = 2 << 30; +const size_t Arena::kMaxBlockSize = 2u << 30; static const int kAlignUnit = sizeof(void*); size_t OptimizeBlockSize(size_t block_size) { diff --git a/util/arena.h b/util/arena.h index db2150a8f..f96794ad1 100644 --- a/util/arena.h +++ b/util/arena.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/arena_test.cc b/util/arena_test.cc index d4fa48cfc..3e9a74b86 100644 --- a/util/arena_test.cc +++ b/util/arena_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/autovector.h b/util/autovector.h index 266a53a56..74fcc70ec 100644 --- a/util/autovector.h +++ b/util/autovector.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/autovector_test.cc b/util/autovector_test.cc index 94e992660..d72bd507f 100644 --- a/util/autovector_test.cc +++ b/util/autovector_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/bloom.cc b/util/bloom.cc index d3f3abd61..4d30d7e82 100644 --- a/util/bloom.cc +++ b/util/bloom.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/util/bloom_test.cc b/util/bloom_test.cc index aac5b3978..6cc256316 100644 --- a/util/bloom_test.cc +++ b/util/bloom_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/build_version.h b/util/build_version.h index ca1dbf5f9..e53c3a06a 100644 --- a/util/build_version.h +++ b/util/build_version.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/cache.cc b/util/cache.cc index 9c47edd0e..6015644f6 100644 --- a/util/cache.cc +++ b/util/cache.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -196,10 +196,13 @@ class LRUCache { // free the needed space void SetCapacity(size_t capacity); + // Set the flag to reject insertion if cache if full. + void SetStrictCapacityLimit(bool strict_capacity_limit); + // Like Cache methods, but with an extra "hash" parameter. 
- Cache::Handle* Insert(const Slice& key, uint32_t hash, - void* value, size_t charge, - void (*deleter)(const Slice& key, void* value)); + Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), + Cache::Handle** handle); Cache::Handle* Lookup(const Slice& key, uint32_t hash); void Release(Cache::Handle* handle); void Erase(const Slice& key, uint32_t hash); @@ -245,6 +248,9 @@ class LRUCache { // Memory size for entries residing only in the LRU list size_t lru_usage_; + // Whether to reject insertion if cache reaches its full capacity. + bool strict_capacity_limit_; + // mutex_ protects the following state. // We don't count mutex_ as the cache's internal state so semantically we // don't mind mutex_ invoking the non-const actions. @@ -336,6 +342,11 @@ void LRUCache::SetCapacity(size_t capacity) { } } +void LRUCache::SetStrictCapacityLimit(bool strict_capacity_limit) { + MutexLock l(&mutex_); + strict_capacity_limit_ = strict_capacity_limit; +} + Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) { MutexLock l(&mutex_); LRUHandle* e = table_.Lookup(key, hash); @@ -350,6 +361,9 @@ Cache::Handle* LRUCache::Lookup(const Slice& key, uint32_t hash) { } void LRUCache::Release(Cache::Handle* handle) { + if (handle == nullptr) { + return; + } LRUHandle* e = reinterpret_cast(handle); bool last_reference = false; { @@ -383,15 +397,16 @@ void LRUCache::Release(Cache::Handle* handle) { } } -Cache::Handle* LRUCache::Insert( - const Slice& key, uint32_t hash, void* value, size_t charge, - void (*deleter)(const Slice& key, void* value)) { - +Status LRUCache::Insert(const Slice& key, uint32_t hash, void* value, + size_t charge, + void (*deleter)(const Slice& key, void* value), + Cache::Handle** handle) { // Allocate the memory here outside of the mutex // If the cache is full, we'll have to release it // It shouldn't happen very often though. 
LRUHandle* e = reinterpret_cast( new char[sizeof(LRUHandle) - 1 + key.size()]); + Status s; autovector last_reference_list; e->value = value; @@ -399,7 +414,9 @@ Cache::Handle* LRUCache::Insert( e->charge = charge; e->key_length = key.size(); e->hash = hash; - e->refs = 2; // One from LRUCache, one for the returned handle + e->refs = (handle == nullptr + ? 1 + : 2); // One from LRUCache, one for the returned handle e->next = e->prev = nullptr; e->in_cache = true; memcpy(e->key_data, key.data(), key.size()); @@ -411,20 +428,36 @@ Cache::Handle* LRUCache::Insert( // is freed or the lru list is empty EvictFromLRU(charge, &last_reference_list); - // insert into the cache - // note that the cache might get larger than its capacity if not enough - // space was freed - LRUHandle* old = table_.Insert(e); - usage_ += e->charge; - if (old != nullptr) { - old->in_cache = false; - if (Unref(old)) { - usage_ -= old->charge; - // old is on LRU because it's in cache and its reference count - // was just 1 (Unref returned 0) - LRU_Remove(old); - last_reference_list.push_back(old); + if (strict_capacity_limit_ && usage_ - lru_usage_ + charge > capacity_) { + if (handle == nullptr) { + last_reference_list.push_back(e); + } else { + delete[] reinterpret_cast(e); + *handle = nullptr; + } + s = Status::Incomplete("Insert failed due to LRU cache being full."); + } else { + // insert into the cache + // note that the cache might get larger than its capacity if not enough + // space was freed + LRUHandle* old = table_.Insert(e); + usage_ += e->charge; + if (old != nullptr) { + old->in_cache = false; + if (Unref(old)) { + usage_ -= old->charge; + // old is on LRU because it's in cache and its reference count + // was just 1 (Unref returned 0) + LRU_Remove(old); + last_reference_list.push_back(old); + } + } + if (handle == nullptr) { + LRU_Append(e); + } else { + *handle = reinterpret_cast(e); } + s = Status::OK(); } } @@ -434,7 +467,7 @@ Cache::Handle* LRUCache::Insert( entry->Free(); } - 
return reinterpret_cast(e); + return s; } void LRUCache::Erase(const Slice& key, uint32_t hash) { @@ -472,6 +505,7 @@ class ShardedLRUCache : public Cache { uint64_t last_id_; int num_shard_bits_; size_t capacity_; + bool strict_capacity_limit_; static inline uint32_t HashSlice(const Slice& s) { return Hash(s.data(), s.size(), 0); @@ -483,13 +517,18 @@ class ShardedLRUCache : public Cache { } public: - ShardedLRUCache(size_t capacity, int num_shard_bits) - : last_id_(0), num_shard_bits_(num_shard_bits), capacity_(capacity) { + ShardedLRUCache(size_t capacity, int num_shard_bits, + bool strict_capacity_limit) + : last_id_(0), + num_shard_bits_(num_shard_bits), + capacity_(capacity), + strict_capacity_limit_(strict_capacity_limit) { int num_shards = 1 << num_shard_bits_; shards_ = new LRUCache[num_shards]; const size_t per_shard = (capacity + (num_shards - 1)) / num_shards; for (int s = 0; s < num_shards; s++) { shards_[s].SetCapacity(per_shard); + shards_[s].SetStrictCapacityLimit(strict_capacity_limit); } } virtual ~ShardedLRUCache() { @@ -504,11 +543,19 @@ class ShardedLRUCache : public Cache { } capacity_ = capacity; } - virtual Handle* Insert(const Slice& key, void* value, size_t charge, - void (*deleter)(const Slice& key, - void* value)) override { + virtual void SetStrictCapacityLimit(bool strict_capacity_limit) override { + int num_shards = 1 << num_shard_bits_; + for (int s = 0; s < num_shards; s++) { + shards_[s].SetStrictCapacityLimit(strict_capacity_limit); + } + strict_capacity_limit_ = strict_capacity_limit; + } + virtual Status Insert(const Slice& key, void* value, size_t charge, + void (*deleter)(const Slice& key, void* value), + Handle** handle) override { const uint32_t hash = HashSlice(key); - return shards_[Shard(hash)].Insert(key, hash, value, charge, deleter); + return shards_[Shard(hash)].Insert(key, hash, value, charge, deleter, + handle); } virtual Handle* Lookup(const Slice& key) override { const uint32_t hash = HashSlice(key); @@ -531,6 
+578,10 @@ class ShardedLRUCache : public Cache { } virtual size_t GetCapacity() const override { return capacity_; } + virtual bool HasStrictCapacityLimit() const override { + return strict_capacity_limit_; + } + virtual size_t GetUsage() const override { // We will not lock the cache when getting the usage from shards. int num_shards = 1 << num_shard_bits_; @@ -569,14 +620,20 @@ class ShardedLRUCache : public Cache { } // end anonymous namespace shared_ptr NewLRUCache(size_t capacity) { - return NewLRUCache(capacity, kNumShardBits); + return NewLRUCache(capacity, kNumShardBits, false); } shared_ptr NewLRUCache(size_t capacity, int num_shard_bits) { + return NewLRUCache(capacity, num_shard_bits, false); +} + +shared_ptr NewLRUCache(size_t capacity, int num_shard_bits, + bool strict_capacity_limit) { if (num_shard_bits >= 20) { return nullptr; // the cache cannot be sharded into too many fine pieces } - return std::make_shared(capacity, num_shard_bits); + return std::make_shared(capacity, num_shard_bits, + strict_capacity_limit); } } // namespace rocksdb diff --git a/util/cache_bench.cc b/util/cache_bench.cc index 92df77267..266c9e1c5 100644 --- a/util/cache_bench.cc +++ b/util/cache_bench.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -142,8 +142,7 @@ class CacheBench { // Cast uint64* to be char*, data would be copied to cache Slice key(reinterpret_cast(&rand_key), 8); // do insert - auto handle = cache_->Insert(key, new char[10], 1, &deleter); - cache_->Release(handle); + cache_->Insert(key, new char[10], 1, &deleter); } } @@ -221,8 +220,7 @@ class CacheBench { int32_t prob_op = thread->rnd.Uniform(100); if (prob_op >= 0 && prob_op < FLAGS_insert_percent) { // do insert - auto handle = cache_->Insert(key, new char[10], 1, &deleter); - cache_->Release(handle); + cache_->Insert(key, new char[10], 1, &deleter); } else if (prob_op -= FLAGS_insert_percent && prob_op < FLAGS_lookup_percent) { // do lookup diff --git a/util/cache_test.cc b/util/cache_test.cc index c8b2de8f5..3df71c098 100644 --- a/util/cache_test.cc +++ b/util/cache_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -73,8 +73,8 @@ class CacheTest : public testing::Test { } void Insert(shared_ptr cache, int key, int value, int charge = 1) { - cache->Release(cache->Insert(EncodeKey(key), EncodeValue(value), charge, - &CacheTest::Deleter)); + cache->Insert(EncodeKey(key), EncodeValue(value), charge, + &CacheTest::Deleter); } void Erase(shared_ptr cache, int key) { @@ -118,14 +118,12 @@ TEST_F(CacheTest, UsageTest) { auto cache = NewLRUCache(kCapacity, 8); size_t usage = 0; - const char* value = "abcdef"; + char value[10] = "abcdef"; // make sure everything will be cached for (int i = 1; i < 100; ++i) { std::string key(i, 'a'); auto kv_size = key.size() + 5; - cache->Release( - cache->Insert(key, (void*)value, kv_size, dumbDeleter) - ); + cache->Insert(key, reinterpret_cast(value), kv_size, dumbDeleter); usage += kv_size; ASSERT_EQ(usage, cache->GetUsage()); } @@ -133,9 +131,8 @@ TEST_F(CacheTest, UsageTest) { // make sure the cache will be overloaded for (uint64_t i = 1; i < kCapacity; ++i) { auto key = ToString(i); - cache->Release( - cache->Insert(key, (void*)value, key.size() + 5, dumbDeleter) - ); + cache->Insert(key, reinterpret_cast(value), key.size() + 5, + dumbDeleter); } // the usage should be close to the capacity @@ -149,7 +146,7 @@ TEST_F(CacheTest, PinnedUsageTest) { auto cache = NewLRUCache(kCapacity, 8); size_t pinned_usage = 0; - const char* value = "abcdef"; + char value[10] = "abcdef"; std::forward_list unreleased_handles; @@ -158,7 +155,9 @@ TEST_F(CacheTest, PinnedUsageTest) { for (int i = 1; i < 100; ++i) { std::string key(i, 'a'); auto kv_size = key.size() + 5; - auto handle = cache->Insert(key, (void*)value, kv_size, dumbDeleter); + Cache::Handle* handle; + cache->Insert(key, reinterpret_cast(value), kv_size, dumbDeleter, + &handle); pinned_usage += kv_size; ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); if (i % 2 == 0) { @@ -182,8 +181,8 @@ TEST_F(CacheTest, PinnedUsageTest) { // check that overloading the cache does not change the pinned usage for 
(uint64_t i = 1; i < 2 * kCapacity; ++i) { auto key = ToString(i); - cache->Release( - cache->Insert(key, (void*)value, key.size() + 5, dumbDeleter)); + cache->Insert(key, reinterpret_cast(value), key.size() + 5, + dumbDeleter); } ASSERT_EQ(pinned_usage, cache->GetPinnedUsage()); @@ -408,7 +407,8 @@ TEST_F(CacheTest, SetCapacity) { // Insert 5 entries, but not releasing. for (size_t i = 0; i < 5; i++) { std::string key = ToString(i+1); - handles[i] = cache->Insert(key, new Value(i+1), 1, &deleter); + Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]); + ASSERT_TRUE(s.ok()); } ASSERT_EQ(5U, cache->GetCapacity()); ASSERT_EQ(5U, cache->GetUsage()); @@ -422,7 +422,8 @@ TEST_F(CacheTest, SetCapacity) { // and usage should be 7 for (size_t i = 5; i < 10; i++) { std::string key = ToString(i+1); - handles[i] = cache->Insert(key, new Value(i+1), 1, &deleter); + Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]); + ASSERT_TRUE(s.ok()); } ASSERT_EQ(10U, cache->GetCapacity()); ASSERT_EQ(10U, cache->GetUsage()); @@ -441,6 +442,53 @@ TEST_F(CacheTest, SetCapacity) { } } +TEST_F(CacheTest, SetStrictCapacityLimit) { + // test1: set the flag to false. Insert more keys than capacity. See if they + // all go through. + std::shared_ptr cache = NewLRUCache(5, 0, false); + std::vector handles(10); + Status s; + for (size_t i = 0; i < 10; i++) { + std::string key = ToString(i + 1); + s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]); + ASSERT_TRUE(s.ok()); + ASSERT_NE(nullptr, handles[i]); + } + + // test2: set the flag to true. Insert and check if it fails. 
+ std::string extra_key = "extra"; + Value* extra_value = new Value(0); + cache->SetStrictCapacityLimit(true); + Cache::Handle* handle; + s = cache->Insert(extra_key, extra_value, 1, &deleter, &handle); + ASSERT_TRUE(s.IsIncomplete()); + ASSERT_EQ(nullptr, handle); + + for (size_t i = 0; i < 10; i++) { + cache->Release(handles[i]); + } + + // test3: init with flag being true. + std::shared_ptr cache2 = NewLRUCache(5, 0, true); + for (size_t i = 0; i < 5; i++) { + std::string key = ToString(i + 1); + s = cache2->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]); + ASSERT_TRUE(s.ok()); + ASSERT_NE(nullptr, handles[i]); + } + s = cache2->Insert(extra_key, extra_value, 1, &deleter, &handle); + ASSERT_TRUE(s.IsIncomplete()); + ASSERT_EQ(nullptr, handle); + // test insert without handle + s = cache2->Insert(extra_key, extra_value, 1, &deleter); + ASSERT_TRUE(s.IsIncomplete()); + ASSERT_EQ(5, cache->GetUsage()); + + for (size_t i = 0; i < 5; i++) { + cache2->Release(handles[i]); + } +} + TEST_F(CacheTest, OverCapacity) { size_t n = 10; @@ -452,7 +500,8 @@ TEST_F(CacheTest, OverCapacity) { // Insert n+1 entries, but not releasing. for (size_t i = 0; i < n + 1; i++) { std::string key = ToString(i+1); - handles[i] = cache->Insert(key, new Value(i+1), 1, &deleter); + Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]); + ASSERT_TRUE(s.ok()); } // Guess what's in the cache now? diff --git a/util/channel.h b/util/channel.h index a8987163f..974bed093 100644 --- a/util/channel.h +++ b/util/channel.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/util/coding.cc b/util/coding.cc index f09e67284..d38fdb1f8 100644 --- a/util/coding.cc +++ b/util/coding.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/coding.h b/util/coding.h index 5ea9aad40..6aaf403a3 100644 --- a/util/coding.h +++ b/util/coding.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/coding_test.cc b/util/coding_test.cc index e3c265b69..d724ef4c1 100644 --- a/util/coding_test.cc +++ b/util/coding_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/compaction_job_stats_impl.cc b/util/compaction_job_stats_impl.cc index 01f022f3c..4610496f8 100644 --- a/util/compaction_job_stats_impl.cc +++ b/util/compaction_job_stats_impl.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/comparator.cc b/util/comparator.cc index 6d7709db5..cb802d55b 100644 --- a/util/comparator.cc +++ b/util/comparator.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/compression.h b/util/compression.h index ac285e5f1..2690e3001 100644 --- a/util/compression.h +++ b/util/compression.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/concurrent_arena.cc b/util/concurrent_arena.cc index fae09d7d2..6aa82751f 100644 --- a/util/concurrent_arena.cc +++ b/util/concurrent_arena.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/concurrent_arena.h b/util/concurrent_arena.h index fb29c87a1..ba4b024b9 100644 --- a/util/concurrent_arena.h +++ b/util/concurrent_arena.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/crc32c.cc b/util/crc32c.cc index b8d281a27..ce574544e 100644 --- a/util/crc32c.cc +++ b/util/crc32c.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -292,10 +292,12 @@ static inline uint32_t LE_LOAD32(const uint8_t *p) { } #ifdef __SSE4_2__ +#ifdef __LP64__ static inline uint64_t LE_LOAD64(const uint8_t *p) { return DecodeFixed64(reinterpret_cast(p)); } #endif +#endif static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) { uint32_t c = static_cast(*l ^ LE_LOAD32(*p)); @@ -315,8 +317,15 @@ static inline void Slow_CRC32(uint64_t* l, uint8_t const **p) { static inline void Fast_CRC32(uint64_t* l, uint8_t const **p) { #ifdef __SSE4_2__ +#ifdef __LP64__ *l = _mm_crc32_u64(*l, LE_LOAD64(*p)); *p += 8; +#else + *l = _mm_crc32_u32(static_cast(*l), LE_LOAD32(*p)); + *p += 4; + *l = _mm_crc32_u32(static_cast(*l), LE_LOAD32(*p)); + *p += 4; +#endif #else Slow_CRC32(l, p); #endif diff --git a/util/crc32c.h b/util/crc32c.h index 14167c1a0..90d950c6e 100644 --- a/util/crc32c.h +++ b/util/crc32c.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/util/crc32c_test.cc b/util/crc32c_test.cc index 413302a24..47a24ddac 100644 --- a/util/crc32c_test.cc +++ b/util/crc32c_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/delete_scheduler_impl.cc b/util/delete_scheduler.cc similarity index 62% rename from util/delete_scheduler_impl.cc rename to util/delete_scheduler.cc index e0f7511e0..b403c0572 100644 --- a/util/delete_scheduler_impl.cc +++ b/util/delete_scheduler.cc @@ -1,40 +1,42 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
-#include "util/delete_scheduler_impl.h" +#include "util/delete_scheduler.h" #include #include #include "port/port.h" #include "rocksdb/env.h" +#include "util/sst_file_manager_impl.h" #include "util/mutexlock.h" #include "util/sync_point.h" namespace rocksdb { -DeleteSchedulerImpl::DeleteSchedulerImpl(Env* env, const std::string& trash_dir, - int64_t rate_bytes_per_sec, - std::shared_ptr info_log) +DeleteScheduler::DeleteScheduler(Env* env, const std::string& trash_dir, + int64_t rate_bytes_per_sec, Logger* info_log, + SstFileManagerImpl* sst_file_manager) : env_(env), trash_dir_(trash_dir), rate_bytes_per_sec_(rate_bytes_per_sec), pending_files_(0), closing_(false), cv_(&mu_), - info_log_(info_log) { - if (rate_bytes_per_sec_ == 0) { + info_log_(info_log), + sst_file_manager_(sst_file_manager) { + if (rate_bytes_per_sec_ <= 0) { // Rate limiting is disabled bg_thread_.reset(); } else { bg_thread_.reset( - new std::thread(&DeleteSchedulerImpl::BackgroundEmptyTrash, this)); + new std::thread(&DeleteScheduler::BackgroundEmptyTrash, this)); } } -DeleteSchedulerImpl::~DeleteSchedulerImpl() { +DeleteScheduler::~DeleteScheduler() { { MutexLock l(&mu_); closing_ = true; @@ -45,20 +47,29 @@ DeleteSchedulerImpl::~DeleteSchedulerImpl() { } } -Status DeleteSchedulerImpl::DeleteFile(const std::string& file_path) { - if (rate_bytes_per_sec_ == 0) { +Status DeleteScheduler::DeleteFile(const std::string& file_path) { + Status s; + if (rate_bytes_per_sec_ <= 0) { // Rate limiting is disabled - return env_->DeleteFile(file_path); + s = env_->DeleteFile(file_path); + if (s.ok() && sst_file_manager_) { + sst_file_manager_->OnDeleteFile(file_path); + } + return s; } // Move file to trash std::string path_in_trash; - Status s = MoveToTrash(file_path, &path_in_trash); + s = MoveToTrash(file_path, &path_in_trash); if (!s.ok()) { Log(InfoLogLevel::ERROR_LEVEL, info_log_, "Failed to move %s to trash directory (%s)", file_path.c_str(), trash_dir_.c_str()); - return 
env_->DeleteFile(file_path); + s = env_->DeleteFile(file_path); + if (s.ok() && sst_file_manager_) { + sst_file_manager_->OnDeleteFile(file_path); + } + return s; } // Add file to delete queue @@ -73,13 +84,13 @@ Status DeleteSchedulerImpl::DeleteFile(const std::string& file_path) { return s; } -std::map DeleteSchedulerImpl::GetBackgroundErrors() { +std::map DeleteScheduler::GetBackgroundErrors() { MutexLock l(&mu_); return bg_errors_; } -Status DeleteSchedulerImpl::MoveToTrash(const std::string& file_path, - std::string* path_in_trash) { +Status DeleteScheduler::MoveToTrash(const std::string& file_path, + std::string* path_in_trash) { Status s; // Figure out the name of the file in trash folder size_t idx = file_path.rfind("/"); @@ -112,11 +123,14 @@ Status DeleteSchedulerImpl::MoveToTrash(const std::string& file_path, break; } } + if (s.ok() && sst_file_manager_) { + sst_file_manager_->OnMoveFile(file_path, *path_in_trash); + } return s; } -void DeleteSchedulerImpl::BackgroundEmptyTrash() { - TEST_SYNC_POINT("DeleteSchedulerImpl::BackgroundEmptyTrash"); +void DeleteScheduler::BackgroundEmptyTrash() { + TEST_SYNC_POINT("DeleteScheduler::BackgroundEmptyTrash"); while (true) { MutexLock l(&mu_); @@ -151,7 +165,7 @@ void DeleteSchedulerImpl::BackgroundEmptyTrash() { uint64_t total_penlty = ((total_deleted_bytes * kMicrosInSecond) / rate_bytes_per_sec_); while (!closing_ && !cv_.TimedWait(start_time + total_penlty)) {} - TEST_SYNC_POINT_CALLBACK("DeleteSchedulerImpl::BackgroundEmptyTrash:Wait", + TEST_SYNC_POINT_CALLBACK("DeleteScheduler::BackgroundEmptyTrash:Wait", &total_penlty); pending_files_--; @@ -164,12 +178,12 @@ void DeleteSchedulerImpl::BackgroundEmptyTrash() { } } -Status DeleteSchedulerImpl::DeleteTrashFile(const std::string& path_in_trash, - uint64_t* deleted_bytes) { +Status DeleteScheduler::DeleteTrashFile(const std::string& path_in_trash, + uint64_t* deleted_bytes) { uint64_t file_size; Status s = env_->GetFileSize(path_in_trash, &file_size); if 
(s.ok()) { - TEST_SYNC_POINT("DeleteSchedulerImpl::DeleteTrashFile:DeleteFile"); + TEST_SYNC_POINT("DeleteScheduler::DeleteTrashFile:DeleteFile"); s = env_->DeleteFile(path_in_trash); } @@ -181,51 +195,19 @@ Status DeleteSchedulerImpl::DeleteTrashFile(const std::string& path_in_trash, *deleted_bytes = 0; } else { *deleted_bytes = file_size; + if (sst_file_manager_) { + sst_file_manager_->OnDeleteFile(path_in_trash); + } } return s; } -void DeleteSchedulerImpl::WaitForEmptyTrash() { +void DeleteScheduler::WaitForEmptyTrash() { MutexLock l(&mu_); while (pending_files_ > 0 && !closing_) { cv_.Wait(); } } -DeleteScheduler* NewDeleteScheduler(Env* env, const std::string& trash_dir, - int64_t rate_bytes_per_sec, - std::shared_ptr info_log, - bool delete_exisitng_trash, - Status* status) { - DeleteScheduler* res = - new DeleteSchedulerImpl(env, trash_dir, rate_bytes_per_sec, info_log); - - Status s; - if (trash_dir != "") { - s = env->CreateDirIfMissing(trash_dir); - if (s.ok() && delete_exisitng_trash) { - std::vector files_in_trash; - s = env->GetChildren(trash_dir, &files_in_trash); - if (s.ok()) { - for (const std::string& trash_file : files_in_trash) { - if (trash_file == "." || trash_file == "..") { - continue; - } - Status file_delete = res->DeleteFile(trash_dir + "/" + trash_file); - if (s.ok() && !file_delete.ok()) { - s = file_delete; - } - } - } - } - } - - if (status) { - *status = s; - } - - return res; -} - } // namespace rocksdb diff --git a/util/delete_scheduler_impl.h b/util/delete_scheduler.h similarity index 71% rename from util/delete_scheduler_impl.h rename to util/delete_scheduler.h index 32ef65f0c..eec118708 100644 --- a/util/delete_scheduler_impl.h +++ b/util/delete_scheduler.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -12,21 +12,28 @@ #include "port/port.h" -#include "rocksdb/delete_scheduler.h" #include "rocksdb/status.h" namespace rocksdb { class Env; class Logger; - -class DeleteSchedulerImpl : public DeleteScheduler { +class SstFileManagerImpl; + +// DeleteScheduler allows the DB to enforce a rate limit on file deletion, +// Instead of deleteing files immediately, files are moved to trash_dir +// and deleted in a background thread that apply sleep penlty between deletes +// if they are happening in a rate faster than rate_bytes_per_sec, +// +// Rate limiting can be turned off by setting rate_bytes_per_sec = 0, In this +// case DeleteScheduler will delete files immediately. +class DeleteScheduler { public: - DeleteSchedulerImpl(Env* env, const std::string& trash_dir, - int64_t rate_bytes_per_sec, - std::shared_ptr info_log); + DeleteScheduler(Env* env, const std::string& trash_dir, + int64_t rate_bytes_per_sec, Logger* info_log, + SstFileManagerImpl* sst_file_manager); - ~DeleteSchedulerImpl(); + ~DeleteScheduler(); // Return delete rate limit in bytes per second int64_t GetRateBytesPerSecond() { return rate_bytes_per_sec_; } @@ -63,7 +70,7 @@ class DeleteSchedulerImpl : public DeleteScheduler { int32_t pending_files_; // Errors that happened in BackgroundEmptyTrash (file_path => error) std::map bg_errors_; - // Set to true in ~DeleteSchedulerImpl() to force BackgroundEmptyTrash to stop + // Set to true in ~DeleteScheduler() to force BackgroundEmptyTrash to stop bool closing_; // Condition variable signaled in these conditions // - pending_files_ value change from 0 => 1 @@ -74,7 +81,8 @@ class DeleteSchedulerImpl : public DeleteScheduler { std::unique_ptr bg_thread_; // Mutex to protect threads from file name conflicts port::Mutex file_move_mu_; - std::shared_ptr info_log_; + Logger* info_log_; + SstFileManagerImpl* sst_file_manager_; static const uint64_t kMicrosInSecond = 
1000 * 1000LL; }; diff --git a/util/delete_scheduler_test.cc b/util/delete_scheduler_test.cc index fcd821c15..563813f9b 100644 --- a/util/delete_scheduler_test.cc +++ b/util/delete_scheduler_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -12,9 +12,9 @@ #include #include -#include "rocksdb/delete_scheduler.h" #include "rocksdb/env.h" #include "rocksdb/options.h" +#include "util/delete_scheduler.h" #include "util/string_util.h" #include "util/sync_point.h" #include "util/testharness.h" @@ -74,6 +74,12 @@ class DeleteSchedulerTest : public testing::Test { return file_path; } + void NewDeleteScheduler() { + ASSERT_OK(env_->CreateDirIfMissing(trash_dir_)); + delete_scheduler_.reset(new DeleteScheduler( + env_, trash_dir_, rate_bytes_per_sec_, nullptr, nullptr)); + } + Env* env_; std::string dummy_files_dir_; std::string trash_dir_; @@ -84,19 +90,19 @@ class DeleteSchedulerTest : public testing::Test { // Test the basic functionality of DeleteScheduler (Rate Limiting). 
// 1- Create 100 dummy files // 2- Delete the 100 dummy files using DeleteScheduler -// --- Hold DeleteSchedulerImpl::BackgroundEmptyTrash --- +// --- Hold DeleteScheduler::BackgroundEmptyTrash --- // 3- Wait for DeleteScheduler to delete all files in trash // 4- Verify that BackgroundEmptyTrash used to correct penlties for the files // 5- Make sure that all created files were completely deleted TEST_F(DeleteSchedulerTest, BasicRateLimiting) { rocksdb::SyncPoint::GetInstance()->LoadDependency({ {"DeleteSchedulerTest::BasicRateLimiting:1", - "DeleteSchedulerImpl::BackgroundEmptyTrash"}, + "DeleteScheduler::BackgroundEmptyTrash"}, }); std::vector penalties; rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DeleteSchedulerImpl::BackgroundEmptyTrash:Wait", + "DeleteScheduler::BackgroundEmptyTrash:Wait", [&](void* arg) { penalties.push_back(*(static_cast(arg))); }); int num_files = 100; // 100 files @@ -110,8 +116,7 @@ TEST_F(DeleteSchedulerTest, BasicRateLimiting) { DestroyAndCreateDir(dummy_files_dir_); rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024; - delete_scheduler_.reset( - NewDeleteScheduler(env_, trash_dir_, rate_bytes_per_sec_)); + NewDeleteScheduler(); // Create 100 dummy files, every file is 1 Kb std::vector generated_files; @@ -152,19 +157,19 @@ TEST_F(DeleteSchedulerTest, BasicRateLimiting) { // Same as the BasicRateLimiting test but delete files in multiple threads. 
// 1- Create 100 dummy files // 2- Delete the 100 dummy files using DeleteScheduler using 10 threads -// --- Hold DeleteSchedulerImpl::BackgroundEmptyTrash --- +// --- Hold DeleteScheduler::BackgroundEmptyTrash --- // 3- Wait for DeleteScheduler to delete all files in queue // 4- Verify that BackgroundEmptyTrash used to correct penlties for the files // 5- Make sure that all created files were completely deleted TEST_F(DeleteSchedulerTest, RateLimitingMultiThreaded) { rocksdb::SyncPoint::GetInstance()->LoadDependency({ {"DeleteSchedulerTest::RateLimitingMultiThreaded:1", - "DeleteSchedulerImpl::BackgroundEmptyTrash"}, + "DeleteScheduler::BackgroundEmptyTrash"}, }); std::vector penalties; rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DeleteSchedulerImpl::BackgroundEmptyTrash:Wait", + "DeleteScheduler::BackgroundEmptyTrash:Wait", [&](void* arg) { penalties.push_back(*(static_cast(arg))); }); int thread_cnt = 10; @@ -179,8 +184,7 @@ TEST_F(DeleteSchedulerTest, RateLimitingMultiThreaded) { DestroyAndCreateDir(dummy_files_dir_); rate_bytes_per_sec_ = delete_kbs_per_sec[t] * 1024; - delete_scheduler_.reset( - NewDeleteScheduler(env_, trash_dir_, rate_bytes_per_sec_)); + NewDeleteScheduler(); // Create 100 dummy files, every file is 1 Kb std::vector generated_files; @@ -239,12 +243,13 @@ TEST_F(DeleteSchedulerTest, RateLimitingMultiThreaded) { TEST_F(DeleteSchedulerTest, DisableRateLimiting) { int bg_delete_file = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DeleteSchedulerImpl::DeleteTrashFile:DeleteFile", + "DeleteScheduler::DeleteTrashFile:DeleteFile", [&](void* arg) { bg_delete_file++; }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); - delete_scheduler_.reset(NewDeleteScheduler(env_, "", 0)); + rate_bytes_per_sec_ = 0; + NewDeleteScheduler(); for (int i = 0; i < 10; i++) { // Every file we delete will be deleted immediately @@ -264,18 +269,17 @@ TEST_F(DeleteSchedulerTest, DisableRateLimiting) { // 1- Create 10 files with the same name 
"conflict.data" // 2- Delete the 10 files using DeleteScheduler // 3- Make sure that trash directory contain 10 files ("conflict.data" x 10) -// --- Hold DeleteSchedulerImpl::BackgroundEmptyTrash --- +// --- Hold DeleteScheduler::BackgroundEmptyTrash --- // 4- Make sure that files are deleted from trash TEST_F(DeleteSchedulerTest, ConflictNames) { rocksdb::SyncPoint::GetInstance()->LoadDependency({ {"DeleteSchedulerTest::ConflictNames:1", - "DeleteSchedulerImpl::BackgroundEmptyTrash"}, + "DeleteScheduler::BackgroundEmptyTrash"}, }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); rate_bytes_per_sec_ = 1024 * 1024; // 1 Mb/sec - delete_scheduler_.reset( - NewDeleteScheduler(env_, trash_dir_, rate_bytes_per_sec_)); + NewDeleteScheduler(); // Create "conflict.data" and move it to trash 10 times for (int i = 0; i < 10; i++) { @@ -300,19 +304,18 @@ TEST_F(DeleteSchedulerTest, ConflictNames) { // 1- Create 10 dummy files // 2- Delete the 10 files using DeleteScheduler (move them to trsah) // 3- Delete the 10 files directly (using env_->DeleteFile) -// --- Hold DeleteSchedulerImpl::BackgroundEmptyTrash --- +// --- Hold DeleteScheduler::BackgroundEmptyTrash --- // 4- Make sure that DeleteScheduler failed to delete the 10 files and // reported 10 background errors TEST_F(DeleteSchedulerTest, BackgroundError) { rocksdb::SyncPoint::GetInstance()->LoadDependency({ {"DeleteSchedulerTest::BackgroundError:1", - "DeleteSchedulerImpl::BackgroundEmptyTrash"}, + "DeleteScheduler::BackgroundEmptyTrash"}, }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); rate_bytes_per_sec_ = 1024 * 1024; // 1 Mb/sec - delete_scheduler_.reset( - NewDeleteScheduler(env_, trash_dir_, rate_bytes_per_sec_)); + NewDeleteScheduler(); // Generate 10 dummy files and move them to trash for (int i = 0; i < 10; i++) { @@ -339,32 +342,6 @@ TEST_F(DeleteSchedulerTest, BackgroundError) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } -// 1- Create 10 files in trash -// 2- Create a 
DeleteScheduler with delete_exisitng_trash = true -// 3- Wait for DeleteScheduler to delete all files in queue -// 4- Make sure that all files in trash directory were deleted -TEST_F(DeleteSchedulerTest, TrashWithExistingFiles) { - std::vector dummy_files; - for (int i = 0; i < 10; i++) { - std::string file_name = "data_" + ToString(i) + ".data"; - std::string trash_path = trash_dir_ + "/" + file_name; - env_->RenameFile(NewDummyFile(file_name), trash_path); - } - ASSERT_EQ(CountFilesInDir(trash_dir_), 10); - - Status s; - rate_bytes_per_sec_ = 1024 * 1024; // 1 Mb/sec - delete_scheduler_.reset(NewDeleteScheduler( - env_, trash_dir_, rate_bytes_per_sec_, nullptr, true, &s)); - ASSERT_OK(s); - - delete_scheduler_->WaitForEmptyTrash(); - ASSERT_EQ(CountFilesInDir(trash_dir_), 0); - - auto bg_errors = delete_scheduler_->GetBackgroundErrors(); - ASSERT_EQ(bg_errors.size(), 0); -} - // 1- Create 10 dummy files // 2- Delete 10 dummy files using DeleteScheduler // 3- Wait for DeleteScheduler to delete all files in queue @@ -373,13 +350,12 @@ TEST_F(DeleteSchedulerTest, TrashWithExistingFiles) { TEST_F(DeleteSchedulerTest, StartBGEmptyTrashMultipleTimes) { int bg_delete_file = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DeleteSchedulerImpl::DeleteTrashFile:DeleteFile", + "DeleteScheduler::DeleteTrashFile:DeleteFile", [&](void* arg) { bg_delete_file++; }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); rate_bytes_per_sec_ = 1024 * 1024; // 1 MB / sec - delete_scheduler_.reset( - NewDeleteScheduler(env_, trash_dir_, rate_bytes_per_sec_)); + NewDeleteScheduler(); // Move files to trash, wait for empty trash, start again for (int run = 1; run <= 5; run++) { @@ -409,13 +385,12 @@ TEST_F(DeleteSchedulerTest, StartBGEmptyTrashMultipleTimes) { TEST_F(DeleteSchedulerTest, DestructorWithNonEmptyQueue) { int bg_delete_file = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DeleteSchedulerImpl::DeleteTrashFile:DeleteFile", + 
"DeleteScheduler::DeleteTrashFile:DeleteFile", [&](void* arg) { bg_delete_file++; }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); rate_bytes_per_sec_ = 1; // 1 Byte / sec - delete_scheduler_.reset( - NewDeleteScheduler(env_, trash_dir_, rate_bytes_per_sec_)); + NewDeleteScheduler(); for (int i = 0; i < 100; i++) { std::string file_name = "data_" + ToString(i) + ".data"; @@ -439,13 +414,12 @@ TEST_F(DeleteSchedulerTest, DestructorWithNonEmptyQueue) { TEST_F(DeleteSchedulerTest, MoveToTrashError) { int bg_delete_file = 0; rocksdb::SyncPoint::GetInstance()->SetCallBack( - "DeleteSchedulerImpl::DeleteTrashFile:DeleteFile", + "DeleteScheduler::DeleteTrashFile:DeleteFile", [&](void* arg) { bg_delete_file++; }); rocksdb::SyncPoint::GetInstance()->EnableProcessing(); rate_bytes_per_sec_ = 1024; // 1 Kb / sec - delete_scheduler_.reset( - NewDeleteScheduler(env_, trash_dir_, rate_bytes_per_sec_)); + NewDeleteScheduler(); // We will delete the trash directory, that mean that DeleteScheduler wont // be able to move files to trash and will delete files them immediately. @@ -460,7 +434,6 @@ TEST_F(DeleteSchedulerTest, MoveToTrashError) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } - } // namespace rocksdb int main(int argc, char** argv) { diff --git a/util/dynamic_bloom.cc b/util/dynamic_bloom.cc index 4df81d527..bd54ed933 100644 --- a/util/dynamic_bloom.cc +++ b/util/dynamic_bloom.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/dynamic_bloom.h b/util/dynamic_bloom.h index 8d1b7b4af..909db54c8 100644 --- a/util/dynamic_bloom.h +++ b/util/dynamic_bloom.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. 
All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/dynamic_bloom_test.cc b/util/dynamic_bloom_test.cc index e7a730fcf..bad88a94b 100644 --- a/util/dynamic_bloom_test.cc +++ b/util/dynamic_bloom_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/env.cc b/util/env.cc index 968d300b8..38509c0f4 100644 --- a/util/env.cc +++ b/util/env.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -38,6 +38,32 @@ Status Env::ReuseWritableFile(const std::string& fname, return NewWritableFile(fname, result, options); } +Status Env::GetChildrenFileAttributes(const std::string& dir, + std::vector* result) { + assert(result != nullptr); + std::vector child_fnames; + Status s = GetChildren(dir, &child_fnames); + if (!s.ok()) { + return s; + } + result->resize(child_fnames.size()); + size_t result_size = 0; + for (size_t i = 0; i < child_fnames.size(); ++i) { + const std::string path = dir + "/" + child_fnames[i]; + if (!(s = GetFileSize(path, &(*result)[result_size].size_bytes)).ok()) { + if (FileExists(path).IsNotFound()) { + // The file may have been deleted since we listed the directory + continue; + } + return s; + } + (*result)[result_size].name = std::move(child_fnames[i]); + result_size++; + } + result->resize(result_size); + return Status::OK(); +} + SequentialFile::~SequentialFile() { } diff --git a/util/env_hdfs.cc b/util/env_hdfs.cc index 7e12c747a..e82895ee0 100644 --- a/util/env_hdfs.cc +++ b/util/env_hdfs.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/env_posix.cc b/util/env_posix.cc index 9d549b44d..06de7a486 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -46,6 +46,7 @@ #include "util/iostats_context_imp.h" #include "util/logging.h" #include "util/posix_logger.h" +#include "util/random.h" #include "util/string_util.h" #include "util/sync_point.h" #include "util/thread_local.h" diff --git a/util/env_test.cc b/util/env_test.cc index e5fa37099..ab4976df8 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -18,6 +18,7 @@ #include #ifdef OS_LINUX +#include #include #include #include @@ -26,7 +27,6 @@ #ifdef ROCKSDB_FALLOCATE_PRESENT #include -#include #endif #include "rocksdb/env.h" @@ -935,6 +935,42 @@ TEST_F(EnvPosixTest, Preallocation) { ASSERT_EQ(last_allocated_block, 7UL); } +// Test that the two ways to get children file attributes (in bulk or +// individually) behave consistently. 
+TEST_F(EnvPosixTest, ConsistentChildrenAttributes) { + const EnvOptions soptions; + const int kNumChildren = 10; + + std::string data; + for (int i = 0; i < kNumChildren; ++i) { + std::ostringstream oss; + oss << test::TmpDir() << "/testfile_" << i; + const std::string path = oss.str(); + unique_ptr file; + ASSERT_OK(env_->NewWritableFile(path, &file, soptions)); + file->Append(data); + data.append("test"); + } + + std::vector file_attrs; + ASSERT_OK(env_->GetChildrenFileAttributes(test::TmpDir(), &file_attrs)); + for (int i = 0; i < kNumChildren; ++i) { + std::ostringstream oss; + oss << "testfile_" << i; + const std::string name = oss.str(); + const std::string path = test::TmpDir() + "/" + name; + + auto file_attrs_iter = std::find_if( + file_attrs.begin(), file_attrs.end(), + [&name](const Env::FileAttributes& fm) { return fm.name == name; }); + ASSERT_TRUE(file_attrs_iter != file_attrs.end()); + uint64_t size; + ASSERT_OK(env_->GetFileSize(path, &size)); + ASSERT_EQ(size, 4 * i); + ASSERT_EQ(size, file_attrs_iter->size_bytes); + } +} + // Test that all WritableFileWrapper forwards all calls to WritableFile. TEST_F(EnvPosixTest, WritableFileWrapper) { class Base : public WritableFile { diff --git a/util/event_logger.cc b/util/event_logger.cc index 92a781c19..7e71b0cf9 100644 --- a/util/event_logger.cc +++ b/util/event_logger.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/event_logger.h b/util/event_logger.h index 53a40c255..f845ab6a5 100644 --- a/util/event_logger.h +++ b/util/event_logger.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. 
All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/event_logger_test.cc b/util/event_logger_test.cc index 1aad0acc2..807f64b1c 100644 --- a/util/event_logger_test.cc +++ b/util/event_logger_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index 6d548c449..b12263610 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index c10cde2ab..1a7b81bed 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/util/file_reader_writer_test.cc b/util/file_reader_writer_test.cc index 69b8cfea8..367de8b9e 100644 --- a/util/file_reader_writer_test.cc +++ b/util/file_reader_writer_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/file_util.cc b/util/file_util.cc index d4f7b4004..c14309da2 100644 --- a/util/file_util.cc +++ b/util/file_util.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -8,10 +8,9 @@ #include #include -#include "rocksdb/delete_scheduler.h" #include "rocksdb/env.h" #include "rocksdb/options.h" -#include "db/filename.h" +#include "util/sst_file_manager_impl.h" #include "util/file_reader_writer.h" namespace rocksdb { @@ -67,12 +66,31 @@ Status CopyFile(Env* env, const std::string& source, return Status::OK(); } -Status DeleteOrMoveToTrash(const DBOptions* db_options, - const std::string& fname) { - if (db_options->delete_scheduler == nullptr) { - return db_options->env->DeleteFile(fname); +// Utility function to create a file with the provided contents +Status CreateFile(Env* env, const std::string& destination, + const std::string& contents) { + const EnvOptions soptions; + Status s; + unique_ptr dest_writer; + + unique_ptr destfile; + s = env->NewWritableFile(destination, &destfile, soptions); + if (!s.ok()) { + return s; + } + dest_writer.reset(new WritableFileWriter(std::move(destfile), soptions)); + return dest_writer->Append(Slice(contents)); +} + +Status DeleteSSTFile(const DBOptions* db_options, const std::string& fname, + uint32_t path_id) { + // TODO(tec): support sst_file_manager for multiple path_ids + auto sfm = + static_cast(db_options->sst_file_manager.get()); + if (sfm && path_id == 0) { + return sfm->ScheduleFileDeletion(fname); } else { - return db_options->delete_scheduler->DeleteFile(fname); + return db_options->env->DeleteFile(fname); } } diff --git a/util/file_util.h b/util/file_util.h index f3e02fb0b..5b2320e33 100644 --- a/util/file_util.h +++ b/util/file_util.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -16,7 +16,10 @@ namespace rocksdb { extern Status CopyFile(Env* env, const std::string& source, const std::string& destination, uint64_t size = 0); -extern Status DeleteOrMoveToTrash(const DBOptions* db_options, - const std::string& fname); +extern Status CreateFile(Env* env, const std::string& destination, + const std::string& contents); + +extern Status DeleteSSTFile(const DBOptions* db_options, + const std::string& fname, uint32_t path_id); } // namespace rocksdb diff --git a/util/filelock_test.cc b/util/filelock_test.cc index 33362f8c7..d8c1172dc 100644 --- a/util/filelock_test.cc +++ b/util/filelock_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/filter_policy.cc b/util/filter_policy.cc index e950b75f7..8924982b4 100644 --- a/util/filter_policy.cc +++ b/util/filter_policy.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/hash.cc b/util/hash.cc index 427f0d138..dfd2dc403 100644 --- a/util/hash.cc +++ b/util/hash.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/hash.h b/util/hash.h index cab8d4677..5c90e63ec 100644 --- a/util/hash.h +++ b/util/hash.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/heap.h b/util/heap.h index 7d9e11113..9c25297f4 100644 --- a/util/heap.h +++ b/util/heap.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/heap_test.cc b/util/heap_test.cc index dd73e11a0..f2b902df0 100644 --- a/util/heap_test.cc +++ b/util/heap_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/histogram.cc b/util/histogram.cc index dea3808c6..1e63c39b3 100644 --- a/util/histogram.cc +++ b/util/histogram.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -7,11 +7,15 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. -#include "util/histogram.h" +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif +#include #include #include #include +#include "util/histogram.h" #include "port/port.h" namespace rocksdb { @@ -73,90 +77,126 @@ namespace { const HistogramBucketMapper bucketMapper; } -void HistogramImpl::Clear() { - min_ = static_cast(bucketMapper.LastValue()); - max_ = 0; - num_ = 0; - sum_ = 0; - sum_squares_ = 0; - memset(buckets_, 0, sizeof buckets_); +HistogramStat::HistogramStat() + : num_buckets_(bucketMapper.BucketCount()) { + assert(num_buckets_ == sizeof(buckets_) / sizeof(*buckets_)); + Clear(); } -bool HistogramImpl::Empty() { return num_ == 0; } +void HistogramStat::Clear() { + min_.store(bucketMapper.LastValue(), std::memory_order_relaxed); + max_.store(0, std::memory_order_relaxed); + num_.store(0, std::memory_order_relaxed); + sum_.store(0, std::memory_order_relaxed); + sum_squares_.store(0, std::memory_order_relaxed); + for (unsigned int b = 0; b < num_buckets_; b++) { + buckets_[b].store(0, std::memory_order_relaxed); + } +}; -void HistogramImpl::Add(uint64_t value) { +bool HistogramStat::Empty() const { return num() == 0; } + +void HistogramStat::Add(uint64_t value) { + // This function is designed to be lock free, as it's in the critical path + // of any operation. Each individual value is atomic and the order of updates + // by concurrent threads is tolerable. 
const size_t index = bucketMapper.IndexForValue(value); - buckets_[index] += 1; - if (min_ > value) min_ = static_cast(value); - if (max_ < value) max_ = static_cast(value); - num_++; - sum_ += value; - sum_squares_ += (value * value); + assert(index < num_buckets_); + buckets_[index].fetch_add(1, std::memory_order_relaxed); + + uint64_t old_min = min(); + while (value < old_min && !min_.compare_exchange_weak(old_min, value)) {} + + uint64_t old_max = max(); + while (value > old_max && !max_.compare_exchange_weak(old_max, value)) {} + + num_.fetch_add(1, std::memory_order_relaxed); + sum_.fetch_add(value, std::memory_order_relaxed); + sum_squares_.fetch_add(value * value, std::memory_order_relaxed); } -void HistogramImpl::Merge(const HistogramImpl& other) { - if (other.min_ < min_) min_ = other.min_; - if (other.max_ > max_) max_ = other.max_; - num_ += other.num_; - sum_ += other.sum_; - sum_squares_ += other.sum_squares_; - for (unsigned int b = 0; b < bucketMapper.BucketCount(); b++) { - buckets_[b] += other.buckets_[b]; +void HistogramStat::Merge(const HistogramStat& other) { + // This function needs to be performned with the outer lock acquired + // However, atomic operation on every member is still need, since Add() + // requires no lock and value update can still happen concurrently + uint64_t old_min = min(); + uint64_t other_min = other.min(); + while (other_min < old_min && + !min_.compare_exchange_weak(old_min, other_min)) {} + + uint64_t old_max = max(); + uint64_t other_max = other.max(); + while (other_max > old_max && + !max_.compare_exchange_weak(old_max, other_max)) {} + + num_.fetch_add(other.num(), std::memory_order_relaxed); + sum_.fetch_add(other.sum(), std::memory_order_relaxed); + sum_squares_.fetch_add(other.sum_squares(), std::memory_order_relaxed); + for (unsigned int b = 0; b < num_buckets_; b++) { + buckets_[b].fetch_add(other.bucket_at(b), std::memory_order_relaxed); } } -double HistogramImpl::Median() const { +double 
HistogramStat::Median() const { return Percentile(50.0); } -double HistogramImpl::Percentile(double p) const { - double threshold = num_ * (p / 100.0); - double sum = 0; - for (unsigned int b = 0; b < bucketMapper.BucketCount(); b++) { - sum += buckets_[b]; - if (sum >= threshold) { +double HistogramStat::Percentile(double p) const { + double threshold = num() * (p / 100.0); + uint64_t cumulative_sum = 0; + for (unsigned int b = 0; b < num_buckets_; b++) { + uint64_t bucket_value = bucket_at(b); + cumulative_sum += bucket_value; + if (cumulative_sum >= threshold) { // Scale linearly within this bucket - double left_point = - static_cast((b == 0) ? 0 : bucketMapper.BucketLimit(b-1)); - double right_point = - static_cast(bucketMapper.BucketLimit(b)); - double left_sum = sum - buckets_[b]; - double right_sum = sum; + uint64_t left_point = (b == 0) ? 0 : bucketMapper.BucketLimit(b-1); + uint64_t right_point = bucketMapper.BucketLimit(b); + uint64_t left_sum = cumulative_sum - bucket_value; + uint64_t right_sum = cumulative_sum; double pos = 0; - double right_left_diff = right_sum - left_sum; + uint64_t right_left_diff = right_sum - left_sum; if (right_left_diff != 0) { - pos = (threshold - left_sum) / (right_sum - left_sum); + pos = (threshold - left_sum) / right_left_diff; } double r = left_point + (right_point - left_point) * pos; - if (r < min_) r = min_; - if (r > max_) r = max_; + uint64_t cur_min = min(); + uint64_t cur_max = max(); + if (r < cur_min) r = static_cast(cur_min); + if (r > cur_max) r = static_cast(cur_max); return r; } } - return max_; + return static_cast(max()); } -double HistogramImpl::Average() const { - if (num_ == 0.0) return 0; - return sum_ / num_; +double HistogramStat::Average() const { + uint64_t cur_num = num(); + uint64_t cur_sum = sum(); + if (cur_num == 0) return 0; + return static_cast(cur_sum) / static_cast(cur_num); } -double HistogramImpl::StandardDeviation() const { - if (num_ == 0.0) return 0; - double variance = (sum_squares_ * 
num_ - sum_ * sum_) / (num_ * num_); +double HistogramStat::StandardDeviation() const { + uint64_t cur_num = num(); + uint64_t cur_sum = sum(); + uint64_t cur_sum_squares = sum_squares(); + if (cur_num == 0) return 0; + double variance = + static_cast(cur_sum_squares * cur_num - cur_sum * cur_sum) / + static_cast(cur_num * cur_num); return sqrt(variance); } - -std::string HistogramImpl::ToString() const { +std::string HistogramStat::ToString() const { + uint64_t cur_num = num(); std::string r; char buf[200]; snprintf(buf, sizeof(buf), - "Count: %.0f Average: %.4f StdDev: %.2f\n", - num_, Average(), StandardDeviation()); + "Count: %" PRIu64 " Average: %.4f StdDev: %.2f\n", + cur_num, Average(), StandardDeviation()); r.append(buf); snprintf(buf, sizeof(buf), - "Min: %.4f Median: %.4f Max: %.4f\n", - (num_ == 0.0 ? 0.0 : min_), Median(), max_); + "Min: %" PRIu64 " Median: %.4f Max: %" PRIu64 "\n", + (cur_num == 0 ? 0 : min()), Median(), (cur_num == 0 ? 0 : max())); r.append(buf); snprintf(buf, sizeof(buf), "Percentiles: " @@ -165,30 +205,30 @@ std::string HistogramImpl::ToString() const { Percentile(99.99)); r.append(buf); r.append("------------------------------------------------------\n"); - const double mult = 100.0 / num_; - double sum = 0; - for (unsigned int b = 0; b < bucketMapper.BucketCount(); b++) { - if (buckets_[b] <= 0.0) continue; - sum += buckets_[b]; + const double mult = 100.0 / cur_num; + uint64_t cumulative_sum = 0; + for (unsigned int b = 0; b < num_buckets_; b++) { + uint64_t bucket_value = bucket_at(b); + if (bucket_value <= 0.0) continue; + cumulative_sum += bucket_value; snprintf(buf, sizeof(buf), - "[ %7lu, %7lu ) %8lu %7.3f%% %7.3f%% ", - // left - (unsigned long)((b == 0) ? 
0 : bucketMapper.BucketLimit(b-1)), - (unsigned long)bucketMapper.BucketLimit(b), // right - (unsigned long)buckets_[b], // count - (mult * buckets_[b]), // percentage - (mult * sum)); // cumulative percentage + "[ %7" PRIu64 ", %7" PRIu64 " ) %8" PRIu64 " %7.3f%% %7.3f%% ", + (b == 0) ? 0 : bucketMapper.BucketLimit(b-1), // left + bucketMapper.BucketLimit(b), // right + bucket_value, // count + (mult * bucket_value), // percentage + (mult * cumulative_sum)); // cumulative percentage r.append(buf); // Add hash marks based on percentage; 20 marks for 100%. - int marks = static_cast(20*(buckets_[b] / num_) + 0.5); + size_t marks = static_cast(mult * bucket_value / 5 + 0.5); r.append(marks, '#'); r.push_back('\n'); } return r; } -void HistogramImpl::Data(HistogramData * const data) const { +void HistogramStat::Data(HistogramData * const data) const { assert(data); data->median = Median(); data->percentile95 = Percentile(95); @@ -197,4 +237,52 @@ void HistogramImpl::Data(HistogramData * const data) const { data->standard_deviation = StandardDeviation(); } +void HistogramImpl::Clear() { + std::lock_guard lock(mutex_); + stats_.Clear(); +} + +bool HistogramImpl::Empty() const { + return stats_.Empty(); +} + +void HistogramImpl::Add(uint64_t value) { + stats_.Add(value); +} + +void HistogramImpl::Merge(const Histogram& other) { + if (strcmp(Name(), other.Name()) == 0) { + Merge(dynamic_cast(other)); + } +} + +void HistogramImpl::Merge(const HistogramImpl& other) { + std::lock_guard lock(mutex_); + stats_.Merge(other.stats_); +} + +double HistogramImpl::Median() const { + return stats_.Median(); +} + +double HistogramImpl::Percentile(double p) const { + return stats_.Percentile(p); +} + +double HistogramImpl::Average() const { + return stats_.Average(); +} + +double HistogramImpl::StandardDeviation() const { + return stats_.StandardDeviation(); +} + +std::string HistogramImpl::ToString() const { + return stats_.ToString(); +} + +void HistogramImpl::Data(HistogramData * 
const data) const { + stats_.Data(data); +} + } // namespace levedb diff --git a/util/histogram.h b/util/histogram.h index 5f73bf3cd..84c3e94fe 100644 --- a/util/histogram.h +++ b/util/histogram.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -14,8 +14,7 @@ #include #include #include - -#include +#include namespace rocksdb { @@ -25,7 +24,7 @@ class HistogramBucketMapper { HistogramBucketMapper(); // converts a value to the bucket index. - size_t IndexForValue(const uint64_t value) const; + size_t IndexForValue(uint64_t value) const; // number of buckets required. size_t BucketCount() const { @@ -52,33 +51,99 @@ class HistogramBucketMapper { std::map valueIndexMap_; }; -class HistogramImpl { +struct HistogramStat { + HistogramStat(); + ~HistogramStat() {} + + HistogramStat(const HistogramStat&) = delete; + HistogramStat& operator=(const HistogramStat&) = delete; + + void Clear(); + bool Empty() const; + void Add(uint64_t value); + void Merge(const HistogramStat& other); + + inline uint64_t min() const { return min_.load(std::memory_order_relaxed); } + inline uint64_t max() const { return max_.load(std::memory_order_relaxed); } + inline uint64_t num() const { return num_.load(std::memory_order_relaxed); } + inline uint64_t sum() const { return sum_.load(std::memory_order_relaxed); } + inline uint64_t sum_squares() const { + return sum_squares_.load(std::memory_order_relaxed); + } + inline uint64_t bucket_at(size_t b) const { + return buckets_[b].load(std::memory_order_relaxed); + } + + double Median() const; + double Percentile(double p) const; + double Average() const; + double StandardDeviation() const; + void Data(HistogramData* const data) 
const; + std::string ToString() const; + + // To be able to use HistogramStat as thread local variable, it + // cannot have dynamic allocated member. That's why we're + // using manually values from BucketMapper + std::atomic_uint_fast64_t min_; + std::atomic_uint_fast64_t max_; + std::atomic_uint_fast64_t num_; + std::atomic_uint_fast64_t sum_; + std::atomic_uint_fast64_t sum_squares_; + std::atomic_uint_fast64_t buckets_[138]; // 138==BucketMapper::BucketCount() + const uint64_t num_buckets_; +}; + +class Histogram { +public: + Histogram() {} + virtual ~Histogram() {}; + + virtual void Clear() = 0; + virtual bool Empty() const = 0; + virtual void Add(uint64_t value) = 0; + virtual void Merge(const Histogram&) = 0; + + virtual std::string ToString() const = 0; + virtual const char* Name() const = 0; + virtual uint64_t min() const = 0; + virtual uint64_t max() const = 0; + virtual uint64_t num() const = 0; + virtual double Median() const = 0; + virtual double Percentile(double p) const = 0; + virtual double Average() const = 0; + virtual double StandardDeviation() const = 0; + virtual void Data(HistogramData* const data) const = 0; +}; + +class HistogramImpl : public Histogram { public: - HistogramImpl() { memset(buckets_, 0, sizeof(buckets_)); } - virtual void Clear(); - virtual bool Empty(); - virtual void Add(uint64_t value); - void Merge(const HistogramImpl& other); + HistogramImpl() { Clear(); } + + HistogramImpl(const HistogramImpl&) = delete; + HistogramImpl& operator=(const HistogramImpl&) = delete; - virtual std::string ToString() const; + virtual void Clear() override; + virtual bool Empty() const override; + virtual void Add(uint64_t value) override; + virtual void Merge(const Histogram& other) override; + void Merge(const HistogramImpl& other); - virtual double Median() const; - virtual double Percentile(double p) const; - virtual double Average() const; - virtual double StandardDeviation() const; - virtual void Data(HistogramData * const data) const; + 
virtual std::string ToString() const override; + virtual const char* Name() const override { return "HistogramImpl"; } + virtual uint64_t min() const override { return stats_.min(); } + virtual uint64_t max() const override { return stats_.max(); } + virtual uint64_t num() const override { return stats_.num(); } + virtual double Median() const override; + virtual double Percentile(double p) const override; + virtual double Average() const override; + virtual double StandardDeviation() const override; + virtual void Data(HistogramData* const data) const override; virtual ~HistogramImpl() {} private: - // To be able to use HistogramImpl as thread local variable, its constructor - // has to be static. That's why we're using manually values from BucketMapper - double min_ = 1000000000; // this is BucketMapper:LastValue() - double max_ = 0; - double num_ = 0; - double sum_ = 0; - double sum_squares_ = 0; - uint64_t buckets_[138]; // this is BucketMapper::BucketCount() + HistogramStat stats_; + std::mutex mutex_; }; -} // namespace rocksdb +} // namespace rocksdb \ No newline at end of file diff --git a/util/histogram_test.cc b/util/histogram_test.cc index 22ddb4b42..47d9c65ba 100644 --- a/util/histogram_test.cc +++ b/util/histogram_test.cc @@ -1,59 +1,207 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
// -#include "util/histogram.h" +#include +#include "util/histogram.h" +#include "util/histogram_windowing.h" #include "util/testharness.h" namespace rocksdb { class HistogramTest : public testing::Test {}; -TEST_F(HistogramTest, BasicOperation) { - HistogramImpl histogram; - for (uint64_t i = 1; i <= 100; i++) { - histogram.Add(i); - } +namespace { + const double kIota = 0.1; + const HistogramBucketMapper bucketMapper; + Env* env = Env::Default(); +} - { - double median = histogram.Median(); - // ASSERT_LE(median, 50); - ASSERT_GT(median, 0); +void PopulateHistogram(Histogram& histogram, + uint64_t low, uint64_t high, uint64_t loop = 1) { + for (; loop > 0; loop--) { + for (uint64_t i = low; i <= high; i++) { + histogram.Add(i); + } } +} - { - double percentile100 = histogram.Percentile(100.0); - ASSERT_LE(percentile100, 100.0); - ASSERT_GT(percentile100, 0.0); - double percentile99 = histogram.Percentile(99.0); - double percentile85 = histogram.Percentile(85.0); - ASSERT_LE(percentile99, 99.0); - ASSERT_TRUE(percentile99 >= percentile85); - } +void BasicOperation(Histogram& histogram) { + PopulateHistogram(histogram, 1, 100, 10); + + HistogramData data; + histogram.Data(&data); - ASSERT_EQ(histogram.Average(), 50.5); // avg is acurately calculated. + ASSERT_LE(fabs(histogram.Percentile(100.0) - 100.0), kIota); + ASSERT_LE(fabs(data.percentile99 - 99.0), kIota); + ASSERT_LE(fabs(data.percentile95 - 95.0), kIota); + ASSERT_LE(fabs(data.median - 50.0), kIota); + ASSERT_EQ(data.average, 50.5); // avg is acurately calculated. 
+ ASSERT_LT(fabs(data.standard_deviation- 28.86), kIota); //sd is ~= 28.86 } -TEST_F(HistogramTest, EmptyHistogram) { - HistogramImpl histogram; +void MergeHistogram(Histogram& histogram, Histogram& other) { + PopulateHistogram(histogram, 1, 100); + PopulateHistogram(other, 101, 200); + histogram.Merge(other); + + HistogramData data; + histogram.Data(&data); + + ASSERT_LE(fabs(histogram.Percentile(100.0) - 200.0), kIota); + ASSERT_LE(fabs(data.percentile99 - 198.0), kIota); + ASSERT_LE(fabs(data.percentile95 - 190.0), kIota); + ASSERT_LE(fabs(data.median - 100.0), kIota); + ASSERT_EQ(data.average, 100.5); // avg is acurately calculated. + ASSERT_LT(fabs(data.standard_deviation - 57.73), kIota); //sd is ~= 57.73 +} + +void EmptyHistogram(Histogram& histogram) { + ASSERT_EQ(histogram.min(), bucketMapper.LastValue()); + ASSERT_EQ(histogram.max(), 0); + ASSERT_EQ(histogram.num(), 0); ASSERT_EQ(histogram.Median(), 0.0); ASSERT_EQ(histogram.Percentile(85.0), 0.0); ASSERT_EQ(histogram.Average(), 0.0); + ASSERT_EQ(histogram.StandardDeviation(), 0.0); } -TEST_F(HistogramTest, ClearHistogram) { - HistogramImpl histogram; +void ClearHistogram(Histogram& histogram) { for (uint64_t i = 1; i <= 100; i++) { histogram.Add(i); } histogram.Clear(); + ASSERT_TRUE(histogram.Empty()); ASSERT_EQ(histogram.Median(), 0); ASSERT_EQ(histogram.Percentile(85.0), 0); ASSERT_EQ(histogram.Average(), 0); } +TEST_F(HistogramTest, BasicOperation) { + HistogramImpl histogram; + BasicOperation(histogram); + + HistogramWindowingImpl histogramWindowing; + BasicOperation(histogramWindowing); +} + +TEST_F(HistogramTest, MergeHistogram) { + HistogramImpl histogram; + HistogramImpl other; + MergeHistogram(histogram, other); + + HistogramWindowingImpl histogramWindowing; + HistogramWindowingImpl otherWindowing; + MergeHistogram(histogramWindowing, otherWindowing); +} + +TEST_F(HistogramTest, EmptyHistogram) { + HistogramImpl histogram; + EmptyHistogram(histogram); + + HistogramWindowingImpl 
histogramWindowing; + EmptyHistogram(histogramWindowing); +} + +TEST_F(HistogramTest, ClearHistogram) { + HistogramImpl histogram; + ClearHistogram(histogram); + + HistogramWindowingImpl histogramWindowing; + ClearHistogram(histogramWindowing); +} + +TEST_F(HistogramTest, HistogramWindowingExpire) { + uint64_t num_windows = 3; + int micros_per_window = 1000000; + uint64_t min_num_per_window = 0; + + HistogramWindowingImpl + histogramWindowing(num_windows, micros_per_window, min_num_per_window); + + PopulateHistogram(histogramWindowing, 1, 1, 100); + env->SleepForMicroseconds(micros_per_window); + ASSERT_EQ(histogramWindowing.num(), 100); + ASSERT_EQ(histogramWindowing.min(), 1); + ASSERT_EQ(histogramWindowing.max(), 1); + ASSERT_EQ(histogramWindowing.Average(), 1); + + PopulateHistogram(histogramWindowing, 2, 2, 100); + env->SleepForMicroseconds(micros_per_window); + ASSERT_EQ(histogramWindowing.num(), 200); + ASSERT_EQ(histogramWindowing.min(), 1); + ASSERT_EQ(histogramWindowing.max(), 2); + ASSERT_EQ(histogramWindowing.Average(), 1.5); + + PopulateHistogram(histogramWindowing, 3, 3, 100); + env->SleepForMicroseconds(micros_per_window); + ASSERT_EQ(histogramWindowing.num(), 300); + ASSERT_EQ(histogramWindowing.min(), 1); + ASSERT_EQ(histogramWindowing.max(), 3); + ASSERT_EQ(histogramWindowing.Average(), 2.0); + + // dropping oldest window with value 1, remaining 2 ~ 4 + PopulateHistogram(histogramWindowing, 4, 4, 100); + env->SleepForMicroseconds(micros_per_window); + ASSERT_EQ(histogramWindowing.num(), 300); + ASSERT_EQ(histogramWindowing.min(), 2); + ASSERT_EQ(histogramWindowing.max(), 4); + ASSERT_EQ(histogramWindowing.Average(), 3.0); + + // dropping oldest window with value 2, remaining 3 ~ 5 + PopulateHistogram(histogramWindowing, 5, 5, 100); + env->SleepForMicroseconds(micros_per_window); + ASSERT_EQ(histogramWindowing.num(), 300); + ASSERT_EQ(histogramWindowing.min(), 3); + ASSERT_EQ(histogramWindowing.max(), 5); + ASSERT_EQ(histogramWindowing.Average(), 
4.0); +} + +TEST_F(HistogramTest, HistogramWindowingMerge) { + uint64_t num_windows = 3; + int micros_per_window = 1000000; + uint64_t min_num_per_window = 0; + + HistogramWindowingImpl + histogramWindowing(num_windows, micros_per_window, min_num_per_window); + HistogramWindowingImpl + otherWindowing(num_windows, micros_per_window, min_num_per_window); + + PopulateHistogram(histogramWindowing, 1, 1, 100); + PopulateHistogram(otherWindowing, 1, 1, 100); + env->SleepForMicroseconds(micros_per_window); + + PopulateHistogram(histogramWindowing, 2, 2, 100); + PopulateHistogram(otherWindowing, 2, 2, 100); + env->SleepForMicroseconds(micros_per_window); + + PopulateHistogram(histogramWindowing, 3, 3, 100); + PopulateHistogram(otherWindowing, 3, 3, 100); + env->SleepForMicroseconds(micros_per_window); + + histogramWindowing.Merge(otherWindowing); + ASSERT_EQ(histogramWindowing.num(), 600); + ASSERT_EQ(histogramWindowing.min(), 1); + ASSERT_EQ(histogramWindowing.max(), 3); + ASSERT_EQ(histogramWindowing.Average(), 2.0); + + // dropping oldest window with value 1, remaining 2 ~ 4 + PopulateHistogram(histogramWindowing, 4, 4, 100); + env->SleepForMicroseconds(micros_per_window); + ASSERT_EQ(histogramWindowing.num(), 500); + ASSERT_EQ(histogramWindowing.min(), 2); + ASSERT_EQ(histogramWindowing.max(), 4); + + // dropping oldest window with value 2, remaining 3 ~ 5 + PopulateHistogram(histogramWindowing, 5, 5, 100); + env->SleepForMicroseconds(micros_per_window); + ASSERT_EQ(histogramWindowing.num(), 400); + ASSERT_EQ(histogramWindowing.min(), 3); + ASSERT_EQ(histogramWindowing.max(), 5); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/util/histogram_windowing.cc b/util/histogram_windowing.cc new file mode 100644 index 000000000..9d8f5429a --- /dev/null +++ b/util/histogram_windowing.cc @@ -0,0 +1,199 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/histogram.h" +#include "util/histogram_windowing.h" + +#include + +namespace rocksdb { + +namespace { + const HistogramBucketMapper bucketMapper; +} + +HistogramWindowingImpl::HistogramWindowingImpl() { + env_ = Env::Default(); + window_stats_.reset(new HistogramStat[num_windows_]); + Clear(); +} + +HistogramWindowingImpl::HistogramWindowingImpl( + uint64_t num_windows, + uint64_t micros_per_window, + uint64_t min_num_per_window) : + num_windows_(num_windows), + micros_per_window_(micros_per_window), + min_num_per_window_(min_num_per_window) { + env_ = Env::Default(); + window_stats_.reset(new HistogramStat[num_windows_]); + Clear(); +} + +HistogramWindowingImpl::~HistogramWindowingImpl(){ + window_stats_.release(); +} + +void HistogramWindowingImpl::Clear() { + std::lock_guard lock(mutex_); + + stats_.Clear(); + for (size_t i = 0; i < num_windows_; i++) { + window_stats_[i].Clear(); + } + current_window_.store(0, std::memory_order_relaxed); + last_swap_time_.store(env_->NowMicros(), std::memory_order_relaxed); +} + +bool HistogramWindowingImpl::Empty() const { return stats_.Empty(); } + +// This function is designed to be lock free, as it's in the critical path +// of any operation. +// Each individual value is atomic, it is just that some samples can go +// in the older bucket which is tolerable. 
+void HistogramWindowingImpl::Add(uint64_t value){ + TimerTick(); + + // Parent (global) member update + stats_.Add(value); + + // Current window update + window_stats_[current_window()].Add(value); +} + +void HistogramWindowingImpl::Merge(const Histogram& other) { + if (strcmp(Name(), other.Name()) == 0) { + Merge(dynamic_cast(other)); + } +} + +void HistogramWindowingImpl::Merge(const HistogramWindowingImpl& other) { + std::lock_guard lock(mutex_); + stats_.Merge(other.stats_); + + if (stats_.num_buckets_ != other.stats_.num_buckets_ || + micros_per_window_ != other.micros_per_window_) { + return; + } + + uint64_t cur_window = current_window(); + uint64_t other_cur_window = other.current_window(); + // going backwards for alignment + for (unsigned int i = 0; + i < std::min(num_windows_, other.num_windows_); i++) { + uint64_t window_index = + (cur_window + num_windows_ - i) % num_windows_; + uint64_t other_window_index = + (other_cur_window + other.num_windows_ - i) % other.num_windows_; + + window_stats_[window_index].Merge(other.window_stats_[other_window_index]); + } +} + +std::string HistogramWindowingImpl::ToString() const { + return stats_.ToString(); +} + +double HistogramWindowingImpl::Median() const { + return Percentile(50.0); +} + +double HistogramWindowingImpl::Percentile(double p) const { + // Retry 3 times in total + for (int retry = 0; retry < 3; retry++) { + uint64_t start_num = stats_.num(); + double result = stats_.Percentile(p); + // Detect if swap buckets or Clear() was called during calculation + if (stats_.num() >= start_num) { + return result; + } + } + return 0.0; +} + +double HistogramWindowingImpl::Average() const { + return stats_.Average(); +} + +double HistogramWindowingImpl::StandardDeviation() const { + return stats_.StandardDeviation(); +} + +void HistogramWindowingImpl::Data(HistogramData * const data) const { + stats_.Data(data); +} + +void HistogramWindowingImpl::TimerTick() { + uint64_t curr_time = env_->NowMicros(); + if 
(curr_time - last_swap_time() > micros_per_window_ && + window_stats_[current_window()].num() >= min_num_per_window_) { + SwapHistoryBucket(); + } +} + +void HistogramWindowingImpl::SwapHistoryBucket() { + // Threads executing Add() would be competing for this mutex, the first one + // who got the metex would take care of the bucket swap, other threads + // can skip this. + // If mutex is held by Merge() or Clear(), next Add() will take care of the + // swap, if needed. + if (mutex_.try_lock()) { + last_swap_time_.store(env_->NowMicros(), std::memory_order_relaxed); + + uint64_t curr_window = current_window(); + uint64_t next_window = (curr_window == num_windows_ - 1) ? + 0 : curr_window + 1; + + // subtract next buckets from totals and swap to next buckets + HistogramStat& stats_to_drop = window_stats_[next_window]; + + if (!stats_to_drop.Empty()) { + for (size_t b = 0; b < stats_.num_buckets_; b++){ + stats_.buckets_[b].fetch_sub( + stats_to_drop.bucket_at(b), std::memory_order_relaxed); + } + + if (stats_.min() == stats_to_drop.min()) { + uint64_t new_min = bucketMapper.LastValue(); + for (unsigned int i = 0; i < num_windows_; i++) { + if (i != next_window) { + uint64_t m = window_stats_[i].min(); + if (m < new_min) new_min = m; + } + } + stats_.min_.store(new_min, std::memory_order_relaxed); + } + + if (stats_.max() == stats_to_drop.max()) { + uint64_t new_max = 0; + for (unsigned int i = 0; i < num_windows_; i++) { + if (i != next_window) { + uint64_t m = window_stats_[i].max(); + if (m > new_max) new_max = m; + } + } + stats_.max_.store(new_max, std::memory_order_relaxed); + } + + stats_.num_.fetch_sub(stats_to_drop.num(), std::memory_order_relaxed); + stats_.sum_.fetch_sub(stats_to_drop.sum(), std::memory_order_relaxed); + stats_.sum_squares_.fetch_sub( + stats_to_drop.sum_squares(), std::memory_order_relaxed); + + stats_to_drop.Clear(); + } + + // advance to next window bucket + current_window_.store(next_window, std::memory_order_relaxed); + + 
mutex_.unlock(); + } +} + +} // namespace rocksdb diff --git a/util/histogram_windowing.h b/util/histogram_windowing.h new file mode 100644 index 000000000..cdcf1ba8a --- /dev/null +++ b/util/histogram_windowing.h @@ -0,0 +1,80 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include "util/histogram.h" +#include "rocksdb/env.h" + +namespace rocksdb { + +class HistogramWindowingImpl : public Histogram +{ +public: + HistogramWindowingImpl(); + HistogramWindowingImpl(uint64_t num_windows, + uint64_t micros_per_window, + uint64_t min_num_per_window); + + HistogramWindowingImpl(const HistogramImpl&) = delete; + HistogramWindowingImpl& operator=(const HistogramImpl&) = delete; + + ~HistogramWindowingImpl(); + + virtual void Clear() override; + virtual bool Empty() const override; + virtual void Add(uint64_t value) override; + virtual void Merge(const Histogram& other) override; + void Merge(const HistogramWindowingImpl& other); + + virtual std::string ToString() const override; + virtual const char* Name() const override { return "HistogramWindowingImpl"; } + virtual uint64_t min() const override { return stats_.min(); } + virtual uint64_t max() const override { return stats_.max(); } + virtual uint64_t num() const override { return stats_.num(); } + virtual double Median() const override; + virtual double Percentile(double p) const override; + virtual double Average() const override; + virtual double StandardDeviation() const override; + virtual void Data(HistogramData* const data) const 
override; + +private: + void TimerTick(); + void SwapHistoryBucket(); + inline uint64_t current_window() const { + return current_window_.load(std::memory_order_relaxed); + } + inline uint64_t last_swap_time() const{ + return last_swap_time_.load(std::memory_order_relaxed); + } + + Env* env_; + std::mutex mutex_; + + // Aggregated stats over windows_stats_, all the computation is done + // upon aggregated values + HistogramStat stats_; + + // This is a circular array representing the latest N time-windows. + // Each entry stores a time-window of data. Expiration is done + // on window-based. + std::unique_ptr window_stats_; + + std::atomic_uint_fast64_t current_window_; + std::atomic_uint_fast64_t last_swap_time_; + + // Following parameters are configuable + uint64_t num_windows_ = 5; + uint64_t micros_per_window_ = 60000000; + // By default, don't care about the number of values in current window + // when decide whether to swap windows or not. + uint64_t min_num_per_window_ = 0; +}; + +} // namespace rocksdb \ No newline at end of file diff --git a/util/instrumented_mutex.cc b/util/instrumented_mutex.cc index bfb989a1d..4eba27720 100644 --- a/util/instrumented_mutex.cc +++ b/util/instrumented_mutex.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -8,11 +8,18 @@ #include "util/thread_status_util.h" namespace rocksdb { +namespace { +bool ShouldReportToStats(Env* env, Statistics* stats) { + return env != nullptr && stats != nullptr && + stats->stats_level_ != kExceptTimeForMutex; +} +} // namespace + void InstrumentedMutex::Lock() { - PERF_CONDITIONAL_TIMER_GUARD(db_mutex_lock_nanos, - stats_code_ == DB_MUTEX_WAIT_MICROS); + PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(db_mutex_lock_nanos, + stats_code_ == DB_MUTEX_WAIT_MICROS); uint64_t wait_time_micros = 0; - if (env_ != nullptr && stats_ != nullptr) { + if (ShouldReportToStats(env_, stats_)) { { StopWatch sw(env_, nullptr, 0, &wait_time_micros); LockInternal(); @@ -31,10 +38,10 @@ void InstrumentedMutex::LockInternal() { } void InstrumentedCondVar::Wait() { - PERF_CONDITIONAL_TIMER_GUARD(db_condition_wait_nanos, - stats_code_ == DB_MUTEX_WAIT_MICROS); + PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(db_condition_wait_nanos, + stats_code_ == DB_MUTEX_WAIT_MICROS); uint64_t wait_time_micros = 0; - if (env_ != nullptr && stats_ != nullptr) { + if (ShouldReportToStats(env_, stats_)) { { StopWatch sw(env_, nullptr, 0, &wait_time_micros); WaitInternal(); @@ -53,11 +60,11 @@ void InstrumentedCondVar::WaitInternal() { } bool InstrumentedCondVar::TimedWait(uint64_t abs_time_us) { - PERF_CONDITIONAL_TIMER_GUARD(db_condition_wait_nanos, - stats_code_ == DB_MUTEX_WAIT_MICROS); + PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(db_condition_wait_nanos, + stats_code_ == DB_MUTEX_WAIT_MICROS); uint64_t wait_time_micros = 0; bool result = false; - if (env_ != nullptr && stats_ != nullptr) { + if (ShouldReportToStats(env_, stats_)) { { StopWatch sw(env_, nullptr, 0, &wait_time_micros); result = TimedWaitInternal(abs_time_us); diff --git a/util/instrumented_mutex.h b/util/instrumented_mutex.h index 3f233494a..45d553ae8 100644 --- a/util/instrumented_mutex.h +++ b/util/instrumented_mutex.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/io_posix.cc b/util/io_posix.cc index dd41e2a03..05a7f2788 100644 --- a/util/io_posix.cc +++ b/util/io_posix.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/io_posix.h b/util/io_posix.h index 2a45d10ff..39b7b8fdf 100644 --- a/util/io_posix.h +++ b/util/io_posix.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/iostats_context.cc b/util/iostats_context.cc index 50a6e8ab1..a3c72db96 100644 --- a/util/iostats_context.cc +++ b/util/iostats_context.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -31,14 +31,24 @@ void IOStatsContext::Reset() { logger_nanos = 0; } -#define OUTPUT(counter) #counter << " = " << counter << ", " +#define IOSTATS_CONTEXT_OUTPUT(counter) \ + if (!exclude_zero_counters || counter > 0) { \ + ss << #counter << " = " << counter << ", "; \ + } -std::string IOStatsContext::ToString() const { +std::string IOStatsContext::ToString(bool exclude_zero_counters) const { std::ostringstream ss; - ss << OUTPUT(thread_pool_id) << OUTPUT(bytes_read) << OUTPUT(bytes_written) - << OUTPUT(open_nanos) << OUTPUT(allocate_nanos) << OUTPUT(write_nanos) - << OUTPUT(read_nanos) << OUTPUT(range_sync_nanos) << OUTPUT(fsync_nanos) - << OUTPUT(prepare_write_nanos) << OUTPUT(logger_nanos); + IOSTATS_CONTEXT_OUTPUT(thread_pool_id); + IOSTATS_CONTEXT_OUTPUT(bytes_read); + IOSTATS_CONTEXT_OUTPUT(bytes_written); + IOSTATS_CONTEXT_OUTPUT(open_nanos); + IOSTATS_CONTEXT_OUTPUT(allocate_nanos); + IOSTATS_CONTEXT_OUTPUT(write_nanos); + IOSTATS_CONTEXT_OUTPUT(read_nanos); + IOSTATS_CONTEXT_OUTPUT(range_sync_nanos); + IOSTATS_CONTEXT_OUTPUT(fsync_nanos); + IOSTATS_CONTEXT_OUTPUT(prepare_write_nanos); + IOSTATS_CONTEXT_OUTPUT(logger_nanos); return ss.str(); } diff --git a/util/iostats_context_imp.h b/util/iostats_context_imp.h index 4617b4120..d0464ce0c 100644 --- a/util/iostats_context_imp.h +++ b/util/iostats_context_imp.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/iostats_context_test.cc b/util/iostats_context_test.cc new file mode 100644 index 000000000..a2884f8a6 --- /dev/null +++ b/util/iostats_context_test.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
+// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "rocksdb/iostats_context.h" +#include "util/testharness.h" + +namespace rocksdb { + +TEST(IOStatsContextTest, ToString) { + iostats_context.Reset(); + iostats_context.bytes_read = 12345; + + std::string zero_included = iostats_context.ToString(); + ASSERT_NE(std::string::npos, zero_included.find("= 0")); + ASSERT_NE(std::string::npos, zero_included.find("= 12345")); + + std::string zero_excluded = iostats_context.ToString(true); + ASSERT_EQ(std::string::npos, zero_excluded.find("= 0")); + ASSERT_NE(std::string::npos, zero_excluded.find("= 12345")); +} + +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/util/kv_map.h b/util/kv_map.h index 486db1918..ac3e96020 100644 --- a/util/kv_map.h +++ b/util/kv_map.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/log_buffer.cc b/util/log_buffer.cc index 7d15cf22e..e04f9f2ae 100644 --- a/util/log_buffer.cc +++ b/util/log_buffer.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/util/log_buffer.h b/util/log_buffer.h index bd842b731..daf8ba6f5 100644 --- a/util/log_buffer.h +++ b/util/log_buffer.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/log_write_bench.cc b/util/log_write_bench.cc index 16e7af7e2..d9b08762a 100644 --- a/util/log_write_bench.cc +++ b/util/log_write_bench.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/logging.cc b/util/logging.cc index 8917d099a..2e0881bb5 100644 --- a/util/logging.cc +++ b/util/logging.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/logging.h b/util/logging.h index 10801bb88..13aebb46d 100644 --- a/util/logging.h +++ b/util/logging.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/memenv_test.cc b/util/memenv_test.cc index 24190daba..2b872d266 100644 --- a/util/memenv_test.cc +++ b/util/memenv_test.cc @@ -4,7 +4,6 @@ #ifndef ROCKSDB_LITE -#include "db/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "util/testharness.h" @@ -183,58 +182,6 @@ TEST_F(MemEnvTest, LargeWrite) { delete [] scratch; } -TEST_F(MemEnvTest, DBTest) { - Options options; - options.create_if_missing = true; - options.env = env_; - DB* db; - - const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")}; - const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")}; - - ASSERT_OK(DB::Open(options, "/dir/db", &db)); - for (size_t i = 0; i < 3; ++i) { - ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i])); - } - - for (size_t i = 0; i < 3; ++i) { - std::string res; - ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); - ASSERT_TRUE(res == vals[i]); - } - - Iterator* iterator = db->NewIterator(ReadOptions()); - iterator->SeekToFirst(); - for (size_t i = 0; i < 3; ++i) { - ASSERT_TRUE(iterator->Valid()); - ASSERT_TRUE(keys[i] == iterator->key()); - ASSERT_TRUE(vals[i] == iterator->value()); - iterator->Next(); - } - ASSERT_TRUE(!iterator->Valid()); - delete iterator; - - DBImpl* dbi = reinterpret_cast(db); - ASSERT_OK(dbi->TEST_FlushMemTable()); - - for (size_t i = 0; i < 3; ++i) { - std::string res; - ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); - ASSERT_TRUE(res == vals[i]); - } - - delete db; - - options.create_if_missing = false; - ASSERT_OK(DB::Open(options, "/dir/db", &db)); - for (size_t i = 0; i < 3; ++i) { - std::string res; - ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); - ASSERT_TRUE(res == vals[i]); - } - delete db; -} - } // namespace rocksdb int main(int argc, char** argv) { diff --git a/util/mock_env.cc b/util/mock_env.cc index 409e16e3a..5001dfc39 100644 --- a/util/mock_env.cc +++ b/util/mock_env.cc @@ -1,4 +1,4 @@ 
-// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/mock_env.h b/util/mock_env.h index bcc74a731..d4bbdc8b2 100644 --- a/util/mock_env.h +++ b/util/mock_env.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/mock_env_test.cc b/util/mock_env_test.cc index 2f50c2a82..710881b55 100644 --- a/util/mock_env_test.cc +++ b/util/mock_env_test.cc @@ -6,7 +6,6 @@ #include #include #include "util/mock_env.h" -#include "db/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "util/testharness.h" @@ -221,52 +220,6 @@ TEST_F(MockEnvTest, Corrupt) { ASSERT_NE(result.compare(kCorrupted), 0); } -TEST_F(MockEnvTest, DBTest) { - Options options; - options.create_if_missing = true; - options.env = env_; - DB* db; - - const Slice keys[] = {Slice("aaa"), Slice("bbb"), Slice("ccc")}; - const Slice vals[] = {Slice("foo"), Slice("bar"), Slice("baz")}; - - ASSERT_OK(DB::Open(options, "/dir/db", &db)); - for (size_t i = 0; i < 3; ++i) { - ASSERT_OK(db->Put(WriteOptions(), keys[i], vals[i])); - } - - for (size_t i = 0; i < 3; ++i) { - std::string res; - ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); - ASSERT_TRUE(res == vals[i]); - } - - Iterator* iterator = db->NewIterator(ReadOptions()); - iterator->SeekToFirst(); - for (size_t i = 0; i < 3; ++i) { - ASSERT_TRUE(iterator->Valid()); - ASSERT_TRUE(keys[i] == 
iterator->key()); - ASSERT_TRUE(vals[i] == iterator->value()); - iterator->Next(); - } - ASSERT_TRUE(!iterator->Valid()); - delete iterator; - - // TEST_FlushMemTable() is not supported in ROCKSDB_LITE - #ifndef ROCKSDB_LITE - DBImpl* dbi = reinterpret_cast(db); - ASSERT_OK(dbi->TEST_FlushMemTable()); - - for (size_t i = 0; i < 3; ++i) { - std::string res; - ASSERT_OK(db->Get(ReadOptions(), keys[i], &res)); - ASSERT_TRUE(res == vals[i]); - } - #endif // ROCKSDB_LITE - - delete db; -} - TEST_F(MockEnvTest, FakeSleeping) { int64_t now = 0; auto s = env_->GetCurrentTime(&now); diff --git a/util/murmurhash.cc b/util/murmurhash.cc index d9d8b7061..9a01bf11b 100644 --- a/util/murmurhash.cc +++ b/util/murmurhash.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/murmurhash.h b/util/murmurhash.h index 40ee357a7..856fece9c 100644 --- a/util/murmurhash.h +++ b/util/murmurhash.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/mutable_cf_options.cc b/util/mutable_cf_options.cc index 582c0eadf..83eb2fafb 100644 --- a/util/mutable_cf_options.cc +++ b/util/mutable_cf_options.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/mutable_cf_options.h b/util/mutable_cf_options.h index 209aa3d51..dbae48e33 100644 --- a/util/mutable_cf_options.h +++ b/util/mutable_cf_options.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/mutexlock.h b/util/mutexlock.h index 63a0f5ce1..a2d14aedf 100644 --- a/util/mutexlock.h +++ b/util/mutexlock.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/options.cc b/util/options.cc index c925153fd..274fda8c5 100644 --- a/util/options.cc +++ b/util/options.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -17,12 +17,11 @@ #include #include -#include "db/writebuffer.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/comparator.h" -#include "rocksdb/delete_scheduler.h" #include "rocksdb/env.h" +#include "rocksdb/sst_file_manager.h" #include "rocksdb/memtablerep.h" #include "rocksdb/merge_operator.h" #include "rocksdb/slice.h" @@ -214,7 +213,7 @@ DBOptions::DBOptions() paranoid_checks(true), env(Env::Default()), rate_limiter(nullptr), - delete_scheduler(nullptr), + sst_file_manager(nullptr), info_log(nullptr), #ifdef NDEBUG info_log_level(INFO_LEVEL), @@ -230,6 +229,7 @@ DBOptions::DBOptions() db_log_dir(""), wal_dir(""), delete_obsolete_files_period_micros(6ULL * 60 * 60 * 1000000), + base_background_compactions(-1), max_background_compactions(1), max_subcompactions(1), max_background_flushes(1), @@ -282,7 +282,7 @@ DBOptions::DBOptions(const Options& options) paranoid_checks(options.paranoid_checks), env(options.env), rate_limiter(options.rate_limiter), - delete_scheduler(options.delete_scheduler), + sst_file_manager(options.sst_file_manager), info_log(options.info_log), info_log_level(options.info_log_level), max_open_files(options.max_open_files), @@ -296,6 +296,7 @@ DBOptions::DBOptions(const Options& options) wal_dir(options.wal_dir), delete_obsolete_files_period_micros( options.delete_obsolete_files_period_micros), + base_background_compactions(options.base_background_compactions), max_background_compactions(options.max_background_compactions), max_subcompactions(options.max_subcompactions), max_background_flushes(options.max_background_flushes), @@ -384,6 +385,8 @@ void DBOptions::Dump(Logger* log) const { table_cache_numshardbits); Header(log, " Options.delete_obsolete_files_period_micros: %" PRIu64, delete_obsolete_files_period_micros); + Header(log, " Options.base_background_compactions: %d", + base_background_compactions); Header(log, " Options.max_background_compactions: %d", max_background_compactions); Header(log, " 
Options.max_subcompactions: %" PRIu32, @@ -434,8 +437,9 @@ void DBOptions::Dump(Logger* log) const { use_adaptive_mutex); Header(log, " Options.rate_limiter: %p", rate_limiter.get()); - Header(log, " Options.delete_scheduler.rate_bytes_per_sec: %" PRIi64, - delete_scheduler ? delete_scheduler->GetRateBytesPerSecond() : 0); + Header( + log, " Options.sst_file_manager.rate_bytes_per_sec: %" PRIi64, + sst_file_manager ? sst_file_manager->GetDeleteRateBytesPerSecond() : 0); Header(log, " Options.bytes_per_sync: %" PRIu64, bytes_per_sync); Header(log, " Options.wal_bytes_per_sync: %" PRIu64, @@ -596,7 +600,7 @@ void ColumnFamilyOptions::Dump(Logger* log) const { Header(log, " Options.max_successive_merges: %" ROCKSDB_PRIszt, max_successive_merges); - Header(log, " Options.optimize_fllters_for_hits: %d", + Header(log, " Options.optimize_filters_for_hits: %d", optimize_filters_for_hits); Header(log, " Options.paranoid_file_checks: %d", paranoid_file_checks); @@ -652,6 +656,7 @@ Options::PrepareForBulkLoad() // to L1. This is helpful so that all files that are // input to the manual compaction are all at L0. max_background_compactions = 2; + base_background_compactions = 2; // The compaction would create large files in L1. target_file_size_base = 256 * 1024 * 1024; diff --git a/util/options_builder.cc b/util/options_builder.cc index 67fd268a9..89aeda5f5 100644 --- a/util/options_builder.cc +++ b/util/options_builder.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -26,7 +26,7 @@ CompactionStyle PickCompactionStyle(size_t write_buffer_size, // Otherwise, calculate a score based on threshold and expected value of // two styles, weighing reads 4X important than writes. int expected_levels = static_cast(ceil( - ::log(target_db_size / write_buffer_size) / ::log(kBytesForLevelMultiplier))); + std::log(target_db_size / write_buffer_size) / std::log(kBytesForLevelMultiplier))); int expected_max_files_universal = static_cast(ceil(log2(target_db_size / write_buffer_size))); @@ -117,8 +117,8 @@ void OptimizeForLevel(int read_amplification_threshold, int write_amplification_threshold, uint64_t target_db_size, Options* options) { int expected_levels_one_level0_file = - static_cast(ceil(::log(target_db_size / options->write_buffer_size) / - ::log(kBytesForLevelMultiplier))); + static_cast(ceil(std::log(target_db_size / options->write_buffer_size) / + std::log(kBytesForLevelMultiplier))); int level0_stop_writes_trigger = read_amplification_threshold - expected_levels_one_level0_file; diff --git a/util/options_helper.cc b/util/options_helper.cc index c397c2f56..679d0a1b4 100644 --- a/util/options_helper.cc +++ b/util/options_helper.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -361,6 +361,18 @@ bool ParseOptionHelper(char* opt_address, const OptionType& opt_type, return ParseEnum( encoding_type_string_map, value, reinterpret_cast(opt_address)); + case OptionType::kWALRecoveryMode: + return ParseEnum( + wal_recovery_mode_string_map, value, + reinterpret_cast(opt_address)); + case OptionType::kAccessHint: + return ParseEnum( + access_hint_string_map, value, + reinterpret_cast(opt_address)); + case OptionType::kInfoLogLevel: + return ParseEnum( + info_log_level_string_map, value, + reinterpret_cast(opt_address)); default: return false; } @@ -498,6 +510,18 @@ bool SerializeSingleOptionHelper(const char* opt_address, return SerializeEnum( encoding_type_string_map, *reinterpret_cast(opt_address), value); + case OptionType::kWALRecoveryMode: + return SerializeEnum( + wal_recovery_mode_string_map, + *reinterpret_cast(opt_address), value); + case OptionType::kAccessHint: + return SerializeEnum( + access_hint_string_map, + *reinterpret_cast(opt_address), value); + case OptionType::kInfoLogLevel: + return SerializeEnum( + info_log_level_string_map, + *reinterpret_cast(opt_address), value); default: return false; } @@ -805,6 +829,7 @@ Status ParseColumnFamilyOption(const std::string& name, } switch (opt_info.verification) { case OptionVerificationType::kByName: + case OptionVerificationType::kByNameAllowNull: return Status::NotSupported( "Deserializing the specified CF option " + name + " is not supported"); @@ -985,6 +1010,7 @@ Status ParseDBOption(const std::string& name, } switch (opt_info.verification) { case OptionVerificationType::kByName: + case OptionVerificationType::kByNameAllowNull: return Status::NotSupported( "Deserializing the specified DB option " + name + " is not supported"); @@ -1082,6 +1108,8 @@ Status GetBlockBasedTableOptionsFromMap( // the old API, where everything is // parsable. 
(iter->second.verification != OptionVerificationType::kByName && + iter->second.verification != + OptionVerificationType::kByNameAllowNull && iter->second.verification != OptionVerificationType::kDeprecated)) { return Status::InvalidArgument("Can't parse BlockBasedTableOptions:", o.first + " " + error_message); @@ -1116,10 +1144,12 @@ Status GetPlainTableOptionsFromMap( if (error_message != "") { const auto iter = plain_table_type_info.find(o.first); if (iter == plain_table_type_info.end() || - !input_strings_escaped ||// !input_strings_escaped indicates - // the old API, where everything is - // parsable. + !input_strings_escaped || // !input_strings_escaped indicates + // the old API, where everything is + // parsable. (iter->second.verification != OptionVerificationType::kByName && + iter->second.verification != + OptionVerificationType::kByNameAllowNull && iter->second.verification != OptionVerificationType::kDeprecated)) { return Status::InvalidArgument("Can't parse PlainTableOptions:", o.first + " " + error_message); diff --git a/util/options_helper.h b/util/options_helper.h index 84d547cfc..5c33e36ff 100644 --- a/util/options_helper.h +++ b/util/options_helper.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -93,18 +93,23 @@ enum class OptionType { kFlushBlockPolicyFactory, kChecksumType, kEncodingType, + kWALRecoveryMode, + kAccessHint, + kInfoLogLevel, kUnknown }; enum class OptionVerificationType { kNormal, - kByName, // The option is pointer typed so we can only verify - // based on it's name. - kDeprecated // The option is no longer used in rocksdb. 
The RocksDB - // OptionsParser will still accept this option if it - // happen to exists in some Options file. However, the - // parser will not include it in serialization and - // verification processes. + kByName, // The option is pointer typed so we can only verify + // based on it's name. + kByNameAllowNull, // Same as kByName, but it also allows the case + // where one of them is a nullptr. + kDeprecated // The option is no longer used in rocksdb. The RocksDB + // OptionsParser will still accept this option if it + // happen to exists in some Options file. However, the + // parser will not include it in serialization and + // verification processes. }; // A struct for storing constant option information such as option name, @@ -141,10 +146,7 @@ Status GetColumnFamilyOptionsFromMapInternal( static std::unordered_map db_options_type_info = { /* // not yet supported - AccessHint access_hint_on_compaction_start; Env* env; - InfoLogLevel info_log_level; - WALRecoveryMode wal_recovery_mode; std::shared_ptr row_cache; std::shared_ptr delete_scheduler; std::shared_ptr info_log; @@ -208,7 +210,7 @@ static std::unordered_map db_options_type_info = { {offsetof(struct DBOptions, random_access_max_buffer_size), OptionType::kSizeT, OptionVerificationType::kNormal}}, {"writable_file_max_buffer_size", - {offsetof(struct DBOptions, writable_file_max_buffer_size), + {offsetof(struct DBOptions, writable_file_max_buffer_size), OptionType::kSizeT, OptionVerificationType::kNormal}}, {"use_adaptive_mutex", {offsetof(struct DBOptions, use_adaptive_mutex), OptionType::kBoolean, @@ -219,6 +221,9 @@ static std::unordered_map db_options_type_info = { {"max_background_compactions", {offsetof(struct DBOptions, max_background_compactions), OptionType::kInt, OptionVerificationType::kNormal}}, + {"base_background_compactions", + {offsetof(struct DBOptions, base_background_compactions), OptionType::kInt, + OptionVerificationType::kNormal}}, {"max_background_flushes", {offsetof(struct 
DBOptions, max_background_flushes), OptionType::kInt, OptionVerificationType::kNormal}}, @@ -284,6 +289,30 @@ static std::unordered_map db_options_type_info = { OptionVerificationType::kNormal}}, {"stats_dump_period_sec", {offsetof(struct DBOptions, stats_dump_period_sec), OptionType::kUInt, + OptionVerificationType::kNormal}}, + {"fail_if_options_file_error", + {offsetof(struct DBOptions, fail_if_options_file_error), + OptionType::kBoolean, OptionVerificationType::kNormal}}, + {"allow_concurrent_memtable_write", + {offsetof(struct DBOptions, allow_concurrent_memtable_write), + OptionType::kBoolean, OptionVerificationType::kNormal}}, + {"wal_recovery_mode", + {offsetof(struct DBOptions, wal_recovery_mode), + OptionType::kWALRecoveryMode, OptionVerificationType::kNormal}}, + {"enable_write_thread_adaptive_yield", + {offsetof(struct DBOptions, enable_write_thread_adaptive_yield), + OptionType::kBoolean, OptionVerificationType::kNormal}}, + {"write_thread_slow_yield_usec", + {offsetof(struct DBOptions, write_thread_slow_yield_usec), + OptionType::kUInt64T, OptionVerificationType::kNormal}}, + {"write_thread_max_yield_usec", + {offsetof(struct DBOptions, write_thread_max_yield_usec), + OptionType::kUInt64T, OptionVerificationType::kNormal}}, + {"access_hint_on_compaction_start", + {offsetof(struct DBOptions, access_hint_on_compaction_start), + OptionType::kAccessHint, OptionVerificationType::kNormal}}, + {"info_log_level", + {offsetof(struct DBOptions, info_log_level), OptionType::kInfoLogLevel, OptionVerificationType::kNormal}}}; static std::unordered_map cf_options_type_info = { @@ -430,7 +459,7 @@ static std::unordered_map cf_options_type_info = { OptionVerificationType::kByName}}, {"prefix_extractor", {offsetof(struct ColumnFamilyOptions, prefix_extractor), - OptionType::kSliceTransform, OptionVerificationType::kByName}}, + OptionType::kSliceTransform, OptionVerificationType::kByNameAllowNull}}, {"memtable_factory", {offsetof(struct ColumnFamilyOptions, 
memtable_factory), OptionType::kMemTableRepFactory, OptionVerificationType::kByName}}, @@ -462,6 +491,10 @@ static std::unordered_map {"kCompactionStyleFIFO", kCompactionStyleFIFO}, {"kCompactionStyleNone", kCompactionStyleNone}}; +static std::unordered_map wal_recovery_mode_string_map = { + {"kTolerateCorruptedTailRecords", + WALRecoveryMode::kTolerateCorruptedTailRecords}, + {"kAbsoluteConsistency", WALRecoveryMode::kAbsoluteConsistency}, + {"kPointInTimeRecovery", WALRecoveryMode::kPointInTimeRecovery}, + {"kSkipAnyCorruptedRecords", WALRecoveryMode::kSkipAnyCorruptedRecords}}; + +static std::unordered_map + access_hint_string_map = {{"NONE", DBOptions::AccessHint::NONE}, + {"NORMAL", DBOptions::AccessHint::NORMAL}, + {"SEQUENTIAL", DBOptions::AccessHint::SEQUENTIAL}, + {"WILLNEED", DBOptions::AccessHint::WILLNEED}}; + +static std::unordered_map info_log_level_string_map = + {{"DEBUG_LEVEL", InfoLogLevel::DEBUG_LEVEL}, + {"INFO_LEVEL", InfoLogLevel::INFO_LEVEL}, + {"WARN_LEVEL", InfoLogLevel::WARN_LEVEL}, + {"ERROR_LEVEL", InfoLogLevel::ERROR_LEVEL}, + {"FATAL_LEVEL", InfoLogLevel::FATAL_LEVEL}, + {"HEADER_LEVEL", InfoLogLevel::HEADER_LEVEL}}; + } // namespace rocksdb #endif // !ROCKSDB_LITE diff --git a/util/options_parser.cc b/util/options_parser.cc index 20ae51e8b..0c368c646 100644 --- a/util/options_parser.cc +++ b/util/options_parser.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -510,6 +510,7 @@ bool AreEqualOptions( const std::unordered_map* opt_map) { const char* offset1 = opt1 + type_info.offset; const char* offset2 = opt2 + type_info.offset; + static const std::string kNullptrString = "nullptr"; switch (type_info.type) { case OptionType::kBoolean: return (*reinterpret_cast(offset1) == @@ -556,8 +557,18 @@ bool AreEqualOptions( *reinterpret_cast( offset1) == *reinterpret_cast(offset2)); + case OptionType::kWALRecoveryMode: + return (*reinterpret_cast(offset1) == + *reinterpret_cast(offset2)); + case OptionType::kAccessHint: + return (*reinterpret_cast(offset1) == + *reinterpret_cast(offset2)); + case OptionType::kInfoLogLevel: + return (*reinterpret_cast(offset1) == + *reinterpret_cast(offset2)); default: - if (type_info.verification == OptionVerificationType::kByName) { + if (type_info.verification == OptionVerificationType::kByName || + type_info.verification == OptionVerificationType::kByNameAllowNull) { std::string value1; bool result = SerializeSingleOptionHelper(offset1, type_info.type, &value1); @@ -571,6 +582,12 @@ bool AreEqualOptions( if (iter == opt_map->end()) { return true; } else { + if (type_info.verification == + OptionVerificationType::kByNameAllowNull) { + if (iter->second == kNullptrString || value1 == kNullptrString) { + return true; + } + } return (value1 == iter->second); } } diff --git a/util/options_parser.h b/util/options_parser.h index 94e69cc2a..0c96df83e 100644 --- a/util/options_parser.h +++ b/util/options_parser.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/util/options_sanity_check.cc b/util/options_sanity_check.cc index a84031bf9..1294a6ecc 100644 --- a/util/options_sanity_check.cc +++ b/util/options_sanity_check.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/options_sanity_check.h b/util/options_sanity_check.h index 6f18a58c8..bfadbdaf2 100644 --- a/util/options_sanity_check.h +++ b/util/options_sanity_check.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/options_test.cc b/util/options_test.cc index e4b572533..8940f8c6c 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -12,6 +12,7 @@ #endif #include +#include #include #include @@ -1307,10 +1308,10 @@ TEST_F(OptionsSanityCheckTest, SanityCheck) { // prefix_extractor { - // change the prefix extractor and expect only pass when - // sanity-level == kSanityLevelNone + // Okay to change prefix_extractor form nullptr to non-nullptr + ASSERT_EQ(opts.prefix_extractor.get(), nullptr); opts.prefix_extractor.reset(NewCappedPrefixTransform(10)); - ASSERT_NOK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); // persist the change @@ -1337,11 +1338,21 @@ TEST_F(OptionsSanityCheckTest, SanityCheck) { // expect pass only in kSanityLevelNone ASSERT_NOK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); + + // Change prefix extractor from non-nullptr to nullptr + opts.prefix_extractor.reset(); + // expect pass as it's safe to change prefix_extractor + // from non-null to null + ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); + ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelNone)); } + // persist the change + ASSERT_OK(PersistCFOptions(opts)); + ASSERT_OK(SanityCheckCFOptions(opts, kSanityLevelExactMatch)); // table_factory { - for (int tb = 2; tb >= 0; --tb) { + for (int tb = 0; tb <= 2; ++tb) { // change the table factory opts.table_factory.reset(test::RandomTableFactory(&rnd, tb)); ASSERT_NOK(SanityCheckCFOptions(opts, kSanityLevelLooselyCompatible)); @@ -1474,6 +1485,392 @@ TEST_F(OptionsParserTest, EscapeOptionString) { "Escape \\# and"); } +// Only run the tests to verify new fields in options are settable through +// string on limited platforms as it depends on behavior of compilers. 
+#ifdef OS_LINUX +#ifndef __clang__ +const char kSpecialChar = 'R'; +typedef std::vector> OffsetGap; + +void FillWithSpecialChar(char* start_ptr, size_t total_size, + const OffsetGap& blacklist) { + size_t offset = 0; + for (auto& pair : blacklist) { + std::memset(start_ptr + offset, kSpecialChar, pair.first - offset); + offset = pair.first + pair.second; + } + std::memset(start_ptr + offset, kSpecialChar, total_size - offset); +} + +int NumUnsetBytes(char* start_ptr, size_t total_size, + const OffsetGap& blacklist) { + int total_unset_bytes_base = 0; + size_t offset = 0; + for (auto& pair : blacklist) { + for (char* ptr = start_ptr + offset; ptr < start_ptr + pair.first; ptr++) { + if (*ptr == kSpecialChar) { + total_unset_bytes_base++; + } + offset = pair.first + pair.second; + } + } + for (char* ptr = start_ptr + offset; ptr < start_ptr + total_size; ptr++) { + if (*ptr == kSpecialChar) { + total_unset_bytes_base++; + } + } + return total_unset_bytes_base; +} + +// If the test fails, likely a new option is added to BlockBasedTableOptions +// but it cannot be set through GetBlockBasedTableOptionsFromString(), or the +// test is not updated accordingly. +// After adding an option, we need to make sure it is settable by +// GetBlockBasedTableOptionsFromString() and add the option to the input string +// passed to the GetBlockBasedTableOptionsFromString() in this test. +// If it is a complicated type, you also need to add the field to +// kBbtoBlacklist, and maybe add customized verification for it. +TEST_F(OptionsParserTest, BlockBasedTableOptionsAllFieldsSettable) { + // Items in the form of . Need to be in ascending order + // and not overlapping. Need to updated if new pointer-option is added. 
+ const OffsetGap kBbtoBlacklist = { + {offsetof(struct BlockBasedTableOptions, flush_block_policy_factory), + sizeof(std::shared_ptr)}, + {offsetof(struct BlockBasedTableOptions, block_cache), + sizeof(std::shared_ptr)}, + {offsetof(struct BlockBasedTableOptions, block_cache_compressed), + sizeof(std::shared_ptr)}, + {offsetof(struct BlockBasedTableOptions, filter_policy), + sizeof(std::shared_ptr)}, + }; + + // In this test, we catch a new option of BlockBasedTableOptions that is not + // settable through GetBlockBasedTableOptionsFromString(). + // We count padding bytes of the option struct, and assert it to be the same + // as unset bytes of an option struct initialized by + // GetBlockBasedTableOptionsFromString(). + + char* bbto_ptr = new char[sizeof(BlockBasedTableOptions)]; + + // Count padding bytes by setting all bytes in the memory to a special char, + // copy a well constructed struct to this memory and see how many special + // bytes left. + BlockBasedTableOptions* bbto = new (bbto_ptr) BlockBasedTableOptions(); + FillWithSpecialChar(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoBlacklist); + // It based on the behavior of compiler that padding bytes are not changed + // when copying the struct. It's prone to failure when compiler behavior + // changes. We verify there is unset bytes to detect the case. + *bbto = BlockBasedTableOptions(); + int unset_bytes_base = + NumUnsetBytes(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoBlacklist); + ASSERT_GT(unset_bytes_base, 0); + bbto->~BlockBasedTableOptions(); + + // Construct the base option passed into + // GetBlockBasedTableOptionsFromString(). 
+ bbto = new (bbto_ptr) BlockBasedTableOptions(); + FillWithSpecialChar(bbto_ptr, sizeof(BlockBasedTableOptions), kBbtoBlacklist); + // This option is not setable: + bbto->use_delta_encoding = true; + + char* new_bbto_ptr = new char[sizeof(BlockBasedTableOptions)]; + BlockBasedTableOptions* new_bbto = + new (new_bbto_ptr) BlockBasedTableOptions(); + FillWithSpecialChar(new_bbto_ptr, sizeof(BlockBasedTableOptions), + kBbtoBlacklist); + + // Need to update the option string if a new option is added. + ASSERT_OK(GetBlockBasedTableOptionsFromString( + *bbto, + "cache_index_and_filter_blocks=1;" + "pin_l0_filter_and_index_blocks_in_cache=1;" + "index_type=kHashSearch;" + "checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;" + "block_cache=1M;block_cache_compressed=1k;block_size=1024;" + "block_size_deviation=8;block_restart_interval=4; " + "index_block_restart_interval=4;" + "filter_policy=bloomfilter:4:true;whole_key_filtering=1;" + "skip_table_builder_flush=1;format_version=1;" + "hash_index_allow_collision=false;", + new_bbto)); + + ASSERT_EQ(unset_bytes_base, + NumUnsetBytes(new_bbto_ptr, sizeof(BlockBasedTableOptions), + kBbtoBlacklist)); + + ASSERT_TRUE(new_bbto->block_cache.get() != nullptr); + ASSERT_TRUE(new_bbto->block_cache_compressed.get() != nullptr); + ASSERT_TRUE(new_bbto->filter_policy.get() != nullptr); + + bbto->~BlockBasedTableOptions(); + new_bbto->~BlockBasedTableOptions(); + + delete[] bbto_ptr; + delete[] new_bbto_ptr; +} + +// If the test fails, likely a new option is added to DBOptions +// but it cannot be set through GetDBOptionsFromString(), or the test is not +// updated accordingly. +// After adding an option, we need to make sure it is settable by +// GetDBOptionsFromString() and add the option to the input string passed to +// DBOptionsFromString()in this test. +// If it is a complicated type, you also need to add the field to +// kDBOptionsBlacklist, and maybe add customized verification for it. 
+TEST_F(OptionsParserTest, DBOptionsAllFieldsSettable) { + const OffsetGap kDBOptionsBlacklist = { + {offsetof(struct DBOptions, env), sizeof(Env*)}, + {offsetof(struct DBOptions, rate_limiter), + sizeof(std::shared_ptr)}, + {offsetof(struct DBOptions, sst_file_manager), + sizeof(std::shared_ptr)}, + {offsetof(struct DBOptions, info_log), sizeof(std::shared_ptr)}, + {offsetof(struct DBOptions, statistics), + sizeof(std::shared_ptr)}, + {offsetof(struct DBOptions, db_paths), sizeof(std::vector)}, + {offsetof(struct DBOptions, db_log_dir), sizeof(std::string)}, + {offsetof(struct DBOptions, wal_dir), sizeof(std::string)}, + {offsetof(struct DBOptions, listeners), + sizeof(std::vector>)}, + {offsetof(struct DBOptions, row_cache), sizeof(std::shared_ptr)}, + {offsetof(struct DBOptions, wal_filter), sizeof(const WalFilter*)}, + }; + + char* options_ptr = new char[sizeof(DBOptions)]; + + // Count padding bytes by setting all bytes in the memory to a special char, + // copy a well constructed struct to this memory and see how many special + // bytes left. + DBOptions* options = new (options_ptr) DBOptions(); + FillWithSpecialChar(options_ptr, sizeof(DBOptions), kDBOptionsBlacklist); + // It based on the behavior of compiler that padding bytes are not changed + // when copying the struct. It's prone to failure when compiler behavior + // changes. We verify there is unset bytes to detect the case. + *options = DBOptions(); + int unset_bytes_base = + NumUnsetBytes(options_ptr, sizeof(DBOptions), kDBOptionsBlacklist); + ASSERT_GT(unset_bytes_base, 0); + options->~DBOptions(); + + options = new (options_ptr) DBOptions(); + FillWithSpecialChar(options_ptr, sizeof(DBOptions), kDBOptionsBlacklist); + + char* new_options_ptr = new char[sizeof(DBOptions)]; + DBOptions* new_options = new (new_options_ptr) DBOptions(); + FillWithSpecialChar(new_options_ptr, sizeof(DBOptions), kDBOptionsBlacklist); + + // Need to update the option string if a new option is added. 
+ ASSERT_OK( + GetDBOptionsFromString(*options, + "wal_bytes_per_sync=4295048118;" + "delete_obsolete_files_period_micros=4294967758;" + "WAL_ttl_seconds=4295008036;" + "WAL_size_limit_MB=4295036161;" + "wal_dir=path/to/wal_dir;" + "db_write_buffer_size=2587;" + "max_subcompactions=64330;" + "table_cache_numshardbits=28;" + "max_open_files=72;" + "max_file_opening_threads=35;" + "base_background_compactions=3;" + "max_background_compactions=33;" + "use_fsync=true;" + "use_adaptive_mutex=false;" + "max_total_wal_size=4295005604;" + "compaction_readahead_size=0;" + "new_table_reader_for_compaction_inputs=false;" + "keep_log_file_num=4890;" + "skip_stats_update_on_db_open=false;" + "max_manifest_file_size=4295009941;" + "db_log_dir=path/to/db_log_dir;" + "skip_log_error_on_recovery=true;" + "writable_file_max_buffer_size=1048576;" + "paranoid_checks=true;" + "is_fd_close_on_exec=false;" + "bytes_per_sync=4295013613;" + "enable_thread_tracking=false;" + "disable_data_sync=false;" + "recycle_log_file_num=0;" + "disableDataSync=false;" + "create_missing_column_families=true;" + "log_file_time_to_roll=3097;" + "max_background_flushes=35;" + "create_if_missing=false;" + "error_if_exists=true;" + "allow_os_buffer=false;" + "delayed_write_rate=4294976214;" + "manifest_preallocation_size=1222;" + "allow_mmap_writes=false;" + "stats_dump_period_sec=70127;" + "allow_fallocate=true;" + "allow_mmap_reads=false;" + "max_log_file_size=4607;" + "random_access_max_buffer_size=1048576;" + "advise_random_on_open=true;" + "fail_if_options_file_error=false;" + "allow_concurrent_memtable_write=true;" + "wal_recovery_mode=kPointInTimeRecovery;" + "enable_write_thread_adaptive_yield=true;" + "write_thread_slow_yield_usec=5;" + "write_thread_max_yield_usec=1000;" + "access_hint_on_compaction_start=NONE;" + "info_log_level=DEBUG_LEVEL;", + new_options)); + + ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions), + kDBOptionsBlacklist)); + + options->~DBOptions(); + 
new_options->~DBOptions(); + + delete[] options_ptr; + delete[] new_options_ptr; +} + +// If the test fails, likely a new option is added to ColumnFamilyOptions +// but it cannot be set through GetColumnFamilyOptionsFromString(), or the +// test is not updated accordingly. +// After adding an option, we need to make sure it is settable by +// GetColumnFamilyOptionsFromString() and add the option to the input +// string passed to GetColumnFamilyOptionsFromString()in this test. +// If it is a complicated type, you also need to add the field to +// kColumnFamilyOptionsBlacklist, and maybe add customized verification +// for it. +TEST_F(OptionsParserTest, ColumnFamilyOptionsAllFieldsSettable) { + const OffsetGap kColumnFamilyOptionsBlacklist = { + {offsetof(struct ColumnFamilyOptions, comparator), sizeof(Comparator*)}, + {offsetof(struct ColumnFamilyOptions, merge_operator), + sizeof(std::shared_ptr)}, + {offsetof(struct ColumnFamilyOptions, compaction_filter), + sizeof(const CompactionFilter*)}, + {offsetof(struct ColumnFamilyOptions, compaction_filter_factory), + sizeof(std::shared_ptr)}, + {offsetof(struct ColumnFamilyOptions, compression_per_level), + sizeof(std::vector)}, + {offsetof(struct ColumnFamilyOptions, prefix_extractor), + sizeof(std::shared_ptr)}, + {offsetof(struct ColumnFamilyOptions, + max_bytes_for_level_multiplier_additional), + sizeof(std::vector)}, + {offsetof(struct ColumnFamilyOptions, memtable_factory), + sizeof(std::shared_ptr)}, + {offsetof(struct ColumnFamilyOptions, table_factory), + sizeof(std::shared_ptr)}, + {offsetof(struct ColumnFamilyOptions, + table_properties_collector_factories), + sizeof(ColumnFamilyOptions::TablePropertiesCollectorFactories)}, + {offsetof(struct ColumnFamilyOptions, inplace_callback), + sizeof(UpdateStatus (*)(char*, uint32_t*, Slice, std::string*))}, + }; + + char* options_ptr = new char[sizeof(ColumnFamilyOptions)]; + + // Count padding bytes by setting all bytes in the memory to a special char, + // copy a 
well constructed struct to this memory and see how many special + // bytes left. + ColumnFamilyOptions* options = new (options_ptr) ColumnFamilyOptions(); + FillWithSpecialChar(options_ptr, sizeof(ColumnFamilyOptions), + kColumnFamilyOptionsBlacklist); + // It based on the behavior of compiler that padding bytes are not changed + // when copying the struct. It's prone to failure when compiler behavior + // changes. We verify there is unset bytes to detect the case. + *options = ColumnFamilyOptions(); + + // Deprecatd option which is not initialized. Need to set it to avoid + // Valgrind error + options->max_mem_compaction_level = 0; + + int unset_bytes_base = NumUnsetBytes(options_ptr, sizeof(ColumnFamilyOptions), + kColumnFamilyOptionsBlacklist); + ASSERT_GT(unset_bytes_base, 0); + options->~ColumnFamilyOptions(); + + options = new (options_ptr) ColumnFamilyOptions(); + FillWithSpecialChar(options_ptr, sizeof(ColumnFamilyOptions), + kColumnFamilyOptionsBlacklist); + + // Following options are not settable through + // GetColumnFamilyOptionsFromString(): + options->rate_limit_delay_max_milliseconds = 33; + options->compaction_pri = CompactionPri::kOldestSmallestSeqFirst; + options->compaction_options_universal = CompactionOptionsUniversal(); + options->compression_opts = CompressionOptions(); + options->hard_rate_limit = 0; + options->soft_rate_limit = 0; + options->compaction_options_fifo = CompactionOptionsFIFO(); + options->max_mem_compaction_level = 0; + + char* new_options_ptr = new char[sizeof(ColumnFamilyOptions)]; + ColumnFamilyOptions* new_options = + new (new_options_ptr) ColumnFamilyOptions(); + FillWithSpecialChar(new_options_ptr, sizeof(ColumnFamilyOptions), + kColumnFamilyOptionsBlacklist); + + // Need to update the option string if a new option is added. 
+ ASSERT_OK(GetColumnFamilyOptionsFromString( + *options, + "compaction_filter_factory=mpudlojcujCompactionFilterFactory;" + "table_factory=PlainTable;" + "prefix_extractor=rocksdb.CappedPrefix.13;" + "comparator=leveldb.BytewiseComparator;" + "compression_per_level=kBZip2Compression:kBZip2Compression:" + "kBZip2Compression:kNoCompression:kZlibCompression:kBZip2Compression:" + "kSnappyCompression;" + "max_bytes_for_level_base=986;" + "bloom_locality=8016;" + "target_file_size_base=4294976376;" + "memtable_prefix_bloom_huge_page_tlb_size=2557;" + "max_successive_merges=5497;" + "max_sequential_skip_in_iterations=4294971408;" + "arena_block_size=1893;" + "target_file_size_multiplier=35;" + "source_compaction_factor=54;" + "min_write_buffer_number_to_merge=9;" + "max_write_buffer_number=84;" + "write_buffer_size=1653;" + "max_grandparent_overlap_factor=64;" + "max_bytes_for_level_multiplier=60;" + "memtable_factory=SkipListFactory;" + "compression=kNoCompression;" + "min_partial_merge_operands=7576;" + "level0_stop_writes_trigger=33;" + "num_levels=99;" + "level0_slowdown_writes_trigger=22;" + "level0_file_num_compaction_trigger=14;" + "expanded_compaction_factor=34;" + "compaction_filter=urxcqstuwnCompactionFilter;" + "soft_rate_limit=530.615385;" + "soft_pending_compaction_bytes_limit=0;" + "max_write_buffer_number_to_maintain=84;" + "verify_checksums_in_compaction=false;" + "merge_operator=aabcxehazrMergeOperator;" + "memtable_prefix_bloom_bits=4642;" + "paranoid_file_checks=true;" + "inplace_update_num_locks=7429;" + "optimize_filters_for_hits=false;" + "level_compaction_dynamic_level_bytes=false;" + "inplace_update_support=false;" + "compaction_style=kCompactionStyleFIFO;" + "memtable_prefix_bloom_probes=2511;" + "purge_redundant_kvs_while_flush=true;" + "filter_deletes=false;" + "hard_pending_compaction_bytes_limit=0;" + "disable_auto_compactions=false;" + "compaction_measure_io_stats=true;", + new_options)); + + ASSERT_EQ(unset_bytes_base, + 
NumUnsetBytes(new_options_ptr, sizeof(ColumnFamilyOptions), + kColumnFamilyOptionsBlacklist)); + + options->~ColumnFamilyOptions(); + new_options->~ColumnFamilyOptions(); + + delete[] options_ptr; + delete[] new_options_ptr; +} +#endif // !__clang__ +#endif // OS_LINUX #endif // !ROCKSDB_LITE } // namespace rocksdb diff --git a/util/perf_context.cc b/util/perf_context.cc index 282516590..5fdda1081 100644 --- a/util/perf_context.cc +++ b/util/perf_context.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -61,32 +61,54 @@ void PerfContext::Reset() { #endif } -#define OUTPUT(counter) #counter << " = " << counter << ", " +#define PERF_CONTEXT_OUTPUT(counter) \ + if (!exclude_zero_counters || (counter > 0)) { \ + ss << #counter << " = " << counter << ", "; \ + } -std::string PerfContext::ToString() const { +std::string PerfContext::ToString(bool exclude_zero_counters) const { #if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE) return ""; #else std::ostringstream ss; - ss << OUTPUT(user_key_comparison_count) << OUTPUT(block_cache_hit_count) - << OUTPUT(block_read_count) << OUTPUT(block_read_byte) - << OUTPUT(block_read_time) << OUTPUT(block_checksum_time) - << OUTPUT(block_decompress_time) << OUTPUT(internal_key_skipped_count) - << OUTPUT(internal_delete_skipped_count) << OUTPUT(write_wal_time) - << OUTPUT(get_snapshot_time) << OUTPUT(get_from_memtable_time) - << OUTPUT(get_from_memtable_count) << OUTPUT(get_post_process_time) - << OUTPUT(get_from_output_files_time) << OUTPUT(seek_on_memtable_time) - << OUTPUT(seek_on_memtable_count) << OUTPUT(seek_child_seek_time) - << OUTPUT(seek_child_seek_count) << OUTPUT(seek_min_heap_time) - << 
OUTPUT(seek_internal_seek_time) << OUTPUT(find_next_user_entry_time) - << OUTPUT(write_pre_and_post_process_time) << OUTPUT(write_memtable_time) - << OUTPUT(db_mutex_lock_nanos) << OUTPUT(db_condition_wait_nanos) - << OUTPUT(merge_operator_time_nanos) << OUTPUT(write_delay_time) - << OUTPUT(read_index_block_nanos) << OUTPUT(read_filter_block_nanos) - << OUTPUT(new_table_block_iter_nanos) << OUTPUT(new_table_iterator_nanos) - << OUTPUT(block_seek_nanos) << OUTPUT(find_table_nanos) - << OUTPUT(bloom_memtable_hit_count) << OUTPUT(bloom_memtable_miss_count) - << OUTPUT(bloom_sst_hit_count) << OUTPUT(bloom_sst_miss_count); + PERF_CONTEXT_OUTPUT(user_key_comparison_count); + PERF_CONTEXT_OUTPUT(block_cache_hit_count); + PERF_CONTEXT_OUTPUT(block_read_count); + PERF_CONTEXT_OUTPUT(block_read_byte); + PERF_CONTEXT_OUTPUT(block_read_time); + PERF_CONTEXT_OUTPUT(block_checksum_time); + PERF_CONTEXT_OUTPUT(block_decompress_time); + PERF_CONTEXT_OUTPUT(internal_key_skipped_count); + PERF_CONTEXT_OUTPUT(internal_delete_skipped_count); + PERF_CONTEXT_OUTPUT(write_wal_time); + PERF_CONTEXT_OUTPUT(get_snapshot_time); + PERF_CONTEXT_OUTPUT(get_from_memtable_time); + PERF_CONTEXT_OUTPUT(get_from_memtable_count); + PERF_CONTEXT_OUTPUT(get_post_process_time); + PERF_CONTEXT_OUTPUT(get_from_output_files_time); + PERF_CONTEXT_OUTPUT(seek_on_memtable_time); + PERF_CONTEXT_OUTPUT(seek_on_memtable_count); + PERF_CONTEXT_OUTPUT(seek_child_seek_time); + PERF_CONTEXT_OUTPUT(seek_child_seek_count); + PERF_CONTEXT_OUTPUT(seek_min_heap_time); + PERF_CONTEXT_OUTPUT(seek_internal_seek_time); + PERF_CONTEXT_OUTPUT(find_next_user_entry_time); + PERF_CONTEXT_OUTPUT(write_pre_and_post_process_time); + PERF_CONTEXT_OUTPUT(write_memtable_time); + PERF_CONTEXT_OUTPUT(db_mutex_lock_nanos); + PERF_CONTEXT_OUTPUT(db_condition_wait_nanos); + PERF_CONTEXT_OUTPUT(merge_operator_time_nanos); + PERF_CONTEXT_OUTPUT(write_delay_time); + PERF_CONTEXT_OUTPUT(read_index_block_nanos); + 
PERF_CONTEXT_OUTPUT(read_filter_block_nanos); + PERF_CONTEXT_OUTPUT(new_table_block_iter_nanos); + PERF_CONTEXT_OUTPUT(new_table_iterator_nanos); + PERF_CONTEXT_OUTPUT(block_seek_nanos); + PERF_CONTEXT_OUTPUT(find_table_nanos); + PERF_CONTEXT_OUTPUT(bloom_memtable_hit_count); + PERF_CONTEXT_OUTPUT(bloom_memtable_miss_count); + PERF_CONTEXT_OUTPUT(bloom_sst_hit_count); + PERF_CONTEXT_OUTPUT(bloom_sst_miss_count); return ss.str(); #endif } diff --git a/util/perf_context_imp.h b/util/perf_context_imp.h index a5c4c39d9..ee1a7c318 100644 --- a/util/perf_context_imp.h +++ b/util/perf_context_imp.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -13,7 +13,7 @@ namespace rocksdb { #if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE) #define PERF_TIMER_GUARD(metric) -#define PERF_CONDITIONAL_TIMER_GUARD(metric, condition) +#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition) #define PERF_TIMER_MEASURE(metric) #define PERF_TIMER_STOP(metric) #define PERF_TIMER_START(metric) @@ -33,10 +33,10 @@ namespace rocksdb { PerfStepTimer perf_step_timer_ ## metric(&(perf_context.metric)); \ perf_step_timer_ ## metric.Start(); -#define PERF_CONDITIONAL_TIMER_GUARD(metric, condition) \ - PerfStepTimer perf_step_timer_##metric(&(perf_context.metric)); \ - if ((condition)) { \ - perf_step_timer_##metric.Start(); \ +#define PERF_CONDITIONAL_TIMER_FOR_MUTEX_GUARD(metric, condition) \ + PerfStepTimer perf_step_timer_##metric(&(perf_context.metric), true); \ + if ((condition)) { \ + perf_step_timer_##metric.Start(); \ } // Update metric with time elapsed since last START. 
start time is reset diff --git a/util/perf_level.cc b/util/perf_level.cc index 387ff5f1d..746510f21 100644 --- a/util/perf_level.cc +++ b/util/perf_level.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/perf_level_imp.h b/util/perf_level_imp.h index 7a8341062..ced05e924 100644 --- a/util/perf_level_imp.h +++ b/util/perf_level_imp.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/perf_step_timer.h b/util/perf_step_timer.h index 950258345..33a709f93 100644 --- a/util/perf_step_timer.h +++ b/util/perf_step_timer.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -12,12 +12,12 @@ namespace rocksdb { class PerfStepTimer { public: - PerfStepTimer(uint64_t* metric) - : enabled_(perf_level >= PerfLevel::kEnableTime), - env_(enabled_ ? 
Env::Default() : nullptr), - start_(0), - metric_(metric) { - } + explicit PerfStepTimer(uint64_t* metric, bool for_mutex = false) + : enabled_(perf_level >= PerfLevel::kEnableTime || + (!for_mutex && perf_level >= kEnableTimeExceptForMutex)), + env_(enabled_ ? Env::Default() : nullptr), + start_(0), + metric_(metric) {} ~PerfStepTimer() { Stop(); diff --git a/util/posix_logger.h b/util/posix_logger.h index 55cb34a86..61bb9e38a 100644 --- a/util/posix_logger.h +++ b/util/posix_logger.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -16,11 +16,16 @@ #include "port/sys_time.h" #include #include + #ifdef OS_LINUX +#ifndef FALLOC_FL_KEEP_SIZE #include #endif +#endif + #include "rocksdb/env.h" #include "util/iostats_context_imp.h" +#include "util/sync_point.h" #include namespace rocksdb { @@ -52,6 +57,7 @@ class PosixLogger : public Logger { fclose(file_); } virtual void Flush() override { + TEST_SYNC_POINT_CALLBACK("PosixLogger::Flush:BeginCallback", nullptr); if (flush_pending_) { flush_pending_ = false; fflush(file_); diff --git a/util/random.cc b/util/random.cc index 56944773f..9f0f9bb4c 100644 --- a/util/random.cc +++ b/util/random.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/util/random.h b/util/random.h index 8f90c7675..7428454d8 100644 --- a/util/random.h +++ b/util/random.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -102,7 +102,7 @@ class Random64 { // return "base" random bits. The effect is to pick a number in the // range [0,2^max_log-1] with exponential bias towards smaller numbers. uint64_t Skewed(int max_log) { - return Uniform(1 << Uniform(max_log + 1)); + return Uniform(uint64_t(1) << Uniform(max_log + 1)); } }; diff --git a/util/rate_limiter.cc b/util/rate_limiter.cc index 188d5f0c7..352925bec 100644 --- a/util/rate_limiter.cc +++ b/util/rate_limiter.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/rate_limiter.h b/util/rate_limiter.h index 62ae6b5ad..d413d9c6e 100644 --- a/util/rate_limiter.h +++ b/util/rate_limiter.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/util/rate_limiter_test.cc b/util/rate_limiter_test.cc index d635010a4..9085835de 100644 --- a/util/rate_limiter_test.cc +++ b/util/rate_limiter_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/slice.cc b/util/slice.cc index 4c50ff9a6..d1ddb7cd7 100644 --- a/util/slice.cc +++ b/util/slice.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/slice_transform_test.cc b/util/slice_transform_test.cc index 5b7c1b402..624b11f11 100644 --- a/util/slice_transform_test.cc +++ b/util/slice_transform_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/sst_file_manager_impl.cc b/util/sst_file_manager_impl.cc new file mode 100644 index 000000000..bbf240cad --- /dev/null +++ b/util/sst_file_manager_impl.cc @@ -0,0 +1,157 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#include "util/sst_file_manager_impl.h" + +#include + +#include "port/port.h" +#include "rocksdb/env.h" +#include "util/mutexlock.h" +#include "util/sync_point.h" + +namespace rocksdb { + +SstFileManagerImpl::SstFileManagerImpl(Env* env, std::shared_ptr logger, + const std::string& trash_dir, + int64_t rate_bytes_per_sec) + : env_(env), + logger_(logger), + total_files_size_(0), + max_allowed_space_(0), + delete_scheduler_(env, trash_dir, rate_bytes_per_sec, logger.get(), + this) {} + +SstFileManagerImpl::~SstFileManagerImpl() {} + +Status SstFileManagerImpl::OnAddFile(const std::string& file_path) { + uint64_t file_size; + Status s = env_->GetFileSize(file_path, &file_size); + if (s.ok()) { + MutexLock l(&mu_); + OnAddFileImpl(file_path, file_size); + } + TEST_SYNC_POINT("SstFileManagerImpl::OnAddFile"); + return s; +} + +Status SstFileManagerImpl::OnDeleteFile(const std::string& file_path) { + { + MutexLock l(&mu_); + OnDeleteFileImpl(file_path); + } + TEST_SYNC_POINT("SstFileManagerImpl::OnDeleteFile"); + return Status::OK(); +} + +Status SstFileManagerImpl::OnMoveFile(const std::string& old_path, + const std::string& new_path) { + { + MutexLock l(&mu_); + OnAddFileImpl(new_path, tracked_files_[old_path]); + OnDeleteFileImpl(old_path); + } + TEST_SYNC_POINT("SstFileManagerImpl::OnMoveFile"); + return Status::OK(); +} + +void SstFileManagerImpl::SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) { + MutexLock l(&mu_); + max_allowed_space_ = max_allowed_space; +} + +bool SstFileManagerImpl::IsMaxAllowedSpaceReached() { + MutexLock l(&mu_); + if (max_allowed_space_ <= 0) { + return false; + } + return total_files_size_ >= max_allowed_space_; +} + +uint64_t SstFileManagerImpl::GetTotalSize() { + MutexLock l(&mu_); + return total_files_size_; +} + +std::unordered_map +SstFileManagerImpl::GetTrackedFiles() { + MutexLock l(&mu_); + return tracked_files_; +} + +int64_t 
SstFileManagerImpl::GetDeleteRateBytesPerSecond() { + return delete_scheduler_.GetRateBytesPerSecond(); +} + +Status SstFileManagerImpl::ScheduleFileDeletion(const std::string& file_path) { + return delete_scheduler_.DeleteFile(file_path); +} + +void SstFileManagerImpl::WaitForEmptyTrash() { + delete_scheduler_.WaitForEmptyTrash(); +} + +void SstFileManagerImpl::OnAddFileImpl(const std::string& file_path, + uint64_t file_size) { + auto tracked_file = tracked_files_.find(file_path); + if (tracked_file != tracked_files_.end()) { + // File was added before, we will just update the size + total_files_size_ -= tracked_file->second; + total_files_size_ += file_size; + } else { + total_files_size_ += file_size; + } + tracked_files_[file_path] = file_size; +} + +void SstFileManagerImpl::OnDeleteFileImpl(const std::string& file_path) { + auto tracked_file = tracked_files_.find(file_path); + if (tracked_file == tracked_files_.end()) { + // File is not tracked + return; + } + + total_files_size_ -= tracked_file->second; + tracked_files_.erase(tracked_file); +} + +SstFileManager* NewSstFileManager(Env* env, std::shared_ptr info_log, + std::string trash_dir, + int64_t rate_bytes_per_sec, + bool delete_exisitng_trash, Status* status) { + SstFileManagerImpl* res = + new SstFileManagerImpl(env, info_log, trash_dir, rate_bytes_per_sec); + + Status s; + if (trash_dir != "" && rate_bytes_per_sec > 0) { + s = env->CreateDirIfMissing(trash_dir); + if (s.ok() && delete_exisitng_trash) { + std::vector files_in_trash; + s = env->GetChildren(trash_dir, &files_in_trash); + if (s.ok()) { + for (const std::string& trash_file : files_in_trash) { + if (trash_file == "." 
|| trash_file == "..") { + continue; + } + + std::string path_in_trash = trash_dir + "/" + trash_file; + res->OnAddFile(path_in_trash); + Status file_delete = res->ScheduleFileDeletion(path_in_trash); + if (s.ok() && !file_delete.ok()) { + s = file_delete; + } + } + } + } + } + + if (status) { + *status = s; + } + + return res; +} + +} // namespace rocksdb diff --git a/util/sst_file_manager_impl.h b/util/sst_file_manager_impl.h new file mode 100644 index 000000000..ca9ddedba --- /dev/null +++ b/util/sst_file_manager_impl.h @@ -0,0 +1,95 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#include + +#include "port/port.h" + +#include "rocksdb/sst_file_manager.h" +#include "util/delete_scheduler.h" + +namespace rocksdb { + +class Env; +class Logger; + +// SstFileManager is used to track SST files in the DB and control there +// deletion rate. +// All SstFileManager public functions are thread-safe. +class SstFileManagerImpl : public SstFileManager { + public: + explicit SstFileManagerImpl(Env* env, std::shared_ptr logger, + const std::string& trash_dir, + int64_t rate_bytes_per_sec); + + ~SstFileManagerImpl(); + + // DB will call OnAddFile whenever a new sst file is added. + Status OnAddFile(const std::string& file_path); + + // DB will call OnDeleteFile whenever an sst file is deleted. + Status OnDeleteFile(const std::string& file_path); + + // DB will call OnMoveFile whenever an sst file is move to a new path. + Status OnMoveFile(const std::string& old_path, const std::string& new_path); + + // Update the maximum allowed space that should be used by RocksDB, if + // the total size of the SST files exceeds max_allowed_space, writes to + // RocksDB will fail. 
+ // + // Setting max_allowed_space to 0 will disable this feature, maximum allowed + // space will be infinite (Default value). + // + // thread-safe. + void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) override; + + // Return true if the total size of SST files exceeded the maximum allowed + // space usage. + // + // thread-safe. + bool IsMaxAllowedSpaceReached() override; + + // Return the total size of all tracked files. + uint64_t GetTotalSize() override; + + // Return a map containing all tracked files and there corresponding sizes. + std::unordered_map GetTrackedFiles() override; + + // Return delete rate limit in bytes per second. + virtual int64_t GetDeleteRateBytesPerSecond() override; + + // Move file to trash directory and schedule it's deletion. + virtual Status ScheduleFileDeletion(const std::string& file_path); + + // Wait for all files being deleteing in the background to finish or for + // destructor to be called. + virtual void WaitForEmptyTrash(); + + private: + // REQUIRES: mutex locked + void OnAddFileImpl(const std::string& file_path, uint64_t file_size); + // REQUIRES: mutex locked + void OnDeleteFileImpl(const std::string& file_path); + + Env* env_; + std::shared_ptr logger_; + // Mutex to protect tracked_files_, total_files_size_ + port::Mutex mu_; + // The summation of the sizes of all files in tracked_files_ map + uint64_t total_files_size_; + // A map containing all tracked files and there sizes + // file_path => file_size + std::unordered_map tracked_files_; + // The maximum allowed space (in bytes) for sst files. + uint64_t max_allowed_space_; + // DeleteScheduler used to throttle file deletition, if SstFileManagerImpl was + // created with rate_bytes_per_sec == 0 or trash_dir == "", delete_scheduler_ + // rate limiting will be disabled and will simply delete the files. 
+ DeleteScheduler delete_scheduler_; +}; + +} // namespace rocksdb diff --git a/util/statistics.cc b/util/statistics.cc index 8a7525c81..62993b029 100644 --- a/util/statistics.cc +++ b/util/statistics.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/statistics.h b/util/statistics.h index 55914f59e..001c6715f 100644 --- a/util/statistics.h +++ b/util/statistics.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -61,7 +61,13 @@ class StatisticsImpl : public Statistics { char padding[64 - sizeof(std::atomic_uint_fast64_t)]; }; - Ticker tickers_[INTERNAL_TICKER_ENUM_MAX] __attribute__((aligned(64))); + static_assert(sizeof(Ticker) == 64, "Expecting to fit into 64 bytes"); + + // Attributes expand to nothing depending on the platform + __declspec(align(64)) + Ticker tickers_[INTERNAL_TICKER_ENUM_MAX] + __attribute__((aligned(64))); + __declspec(align(64)) HistogramImpl histograms_[INTERNAL_HISTOGRAM_ENUM_MAX] __attribute__((aligned(64))); }; diff --git a/util/status.cc b/util/status.cc index 6ff5005f9..93590d2d7 100644 --- a/util/status.cc +++ b/util/status.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/status_message.cc b/util/status_message.cc index 26ab06ddd..fc251a9b4 100644 --- a/util/status_message.cc +++ b/util/status_message.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/stop_watch.h b/util/stop_watch.h index 86cb2653c..663661aba 100644 --- a/util/stop_watch.h +++ b/util/stop_watch.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/string_util.cc b/util/string_util.cc index 4e0bc4668..d1830d27e 100644 --- a/util/string_util.cc +++ b/util/string_util.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/string_util.h b/util/string_util.h index c7cc57dab..7afbc402e 100644 --- a/util/string_util.h +++ b/util/string_util.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/sync_point.cc b/util/sync_point.cc index 11c42f100..88d36bd3d 100644 --- a/util/sync_point.cc +++ b/util/sync_point.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -21,16 +21,14 @@ void TestKillRandom(std::string kill_point, int odds, } } - time_t curtime = time(nullptr); - Random r((uint32_t)curtime); - assert(odds > 0); if (odds % 7 == 0) { - // class Rarndom uses multiplier 16807, which is 7^5. If odds are - // multiplier of 7, the first random value might have limited values. + // class Random uses multiplier 16807, which is 7^5. If odds are + // multiplier of 7, there might be limited values generated. odds++; } - bool crash = r.OneIn(odds); + auto* r = Random::GetTLSInstance(); + bool crash = r->OneIn(odds); if (crash) { port::Crash(srcfile, srcline); } diff --git a/util/sync_point.h b/util/sync_point.h index f169636a1..a9aac755e 100644 --- a/util/sync_point.h +++ b/util/sync_point.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/util/testharness.cc b/util/testharness.cc index 603f6f6e1..4c4455bfe 100644 --- a/util/testharness.cc +++ b/util/testharness.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/testharness.h b/util/testharness.h index b212b1e3a..298b16632 100644 --- a/util/testharness.h +++ b/util/testharness.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/testutil.cc b/util/testutil.cc index 8db8dac88..8c587511f 100644 --- a/util/testutil.cc +++ b/util/testutil.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -113,7 +113,7 @@ class Uint64ComparatorImpl : public Comparator { }; } // namespace -static port::OnceType once = LEVELDB_ONCE_INIT; +static port::OnceType once; static const Comparator* uint64comp; static void InitModule() { @@ -193,6 +193,7 @@ const SliceTransform* RandomSliceTransform(Random* rnd, int pre_defined) { BlockBasedTableOptions RandomBlockBasedTableOptions(Random* rnd) { BlockBasedTableOptions opt; opt.cache_index_and_filter_blocks = rnd->Uniform(2); + opt.pin_l0_filter_and_index_blocks_in_cache = rnd->Uniform(2); opt.index_type = rnd->Uniform(2) ? BlockBasedTableOptions::kBinarySearch : BlockBasedTableOptions::kHashSearch; opt.hash_index_allow_collision = rnd->Uniform(2); @@ -200,6 +201,7 @@ BlockBasedTableOptions RandomBlockBasedTableOptions(Random* rnd) { opt.block_size = rnd->Uniform(10000000); opt.block_size_deviation = rnd->Uniform(100); opt.block_restart_interval = rnd->Uniform(100); + opt.index_block_restart_interval = rnd->Uniform(100); opt.whole_key_filtering = rnd->Uniform(2); return opt; diff --git a/util/testutil.h b/util/testutil.h index d8b4f0ca2..80f6c55c1 100644 --- a/util/testutil.h +++ b/util/testutil.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -13,7 +13,6 @@ #include #include -#include "db/dbformat.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" diff --git a/util/thread_list_test.cc b/util/thread_list_test.cc index eeb2b1688..c706a2e3e 100644 --- a/util/thread_list_test.cc +++ b/util/thread_list_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/thread_local.cc b/util/thread_local.cc index 7fb7a27dc..5f3fddae5 100644 --- a/util/thread_local.cc +++ b/util/thread_local.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -104,7 +104,6 @@ PIMAGE_TLS_CALLBACK p_thread_callback_on_exit = wintlscleanup::WinOnThreadExit; void ThreadLocalPtr::InitSingletons() { ThreadLocalPtr::StaticMeta::InitSingletons(); - ThreadLocalPtr::Instance(); } ThreadLocalPtr::StaticMeta* ThreadLocalPtr::Instance() { @@ -113,30 +112,46 @@ ThreadLocalPtr::StaticMeta* ThreadLocalPtr::Instance() { // when the function is first call. As a result, we can properly // control their construction order by properly preparing their // first function call. - static ThreadLocalPtr::StaticMeta inst; - return &inst; + // + // Note that here we decide to make "inst" a static pointer w/o deleting + // it at the end instead of a static variable. This is to avoid the following + // destruction order disaster that happens when a child thread using ThreadLocalPtr + // dies AFTER the main thread dies: When a child thread happens to use + // ThreadLocalPtr, it will try to delete its thread-local data on its + // OnThreadExit when the child thread dies. However, OnThreadExit depends + // on the following variable.
As a result, if the main thread dies before any + child thread that happens to use ThreadLocalPtr dies, then the destruction of + the following variable will go first, then OnThreadExit, therefore causing + invalid access. + // + // The above problem can be solved by using thread_local to store tls_ instead + // of using __thread. The major difference between thread_local and __thread + // is that thread_local supports dynamic construction and destruction of + // non-primitive typed variables. As a result, we can guarantee the + // destruction order even when the main thread dies before any child threads. + // However, thread_local requires gcc 4.8 and is not supported in all the + // compilers that accept -std=c++11 (e.g., the default clang on Mac), while + // the current RocksDB still accepts gcc 4.7. + static ThreadLocalPtr::StaticMeta* inst = new ThreadLocalPtr::StaticMeta(); + return inst; } void ThreadLocalPtr::StaticMeta::InitSingletons() { Mutex(); } -port::Mutex* ThreadLocalPtr::StaticMeta::Mutex() { - // Here we prefer function static variable instead of global - // static variable as function static variable is initialized - // when the function is first call. As a result, we can properly - // control their construction order by properly preparing their - // first function call. - static port::Mutex mutex; - return &mutex; -} +port::Mutex* ThreadLocalPtr::StaticMeta::Mutex() { return &Instance()->mutex_; } void ThreadLocalPtr::StaticMeta::OnThreadExit(void* ptr) { auto* tls = static_cast(ptr); assert(tls != nullptr); - auto* inst = Instance(); + // Use the cached StaticMeta::Instance() instead of directly calling it, since + // the variable inside StaticMeta::Instance() might already go out of + // scope here in case this OnThreadExit is called after the main thread + dies.
+ auto* inst = tls->inst; pthread_setspecific(inst->pthread_key_, nullptr); - MutexLock l(Mutex()); + MutexLock l(inst->MemberMutex()); inst->RemoveThreadData(tls); // Unref stored pointers of current thread from all instances uint32_t id = 0; @@ -154,7 +169,7 @@ void ThreadLocalPtr::StaticMeta::OnThreadExit(void* ptr) { delete tls; } -ThreadLocalPtr::StaticMeta::StaticMeta() : next_instance_id_(0) { +ThreadLocalPtr::StaticMeta::StaticMeta() : next_instance_id_(0), head_(this) { if (pthread_key_create(&pthread_key_, &OnThreadExit) != 0) { abort(); } @@ -221,7 +236,7 @@ ThreadLocalPtr::ThreadData* ThreadLocalPtr::StaticMeta::GetThreadLocal() { if (UNLIKELY(tls_ == nullptr)) { auto* inst = Instance(); - tls_ = new ThreadData(); + tls_ = new ThreadData(inst); { // Register it in the global chain, needs to be done before thread exit // handler registration diff --git a/util/thread_local.h b/util/thread_local.h index 72991724e..3adf8ba85 100644 --- a/util/thread_local.h +++ b/util/thread_local.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -79,6 +79,8 @@ class ThreadLocalPtr { std::atomic ptr; }; + class StaticMeta; + // This is the structure that is declared as "thread_local" storage. // The vector keep list of atomic pointer for all instances for "current" // thread. 
The vector is indexed by an Id that is unique in process and @@ -95,10 +97,11 @@ class ThreadLocalPtr { // | thread 3 | void* | void* | void* | <- ThreadData // --------------------------------------------------- struct ThreadData { - ThreadData() : entries() {} + explicit ThreadData(StaticMeta* _inst) : entries(), inst(_inst) {} std::vector entries; ThreadData* next; ThreadData* prev; + StaticMeta* inst; }; class StaticMeta { @@ -139,6 +142,31 @@ class ThreadLocalPtr { // initialized will be no-op. static void InitSingletons(); + // protect inst, next_instance_id_, free_instance_ids_, head_, + // ThreadData.entries + // + // Note that here we prefer function static variable instead of the usual + // global static variable. The reason is that c++ destruction order of + // static variables in the reverse order of their construction order. + // However, C++ does not guarantee any construction order when global + // static variables are defined in different files, while the function + // static variables are initialized when their function are first called. + // As a result, the construction order of the function static variables + // can be controlled by properly invoke their first function calls in + // the right order. + // + // For instance, the following function contains a function static + // variable. We place a dummy function call of this inside + // Env::Default() to ensure the construction order of the construction + // order. + static port::Mutex* Mutex(); + + // Returns the member mutex of the current StaticMeta. In general, + // Mutex() should be used instead of this one. However, in case where + // the static variable inside Instance() goes out of scope, MemberMutex() + // should be used. One example is OnThreadExit() function. 
+ port::Mutex* MemberMutex() { return &mutex_; } + private: // Get UnrefHandler for id with acquiring mutex // REQUIRES: mutex locked @@ -169,24 +197,9 @@ class ThreadLocalPtr { std::unordered_map handler_map_; - // protect inst, next_instance_id_, free_instance_ids_, head_, - // ThreadData.entries - // - // Note that here we prefer function static variable instead of the usual - // global static variable. The reason is that c++ destruction order of - // static variables in the reverse order of their construction order. - // However, C++ does not guarantee any construction order when global - // static variables are defined in different files, while the function - // static variables are initialized when their function are first called. - // As a result, the construction order of the function static variables - // can be controlled by properly invoke their first function calls in - // the right order. - // - // For instance, the following function contains a function static - // variable. We place a dummy function call of this inside - // Env::Default() to ensure the construction order of the construction - // order. - static port::Mutex* Mutex(); + // The private mutex. Developers should always use Mutex() instead of + // using this variable directly. + port::Mutex mutex_; #if ROCKSDB_SUPPORT_THREAD_LOCAL // Thread local storage static __thread ThreadData* tls_; diff --git a/util/thread_local_test.cc b/util/thread_local_test.cc index 368818669..737a2654f 100644 --- a/util/thread_local_test.cc +++ b/util/thread_local_test.cc @@ -1,16 +1,19 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
+#include #include +#include #include "rocksdb/env.h" #include "port/port.h" #include "util/autovector.h" -#include "util/thread_local.h" +#include "util/sync_point.h" #include "util/testharness.h" #include "util/testutil.h" +#include "util/thread_local.h" namespace rocksdb { @@ -467,6 +470,47 @@ TEST_F(ThreadLocalTest, CompareAndSwap) { ASSERT_EQ(tls.Get(), reinterpret_cast(3)); } +namespace { + +void* AccessThreadLocal(void* arg) { + TEST_SYNC_POINT("AccessThreadLocal:Start"); + ThreadLocalPtr tlp; + tlp.Reset(new std::string("hello RocksDB")); + TEST_SYNC_POINT("AccessThreadLocal:End"); + return nullptr; +} + +} // namespace + +// The following test is disabled as it requires manual steps to run it +// correctly. +// +// Currently we have no way to acess SyncPoint w/o ASAN error when the +// child thread dies after the main thread dies. So if you manually enable +// this test and only see an ASAN error on SyncPoint, it means you pass the +// test. +TEST_F(ThreadLocalTest, DISABLED_MainThreadDiesFirst) { + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"AccessThreadLocal:Start", "MainThreadDiesFirst:End"}, + {"PosixEnv::~PosixEnv():End", "AccessThreadLocal:End"}}); + + // Triggers the initialization of singletons. + Env::Default(); + +#ifndef ROCKSDB_LITE + try { +#endif // ROCKSDB_LITE + std::thread th(&AccessThreadLocal, nullptr); + th.detach(); + TEST_SYNC_POINT("MainThreadDiesFirst:End"); +#ifndef ROCKSDB_LITE + } catch (const std::system_error& ex) { + std::cerr << "Start thread: " << ex.code() << std::endl; + ASSERT_TRUE(false); + } +#endif // ROCKSDB_LITE +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/util/thread_operation.h b/util/thread_operation.h index e55596c1b..ace619817 100644 --- a/util/thread_operation.h +++ b/util/thread_operation.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/thread_posix.cc b/util/thread_posix.cc index 88e67ed76..f09abd54c 100644 --- a/util/thread_posix.cc +++ b/util/thread_posix.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/thread_posix.h b/util/thread_posix.h index c5d643878..96dfe1e1e 100644 --- a/util/thread_posix.h +++ b/util/thread_posix.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/thread_status_impl.cc b/util/thread_status_impl.cc index 50cb355bb..e9a702bba 100644 --- a/util/thread_status_impl.cc +++ b/util/thread_status_impl.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/util/thread_status_updater.cc b/util/thread_status_updater.cc index 3b93f2087..a3f9a9afc 100644 --- a/util/thread_status_updater.cc +++ b/util/thread_status_updater.cc @@ -1,13 +1,13 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. +#include "util/thread_status_updater.h" #include #include "rocksdb/env.h" #include "port/likely.h" #include "util/mutexlock.h" -#include "util/thread_status_updater.h" namespace rocksdb { @@ -246,7 +246,9 @@ void ThreadStatusUpdater::EraseColumnFamilyInfo(const void* cf_key) { // a consistent view of global column family table (cf_info_map). std::lock_guard lck(thread_list_mutex_); auto cf_pair = cf_info_map_.find(cf_key); - assert(cf_pair != cf_info_map_.end()); + if (cf_pair == cf_info_map_.end()) { + return; + } auto* cf_info = cf_pair->second.get(); assert(cf_info); @@ -278,7 +280,9 @@ void ThreadStatusUpdater::EraseDatabaseInfo(const void* db_key) { size_t result __attribute__((unused)) = 0; for (auto cf_key : db_pair->second) { auto cf_pair = cf_info_map_.find(cf_key); - assert(cf_pair != cf_info_map_.end()); + if (cf_pair == cf_info_map_.end()) { + continue; + } cf_pair->second.reset(); result = cf_info_map_.erase(cf_key); assert(result); diff --git a/util/thread_status_updater.h b/util/thread_status_updater.h index e7c7007d4..23d6d6f58 100644 --- a/util/thread_status_updater.h +++ b/util/thread_status_updater.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/thread_status_updater_debug.cc b/util/thread_status_updater_debug.cc index 274f427d3..501181faf 100644 --- a/util/thread_status_updater_debug.cc +++ b/util/thread_status_updater_debug.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/thread_status_util.cc b/util/thread_status_util.cc index e67a8e4ef..d573e0566 100644 --- a/util/thread_status_util.cc +++ b/util/thread_status_util.cc @@ -1,11 +1,12 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
+#include "util/thread_status_util.h" + #include "rocksdb/env.h" #include "util/thread_status_updater.h" -#include "util/thread_status_util.h" namespace rocksdb { @@ -33,12 +34,14 @@ void ThreadStatusUtil::UnregisterThread() { } } -void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* cfd) { - if (!MaybeInitThreadLocalUpdater(cfd->ioptions()->env)) { +void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* cfd, + const Env* env, + bool enable_thread_tracking) { + if (!MaybeInitThreadLocalUpdater(env)) { return; } assert(thread_updater_local_cache_); - if (cfd != nullptr && cfd->options()->enable_thread_tracking) { + if (cfd != nullptr && enable_thread_tracking) { thread_updater_local_cache_->SetColumnFamilyInfoKey(cfd); } else { // When cfd == nullptr or enable_thread_tracking == false, we set @@ -118,15 +121,17 @@ void ThreadStatusUtil::ResetThreadStatus() { thread_updater_local_cache_->ResetThreadStatus(); } -void ThreadStatusUtil::NewColumnFamilyInfo( - const DB* db, const ColumnFamilyData* cfd) { - if (!MaybeInitThreadLocalUpdater(cfd->ioptions()->env)) { +void ThreadStatusUtil::NewColumnFamilyInfo(const DB* db, + const ColumnFamilyData* cfd, + const std::string& cf_name, + const Env* env) { + if (!MaybeInitThreadLocalUpdater(env)) { return; } assert(thread_updater_local_cache_); if (thread_updater_local_cache_) { - thread_updater_local_cache_->NewColumnFamilyInfo( - db, db->GetName(), cfd, cfd->GetName()); + thread_updater_local_cache_->NewColumnFamilyInfo(db, db->GetName(), cfd, + cf_name); } } @@ -171,8 +176,9 @@ bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* env) { return false; } -void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* cfd) { -} +void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* cfd, + const Env* env, + bool enable_thread_tracking) {} void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType op) { } @@ -188,9 +194,10 @@ void ThreadStatusUtil::IncreaseThreadOperationProperty( 
void ThreadStatusUtil::SetThreadState(ThreadStatus::StateType state) { } -void ThreadStatusUtil::NewColumnFamilyInfo( - const DB* db, const ColumnFamilyData* cfd) { -} +void ThreadStatusUtil::NewColumnFamilyInfo(const DB* db, + const ColumnFamilyData* cfd, + const std::string& cf_name, + const Env* env) {} void ThreadStatusUtil::EraseColumnFamilyInfo( const ColumnFamilyData* cfd) { diff --git a/util/thread_status_util.h b/util/thread_status_util.h index aa13a6c40..3445182ec 100644 --- a/util/thread_status_util.h +++ b/util/thread_status_util.h @@ -1,18 +1,20 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. #pragma once -#include "db/column_family.h" +#include + +#include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/thread_status.h" #include "util/thread_status_updater.h" namespace rocksdb { -class ColumnFamilyData; +class ColumnFamilyData; // The static utility class for updating thread-local status. // @@ -37,8 +39,8 @@ class ThreadStatusUtil { // Create an entry in the global ColumnFamilyInfo table for the // specified column family. This function should be called only // when the current thread does not hold db_mutex. - static void NewColumnFamilyInfo( - const DB* db, const ColumnFamilyData* cfd); + static void NewColumnFamilyInfo(const DB* db, const ColumnFamilyData* cfd, + const std::string& cf_name, const Env* env); // Erase the ConstantColumnFamilyInfo that is associated with the // specified ColumnFamilyData. This function should be called only @@ -52,7 +54,8 @@ class ThreadStatusUtil { // Update the thread status to indicate the current thread is doing // something related to the specified column family. 
- static void SetColumnFamily(const ColumnFamilyData* cfd); + static void SetColumnFamily(const ColumnFamilyData* cfd, const Env* env, + bool enable_thread_tracking); static void SetThreadOperation(ThreadStatus::OperationType type); diff --git a/util/thread_status_util_debug.cc b/util/thread_status_util_debug.cc index 94b19f3d2..355bd9784 100644 --- a/util/thread_status_util_debug.cc +++ b/util/thread_status_util_debug.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/util/transaction_test_util.cc b/util/transaction_test_util.cc new file mode 100644 index 000000000..7ec990374 --- /dev/null +++ b/util/transaction_test_util.cc @@ -0,0 +1,237 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+#ifndef ROCKSDB_LITE + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include "util/transaction_test_util.h" + +#include +#include + +#include "rocksdb/db.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/transaction.h" +#include "rocksdb/utilities/transaction_db.h" +#include "util/random.h" +#include "util/string_util.h" + +namespace rocksdb { + +RandomTransactionInserter::RandomTransactionInserter( + Random64* rand, const WriteOptions& write_options, + const ReadOptions& read_options, uint64_t num_keys, uint16_t num_sets) + : rand_(rand), + write_options_(write_options), + read_options_(read_options), + num_keys_(num_keys), + num_sets_(num_sets) {} + +RandomTransactionInserter::~RandomTransactionInserter() { + if (txn_ != nullptr) { + delete txn_; + } + if (optimistic_txn_ != nullptr) { + delete optimistic_txn_; + } +} + +bool RandomTransactionInserter::TransactionDBInsert( + TransactionDB* db, const TransactionOptions& txn_options) { + txn_ = db->BeginTransaction(write_options_, txn_options, txn_); + + return DoInsert(nullptr, txn_, false); +} + +bool RandomTransactionInserter::OptimisticTransactionDBInsert( + OptimisticTransactionDB* db, + const OptimisticTransactionOptions& txn_options) { + optimistic_txn_ = + db->BeginTransaction(write_options_, txn_options, optimistic_txn_); + + return DoInsert(nullptr, optimistic_txn_, true); +} + +bool RandomTransactionInserter::DBInsert(DB* db) { + return DoInsert(db, nullptr, false); +} + +bool RandomTransactionInserter::DoInsert(DB* db, Transaction* txn, + bool is_optimistic) { + Status s; + WriteBatch batch; + std::string value; + + // pick a random number to use to increment a key in each set + uint64_t incr = (rand_->Next() % 100) + 1; + + bool unexpected_error = false; + + // For each set, pick a key at random and increment it + for (uint8_t i = 0; i < num_sets_; i++) { + uint64_t int_value = 0; + char prefix_buf[5]; + // prefix_buf needs to be large 
enough to hold a uint16 in string form + + // key format: [SET#][random#] + std::string rand_key = ToString(rand_->Next() % num_keys_); + Slice base_key(rand_key); + + // Pad prefix appropriately so we can iterate over each set + snprintf(prefix_buf, sizeof(prefix_buf), "%.4u", i + 1); + std::string full_key = std::string(prefix_buf) + base_key.ToString(); + Slice key(full_key); + + if (txn != nullptr) { + s = txn->GetForUpdate(read_options_, key, &value); + } else { + s = db->Get(read_options_, key, &value); + } + + if (s.ok()) { + // Found key, parse its value + int_value = std::stoull(value); + + if (int_value == 0 || int_value == ULONG_MAX) { + unexpected_error = true; + fprintf(stderr, "Get returned unexpected value: %s\n", value.c_str()); + s = Status::Corruption(); + } + } else if (s.IsNotFound()) { + // Have not yet written to this key, so assume its value is 0 + int_value = 0; + s = Status::OK(); + } else { + // Optimistic transactions should never return non-ok status here. + // Non-optimistic transactions may return write-coflict/timeout errors. + if (is_optimistic || !(s.IsBusy() || s.IsTimedOut() || s.IsTryAgain())) { + fprintf(stderr, "Get returned an unexpected error: %s\n", + s.ToString().c_str()); + unexpected_error = true; + } + break; + } + + if (s.ok()) { + // Increment key + std::string sum = ToString(int_value + incr); + if (txn != nullptr) { + s = txn->Put(key, sum); + if (!s.ok()) { + // Since we did a GetForUpdate, Put should not fail. + fprintf(stderr, "Put returned an unexpected error: %s\n", + s.ToString().c_str()); + unexpected_error = true; + } + } else { + batch.Put(key, sum); + } + } + } + + if (s.ok()) { + if (txn != nullptr) { + s = txn->Commit(); + + if (!s.ok()) { + if (is_optimistic) { + // Optimistic transactions can have write-conflict errors on commit. + // Any other error is unexpected. 
+ if (!(s.IsBusy() || s.IsTimedOut() || s.IsTryAgain())) { + unexpected_error = true; + } + } else { + // Non-optimistic transactions should only fail due to expiration + // or write failures. For testing purproses, we do not expect any + // write failures. + if (!s.IsExpired()) { + unexpected_error = true; + } + } + + if (unexpected_error) { + fprintf(stderr, "Commit returned an unexpected error: %s\n", + s.ToString().c_str()); + } + } + + } else { + s = db->Write(write_options_, &batch); + if (!s.ok()) { + unexpected_error = true; + fprintf(stderr, "Write returned an unexpected error: %s\n", + s.ToString().c_str()); + } + } + } else { + if (txn != nullptr) { + txn->Rollback(); + } + } + + if (s.ok()) { + success_count_++; + } else { + failure_count_++; + } + + last_status_ = s; + + // return success if we didn't get any unexpected errors + return !unexpected_error; +} + +Status RandomTransactionInserter::Verify(DB* db, uint16_t num_sets) { + uint64_t prev_total = 0; + + // For each set of keys with the same prefix, sum all the values + for (uint32_t i = 0; i < num_sets; i++) { + char prefix_buf[5]; + snprintf(prefix_buf, sizeof(prefix_buf), "%.4u", i + 1); + uint64_t total = 0; + + Iterator* iter = db->NewIterator(ReadOptions()); + + for (iter->Seek(Slice(prefix_buf, 4)); iter->Valid(); iter->Next()) { + Slice key = iter->key(); + + // stop when we reach a different prefix + if (key.ToString().compare(0, 4, prefix_buf) != 0) { + break; + } + + Slice value = iter->value(); + uint64_t int_value = std::stoull(value.ToString()); + if (int_value == 0 || int_value == ULONG_MAX) { + fprintf(stderr, "Iter returned unexpected value: %s\n", + value.ToString().c_str()); + return Status::Corruption(); + } + + total += int_value; + } + delete iter; + + if (i > 0) { + if (total != prev_total) { + fprintf(stderr, + "RandomTransactionVerify found inconsistent totals. 
" + "Set[%" PRIu32 "]: %" PRIu64 ", Set[%" PRIu32 "]: %" PRIu64 + " \n", + i - 1, prev_total, i, total); + return Status::Corruption(); + } + } + prev_total = total; + } + + return Status::OK(); +} + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/util/transaction_test_util.h b/util/transaction_test_util.h new file mode 100644 index 000000000..97c62841f --- /dev/null +++ b/util/transaction_test_util.h @@ -0,0 +1,112 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#ifndef ROCKSDB_LITE + +#include "rocksdb/options.h" +#include "port/port.h" +#include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/transaction_db.h" + +namespace rocksdb { + +class DB; +class Random64; + +// Utility class for stress testing transactions. Can be used to write many +// transactions in parallel and then validate that the data written is logically +// consistent. This class assumes the input DB is initially empty. +// +// Each call to TransactionDBInsert()/OptimisticTransactionDBInsert() will +// increment the value of a key in #num_sets sets of keys. Regardless of +// whether the transaction succeeds, the total sum of values of keys in each +// set is an invariant that should remain equal. +// +// After calling TransactionDBInsert()/OptimisticTransactionDBInsert() many +// times, Verify() can be called to validate that the invariant holds. +// +// To test writing Transaction in parallel, multiple threads can create a +// RandomTransactionInserter with similar arguments using the same DB. +class RandomTransactionInserter { + public: + // num_keys is the number of keys in each set. + // num_sets is the number of sets of keys. 
+ explicit RandomTransactionInserter( + Random64* rand, const WriteOptions& write_options = WriteOptions(), + const ReadOptions& read_options = ReadOptions(), uint64_t num_keys = 1000, + uint16_t num_sets = 3); + + ~RandomTransactionInserter(); + + // Increment a key in each set using a Transaction on a TransactionDB. + // + // Returns true if the transaction succeeded OR if any error encountered was + // expected (eg a write-conflict). Error status may be obtained by calling + // GetLastStatus(); + bool TransactionDBInsert( + TransactionDB* db, + const TransactionOptions& txn_options = TransactionOptions()); + + // Increment a key in each set using a Transaction on an + // OptimisticTransactionDB + // + // Returns true if the transaction succeeded OR if any error encountered was + // expected (eg a write-conflict). Error status may be obtained by calling + // GetLastStatus(); + bool OptimisticTransactionDBInsert( + OptimisticTransactionDB* db, + const OptimisticTransactionOptions& txn_options = + OptimisticTransactionOptions()); + // Increment a key in each set without using a transaction. If this function + // is called in parallel, then Verify() may fail. + // + // Returns true if the write succeeds. + // Error status may be obtained by calling GetLastStatus(). + bool DBInsert(DB* db); + + // Returns OK if Invariant is true. + static Status Verify(DB* db, uint16_t num_sets); + + // Returns the status of the previous Insert operation + Status GetLastStatus() { return last_status_; } + + // Returns the number of successfully written calls to + // TransactionDBInsert/OptimisticTransactionDBInsert/DBInsert + uint64_t GetSuccessCount() { return success_count_; } + + // Returns the number of calls to + // TransactionDBInsert/OptimisticTransactionDBInsert/DBInsert that did not + // write any data. 
+ uint64_t GetFailureCount() { return failure_count_; } + + private: + // Input options + Random64* rand_; + const WriteOptions write_options_; + const ReadOptions read_options_; + const uint64_t num_keys_; + const uint16_t num_sets_; + + // Number of successful insert batches performed + uint64_t success_count_ = 0; + + // Number of failed insert batches attempted + uint64_t failure_count_ = 0; + + // Status returned by most recent insert operation + Status last_status_; + + // optimization: re-use allocated transaction objects. + Transaction* txn_ = nullptr; + Transaction* optimistic_txn_ = nullptr; + + bool DoInsert(DB* db, Transaction* txn, bool is_optimistic); +}; + +} // namespace rocksdb + +#endif // ROCKSDB_LITE diff --git a/util/xfunc.cc b/util/xfunc.cc index 98de1c594..fc812279f 100644 --- a/util/xfunc.cc +++ b/util/xfunc.cc @@ -1,19 +1,17 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
#ifdef XFUNC +#include "util/xfunc.h" + #include -#include "db/db_impl.h" -#include "db/managed_iterator.h" -#include "db/write_callback.h" + #include "rocksdb/db.h" #include "rocksdb/options.h" #include "rocksdb/utilities/optimistic_transaction_db.h" #include "rocksdb/write_batch.h" -#include "util/xfunc.h" - namespace rocksdb { @@ -29,12 +27,6 @@ void GetXFTestOptions(Options* options, int skip_policy) { } } -void xf_manage_release(ManagedIterator* iter) { - if (!(XFuncPoint::GetSkip() & kSkipNoPrefix)) { - iter->ReleaseIter(false); - } -} - void xf_manage_options(ReadOptions* read_options) { if (!XFuncPoint::Check("managed_xftest_dropold") && (!XFuncPoint::Check("managed_xftest_release"))) { @@ -43,31 +35,6 @@ void xf_manage_options(ReadOptions* read_options) { read_options->managed = true; } -void xf_manage_new(DBImpl* db, ReadOptions* read_options, - bool is_snapshot_supported) { - if ((!XFuncPoint::Check("managed_xftest_dropold") && - (!XFuncPoint::Check("managed_xftest_release"))) || - (!read_options->managed)) { - return; - } - if ((!read_options->tailing) && (read_options->snapshot == nullptr) && - (!is_snapshot_supported)) { - read_options->managed = false; - return; - } - if (db->GetOptions().prefix_extractor != nullptr) { - if (strcmp(db->GetOptions().table_factory.get()->Name(), "PlainTable")) { - if (!(XFuncPoint::GetSkip() & kSkipNoPrefix)) { - read_options->total_order_seek = true; - } - } else { - read_options->managed = false; - } - } -} - -void xf_manage_create(ManagedIterator* iter) { iter->SetDropOld(false); } - void xf_transaction_set_memtable_history( int32_t* max_write_buffer_number_to_maintain) { *max_write_buffer_number_to_maintain = 10; @@ -78,106 +45,6 @@ void xf_transaction_clear_memtable_history( *max_write_buffer_number_to_maintain = 0; } -class XFTransactionWriteHandler : public WriteBatch::Handler { - public: - OptimisticTransaction* txn_; - DBImpl* db_impl_; - - XFTransactionWriteHandler(OptimisticTransaction* txn, DBImpl* db_impl) 
- : txn_(txn), db_impl_(db_impl) {} - - virtual Status PutCF(uint32_t column_family_id, const Slice& key, - const Slice& value) override { - InstrumentedMutexLock l(&db_impl_->mutex_); - - ColumnFamilyHandle* cfh = db_impl_->GetColumnFamilyHandle(column_family_id); - if (cfh == nullptr) { - return Status::InvalidArgument( - "XFUNC test could not find column family " - "handle for id ", - ToString(column_family_id)); - } - - txn_->Put(cfh, key, value); - - return Status::OK(); - } - - virtual Status MergeCF(uint32_t column_family_id, const Slice& key, - const Slice& value) override { - InstrumentedMutexLock l(&db_impl_->mutex_); - - ColumnFamilyHandle* cfh = db_impl_->GetColumnFamilyHandle(column_family_id); - if (cfh == nullptr) { - return Status::InvalidArgument( - "XFUNC test could not find column family " - "handle for id ", - ToString(column_family_id)); - } - - txn_->Merge(cfh, key, value); - - return Status::OK(); - } - - virtual Status DeleteCF(uint32_t column_family_id, - const Slice& key) override { - InstrumentedMutexLock l(&db_impl_->mutex_); - - ColumnFamilyHandle* cfh = db_impl_->GetColumnFamilyHandle(column_family_id); - if (cfh == nullptr) { - return Status::InvalidArgument( - "XFUNC test could not find column family " - "handle for id ", - ToString(column_family_id)); - } - - txn_->Delete(cfh, key); - - return Status::OK(); - } - - virtual void LogData(const Slice& blob) override { txn_->PutLogData(blob); } -}; - -// Whenever DBImpl::Write is called, create a transaction and do the write via -// the transaction. 
-void xf_transaction_write(const WriteOptions& write_options, - const DBOptions& db_options, WriteBatch* my_batch, - WriteCallback* callback, DBImpl* db_impl, Status* s, - bool* write_attempted) { - if (callback != nullptr) { - // We may already be in a transaction, don't force a transaction - *write_attempted = false; - return; - } - - OptimisticTransactionDB* txn_db = new OptimisticTransactionDB(db_impl); - OptimisticTransaction* txn = - OptimisticTransaction::BeginTransaction(txn_db, write_options); - - XFTransactionWriteHandler handler(txn, db_impl); - *s = my_batch->Iterate(&handler); - - if (!s->ok()) { - Log(InfoLogLevel::ERROR_LEVEL, db_options.info_log, - "XFUNC test could not iterate batch. status: $s\n", - s->ToString().c_str()); - } - - *s = txn->Commit(); - - if (!s->ok()) { - Log(InfoLogLevel::ERROR_LEVEL, db_options.info_log, - "XFUNC test could not commit transaction. status: $s\n", - s->ToString().c_str()); - } - - *write_attempted = true; - delete txn; - delete txn_db; -} - } // namespace rocksdb #endif // XFUNC diff --git a/util/xfunc.h b/util/xfunc.h index 2b3b0e3ee..e19a03f1c 100644 --- a/util/xfunc.h +++ b/util/xfunc.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -7,6 +7,8 @@ #include #include +#include "rocksdb/options.h" + namespace rocksdb { /* @@ -31,25 +33,12 @@ namespace rocksdb { #define XFUNC_TEST(condition, location, lfname, fname, ...) 
#else -struct Options; -struct WriteOptions; -class ManagedIterator; -class DBImpl; void GetXFTestOptions(Options* options, int skip_policy); -void xf_manage_release(ManagedIterator* iter); -void xf_manage_new(DBImpl* db, ReadOptions* readoptions, - bool is_snapshot_supported); -void xf_manage_create(ManagedIterator* iter); void xf_manage_options(ReadOptions* read_options); void xf_transaction_set_memtable_history( int32_t* max_write_buffer_number_to_maintain); void xf_transaction_clear_memtable_history( int32_t* max_write_buffer_number_to_maintain); -void xf_transaction_write(const WriteOptions& write_options, - const DBOptions& db_options, - class WriteBatch* my_batch, - class WriteCallback* callback, DBImpl* db_impl, - Status* success, bool* write_attempted); // This class provides the facility to run custom code to test a specific // feature typically with all existing unit tests. diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index bbaf75b98..dd544871d 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -20,10 +20,11 @@ #include "rocksdb/rate_limiter.h" #include "rocksdb/transaction_log.h" #include "port/port.h" +#include "util/sync_point.h" #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS -#endif +#endif // __STDC_FORMAT_MACROS #include #include @@ -39,8 +40,6 @@ #include #include #include -#include "port/port.h" - namespace rocksdb { @@ -118,6 +117,12 @@ class BackupEngineImpl : public BackupEngine { private: void DeleteChildren(const std::string& dir, uint32_t file_type_filter = 0); + // Extends the "result" map with pathname->size mappings for the contents of + // "dir". Pathnames are prefixed with "dir". + Status InsertPathnameToSizeBytes( + const std::string& dir, + std::unordered_map* result); + struct FileInfo { FileInfo(const std::string& fname, uint64_t sz, uint32_t checksum) : refs(0), filename(fname), size(sz), checksum_value(checksum) {} @@ -180,8 +185,10 @@ class BackupEngineImpl : public BackupEngine { return files_; } - Status LoadFromFile(const std::string& backup_dir, - bool use_size_in_file_name); + // @param abs_path_to_size Pre-fetched file sizes (bytes). + Status LoadFromFile( + const std::string& backup_dir, + const std::unordered_map& abs_path_to_size); Status StoreToFile(bool sync); std::string GetInfoString() { @@ -269,43 +276,59 @@ class BackupEngineImpl : public BackupEngine { } Status PutLatestBackupFileContents(uint32_t latest_backup); - // if size_limit == 0, there is no size limit, copy everything - Status CopyFile(const std::string& src, const std::string& dst, Env* src_env, - Env* dst_env, bool sync, RateLimiter* rate_limiter, - uint64_t* size = nullptr, uint32_t* checksum_value = nullptr, - uint64_t size_limit = 0, - std::function progress_callback = []() {}); + + // If size_limit == 0, there is no size limit, copy everything. + // + // Exactly one of src and contents must be non-empty. + // + // @param src If non-empty, the file is copied from this pathname. 
+ // @param contents If non-empty, the file will be created with these contents. + Status CopyOrCreateFile(const std::string& src, const std::string& dst, + const std::string& contents, Env* src_env, + Env* dst_env, bool sync, RateLimiter* rate_limiter, + uint64_t* size = nullptr, + uint32_t* checksum_value = nullptr, + uint64_t size_limit = 0, + std::function progress_callback = []() {}); Status CalculateChecksum(const std::string& src, Env* src_env, uint64_t size_limit, uint32_t* checksum_value); - struct CopyResult { + struct CopyOrCreateResult { uint64_t size; uint32_t checksum_value; Status status; }; - struct CopyWorkItem { + + // Exactly one of src_path and contents must be non-empty. If src_path is + // non-empty, the file is copied from this pathname. Otherwise, if contents is + // non-empty, the file will be created at dst_path with these contents. + struct CopyOrCreateWorkItem { std::string src_path; std::string dst_path; + std::string contents; Env* src_env; Env* dst_env; bool sync; RateLimiter* rate_limiter; uint64_t size_limit; - std::promise result; + std::promise result; std::function progress_callback; - CopyWorkItem() {} - CopyWorkItem(const CopyWorkItem&) = delete; - CopyWorkItem& operator=(const CopyWorkItem&) = delete; + CopyOrCreateWorkItem() {} + CopyOrCreateWorkItem(const CopyOrCreateWorkItem&) = delete; + CopyOrCreateWorkItem& operator=(const CopyOrCreateWorkItem&) = delete; - CopyWorkItem(CopyWorkItem&& o) ROCKSDB_NOEXCEPT { *this = std::move(o); } + CopyOrCreateWorkItem(CopyOrCreateWorkItem&& o) ROCKSDB_NOEXCEPT { + *this = std::move(o); + } - CopyWorkItem& operator=(CopyWorkItem&& o) ROCKSDB_NOEXCEPT { + CopyOrCreateWorkItem& operator=(CopyOrCreateWorkItem&& o) ROCKSDB_NOEXCEPT { src_path = std::move(o.src_path); dst_path = std::move(o.dst_path); + contents = std::move(o.contents); src_env = o.src_env; dst_env = o.dst_env; sync = o.sync; @@ -316,12 +339,14 @@ class BackupEngineImpl : public BackupEngine { return *this; } - 
CopyWorkItem(std::string _src_path, std::string _dst_path, Env* _src_env, - Env* _dst_env, bool _sync, RateLimiter* _rate_limiter, - uint64_t _size_limit, - std::function _progress_callback = []() {}) + CopyOrCreateWorkItem(std::string _src_path, std::string _dst_path, + std::string _contents, Env* _src_env, Env* _dst_env, + bool _sync, RateLimiter* _rate_limiter, + uint64_t _size_limit, + std::function _progress_callback = []() {}) : src_path(std::move(_src_path)), dst_path(std::move(_dst_path)), + contents(std::move(_contents)), src_env(_src_env), dst_env(_dst_env), sync(_sync), @@ -330,21 +355,23 @@ class BackupEngineImpl : public BackupEngine { progress_callback(_progress_callback) {} }; - struct BackupAfterCopyWorkItem { - std::future result; + struct BackupAfterCopyOrCreateWorkItem { + std::future result; bool shared; bool needed_to_copy; Env* backup_env; std::string dst_path_tmp; std::string dst_path; std::string dst_relative; - BackupAfterCopyWorkItem() {} + BackupAfterCopyOrCreateWorkItem() {} - BackupAfterCopyWorkItem(BackupAfterCopyWorkItem&& o) ROCKSDB_NOEXCEPT { + BackupAfterCopyOrCreateWorkItem(BackupAfterCopyOrCreateWorkItem&& o) + ROCKSDB_NOEXCEPT { *this = std::move(o); } - BackupAfterCopyWorkItem& operator=(BackupAfterCopyWorkItem&& o) ROCKSDB_NOEXCEPT { + BackupAfterCopyOrCreateWorkItem& operator=( + BackupAfterCopyOrCreateWorkItem&& o) ROCKSDB_NOEXCEPT { result = std::move(o.result); shared = o.shared; needed_to_copy = o.needed_to_copy; @@ -355,10 +382,11 @@ class BackupEngineImpl : public BackupEngine { return *this; } - BackupAfterCopyWorkItem(std::future&& _result, bool _shared, - bool _needed_to_copy, Env* _backup_env, - std::string _dst_path_tmp, std::string _dst_path, - std::string _dst_relative) + BackupAfterCopyOrCreateWorkItem(std::future&& _result, + bool _shared, bool _needed_to_copy, + Env* _backup_env, std::string _dst_path_tmp, + std::string _dst_path, + std::string _dst_relative) : result(std::move(_result)), shared(_shared), 
needed_to_copy(_needed_to_copy), @@ -368,18 +396,20 @@ class BackupEngineImpl : public BackupEngine { dst_relative(std::move(_dst_relative)) {} }; - struct RestoreAfterCopyWorkItem { - std::future result; + struct RestoreAfterCopyOrCreateWorkItem { + std::future result; uint32_t checksum_value; - RestoreAfterCopyWorkItem() {} - RestoreAfterCopyWorkItem(std::future&& _result, - uint32_t _checksum_value) + RestoreAfterCopyOrCreateWorkItem() {} + RestoreAfterCopyOrCreateWorkItem(std::future&& _result, + uint32_t _checksum_value) : result(std::move(_result)), checksum_value(_checksum_value) {} - RestoreAfterCopyWorkItem(RestoreAfterCopyWorkItem&& o) ROCKSDB_NOEXCEPT { + RestoreAfterCopyOrCreateWorkItem(RestoreAfterCopyOrCreateWorkItem&& o) + ROCKSDB_NOEXCEPT { *this = std::move(o); } - RestoreAfterCopyWorkItem& operator=(RestoreAfterCopyWorkItem&& o) ROCKSDB_NOEXCEPT { + RestoreAfterCopyOrCreateWorkItem& operator=( + RestoreAfterCopyOrCreateWorkItem&& o) ROCKSDB_NOEXCEPT { result = std::move(o.result); checksum_value = o.checksum_value; return *this; @@ -388,17 +418,27 @@ class BackupEngineImpl : public BackupEngine { bool initialized_; std::mutex byte_report_mutex_; - channel files_to_copy_; + channel files_to_copy_or_create_; std::vector threads_; + // Adds a file to the backup work queue to be copied or created if it doesn't + // already exist. + // + // Exactly one of src_dir and contents must be non-empty. + // + // @param src_dir If non-empty, the file in this directory named fname will be + // copied. + // @param fname Name of destination file and, in case of copy, source file. + // @param contents If non-empty, the file will be created with these contents. 
Status AddBackupFileWorkItem( std::unordered_set& live_dst_paths, - std::vector& backup_items_to_finish, + std::vector& backup_items_to_finish, BackupID backup_id, bool shared, const std::string& src_dir, - const std::string& src_fname, // starts with "/" - RateLimiter* rate_limiter, uint64_t size_limit = 0, + const std::string& fname, // starts with "/" + RateLimiter* rate_limiter, uint64_t size_bytes, uint64_t size_limit = 0, bool shared_checksum = false, - std::function progress_callback = []() {}); + std::function progress_callback = []() {}, + const std::string& contents = std::string()); // backup state data BackupID latest_backup_id_; @@ -451,7 +491,7 @@ BackupEngineImpl::BackupEngineImpl(Env* db_env, read_only_(read_only) {} BackupEngineImpl::~BackupEngineImpl() { - files_to_copy_.sendEof(); + files_to_copy_or_create_.sendEof(); for (auto& t : threads_) { t.join(); } @@ -541,10 +581,18 @@ Status BackupEngineImpl::Initialize() { return s; } } else { // Load data from storage + std::unordered_map abs_path_to_size; + for (const auto& rel_dir : + {GetSharedFileRel(), GetSharedFileWithChecksumRel()}) { + const auto abs_dir = GetAbsolutePath(rel_dir); + InsertPathnameToSizeBytes(abs_dir, &abs_path_to_size); + } // load the backups if any for (auto& backup : backups_) { - Status s = backup.second->LoadFromFile( - options_.backup_dir, options_.use_file_size_in_file_name); + InsertPathnameToSizeBytes( + GetAbsolutePath(GetPrivateFileRel(backup.first)), &abs_path_to_size); + Status s = + backup.second->LoadFromFile(options_.backup_dir, abs_path_to_size); if (!s.ok()) { Log(options_.info_log, "Backup %u corrupted -- %s", backup.first, s.ToString().c_str()); @@ -571,17 +619,18 @@ Status BackupEngineImpl::Initialize() { } } - // set up threads perform copies from files_to_copy_ in the background + // set up threads perform copies from files_to_copy_or_create_ in the + // background for (int t = 0; t < options_.max_background_operations; t++) { 
threads_.emplace_back([&]() { - CopyWorkItem work_item; - while (files_to_copy_.read(work_item)) { - CopyResult result; - result.status = - CopyFile(work_item.src_path, work_item.dst_path, work_item.src_env, - work_item.dst_env, work_item.sync, work_item.rate_limiter, - &result.size, &result.checksum_value, work_item.size_limit, - work_item.progress_callback); + CopyOrCreateWorkItem work_item; + while (files_to_copy_or_create_.read(work_item)) { + CopyOrCreateResult result; + result.status = CopyOrCreateFile( + work_item.src_path, work_item.dst_path, work_item.contents, + work_item.src_env, work_item.dst_env, work_item.sync, + work_item.rate_limiter, &result.size, &result.checksum_value, + work_item.size_limit, work_item.progress_callback); work_item.result.set_value(std::move(result)); } }); @@ -616,6 +665,8 @@ Status BackupEngineImpl::CreateNewBackup( db->EnableFileDeletions(false); return s; } + TEST_SYNC_POINT("BackupEngineImpl::CreateNewBackup:SavedLiveFiles1"); + TEST_SYNC_POINT("BackupEngineImpl::CreateNewBackup:SavedLiveFiles2"); BackupID new_backup_id = latest_backup_id_ + 1; assert(backups_.find(new_backup_id) == backups_.end()); @@ -650,8 +701,15 @@ Status BackupEngineImpl::CreateNewBackup( std::unordered_set live_dst_paths; live_dst_paths.reserve(live_files.size() + live_wal_files.size()); - std::vector backup_items_to_finish; - // Add a CopyWorkItem to the channel for each live file + // Pre-fetch sizes for data files + std::unordered_map data_path_to_size; + if (s.ok()) { + s = InsertPathnameToSizeBytes(db->GetName(), &data_path_to_size); + } + + std::vector backup_items_to_finish; + // Add a CopyOrCreateWorkItem to the channel for each live file + std::string manifest_fname, current_fname; for (size_t i = 0; s.ok() && i < live_files.size(); ++i) { uint64_t number; FileType type; @@ -663,6 +721,21 @@ Status BackupEngineImpl::CreateNewBackup( // we should only get sst, manifest and current files here assert(type == kTableFile || type == kDescriptorFile 
|| type == kCurrentFile); + if (type == kCurrentFile) { + // We will craft the current file manually to ensure it's consistent with + // the manifest number. This is necessary because current's file contents + // can change during backup. + current_fname = live_files[i]; + continue; + } else if (type == kDescriptorFile) { + manifest_fname = live_files[i]; + } + + auto data_path_to_size_iter = + data_path_to_size.find(db->GetName() + live_files[i]); + uint64_t size_bytes = data_path_to_size_iter == data_path_to_size.end() + ? port::kMaxUint64 + : data_path_to_size_iter->second; // rules: // * if it's kTableFile, then it's shared @@ -670,23 +743,47 @@ Status BackupEngineImpl::CreateNewBackup( s = AddBackupFileWorkItem( live_dst_paths, backup_items_to_finish, new_backup_id, options_.share_table_files && type == kTableFile, db->GetName(), - live_files[i], rate_limiter.get(), + live_files[i], rate_limiter.get(), size_bytes, (type == kDescriptorFile) ? manifest_file_size : 0, options_.share_files_with_checksum && type == kTableFile, progress_callback); } - // Add a CopyWorkItem to the channel for each WAL file + if (s.ok() && !current_fname.empty() && !manifest_fname.empty()) { + // Write the current file with the manifest filename as its contents. 
+ s = AddBackupFileWorkItem( + live_dst_paths, backup_items_to_finish, new_backup_id, + false /* shared */, "" /* src_dir */, CurrentFileName(""), + rate_limiter.get(), manifest_fname.size(), 0 /* size_limit */, + false /* shared_checksum */, progress_callback, + manifest_fname.substr(1) + "\n"); + } + + // Pre-fetch sizes for WAL files + std::unordered_map wal_path_to_size; + if (s.ok()) { + if (db->GetOptions().wal_dir != "") { + s = InsertPathnameToSizeBytes(db->GetOptions().wal_dir, + &wal_path_to_size); + } else { + wal_path_to_size = std::move(data_path_to_size); + } + } + + // Add a CopyOrCreateWorkItem to the channel for each WAL file for (size_t i = 0; s.ok() && i < live_wal_files.size(); ++i) { + auto wal_path_to_size_iter = + wal_path_to_size.find(live_wal_files[i]->PathName()); + uint64_t size_bytes = wal_path_to_size_iter == wal_path_to_size.end() + ? port::kMaxUint64 + : wal_path_to_size_iter->second; if (live_wal_files[i]->Type() == kAliveLogFile) { // we only care about live log files // copy the file into backup_dir/files// - s = AddBackupFileWorkItem(live_dst_paths, - backup_items_to_finish, - new_backup_id, - false, /* not shared */ + s = AddBackupFileWorkItem(live_dst_paths, backup_items_to_finish, + new_backup_id, false, /* not shared */ db->GetOptions().wal_dir, live_wal_files[i]->PathName(), - rate_limiter.get()); + rate_limiter.get(), size_bytes); } } @@ -938,7 +1035,7 @@ Status BackupEngineImpl::RestoreDBFromBackup( copy_file_buffer_size_ = rate_limiter->GetSingleBurstBytes(); } Status s; - std::vector restore_items_to_finish; + std::vector restore_items_to_finish; for (const auto& file_info : backup->GetFiles()) { const std::string &file = file_info->filename; std::string dst; @@ -968,18 +1065,15 @@ Status BackupEngineImpl::RestoreDBFromBackup( "/" + dst; Log(options_.info_log, "Restoring %s to %s\n", file.c_str(), dst.c_str()); - CopyWorkItem copy_work_item(GetAbsolutePath(file), - dst, - backup_env_, - db_env_, - false, - 
rate_limiter.get(), - 0 /* size_limit */); - RestoreAfterCopyWorkItem after_copy_work_item( - copy_work_item.result.get_future(), - file_info->checksum_value); - files_to_copy_.write(std::move(copy_work_item)); - restore_items_to_finish.push_back(std::move(after_copy_work_item)); + CopyOrCreateWorkItem copy_or_create_work_item( + GetAbsolutePath(file), dst, "" /* contents */, backup_env_, db_env_, + false, rate_limiter.get(), 0 /* size_limit */); + RestoreAfterCopyOrCreateWorkItem after_copy_or_create_work_item( + copy_or_create_work_item.result.get_future(), + file_info->checksum_value); + files_to_copy_or_create_.write(std::move(copy_or_create_work_item)); + restore_items_to_finish.push_back( + std::move(after_copy_or_create_work_item)); } Status item_status; for (auto& item : restore_items_to_finish) { @@ -1020,21 +1114,20 @@ Status BackupEngineImpl::VerifyBackup(BackupID backup_id) { Log(options_.info_log, "Verifying backup id %u\n", backup_id); - uint64_t size; - Status result; - std::string file_path; + std::unordered_map curr_abs_path_to_size; + for (const auto& rel_dir : {GetPrivateFileRel(backup_id), GetSharedFileRel(), + GetSharedFileWithChecksumRel()}) { + const auto abs_dir = GetAbsolutePath(rel_dir); + InsertPathnameToSizeBytes(abs_dir, &curr_abs_path_to_size); + } + for (const auto& file_info : backup->GetFiles()) { - const std::string& file = file_info->filename; - file_path = GetAbsolutePath(file); - result = backup_env_->FileExists(file_path); - if (!result.ok()) { - return result; + const auto abs_path = GetAbsolutePath(file_info->filename); + if (curr_abs_path_to_size.find(abs_path) == curr_abs_path_to_size.end()) { + return Status::NotFound("File missing: " + abs_path); } - result = backup_env_->GetFileSize(file_path, &size); - if (!result.ok()) { - return result; - } else if (size != file_info->size) { - return Status::Corruption("File corrupted: " + file); + if (file_info->size != curr_abs_path_to_size[abs_path]) { + return 
Status::Corruption("File corrupted: " + abs_path); } } return Status::OK(); @@ -1078,12 +1171,12 @@ Status BackupEngineImpl::PutLatestBackupFileContents(uint32_t latest_backup) { return s; } -Status BackupEngineImpl::CopyFile(const std::string& src, - const std::string& dst, Env* src_env, - Env* dst_env, bool sync, - RateLimiter* rate_limiter, uint64_t* size, - uint32_t* checksum_value, uint64_t size_limit, - std::function progress_callback) { +Status BackupEngineImpl::CopyOrCreateFile( + const std::string& src, const std::string& dst, const std::string& contents, + Env* src_env, Env* dst_env, bool sync, RateLimiter* rate_limiter, + uint64_t* size, uint32_t* checksum_value, uint64_t size_limit, + std::function progress_callback) { + assert(src.empty() != contents.empty()); Status s; unique_ptr dst_file; unique_ptr src_file; @@ -1102,9 +1195,9 @@ Status BackupEngineImpl::CopyFile(const std::string& src, size_limit = std::numeric_limits::max(); } - s = src_env->NewSequentialFile(src, &src_file, env_options); - if (s.ok()) { - s = dst_env->NewWritableFile(dst, &dst_file, env_options); + s = dst_env->NewWritableFile(dst, &dst_file, env_options); + if (s.ok() && !src.empty()) { + s = src_env->NewSequentialFile(src, &src_file, env_options); } if (!s.ok()) { return s; @@ -1112,19 +1205,28 @@ Status BackupEngineImpl::CopyFile(const std::string& src, unique_ptr dest_writer( new WritableFileWriter(std::move(dst_file), env_options)); - unique_ptr src_reader( - new SequentialFileReader(std::move(src_file))); - unique_ptr buf(new char[copy_file_buffer_size_]); - Slice data; + unique_ptr src_reader; + unique_ptr buf; + if (!src.empty()) { + src_reader.reset(new SequentialFileReader(std::move(src_file))); + buf.reset(new char[copy_file_buffer_size_]); + } + Slice data; uint64_t processed_buffer_size = 0; do { if (stop_backup_.load(std::memory_order_acquire)) { return Status::Incomplete("Backup stopped"); } - size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ? 
- copy_file_buffer_size_ : size_limit; - s = src_reader->Read(buffer_to_read, &data, buf.get()); + if (!src.empty()) { + size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) + ? copy_file_buffer_size_ + : size_limit; + s = src_reader->Read(buffer_to_read, &data, buf.get()); + processed_buffer_size += buffer_to_read; + } else { + data = contents; + } size_limit -= data.size(); if (!s.ok()) { @@ -1135,57 +1237,54 @@ Status BackupEngineImpl::CopyFile(const std::string& src, *size += data.size(); } if (checksum_value != nullptr) { - *checksum_value = crc32c::Extend(*checksum_value, data.data(), - data.size()); + *checksum_value = + crc32c::Extend(*checksum_value, data.data(), data.size()); } s = dest_writer->Append(data); if (rate_limiter != nullptr) { rate_limiter->Request(data.size(), Env::IO_LOW); } - processed_buffer_size += buffer_to_read; if (processed_buffer_size > options_.callback_trigger_interval_size) { processed_buffer_size -= options_.callback_trigger_interval_size; std::lock_guard lock(byte_report_mutex_); progress_callback(); } - } while (s.ok() && data.size() > 0 && size_limit > 0); + } while (s.ok() && contents.empty() && data.size() > 0 && size_limit > 0); if (s.ok() && sync) { s = dest_writer->Sync(false); } - return s; } -// src_fname will always start with "/" +// fname will always start with "/" Status BackupEngineImpl::AddBackupFileWorkItem( std::unordered_set& live_dst_paths, - std::vector& backup_items_to_finish, + std::vector& backup_items_to_finish, BackupID backup_id, bool shared, const std::string& src_dir, - const std::string& src_fname, RateLimiter* rate_limiter, + const std::string& fname, RateLimiter* rate_limiter, uint64_t size_bytes, uint64_t size_limit, bool shared_checksum, - std::function progress_callback) { - assert(src_fname.size() > 0 && src_fname[0] == '/'); - std::string dst_relative = src_fname.substr(1); + std::function progress_callback, const std::string& contents) { + assert(!fname.empty() && fname[0] == '/'); + 
assert(contents.empty() != src_dir.empty()); + + std::string dst_relative = fname.substr(1); std::string dst_relative_tmp; Status s; - uint64_t size; uint32_t checksum_value = 0; if (shared && shared_checksum) { // add checksum and file length to the file name - s = CalculateChecksum(src_dir + src_fname, - db_env_, - size_limit, + s = CalculateChecksum(src_dir + fname, db_env_, size_limit, &checksum_value); - if (s.ok()) { - s = db_env_->GetFileSize(src_dir + src_fname, &size); - } if (!s.ok()) { - return s; + return s; } - dst_relative = GetSharedFileWithChecksum(dst_relative, checksum_value, - size); + if (size_bytes == port::kMaxUint64) { + return Status::NotFound("File missing: " + src_dir + fname); + } + dst_relative = + GetSharedFileWithChecksum(dst_relative, checksum_value, size_bytes); dst_relative_tmp = GetSharedFileWithChecksumRel(dst_relative, true); dst_relative = GetSharedFileWithChecksumRel(dst_relative, false); } else if (shared) { @@ -1218,12 +1317,14 @@ Status BackupEngineImpl::AddBackupFileWorkItem( } } - if (shared && (same_path || file_exists)) { + if (!contents.empty()) { + need_to_copy = false; + } else if (shared && (same_path || file_exists)) { need_to_copy = false; if (shared_checksum) { Log(options_.info_log, "%s already present, with checksum %u and size %" PRIu64, - src_fname.c_str(), checksum_value, size); + fname.c_str(), checksum_value, size_bytes); } else if (backuped_file_infos_.find(dst_relative) == backuped_file_infos_.end() && !same_path) { // file already exists, but it's not referenced by any backup. overwrite @@ -1231,50 +1332,40 @@ Status BackupEngineImpl::AddBackupFileWorkItem( Log(options_.info_log, "%s already present, but not referenced by any backup. 
We will " "overwrite the file.", - src_fname.c_str()); + fname.c_str()); need_to_copy = true; backup_env_->DeleteFile(dst_path); } else { // the file is present and referenced by a backup - db_env_->GetFileSize(src_dir + src_fname, &size); // Ignore error Log(options_.info_log, "%s already present, calculate checksum", - src_fname.c_str()); - s = CalculateChecksum(src_dir + src_fname, db_env_, size_limit, + fname.c_str()); + s = CalculateChecksum(src_dir + fname, db_env_, size_limit, &checksum_value); } } live_dst_paths.insert(dst_path); - if (need_to_copy) { - Log(options_.info_log, "Copying %s to %s", src_fname.c_str(), - dst_path_tmp.c_str()); - CopyWorkItem copy_work_item(src_dir + src_fname, dst_path_tmp, db_env_, - backup_env_, options_.sync, rate_limiter, - size_limit, progress_callback); - BackupAfterCopyWorkItem after_copy_work_item( - copy_work_item.result.get_future(), - shared, - need_to_copy, - backup_env_, - dst_path_tmp, - dst_path, - dst_relative); - files_to_copy_.write(std::move(copy_work_item)); - backup_items_to_finish.push_back(std::move(after_copy_work_item)); + if (!contents.empty() || need_to_copy) { + Log(options_.info_log, "Copying %s to %s", fname.c_str(), + dst_path_tmp.c_str()); + CopyOrCreateWorkItem copy_or_create_work_item( + src_dir.empty() ? 
"" : src_dir + fname, dst_path_tmp, contents, db_env_, + backup_env_, options_.sync, rate_limiter, size_limit, + progress_callback); + BackupAfterCopyOrCreateWorkItem after_copy_or_create_work_item( + copy_or_create_work_item.result.get_future(), shared, need_to_copy, + backup_env_, dst_path_tmp, dst_path, dst_relative); + files_to_copy_or_create_.write(std::move(copy_or_create_work_item)); + backup_items_to_finish.push_back(std::move(after_copy_or_create_work_item)); } else { - std::promise promise_result; - BackupAfterCopyWorkItem after_copy_work_item( - promise_result.get_future(), - shared, - need_to_copy, - backup_env_, - dst_path_tmp, - dst_path, - dst_relative); - backup_items_to_finish.push_back(std::move(after_copy_work_item)); - CopyResult result; + std::promise promise_result; + BackupAfterCopyOrCreateWorkItem after_copy_or_create_work_item( + promise_result.get_future(), shared, need_to_copy, backup_env_, + dst_path_tmp, dst_path, dst_relative); + backup_items_to_finish.push_back(std::move(after_copy_or_create_work_item)); + CopyOrCreateResult result; result.status = s; - result.size = size; + result.size = size_bytes; result.checksum_value = checksum_value; promise_result.set_value(std::move(result)); } @@ -1340,6 +1431,22 @@ void BackupEngineImpl::DeleteChildren(const std::string& dir, } } +Status BackupEngineImpl::InsertPathnameToSizeBytes( + const std::string& dir, std::unordered_map* result) { + assert(result != nullptr); + std::vector files_attrs; + Status status = backup_env_->GetChildrenFileAttributes(dir, &files_attrs); + if (!status.ok()) { + return status; + } + const bool slash_needed = dir.empty() || dir.back() != '/'; + for (const auto& file_attrs : files_attrs) { + result->emplace(dir + (slash_needed ? 
"/" : "") + file_attrs.name, + file_attrs.size_bytes); + } + return Status::OK(); +} + Status BackupEngineImpl::GarbageCollect() { assert(!read_only_); Log(options_.info_log, "Starting garbage collection"); @@ -1455,55 +1562,6 @@ Status BackupEngineImpl::BackupMeta::Delete(bool delete_meta) { return s; } -namespace { -bool ParseStrToUint64(const std::string& str, uint64_t* out) { - try { - unsigned long ul = std::stoul(str); - *out = static_cast(ul); - return true; - } catch (const std::invalid_argument&) { - return false; - } catch (const std::out_of_range&) { - return false; - } -} - -// Parse file name in the format of -// "shared_checksum/__.sst, and fill `size` with -// the parsed part. -// Will also accept only name part, or a file path in URL format. -// if file name doesn't have the extension of "sst", or doesn't have '_' as a -// part of the file name, or we can't parse a number from the sub string -// between the last '_' and '.', return false. -bool GetFileSizeFromBackupFileName(const std::string full_name, - uint64_t* size) { - auto dot_pos = full_name.find_last_of('.'); - if (dot_pos == std::string::npos) { - return false; - } - if (full_name.substr(dot_pos + 1) != "sst") { - return false; - } - auto last_underscore_pos = full_name.find_last_of('_'); - if (last_underscore_pos == std::string::npos) { - return false; - } - if (dot_pos <= last_underscore_pos + 2) { - return false; - } - return ParseStrToUint64(full_name.substr(last_underscore_pos + 1, - dot_pos - last_underscore_pos - 1), - size); -} -} // namespace - -namespace test { -bool TEST_GetFileSizeFromBackupFileName(const std::string full_name, - uint64_t* size) { - return GetFileSizeFromBackupFileName(full_name, size); -} -} // namespace test - // each backup meta file is of the format: // // @@ -1511,8 +1569,9 @@ bool TEST_GetFileSizeFromBackupFileName(const std::string full_name, // // // ... 
-Status BackupEngineImpl::BackupMeta::LoadFromFile(const std::string& backup_dir, - bool use_size_in_file_name) { +Status BackupEngineImpl::BackupMeta::LoadFromFile( + const std::string& backup_dir, + const std::unordered_map& abs_path_to_size) { assert(Empty()); Status s; unique_ptr backup_meta_file; @@ -1554,12 +1613,11 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile(const std::string& backup_dir, if (file_info) { size = file_info->size; } else { - if (!use_size_in_file_name || - !GetFileSizeFromBackupFileName(filename, &size)) { - s = env_->GetFileSize(backup_dir + "/" + filename, &size); - if (!s.ok()) { - return s; - } + std::string abs_path = backup_dir + "/" + filename; + try { + size = abs_path_to_size.at(abs_path); + } catch (std::out_of_range&) { + return Status::NotFound("Size missing for pathname: " + abs_path); } } diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index a41b4094f..0a23f4df0 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -13,6 +13,8 @@ #include #include +#include "db/db_impl.h" +#include "db/filename.h" #include "port/port.h" #include "port/stack_trace.h" #include "rocksdb/types.h" @@ -23,10 +25,9 @@ #include "util/random.h" #include "util/mutexlock.h" #include "util/string_util.h" +#include "util/sync_point.h" #include "util/testutil.h" -#include "util/auto_roll_logger.h" #include "util/mock_env.h" -#include "utilities/backupable/backupable_db_testutil.h" namespace rocksdb { @@ -231,6 +232,23 @@ class TestEnv : public EnvWrapper { return EnvWrapper::GetChildren(dir, r); } + // Some test cases do not actually create the test files (e.g., see + // DummyDB::live_files_) - for those cases, we mock those files' attributes + // so CreateNewBackup() can get their attributes. + void SetFilenamesForMockedAttrs(const std::vector& filenames) { + filenames_for_mocked_attrs_ = filenames; + } + Status GetChildrenFileAttributes( + const std::string& dir, std::vector* r) override { + if (filenames_for_mocked_attrs_.size() > 0) { + for (const auto& filename : filenames_for_mocked_attrs_) { + r->push_back({dir + filename, 10 /* size_bytes */}); + } + return Status::OK(); + } + return EnvWrapper::GetChildrenFileAttributes(dir, r); + } + void SetCreateDirIfMissingFailure(bool fail) { create_dir_if_missing_failure_ = fail; } @@ -254,6 +272,7 @@ class TestEnv : public EnvWrapper { port::Mutex mutex_; bool dummy_sequential_file_ = false; std::vector written_files_; + std::vector filenames_for_mocked_attrs_; uint64_t limit_written_files_ = 1000000; uint64_t limit_delete_files_ = 1000000; @@ -544,6 +563,10 @@ class BackupableDBTest : public testing::Test { std::string dbname_; std::string backupdir_; + // logger_ must be above backup_engine_ such that the engine's destructor, + // which uses a raw pointer to the logger, executes first. 
+ std::shared_ptr logger_; + // envs Env* env_; unique_ptr mock_env_; @@ -558,7 +581,6 @@ class BackupableDBTest : public testing::Test { // options Options options_; - std::shared_ptr logger_; protected: unique_ptr backupable_options_; @@ -574,8 +596,7 @@ class BackupableDBTestWithParam : public BackupableDBTest, public testing::WithParamInterface { public: BackupableDBTestWithParam() { - backupable_options_->share_files_with_checksum = - backupable_options_->use_file_size_in_file_name = GetParam(); + backupable_options_->share_files_with_checksum = GetParam(); } }; @@ -724,47 +745,6 @@ TEST_P(BackupableDBTestWithParam, OnlineIntegrationTest) { INSTANTIATE_TEST_CASE_P(BackupableDBTestWithParam, BackupableDBTestWithParam, ::testing::Bool()); -TEST_F(BackupableDBTest, GetFileSizeFromBackupFileName) { - uint64_t size = 0; - - ASSERT_TRUE(test::TEST_GetFileSizeFromBackupFileName( - "shared_checksum/6580354_1874793674_65806675.sst", &size)); - ASSERT_EQ(65806675u, size); - - ASSERT_TRUE(test::TEST_GetFileSizeFromBackupFileName( - "hdfs://a.b:80/a/b/shared_checksum/6580354_1874793674_85806675.sst", - &size)); - ASSERT_EQ(85806675u, size); - - ASSERT_TRUE(test::TEST_GetFileSizeFromBackupFileName( - "6580354_1874793674_65806665.sst", &size)); - ASSERT_EQ(65806665u, size); - - ASSERT_TRUE(test::TEST_GetFileSizeFromBackupFileName( - "private/66/6580354_1874793674_65806666.sst", &size)); - ASSERT_EQ(65806666u, size); - - ASSERT_TRUE(!test::TEST_GetFileSizeFromBackupFileName( - "shared_checksum/6580354.sst", &size)); - - ASSERT_TRUE(!test::TEST_GetFileSizeFromBackupFileName( - "private/368/6592388.log", &size)); - - ASSERT_TRUE(!test::TEST_GetFileSizeFromBackupFileName( - "private/68/MANIFEST-6586581", &size)); - - ASSERT_TRUE( - !test::TEST_GetFileSizeFromBackupFileName("private/68/CURRENT", &size)); - - ASSERT_TRUE(!test::TEST_GetFileSizeFromBackupFileName( - "shared_checksum/6580354_1874793674_65806675.log", &size)); - - 
ASSERT_TRUE(!test::TEST_GetFileSizeFromBackupFileName( - "shared_checksum/6580354_1874793674_65806675", &size)); - - ASSERT_TRUE(!test::TEST_GetFileSizeFromBackupFileName("meta/368", &size)); -} - // this will make sure that backup does not copy the same file twice TEST_F(BackupableDBTest, NoDoubleCopy) { OpenDBAndBackupEngine(true, true); @@ -776,6 +756,7 @@ TEST_F(BackupableDBTest, NoDoubleCopy) { dummy_db_->live_files_ = { "/00010.sst", "/00011.sst", "/CURRENT", "/MANIFEST-01" }; dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}}; + test_backup_env_->SetFilenamesForMockedAttrs(dummy_db_->live_files_); ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false)); std::vector should_have_written = { "/shared/00010.sst.tmp", "/shared/00011.sst.tmp", @@ -792,6 +773,7 @@ TEST_F(BackupableDBTest, NoDoubleCopy) { dummy_db_->live_files_ = { "/00010.sst", "/00015.sst", "/CURRENT", "/MANIFEST-01" }; dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}}; + test_backup_env_->SetFilenamesForMockedAttrs(dummy_db_->live_files_); ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false)); // should not open 00010.sst - it's already there should_have_written = { @@ -842,6 +824,7 @@ TEST_F(BackupableDBTest, DifferentEnvs) { dummy_db_->live_files_ = { "/00010.sst", "/00011.sst", "/CURRENT", "/MANIFEST-01" }; dummy_db_->wal_files_ = {{"/00011.log", true}, {"/00012.log", false}}; + test_backup_env_->SetFilenamesForMockedAttrs(dummy_db_->live_files_); ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false)); CloseDBAndBackupEngine(); @@ -853,6 +836,7 @@ TEST_F(BackupableDBTest, DifferentEnvs) { CloseDBAndBackupEngine(); DestroyDB(dbname_, Options()); + test_backup_env_->SetFilenamesForMockedAttrs({}); AssertBackupConsistency(0, 0, 100, 500); } @@ -1314,6 +1298,45 @@ TEST_F(BackupableDBTest, EnvFailures) { } } +// Verify manifest can roll while a backup is being created with the old +// manifest. 
+TEST_F(BackupableDBTest, ChangeManifestDuringBackupCreation) { + DestroyDB(dbname_, Options()); + options_.max_manifest_file_size = 0; // always rollover manifest for file add + OpenDBAndBackupEngine(true); + FillDB(db_.get(), 0, 100); + + rocksdb::SyncPoint::GetInstance()->LoadDependency({ + {"BackupEngineImpl::CreateNewBackup:SavedLiveFiles1", + "VersionSet::LogAndApply:WriteManifest"}, + {"VersionSet::LogAndApply:WriteManifestDone", + "BackupEngineImpl::CreateNewBackup:SavedLiveFiles2"}, + }); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + std::thread flush_thread{[this]() { ASSERT_OK(db_->Flush(FlushOptions())); }}; + + ASSERT_OK(backup_engine_->CreateNewBackup(db_.get(), false)); + + flush_thread.join(); + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + + // The last manifest roll would've already been cleaned up by the full scan + // that happens when CreateNewBackup invokes EnableFileDeletions. We need to + // trigger another roll to verify non-full scan purges stale manifests. + DBImpl* db_impl = reinterpret_cast(db_.get()); + std::string prev_manifest_path = + DescriptorFileName(dbname_, db_impl->TEST_Current_Manifest_FileNo()); + FillDB(db_.get(), 0, 100); + ASSERT_OK(env_->FileExists(prev_manifest_path)); + ASSERT_OK(db_->Flush(FlushOptions())); + ASSERT_TRUE(env_->FileExists(prev_manifest_path).IsNotFound()); + + CloseDBAndBackupEngine(); + DestroyDB(dbname_, Options()); + AssertBackupConsistency(0, 0, 100); +} + // see https://github.com/facebook/rocksdb/issues/921 TEST_F(BackupableDBTest, Issue921Test) { BackupEngine* backup_engine; diff --git a/utilities/backupable/backupable_db_testutil.h b/utilities/backupable/backupable_db_testutil.h deleted file mode 100644 index 6c45f33ed..000000000 --- a/utilities/backupable/backupable_db_testutil.h +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
-// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -#pragma once -#ifndef ROCKSDB_LITE -#include - -namespace rocksdb { -namespace test { -extern bool TEST_GetFileSizeFromBackupFileName(const std::string full_name, - uint64_t* size); -} // namespace test -} // namespace rocksdb -#endif // ROCKSDB_LITE diff --git a/utilities/checkpoint/checkpoint.cc b/utilities/checkpoint/checkpoint.cc index 6e6fac004..b8543bb5b 100644 --- a/utilities/checkpoint/checkpoint.cc +++ b/utilities/checkpoint/checkpoint.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -24,6 +24,7 @@ #include "rocksdb/env.h" #include "rocksdb/transaction_log.h" #include "util/file_util.h" +#include "util/sync_point.h" #include "port/port.h" namespace rocksdb { @@ -76,7 +77,9 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir) { s = db_->DisableFileDeletions(); if (s.ok()) { // this will return live_files prefixed with "/" - s = db_->GetLiveFiles(live_files, &manifest_file_size, true); + s = db_->GetLiveFiles(live_files, &manifest_file_size); + TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles1"); + TEST_SYNC_POINT("CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"); } // if we have more than one column family, we need to also get WAL files if (s.ok()) { @@ -98,6 +101,7 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir) { s = db_->GetEnv()->CreateDir(full_private_path); // copy/hard link live_files + std::string manifest_fname, current_fname; for (size_t i = 0; s.ok() && i < live_files.size(); ++i) { uint64_t number; FileType type; @@ -110,6 +114,15 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir) { assert(type == kTableFile || type == kDescriptorFile || type == kCurrentFile); assert(live_files[i].size() > 0 && live_files[i][0] == '/'); + if (type == kCurrentFile) { + // We will craft the current file manually to ensure it's consistent with + // the manifest number. This is necessary because current's file contents + // can change during checkpoint creation. + current_fname = live_files[i]; + continue; + } else if (type == kDescriptorFile) { + manifest_fname = live_files[i]; + } std::string src_fname = live_files[i]; // rules: @@ -132,6 +145,10 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir) { (type == kDescriptorFile) ? 
manifest_file_size : 0); } } + if (s.ok() && !current_fname.empty() && !manifest_fname.empty()) { + s = CreateFile(db_->GetEnv(), full_private_path + current_fname, + manifest_fname.substr(1) + "\n"); + } Log(db_->GetOptions().info_log, "Number of log files %" ROCKSDB_PRIszt, live_wal_files.size()); diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc index 27c1beb5f..3336e5af5 100644 --- a/utilities/checkpoint/checkpoint_test.cc +++ b/utilities/checkpoint/checkpoint_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -346,6 +346,50 @@ TEST_F(DBTest, CheckpointCF) { ASSERT_OK(DestroyDB(snapshot_name, options)); } +TEST_F(DBTest, CurrentFileModifiedWhileCheckpointing) { + const std::string kSnapshotName = test::TmpDir(env_) + "/snapshot"; + ASSERT_OK(DestroyDB(kSnapshotName, CurrentOptions())); + env_->DeleteDir(kSnapshotName); + + Options options = CurrentOptions(); + options.max_manifest_file_size = 0; // always rollover manifest for file add + Reopen(options); + + rocksdb::SyncPoint::GetInstance()->LoadDependency( + {// Get past the flush in the checkpoint thread before adding any keys to + // the db so the checkpoint thread won't hit the WriteManifest + // syncpoints. + {"DBImpl::GetLiveFiles:1", + "DBTest::CurrentFileModifiedWhileCheckpointing:PrePut"}, + // Roll the manifest during checkpointing right after live files are + // snapshotted. 
+ {"CheckpointImpl::CreateCheckpoint:SavedLiveFiles1", + "VersionSet::LogAndApply:WriteManifest"}, + {"VersionSet::LogAndApply:WriteManifestDone", + "CheckpointImpl::CreateCheckpoint:SavedLiveFiles2"}}); + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + std::thread t([&]() { + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(kSnapshotName)); + delete checkpoint; + }); + TEST_SYNC_POINT("DBTest::CurrentFileModifiedWhileCheckpointing:PrePut"); + ASSERT_OK(Put("Default", "Default1")); + ASSERT_OK(Flush()); + t.join(); + + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); + + DB* snapshotDB; + // Successful Open() implies that CURRENT pointed to the manifest in the + // checkpoint. + ASSERT_OK(DB::Open(options, kSnapshotName, &snapshotDB)); + delete snapshotDB; + snapshotDB = nullptr; +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc b/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc index 4ef4edf92..ad9043755 100644 --- a/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc +++ b/utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h b/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h index ec9342d38..df303e8cd 100644 --- a/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h +++ b/utilities/compaction_filters/remove_emptyvalue_compactionfilter.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/convenience/info_log_finder.cc b/utilities/convenience/info_log_finder.cc index acdec5119..ecbdd7714 100644 --- a/utilities/convenience/info_log_finder.cc +++ b/utilities/convenience/info_log_finder.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/document/document_db.cc b/utilities/document/document_db.cc index 7f7bc781a..85330b123 100644 --- a/utilities/document/document_db.cc +++ b/utilities/document/document_db.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/utilities/document/document_db_test.cc b/utilities/document/document_db_test.cc index 03bebf48e..9c9da552b 100644 --- a/utilities/document/document_db_test.cc +++ b/utilities/document/document_db_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/document/json_document.cc b/utilities/document/json_document.cc index f26787b97..9cf110830 100644 --- a/utilities/document/json_document.cc +++ b/utilities/document/json_document.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/document/json_document_builder.cc b/utilities/document/json_document_builder.cc index 8cbccc832..812239588 100644 --- a/utilities/document/json_document_builder.cc +++ b/utilities/document/json_document_builder.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/utilities/document/json_document_test.cc b/utilities/document/json_document_test.cc index b9d6dcf0f..f8c11d9cf 100644 --- a/utilities/document/json_document_test.cc +++ b/utilities/document/json_document_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/flashcache/flashcache.cc b/utilities/flashcache/flashcache.cc index d50232440..3765300d3 100644 --- a/utilities/flashcache/flashcache.cc +++ b/utilities/flashcache/flashcache.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/flashcache/flashcache.h b/utilities/flashcache/flashcache.h index a8a3d7d13..94a3ed41f 100644 --- a/utilities/flashcache/flashcache.h +++ b/utilities/flashcache/flashcache.h @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/geodb/geodb_impl.cc b/utilities/geodb/geodb_impl.cc index b30dd6333..bd57ca1f9 100644 --- a/utilities/geodb/geodb_impl.cc +++ b/utilities/geodb/geodb_impl.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -19,6 +19,7 @@ #include "util/coding.h" #include "util/string_util.h" + // // There are two types of keys. The first type of key-values // maps a geo location to the set of object ids and their values. @@ -352,8 +353,8 @@ Status GeoDBImpl::searchQuadIds(const GeoPosition& position, Pixel bottomRight = PositionToPixel(bottomRightPos, Detail); // how many level of details to look for - int numberOfTilesAtMaxDepth = static_cast(floor((bottomRight.x - topLeft.x) / 256)); - int zoomLevelsToRise = static_cast(floor(::log(numberOfTilesAtMaxDepth) / ::log(2))); + int numberOfTilesAtMaxDepth = static_cast(std::floor((bottomRight.x - topLeft.x) / 256)); + int zoomLevelsToRise = static_cast(std::floor(std::log(numberOfTilesAtMaxDepth) / std::log(2))); zoomLevelsToRise++; int levels = std::max(0, Detail - zoomLevelsToRise); @@ -390,10 +391,10 @@ GeoDBImpl::Pixel GeoDBImpl::PositionToPixel(const GeoPosition& pos, double latitude = clip(pos.latitude, MinLatitude, MaxLatitude); double x = (pos.longitude + 180) / 360; double sinLatitude = sin(latitude * PI / 180); - double y = 0.5 - ::log((1 + sinLatitude) / (1 - sinLatitude)) / (4 * PI); + double y = 0.5 - std::log((1 + sinLatitude) / (1 - sinLatitude)) / (4 * PI); double mapSize = MapSize(levelOfDetail); - double X = floor(clip(x * mapSize + 0.5, 0, mapSize - 1)); - double Y = floor(clip(y * mapSize + 0.5, 0, mapSize - 1)); + double X = std::floor(clip(x * mapSize + 0.5, 0, mapSize - 1)); + double Y = std::floor(clip(y * mapSize + 0.5, 0, mapSize - 1)); return Pixel((unsigned int)X, (unsigned int)Y); } @@ -408,8 +409,8 @@ GeoPosition GeoDBImpl::PixelToPosition(const Pixel& pixel, int levelOfDetail) { // Converts a Pixel to a Tile 
GeoDBImpl::Tile GeoDBImpl::PixelToTile(const Pixel& pixel) { - unsigned int tileX = static_cast(floor(pixel.x / 256)); - unsigned int tileY = static_cast(floor(pixel.y / 256)); + unsigned int tileX = static_cast(std::floor(pixel.x / 256)); + unsigned int tileY = static_cast(std::floor(pixel.y / 256)); return Tile(tileX, tileY); } diff --git a/utilities/geodb/geodb_impl.h b/utilities/geodb/geodb_impl.h index d63102856..a61f1674a 100644 --- a/utilities/geodb/geodb_impl.h +++ b/utilities/geodb/geodb_impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/geodb/geodb_test.cc b/utilities/geodb/geodb_test.cc index 503332e44..91b3621a2 100644 --- a/utilities/geodb/geodb_test.cc +++ b/utilities/geodb/geodb_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/leveldb_options/leveldb_options.cc b/utilities/leveldb_options/leveldb_options.cc index cb7dfb8ea..cd12f3b50 100644 --- a/utilities/leveldb_options/leveldb_options.cc +++ b/utilities/leveldb_options/leveldb_options.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/memory/memory_test.cc b/utilities/memory/memory_test.cc index 079514736..5c97f78bd 100644 --- a/utilities/memory/memory_test.cc +++ b/utilities/memory/memory_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -93,7 +93,7 @@ class MemoryTest : public testing::Test { std::vector usage_history_[MemoryUtil::kNumUsageTypes]; }; -TEST_F(MemoryTest, SharedBlockCacheTotal) { +TEST_F(MemoryTest, DISABLED_SharedBlockCacheTotal) { std::vector dbs; std::vector usage_by_type; const int kNumDBs = 10; @@ -144,7 +144,7 @@ TEST_F(MemoryTest, SharedBlockCacheTotal) { } } -TEST_F(MemoryTest, MemTableAndTableReadersTotal) { +TEST_F(MemoryTest, DISABLED_MemTableAndTableReadersTotal) { std::vector dbs; std::vector usage_by_type; std::vector> vec_handles; diff --git a/utilities/memory/memory_util.cc b/utilities/memory/memory_util.cc index f5580174a..403f2f5ad 100644 --- a/utilities/memory/memory_util.cc +++ b/utilities/memory/memory_util.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/merge_operators.h b/utilities/merge_operators.h index fdf06645f..eb60ed5cc 100644 --- a/utilities/merge_operators.h +++ b/utilities/merge_operators.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. 
All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/merge_operators/put.cc b/utilities/merge_operators/put.cc index 333084313..04c1270b2 100644 --- a/utilities/merge_operators/put.cc +++ b/utilities/merge_operators/put.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/merge_operators/uint64add.cc b/utilities/merge_operators/uint64add.cc index 6024beb95..90eb3d4c6 100644 --- a/utilities/merge_operators/uint64add.cc +++ b/utilities/merge_operators/uint64add.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/options/options_util.cc b/utilities/options/options_util.cc index 1c6a068ac..2526c5690 100644 --- a/utilities/options/options_util.cc +++ b/utilities/options/options_util.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2014, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/options/options_util_test.cc b/utilities/options/options_util_test.cc index c6d8cdb5f..94ddbc408 100644 --- a/utilities/options/options_util_test.cc +++ b/utilities/options/options_util_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -173,8 +173,9 @@ TEST_F(OptionsUtilTest, SanityCheck) { (i == 0) ? kDefaultColumnFamilyName : test::RandomName(&rnd_, 10); cf_descs.back().options.table_factory.reset(NewBlockBasedTableFactory()); + // Assign non-null values to prefix_extractors except the first cf. cf_descs.back().options.prefix_extractor.reset( - test::RandomSliceTransform(&rnd_)); + i != 0 ? test::RandomSliceTransform(&rnd_) : nullptr); cf_descs.back().options.merge_operator.reset( test::RandomMergeOperator(&rnd_)); } @@ -223,9 +224,10 @@ TEST_F(OptionsUtilTest, SanityCheck) { std::shared_ptr prefix_extractor = cf_descs[1].options.prefix_extractor; + // It's okay to set prefix_extractor to nullptr. ASSERT_NE(prefix_extractor, nullptr); cf_descs[1].options.prefix_extractor.reset(); - ASSERT_NOK( + ASSERT_OK( CheckOptionsCompatibility(dbname_, Env::Default(), db_opt, cf_descs)); cf_descs[1].options.prefix_extractor.reset(new DummySliceTransform()); @@ -237,6 +239,27 @@ TEST_F(OptionsUtilTest, SanityCheck) { CheckOptionsCompatibility(dbname_, Env::Default(), db_opt, cf_descs)); } + // prefix extractor nullptr case + { + std::shared_ptr prefix_extractor = + cf_descs[0].options.prefix_extractor; + + // It's okay to set prefix_extractor to nullptr. 
+ ASSERT_EQ(prefix_extractor, nullptr); + cf_descs[0].options.prefix_extractor.reset(); + ASSERT_OK( + CheckOptionsCompatibility(dbname_, Env::Default(), db_opt, cf_descs)); + + // It's okay to change prefix_extractor from nullptr to non-nullptr + cf_descs[0].options.prefix_extractor.reset(new DummySliceTransform()); + ASSERT_OK( + CheckOptionsCompatibility(dbname_, Env::Default(), db_opt, cf_descs)); + + cf_descs[0].options.prefix_extractor = prefix_extractor; + ASSERT_OK( + CheckOptionsCompatibility(dbname_, Env::Default(), db_opt, cf_descs)); + } + // comparator { test::SimpleSuffixReverseComparator comparator; diff --git a/utilities/redis/redis_lists_test.cc b/utilities/redis/redis_lists_test.cc index 3ef35f75e..3c97be271 100644 --- a/utilities/redis/redis_lists_test.cc +++ b/utilities/redis/redis_lists_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/spatialdb/spatial_db.cc b/utilities/spatialdb/spatial_db.cc index 36c9ed188..e7943120c 100644 --- a/utilities/spatialdb/spatial_db.cc +++ b/utilities/spatialdb/spatial_db.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/utilities/spatialdb/spatial_db_test.cc b/utilities/spatialdb/spatial_db_test.cc index 41f3cd620..92c34c87d 100644 --- a/utilities/spatialdb/spatial_db_test.cc +++ b/utilities/spatialdb/spatial_db_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/spatialdb/utils.h b/utilities/spatialdb/utils.h index b8c664a92..d4dae0200 100644 --- a/utilities/spatialdb/utils.h +++ b/utilities/spatialdb/utils.h @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector.cc b/utilities/table_properties_collectors/compact_on_deletion_collector.cc index 2079ccb86..59eaf81c3 100644 --- a/utilities/table_properties_collectors/compact_on_deletion_collector.cc +++ b/utilities/table_properties_collectors/compact_on_deletion_collector.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector.h b/utilities/table_properties_collectors/compact_on_deletion_collector.h index 3001ce913..50d363cd3 100644 --- a/utilities/table_properties_collectors/compact_on_deletion_collector.h +++ b/utilities/table_properties_collectors/compact_on_deletion_collector.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc b/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc index 89e6bbcb8..ab16b37bb 100644 --- a/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc +++ b/utilities/table_properties_collectors/compact_on_deletion_collector_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/transactions/optimistic_transaction_db_impl.cc b/utilities/transactions/optimistic_transaction_db_impl.cc index ca9897211..190440242 100644 --- a/utilities/transactions/optimistic_transaction_db_impl.cc +++ b/utilities/transactions/optimistic_transaction_db_impl.cc @@ -1,15 +1,15 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. #ifndef ROCKSDB_LITE +#include "utilities/transactions/optimistic_transaction_db_impl.h" + #include #include -#include "utilities/transactions/optimistic_transaction_db_impl.h" - #include "db/db_impl.h" #include "rocksdb/db.h" #include "rocksdb/options.h" @@ -20,11 +20,13 @@ namespace rocksdb { Transaction* OptimisticTransactionDBImpl::BeginTransaction( const WriteOptions& write_options, - const OptimisticTransactionOptions& txn_options) { - Transaction* txn = - new OptimisticTransactionImpl(this, write_options, txn_options); - - return txn; + const OptimisticTransactionOptions& txn_options, Transaction* old_txn) { + if (old_txn != nullptr) { + ReinitializeTransaction(old_txn, write_options, txn_options); + return old_txn; + } else { + return new OptimisticTransactionImpl(this, write_options, txn_options); + } } Status OptimisticTransactionDB::Open(const Options& options, @@ -76,5 +78,14 @@ Status OptimisticTransactionDB::Open( return s; } +void OptimisticTransactionDBImpl::ReinitializeTransaction( + Transaction* txn, const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options) { + assert(dynamic_cast(txn) != nullptr); + auto txn_impl = reinterpret_cast(txn); + + txn_impl->Reinitialize(this, write_options, txn_options); +} + } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/utilities/transactions/optimistic_transaction_db_impl.h b/utilities/transactions/optimistic_transaction_db_impl.h index ec5b42823..e426a21be 100644 --- a/utilities/transactions/optimistic_transaction_db_impl.h +++ b/utilities/transactions/optimistic_transaction_db_impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. 
// This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -19,14 +19,19 @@ class OptimisticTransactionDBImpl : public OptimisticTransactionDB { ~OptimisticTransactionDBImpl() {} - Transaction* BeginTransaction( - const WriteOptions& write_options, - const OptimisticTransactionOptions& txn_options) override; + Transaction* BeginTransaction(const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options, + Transaction* old_txn) override; DB* GetBaseDB() override { return db_.get(); } private: std::unique_ptr db_; + + void ReinitializeTransaction(Transaction* txn, + const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options = + OptimisticTransactionOptions()); }; } // namespace rocksdb diff --git a/utilities/transactions/optimistic_transaction_impl.cc b/utilities/transactions/optimistic_transaction_impl.cc index 120f18ed8..2647b3dd7 100644 --- a/utilities/transactions/optimistic_transaction_impl.cc +++ b/utilities/transactions/optimistic_transaction_impl.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -28,11 +28,23 @@ OptimisticTransactionImpl::OptimisticTransactionImpl( OptimisticTransactionDB* txn_db, const WriteOptions& write_options, const OptimisticTransactionOptions& txn_options) : TransactionBaseImpl(txn_db->GetBaseDB(), write_options), txn_db_(txn_db) { + Initialize(txn_options); +} + +void OptimisticTransactionImpl::Initialize( + const OptimisticTransactionOptions& txn_options) { if (txn_options.set_snapshot) { SetSnapshot(); } } +void OptimisticTransactionImpl::Reinitialize( + OptimisticTransactionDB* txn_db, const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options) { + TransactionBaseImpl::Reinitialize(txn_db->GetBaseDB(), write_options); + Initialize(txn_options); +} + OptimisticTransactionImpl::~OptimisticTransactionImpl() { } @@ -54,7 +66,7 @@ Status OptimisticTransactionImpl::Commit() { } Status s = db_impl->WriteWithCallback( - write_options_, write_batch_->GetWriteBatch(), &callback); + write_options_, GetWriteBatch()->GetWriteBatch(), &callback); if (s.ok()) { Clear(); @@ -67,7 +79,8 @@ void OptimisticTransactionImpl::Rollback() { Clear(); } // Record this key so that we can check it for conflicts at commit time. Status OptimisticTransactionImpl::TryLock(ColumnFamilyHandle* column_family, - const Slice& key, bool untracked) { + const Slice& key, bool read_only, + bool untracked) { if (untracked) { return Status::OK(); } @@ -77,14 +90,14 @@ Status OptimisticTransactionImpl::TryLock(ColumnFamilyHandle* column_family, SequenceNumber seq; if (snapshot_) { - seq = snapshot_->snapshot()->GetSequenceNumber(); + seq = snapshot_->GetSequenceNumber(); } else { seq = db_->GetLatestSequenceNumber(); } std::string key_str = key.ToString(); - TrackKey(cfh_id, key_str, seq); + TrackKey(cfh_id, key_str, seq, read_only); // Always return OK. Confilct checking will happen at commit time. 
return Status::OK(); diff --git a/utilities/transactions/optimistic_transaction_impl.h b/utilities/transactions/optimistic_transaction_impl.h index a18561efd..4876a100d 100644 --- a/utilities/transactions/optimistic_transaction_impl.h +++ b/utilities/transactions/optimistic_transaction_impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -34,19 +34,25 @@ class OptimisticTransactionImpl : public TransactionBaseImpl { virtual ~OptimisticTransactionImpl(); + void Reinitialize(OptimisticTransactionDB* txn_db, + const WriteOptions& write_options, + const OptimisticTransactionOptions& txn_options); + Status Commit() override; void Rollback() override; protected: Status TryLock(ColumnFamilyHandle* column_family, const Slice& key, - bool untracked = false) override; + bool read_only, bool untracked = false) override; private: OptimisticTransactionDB* const txn_db_; friend class OptimisticTransactionCallback; + void Initialize(const OptimisticTransactionOptions& txn_options); + // Returns OK if it is safe to commit this transaction. Returns Status::Busy // if there are read or write conflicts that would prevent us from committing // OR if we can not determine whether there would be any such conflicts. @@ -56,6 +62,11 @@ class OptimisticTransactionImpl : public TransactionBaseImpl { void Clear() override; + void UnlockGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) override { + // Nothing to unlock. 
+ } + // No copying allowed OptimisticTransactionImpl(const OptimisticTransactionImpl&); void operator=(const OptimisticTransactionImpl&); @@ -71,6 +82,8 @@ class OptimisticTransactionCallback : public WriteCallback { return txn_->CheckTransactionForConflicts(db); } + bool AllowWriteBatching() override { return false; } + private: OptimisticTransactionImpl* txn_; }; diff --git a/utilities/transactions/optimistic_transaction_test.cc b/utilities/transactions/optimistic_transaction_test.cc index 6fe7e95f9..fd90f2423 100644 --- a/utilities/transactions/optimistic_transaction_test.cc +++ b/utilities/transactions/optimistic_transaction_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -6,12 +6,16 @@ #ifndef ROCKSDB_LITE #include +#include #include "rocksdb/db.h" -#include "rocksdb/utilities/transaction.h" #include "rocksdb/utilities/optimistic_transaction_db.h" +#include "rocksdb/utilities/transaction.h" +#include "util/crc32c.h" #include "util/logging.h" +#include "util/random.h" #include "util/testharness.h" +#include "util/transaction_test_util.h" using std::string; @@ -1114,6 +1118,226 @@ TEST_F(OptimisticTransactionTest, SavepointTest) { delete txn; } +TEST_F(OptimisticTransactionTest, UndoGetForUpdateTest) { + WriteOptions write_options; + ReadOptions read_options, snapshot_read_options; + OptimisticTransactionOptions txn_options; + string value; + Status s; + + db->Put(write_options, "A", ""); + + Transaction* txn1 = txn_db->BeginTransaction(write_options); + ASSERT_TRUE(txn1); + + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_OK(s); + + txn1->UndoGetForUpdate("A"); + + Transaction* txn2 = txn_db->BeginTransaction(write_options); + txn2->Put("A", "x"); + s = txn2->Commit(); + ASSERT_OK(s); + delete txn2; + + // Verify that txn1 can commit since A isn't conflict checked + s = txn1->Commit(); + ASSERT_OK(s); + delete txn1; + + txn1 = txn_db->BeginTransaction(write_options); + txn1->Put("A", "a"); + + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_OK(s); + + txn1->UndoGetForUpdate("A"); + + txn2 = txn_db->BeginTransaction(write_options); + txn2->Put("A", "x"); + s = txn2->Commit(); + ASSERT_OK(s); + delete txn2; + + // Verify that txn1 cannot commit since A will still be conflict checked + s = txn1->Commit(); + ASSERT_TRUE(s.IsBusy()); + delete txn1; + + txn1 = txn_db->BeginTransaction(write_options); + + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_OK(s); + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_OK(s); + + txn1->UndoGetForUpdate("A"); + + txn2 = txn_db->BeginTransaction(write_options); + txn2->Put("A", "x"); + s = txn2->Commit(); + ASSERT_OK(s); + delete txn2; + + // Verify 
that txn1 cannot commit since A will still be conflict checked + s = txn1->Commit(); + ASSERT_TRUE(s.IsBusy()); + delete txn1; + + txn1 = txn_db->BeginTransaction(write_options); + + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_OK(s); + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_OK(s); + + txn1->UndoGetForUpdate("A"); + txn1->UndoGetForUpdate("A"); + + txn2 = txn_db->BeginTransaction(write_options); + txn2->Put("A", "x"); + s = txn2->Commit(); + ASSERT_OK(s); + delete txn2; + + // Verify that txn1 can commit since A isn't conflict checked + s = txn1->Commit(); + ASSERT_OK(s); + delete txn1; + + txn1 = txn_db->BeginTransaction(write_options); + + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_OK(s); + + txn1->SetSavePoint(); + txn1->UndoGetForUpdate("A"); + + txn2 = txn_db->BeginTransaction(write_options); + txn2->Put("A", "x"); + s = txn2->Commit(); + ASSERT_OK(s); + delete txn2; + + // Verify that txn1 cannot commit since A will still be conflict checked + s = txn1->Commit(); + ASSERT_TRUE(s.IsBusy()); + delete txn1; + + txn1 = txn_db->BeginTransaction(write_options); + + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_OK(s); + + txn1->SetSavePoint(); + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_OK(s); + txn1->UndoGetForUpdate("A"); + + txn2 = txn_db->BeginTransaction(write_options); + txn2->Put("A", "x"); + s = txn2->Commit(); + ASSERT_OK(s); + delete txn2; + + // Verify that txn1 cannot commit since A will still be conflict checked + s = txn1->Commit(); + ASSERT_TRUE(s.IsBusy()); + delete txn1; + + txn1 = txn_db->BeginTransaction(write_options); + + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_OK(s); + + txn1->SetSavePoint(); + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_OK(s); + txn1->UndoGetForUpdate("A"); + + txn1->RollbackToSavePoint(); + txn1->UndoGetForUpdate("A"); + + txn2 = txn_db->BeginTransaction(write_options); + txn2->Put("A", "x"); + s = 
txn2->Commit(); + ASSERT_OK(s); + delete txn2; + + // Verify that txn1 can commit since A isn't conflict checked + s = txn1->Commit(); + ASSERT_OK(s); + delete txn1; +} + +namespace { +Status OptimisticTransactionStressTestInserter(OptimisticTransactionDB* db, + const size_t num_transactions, + const size_t num_sets, + const size_t num_keys_per_set) { + size_t seed = std::hash()(std::this_thread::get_id()); + Random64 _rand(seed); + WriteOptions write_options; + ReadOptions read_options; + OptimisticTransactionOptions txn_options; + txn_options.set_snapshot = true; + + RandomTransactionInserter inserter(&_rand, write_options, read_options, + num_keys_per_set, + static_cast(num_sets)); + + for (size_t t = 0; t < num_transactions; t++) { + bool success = inserter.OptimisticTransactionDBInsert(db, txn_options); + if (!success) { + // unexpected failure + return inserter.GetLastStatus(); + } + } + + // Make sure at least some of the transactions succeeded. It's ok if + // some failed due to write-conflicts. + if (inserter.GetFailureCount() > num_transactions / 2) { + return Status::TryAgain("Too many transactions failed! " + + std::to_string(inserter.GetFailureCount()) + " / " + + std::to_string(num_transactions)); + } + + return Status::OK(); +} +} // namespace + +TEST_F(OptimisticTransactionTest, OptimisticTransactionStressTest) { + const size_t num_threads = 4; + const size_t num_transactions_per_thread = 10000; + const size_t num_sets = 3; + const size_t num_keys_per_set = 100; + // Setting the key-space to be 100 keys should cause enough write-conflicts + // to make this test interesting. + + std::vector threads; + + std::function call_inserter = [&] { + ASSERT_OK(OptimisticTransactionStressTestInserter( + txn_db, num_transactions_per_thread, num_sets, num_keys_per_set)); + }; + + // Create N threads that use RandomTransactionInserter to write + // many transactions. 
+ for (uint32_t i = 0; i < num_threads; i++) { + threads.emplace_back(call_inserter); + } + + // Wait for all threads to run + for (auto& t : threads) { + t.join(); + } + + // Verify that data is consistent + Status s = RandomTransactionInserter::Verify(db, num_sets); + ASSERT_OK(s); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/utilities/transactions/transaction_base.cc b/utilities/transactions/transaction_base.cc index 5f3e97e9b..01bab827a 100644 --- a/utilities/transactions/transaction_base.cc +++ b/utilities/transactions/transaction_base.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -21,26 +21,49 @@ TransactionBaseImpl::TransactionBaseImpl(DB* db, : db_(db), write_options_(write_options), cmp_(GetColumnFamilyUserComparator(db->DefaultColumnFamily())), - write_batch_(new WriteBatchWithIndex(cmp_, 0, true)), - start_time_(db_->GetEnv()->NowMicros()) {} + start_time_(db_->GetEnv()->NowMicros()), + write_batch_(cmp_, 0, true), + indexing_enabled_(true) {} -TransactionBaseImpl::~TransactionBaseImpl() {} +TransactionBaseImpl::~TransactionBaseImpl() { + // Release snapshot if snapshot is set + SetSnapshotInternal(nullptr); +} void TransactionBaseImpl::Clear() { save_points_.reset(nullptr); - write_batch_->Clear(); + write_batch_.Clear(); tracked_keys_.clear(); num_puts_ = 0; num_deletes_ = 0; num_merges_ = 0; } +void TransactionBaseImpl::Reinitialize(DB* db, + const WriteOptions& write_options) { + Clear(); + ClearSnapshot(); + db_ = db; + write_options_ = write_options; + start_time_ = db_->GetEnv()->NowMicros(); + indexing_enabled_ = true; + cmp_ = GetColumnFamilyUserComparator(db_->DefaultColumnFamily()); +} + 
void TransactionBaseImpl::SetSnapshot() { assert(dynamic_cast(db_) != nullptr); auto db_impl = reinterpret_cast(db_); const Snapshot* snapshot = db_impl->GetSnapshotForWriteConflictBoundary(); - snapshot_.reset(new ManagedSnapshot(db_, snapshot)); + + SetSnapshotInternal(snapshot); +} + +void TransactionBaseImpl::SetSnapshotInternal(const Snapshot* snapshot) { + // Set a custom deleter for the snapshot_ SharedPtr as the snapshot needs to + // be released, not deleted when it is no longer referenced. + snapshot_.reset(snapshot, std::bind(&TransactionBaseImpl::ReleaseSnapshot, + this, std::placeholders::_1, db_)); snapshot_needed_ = false; snapshot_notifier_ = nullptr; } @@ -62,7 +85,8 @@ void TransactionBaseImpl::SetSnapshotIfNeeded() { } Status TransactionBaseImpl::TryLock(ColumnFamilyHandle* column_family, - const SliceParts& key, bool untracked) { + const SliceParts& key, bool read_only, + bool untracked) { size_t key_size = 0; for (int i = 0; i < key.num_parts; ++i) { key_size += key.parts[i].size(); @@ -75,7 +99,7 @@ Status TransactionBaseImpl::TryLock(ColumnFamilyHandle* column_family, str.append(key.parts[i].data(), key.parts[i].size()); } - return TryLock(column_family, str, untracked); + return TryLock(column_family, str, read_only, untracked); } void TransactionBaseImpl::SetSavePoint() { @@ -84,7 +108,7 @@ void TransactionBaseImpl::SetSavePoint() { } save_points_->emplace(snapshot_, snapshot_needed_, snapshot_notifier_, num_puts_, num_deletes_, num_merges_); - write_batch_->SetSavePoint(); + write_batch_.SetSavePoint(); } Status TransactionBaseImpl::RollbackToSavePoint() { @@ -99,19 +123,39 @@ Status TransactionBaseImpl::RollbackToSavePoint() { num_merges_ = save_point.num_merges_; // Rollback batch - Status s = write_batch_->RollbackToSavePoint(); + Status s = write_batch_.RollbackToSavePoint(); assert(s.ok()); // Rollback any keys that were tracked since the last savepoint - const TransactionKeyMap* key_map = GetTrackedKeysSinceSavePoint(); - 
assert(key_map); - for (auto& key_map_iter : *key_map) { + const TransactionKeyMap& key_map = save_point.new_keys_; + for (const auto& key_map_iter : key_map) { uint32_t column_family_id = key_map_iter.first; auto& keys = key_map_iter.second; - for (auto& key_iter : keys) { + auto& cf_tracked_keys = tracked_keys_[column_family_id]; + + for (const auto& key_iter : keys) { const std::string& key = key_iter.first; - tracked_keys_[column_family_id].erase(key); + uint32_t num_reads = key_iter.second.num_reads; + uint32_t num_writes = key_iter.second.num_writes; + + auto tracked_keys_iter = cf_tracked_keys.find(key); + assert(tracked_keys_iter != cf_tracked_keys.end()); + + // Decrement the total reads/writes of this key by the number of + // reads/writes done since the last SavePoint. + if (num_reads > 0) { + assert(tracked_keys_iter->second.num_reads >= num_reads); + tracked_keys_iter->second.num_reads -= num_reads; + } + if (num_writes > 0) { + assert(tracked_keys_iter->second.num_writes >= num_writes); + tracked_keys_iter->second.num_writes -= num_writes; + } + if (tracked_keys_iter->second.num_reads == 0 && + tracked_keys_iter->second.num_writes == 0) { + tracked_keys_[column_family_id].erase(tracked_keys_iter); + } } } @@ -119,7 +163,7 @@ Status TransactionBaseImpl::RollbackToSavePoint() { return s; } else { - assert(write_batch_->RollbackToSavePoint().IsNotFound()); + assert(write_batch_.RollbackToSavePoint().IsNotFound()); return Status::NotFound(); } } @@ -127,14 +171,14 @@ Status TransactionBaseImpl::RollbackToSavePoint() { Status TransactionBaseImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value) { - return write_batch_->GetFromBatchAndDB(db_, read_options, column_family, key, - value); + return write_batch_.GetFromBatchAndDB(db_, read_options, column_family, key, + value); } Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& 
key, std::string* value) { - Status s = TryLock(column_family, key); + Status s = TryLock(column_family, key, true /* read_only */); if (s.ok() && value != nullptr) { s = Get(read_options, column_family, key, value); @@ -168,7 +212,7 @@ std::vector TransactionBaseImpl::MultiGetForUpdate( // Lock all keys for (size_t i = 0; i < num_keys; ++i) { - Status s = TryLock(column_family[i], keys[i]); + Status s = TryLock(column_family[i], keys[i], true /* read_only */); if (!s.ok()) { // Fail entire multiget if we cannot lock all keys return std::vector(num_keys, s); @@ -189,7 +233,7 @@ Iterator* TransactionBaseImpl::GetIterator(const ReadOptions& read_options) { Iterator* db_iter = db_->NewIterator(read_options); assert(db_iter); - return write_batch_->NewIteratorWithBase(db_iter); + return write_batch_.NewIteratorWithBase(db_iter); } Iterator* TransactionBaseImpl::GetIterator(const ReadOptions& read_options, @@ -197,12 +241,12 @@ Iterator* TransactionBaseImpl::GetIterator(const ReadOptions& read_options, Iterator* db_iter = db_->NewIterator(read_options, column_family); assert(db_iter); - return write_batch_->NewIteratorWithBase(column_family, db_iter); + return write_batch_.NewIteratorWithBase(column_family, db_iter); } Status TransactionBaseImpl::Put(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { - Status s = TryLock(column_family, key); + Status s = TryLock(column_family, key, false /* read_only */); if (s.ok()) { GetBatchForWrite()->Put(column_family, key, value); @@ -215,7 +259,7 @@ Status TransactionBaseImpl::Put(ColumnFamilyHandle* column_family, Status TransactionBaseImpl::Put(ColumnFamilyHandle* column_family, const SliceParts& key, const SliceParts& value) { - Status s = TryLock(column_family, key); + Status s = TryLock(column_family, key, false /* read_only */); if (s.ok()) { GetBatchForWrite()->Put(column_family, key, value); @@ -227,7 +271,7 @@ Status TransactionBaseImpl::Put(ColumnFamilyHandle* column_family, Status 
TransactionBaseImpl::Merge(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { - Status s = TryLock(column_family, key); + Status s = TryLock(column_family, key, false /* read_only */); if (s.ok()) { GetBatchForWrite()->Merge(column_family, key, value); @@ -239,7 +283,7 @@ Status TransactionBaseImpl::Merge(ColumnFamilyHandle* column_family, Status TransactionBaseImpl::Delete(ColumnFamilyHandle* column_family, const Slice& key) { - Status s = TryLock(column_family, key); + Status s = TryLock(column_family, key, false /* read_only */); if (s.ok()) { GetBatchForWrite()->Delete(column_family, key); @@ -251,7 +295,7 @@ Status TransactionBaseImpl::Delete(ColumnFamilyHandle* column_family, Status TransactionBaseImpl::Delete(ColumnFamilyHandle* column_family, const SliceParts& key) { - Status s = TryLock(column_family, key); + Status s = TryLock(column_family, key, false /* read_only */); if (s.ok()) { GetBatchForWrite()->Delete(column_family, key); @@ -263,7 +307,7 @@ Status TransactionBaseImpl::Delete(ColumnFamilyHandle* column_family, Status TransactionBaseImpl::SingleDelete(ColumnFamilyHandle* column_family, const Slice& key) { - Status s = TryLock(column_family, key); + Status s = TryLock(column_family, key, false /* read_only */); if (s.ok()) { GetBatchForWrite()->SingleDelete(column_family, key); @@ -275,7 +319,7 @@ Status TransactionBaseImpl::SingleDelete(ColumnFamilyHandle* column_family, Status TransactionBaseImpl::SingleDelete(ColumnFamilyHandle* column_family, const SliceParts& key) { - Status s = TryLock(column_family, key); + Status s = TryLock(column_family, key, false /* read_only */); if (s.ok()) { GetBatchForWrite()->SingleDelete(column_family, key); @@ -287,8 +331,8 @@ Status TransactionBaseImpl::SingleDelete(ColumnFamilyHandle* column_family, Status TransactionBaseImpl::PutUntracked(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { - bool untracked = true; - Status s = TryLock(column_family, key, 
untracked); + Status s = + TryLock(column_family, key, false /* read_only */, true /* untracked */); if (s.ok()) { GetBatchForWrite()->Put(column_family, key, value); @@ -301,8 +345,8 @@ Status TransactionBaseImpl::PutUntracked(ColumnFamilyHandle* column_family, Status TransactionBaseImpl::PutUntracked(ColumnFamilyHandle* column_family, const SliceParts& key, const SliceParts& value) { - bool untracked = true; - Status s = TryLock(column_family, key, untracked); + Status s = + TryLock(column_family, key, false /* read_only */, true /* untracked */); if (s.ok()) { GetBatchForWrite()->Put(column_family, key, value); @@ -315,8 +359,8 @@ Status TransactionBaseImpl::PutUntracked(ColumnFamilyHandle* column_family, Status TransactionBaseImpl::MergeUntracked(ColumnFamilyHandle* column_family, const Slice& key, const Slice& value) { - bool untracked = true; - Status s = TryLock(column_family, key, untracked); + Status s = + TryLock(column_family, key, false /* read_only */, true /* untracked */); if (s.ok()) { GetBatchForWrite()->Merge(column_family, key, value); @@ -328,8 +372,8 @@ Status TransactionBaseImpl::MergeUntracked(ColumnFamilyHandle* column_family, Status TransactionBaseImpl::DeleteUntracked(ColumnFamilyHandle* column_family, const Slice& key) { - bool untracked = true; - Status s = TryLock(column_family, key, untracked); + Status s = + TryLock(column_family, key, false /* read_only */, true /* untracked */); if (s.ok()) { GetBatchForWrite()->Delete(column_family, key); @@ -341,8 +385,8 @@ Status TransactionBaseImpl::DeleteUntracked(ColumnFamilyHandle* column_family, Status TransactionBaseImpl::DeleteUntracked(ColumnFamilyHandle* column_family, const SliceParts& key) { - bool untracked = true; - Status s = TryLock(column_family, key, untracked); + Status s = + TryLock(column_family, key, false /* read_only */, true /* untracked */); if (s.ok()) { GetBatchForWrite()->Delete(column_family, key); @@ -353,11 +397,11 @@ Status 
TransactionBaseImpl::DeleteUntracked(ColumnFamilyHandle* column_family, } void TransactionBaseImpl::PutLogData(const Slice& blob) { - write_batch_->PutLogData(blob); + write_batch_.PutLogData(blob); } WriteBatchWithIndex* TransactionBaseImpl::GetWriteBatch() { - return write_batch_.get(); + return &write_batch_; } uint64_t TransactionBaseImpl::GetElapsedTime() const { @@ -383,26 +427,73 @@ uint64_t TransactionBaseImpl::GetNumKeys() const { } void TransactionBaseImpl::TrackKey(uint32_t cfh_id, const std::string& key, - SequenceNumber seq) { - auto iter = tracked_keys_[cfh_id].find(key); - if (iter == tracked_keys_[cfh_id].end()) { - tracked_keys_[cfh_id].insert({key, seq}); - - if (save_points_ != nullptr && !save_points_->empty()) { - // Aren't tracking this key, add it. - save_points_->top().new_keys_[cfh_id][key] = seq; - } - } else if (seq < iter->second) { + SequenceNumber seq, bool read_only) { + // Update map of all tracked keys for this transaction + TrackKey(&tracked_keys_, cfh_id, key, seq, read_only); + + if (save_points_ != nullptr && !save_points_->empty()) { + // Update map of tracked keys in this SavePoint + TrackKey(&save_points_->top().new_keys_, cfh_id, key, seq, read_only); + } +} + +// Add a key to the given TransactionKeyMap +void TransactionBaseImpl::TrackKey(TransactionKeyMap* key_map, uint32_t cfh_id, + const std::string& key, SequenceNumber seq, + bool read_only) { + auto& cf_key_map = (*key_map)[cfh_id]; + auto iter = cf_key_map.find(key); + if (iter == cf_key_map.end()) { + auto result = cf_key_map.insert({key, TransactionKeyMapInfo(seq)}); + iter = result.first; + } else if (seq < iter->second.seq) { // Now tracking this key with an earlier sequence number - iter->second = seq; + iter->second.seq = seq; + } + + if (read_only) { + iter->second.num_reads++; + } else { + iter->second.num_writes++; } } -const TransactionKeyMap* TransactionBaseImpl::GetTrackedKeysSinceSavePoint() { +std::unique_ptr 
+TransactionBaseImpl::GetTrackedKeysSinceSavePoint() { if (save_points_ != nullptr && !save_points_->empty()) { - return &save_points_->top().new_keys_; + // Examine the number of reads/writes performed on all keys written + // since the last SavePoint and compare to the total number of reads/writes + // for each key. + TransactionKeyMap* result = new TransactionKeyMap(); + for (const auto& key_map_iter : save_points_->top().new_keys_) { + uint32_t column_family_id = key_map_iter.first; + auto& keys = key_map_iter.second; + + auto& cf_tracked_keys = tracked_keys_[column_family_id]; + + for (const auto& key_iter : keys) { + const std::string& key = key_iter.first; + uint32_t num_reads = key_iter.second.num_reads; + uint32_t num_writes = key_iter.second.num_writes; + + auto total_key_info = cf_tracked_keys.find(key); + assert(total_key_info != cf_tracked_keys.end()); + assert(total_key_info->second.num_reads >= num_reads); + assert(total_key_info->second.num_writes >= num_writes); + + if (total_key_info->second.num_reads == num_reads && + total_key_info->second.num_writes == num_writes) { + // All the reads/writes to this key were done in the last savepoint. + bool read_only = (num_writes == 0); + TrackKey(result, column_family_id, key, key_iter.second.seq, + read_only); + } + } + } + return std::unique_ptr(result); } + // No SavePoint return nullptr; } @@ -413,10 +504,70 @@ const TransactionKeyMap* TransactionBaseImpl::GetTrackedKeysSinceSavePoint() { WriteBatchBase* TransactionBaseImpl::GetBatchForWrite() { if (indexing_enabled_) { // Use WriteBatchWithIndex - return write_batch_.get(); + return &write_batch_; } else { // Don't use WriteBatchWithIndex. Return base WriteBatch. 
- return write_batch_->GetWriteBatch(); + return write_batch_.GetWriteBatch(); + } +} + +void TransactionBaseImpl::ReleaseSnapshot(const Snapshot* snapshot, DB* db) { + if (snapshot != nullptr) { + db->ReleaseSnapshot(snapshot); + } +} + +void TransactionBaseImpl::UndoGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) { + uint32_t column_family_id = GetColumnFamilyID(column_family); + auto& cf_tracked_keys = tracked_keys_[column_family_id]; + std::string key_str = key.ToString(); + bool can_decrement = false; + bool can_unlock __attribute__((unused)) = false; + + if (save_points_ != nullptr && !save_points_->empty()) { + // Check if this key was fetched ForUpdate in this SavePoint + auto& cf_savepoint_keys = save_points_->top().new_keys_[column_family_id]; + + auto savepoint_iter = cf_savepoint_keys.find(key_str); + if (savepoint_iter != cf_savepoint_keys.end()) { + if (savepoint_iter->second.num_reads > 0) { + savepoint_iter->second.num_reads--; + can_decrement = true; + + if (savepoint_iter->second.num_reads == 0 && + savepoint_iter->second.num_writes == 0) { + // No other GetForUpdates or write on this key in this SavePoint + cf_savepoint_keys.erase(savepoint_iter); + can_unlock = true; + } + } + } + } else { + // No SavePoint set + can_decrement = true; + can_unlock = true; + } + + // We can only decrement the read count for this key if we were able to + // decrement the read count in the current SavePoint, OR if there is no + // SavePoint set. 
+ if (can_decrement) { + auto key_iter = cf_tracked_keys.find(key_str); + + if (key_iter != cf_tracked_keys.end()) { + if (key_iter->second.num_reads > 0) { + key_iter->second.num_reads--; + + if (key_iter->second.num_reads == 0 && + key_iter->second.num_writes == 0) { + // No other GetForUpdates or writes on this key + assert(can_unlock); + cf_tracked_keys.erase(key_iter); + UnlockGetForUpdate(column_family, key); + } + } + } } } diff --git a/utilities/transactions/transaction_base.h b/utilities/transactions/transaction_base.h index 4515bfaf5..db33b6f65 100644 --- a/utilities/transactions/transaction_base.h +++ b/utilities/transactions/transaction_base.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -32,12 +32,14 @@ class TransactionBaseImpl : public Transaction { // Remove pending operations queued in this transaction. virtual void Clear(); + void Reinitialize(DB* db, const WriteOptions& write_options); + // Called before executing Put, Merge, Delete, and GetForUpdate. If TryLock // returns non-OK, the Put/Merge/Delete/GetForUpdate will be failed. // untracked will be true if called from PutUntracked, DeleteUntracked, or // MergeUntracked. virtual Status TryLock(ColumnFamilyHandle* column_family, const Slice& key, - bool untracked = false) = 0; + bool read_only, bool untracked = false) = 0; void SetSavePoint() override; @@ -165,7 +167,7 @@ class TransactionBaseImpl : public Transaction { } const Snapshot* GetSnapshot() const override { - return snapshot_ ? snapshot_->snapshot() : nullptr; + return snapshot_ ? 
snapshot_.get() : nullptr; } void SetSnapshot() override; @@ -192,6 +194,12 @@ class TransactionBaseImpl : public Transaction { uint64_t GetNumKeys() const override; + void UndoGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) override; + void UndoGetForUpdate(const Slice& key) override { + return UndoGetForUpdate(nullptr, key); + }; + // Get list of keys in this transaction that must not have any conflicts // with writes in other transactions. const TransactionKeyMap& GetTrackedKeys() const { return tracked_keys_; } @@ -202,31 +210,43 @@ class TransactionBaseImpl : public Transaction { write_options_ = write_options; } + // Used for memory management for snapshot_ + void ReleaseSnapshot(const Snapshot* snapshot, DB* db); + protected: // Add a key to the list of tracked keys. + // // seqno is the earliest seqno this key was involved with this transaction. - void TrackKey(uint32_t cfh_id, const std::string& key, SequenceNumber seqno); + // readonly should be set to true if no data was written for this key + void TrackKey(uint32_t cfh_id, const std::string& key, SequenceNumber seqno, + bool readonly); + + // Helper function to add a key to the given TransactionKeyMap + static void TrackKey(TransactionKeyMap* key_map, uint32_t cfh_id, + const std::string& key, SequenceNumber seqno, + bool readonly); + + // Called when UndoGetForUpdate determines that this key can be unlocked. + virtual void UnlockGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) = 0; - const TransactionKeyMap* GetTrackedKeysSinceSavePoint(); + std::unique_ptr GetTrackedKeysSinceSavePoint(); // Sets a snapshot if SetSnapshotOnNextOperation() has been called. void SetSnapshotIfNeeded(); - DB* const db_; + DB* db_; WriteOptions write_options_; const Comparator* cmp_; - // Records writes pending in this transaction - std::unique_ptr write_batch_; - // Stores that time the txn was constructed, in microseconds. 
- const uint64_t start_time_; + uint64_t start_time_; // Stores the current snapshot that was was set by SetSnapshot or null if // no snapshot is currently set. - std::shared_ptr snapshot_; + std::shared_ptr snapshot_; // Count of various operations pending in this transaction uint64_t num_puts_ = 0; @@ -234,7 +254,7 @@ class TransactionBaseImpl : public Transaction { uint64_t num_merges_ = 0; struct SavePoint { - std::shared_ptr snapshot_; + std::shared_ptr snapshot_; bool snapshot_needed_; std::shared_ptr snapshot_notifier_; uint64_t num_puts_; @@ -244,7 +264,7 @@ class TransactionBaseImpl : public Transaction { // Record all keys tracked since the last savepoint TransactionKeyMap new_keys_; - SavePoint(std::shared_ptr snapshot, bool snapshot_needed, + SavePoint(std::shared_ptr snapshot, bool snapshot_needed, std::shared_ptr snapshot_notifier, uint64_t num_puts, uint64_t num_deletes, uint64_t num_merges) : snapshot_(snapshot), @@ -256,6 +276,9 @@ class TransactionBaseImpl : public Transaction { }; private: + // Records writes pending in this transaction + WriteBatchWithIndex write_batch_; + // Stack of the Snapshot saved at each save point. Saved snapshots may be // nullptr if there was no snapshot at the time SetSavePoint() was called. std::unique_ptr> save_points_; @@ -271,7 +294,7 @@ class TransactionBaseImpl : public Transaction { // WriteBatchWithIndex. // If false, future Put/Merge/Deletes will be inserted directly into the // underlying WriteBatch and not indexed in the WriteBatchWithIndex. - bool indexing_enabled_ = true; + bool indexing_enabled_; // SetSnapshotOnNextOperation() has been called and the snapshot has not yet // been reset. 
@@ -282,9 +305,11 @@ class TransactionBaseImpl : public Transaction { std::shared_ptr snapshot_notifier_ = nullptr; Status TryLock(ColumnFamilyHandle* column_family, const SliceParts& key, - bool untracked = false); + bool read_only, bool untracked = false); WriteBatchBase* GetBatchForWrite(); + + void SetSnapshotInternal(const Snapshot* snapshot); }; } // namespace rocksdb diff --git a/utilities/transactions/transaction_db_impl.cc b/utilities/transactions/transaction_db_impl.cc index f8a47b948..ef03f3454 100644 --- a/utilities/transactions/transaction_db_impl.cc +++ b/utilities/transactions/transaction_db_impl.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -24,17 +24,21 @@ TransactionDBImpl::TransactionDBImpl(DB* db, const TransactionDBOptions& txn_db_options) : TransactionDB(db), txn_db_options_(txn_db_options), - lock_mgr_(txn_db_options_.num_stripes, txn_db_options.max_num_locks, + lock_mgr_(this, txn_db_options_.num_stripes, txn_db_options.max_num_locks, txn_db_options_.custom_mutex_factory ? 
txn_db_options_.custom_mutex_factory : std::shared_ptr( new TransactionDBMutexFactoryImpl())) {} Transaction* TransactionDBImpl::BeginTransaction( - const WriteOptions& write_options, const TransactionOptions& txn_options) { - Transaction* txn = new TransactionImpl(this, write_options, txn_options); - - return txn; + const WriteOptions& write_options, const TransactionOptions& txn_options, + Transaction* old_txn) { + if (old_txn != nullptr) { + ReinitializeTransaction(old_txn, write_options, txn_options); + return old_txn; + } else { + return new TransactionImpl(this, write_options, txn_options); + } } TransactionDBOptions TransactionDBImpl::ValidateTxnDBOptions( @@ -173,7 +177,7 @@ void TransactionDBImpl::UnLock(TransactionImpl* txn, uint32_t cfh_id, Transaction* TransactionDBImpl::BeginInternalTransaction( const WriteOptions& options) { TransactionOptions txn_options; - Transaction* txn = BeginTransaction(options, txn_options); + Transaction* txn = BeginTransaction(options, txn_options, nullptr); assert(dynamic_cast(txn) != nullptr); auto txn_impl = reinterpret_cast(txn); @@ -278,5 +282,38 @@ Status TransactionDBImpl::Write(const WriteOptions& opts, WriteBatch* updates) { return s; } +void TransactionDBImpl::InsertExpirableTransaction(TransactionID tx_id, + TransactionImpl* tx) { + assert(tx->GetExpirationTime() > 0); + std::lock_guard lock(map_mutex_); + expirable_transactions_map_.insert({tx_id, tx}); +} + +void TransactionDBImpl::RemoveExpirableTransaction(TransactionID tx_id) { + std::lock_guard lock(map_mutex_); + expirable_transactions_map_.erase(tx_id); +} + +bool TransactionDBImpl::TryStealingExpiredTransactionLocks( + TransactionID tx_id) { + std::lock_guard lock(map_mutex_); + + auto tx_it = expirable_transactions_map_.find(tx_id); + if (tx_it == expirable_transactions_map_.end()) { + return true; + } + TransactionImpl& tx = *(tx_it->second); + return tx.TryStealingLocks(); +} + +void TransactionDBImpl::ReinitializeTransaction( + Transaction* txn, const 
WriteOptions& write_options, + const TransactionOptions& txn_options) { + assert(dynamic_cast(txn) != nullptr); + auto txn_impl = reinterpret_cast(txn); + + txn_impl->Reinitialize(this, write_options, txn_options); +} + } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/utilities/transactions/transaction_db_impl.h b/utilities/transactions/transaction_db_impl.h index 5a9d8b474..7b7d646a1 100644 --- a/utilities/transactions/transaction_db_impl.h +++ b/utilities/transactions/transaction_db_impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -6,7 +6,9 @@ #pragma once #ifndef ROCKSDB_LITE +#include #include +#include #include "rocksdb/db.h" #include "rocksdb/options.h" @@ -24,7 +26,8 @@ class TransactionDBImpl : public TransactionDB { ~TransactionDBImpl() {} Transaction* BeginTransaction(const WriteOptions& write_options, - const TransactionOptions& txn_options) override; + const TransactionOptions& txn_options, + Transaction* old_txn) override; using StackableDB::Put; virtual Status Put(const WriteOptions& options, @@ -66,7 +69,20 @@ class TransactionDBImpl : public TransactionDB { return txn_db_options_; } + void InsertExpirableTransaction(TransactionID tx_id, TransactionImpl* tx); + void RemoveExpirableTransaction(TransactionID tx_id); + + // If transaction is no longer available, locks can be stolen + // If transaction is available, try stealing locks directly from transaction + // It is the caller's responsibility to ensure that the referred transaction + // is expirable (GetExpirationTime() > 0) and that it is expired. 
+ bool TryStealingExpiredTransactionLocks(TransactionID tx_id); + private: + void ReinitializeTransaction( + Transaction* txn, const WriteOptions& write_options, + const TransactionOptions& txn_options = TransactionOptions()); + const TransactionDBOptions txn_db_options_; TransactionLockMgr lock_mgr_; @@ -74,6 +90,13 @@ class TransactionDBImpl : public TransactionDB { InstrumentedMutex column_family_mutex_; Transaction* BeginInternalTransaction(const WriteOptions& options); Status WriteHelper(WriteBatch* updates, TransactionImpl* txn_impl); + + // Used to ensure that no locks are stolen from an expirable transaction + // that has started a commit. Only transactions with an expiration time + // should be in this map. + std::mutex map_mutex_; + std::unordered_map + expirable_transactions_map_; }; } // namespace rocksdb diff --git a/utilities/transactions/transaction_db_mutex_impl.cc b/utilities/transactions/transaction_db_mutex_impl.cc index ec905fbdb..c6649159c 100644 --- a/utilities/transactions/transaction_db_mutex_impl.cc +++ b/utilities/transactions/transaction_db_mutex_impl.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -18,20 +18,19 @@ namespace rocksdb { class TransactionDBMutexImpl : public TransactionDBMutex { public: - TransactionDBMutexImpl() : lock_(mutex_, std::defer_lock) {} + TransactionDBMutexImpl() {} ~TransactionDBMutexImpl() {} Status Lock() override; Status TryLockFor(int64_t timeout_time) override; - void UnLock() override { lock_.unlock(); } + void UnLock() override { mutex_.unlock(); } friend class TransactionDBCondVarImpl; private: - std::mutex mutex_; // Do not acquire mutex_ directly. Use lock_. 
- std::unique_lock lock_; + std::mutex mutex_; }; class TransactionDBCondVarImpl : public TransactionDBCondVar { @@ -63,7 +62,7 @@ TransactionDBMutexFactoryImpl::AllocateCondVar() { } Status TransactionDBMutexImpl::Lock() { - lock_.lock(); + mutex_.lock(); return Status::OK(); } @@ -71,7 +70,7 @@ Status TransactionDBMutexImpl::TryLockFor(int64_t timeout_time) { bool locked = true; if (timeout_time == 0) { - locked = lock_.try_lock(); + locked = mutex_.try_lock(); } else { // Previously, this code used a std::timed_mutex. However, this was changed // due to known bugs in gcc versions < 4.9. @@ -80,7 +79,7 @@ Status TransactionDBMutexImpl::TryLockFor(int64_t timeout_time) { // Since this mutex isn't held for long and only a single mutex is ever // held at a time, it is reasonable to ignore the lock timeout_time here // and only check it when waiting on the condition_variable. - lock_.lock(); + mutex_.lock(); } if (!locked) { @@ -95,30 +94,40 @@ Status TransactionDBCondVarImpl::Wait( std::shared_ptr mutex) { auto mutex_impl = reinterpret_cast(mutex.get()); - cv_.wait(mutex_impl->lock_); + std::unique_lock lock(mutex_impl->mutex_, std::adopt_lock); + cv_.wait(lock); + + // Make sure unique_lock doesn't unlock mutex when it destructs + lock.release(); return Status::OK(); } Status TransactionDBCondVarImpl::WaitFor( std::shared_ptr mutex, int64_t timeout_time) { + Status s; + auto mutex_impl = reinterpret_cast(mutex.get()); + std::unique_lock lock(mutex_impl->mutex_, std::adopt_lock); if (timeout_time < 0) { // If timeout is negative, do not use a timeout - cv_.wait(mutex_impl->lock_); + cv_.wait(lock); } else { auto duration = std::chrono::microseconds(timeout_time); - auto cv_status = cv_.wait_for(mutex_impl->lock_, duration); + auto cv_status = cv_.wait_for(lock, duration); // Check if the wait stopped due to timing out. 
if (cv_status == std::cv_status::timeout) { - return Status::TimedOut(Status::SubCode::kMutexTimeout); + s = Status::TimedOut(Status::SubCode::kMutexTimeout); } } + // Make sure unique_lock doesn't unlock mutex when it destructs + lock.release(); + // CV was signaled, or we spuriously woke up (but didn't time out) - return Status::OK(); + return s; } } // namespace rocksdb diff --git a/utilities/transactions/transaction_db_mutex_impl.h b/utilities/transactions/transaction_db_mutex_impl.h index 7c915ca56..0dfac4fa8 100644 --- a/utilities/transactions/transaction_db_mutex_impl.h +++ b/utilities/transactions/transaction_db_mutex_impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/transactions/transaction_impl.cc b/utilities/transactions/transaction_impl.cc index 7480ce6dd..8f80433a8 100644 --- a/utilities/transactions/transaction_impl.cc +++ b/utilities/transactions/transaction_impl.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -20,6 +20,7 @@ #include "rocksdb/status.h" #include "rocksdb/utilities/transaction_db.h" #include "util/string_util.h" +#include "util/sync_point.h" #include "utilities/transactions/transaction_db_impl.h" #include "utilities/transactions/transaction_util.h" @@ -38,27 +39,48 @@ TransactionImpl::TransactionImpl(TransactionDB* txn_db, const TransactionOptions& txn_options) : TransactionBaseImpl(txn_db->GetBaseDB(), write_options), txn_db_impl_(nullptr), - txn_id_(GenTxnID()), - expiration_time_(txn_options.expiration >= 0 - ? start_time_ + txn_options.expiration * 1000 - : 0), - lock_timeout_(txn_options.lock_timeout * 1000) { + txn_id_(0), + expiration_time_(0), + lock_timeout_(0), + exec_status_(STARTED) { txn_db_impl_ = dynamic_cast(txn_db); assert(txn_db_impl_); + Initialize(txn_options); +} + +void TransactionImpl::Initialize(const TransactionOptions& txn_options) { + txn_id_ = GenTxnID(); + + exec_status_ = STARTED; + + lock_timeout_ = txn_options.lock_timeout * 1000; if (lock_timeout_ < 0) { // Lock timeout not set, use default lock_timeout_ = txn_db_impl_->GetTxnDBOptions().transaction_lock_timeout * 1000; } + if (txn_options.expiration >= 0) { + expiration_time_ = start_time_ + txn_options.expiration * 1000; + } else { + expiration_time_ = 0; + } + if (txn_options.set_snapshot) { SetSnapshot(); } + + if (expiration_time_ > 0) { + txn_db_impl_->InsertExpirableTransaction(txn_id_, this); + } } TransactionImpl::~TransactionImpl() { txn_db_impl_->UnLock(this, &GetTrackedKeys()); + if (expiration_time_ > 0) { + txn_db_impl_->RemoveExpirableTransaction(txn_id_); + } } void TransactionImpl::Clear() { @@ -66,6 +88,13 @@ void TransactionImpl::Clear() { TransactionBaseImpl::Clear(); } +void TransactionImpl::Reinitialize(TransactionDB* txn_db, + const WriteOptions& write_options, + const TransactionOptions& txn_options) { + TransactionBaseImpl::Reinitialize(txn_db->GetBaseDB(), write_options); + Initialize(txn_options); +} + bool TransactionImpl::IsExpired() const { 
if (expiration_time_ > 0) { if (db_->GetEnv()->NowMicros() >= expiration_time_) { @@ -92,7 +121,7 @@ Status TransactionImpl::CommitBatch(WriteBatch* batch) { } Status TransactionImpl::Commit() { - Status s = DoCommit(write_batch_->GetWriteBatch()); + Status s = DoCommit(GetWriteBatch()->GetWriteBatch()); Clear(); @@ -103,18 +132,27 @@ Status TransactionImpl::DoCommit(WriteBatch* batch) { Status s; if (expiration_time_ > 0) { - // We cannot commit a transaction that is expired as its locks might have - // been released. - // To avoid race conditions, we need to use a WriteCallback to check the - // expiration time once we're on the writer thread. - TransactionCallback callback(this); - - // Do write directly on base db as TransctionDB::Write() would attempt to - // do conflict checking that we've already done. - assert(dynamic_cast(db_) != nullptr); - auto db_impl = reinterpret_cast(db_); - - s = db_impl->WriteWithCallback(write_options_, batch, &callback); + if (IsExpired()) { + return Status::Expired(); + } + + // Transaction should only be committed if the thread succeeds + // changing its execution status to COMMITTING. This is because + // A different transaction may consider this one expired and attempt + // to steal its locks between the IsExpired() check and the beginning + // of a commit. 
+ ExecutionStatus expected = STARTED; + bool can_commit = std::atomic_compare_exchange_strong( + &exec_status_, &expected, COMMITTING); + + TEST_SYNC_POINT("TransactionTest::ExpirableTransactionDataRace:1"); + + if (can_commit) { + s = db_->Write(write_options_, batch); + } else { + assert(exec_status_ == LOCKS_STOLEN); + return Status::Expired(); + } } else { s = db_->Write(write_options_, batch); } @@ -126,9 +164,11 @@ void TransactionImpl::Rollback() { Clear(); } Status TransactionImpl::RollbackToSavePoint() { // Unlock any keys locked since last transaction - const TransactionKeyMap* keys = GetTrackedKeysSinceSavePoint(); + const std::unique_ptr& keys = + GetTrackedKeysSinceSavePoint(); + if (keys) { - txn_db_impl_->UnLock(this, keys); + txn_db_impl_->UnLock(this, keys.get()); } return TransactionBaseImpl::RollbackToSavePoint(); @@ -193,7 +233,8 @@ Status TransactionImpl::LockBatch(WriteBatch* batch, if (!s.ok()) { break; } - (*keys_to_unlock)[cfh_id].insert({std::move(key), kMaxSequenceNumber}); + TrackKey(keys_to_unlock, cfh_id, std::move(key), kMaxSequenceNumber, + false); } if (!s.ok()) { @@ -214,7 +255,8 @@ Status TransactionImpl::LockBatch(WriteBatch* batch, // this key will only be locked if there have been no writes to this key since // the snapshot time. Status TransactionImpl::TryLock(ColumnFamilyHandle* column_family, - const Slice& key, bool untracked) { + const Slice& key, bool read_only, + bool untracked) { uint32_t cfh_id = GetColumnFamilyID(column_family); std::string key_str = key.ToString(); bool previously_locked; @@ -234,7 +276,7 @@ Status TransactionImpl::TryLock(ColumnFamilyHandle* column_family, previously_locked = false; } else { previously_locked = true; - current_seqno = iter->second; + current_seqno = iter->second.seq; } } @@ -281,7 +323,7 @@ Status TransactionImpl::TryLock(ColumnFamilyHandle* column_family, if (s.ok()) { // Let base class know we've conflict checked this key. 
- TrackKey(cfh_id, key_str, new_seqno); + TrackKey(cfh_id, key_str, new_seqno, read_only); } return s; @@ -295,7 +337,7 @@ Status TransactionImpl::ValidateSnapshot(ColumnFamilyHandle* column_family, SequenceNumber* new_seqno) { assert(snapshot_); - SequenceNumber seq = snapshot_->snapshot()->GetSequenceNumber(); + SequenceNumber seq = snapshot_->GetSequenceNumber(); if (prev_seqno <= seq) { // If the key has been previous validated at a sequence number earlier // than the curent snapshot's sequence number, we already know it has not @@ -311,9 +353,21 @@ Status TransactionImpl::ValidateSnapshot(ColumnFamilyHandle* column_family, ColumnFamilyHandle* cfh = column_family ? column_family : db_impl->DefaultColumnFamily(); - return TransactionUtil::CheckKeyForConflicts( - db_impl, cfh, key.ToString(), snapshot_->snapshot()->GetSequenceNumber(), - false /* cache_only */); + return TransactionUtil::CheckKeyForConflicts(db_impl, cfh, key.ToString(), + snapshot_->GetSequenceNumber(), + false /* cache_only */); +} + +bool TransactionImpl::TryStealingLocks() { + assert(IsExpired()); + ExecutionStatus expected = STARTED; + return std::atomic_compare_exchange_strong(&exec_status_, &expected, + LOCKS_STOLEN); +} + +void TransactionImpl::UnlockGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) { + txn_db_impl_->UnLock(this, GetColumnFamilyID(column_family), key.ToString()); } } // namespace rocksdb diff --git a/utilities/transactions/transaction_impl.h b/utilities/transactions/transaction_impl.h index 0fa087d67..cb02e2834 100644 --- a/utilities/transactions/transaction_impl.h +++ b/utilities/transactions/transaction_impl.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -38,6 +38,9 @@ class TransactionImpl : public TransactionBaseImpl { virtual ~TransactionImpl(); + void Reinitialize(TransactionDB* txn_db, const WriteOptions& write_options, + const TransactionOptions& txn_options); + Status Commit() override; Status CommitBatch(WriteBatch* batch); @@ -66,28 +69,38 @@ class TransactionImpl : public TransactionBaseImpl { lock_timeout_ = timeout * 1000; } + // Returns true if locks were stolen successfully, false otherwise. + bool TryStealingLocks(); + protected: Status TryLock(ColumnFamilyHandle* column_family, const Slice& key, - bool untracked = false) override; + bool read_only, bool untracked = false) override; private: + enum ExecutionStatus { STARTED, COMMITTING, LOCKS_STOLEN }; + TransactionDBImpl* txn_db_impl_; // Used to create unique ids for transactions. static std::atomic txn_id_counter_; // Unique ID for this transaction - const TransactionID txn_id_; + TransactionID txn_id_; // If non-zero, this transaction should not be committed after this time (in // microseconds according to Env->NowMicros()) - const uint64_t expiration_time_; + uint64_t expiration_time_; // Timeout in microseconds when locking a key or -1 if there is no timeout. int64_t lock_timeout_; + // Execution status of the transaction. 
+ std::atomic exec_status_; + void Clear() override; + void Initialize(const TransactionOptions& txn_options); + Status ValidateSnapshot(ColumnFamilyHandle* column_family, const Slice& key, SequenceNumber prev_seqno, SequenceNumber* new_seqno); @@ -97,6 +110,9 @@ class TransactionImpl : public TransactionBaseImpl { void RollbackLastN(size_t num); + void UnlockGetForUpdate(ColumnFamilyHandle* column_family, + const Slice& key) override; + // No copying allowed TransactionImpl(const TransactionImpl&); void operator=(const TransactionImpl&); @@ -116,6 +132,8 @@ class TransactionCallback : public WriteCallback { } } + bool AllowWriteBatching() override { return true; } + private: TransactionImpl* txn_; }; diff --git a/utilities/transactions/transaction_lock_mgr.cc b/utilities/transactions/transaction_lock_mgr.cc index 80e4fb8d9..f4fd9aff2 100644 --- a/utilities/transactions/transaction_lock_mgr.cc +++ b/utilities/transactions/transaction_lock_mgr.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -25,6 +25,7 @@ #include "util/autovector.h" #include "util/murmurhash.h" #include "util/thread_local.h" +#include "utilities/transactions/transaction_db_impl.h" namespace rocksdb { @@ -99,12 +100,16 @@ void UnrefLockMapsCache(void* ptr) { } // anonymous namespace TransactionLockMgr::TransactionLockMgr( - size_t default_num_stripes, int64_t max_num_locks, + TransactionDB* txn_db, size_t default_num_stripes, int64_t max_num_locks, std::shared_ptr mutex_factory) - : default_num_stripes_(default_num_stripes), + : txn_db_impl_(nullptr), + default_num_stripes_(default_num_stripes), max_num_locks_(max_num_locks), mutex_factory_(mutex_factory), - lock_maps_cache_(new ThreadLocalPtr(&UnrefLockMapsCache)) {} + lock_maps_cache_(new ThreadLocalPtr(&UnrefLockMapsCache)) { + txn_db_impl_ = dynamic_cast(txn_db); + assert(txn_db_impl_); +} TransactionLockMgr::~TransactionLockMgr() {} @@ -197,6 +202,11 @@ bool TransactionLockMgr::IsLockExpired(const LockInfo& lock_info, Env* env, // return how many microseconds until lock will be expired *expire_time = lock_info.expiration_time; } else { + bool success = + txn_db_impl_->TryStealingExpiredTransactionLocks(lock_info.txn_id); + if (!success) { + expired = false; + } *expire_time = 0; } diff --git a/utilities/transactions/transaction_lock_mgr.h b/utilities/transactions/transaction_lock_mgr.h index 8f640d4ca..5018f39c3 100644 --- a/utilities/transactions/transaction_lock_mgr.h +++ b/utilities/transactions/transaction_lock_mgr.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -24,10 +24,12 @@ struct LockMap; struct LockMapStripe; class Slice; +class TransactionDBImpl; class TransactionLockMgr { public: - TransactionLockMgr(size_t default_num_stripes, int64_t max_num_locks, + TransactionLockMgr(TransactionDB* txn_db, size_t default_num_stripes, + int64_t max_num_locks, std::shared_ptr factory); ~TransactionLockMgr(); @@ -53,6 +55,8 @@ class TransactionLockMgr { const std::string& key, Env* env); private: + TransactionDBImpl* txn_db_impl_; + // Default number of lock map stripes per column family const size_t default_num_stripes_; diff --git a/utilities/transactions/transaction_test.cc b/utilities/transactions/transaction_test.cc index 911212317..f7a1f2ed8 100644 --- a/utilities/transactions/transaction_test.cc +++ b/utilities/transactions/transaction_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
@@ -6,6 +6,7 @@ #ifndef ROCKSDB_LITE #include +#include #include "db/db_impl.h" #include "rocksdb/db.h" @@ -14,8 +15,11 @@ #include "rocksdb/utilities/transaction_db.h" #include "table/mock_table.h" #include "util/logging.h" +#include "util/random.h" +#include "util/sync_point.h" #include "util/testharness.h" #include "util/testutil.h" +#include "util/transaction_test_util.h" #include "utilities/merge_operators.h" #include "utilities/merge_operators/string_append/stringappend.h" @@ -61,6 +65,17 @@ class TransactionTest : public testing::Test { } }; +TEST_F(TransactionTest, DoubleEmptyWrite) { + WriteOptions write_options; + write_options.sync = true; + write_options.disableWAL = false; + + WriteBatch batch; + + ASSERT_OK(db->Write(write_options, &batch)); + ASSERT_OK(db->Write(write_options, &batch)); +} + TEST_F(TransactionTest, SuccessTest) { WriteOptions write_options; ReadOptions read_options; @@ -436,7 +451,6 @@ TEST_F(TransactionTest, FlushTest2) { s = txn->Delete("S"); // Should fail after encountering a write to S in SST file - fprintf(stderr, "%" ROCKSDB_PRIszt " %s\n", n, s.ToString().c_str()); ASSERT_TRUE(s.IsBusy()); // Write a bunch of keys to db to force a compaction @@ -1198,6 +1212,97 @@ TEST_F(TransactionTest, ExpiredTransaction) { delete txn2; } +TEST_F(TransactionTest, ReinitializeTest) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + string value; + Status s; + + // Set txn expiration timeout to 0 microseconds (expires instantly) + txn_options.expiration = 0; + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + + // Reinitialize transaction to no long expire + txn_options.expiration = -1; + txn1 = db->BeginTransaction(write_options, txn_options, txn1); + + s = txn1->Put("Z", "z"); + ASSERT_OK(s); + + // Should commit since not expired + s = txn1->Commit(); + ASSERT_OK(s); + + txn1 = db->BeginTransaction(write_options, txn_options, txn1); + + s = txn1->Put("Z", "zz"); + 
ASSERT_OK(s); + + // Reinitilize txn1 and verify that Z gets unlocked + txn1 = db->BeginTransaction(write_options, txn_options, txn1); + + Transaction* txn2 = db->BeginTransaction(write_options, txn_options, nullptr); + s = txn2->Put("Z", "zzz"); + ASSERT_OK(s); + s = txn2->Commit(); + ASSERT_OK(s); + delete txn2; + + s = db->Get(read_options, "Z", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "zzz"); + + // Verify snapshots get reinitialized correctly + txn1->SetSnapshot(); + s = txn1->Put("Z", "zzzz"); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "Z", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "zzzz"); + + txn1 = db->BeginTransaction(write_options, txn_options, txn1); + const Snapshot* snapshot = txn1->GetSnapshot(); + ASSERT_FALSE(snapshot); + + txn_options.set_snapshot = true; + txn1 = db->BeginTransaction(write_options, txn_options, txn1); + snapshot = txn1->GetSnapshot(); + ASSERT_TRUE(snapshot); + + s = txn1->Put("Z", "a"); + ASSERT_OK(s); + + txn1->Rollback(); + + s = txn1->Put("Y", "y"); + ASSERT_OK(s); + + txn_options.set_snapshot = false; + txn1 = db->BeginTransaction(write_options, txn_options, txn1); + snapshot = txn1->GetSnapshot(); + ASSERT_FALSE(snapshot); + + s = txn1->Put("X", "x"); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + + s = db->Get(read_options, "Z", &value); + ASSERT_OK(s); + ASSERT_EQ(value, "zzzz"); + + s = db->Get(read_options, "Y", &value); + ASSERT_TRUE(s.IsNotFound()); + + delete txn1; +} + TEST_F(TransactionTest, Rollback) { WriteOptions write_options; ReadOptions read_options; @@ -1716,9 +1821,8 @@ TEST_F(TransactionTest, SavepointTest) { TEST_F(TransactionTest, SavepointTest2) { WriteOptions write_options; - ReadOptions read_options, snapshot_read_options; + ReadOptions read_options; TransactionOptions txn_options; - string value; Status s; txn_options.lock_timeout = 1; // 1 ms @@ -1813,6 +1917,356 @@ TEST_F(TransactionTest, SavepointTest2) { delete txn2; } 
+TEST_F(TransactionTest, UndoGetForUpdateTest) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + string value; + Status s; + + txn_options.lock_timeout = 1; // 1 ms + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn1); + + txn1->UndoGetForUpdate("A"); + + s = txn1->Commit(); + ASSERT_OK(s); + delete txn1; + + txn1 = db->BeginTransaction(write_options, txn_options); + + txn1->UndoGetForUpdate("A"); + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_TRUE(s.IsNotFound()); + + // Verify that A is locked + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + s = txn2->Put("A", "a"); + ASSERT_TRUE(s.IsTimedOut()); + + txn1->UndoGetForUpdate("A"); + + // Verify that A is now unlocked + s = txn2->Put("A", "a2"); + ASSERT_OK(s); + txn2->Commit(); + delete txn2; + s = db->Get(read_options, "A", &value); + ASSERT_OK(s); + ASSERT_EQ("a2", value); + + s = txn1->Delete("A"); + ASSERT_OK(s); + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn1->Put("B", "b3"); + ASSERT_OK(s); + s = txn1->GetForUpdate(read_options, "B", &value); + ASSERT_OK(s); + + txn1->UndoGetForUpdate("A"); + txn1->UndoGetForUpdate("B"); + + // Verify that A and B are still locked + txn2 = db->BeginTransaction(write_options, txn_options); + s = txn2->Put("A", "a4"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("B", "b4"); + ASSERT_TRUE(s.IsTimedOut()); + + txn1->Rollback(); + delete txn1; + + // Verify that A and B are no longer locked + s = txn2->Put("A", "a5"); + ASSERT_OK(s); + s = txn2->Put("B", "b5"); + ASSERT_OK(s); + s = txn2->Commit(); + delete txn2; + ASSERT_OK(s); + + txn1 = db->BeginTransaction(write_options, txn_options); + + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_OK(s); + s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_OK(s); + s = txn1->GetForUpdate(read_options, "C", &value); + ASSERT_TRUE(s.IsNotFound()); 
+ s = txn1->GetForUpdate(read_options, "A", &value); + ASSERT_OK(s); + s = txn1->GetForUpdate(read_options, "C", &value); + ASSERT_TRUE(s.IsNotFound()); + s = txn1->GetForUpdate(read_options, "B", &value); + ASSERT_OK(s); + s = txn1->Put("B", "b5"); + s = txn1->GetForUpdate(read_options, "B", &value); + ASSERT_OK(s); + + txn1->UndoGetForUpdate("A"); + txn1->UndoGetForUpdate("B"); + txn1->UndoGetForUpdate("C"); + txn1->UndoGetForUpdate("X"); + + // Verify A,B,C are locked + txn2 = db->BeginTransaction(write_options, txn_options); + s = txn2->Put("A", "a6"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Delete("B"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c6"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("X", "x6"); + ASSERT_OK(s); + + txn1->UndoGetForUpdate("A"); + txn1->UndoGetForUpdate("B"); + txn1->UndoGetForUpdate("C"); + txn1->UndoGetForUpdate("X"); + + // Verify A,B are locked and C is not + s = txn2->Put("A", "a6"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Delete("B"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c6"); + ASSERT_OK(s); + s = txn2->Put("X", "x6"); + ASSERT_OK(s); + + txn1->UndoGetForUpdate("A"); + txn1->UndoGetForUpdate("B"); + txn1->UndoGetForUpdate("C"); + txn1->UndoGetForUpdate("X"); + + // Verify B is locked and A and C are not + s = txn2->Put("A", "a7"); + ASSERT_OK(s); + s = txn2->Delete("B"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c7"); + ASSERT_OK(s); + s = txn2->Put("X", "x7"); + ASSERT_OK(s); + + s = txn2->Commit(); + ASSERT_OK(s); + delete txn2; + + s = txn1->Commit(); + ASSERT_OK(s); + delete txn1; +} + +TEST_F(TransactionTest, UndoGetForUpdateTest2) { + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + string value; + Status s; + + s = db->Put(write_options, "A", ""); + ASSERT_OK(s); + + txn_options.lock_timeout = 1; // 1 ms + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + ASSERT_TRUE(txn1); + + s = 
txn1->GetForUpdate(read_options, "A", &value); + ASSERT_OK(s); + s = txn1->GetForUpdate(read_options, "B", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn1->Put("F", "f"); + ASSERT_OK(s); + + txn1->SetSavePoint(); // 1 + + txn1->UndoGetForUpdate("A"); + + s = txn1->GetForUpdate(read_options, "C", &value); + ASSERT_TRUE(s.IsNotFound()); + s = txn1->GetForUpdate(read_options, "D", &value); + ASSERT_TRUE(s.IsNotFound()); + + s = txn1->Put("E", "e"); + ASSERT_OK(s); + s = txn1->GetForUpdate(read_options, "E", &value); + ASSERT_OK(s); + + s = txn1->GetForUpdate(read_options, "F", &value); + ASSERT_OK(s); + + // Verify A,B,C,D,E,F are still locked + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + s = txn2->Put("A", "a1"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("B", "b1"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c1"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("D", "d1"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("E", "e1"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("F", "f1"); + ASSERT_TRUE(s.IsTimedOut()); + + txn1->UndoGetForUpdate("C"); + txn1->UndoGetForUpdate("E"); + + // Verify A,B,D,E,F are still locked and C is not. + s = txn2->Put("A", "a2"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("B", "b2"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("D", "d2"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("E", "e2"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("F", "f2"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c2"); + ASSERT_OK(s); + + txn1->SetSavePoint(); // 2 + + s = txn1->Put("H", "h"); + ASSERT_OK(s); + + txn1->UndoGetForUpdate("A"); + txn1->UndoGetForUpdate("B"); + txn1->UndoGetForUpdate("C"); + txn1->UndoGetForUpdate("D"); + txn1->UndoGetForUpdate("E"); + txn1->UndoGetForUpdate("F"); + txn1->UndoGetForUpdate("G"); + txn1->UndoGetForUpdate("H"); + + // Verify A,B,D,E,F,H are still locked and C,G are not. 
+ s = txn2->Put("A", "a3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("B", "b3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("D", "d3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("E", "e3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("F", "f3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("H", "h3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c3"); + ASSERT_OK(s); + s = txn2->Put("G", "g3"); + ASSERT_OK(s); + + txn1->RollbackToSavePoint(); // rollback to 2 + + // Verify A,B,D,E,F are still locked and C,G,H are not. + s = txn2->Put("A", "a3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("B", "b3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("D", "d3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("E", "e3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("F", "f3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c3"); + ASSERT_OK(s); + s = txn2->Put("G", "g3"); + ASSERT_OK(s); + s = txn2->Put("H", "h3"); + ASSERT_OK(s); + + txn1->UndoGetForUpdate("A"); + txn1->UndoGetForUpdate("B"); + txn1->UndoGetForUpdate("C"); + txn1->UndoGetForUpdate("D"); + txn1->UndoGetForUpdate("E"); + txn1->UndoGetForUpdate("F"); + txn1->UndoGetForUpdate("G"); + txn1->UndoGetForUpdate("H"); + + // Verify A,B,E,F are still locked and C,D,G,H are not. + s = txn2->Put("A", "a3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("B", "b3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("E", "e3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("F", "f3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c3"); + ASSERT_OK(s); + s = txn2->Put("D", "d3"); + ASSERT_OK(s); + s = txn2->Put("G", "g3"); + ASSERT_OK(s); + s = txn2->Put("H", "h3"); + ASSERT_OK(s); + + txn1->RollbackToSavePoint(); // rollback to 1 + + // Verify A,B,F are still locked and C,D,E,G,H are not. 
+ s = txn2->Put("A", "a3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("B", "b3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("F", "f3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("C", "c3"); + ASSERT_OK(s); + s = txn2->Put("D", "d3"); + ASSERT_OK(s); + s = txn2->Put("E", "e3"); + ASSERT_OK(s); + s = txn2->Put("G", "g3"); + ASSERT_OK(s); + s = txn2->Put("H", "h3"); + ASSERT_OK(s); + + txn1->UndoGetForUpdate("A"); + txn1->UndoGetForUpdate("B"); + txn1->UndoGetForUpdate("C"); + txn1->UndoGetForUpdate("D"); + txn1->UndoGetForUpdate("E"); + txn1->UndoGetForUpdate("F"); + txn1->UndoGetForUpdate("G"); + txn1->UndoGetForUpdate("H"); + + // Verify F is still locked and A,B,C,D,E,G,H are not. + s = txn2->Put("F", "f3"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Put("A", "a3"); + ASSERT_OK(s); + s = txn2->Put("B", "b3"); + ASSERT_OK(s); + s = txn2->Put("C", "c3"); + ASSERT_OK(s); + s = txn2->Put("D", "d3"); + ASSERT_OK(s); + s = txn2->Put("E", "e3"); + ASSERT_OK(s); + s = txn2->Put("G", "g3"); + ASSERT_OK(s); + s = txn2->Put("H", "h3"); + ASSERT_OK(s); + + s = txn1->Commit(); + ASSERT_OK(s); + s = txn2->Commit(); + ASSERT_OK(s); + + delete txn1; + delete txn2; +} + TEST_F(TransactionTest, TimeoutTest) { WriteOptions write_options; ReadOptions read_options; @@ -2483,6 +2937,119 @@ TEST_F(TransactionTest, ToggleAutoCompactionTest) { } } +TEST_F(TransactionTest, ExpiredTransactionDataRace1) { + // In this test, txn1 should succeed committing, + // as the callback is called after txn1 starts committing. 
+ rocksdb::SyncPoint::GetInstance()->LoadDependency( + {{"TransactionTest::ExpirableTransactionDataRace:1"}}); + rocksdb::SyncPoint::GetInstance()->SetCallBack( + "TransactionTest::ExpirableTransactionDataRace:1", [&](void* arg) { + WriteOptions write_options; + TransactionOptions txn_options; + + // Force txn1 to expire + /* sleep override */ + std::this_thread::sleep_for(std::chrono::milliseconds(150)); + + Transaction* txn2 = db->BeginTransaction(write_options, txn_options); + Status s; + s = txn2->Put("X", "2"); + ASSERT_TRUE(s.IsTimedOut()); + s = txn2->Commit(); + ASSERT_OK(s); + delete txn2; + }); + + rocksdb::SyncPoint::GetInstance()->EnableProcessing(); + + WriteOptions write_options; + TransactionOptions txn_options; + + txn_options.expiration = 100; + Transaction* txn1 = db->BeginTransaction(write_options, txn_options); + + Status s; + s = txn1->Put("X", "1"); + ASSERT_OK(s); + s = txn1->Commit(); + ASSERT_OK(s); + + ReadOptions read_options; + string value; + s = db->Get(read_options, "X", &value); + ASSERT_EQ("1", value); + + delete txn1; + rocksdb::SyncPoint::GetInstance()->DisableProcessing(); +} + +namespace { +Status TransactionStressTestInserter(TransactionDB* db, + const size_t num_transactions, + const size_t num_sets, + const size_t num_keys_per_set) { + size_t seed = std::hash()(std::this_thread::get_id()); + Random64 _rand(seed); + WriteOptions write_options; + ReadOptions read_options; + TransactionOptions txn_options; + txn_options.set_snapshot = true; + + RandomTransactionInserter inserter(&_rand, write_options, read_options, + num_keys_per_set, + static_cast(num_sets)); + + for (size_t t = 0; t < num_transactions; t++) { + bool success = inserter.TransactionDBInsert(db, txn_options); + if (!success) { + // unexpected failure + return inserter.GetLastStatus(); + } + } + + // Make sure at least some of the transactions succeeded. It's ok if + // some failed due to write-conflicts. 
+ if (inserter.GetFailureCount() > num_transactions / 2) { + return Status::TryAgain("Too many transactions failed! " + + std::to_string(inserter.GetFailureCount()) + " / " + + std::to_string(num_transactions)); + } + + return Status::OK(); +} +} // namespace + +TEST_F(TransactionTest, TransactionStressTest) { + const size_t num_threads = 4; + const size_t num_transactions_per_thread = 10000; + const size_t num_sets = 3; + const size_t num_keys_per_set = 100; + // Setting the key-space to be 100 keys should cause enough write-conflicts + // to make this test interesting. + + std::vector threads; + + std::function call_inserter = [&] { + ASSERT_OK(TransactionStressTestInserter(db, num_transactions_per_thread, + num_sets, num_keys_per_set)); + }; + + // Create N threads that use RandomTransactionInserter to write + // many transactions. + for (uint32_t i = 0; i < num_threads; i++) { + threads.emplace_back(call_inserter); + } + + // Wait for all threads to run + for (auto& t : threads) { + t.join(); + } + + // Verify that data is consistent + Status s = RandomTransactionInserter::Verify(db, num_sets); + ASSERT_OK(s); +} + } // namespace rocksdb int main(int argc, char** argv) { diff --git a/utilities/transactions/transaction_util.cc b/utilities/transactions/transaction_util.cc index 0cf4c7329..363e8dece 100644 --- a/utilities/transactions/transaction_util.cc +++ b/utilities/transactions/transaction_util.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -137,7 +137,7 @@ Status TransactionUtil::CheckKeysForConflicts(DBImpl* db_impl, // written to this key since the start of the transaction. 
for (const auto& key_iter : keys) { const auto& key = key_iter.first; - const SequenceNumber key_seq = key_iter.second; + const SequenceNumber key_seq = key_iter.second.seq; result = CheckKey(db_impl, sv, earliest_seq, key_seq, key, cache_only); diff --git a/utilities/transactions/transaction_util.h b/utilities/transactions/transaction_util.h index b2ce7da19..b9579f7f1 100644 --- a/utilities/transactions/transaction_util.h +++ b/utilities/transactions/transaction_util.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. @@ -17,9 +17,20 @@ namespace rocksdb { +struct TransactionKeyMapInfo { + // Earliest sequence number that is relevant to this transaction for this key + SequenceNumber seq; + + uint32_t num_writes; + uint32_t num_reads; + + explicit TransactionKeyMapInfo(SequenceNumber seq_no) + : seq(seq_no), num_writes(0), num_reads(0) {} +}; + using TransactionKeyMap = std::unordered_map>; + std::unordered_map>; class DBImpl; struct SuperVersion; diff --git a/utilities/write_batch_with_index/write_batch_with_index.cc b/utilities/write_batch_with_index/write_batch_with_index.cc index bae4535d5..e57c95c42 100644 --- a/utilities/write_batch_with_index/write_batch_with_index.cc +++ b/utilities/write_batch_with_index/write_batch_with_index.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
diff --git a/utilities/write_batch_with_index/write_batch_with_index_internal.cc b/utilities/write_batch_with_index/write_batch_with_index_internal.cc index ba88e67d4..7b1a6dd27 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_internal.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_internal.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/write_batch_with_index/write_batch_with_index_internal.h b/utilities/write_batch_with_index/write_batch_with_index_internal.h index b88cd768e..ec4da19e4 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_internal.h +++ b/utilities/write_batch_with_index/write_batch_with_index_internal.h @@ -1,4 +1,4 @@ -// Copyright (c) 2015, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. diff --git a/utilities/write_batch_with_index/write_batch_with_index_test.cc b/utilities/write_batch_with_index/write_batch_with_index_test.cc index da695c4ca..d91482db4 100644 --- a/utilities/write_batch_with_index/write_batch_with_index_test.cc +++ b/utilities/write_batch_with_index/write_batch_with_index_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under the BSD-style license found in the // LICENSE file in the root directory of this source tree. 
An additional grant // of patent rights can be found in the PATENTS file in the same directory.