merge from master

main
Praveen Rao 9 years ago
commit eb24178553
  1. .travis.yml (2 changes)
  2. CMakeLists.txt (9 changes)
  3. HISTORY.md (12 changes)
  4. INSTALL.md (13 changes)
  5. Makefile (35 changes)
  6. build_tools/build_detect_platform (3 changes)
  7. build_tools/rocksdb-lego-determinator (67 changes)
  8. db/builder.cc (17 changes)
  9. db/builder.h (6 changes)
  10. db/compaction.cc (36 changes)
  11. db/compaction.h (11 changes)
  12. db/compaction_iterator.cc (44 changes)
  13. db/compaction_iterator.h (2 changes)
  14. db/compaction_iterator_test.cc (3 changes)
  15. db/compaction_job.cc (35 changes)
  16. db/compaction_job_test.cc (140 changes)
  17. db/compaction_picker_test.cc (147 changes)
  18. db/db_compaction_filter_test.cc (55 changes)
  19. db/db_compaction_test.cc (2 changes)
  20. db/db_dynamic_level_test.cc (2 changes)
  21. db/db_impl.cc (34 changes)
  22. db/db_impl.h (3 changes)
  23. db/db_inplace_update_test.cc (2 changes)
  24. db/db_log_iter_test.cc (2 changes)
  25. db/db_tailing_iter_test.cc (2 changes)
  26. db/db_test.cc (355 changes)
  27. db/db_test_util.cc (27 changes)
  28. db/db_test_util.h (4 changes)
  29. db/db_universal_compaction_test.cc (2 changes)
  30. db/db_wal_test.cc (2 changes)
  31. db/event_helpers.cc (2 changes)
  32. db/filename.cc (26 changes)
  33. db/filename.h (4 changes)
  34. db/flush_job.cc (16 changes)
  35. db/flush_job.h (2 changes)
  36. db/listener_test.cc (56 changes)
  37. db/memtable.cc (25 changes)
  38. db/merge_helper.cc (113 changes)
  39. db/merge_helper.h (38 changes)
  40. db/merge_helper_test.cc (229 changes)
  41. db/repair.cc (9 changes)
  42. db/table_properties_collector.h (13 changes)
  43. db/table_properties_collector_test.cc (12 changes)
  44. db/version_edit.cc (124 changes)
  45. db/version_edit.h (2 changes)
  46. db/version_edit_test.cc (116 changes)
  47. db/version_set.h (1 change)
  48. examples/.gitignore (7 changes)
  49. examples/Makefile (5 changes)
  50. examples/compaction_filter_example.cc (84 changes)
  51. examples/rocksdb_option_file_example.ini (182 changes)
  52. include/rocksdb/cache.h (3 changes)
  53. include/rocksdb/compaction_filter.h (118 changes)
  54. include/rocksdb/convenience.h (3 changes)
  55. include/rocksdb/db_dump_tool.h (45 changes)
  56. include/rocksdb/env.h (3 changes)
  57. include/rocksdb/listener.h (12 changes)
  58. include/rocksdb/options.h (12 changes)
  59. include/rocksdb/perf_context.h (8 changes)
  60. include/rocksdb/statistics.h (6 changes)
  61. include/rocksdb/table.h (2 changes)
  62. include/rocksdb/table_properties.h (9 changes)
  63. include/rocksdb/utilities/transaction.h (37 changes)
  64. include/rocksdb/version.h (2 changes)
  65. java/crossbuild/build-linux-centos.sh (8 changes)
  66. port/win/env_win.cc (11 changes)
  67. port/win/port_win.cc (51 changes)
  68. port/win/port_win.h (54 changes)
  69. src.mk (3 changes)
  70. table/adaptive_table_factory.cc (5 changes)
  71. table/adaptive_table_factory.h (2 changes)
  72. table/block_based_filter_block.cc (12 changes)
  73. table/block_based_table_builder.cc (15 changes)
  74. table/block_based_table_builder.h (3 changes)
  75. table/block_based_table_factory.cc (6 changes)
  76. table/block_based_table_factory.h (2 changes)
  77. table/block_based_table_reader.cc (8 changes)
  78. table/cuckoo_table_factory.cc (2 changes)
  79. table/cuckoo_table_factory.h (2 changes)
  80. table/full_filter_block.cc (9 changes)
  81. table/get_context.cc (4 changes)
  82. table/mock_table.cc (2 changes)
  83. table/mock_table.h (2 changes)
  84. table/plain_table_builder.cc (10 changes)
  85. table/plain_table_builder.h (10 changes)
  86. table/plain_table_factory.cc (9 changes)
  87. table/plain_table_factory.h (2 changes)
  88. table/plain_table_reader.cc (12 changes)
  89. table/sst_file_writer.cc (7 changes)
  90. table/table_properties.cc (3 changes)
  91. table/table_reader_bench.cc (2 changes)
  92. table/table_test.cc (37 changes)
  93. tools/dump/db_dump_tool.cc (261 changes)
  94. tools/dump/rocksdb_dump.cc (167 changes)
  95. tools/dump/rocksdb_undump.cc (170 changes)
  96. tools/rocksdb_dump_test.sh (4 changes)
  97. util/cache.cc (5 changes)
  98. util/env.cc (2 changes)
  99. util/env_posix.cc (40 changes)
  100. util/options.cc (3 changes)
  Some files were not shown because too many files have changed in this diff.

--- a/.travis.yml
+++ b/.travis.yml
@@ -33,7 +33,7 @@ before_script:
 # Lousy hack to disable use and testing of fallocate, which doesn't behave quite
 # as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment.
 script:
-  - if [[ "${TRAVIS_OS_NAME}" == 'linux' ]]; then OPT=-DTRAVIS CLANG_FORMAT_DIFF=/tmp/clang-format-diff.py make format; fi
+  - if [[ "${TRAVIS_OS_NAME}" == 'linux' ]]; then OPT=-DTRAVIS CLANG_FORMAT_DIFF=/tmp/clang-format-diff.py make format || true; fi
   - OPT=-DTRAVIS V=1 make -j4 check && OPT=-DTRAVIS V=1 make clean jclean rocksdbjava jtest
 notifications:

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,8 +13,8 @@
 #        cd build
 # 3. Run cmake to generate project files for Windows, add more options to enable required third-party libraries.
 #    See thirdparty.inc for more information.
-#        cmake -G "Visual Studio 12 Win64" .. <more options>
-# 4. Then build the project in debug mode (you may want to add /m[:<N>] flag to run msbuild in <N> parallel threads)
+#        sample command: cmake -G "Visual Studio 12 Win64" -DGFLAGS=1 -DSNAPPY=1 -DJEMALLOC=1 ..
+# 4. Then build the project in debug mode (you may want to add /m:<N> flag to run msbuild in <N> parallel threads)
 #        msbuild ALL_BUILD.vcxproj
 # 5. And release mode (/m[:<N>] is also supported)
 #        msbuild ALL_BUILD.vcxproj /p:Configuration=Release
@@ -66,13 +66,13 @@ endif()
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oxt /Zp8 /Gm- /Gy /MD")
-set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DEBUG")
 set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /DEBUG")
 add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64)
 include_directories(${PROJECT_SOURCE_DIR})
 include_directories(${PROJECT_SOURCE_DIR}/include)
+include_directories(${PROJECT_SOURCE_DIR}/port)
 include_directories(${PROJECT_SOURCE_DIR}/third-party/gtest-1.7.0/fused-src)
 set(ROCKSDB_LIBS rocksdblib${ARTIFACT_SUFFIX})
@@ -100,6 +100,7 @@ set(SOURCES
         db/db_impl_experimental.cc
         db/db_impl_readonly.cc
         db/db_iter.cc
+        db/db_test_util.cc
         db/event_helpers.cc
         db/experimental.cc
         db/filename.cc

@@ -163,6 +164,7 @@ set(SOURCES
         table/plain_table_reader.cc
         table/table_properties.cc
         table/two_level_iterator.cc
+        tools/dump/db_dump_tool.cc
         util/arena.cc
         util/auto_roll_logger.cc
         util/bloom.cc

@@ -174,7 +176,6 @@ set(SOURCES
         util/crc32c.cc
         util/db_info_dumper.cc
         util/delete_scheduler_impl.cc
-        util/db_test_util.cc
         util/dynamic_bloom.cc
         util/env.cc
         util/env_hdfs.cc

--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,19 +1,29 @@
 # Rocksdb Change Log
 ## Unreleased
+### Public API Changes
+* CompactionFilter::Context includes information of Column Family ID
+* The need-compaction hint given by TablePropertiesCollector::NeedCompact() will be persistent and recoverable after DB recovery. This introduces a breaking format change. If you use this experimental feature, including NewCompactOnDeletionCollectorFactory() in the new version, you may not be able to directly downgrade the DB back to version 4.0 or lower.
+* TablePropertiesCollectorFactory::CreateTablePropertiesCollector() now takes an option Context, containing the information of column family ID for the file being written.
+
+## 4.1.0 (10/8/2015)
 ### New Features
 * Added single delete operation as a more efficient way to delete keys that have not been overwritten.
 * Added experimental AddFile() to DB interface that allow users to add files created by SstFileWriter into an empty Database, see include/rocksdb/sst_file_writer.h and DB::AddFile() for more info.
+* Added support for opening SST files with .ldb suffix which enables opening LevelDB databases.
+* CompactionFilter now supports filtering of merge operands and merge results.
 ### Public API Changes
 * Added SingleDelete() to the DB interface.
 * Added AddFile() to DB interface.
 * Added SstFileWriter class.
+* CompactionFilter has a new method FilterMergeOperand() that RocksDB applies to every merge operand during compaction to decide whether to filter the operand.
+* We removed CompactionFilterV2 interfaces from include/rocksdb/compaction_filter.h. The functionality was deprecated already in version 3.13.

 ## 4.0.0 (9/9/2015)
 ### New Features
 * Added support for transactions. See include/rocksdb/utilities/transaction.h for more info.
-* DB::GetProperty() now accept "rocksdb.aggregated-table-properties" and "rocksdb.aggregated-table-properties-at-levelN", in which case it returns aggregated table properties of the target column family, or the aggregated table properties of the specified level N if the "at-level" version is used.
+* DB::GetProperty() now accepts "rocksdb.aggregated-table-properties" and "rocksdb.aggregated-table-properties-at-levelN", in which case it returns aggregated table properties of the target column family, or the aggregated table properties of the specified level N if the "at-level" version is used.
 * Add compression option kZSTDNotFinalCompression for people to experiment ZSTD although its format is not finalized.
 * We removed the need for LATEST_BACKUP file in BackupEngine. We still keep writing it when we create new backups (because of backward compatibility), but we don't read it anymore.
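As an aside, a minimal sketch of what the two CompactionFilter additions above look like from user code. The class name and the filtering rules are hypothetical; the FilterMergeOperand(level, key, operand) signature is an assumption consistent with the test usage in the db/compaction_job_test.cc hunks further down.

#include <string>
#include "rocksdb/compaction_filter.h"
#include "rocksdb/slice.h"

// Drops keys prefixed "tmp_" and merge operands equal to "noop".
// Returning true from FilterMergeOperand() discards that operand from the
// compaction output; fully merged values still go through Filter().
class ExampleFilter : public rocksdb::CompactionFilter {
 public:
  bool Filter(int /*level*/, const rocksdb::Slice& key,
              const rocksdb::Slice& /*existing_value*/,
              std::string* /*new_value*/,
              bool* /*value_changed*/) const override {
    return key.starts_with("tmp_");  // drop scratch keys
  }
  bool FilterMergeOperand(int /*level*/, const rocksdb::Slice& /*key*/,
                          const rocksdb::Slice& operand) const override {
    return operand == rocksdb::Slice("noop");  // drop no-op operands
  }
  const char* Name() const override { return "ExampleFilter"; }
};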

--- a/INSTALL.md
+++ b/INSTALL.md
@@ -1,19 +1,24 @@
 ## Compilation
+**Important**: If you plan to run RocksDB in production, don't compile using default
+`make` or `make all`. That will compile RocksDB in debug mode, which is much slower
+than release mode.
+
 RocksDB's library should be able to compile without any dependency installed,
 although we recommend installing some compression libraries (see below).
 We do depend on newer gcc/clang with C++11 support.

 There are few options when compiling RocksDB:

-* [recommended] `make static_lib` will compile librocksdb.a, RocksDB static library.
+* [recommended] `make static_lib` will compile librocksdb.a, RocksDB static library. Compiles static library in release mode.

-* `make shared_lib` will compile librocksdb.so, RocksDB shared library.
+* `make shared_lib` will compile librocksdb.so, RocksDB shared library. Compiles shared library in release mode.

-* `make check` will compile and run all the unit tests
+* `make check` will compile and run all the unit tests. `make check` will compile RocksDB in debug mode.

 * `make all` will compile our static library, and all our tools and unit tests. Our tools
-depend on gflags. You will need to have gflags installed to run `make all`.
+depend on gflags. You will need to have gflags installed to run `make all`. This will compile RocksDB in debug mode. Don't
+use binaries compiled by `make all` in production.

 * By default the binary we produce is optimized for the platform you're compiling on
 (-march=native). If you want to build a portable binary, add 'PORTABLE=1' before

--- a/Makefile
+++ b/Makefile
@@ -76,6 +76,8 @@ endif
 ifeq ($(DEBUG_LEVEL),0)
 OPT += -DNDEBUG
 DISABLE_WARNING_AS_ERROR=1
+else
+$(warning Warning: Compiling in debug mode. Don't use the resulting binary in production)
 endif

 #-----------------------------------------------

@@ -322,10 +324,14 @@ TOOLS = \
 BENCHMARKS = db_bench table_reader_bench cache_bench memtablerep_bench

-# The library name is configurable since we are maintaining libraries of both
-# debug/release mode.
+# if user didn't config LIBNAME, set the default
 ifeq ($(LIBNAME),)
+# we should only run rocksdb in production with DEBUG_LEVEL 0
+ifeq ($(DEBUG_LEVEL),0)
         LIBNAME=librocksdb
+else
+        LIBNAME=librocksdb_debug
+endif
 endif
 LIBRARY = ${LIBNAME}.a
@@ -602,6 +608,11 @@ unity.a: unity.o
	$(AM_V_AR)rm -f $@
	$(AM_V_at)$(AR) $(ARFLAGS) $@ unity.o

+# try compiling db_test with unity
+unity_test: db/db_test.o db/db_test_util.o $(TESTHARNESS) unity.a
+	$(AM_LINK)
+	./unity_test
+
 rocksdb.h rocksdb.cc: build_tools/amalgamate.py Makefile $(LIB_SOURCES) unity.cc
	build_tools/amalgamate.py -I. -i./include unity.cc -x include/rocksdb/c.h -H rocksdb.h -o rocksdb.cc

@@ -698,34 +709,34 @@ crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
 slice_transform_test: util/slice_transform_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(AM_LINK)

-db_test: db/db_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
+db_test: db/db_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
	$(AM_LINK)

-db_log_iter_test: db/db_log_iter_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
+db_log_iter_test: db/db_log_iter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
	$(AM_LINK)

-db_compaction_filter_test: db/db_compaction_filter_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
+db_compaction_filter_test: db/db_compaction_filter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
	$(AM_LINK)

-db_compaction_test: db/db_compaction_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
+db_compaction_test: db/db_compaction_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
	$(AM_LINK)

-db_dynamic_level_test: db/db_dynamic_level_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
+db_dynamic_level_test: db/db_dynamic_level_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
	$(AM_LINK)

-db_inplace_update_test: db/db_inplace_update_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
+db_inplace_update_test: db/db_inplace_update_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
	$(AM_LINK)

-db_tailing_iter_test: db/db_tailing_iter_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
+db_tailing_iter_test: db/db_tailing_iter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
	$(AM_LINK)

 db_iter_test: db/db_iter_test.o $(LIBOBJECTS) $(TESTHARNESS)
	$(AM_LINK)

-db_universal_compaction_test: db/db_universal_compaction_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
+db_universal_compaction_test: db/db_universal_compaction_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
	$(AM_LINK)

-db_wal_test: db/db_wal_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
+db_wal_test: db/db_wal_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
	$(AM_LINK)

 log_write_bench: util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS)

@@ -1037,7 +1048,7 @@ rocksdbjavastatic: $(java_libobjects) libz.a libbz2.a libsnappy.a liblz4.a
	$(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC \
	  -o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) \
	  $(java_libobjects) $(COVERAGEFLAGS) \
-	  libz.a libbz2.a libsnappy.a liblz4.a $(LDFLAGS)
+	  libz.a libbz2.a libsnappy.a liblz4.a $(JAVA_STATIC_LDFLAGS)
	cd java/target;strip -S -x $(ROCKSDBJNILIB)
	cd java;jar -cf target/$(ROCKSDB_JAR) HISTORY*.md
	cd java/target;jar -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB)

--- a/build_tools/build_detect_platform
+++ b/build_tools/build_detect_platform
@@ -8,6 +8,7 @@
 #   CXX                     C++ Compiler path
 #   PLATFORM_LDFLAGS        Linker flags
 #   JAVA_LDFLAGS            Linker flags for RocksDBJava
+#   JAVA_STATIC_LDFLAGS     Linker flags for RocksDBJava static build
 #   PLATFORM_SHARED_EXT     Extension for shared libraries
 #   PLATFORM_SHARED_LDFLAGS Flags for building shared library
 #   PLATFORM_SHARED_CFLAGS  Flags for compiling objects for shared library

@@ -181,6 +182,7 @@ esac
 PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}"
 JAVA_LDFLAGS="$PLATFORM_LDFLAGS"
+JAVA_STATIC_LDFLAGS="$PLATFORM_LDFLAGS"

 if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then
     # Cross-compiling; do not try any compilation tests.

@@ -374,6 +376,7 @@ echo "CXX=$CXX" >> "$OUTPUT"
 echo "PLATFORM=$PLATFORM" >> "$OUTPUT"
 echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> "$OUTPUT"
 echo "JAVA_LDFLAGS=$JAVA_LDFLAGS" >> "$OUTPUT"
+echo "JAVA_STATIC_LDFLAGS=$JAVA_STATIC_LDFLAGS" >> "$OUTPUT"
 echo "VALGRIND_VER=$VALGRIND_VER" >> "$OUTPUT"
 echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> "$OUTPUT"
 echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> "$OUTPUT"

--- a/build_tools/rocksdb-lego-determinator
+++ b/build_tools/rocksdb-lego-determinator
@@ -36,7 +36,6 @@ if [ ! -z $ONCALL ]; then
   {
     'type':'task',
     'triggers':[ 'fail' ],
-    'assignee':'$ONCALL',
     'priority':0,
     'subscribers':[ '$SUBSCRIBER' ],
     'tags':[ 'rocksdb', 'ci' ],

@@ -69,6 +68,7 @@ CLANG="USE_CLANG=1"
 LITE="OPT=-DROCKSDB_LITE"
 TSAN="COMPILE_WITH_TSAN=1"
 DISABLE_JEMALLOC="DISABLE_JEMALLOC=1"
+PARSER="'parser':'egrep \'Failure|^#|Abort\''"

 #
 # A mechanism to disable tests temporarily

@@ -101,13 +101,15 @@ PARALLEL_UNIT_TEST_COMMANDS="[
     {
       'name':'Build and test RocksDB debug version',
       'shell':'$DEBUG make -j$(nproc) all && $SHM make check > /dev/null 2>&1 || cat t/log-*',
-      'user':'root'
+      'user':'root',
+      $PARSER
     },
     $CLEANUP_ENV,
     {
       'name':'Build and test RocksDB debug version under gcc-4.8.1',
       'shell':'$GCC_481 $DEBUG make -j$(nproc) all && $SHM make check > /dev/null 2>&1 || cat t/log-*',
-      'user':'root'
+      'user':'root',
+      $PARSER
     },
   ],
   $REPORT

@@ -126,7 +128,8 @@ UNIT_TEST_COMMANDS="[
     {
       'name':'Build and test RocksDB debug version',
       'shell':'$SHM $DEBUG make J=1 check',
-      'user':'root'
+      'user':'root',
+      $PARSER
     },
   ],
   $REPORT

@@ -145,7 +148,8 @@ UNIT_TEST_COMMANDS_481="[
     {
       'name':'Build and test RocksDB debug version',
       'shell':'$SHM $GCC_481 $DEBUG make J=1 check',
-      'user':'root'
+      'user':'root',
+      $PARSER
     },
   ],
   $REPORT

@@ -164,7 +168,8 @@ CLANG_UNIT_TEST_COMMANDS="[
     {
       'name':'Build and test RocksDB debug',
       'shell':'$CLANG $SHM $DEBUG make J=1 check',
-      'user':'root'
+      'user':'root',
+      $PARSER
     },
   ],
   $REPORT

@@ -183,7 +188,8 @@ CLANG_ANALYZE_COMMANDS="[
     {
       'name':'RocksDB build and analyze',
       'shell':'$CLANG $SHM $DEBUG make J=1 analyze',
-      'user':'root'
+      'user':'root',
+      $PARSER
     },
   ],
   $REPORT

@@ -202,7 +208,8 @@ CODE_COV_COMMANDS="[
     {
       'name':'Build, test and collect code coverage info',
       'shell':'$SHM $DEBUG make J=1 coverage',
-      'user':'root'
+      'user':'root',
+      $PARSER
     },
   ],
   $REPORT

@@ -220,8 +227,9 @@ UNITY_COMMANDS="[
     $CLEANUP_ENV,
     {
       'name':'Build, test unity test',
-      'shell':'$SHM $DEBUG V=1 make J=1 unity',
-      'user':'root'
+      'shell':'$SHM $DEBUG V=1 make J=1 unity_test',
+      'user':'root',
+      $PARSER
     },
   ],
   $REPORT

@@ -240,7 +248,8 @@ LITE_BUILD_COMMANDS="[
     {
       'name':'Build RocksDB debug version',
       'shell':'$LITE $DEBUG make J=1 static_lib',
-      'user':'root'
+      'user':'root',
+      $PARSER
     },
   ],
   $REPORT

@@ -260,21 +269,21 @@ STRESS_CRASH_TEST_COMMANDS="[
     {
       'name':'Build and run RocksDB debug stress tests',
       'shell':'$SHM $DEBUG make J=1 db_stress',
-      'user':'root'
+      'user':'root',
+      $PARSER
     },
     {
       'name':'Build and run RocksDB debug crash tests',
       'timeout': 86400,
       'shell':'$SHM $DEBUG make J=1 crash_test',
-      'user':'root'
+      'user':'root',
+      $PARSER
     }
   ],
   $REPORT
 }
 ]"
-STRESS_CRASH_TEST_COMMANDS=$DISABLE_COMMANDS

 #
 # RocksDB test under address sanitizer
 #

@@ -287,7 +296,8 @@ ASAN_TEST_COMMANDS="[
     {
       'name':'Test RocksDB debug under ASAN',
       'shell':'set -o pipefail && $SHM $ASAN $DEBUG make J=1 asan_check |& /usr/facebook/ops/scripts/asan_symbolize.py -d',
-      'user':'root'
+      'user':'root',
+      $PARSER
     }
   ],
   $REPORT

@@ -308,15 +318,14 @@ ASAN_CRASH_TEST_COMMANDS="[
       'name':'Build and run RocksDB debug asan_crash_test',
       'timeout': 86400,
       'shell':'$SHM $DEBUG make J=1 asan_crash_test',
-      'user':'root'
+      'user':'root',
+      $PARSER
     },
   ],
   $REPORT
 }
 ]"
-ASAN_CRASH_TEST_COMMANDS=$DISABLE_COMMANDS

 #
 # RocksDB unit test under valgrind
 #

@@ -329,7 +338,8 @@ VALGRIND_TEST_COMMANDS="[
     {
       'name':'Run RocksDB debug unit tests',
       'shell':'$DISABLE_JEMALLOC $SHM $DEBUG make valgrind_check',
-      'user':'root'
+      'user':'root',
+      $PARSER
     },
   ],
   $REPORT

@@ -348,7 +358,8 @@ TSAN_UNIT_TEST_COMMANDS="[
     {
       'name':'Run RocksDB debug unit test',
       'shell':'set -o pipefail && $SHM $DEBUG $TSAN make J=1 check',
-      'user':'root'
+      'user':'root',
+      $PARSER
     },
   ],
   $REPORT

@@ -369,15 +380,14 @@ TSAN_CRASH_TEST_COMMANDS="[
       'name':'Compile and run',
       'timeout': 86400,
       'shell':'set -o pipefail && $SHM $DEBUG $TSAN make J=1 crash_test',
-      'user':'root'
+      'user':'root',
+      $PARSER
     },
   ],
   $REPORT
 }
 ]"
-TSAN_CRASH_TEST_COMMANDS=$DISABLE_COMMANDS

 #
 # RocksDB format compatible
 #

@@ -432,7 +442,8 @@ FORMAT_COMPATIBLE_COMMANDS="[
     {
       'name':'Run RocksDB debug unit test',
       'shell':'build_tools/rocksdb-lego-determinator run_format_compatible',
-      'user':'root'
+      'user':'root',
+      $PARSER
     },
   ],
   $REPORT

@@ -464,7 +475,8 @@ NO_COMPRESSION_COMMANDS="[
     {
       'name':'Run RocksDB debug unit test',
       'shell':'build_tools/rocksdb-lego-determinator run_no_compression',
-      'user':'root'
+      'user':'root',
+      $PARSER
     },
   ],
   $REPORT

@@ -520,7 +532,8 @@ REGRESSION_COMMANDS="[
     {
       'name':'Make and run script',
       'shell':'build_tools/rocksdb-lego-determinator run_regression',
-      'user':'root'
+      'user':'root',
+      $PARSER
     },
   ],
   $REPORT

--- a/db/builder.cc
+++ b/db/builder.cc
@@ -40,13 +40,14 @@ TableBuilder* NewTableBuilder(
     const InternalKeyComparator& internal_comparator,
     const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
         int_tbl_prop_collector_factories,
-    WritableFileWriter* file, const CompressionType compression_type,
+    uint32_t column_family_id, WritableFileWriter* file,
+    const CompressionType compression_type,
     const CompressionOptions& compression_opts, const bool skip_filters) {
   return ioptions.table_factory->NewTableBuilder(
       TableBuilderOptions(ioptions, internal_comparator,
                           int_tbl_prop_collector_factories, compression_type,
                           compression_opts, skip_filters),
-      file);
+      column_family_id, file);
 }

 Status BuildTable(
@@ -55,7 +56,8 @@ Status BuildTable(
     FileMetaData* meta, const InternalKeyComparator& internal_comparator,
     const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
         int_tbl_prop_collector_factories,
-    std::vector<SequenceNumber> snapshots, const CompressionType compression,
+    uint32_t column_family_id, std::vector<SequenceNumber> snapshots,
+    const CompressionType compression,
     const CompressionOptions& compression_opts, bool paranoid_file_checks,
     InternalStats* internal_stats, const Env::IOPriority io_priority,
     TableProperties* table_properties) {

@@ -82,13 +84,14 @@ Status BuildTable(
     builder = NewTableBuilder(
         ioptions, internal_comparator, int_tbl_prop_collector_factories,
-        file_writer.get(), compression, compression_opts);
+        column_family_id, file_writer.get(), compression, compression_opts);
   }

-  MergeHelper merge(internal_comparator.user_comparator(),
-                    ioptions.merge_operator, ioptions.info_log,
+  MergeHelper merge(env, internal_comparator.user_comparator(),
+                    ioptions.merge_operator, nullptr, ioptions.info_log,
                     ioptions.min_partial_merge_operands,
-                    true /* internal key corruption is not ok */);
+                    true /* internal key corruption is not ok */,
+                    snapshots.empty() ? 0 : snapshots.back());

   CompactionIterator c_iter(iter, internal_comparator.user_comparator(),
                             &merge, kMaxSequenceNumber, &snapshots, env,

--- a/db/builder.h
+++ b/db/builder.h
@@ -37,7 +37,8 @@ TableBuilder* NewTableBuilder(
     const InternalKeyComparator& internal_comparator,
     const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
         int_tbl_prop_collector_factories,
-    WritableFileWriter* file, const CompressionType compression_type,
+    uint32_t column_family_id, WritableFileWriter* file,
+    const CompressionType compression_type,
     const CompressionOptions& compression_opts,
     const bool skip_filters = false);

@@ -52,7 +53,8 @@ extern Status BuildTable(
     FileMetaData* meta, const InternalKeyComparator& internal_comparator,
     const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
         int_tbl_prop_collector_factories,
-    std::vector<SequenceNumber> snapshots, const CompressionType compression,
+    uint32_t column_family_id, std::vector<SequenceNumber> snapshots,
+    const CompressionType compression,
     const CompressionOptions& compression_opts, bool paranoid_file_checks,
     InternalStats* internal_stats,
     const Env::IOPriority io_priority = Env::IO_HIGH,

--- a/db/compaction.cc
+++ b/db/compaction.cc
@@ -50,15 +50,34 @@ void Compaction::GetBoundaryKeys(
     if (inputs[i].files.empty()) {
       continue;
     }
-    const Slice& start_user_key = inputs[i].files[0]->smallest.user_key();
-    if (!initialized || ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
-      *smallest_user_key = start_user_key;
-    }
-    const Slice& end_user_key = inputs[i].files.back()->largest.user_key();
-    if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) {
-      *largest_user_key = end_user_key;
-    }
-    initialized = true;
+    if (inputs[i].level == 0) {
+      // we need to consider all files on level 0
+      for (const auto* f : inputs[i].files) {
+        const Slice& start_user_key = f->smallest.user_key();
+        if (!initialized ||
+            ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+          *smallest_user_key = start_user_key;
+        }
+        const Slice& end_user_key = f->largest.user_key();
+        if (!initialized ||
+            ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+          *largest_user_key = end_user_key;
+        }
+        initialized = true;
+      }
+    } else {
+      // we only need to consider the first and last file
+      const Slice& start_user_key = inputs[i].files[0]->smallest.user_key();
+      if (!initialized ||
+          ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+        *smallest_user_key = start_user_key;
+      }
+      const Slice& end_user_key = inputs[i].files.back()->largest.user_key();
+      if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+        *largest_user_key = end_user_key;
+      }
+      initialized = true;
+    }
   }
 }

@@ -420,6 +439,7 @@ std::unique_ptr<CompactionFilter> Compaction::CreateCompactionFilter() const {
   CompactionFilter::Context context;
   context.is_full_compaction = is_full_compaction_;
   context.is_manual_compaction = is_manual_compaction_;
+  context.column_family_id = cfd_->GetID();
   return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter(
       context);
 }
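The context.column_family_id hunk above is what lets a factory vary its filter per column family. A small sketch under that assumption; kMetaCfId and the keep-all filter are hypothetical, and the CreateCompactionFilter() signature is the existing factory API:

#include <cstdint>
#include <memory>
#include <string>
#include "rocksdb/compaction_filter.h"

// A trivial filter that keeps everything; stands in for a real
// CF-specific filter.
class KeepAllFilter : public rocksdb::CompactionFilter {
 public:
  bool Filter(int /*level*/, const rocksdb::Slice& /*key*/,
              const rocksdb::Slice& /*existing_value*/,
              std::string* /*new_value*/,
              bool* /*value_changed*/) const override {
    return false;  // keep every key
  }
  const char* Name() const override { return "KeepAllFilter"; }
};

class PerCfFilterFactory : public rocksdb::CompactionFilterFactory {
 public:
  std::unique_ptr<rocksdb::CompactionFilter> CreateCompactionFilter(
      const rocksdb::CompactionFilter::Context& context) override {
    // column_family_id is the field added by this commit.
    if (context.column_family_id == kMetaCfId) {
      return std::unique_ptr<rocksdb::CompactionFilter>(new KeepAllFilter());
    }
    return nullptr;  // no filtering for other column families
  }
  const char* Name() const override { return "PerCfFilterFactory"; }

 private:
  static const uint32_t kMetaCfId = 1;  // hypothetical CF id
};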

--- a/db/compaction.h
+++ b/db/compaction.h
@@ -210,6 +210,14 @@ class Compaction {
       int output_level, VersionStorageInfo* vstorage,
       const std::vector<CompactionInputFiles>& inputs);

+  TablePropertiesCollection GetOutputTableProperties() const {
+    return output_table_properties_;
+  }
+
+  void SetOutputTableProperties(TablePropertiesCollection tp) {
+    output_table_properties_ = std::move(tp);
+  }
+
  private:
   // mark (or clear) all files that are being compacted
   void MarkFilesBeingCompacted(bool mark_as_compacted);

@@ -273,6 +281,9 @@ class Compaction {
   // Does input compression match the output compression?
   bool InputCompressionMatchesOutput() const;
+
+  // table properties of output files
+  TablePropertiesCollection output_table_properties_;
 };

 // Utility function
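GetOutputTableProperties() above hands back a TablePropertiesCollection, which include/rocksdb/table_properties.h defines as a map from file name to shared_ptr<const TableProperties>. A sketch of consuming such a collection; the function itself is illustrative, not part of this change:

#include <cstdio>
#include "rocksdb/table_properties.h"

// Prints a one-line summary per output table file.
void DumpOutputTableProperties(
    const rocksdb::TablePropertiesCollection& props) {
  for (const auto& kv : props) {
    const rocksdb::TableProperties& tp = *kv.second;
    std::printf("%s: %llu entries, %llu data bytes, %llu index bytes\n",
                kv.first.c_str(),
                static_cast<unsigned long long>(tp.num_entries),
                static_cast<unsigned long long>(tp.data_size),
                static_cast<unsigned long long>(tp.index_size));
  }
}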

--- a/db/compaction_iterator.cc
+++ b/db/compaction_iterator.cc
@@ -12,16 +12,14 @@ namespace rocksdb {
 CompactionIterator::CompactionIterator(
     Iterator* input, const Comparator* cmp, MergeHelper* merge_helper,
     SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
-    Env* env, bool expect_valid_internal_key, Statistics* stats,
-    Compaction* compaction, const CompactionFilter* compaction_filter,
-    LogBuffer* log_buffer)
+    Env* env, bool expect_valid_internal_key, Compaction* compaction,
+    const CompactionFilter* compaction_filter, LogBuffer* log_buffer)
     : input_(input),
       cmp_(cmp),
       merge_helper_(merge_helper),
       snapshots_(snapshots),
       env_(env),
       expect_valid_internal_key_(expect_valid_internal_key),
-      stats_(stats),
       compaction_(compaction),
       compaction_filter_(compaction_filter),
       log_buffer_(log_buffer),

@@ -277,24 +275,30 @@ void CompactionIterator::NextFromInput() {
       // have hit (A)
       // We encapsulate the merge related state machine in a different
       // object to minimize change to the existing flow.
-      merge_helper_->MergeUntil(input_, prev_snapshot, bottommost_level_,
-                                stats_, env_);
+      merge_helper_->MergeUntil(input_, prev_snapshot, bottommost_level_);
       merge_out_iter_.SeekToFirst();

-      // NOTE: key, value, and ikey_ refer to old entries.
-      //       These will be correctly set below.
-      key_ = merge_out_iter_.key();
-      value_ = merge_out_iter_.value();
-      bool valid_key __attribute__((__unused__)) =
-          ParseInternalKey(key_, &ikey_);
-      // MergeUntil stops when it encounters a corrupt key and does not
-      // include them in the result, so we expect the keys here to valid.
-      assert(valid_key);
-      // Keep current_key_ in sync.
-      current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
-      key_ = current_key_.GetKey();
-      ikey_.user_key = current_key_.GetUserKey();
-      valid_ = true;
+      if (merge_out_iter_.Valid()) {
+        // NOTE: key, value, and ikey_ refer to old entries.
+        //       These will be correctly set below.
+        key_ = merge_out_iter_.key();
+        value_ = merge_out_iter_.value();
+        bool valid_key __attribute__((__unused__)) =
+            ParseInternalKey(key_, &ikey_);
+        // MergeUntil stops when it encounters a corrupt key and does not
+        // include them in the result, so we expect the keys here to valid.
+        assert(valid_key);
+        // Keep current_key_ in sync.
+        current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+        key_ = current_key_.GetKey();
+        ikey_.user_key = current_key_.GetUserKey();
+        valid_ = true;
+      } else {
+        // all merge operands were filtered out. reset the user key, since the
+        // batch consumed by the merge operator should not shadow any keys
+        // coming after the merges
+        has_current_user_key_ = false;
+      }
     } else {
       valid_ = true;
     }

--- a/db/compaction_iterator.h
+++ b/db/compaction_iterator.h
@@ -41,7 +41,6 @@ class CompactionIterator {
                      MergeHelper* merge_helper, SequenceNumber last_sequence,
                      std::vector<SequenceNumber>* snapshots, Env* env,
                      bool expect_valid_internal_key,
-                     Statistics* stats = nullptr,
                      Compaction* compaction = nullptr,
                      const CompactionFilter* compaction_filter = nullptr,
                      LogBuffer* log_buffer = nullptr);

@@ -91,7 +90,6 @@ class CompactionIterator {
   const std::vector<SequenceNumber>* snapshots_;
   Env* env_;
   bool expect_valid_internal_key_;
-  Statistics* stats_;
   Compaction* compaction_;
   const CompactionFilter* compaction_filter_;
   LogBuffer* log_buffer_;

--- a/db/compaction_iterator_test.cc
+++ b/db/compaction_iterator_test.cc
@@ -16,7 +16,8 @@ class CompactionIteratorTest : public testing::Test {
   void InitIterator(const std::vector<std::string>& ks,
                     const std::vector<std::string>& vs,
                     SequenceNumber last_sequence) {
-    merge_helper_.reset(new MergeHelper(cmp_, nullptr, nullptr, 0U, false));
+    merge_helper_.reset(new MergeHelper(Env::Default(), cmp_, nullptr, nullptr,
+                                        nullptr, 0U, false, 0));
     iter_.reset(new test::VectorIterator(ks, vs));
     iter_->SeekToFirst();
     c_iter_.reset(new CompactionIterator(iter_.get(), cmp_, merge_helper_.get(),

--- a/db/compaction_job.cc
+++ b/db/compaction_job.cc
@@ -77,6 +77,7 @@ struct CompactionJob::SubcompactionState {
   struct Output {
     FileMetaData meta;
     bool finished;
+    std::shared_ptr<const TableProperties> table_properties;
   };

   // State kept for output being generated

@@ -487,6 +488,16 @@ Status CompactionJob::Run() {
     }
   }

+  TablePropertiesCollection tp;
+  for (const auto& state : compact_->sub_compact_states) {
+    for (const auto& output : state.outputs) {
+      auto fn = TableFileName(db_options_.db_paths, output.meta.fd.GetNumber(),
+                              output.meta.fd.GetPathId());
+      tp[fn] = output.table_properties;
+    }
+  }
+  compact_->compaction->SetOutputTableProperties(std::move(tp));
+
   // Finish up all book-keeping to unify the subcompaction results
   AggregateStatistics();
   UpdateCompactionStats();

@@ -597,10 +608,6 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
   }

   ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
-  MergeHelper merge(cfd->user_comparator(), cfd->ioptions()->merge_operator,
-                    db_options_.info_log.get(),
-                    cfd->ioptions()->min_partial_merge_operands,
-                    false /* internal key corruption is expected */);
   auto compaction_filter = cfd->ioptions()->compaction_filter;
   std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
   if (compaction_filter == nullptr) {

@@ -608,6 +615,13 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
         sub_compact->compaction->CreateCompactionFilter();
     compaction_filter = compaction_filter_from_factory.get();
   }
+  MergeHelper merge(
+      env_, cfd->user_comparator(), cfd->ioptions()->merge_operator,
+      compaction_filter, db_options_.info_log.get(),
+      cfd->ioptions()->min_partial_merge_operands,
+      false /* internal key corruption is expected */,
+      existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
+      compact_->compaction->level(), db_options_.statistics.get());

   TEST_SYNC_POINT("CompactionJob::Run():Inprogress");

@@ -624,8 +638,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
   Status status;
   sub_compact->c_iter.reset(new CompactionIterator(
       input.get(), cfd->user_comparator(), &merge, versions_->LastSequence(),
-      &existing_snapshots_, env_, false, db_options_.statistics.get(),
-      sub_compact->compaction, compaction_filter));
+      &existing_snapshots_, env_, false, sub_compact->compaction,
+      compaction_filter));
   auto c_iter = sub_compact->c_iter.get();
   c_iter->SeekToFirst();
   const auto& c_iter_stats = c_iter->iter_stats();

@@ -811,7 +825,10 @@ Status CompactionJob::FinishCompactionOutputFile(
   delete iter;

   if (s.ok()) {
-    TableFileCreationInfo info(sub_compact->builder->GetTableProperties());
+    auto tp = sub_compact->builder->GetTableProperties();
+    sub_compact->current_output()->table_properties =
+        std::make_shared<TableProperties>(tp);
+    TableFileCreationInfo info(std::move(tp));
     info.db_name = dbname_;
     info.cf_name = cfd->GetName();
     info.file_path =

@@ -924,8 +941,8 @@ Status CompactionJob::OpenCompactionOutputFile(
       cfd->ioptions()->optimize_filters_for_hits && bottommost_level_;
   sub_compact->builder.reset(NewTableBuilder(
       *cfd->ioptions(), cfd->internal_comparator(),
-      cfd->int_tbl_prop_collector_factories(), sub_compact->outfile.get(),
-      sub_compact->compaction->output_compression(),
+      cfd->int_tbl_prop_collector_factories(), cfd->GetID(),
+      sub_compact->outfile.get(), sub_compact->compaction->output_compression(),
       cfd->ioptions()->compression_opts, skip_filters));
   LogFlush(db_options_.info_log);
   return s;
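The per-output properties captured in FinishCompactionOutputFile() above are what feed Compaction::SetOutputTableProperties(). Per the include/rocksdb/listener.h entry in the file list, they plausibly also surface through CompactionJobInfo; a sketch under that assumption (the table_properties member is inferred, not quoted from this diff):

#include "rocksdb/listener.h"

class CompactionPropsListener : public rocksdb::EventListener {
 public:
  void OnCompactionCompleted(rocksdb::DB* /*db*/,
                             const rocksdb::CompactionJobInfo& info) override {
    // Assumed: info.table_properties maps output file name -> properties.
    for (const auto& kv : info.table_properties) {
      (void)kv;  // inspect kv.first (file) and *kv.second here
    }
  }
};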

--- a/db/compaction_job_test.cc
+++ b/db/compaction_job_test.cc
@@ -182,7 +182,7 @@ class CompactionJobTest : public testing::Test {
     return expected_results;
   }

-  void NewDB(std::shared_ptr<MergeOperator> merge_operator = nullptr) {
+  void NewDB() {
     VersionEdit new_db;
     new_db.SetLogNumber(0);
     new_db.SetNextFile(2);

@@ -207,7 +207,8 @@ class CompactionJobTest : public testing::Test {
     std::vector<ColumnFamilyDescriptor> column_families;
     cf_options_.table_factory = mock_table_factory_;
-    cf_options_.merge_operator = merge_operator;
+    cf_options_.merge_operator = merge_op_;
+    cf_options_.compaction_filter = compaction_filter_.get();
     column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);

     EXPECT_OK(versions_->Recover(column_families, false));

@@ -258,10 +259,16 @@ class CompactionJobTest : public testing::Test {
         &mutex_));
     mutex_.Unlock();

-    ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U);
-    ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files);
-    ASSERT_EQ(compaction_job_stats_.num_output_files, 1U);
-    mock_table_factory_->AssertLatestFile(expected_results);
+    if (expected_results.size() == 0) {
+      ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U);
+      ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files);
+      ASSERT_EQ(compaction_job_stats_.num_output_files, 0U);
+    } else {
+      ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U);
+      ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files);
+      ASSERT_EQ(compaction_job_stats_.num_output_files, 1U);
+      mock_table_factory_->AssertLatestFile(expected_results);
+    }
   }

   Env* env_;

@@ -279,6 +286,8 @@ class CompactionJobTest : public testing::Test {
   std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
   CompactionJobStats compaction_job_stats_;
   ColumnFamilyData* cfd_;
+  std::unique_ptr<CompactionFilter> compaction_filter_;
+  std::shared_ptr<MergeOperator> merge_op_;
 };

 TEST_F(CompactionJobTest, Simple) {
@@ -297,7 +306,7 @@ TEST_F(CompactionJobTest, SimpleCorrupted) {
   auto expected_results = CreateTwoFiles(true);
   auto cfd = versions_->GetColumnFamilySet()->GetDefault();
   auto files = cfd->current()->storage_info()->LevelFiles(0);
-  RunCompaction({ files }, expected_results);
+  RunCompaction({files}, expected_results);
   ASSERT_EQ(compaction_job_stats_.num_corrupt_keys, 400U);
 }

@@ -317,7 +326,7 @@ TEST_F(CompactionJobTest, SimpleDeletion) {
   SetLastSequence(4U);
   auto files = cfd_->current()->storage_info()->LevelFiles(0);
-  RunCompaction({ files }, expected_results);
+  RunCompaction({files}, expected_results);
 }

 TEST_F(CompactionJobTest, SimpleOverwrite) {

@@ -339,7 +348,7 @@ TEST_F(CompactionJobTest, SimpleOverwrite) {
   SetLastSequence(4U);
   auto files = cfd_->current()->storage_info()->LevelFiles(0);
-  RunCompaction({ files }, expected_results);
+  RunCompaction({files}, expected_results);
 }

 TEST_F(CompactionJobTest, SimpleNonLastLevel) {

@@ -368,12 +377,12 @@ TEST_F(CompactionJobTest, SimpleNonLastLevel) {
   SetLastSequence(6U);
   auto lvl0_files = cfd_->current()->storage_info()->LevelFiles(0);
   auto lvl1_files = cfd_->current()->storage_info()->LevelFiles(1);
-  RunCompaction({ lvl0_files, lvl1_files }, expected_results);
+  RunCompaction({lvl0_files, lvl1_files}, expected_results);
 }

 TEST_F(CompactionJobTest, SimpleMerge) {
-  auto merge_op = MergeOperators::CreateStringAppendOperator();
-  NewDB(merge_op);
+  merge_op_ = MergeOperators::CreateStringAppendOperator();
+  NewDB();

   auto file1 = mock::MakeMockFile({
       {KeyStr("a", 5U, kTypeMerge), "5"},

@@ -392,12 +401,12 @@ TEST_F(CompactionJobTest, SimpleMerge) {
   SetLastSequence(5U);
   auto files = cfd_->current()->storage_info()->LevelFiles(0);
-  RunCompaction({ files }, expected_results);
+  RunCompaction({files}, expected_results);
 }

 TEST_F(CompactionJobTest, NonAssocMerge) {
-  auto merge_op = MergeOperators::CreateStringAppendTESTOperator();
-  NewDB(merge_op);
+  merge_op_ = MergeOperators::CreateStringAppendTESTOperator();
+  NewDB();

   auto file1 = mock::MakeMockFile({
       {KeyStr("a", 5U, kTypeMerge), "5"},
@@ -417,7 +426,106 @@ TEST_F(CompactionJobTest, NonAssocMerge) {
   SetLastSequence(5U);
   auto files = cfd_->current()->storage_info()->LevelFiles(0);
-  RunCompaction({ files }, expected_results);
+  RunCompaction({files}, expected_results);
+}
+
+// Filters merge operands with value 10.
+TEST_F(CompactionJobTest, MergeOperandFilter) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+  compaction_filter_.reset(new test::FilterNumber(10U));
+  NewDB();
+
+  auto file1 = mock::MakeMockFile(
+      {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)},
+       {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)},  // Filtered
+       {KeyStr("a", 3U, kTypeMerge), test::EncodeInt(3U)}});
+  AddMockFile(file1);
+
+  auto file2 = mock::MakeMockFile({
+      {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(2U)},
+      {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)}  // Filtered
+  });
+  AddMockFile(file2);
+
+  auto expected_results =
+      mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), test::EncodeInt(8U)},
+                          {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(2U)}});
+
+  SetLastSequence(5U);
+  auto files = cfd_->current()->storage_info()->LevelFiles(0);
+  RunCompaction({files}, expected_results);
+}
+
+TEST_F(CompactionJobTest, FilterSomeMergeOperands) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+  compaction_filter_.reset(new test::FilterNumber(10U));
+  NewDB();
+
+  auto file1 = mock::MakeMockFile(
+      {{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)},
+       {KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)},  // Filtered
+       {KeyStr("a", 3U, kTypeValue), test::EncodeInt(5U)},
+       {KeyStr("d", 8U, kTypeMerge), test::EncodeInt(10U)}});
+  AddMockFile(file1);
+
+  auto file2 =
+      mock::MakeMockFile({{KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(3U)},
+                          {KeyStr("c", 1U, kTypeValue), test::EncodeInt(7U)},
+                          {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}});
+  AddMockFile(file2);
+
+  auto file3 =
+      mock::MakeMockFile({{KeyStr("a", 1U, kTypeMerge), test::EncodeInt(3U)}});
+  AddMockFile(file3, 2);
+
+  auto expected_results = mock::MakeMockFile({
+      {KeyStr("a", 5U, kTypeValue), test::EncodeInt(10U)},
+      {KeyStr("c", 2U, kTypeValue), test::EncodeInt(10U)},
+      {KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}
+      // b does not appear because the operands are filtered
+  });
+
+  SetLastSequence(5U);
+  auto files = cfd_->current()->storage_info()->LevelFiles(0);
+  RunCompaction({files}, expected_results);
+}
+
+// Test where all operands/merge results are filtered out.
+TEST_F(CompactionJobTest, FilterAllMergeOperands) {
+  merge_op_ = MergeOperators::CreateUInt64AddOperator();
+  compaction_filter_.reset(new test::FilterNumber(10U));
+  NewDB();
+
+  auto file1 =
+      mock::MakeMockFile({{KeyStr("a", 11U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("a", 10U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("a", 9U, kTypeMerge), test::EncodeInt(10U)}});
+  AddMockFile(file1);
+
+  auto file2 =
+      mock::MakeMockFile({{KeyStr("b", 8U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 7U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 6U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 5U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 4U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 3U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("c", 2U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("c", 1U, kTypeMerge), test::EncodeInt(10U)}});
+  AddMockFile(file2);
+
+  auto file3 =
+      mock::MakeMockFile({{KeyStr("a", 2U, kTypeMerge), test::EncodeInt(10U)},
+                          {KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)}});
+  AddMockFile(file3, 2);
+
+  SetLastSequence(11U);
+  auto files = cfd_->current()->storage_info()->LevelFiles(0);
+
+  stl_wrappers::KVMap empty_map;
+  RunCompaction({files}, empty_map);
 }

 TEST_F(CompactionJobTest, SimpleSingleDelete) {
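The three new tests above rely on test::FilterNumber and test::EncodeInt from the test utilities, which are not shown in this diff. A hedged reconstruction of the filter's likely behavior, with a local decoder standing in for the real helpers:

#include <cstdint>
#include <cstring>
#include <string>
#include "rocksdb/compaction_filter.h"

// Plausible shape of test::FilterNumber: drop any value or merge operand
// whose 8-byte encoding equals the configured number (10 in the tests).
class FilterNumberSketch : public rocksdb::CompactionFilter {
 public:
  explicit FilterNumberSketch(uint64_t n) : n_(n) {}
  bool Filter(int /*level*/, const rocksdb::Slice& /*key*/,
              const rocksdb::Slice& value, std::string* /*new_value*/,
              bool* /*value_changed*/) const override {
    return Matches(value);
  }
  bool FilterMergeOperand(int /*level*/, const rocksdb::Slice& /*key*/,
                          const rocksdb::Slice& operand) const override {
    return Matches(operand);
  }
  const char* Name() const override { return "FilterNumberSketch"; }

 private:
  bool Matches(const rocksdb::Slice& v) const {
    uint64_t x = 0;
    if (v.size() != sizeof(x)) {
      return false;
    }
    std::memcpy(&x, v.data(), sizeof(x));  // assumes fixed64 encoding
    return x == n_;
  }
  const uint64_t n_;
};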

--- a/db/compaction_picker_test.cc
+++ b/db/compaction_picker_test.cc
@@ -7,6 +7,8 @@
 #include "db/compaction_picker.h"
 #include <limits>
 #include <string>
+#include <utility>
+
 #include "util/logging.h"
 #include "util/string_util.h"
 #include "util/testharness.h"

@@ -36,6 +38,8 @@ class CompactionPickerTest : public testing::Test {
   CompactionOptionsFIFO fifo_options_;
   std::unique_ptr<VersionStorageInfo> vstorage_;
   std::vector<std::unique_ptr<FileMetaData>> files_;
+  // does not own FileMetaData
+  std::unordered_map<uint32_t, std::pair<FileMetaData*, int>> file_map_;
   // input files to compaction process.
   std::vector<CompactionInputFiles> input_files_;
   int compaction_level_start_;

@@ -70,12 +74,7 @@ class CompactionPickerTest : public testing::Test {
   void DeleteVersionStorage() {
     vstorage_.reset();
     files_.clear();
-    for (uint32_t i = 0; i < input_files_.size(); ++i) {
-      for (uint32_t j = 0; j < input_files_[i].files.size(); ++j) {
-        delete input_files_[i].files[j];
-      }
-      input_files_[i].files.clear();
-    }
+    file_map_.clear();
     input_files_.clear();
   }

@@ -94,9 +93,10 @@ class CompactionPickerTest : public testing::Test {
     f->refs = 0;
     vstorage_->AddFile(level, f);
     files_.emplace_back(f);
+    file_map_.insert({file_number, {f, level}});
   }

-  void setCompactionInputFilesLevels(int level_count, int start_level) {
+  void SetCompactionInputFilesLevels(int level_count, int start_level) {
     input_files_.resize(level_count);
     for (int i = 0; i < level_count; ++i) {
       input_files_[i].level = start_level + i;

@@ -104,21 +104,13 @@ class CompactionPickerTest : public testing::Test {
     compaction_level_start_ = start_level;
   }

-  void AddToCompactionFiles(int level, uint32_t file_number,
-                            const char* smallest, const char* largest,
-                            uint64_t file_size = 0, uint32_t path_id = 0,
-                            SequenceNumber smallest_seq = 100,
-                            SequenceNumber largest_seq = 100) {
+  void AddToCompactionFiles(uint32_t file_number) {
+    auto iter = file_map_.find(file_number);
+    assert(iter != file_map_.end());
+    int level = iter->second.second;
     assert(level < vstorage_->num_levels());
-    FileMetaData* f = new FileMetaData;
-    f->fd = FileDescriptor(file_number, path_id, file_size);
-    f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
-    f->largest = InternalKey(largest, largest_seq, kTypeValue);
-    f->smallest_seqno = smallest_seq;
-    f->largest_seqno = largest_seq;
-    f->compensated_file_size = file_size;
-    f->refs = 0;
-    input_files_[level - compaction_level_start_].files.emplace_back(f);
+    input_files_[level - compaction_level_start_].files.emplace_back(
+        iter->second.first);
   }

   void UpdateVersionStorageInfo() {
@@ -676,25 +668,24 @@ TEST_F(CompactionPickerTest, EstimateCompactionBytesNeededDynamicLevel) {
 TEST_F(CompactionPickerTest, IsBottommostLevelTest) {
   // case 1: Higher levels are empty
   NewVersionStorage(6, kCompactionStyleLevel);
-  Add(0, 1U, "a", "c");
-  Add(0, 2U, "y", "z");
+  Add(0, 1U, "a", "m");
+  Add(0, 2U, "c", "z");
   Add(1, 3U, "d", "e");
   Add(1, 4U, "l", "p");
   Add(2, 5U, "g", "i");
   Add(2, 6U, "x", "z");
   UpdateVersionStorageInfo();
-  setCompactionInputFilesLevels(2, 1);
-  AddToCompactionFiles(1, 3U, "d", "e");
-  AddToCompactionFiles(2, 5U, "g", "i");
+  SetCompactionInputFilesLevels(2, 1);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(5U);
   bool result =
       Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
   ASSERT_TRUE(result);

   // case 2: Higher levels have no overlap
-  DeleteVersionStorage();
   NewVersionStorage(6, kCompactionStyleLevel);
-  Add(0, 1U, "a", "c");
-  Add(0, 2U, "y", "z");
+  Add(0, 1U, "a", "m");
+  Add(0, 2U, "c", "z");
   Add(1, 3U, "d", "e");
   Add(1, 4U, "l", "p");
   Add(2, 5U, "g", "i");
@@ -704,17 +695,16 @@ TEST_F(CompactionPickerTest, IsBottommostLevelTest) {
   Add(4, 9U, "a", "b");
   Add(5, 10U, "c", "cc");
   UpdateVersionStorageInfo();
-  setCompactionInputFilesLevels(2, 1);
-  AddToCompactionFiles(1, 3U, "d", "e");
-  AddToCompactionFiles(2, 5U, "g", "i");
+  SetCompactionInputFilesLevels(2, 1);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(5U);
   result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
   ASSERT_TRUE(result);

   // case 3.1: Higher levels (level 3) have overlap
-  DeleteVersionStorage();
   NewVersionStorage(6, kCompactionStyleLevel);
-  Add(0, 1U, "a", "c");
-  Add(0, 2U, "y", "z");
+  Add(0, 1U, "a", "m");
+  Add(0, 2U, "c", "z");
   Add(1, 3U, "d", "e");
   Add(1, 4U, "l", "p");
   Add(2, 5U, "g", "i");
@@ -724,17 +714,17 @@ TEST_F(CompactionPickerTest, IsBottommostLevelTest) {
   Add(4, 9U, "a", "b");
   Add(5, 10U, "c", "cc");
   UpdateVersionStorageInfo();
-  setCompactionInputFilesLevels(2, 1);
-  AddToCompactionFiles(1, 3U, "d", "e");
-  AddToCompactionFiles(2, 5U, "g", "i");
+  SetCompactionInputFilesLevels(2, 1);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(5U);
   result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
   ASSERT_FALSE(result);

-  // case 3.1: Higher levels (level 5) have overlap
+  // case 3.2: Higher levels (level 5) have overlap
   DeleteVersionStorage();
   NewVersionStorage(6, kCompactionStyleLevel);
-  Add(0, 1U, "a", "c");
-  Add(0, 2U, "y", "z");
+  Add(0, 1U, "a", "m");
+  Add(0, 2U, "c", "z");
   Add(1, 3U, "d", "e");
   Add(1, 4U, "l", "p");
   Add(2, 5U, "g", "i");
@@ -747,17 +737,17 @@ TEST_F(CompactionPickerTest, IsBottommostLevelTest) {
   Add(5, 12U, "y", "yy");
   Add(5, 13U, "z", "zz");
   UpdateVersionStorageInfo();
-  setCompactionInputFilesLevels(2, 1);
-  AddToCompactionFiles(1, 3U, "d", "i");
-  AddToCompactionFiles(2, 5U, "g", "i");
+  SetCompactionInputFilesLevels(2, 1);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(5U);
   result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
   ASSERT_FALSE(result);

-  // case 3.1: Higher levels (level 5) have overlap
-  DeleteVersionStorage();
+  // case 3.3: Higher levels (level 5) have overlap, but it's only overlapping
+  // one key ("d")
   NewVersionStorage(6, kCompactionStyleLevel);
-  Add(0, 1U, "a", "c");
-  Add(0, 2U, "y", "z");
+  Add(0, 1U, "a", "m");
+  Add(0, 2U, "c", "z");
   Add(1, 3U, "d", "e");
   Add(1, 4U, "l", "p");
   Add(2, 5U, "g", "i");
@@ -770,11 +760,66 @@ TEST_F(CompactionPickerTest, IsBottommostLevelTest) {
   Add(5, 12U, "y", "yy");
   Add(5, 13U, "z", "zz");
   UpdateVersionStorageInfo();
-  setCompactionInputFilesLevels(2, 1);
-  AddToCompactionFiles(1, 3U, "d", "i");
-  AddToCompactionFiles(2, 5U, "g", "i");
+  SetCompactionInputFilesLevels(2, 1);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(5U);
+  result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+  ASSERT_FALSE(result);
+
+  // Level 0 files overlap
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(0, 1U, "s", "t");
+  Add(0, 2U, "a", "m");
+  Add(0, 3U, "b", "z");
+  Add(0, 4U, "e", "f");
+  Add(5, 10U, "y", "z");
+  UpdateVersionStorageInfo();
+  SetCompactionInputFilesLevels(1, 0);
+  AddToCompactionFiles(1U);
+  AddToCompactionFiles(2U);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(4U);
   result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
   ASSERT_FALSE(result);
+
+  // Level 0 files don't overlap
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(0, 1U, "s", "t");
+  Add(0, 2U, "a", "m");
+  Add(0, 3U, "b", "k");
+  Add(0, 4U, "e", "f");
+  Add(5, 10U, "y", "z");
+  UpdateVersionStorageInfo();
+  SetCompactionInputFilesLevels(1, 0);
+  AddToCompactionFiles(1U);
+  AddToCompactionFiles(2U);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(4U);
+  result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+  ASSERT_TRUE(result);
+
+  // Level 1 files overlap
+  NewVersionStorage(6, kCompactionStyleLevel);
+  Add(0, 1U, "s", "t");
+  Add(0, 2U, "a", "m");
+  Add(0, 3U, "b", "k");
+  Add(0, 4U, "e", "f");
+  Add(1, 5U, "a", "m");
+  Add(1, 6U, "n", "o");
+  Add(1, 7U, "w", "y");
+  Add(5, 10U, "y", "z");
+  UpdateVersionStorageInfo();
+  SetCompactionInputFilesLevels(2, 0);
+  AddToCompactionFiles(1U);
+  AddToCompactionFiles(2U);
+  AddToCompactionFiles(3U);
+  AddToCompactionFiles(4U);
+  AddToCompactionFiles(5U);
+  AddToCompactionFiles(6U);
+  AddToCompactionFiles(7U);
+  result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
+  ASSERT_FALSE(result);

   DeleteVersionStorage();
 }
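All of these cases reduce to one question: does any file in a level above the compaction's output level overlap the key range of the compaction inputs? A self-contained sketch of that overlap check, with hypothetical helper names (an illustration of the idea, not the code behind Compaction::TEST_IsBottommostLevel):

#include <cassert>
#include <string>
#include <vector>

// Hypothetical stand-in for FileMetaData's smallest/largest user keys.
struct KeyRange {
  std::string smallest;
  std::string largest;
};

// True when [a.smallest, a.largest] and [b.smallest, b.largest] intersect.
static bool Overlaps(const KeyRange& a, const KeyRange& b) {
  return a.smallest <= b.largest && b.smallest <= a.largest;
}

// Mirrors the idea tested above: the output level is bottommost iff no file
// in a level above output_level overlaps any compaction input file.
static bool IsBottommost(const std::vector<std::vector<KeyRange>>& levels,
                         int output_level,
                         const std::vector<KeyRange>& inputs) {
  for (size_t level = output_level + 1; level < levels.size(); ++level) {
    for (const KeyRange& f : levels[level]) {
      for (const KeyRange& in : inputs) {
        if (Overlaps(f, in)) return false;
      }
    }
  }
  return true;
}

int main() {
  // Level-0 inputs "a"-"m" and "b"-"z" reach the level-5 file "y"-"z",
  // matching the "Level 0 files overlap" case above.
  std::vector<std::vector<KeyRange>> levels(6);
  levels[5].push_back({"y", "z"});
  assert(!IsBottommost(levels, 2, {{"a", "m"}, {"b", "z"}}));
  // With the inputs capped at "k", nothing above level 2 overlaps.
  assert(IsBottommost(levels, 2, {{"a", "m"}, {"b", "k"}}));
  return 0;
}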

db/db_compaction_filter_test.cc
@@ -7,8 +7,8 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

+#include "db/db_test_util.h"
 #include "port/stack_trace.h"
-#include "util/db_test_util.h"

 namespace rocksdb {
@@ -97,8 +97,11 @@ class ChangeFilter : public CompactionFilter {
 class KeepFilterFactory : public CompactionFilterFactory {
  public:
-  explicit KeepFilterFactory(bool check_context = false)
-      : check_context_(check_context) {}
+  explicit KeepFilterFactory(bool check_context = false,
+                             bool check_context_cf_id = false)
+      : check_context_(check_context),
+        check_context_cf_id_(check_context_cf_id),
+        compaction_filter_created_(false) {}

   virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
       const CompactionFilter::Context& context) override {
@@ -106,13 +109,22 @@ class KeepFilterFactory : public CompactionFilterFactory {
       EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
       EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
     }
+    if (check_context_cf_id_) {
+      EXPECT_EQ(expect_cf_id_.load(), context.column_family_id);
+    }
+    compaction_filter_created_ = true;
     return std::unique_ptr<CompactionFilter>(new KeepFilter());
   }

+  bool compaction_filter_created() const { return compaction_filter_created_; }
+
   virtual const char* Name() const override { return "KeepFilterFactory"; }
   bool check_context_;
+  bool check_context_cf_id_;
   std::atomic_bool expect_full_compaction_;
   std::atomic_bool expect_manual_compaction_;
+  std::atomic<uint32_t> expect_cf_id_;
+  bool compaction_filter_created_;
 };
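The check_context_cf_id_ plumbing above exercises the column_family_id field that this change adds to CompactionFilter::Context. As an illustration of what that field enables outside the tests, here is a hedged sketch of a factory that filters only one column family (DropAllFilter and PerCfFilterFactory are hypothetical names; returning nullptr is assumed to mean "no filter for this compaction"):

#include <cstdint>
#include <memory>
#include <string>

#include "rocksdb/compaction_filter.h"

// Illustrative filter that drops every entry it is asked about.
class DropAllFilter : public rocksdb::CompactionFilter {
 public:
  virtual bool Filter(int /*level*/, const rocksdb::Slice& /*key*/,
                      const rocksdb::Slice& /*existing_value*/,
                      std::string* /*new_value*/,
                      bool* /*value_changed*/) const override {
    return true;  // filter (i.e. remove) the entry
  }
  virtual const char* Name() const override { return "DropAllFilter"; }
};

// Installs the filter only for one column family, keyed off the
// column_family_id that CompactionFilter::Context now carries.
class PerCfFilterFactory : public rocksdb::CompactionFilterFactory {
 public:
  explicit PerCfFilterFactory(uint32_t target_cf_id)
      : target_cf_id_(target_cf_id) {}
  virtual std::unique_ptr<rocksdb::CompactionFilter> CreateCompactionFilter(
      const rocksdb::CompactionFilter::Context& context) override {
    if (context.column_family_id != target_cf_id_) {
      return nullptr;  // leave other column families unfiltered
    }
    return std::unique_ptr<rocksdb::CompactionFilter>(new DropAllFilter());
  }
  virtual const char* Name() const override { return "PerCfFilterFactory"; }

 private:
  uint32_t target_cf_id_;
};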
 class DeleteFilterFactory : public CompactionFilterFactory {
@@ -482,7 +494,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) {
 }

 TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) {
-  KeepFilterFactory* filter = new KeepFilterFactory();
+  KeepFilterFactory* filter = new KeepFilterFactory(true, true);
   Options options = CurrentOptions();
   options.compaction_style = kCompactionStyleUniversal;
@@ -504,15 +516,17 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) {
     // be triggered.
     num_keys_per_file /= 2;
   }
+  dbfull()->TEST_WaitForCompact();

   // Force a manual compaction
   cfilter_count = 0;
   filter->expect_manual_compaction_.store(true);
-  filter->expect_full_compaction_.store(false);  // Manual compaction always
-                                                 // set this flag.
+  filter->expect_full_compaction_.store(true);
+  filter->expect_cf_id_.store(0);
   dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
   ASSERT_EQ(cfilter_count, 700);
   ASSERT_EQ(NumSortedRuns(0), 1);
+  ASSERT_TRUE(filter->compaction_filter_created());

   // Verify total number of keys is correct after manual compaction.
   {
@@ -537,6 +551,35 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) {
   }
 }

+TEST_F(DBTestCompactionFilter, CompactionFilterContextCfId) {
+  KeepFilterFactory* filter = new KeepFilterFactory(false, true);
+  filter->expect_cf_id_.store(1);
+
+  Options options = CurrentOptions();
+  options.compaction_filter_factory.reset(filter);
+  options.compression = kNoCompression;
+  options.level0_file_num_compaction_trigger = 2;
+  CreateAndReopenWithCF({"pikachu"}, options);
+
+  int num_keys_per_file = 400;
+  for (int j = 0; j < 3; j++) {
+    // Write several keys.
+    const std::string value(10, 'x');
+    for (int i = 0; i < num_keys_per_file; i++) {
+      char key[100];
+      snprintf(key, sizeof(key), "B%08d%02d", i, j);
+      Put(1, key, value);
+    }
+    Flush(1);
+    // Make sure next file is much smaller so automatic compaction will not
+    // be triggered.
+    num_keys_per_file /= 2;
+  }
+  dbfull()->TEST_WaitForCompact();
+
+  ASSERT_TRUE(filter->compaction_filter_created());
+}
+
 // Compaction filters should only be applied to records that are newer than the
 // latest snapshot. This test inserts records and applies a delete filter.
 TEST_F(DBTestCompactionFilter, CompactionFilterSnapshot) {

db/db_compaction_test.cc
@@ -7,9 +7,9 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

+#include "db/db_test_util.h"
 #include "port/stack_trace.h"
 #include "rocksdb/experimental.h"
-#include "util/db_test_util.h"
 #include "util/sync_point.h"

 namespace rocksdb {

db/db_dynamic_level_test.cc
@@ -12,8 +12,8 @@
 // which is a pity, it is a good test
 #if !(defined NDEBUG) || !defined(OS_WIN)

+#include "db/db_test_util.h"
 #include "port/stack_trace.h"
-#include "util/db_test_util.h"

 namespace rocksdb {

 class DBTestDynamicLevel : public DBTestBase {

db/db_impl.cc
@@ -1338,8 +1338,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
       s = BuildTable(
           dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(),
           iter.get(), &meta, cfd->internal_comparator(),
-          cfd->int_tbl_prop_collector_factories(), snapshots_.GetAll(),
-          GetCompressionFlush(*cfd->ioptions()),
+          cfd->int_tbl_prop_collector_factories(), cfd->GetID(),
+          snapshots_.GetAll(), GetCompressionFlush(*cfd->ioptions()),
           cfd->ioptions()->compression_opts, paranoid_file_checks,
           cfd->internal_stats(), Env::IO_HIGH, &info.table_properties);
       LogFlush(db_options_.info_log);
@@ -1433,15 +1433,16 @@ Status DBImpl::FlushMemTableToOutputFile(
   if (s.ok()) {
     // may temporarily unlock and lock the mutex.
     NotifyOnFlushCompleted(cfd, &file_meta, mutable_cf_options,
-                           job_context->job_id);
+                           job_context->job_id, flush_job.GetTableProperties());
   }
 #endif  // ROCKSDB_LITE
   return s;
 }

-void DBImpl::NotifyOnFlushCompleted(
-    ColumnFamilyData* cfd, FileMetaData* file_meta,
-    const MutableCFOptions& mutable_cf_options, int job_id) {
+void DBImpl::NotifyOnFlushCompleted(ColumnFamilyData* cfd,
+                                    FileMetaData* file_meta,
+                                    const MutableCFOptions& mutable_cf_options,
+                                    int job_id, TableProperties prop) {
 #ifndef ROCKSDB_LITE
   if (db_options_.listeners.size() == 0U) {
     return;
@@ -1471,6 +1472,7 @@ void DBImpl::NotifyOnFlushCompleted(
     info.triggered_writes_stop = triggered_writes_stop;
     info.smallest_seqno = file_meta->smallest_seqno;
     info.largest_seqno = file_meta->largest_seqno;
+    info.table_properties = prop;
     for (auto listener : db_options_.listeners) {
       listener->OnFlushCompleted(this, info);
     }
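Since NotifyOnFlushCompleted now forwards the flushed file's TableProperties into FlushJobInfo, a listener can inspect properties without reopening the SST. A minimal sketch, assuming only the table_properties field added here (the listener class name is illustrative):

#include <cstdio>

#include "rocksdb/listener.h"
#include "rocksdb/table_properties.h"

// Illustrative listener: logs a couple of the properties that
// OnFlushCompleted can now read directly from FlushJobInfo.
class FlushPropsLogger : public rocksdb::EventListener {
 public:
  virtual void OnFlushCompleted(rocksdb::DB* /*db*/,
                                const rocksdb::FlushJobInfo& info) override {
    const rocksdb::TableProperties& tp = info.table_properties;
    std::fprintf(stderr, "flushed %s: %llu entries, %llu data bytes\n",
                 info.file_path.c_str(),
                 static_cast<unsigned long long>(tp.num_entries),
                 static_cast<unsigned long long>(tp.data_size));
  }
};

It would be installed the same way the test listeners below are, via options.listeners.emplace_back(...).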
@@ -1816,12 +1818,20 @@ void DBImpl::NotifyOnCompactionCompleted(
     info.base_input_level = c->start_level();
     info.output_level = c->output_level();
     info.stats = compaction_job_stats;
+    info.table_properties = c->GetOutputTableProperties();
     for (size_t i = 0; i < c->num_input_levels(); ++i) {
       for (const auto fmd : *c->inputs(i)) {
-        info.input_files.push_back(
-            TableFileName(db_options_.db_paths,
-                          fmd->fd.GetNumber(),
-                          fmd->fd.GetPathId()));
+        auto fn = TableFileName(db_options_.db_paths, fmd->fd.GetNumber(),
+                                fmd->fd.GetPathId());
+        info.input_files.push_back(fn);
+        if (info.table_properties.count(fn) == 0) {
+          std::shared_ptr<const TableProperties> tp;
+          std::string fname;
+          auto s = cfd->current()->GetTableProperties(&tp, fmd, &fname);
+          if (s.ok()) {
+            info.table_properties[fn] = tp;
+          }
+        }
       }
     }
     for (const auto newf : c->edit()->GetNewFiles()) {
@@ -4502,6 +4512,10 @@ Status DBImpl::CheckConsistency() {
     uint64_t fsize = 0;
     Status s = env_->GetFileSize(file_path, &fsize);
+    if (!s.ok() &&
+        env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) {
+      s = Status::OK();
+    }
     if (!s.ok()) {
       corruption_messages +=
           "Can't access " + md.name + ": " + s.ToString() + "\n";

db/db_impl.h
@@ -51,7 +51,6 @@ class TableCache;
 class Version;
 class VersionEdit;
 class VersionSet;
-class CompactionFilterV2;
 class Arena;
 class WriteCallback;
 struct JobContext;
@@ -376,7 +375,7 @@ class DBImpl : public DB {
   void NotifyOnFlushCompleted(ColumnFamilyData* cfd, FileMetaData* file_meta,
                               const MutableCFOptions& mutable_cf_options,
-                              int job_id);
+                              int job_id, TableProperties prop);

   void NotifyOnCompactionCompleted(ColumnFamilyData* cfd,
                                    Compaction *c, const Status &st,

db/db_inplace_update_test.cc
@@ -6,8 +6,8 @@
 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.
+#include "db/db_test_util.h"
 #include "port/stack_trace.h"
-#include "util/db_test_util.h"

 namespace rocksdb {

db/db_log_iter_test.cc
@@ -12,8 +12,8 @@
 // which is a pity, it is a good test
 #if !(defined NDEBUG) || !defined(OS_WIN)

+#include "db/db_test_util.h"
 #include "port/stack_trace.h"
-#include "util/db_test_util.h"

 namespace rocksdb {

db/db_tailing_iter_test.cc
@@ -12,9 +12,9 @@
 // which is a pity, it is a good test
 #if !(defined NDEBUG) || !defined(OS_WIN)

+#include "db/db_test_util.h"
 #include "db/forward_iterator.h"
 #include "port/stack_trace.h"
-#include "util/db_test_util.h"

 namespace rocksdb {

db/db_test.cc
@@ -26,7 +26,7 @@
 #include "db/filename.h"
 #include "db/dbformat.h"
 #include "db/db_impl.h"
-#include "db/filename.h"
+#include "db/db_test_util.h"
 #include "db/job_context.h"
 #include "db/version_set.h"
 #include "db/write_batch_internal.h"
@@ -55,7 +55,6 @@
 #include "table/block_based_table_factory.h"
 #include "table/mock_table.h"
 #include "table/plain_table_factory.h"
-#include "util/db_test_util.h"
 #include "util/file_reader_writer.h"
 #include "util/hash.h"
 #include "util/hash_linklist_rep.h"
@@ -133,6 +132,46 @@ class DBTestWithParam : public DBTest,
   uint32_t max_subcompactions_;
 };

+class BloomStatsTestWithParam
+    : public DBTest,
+      public testing::WithParamInterface<std::tuple<bool, bool>> {
+ public:
+  BloomStatsTestWithParam() {
+    use_block_table_ = std::get<0>(GetParam());
+    use_block_based_builder_ = std::get<1>(GetParam());
+
+    options_.create_if_missing = true;
+    options_.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));
+    options_.memtable_prefix_bloom_bits = 8 * 1024;
+    if (use_block_table_) {
+      BlockBasedTableOptions table_options;
+      table_options.hash_index_allow_collision = false;
+      table_options.filter_policy.reset(
+          NewBloomFilterPolicy(10, use_block_based_builder_));
+      options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
+    } else {
+      PlainTableOptions table_options;
+      options_.table_factory.reset(NewPlainTableFactory(table_options));
+    }
+
+    perf_context.Reset();
+    DestroyAndReopen(options_);
+  }
+
+  ~BloomStatsTestWithParam() {
+    perf_context.Reset();
+    Destroy(options_);
+  }
+
+  // Required if inheriting from testing::WithParamInterface<>
+  static void SetUpTestCase() {}
+  static void TearDownTestCase() {}
+
+  bool use_block_table_;
+  bool use_block_based_builder_;
+  Options options_;
+};
+
 TEST_F(DBTest, Empty) {
   do {
     Options options;
@@ -604,10 +643,10 @@ TEST_F(DBTest, AggregatedTableProperties) {
 TEST_F(DBTest, ReadLatencyHistogramByLevel) {
   Options options = CurrentOptions();
   options.write_buffer_size = 110 << 10;
-  options.level0_file_num_compaction_trigger = 3;
+  options.level0_file_num_compaction_trigger = 6;
   options.num_levels = 4;
   options.compression = kNoCompression;
-  options.max_bytes_for_level_base = 450 << 10;
+  options.max_bytes_for_level_base = 4500 << 10;
   options.target_file_size_base = 98 << 10;
   options.max_write_buffer_number = 2;
   options.statistics = rocksdb::CreateDBStatistics();
@@ -619,10 +658,11 @@ TEST_F(DBTest, ReadLatencyHistogramByLevel) {
   DestroyAndReopen(options);
   int key_index = 0;
   Random rnd(301);
-  for (int num = 0; num < 5; num++) {
+  for (int num = 0; num < 7; num++) {
     Put("foo", "bar");
     GenerateNewFile(&rnd, &key_index);
   }
+  dbfull()->TEST_WaitForCompact();

   std::string prop;
   ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop));
@@ -638,6 +678,7 @@ TEST_F(DBTest, ReadLatencyHistogramByLevel) {
   // Reopen and issue Get(). See thee latency tracked
   Reopen(options);
+  dbfull()->TEST_WaitForCompact();
   for (int key = 0; key < 500; key++) {
     Get(Key(key));
   }
@@ -781,21 +822,34 @@ class CoutingUserTblPropCollector : public TablePropertiesCollector {
 class CoutingUserTblPropCollectorFactory
     : public TablePropertiesCollectorFactory {
  public:
-  virtual TablePropertiesCollector* CreateTablePropertiesCollector() override {
+  explicit CoutingUserTblPropCollectorFactory(
+      uint32_t expected_column_family_id)
+      : expected_column_family_id_(expected_column_family_id),
+        num_created_(0) {}
+  virtual TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context context) override {
+    EXPECT_EQ(expected_column_family_id_, context.column_family_id);
+    num_created_++;
     return new CoutingUserTblPropCollector();
   }
   const char* Name() const override {
     return "CoutingUserTblPropCollectorFactory";
   }
+
+  void set_expected_column_family_id(uint32_t v) {
+    expected_column_family_id_ = v;
+  }
+  uint32_t expected_column_family_id_;
+  uint32_t num_created_;
 };
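The Context argument introduced above means one factory instance can specialize per column family. A hedged sketch of a collector that tags each SST with its column family id (class names are hypothetical; the AddUserKey signature follows the TestPropertiesCollector added in db/listener_test.cc below):

#include <cstdint>
#include <string>

#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "rocksdb/table_properties.h"
#include "rocksdb/types.h"

// Illustrative collector: accumulates nothing per key, but records which
// column family the table was written for.
class CfTaggingCollector : public rocksdb::TablePropertiesCollector {
 public:
  explicit CfTaggingCollector(uint32_t cf_id) : cf_id_(cf_id) {}
  rocksdb::Status AddUserKey(const rocksdb::Slice& /*key*/,
                             const rocksdb::Slice& /*value*/,
                             rocksdb::EntryType /*type*/,
                             rocksdb::SequenceNumber /*seq*/,
                             uint64_t /*file_size*/) override {
    return rocksdb::Status::OK();  // nothing to accumulate per key
  }
  rocksdb::Status Finish(rocksdb::UserCollectedProperties* props) override {
    props->insert({"cf_id", std::to_string(cf_id_)});
    return rocksdb::Status::OK();
  }
  rocksdb::UserCollectedProperties GetReadableProperties() const override {
    return rocksdb::UserCollectedProperties();
  }
  const char* Name() const override { return "CfTaggingCollector"; }

 private:
  uint32_t cf_id_;
};

// Factory that consumes the new Context to construct a per-CF collector.
class CfTaggingCollectorFactory
    : public rocksdb::TablePropertiesCollectorFactory {
 public:
  rocksdb::TablePropertiesCollector* CreateTablePropertiesCollector(
      rocksdb::TablePropertiesCollectorFactory::Context context) override {
    return new CfTaggingCollector(context.column_family_id);
  }
  const char* Name() const override { return "CfTaggingCollectorFactory"; }
};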
-TEST_F(DBTest, GetUserDefinedTablaProperties) {
+TEST_F(DBTest, GetUserDefinedTableProperties) {
   Options options = CurrentOptions();
   options.level0_file_num_compaction_trigger = (1<<30);
   options.max_background_flushes = 0;
   options.table_properties_collector_factories.resize(1);
-  options.table_properties_collector_factories[0] =
-      std::make_shared<CoutingUserTblPropCollectorFactory>();
+  std::shared_ptr<CoutingUserTblPropCollectorFactory> collector_factory =
+      std::make_shared<CoutingUserTblPropCollectorFactory>(0);
+  options.table_properties_collector_factories[0] = collector_factory;
   Reopen(options);
   // Create 4 tables
   for (int table = 0; table < 4; ++table) {
@@ -821,6 +875,72 @@ TEST_F(DBTest, GetUserDefinedTableProperties) {
     sum += count;
   }
   ASSERT_EQ(10u + 11u + 12u + 13u, sum);
+  ASSERT_GT(collector_factory->num_created_, 0);
+
+  collector_factory->num_created_ = 0;
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+  ASSERT_GT(collector_factory->num_created_, 0);
+}
+
+TEST_F(DBTest, UserDefinedTablePropertiesContext) {
+  Options options = CurrentOptions();
+  options.level0_file_num_compaction_trigger = 3;
+  options.max_background_flushes = 0;
+  options.table_properties_collector_factories.resize(1);
+  std::shared_ptr<CoutingUserTblPropCollectorFactory> collector_factory =
+      std::make_shared<CoutingUserTblPropCollectorFactory>(1);
+  options.table_properties_collector_factories[0] = collector_factory,
+  CreateAndReopenWithCF({"pikachu"}, options);
+  // Create 2 files
+  for (int table = 0; table < 2; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      Put(1, ToString(table * 100 + i), "val");
+    }
+    Flush(1);
+  }
+  ASSERT_GT(collector_factory->num_created_, 0);
+
+  collector_factory->num_created_ = 0;
+  // Trigger automatic compactions.
+  for (int table = 0; table < 3; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      Put(1, ToString(table * 100 + i), "val");
+    }
+    Flush(1);
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_GT(collector_factory->num_created_, 0);
+
+  collector_factory->num_created_ = 0;
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
+  ASSERT_GT(collector_factory->num_created_, 0);
+
+  // Come back to write to default column family
+  collector_factory->num_created_ = 0;
+  collector_factory->set_expected_column_family_id(0);  // default CF
+  // Create 4 tables in default column family
+  for (int table = 0; table < 2; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      Put(ToString(table * 100 + i), "val");
+    }
+    Flush();
+  }
+  ASSERT_GT(collector_factory->num_created_, 0);
+
+  collector_factory->num_created_ = 0;
+  // Trigger automatic compactions.
+  for (int table = 0; table < 3; ++table) {
+    for (int i = 0; i < 10 + table; ++i) {
+      Put(ToString(table * 100 + i), "val");
+    }
+    Flush();
+    dbfull()->TEST_WaitForCompact();
+  }
+  ASSERT_GT(collector_factory->num_created_, 0);
+
+  collector_factory->num_created_ = 0;
+  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
+  ASSERT_GT(collector_factory->num_created_, 0);
 }
 TEST_F(DBTest, LevelLimitReopen) {
@@ -8244,7 +8364,8 @@ class CountingDeleteTabPropCollector : public TablePropertiesCollector {
 class CountingDeleteTabPropCollectorFactory
     : public TablePropertiesCollectorFactory {
  public:
-  virtual TablePropertiesCollector* CreateTablePropertiesCollector() override {
+  virtual TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context context) override {
     return new CountingDeleteTabPropCollector();
   }
   const char* Name() const override {
@@ -8268,8 +8389,8 @@ TEST_F(DBTest, TablePropertiesNeedCompactTest) {
   options.soft_rate_limit = 1.1;
   options.num_levels = 8;
-  std::shared_ptr<TablePropertiesCollectorFactory> collector_factory(
-      new CountingDeleteTabPropCollectorFactory);
+  std::shared_ptr<TablePropertiesCollectorFactory> collector_factory =
+      std::make_shared<CountingDeleteTabPropCollectorFactory>();
   options.table_properties_collector_factories.resize(1);
   options.table_properties_collector_factories[0] = collector_factory;
@@ -8328,6 +8449,61 @@ TEST_F(DBTest, TablePropertiesNeedCompactTest) {
   }
 }
+TEST_F(DBTest, NeedCompactHintPersistentTest) {
+  Random rnd(301);
+
+  Options options;
+  options.create_if_missing = true;
+  options.max_write_buffer_number = 8;
+  options.level0_file_num_compaction_trigger = 10;
+  options.level0_slowdown_writes_trigger = 10;
+  options.level0_stop_writes_trigger = 10;
+  options.disable_auto_compactions = true;
+
+  std::shared_ptr<TablePropertiesCollectorFactory> collector_factory =
+      std::make_shared<CountingDeleteTabPropCollectorFactory>();
+  options.table_properties_collector_factories.resize(1);
+  options.table_properties_collector_factories[0] = collector_factory;
+
+  DestroyAndReopen(options);
+
+  const int kMaxKey = 100;
+  for (int i = 0; i < kMaxKey; i++) {
+    ASSERT_OK(Put(Key(i), ""));
+  }
+  Flush();
+  dbfull()->TEST_WaitForFlushMemTable();
+
+  for (int i = 1; i < kMaxKey - 1; i++) {
+    Delete(Key(i));
+  }
+  Flush();
+  dbfull()->TEST_WaitForFlushMemTable();
+  ASSERT_EQ(NumTableFilesAtLevel(0), 2);
+
+  // Restart the DB. Although number of files didn't reach
+  // options.level0_file_num_compaction_trigger, compaction should
+  // still be triggered because of the need-compaction hint.
+  options.disable_auto_compactions = false;
+  Reopen(options);
+  dbfull()->TEST_WaitForCompact();
+  ASSERT_EQ(NumTableFilesAtLevel(0), 0);
+  {
+    SetPerfLevel(kEnableCount);
+    perf_context.Reset();
+    int c = 0;
+    std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
+    for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) {
+      c++;
+    }
+    ASSERT_EQ(c, 2);
+    ASSERT_EQ(perf_context.internal_delete_skipped_count, 0);
+    // We iterate every key twice. Is it a bug?
+    ASSERT_LE(perf_context.internal_key_skipped_count, 2);
+    SetPerfLevel(kDisable);
+  }
+}
+
 TEST_F(DBTest, SuggestCompactRangeTest) {
   class CompactionFilterFactoryGetContext : public CompactionFilterFactory {
    public:
@@ -9640,6 +9816,47 @@ TEST_F(DBTest, AddExternalSstFileMultiThreaded) {
                            kSkipFIFOCompaction));
 }
+// 1 Create some SST files by inserting K-V pairs into DB
+// 2 Close DB and change suffix from ".sst" to ".ldb" for every other SST file
+// 3 Open DB and check if all key can be read
+TEST_F(DBTest, SSTsWithLdbSuffixHandling) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 110 << 10;  // 110KB
+  options.num_levels = 4;
+  DestroyAndReopen(options);
+
+  Random rnd(301);
+  int key_id = 0;
+  for (int i = 0; i < 10; ++i) {
+    GenerateNewFile(&rnd, &key_id, false);
+  }
+  Flush();
+  Close();
+  int const num_files = GetSstFileCount(dbname_);
+  ASSERT_GT(num_files, 0);
+
+  std::vector<std::string> filenames;
+  GetSstFiles(dbname_, &filenames);
+  int num_ldb_files = 0;
+  for (unsigned int i = 0; i < filenames.size(); ++i) {
+    if (i & 1) {
+      continue;
+    }
+    std::string const rdb_name = dbname_ + "/" + filenames[i];
+    std::string const ldb_name = Rocks2LevelTableFileName(rdb_name);
+    ASSERT_TRUE(env_->RenameFile(rdb_name, ldb_name).ok());
+    ++num_ldb_files;
+  }
+  ASSERT_GT(num_ldb_files, 0);
+  ASSERT_EQ(num_files, GetSstFileCount(dbname_));
+
+  Reopen(options);
+  for (int k = 0; k < key_id; ++k) {
+    ASSERT_NE("NOT_FOUND", Get(Key(k)));
+  }
+  Destroy(options);
+}
+
 INSTANTIATE_TEST_CASE_P(DBTestWithParam, DBTestWithParam,
                         ::testing::Values(1, 4));
@@ -9846,6 +10063,120 @@ TEST_F(DBTest, WalFilterTest) {
     }
   }
 }

+// 1 Insert 2 K-V pairs into DB
+// 2 Call Get() for both keys - expext memtable bloom hit stat to be 2
+// 3 Call Get() for nonexisting key - expect memtable bloom miss stat to be 1
+// 4 Call Flush() to create SST
+// 5 Call Get() for both keys - expext SST bloom hit stat to be 2
+// 6 Call Get() for nonexisting key - expect SST bloom miss stat to be 1
+// Test both: block and plain SST
+TEST_P(BloomStatsTestWithParam, BloomStatsTest) {
+  std::string key1("AAAA");
+  std::string key2("RXDB");  // not in DB
+  std::string key3("ZBRA");
+  std::string value1("Value1");
+  std::string value3("Value3");
+
+  ASSERT_OK(Put(key1, value1, WriteOptions()));
+  ASSERT_OK(Put(key3, value3, WriteOptions()));
+
+  // check memtable bloom stats
+  ASSERT_EQ(value1, Get(key1));
+  ASSERT_EQ(1, perf_context.bloom_memtable_hit_count);
+  ASSERT_EQ(value3, Get(key3));
+  ASSERT_EQ(2, perf_context.bloom_memtable_hit_count);
+  ASSERT_EQ(0, perf_context.bloom_memtable_miss_count);
+
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  ASSERT_EQ(1, perf_context.bloom_memtable_miss_count);
+  ASSERT_EQ(2, perf_context.bloom_memtable_hit_count);
+
+  // sanity checks
+  ASSERT_EQ(0, perf_context.bloom_sst_hit_count);
+  ASSERT_EQ(0, perf_context.bloom_sst_miss_count);
+
+  Flush();
+
+  // sanity checks
+  ASSERT_EQ(0, perf_context.bloom_sst_hit_count);
+  ASSERT_EQ(0, perf_context.bloom_sst_miss_count);
+
+  // check SST bloom stats
+  // NOTE: hits per get differs because of code paths differences
+  // in BlockBasedTable::Get()
+  int hits_per_get = use_block_table_ && !use_block_based_builder_ ? 2 : 1;
+  ASSERT_EQ(value1, Get(key1));
+  ASSERT_EQ(hits_per_get, perf_context.bloom_sst_hit_count);
+  ASSERT_EQ(value3, Get(key3));
+  ASSERT_EQ(2 * hits_per_get, perf_context.bloom_sst_hit_count);
+
+  ASSERT_EQ("NOT_FOUND", Get(key2));
+  ASSERT_EQ(1, perf_context.bloom_sst_miss_count);
+}
+
+// Same scenario as in BloomStatsTest but using an iterator
+TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
+  std::string key1("AAAA");
+  std::string key2("RXDB");  // not in DB
+  std::string key3("ZBRA");
+  std::string value1("Value1");
+  std::string value3("Value3");
+
+  ASSERT_OK(Put(key1, value1, WriteOptions()));
+  ASSERT_OK(Put(key3, value3, WriteOptions()));
+
+  unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
+
+  // check memtable bloom stats
+  iter->Seek(key1);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value1, iter->value().ToString());
+  ASSERT_EQ(1, perf_context.bloom_memtable_hit_count);
+  ASSERT_EQ(0, perf_context.bloom_memtable_miss_count);
+
+  iter->Seek(key3);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value3, iter->value().ToString());
+  ASSERT_EQ(2, perf_context.bloom_memtable_hit_count);
+  ASSERT_EQ(0, perf_context.bloom_memtable_miss_count);
+
+  iter->Seek(key2);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_EQ(1, perf_context.bloom_memtable_miss_count);
+  ASSERT_EQ(2, perf_context.bloom_memtable_hit_count);
+
+  Flush();
+
+  iter.reset(dbfull()->NewIterator(ReadOptions()));
+
+  // check SST bloom stats
+  iter->Seek(key1);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value1, iter->value().ToString());
+  ASSERT_EQ(1, perf_context.bloom_sst_hit_count);
+
+  iter->Seek(key3);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(iter->Valid());
+  ASSERT_EQ(value3, iter->value().ToString());
+  ASSERT_EQ(2, perf_context.bloom_sst_hit_count);
+
+  iter->Seek(key2);
+  ASSERT_OK(iter->status());
+  ASSERT_TRUE(!iter->Valid());
+  ASSERT_EQ(1, perf_context.bloom_sst_miss_count);
+  ASSERT_EQ(2, perf_context.bloom_sst_hit_count);
+}
+
+INSTANTIATE_TEST_CASE_P(BloomStatsTestWithParam, BloomStatsTestWithParam,
+                        ::testing::Values(std::make_tuple(true, true),
+                                          std::make_tuple(true, false),
+                                          std::make_tuple(false, false)));
+
 }  // namespace rocksdb
 #endif

db/db_test_util.cc
@@ -7,7 +7,7 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

-#include "util/db_test_util.h"
+#include "db/db_test_util.h"

 namespace rocksdb {
@@ -794,19 +794,22 @@ std::string DBTestBase::DumpSSTableList() {
   return property;
 }

+void DBTestBase::GetSstFiles(std::string path,
+                             std::vector<std::string>* files) {
+  env_->GetChildren(path, files);
+
+  files->erase(
+      std::remove_if(files->begin(), files->end(), [](std::string name) {
+        uint64_t number;
+        FileType type;
+        return !(ParseFileName(name, &number, &type) && type == kTableFile);
+      }), files->end());
+}
+
 int DBTestBase::GetSstFileCount(std::string path) {
   std::vector<std::string> files;
-  env_->GetChildren(path, &files);
-
-  int sst_count = 0;
-  uint64_t number;
-  FileType type;
-  for (size_t i = 0; i < files.size(); i++) {
-    if (ParseFileName(files[i], &number, &type) && type == kTableFile) {
-      sst_count++;
-    }
-  }
-  return sst_count;
+  GetSstFiles(path, &files);
+  return static_cast<int>(files.size());
 }

 // this will generate non-overlapping files since it keeps increasing key_idx

db/db_test_util.h
@@ -27,6 +27,7 @@
 #include <vector>

 #include "db/db_impl.h"
+#include "db/db_test_util.h"
 #include "db/dbformat.h"
 #include "db/filename.h"
 #include "rocksdb/cache.h"
@@ -43,7 +44,6 @@
 #include "table/mock_table.h"
 #include "table/plain_table_factory.h"
 #include "util/compression.h"
-#include "util/db_test_util.h"
 #include "util/hash_linklist_rep.h"
 #include "util/mock_env.h"
 #include "util/mutexlock.h"
@@ -618,6 +618,8 @@ class DBTestBase : public testing::Test {
   std::string DumpSSTableList();

+  void GetSstFiles(std::string path, std::vector<std::string>* files);
+
   int GetSstFileCount(std::string path);

   // this will generate non-overlapping files since it keeps increasing key_idx

db/db_universal_compaction_test.cc
@@ -7,8 +7,8 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

+#include "db/db_test_util.h"
 #include "port/stack_trace.h"
-#include "util/db_test_util.h"

 #if !(defined NDEBUG) || !defined(OS_WIN)
 #include "util/sync_point.h"

db/db_wal_test.cc
@@ -7,8 +7,8 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

+#include "db/db_test_util.h"
 #include "port/stack_trace.h"
-#include "util/db_test_util.h"

 #if !(defined NDEBUG) || !defined(OS_WIN)
 #include "util/sync_point.h"
 #endif

db/event_helpers.cc
@@ -53,7 +53,7 @@ void EventHelpers::LogAndNotifyTableFileCreation(
                 info.table_properties.filter_policy_name;

     // user collected properties
-    for (const auto& prop : info.table_properties.user_collected_properties) {
+    for (const auto& prop : info.table_properties.readable_properties) {
       jwriter << prop.first << prop.second;
     }
     jwriter.EndObject();

db/filename.cc
@@ -24,6 +24,9 @@
 namespace rocksdb {

+static const std::string kRocksDbTFileExt = "sst";
+static const std::string kLevelDbTFileExt = "ldb";
+
 // Given a path, flatten the path name by replacing all chars not in
 // {[0-9,a-z,A-Z,-,_,.]} with _. And append '_LOG\0' at the end.
 // Return the number of chars stored in dest not including the trailing '\0'.
@@ -78,7 +81,16 @@ std::string ArchivedLogFileName(const std::string& name, uint64_t number) {
 }

 std::string MakeTableFileName(const std::string& path, uint64_t number) {
-  return MakeFileName(path, number, "sst");
+  return MakeFileName(path, number, kRocksDbTFileExt.c_str());
+}
+
+std::string Rocks2LevelTableFileName(const std::string& fullname) {
+  assert(fullname.size() > kRocksDbTFileExt.size() + 1);
+  if (fullname.size() <= kRocksDbTFileExt.size() + 1) {
+    return "";
+  }
+  return fullname.substr(0, fullname.size() - kRocksDbTFileExt.size()) +
+         kLevelDbTFileExt;
 }

 uint64_t TableFileNameToNumber(const std::string& name) {
@@ -273,17 +285,23 @@ bool ParseFileName(const std::string& fname, uint64_t* number,
     if (!ConsumeDecimalNumber(&rest, &num)) {
       return false;
     }
+    if (rest.size() <= 1 || rest[0] != '.') {
+      return false;
+    }
+    rest.remove_prefix(1);
+
     Slice suffix = rest;
-    if (suffix == Slice(".log")) {
+    if (suffix == Slice("log")) {
       *type = kLogFile;
       if (log_type && !archive_dir_found) {
         *log_type = kAliveLogFile;
       }
     } else if (archive_dir_found) {
       return false;  // Archive dir can contain only log files
-    } else if (suffix == Slice(".sst")) {
+    } else if (suffix == Slice(kRocksDbTFileExt) ||
+               suffix == Slice(kLevelDbTFileExt)) {
       *type = kTableFile;
-    } else if (suffix == Slice(".dbtmp")) {
+    } else if (suffix == Slice("dbtmp")) {
       *type = kTempFile;
     } else {
       return false;

db/filename.h
@@ -55,6 +55,10 @@ extern std::string ArchivedLogFileName(const std::string& dbname,

 extern std::string MakeTableFileName(const std::string& name, uint64_t number);

+// Return the name of sstable with LevelDB suffix
+// created from RocksDB sstable suffixed name
+extern std::string Rocks2LevelTableFileName(const std::string& fullname);
+
 // the reverse function of MakeTableFileName
 // TODO(yhchiang): could merge this function with ParseFileName()
 extern uint64_t TableFileNameToNumber(const std::string& name);
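For reference, the helper's effect on a concrete name, using only the two functions declared here (a sketch; db/filename.h is an internal header, shown purely for illustration):

#include <cassert>
#include <string>

#include "db/filename.h"  // internal RocksDB header

void LdbNameExample() {
  // MakeTableFileName(path, number) yields e.g. "/db/000123.sst" ...
  const std::string sst = rocksdb::MakeTableFileName("/db", 123);
  // ... and Rocks2LevelTableFileName swaps the "sst" suffix for "ldb",
  // which ParseFileName now also accepts as a kTableFile.
  const std::string ldb = rocksdb::Rocks2LevelTableFileName(sst);
  assert(ldb == "/db/000123.ldb");
}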

db/flush_job.cc
@@ -231,13 +231,15 @@ Status FlushJob::WriteLevel0Table(const autovector<MemTable*>& mems,
     TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression",
                              &output_compression_);
-    s = BuildTable(
-        dbname_, db_options_.env, *cfd_->ioptions(), env_options_,
-        cfd_->table_cache(), iter.get(), meta, cfd_->internal_comparator(),
-        cfd_->int_tbl_prop_collector_factories(), existing_snapshots_,
-        output_compression_, cfd_->ioptions()->compression_opts,
-        mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(),
-        Env::IO_HIGH, &info.table_properties);
+    s = BuildTable(dbname_, db_options_.env, *cfd_->ioptions(), env_options_,
+                   cfd_->table_cache(), iter.get(), meta,
+                   cfd_->internal_comparator(),
+                   cfd_->int_tbl_prop_collector_factories(), cfd_->GetID(),
+                   existing_snapshots_, output_compression_,
+                   cfd_->ioptions()->compression_opts,
+                   mutable_cf_options_.paranoid_file_checks,
+                   cfd_->internal_stats(), Env::IO_HIGH, &table_properties_);
+    info.table_properties = table_properties_;
     LogFlush(db_options_.info_log);
   }
   Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,

db/flush_job.h
@@ -66,6 +66,7 @@ class FlushJob {
   ~FlushJob();

   Status Run(FileMetaData* file_meta = nullptr);
+  TableProperties GetTableProperties() const { return table_properties_; }

  private:
   void ReportStartedFlush();
@@ -89,6 +90,7 @@ class FlushJob {
   CompressionType output_compression_;
   Statistics* stats_;
   EventLogger* event_logger_;
+  TableProperties table_properties_;
 };

 }  // namespace rocksdb

db/listener_test.cc
@@ -152,6 +152,40 @@ class EventListenerTest : public testing::Test {
   std::vector<ColumnFamilyHandle*> handles_;
 };

+struct TestPropertiesCollector : public rocksdb::TablePropertiesCollector {
+  virtual rocksdb::Status AddUserKey(const rocksdb::Slice& key,
+                                     const rocksdb::Slice& value,
+                                     rocksdb::EntryType type,
+                                     rocksdb::SequenceNumber seq,
+                                     uint64_t file_size) override {
+    return Status::OK();
+  }
+  virtual rocksdb::Status Finish(
+      rocksdb::UserCollectedProperties* properties) override {
+    properties->insert({"0", "1"});
+    return Status::OK();
+  }
+
+  virtual const char* Name() const override {
+    return "TestTablePropertiesCollector";
+  }
+
+  rocksdb::UserCollectedProperties GetReadableProperties() const override {
+    rocksdb::UserCollectedProperties ret;
+    ret["2"] = "3";
+    return ret;
+  }
+};
+
+class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory {
+ public:
+  virtual TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context context) override {
+    return new TestPropertiesCollector;
+  }
+  const char* Name() const override { return "TestTablePropertiesCollector"; }
+};
+
 class TestCompactionListener : public EventListener {
  public:
   void OnCompactionCompleted(DB *db, const CompactionJobInfo& ci) override {
@@ -161,6 +195,16 @@ class TestCompactionListener : public EventListener {
     ASSERT_GT(ci.output_files.size(), 0U);
     ASSERT_EQ(db->GetEnv()->GetThreadID(), ci.thread_id);
     ASSERT_GT(ci.thread_id, 0U);
+
+    for (auto fl : {ci.input_files, ci.output_files}) {
+      for (auto fn : fl) {
+        auto it = ci.table_properties.find(fn);
+        ASSERT_NE(it, ci.table_properties.end());
+        auto tp = it->second;
+        ASSERT_TRUE(tp != nullptr);
+        ASSERT_EQ(tp->user_collected_properties.find("0")->second, "1");
+      }
+    }
   }

   std::vector<DB*> compacted_dbs_;
@@ -186,6 +230,8 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) {
   options.enable_thread_tracking = true;
 #endif  // ROCKSDB_USING_THREAD_STATUS
   options.level0_file_num_compaction_trigger = kNumL0Files;
+  options.table_properties_collector_factories.push_back(
+      std::make_shared<TestPropertiesCollectorFactory>());

   TestCompactionListener* listener = new TestCompactionListener();
   options.listeners.emplace_back(listener);
@@ -274,6 +320,8 @@ class TestFlushListener : public EventListener {
     ASSERT_EQ(prev_fc_info_.file_path, info.file_path);
     ASSERT_EQ(db->GetEnv()->GetThreadID(), info.thread_id);
     ASSERT_GT(info.thread_id, 0U);
+    ASSERT_EQ(info.table_properties.user_collected_properties.find("0")->second,
+              "1");
   }

   std::vector<std::string> flushed_column_family_names_;
@@ -299,6 +347,8 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) {
   std::vector<std::string> cf_names = {
       "pikachu", "ilya", "muromec", "dobrynia",
       "nikitich", "alyosha", "popovich"};
+  options.table_properties_collector_factories.push_back(
+      std::make_shared<TestPropertiesCollectorFactory>());
   CreateAndReopenWithCF(cf_names, &options);

   ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
@@ -330,6 +380,8 @@ TEST_F(EventListenerTest, MultiCF) {
 #endif  // ROCKSDB_USING_THREAD_STATUS
   TestFlushListener* listener = new TestFlushListener(options.env);
   options.listeners.emplace_back(listener);
+  options.table_properties_collector_factories.push_back(
+      std::make_shared<TestPropertiesCollectorFactory>());
   std::vector<std::string> cf_names = {
       "pikachu", "ilya", "muromec", "dobrynia",
       "nikitich", "alyosha", "popovich"};
@@ -360,6 +412,8 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) {
 #if ROCKSDB_USING_THREAD_STATUS
   options.enable_thread_tracking = true;
 #endif  // ROCKSDB_USING_THREAD_STATUS
+  options.table_properties_collector_factories.push_back(
+      std::make_shared<TestPropertiesCollectorFactory>());
   std::vector<TestFlushListener*> listeners;
   const int kNumDBs = 5;
   const int kNumListeners = 10;
@@ -454,6 +508,8 @@ TEST_F(EventListenerTest, DisableBGCompaction) {
   options.compaction_style = kCompactionStyleNone;
   options.compression = kNoCompression;
   options.write_buffer_size = 100000;  // Small write buffer
+  options.table_properties_collector_factories.push_back(
+      std::make_shared<TestPropertiesCollectorFactory>());
   CreateAndReopenWithCF({"pikachu"}, &options);

   ColumnFamilyMetaData cf_meta;

db/memtable.cc
@@ -230,10 +230,15 @@ class MemTableIterator: public Iterator {
   virtual void Seek(const Slice& k) override {
     PERF_TIMER_GUARD(seek_on_memtable_time);
     PERF_COUNTER_ADD(seek_on_memtable_count, 1);
-    if (bloom_ != nullptr &&
-        !bloom_->MayContain(prefix_extractor_->Transform(ExtractUserKey(k)))) {
-      valid_ = false;
-      return;
+    if (bloom_ != nullptr) {
+      if (!bloom_->MayContain(
+              prefix_extractor_->Transform(ExtractUserKey(k)))) {
+        PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
+        valid_ = false;
+        return;
+      } else {
+        PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+      }
     }
     iter_->Seek(k, nullptr);
     valid_ = iter_->Valid();
@@ -508,12 +513,18 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
   Slice user_key = key.user_key();
   bool found_final_value = false;
   bool merge_in_progress = s->IsMergeInProgress();
-
-  if (prefix_bloom_ &&
-      !prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key))) {
+  bool const may_contain =
+      nullptr == prefix_bloom_
+          ? false
+          : prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key));
+  if (prefix_bloom_ && !may_contain) {
     // iter is null if prefix bloom says the key does not exist
+    PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
     *seq = kMaxSequenceNumber;
   } else {
+    if (prefix_bloom_) {
+      PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
+    }
     Saver saver;
     saver.status = s;
     saver.found_final_value = &found_final_value;
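These two counters are exported through perf_context, so the memtable bloom behavior is observable from application code, as the new BloomStatsTest cases in db/db_test.cc exercise. A minimal sketch of reading them around a single Get() (the helper name is illustrative; SetPerfLevel(kEnableCount) is assumed to enable counting, as in the tests above):

#include <cstdio>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/perf_context.h"

// Illustrative helper: run one Get() and report whether the memtable
// prefix bloom was consulted, via the counters added above.
void ReportMemtableBloom(rocksdb::DB* db, const std::string& key) {
  rocksdb::SetPerfLevel(rocksdb::kEnableCount);  // make sure counters tick
  rocksdb::perf_context.Reset();

  std::string value;
  db->Get(rocksdb::ReadOptions(), key, &value);  // outcome doesn't matter here

  std::printf("bloom_memtable_hit_count=%llu bloom_memtable_miss_count=%llu\n",
              static_cast<unsigned long long>(
                  rocksdb::perf_context.bloom_memtable_hit_count),
              static_cast<unsigned long long>(
                  rocksdb::perf_context.bloom_memtable_miss_count));
}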

@ -14,7 +14,6 @@
#include "rocksdb/merge_operator.h" #include "rocksdb/merge_operator.h"
#include "util/perf_context_imp.h" #include "util/perf_context_imp.h"
#include "util/statistics.h" #include "util/statistics.h"
#include "util/stop_watch.h"
namespace rocksdb { namespace rocksdb {
@ -41,8 +40,7 @@ Status MergeHelper::TimedFullMerge(const Slice& key, const Slice* value,
bool success = bool success =
merge_operator->FullMerge(key, value, operands, result, logger); merge_operator->FullMerge(key, value, operands, result, logger);
RecordTick(statistics, MERGE_OPERATION_TOTAL_TIME, RecordTick(statistics, MERGE_OPERATION_TOTAL_TIME, timer.ElapsedNanosSafe());
env != nullptr ? timer.ElapsedNanos() : 0);
if (!success) { if (!success) {
RecordTick(statistics, NUMBER_MERGE_FAILURES); RecordTick(statistics, NUMBER_MERGE_FAILURES);
@ -59,30 +57,33 @@ Status MergeHelper::TimedFullMerge(const Slice& key, const Slice* value,
// operands_ stores the list of merge operands encountered while merging. // operands_ stores the list of merge operands encountered while merging.
// keys_[i] corresponds to operands_[i] for each i. // keys_[i] corresponds to operands_[i] for each i.
Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before, Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before,
const bool at_bottom, Statistics* stats, const bool at_bottom) {
Env* env_) {
// Get a copy of the internal key, before it's invalidated by iter->Next() // Get a copy of the internal key, before it's invalidated by iter->Next()
// Also maintain the list of merge operands seen. // Also maintain the list of merge operands seen.
assert(HasOperator()); assert(HasOperator());
keys_.clear(); keys_.clear();
operands_.clear(); operands_.clear();
keys_.push_front(iter->key().ToString());
operands_.push_front(iter->value().ToString());
assert(user_merge_operator_); assert(user_merge_operator_);
bool first_key = true;
// We need to parse the internal key again as the parsed key is // We need to parse the internal key again as the parsed key is
// backed by the internal key! // backed by the internal key!
// Assume no internal key corruption as it has been successfully parsed // Assume no internal key corruption as it has been successfully parsed
// by the caller. // by the caller.
// Invariant: keys_.back() will not change. Hence, orig_ikey is always valid. // original_key_is_iter variable is just caching the information:
// original_key_is_iter == (iter->key().ToString() == original_key)
bool original_key_is_iter = true;
std::string original_key = iter->key().ToString();
// Important:
// orig_ikey is backed by original_key if keys_.empty()
// orig_ikey is backed by keys_.back() if !keys_.empty()
ParsedInternalKey orig_ikey; ParsedInternalKey orig_ikey;
ParseInternalKey(keys_.back(), &orig_ikey); ParseInternalKey(original_key, &orig_ikey);
Status s; Status s;
bool hit_the_next_user_key = false; bool hit_the_next_user_key = false;
for (iter->Next(); iter->Valid(); iter->Next()) { for (; iter->Valid(); iter->Next(), original_key_is_iter = false) {
ParsedInternalKey ikey; ParsedInternalKey ikey;
assert(operands_.size() >= 1); // Should be invariants!
assert(keys_.size() == operands_.size()); assert(keys_.size() == operands_.size());
if (!ParseInternalKey(iter->key(), &ikey)) { if (!ParseInternalKey(iter->key(), &ikey)) {
@ -92,6 +93,9 @@ Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before,
return Status::Corruption("Corrupted internal key not expected."); return Status::Corruption("Corrupted internal key not expected.");
} }
break; break;
} else if (first_key) {
assert(user_comparator_->Equal(ikey.user_key, orig_ikey.user_key));
first_key = false;
} else if (!user_comparator_->Equal(ikey.user_key, orig_ikey.user_key)) { } else if (!user_comparator_->Equal(ikey.user_key, orig_ikey.user_key)) {
// hit a different user key, stop right here // hit a different user key, stop right here
hit_the_next_user_key = true; hit_the_next_user_key = true;
@@ -105,16 +109,29 @@ Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before,
    assert(IsValueType(ikey.type));
    if (ikey.type != kTypeMerge) {
      if (ikey.type != kTypeValue && ikey.type != kTypeDeletion) {
        // Merges operands can only be used with puts and deletions, single
        // deletions are not supported.
        assert(false);
        // release build doesn't have asserts, so we return error status
        return Status::InvalidArgument(
            " Merges operands can only be used with puts and deletions, single "
            "deletions are not supported.");
      }

      // hit a put/delete
      //   => merge the put value or a nullptr with operands_
      //   => store result in operands_.back() (and update keys_.back())
      //   => change the entry type to kTypeValue for keys_.back()
      // We are done! Success!
      //
      // If there are no operands, just return the Status::OK(). That will cause
      // the compaction iterator to write out the key we're currently at, which
      // is the put/delete we just encountered.
      if (keys_.empty()) {
        return Status::OK();
      }

      // TODO(noetzli) If the merge operator returns false, we are currently
      // (almost) silently dropping the put/delete. That's probably not what we
      // want.
@@ -122,14 +139,14 @@ Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before,
      const Slice* val_ptr = (kTypeValue == ikey.type) ? &val : nullptr;
      std::string merge_result;
      s = TimedFullMerge(ikey.user_key, val_ptr, operands_,
                         user_merge_operator_, stats_, env_, logger_,
                         &merge_result);

      // We store the result in keys_.back() and operands_.back()
      // if nothing went wrong (i.e.: no operand corruption on disk)
      if (s.ok()) {
        // The original key encountered
        original_key = std::move(keys_.back());
        orig_ikey.type = kTypeValue;
        UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type);
        keys_.clear();
@@ -143,18 +160,42 @@ Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before,
      return s;
    } else {
      // hit a merge
      //   => if there is a compaction filter, apply it.
      //   => merge the operand into the front of the operands_ list
      //      if not filtered
      //   => then continue because we haven't yet seen a Put/Delete.
      //
      // Keep queuing keys and operands until we either meet a put / delete
      // request or later did a partial merge.

      Slice value_slice = iter->value();
      // add an operand to the list if:
      // 1) it's included in one of the snapshots. in that case we *must* write
      //    it out, no matter what compaction filter says
      // 2) it's not filtered by a compaction filter
      if (ikey.sequence <= latest_snapshot_ ||
          !FilterMerge(orig_ikey.user_key, value_slice)) {
        if (original_key_is_iter) {
          // this is just an optimization that saves us one memcpy
          keys_.push_front(std::move(original_key));
        } else {
          keys_.push_front(iter->key().ToString());
        }
        if (keys_.size() == 1) {
          // we need to re-anchor the orig_ikey because it was anchored by
          // original_key before
          ParseInternalKey(keys_.back(), &orig_ikey);
        }
        operands_.push_front(value_slice.ToString());
      }
    }
  }

  if (operands_.size() == 0) {
    // we filtered out all the merge operands
    return Status::OK();
  }
  // We are sure we have seen this key's entire history if we are at the
  // last level and exhausted all internal keys of this user key.
  // NOTE: !iter->Valid() does not necessarily mean we hit the
@@ -179,11 +220,13 @@ Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before,
    assert(operands_.size() == keys_.size());
    std::string merge_result;
    s = TimedFullMerge(orig_ikey.user_key, nullptr, operands_,
                       user_merge_operator_, stats_, env_, logger_,
                       &merge_result);
    if (s.ok()) {
      // The original key encountered
      // We are certain that keys_ is not empty here (see assertions couple of
      // lines before).
      original_key = std::move(keys_.back());
      orig_ikey.type = kTypeValue;
      UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type);
      keys_.clear();
@@ -205,14 +248,14 @@ Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before,
      bool merge_success = false;
      std::string merge_result;
      {
        StopWatchNano timer(env_, stats_ != nullptr);
        PERF_TIMER_GUARD(merge_operator_time_nanos);
        merge_success = user_merge_operator_->PartialMergeMulti(
            orig_ikey.user_key,
            std::deque<Slice>(operands_.begin(), operands_.end()),
            &merge_result, logger_);
        RecordTick(stats_, MERGE_OPERATION_TOTAL_TIME,
                   timer.ElapsedNanosSafe());
      }
      if (merge_success) {
        // Merging of operands (associative merge) was successful.
@@ -236,7 +279,6 @@ MergeOutputIterator::MergeOutputIterator(const MergeHelper* merge_helper)

void MergeOutputIterator::SeekToFirst() {
  const auto& keys = merge_helper_->keys();
  const auto& values = merge_helper_->values();
  assert(keys.size() == values.size());
  it_keys_ = keys.rbegin();
  it_values_ = values.rbegin();
@@ -247,4 +289,17 @@ void MergeOutputIterator::Next() {
  ++it_keys_;
  ++it_values_;
}
bool MergeHelper::FilterMerge(const Slice& user_key, const Slice& value_slice) {
  if (compaction_filter_ == nullptr) {
    return false;
  }
  if (stats_ != nullptr) {
    filter_timer_.Start();
  }
  bool to_delete =
      compaction_filter_->FilterMergeOperand(level_, user_key, value_slice);
  total_filter_time_ += filter_timer_.ElapsedNanosSafe();
  return to_delete;
}

}  // namespace rocksdb
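To make the new calling convention concrete, here is a minimal usage sketch (not part of the commit): it assumes an already-positioned internal iterator `iter`, plus `env`, `user_comparator`, `merge_operator`, `compaction_filter`, `logger`, `snapshot`, `level` and `stats` supplied by the caller; `Emit()` is a hypothetical placeholder for whatever the caller does with each surviving entry.

  MergeHelper merge(env, user_comparator, merge_operator, compaction_filter,
                    logger, 2 /* min_partial_merge_operands */,
                    false /* assert_valid_internal_key */, snapshot, level,
                    stats);
  Status s = merge.MergeUntil(iter, 0 /* stop_before */, true /* at_bottom */);
  if (s.ok() || s.IsMergeInProgress()) {
    // OK yields a single merged value; MergeInProgress leaves the remaining
    // operand stack in keys()/values().
    MergeOutputIterator out(&merge);
    for (out.SeekToFirst(); out.Valid(); out.Next()) {
      Emit(out.key(), out.value());  // hypothetical downstream consumer
    }
  }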

@@ -10,8 +10,10 @@
#include <string>

#include "db/dbformat.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/env.h"
#include "rocksdb/slice.h"
#include "util/stop_watch.h"

namespace rocksdb {
@@ -23,17 +25,26 @@ class Statistics;

class MergeHelper {
 public:
  MergeHelper(Env* env, const Comparator* user_comparator,
              const MergeOperator* user_merge_operator,
              const CompactionFilter* compaction_filter, Logger* logger,
              unsigned min_partial_merge_operands,
              bool assert_valid_internal_key, SequenceNumber latest_snapshot,
              int level = 0, Statistics* stats = nullptr)
      : env_(env),
        user_comparator_(user_comparator),
        user_merge_operator_(user_merge_operator),
        compaction_filter_(compaction_filter),
        logger_(logger),
        min_partial_merge_operands_(min_partial_merge_operands),
        assert_valid_internal_key_(assert_valid_internal_key),
        latest_snapshot_(latest_snapshot),
        level_(level),
        keys_(),
        operands_(),
        filter_timer_(env_),
        total_filter_time_(0U),
        stats_(stats) {
    assert(user_comparator_ != nullptr);
  }
@@ -62,6 +73,7 @@ class MergeHelper {
  //                 0 means no restriction
  //   at_bottom: (IN) true if the iterator covers the bottom level, which means
  //              we could reach the start of the history of this user key.
  //
  // Returns one of the following statuses:
  //   - OK: Entries were successfully merged.
  //   - MergeInProgress: Put/Delete not encountered and unable to merge operands.
@@ -71,8 +83,11 @@ class MergeHelper {
  //
  // REQUIRED: The first key in the input is not corrupted.
  Status MergeUntil(Iterator* iter, const SequenceNumber stop_before = 0,
                    const bool at_bottom = false);

  // Filters a merge operand using the compaction filter specified
  // in the constructor. Returns true if the operand should be filtered out.
  bool FilterMerge(const Slice& user_key, const Slice& value_slice);
  // Query the merge result
  // These are valid until the next MergeUntil call
@@ -101,19 +116,28 @@ class MergeHelper {
  // TODO: Re-style this comment to be like the first one
  const std::deque<std::string>& keys() const { return keys_; }
  const std::deque<std::string>& values() const { return operands_; }
  uint64_t TotalFilterTime() const { return total_filter_time_; }
  bool HasOperator() const { return user_merge_operator_ != nullptr; }

 private:
  Env* env_;
  const Comparator* user_comparator_;
  const MergeOperator* user_merge_operator_;
  const CompactionFilter* compaction_filter_;
  Logger* logger_;
  unsigned min_partial_merge_operands_;
  bool assert_valid_internal_key_;  // enforce no internal key corruption?
  SequenceNumber latest_snapshot_;
  int level_;

  // the scratch area that holds the result of MergeUntil
  // valid up to the next MergeUntil call
  std::deque<std::string> keys_;      // Keeps track of the sequence of keys seen
  std::deque<std::string> operands_;  // Parallel with keys_; stores the values

  StopWatchNano filter_timer_;
  uint64_t total_filter_time_;
  Statistics* stats_;
};

// MergeOutputIterator can be used to iterate over the result of a merge.

@@ -18,26 +18,18 @@ namespace rocksdb {

class MergeHelperTest : public testing::Test {
 public:
  MergeHelperTest() { env_ = Env::Default(); }

  ~MergeHelperTest() = default;

  Status Run(SequenceNumber stop_before, bool at_bottom,
             SequenceNumber latest_snapshot = 0) {
    iter_.reset(new test::VectorIterator(ks_, vs_));
    iter_->SeekToFirst();
    merge_helper_.reset(new MergeHelper(env_, BytewiseComparator(),
                                        merge_op_.get(), filter_.get(), nullptr,
                                        2U, false, latest_snapshot));
    return merge_helper_->MergeUntil(iter_.get(), stop_before, at_bottom);
  }

  void AddKeyVal(const std::string& user_key, const SequenceNumber& seq,
@@ -51,66 +43,63 @@ class MergeHelperTest : public testing::Test {
    vs_.push_back(val);
  }

  Env* env_;
  std::unique_ptr<test::VectorIterator> iter_;
  std::shared_ptr<MergeOperator> merge_op_;
  std::unique_ptr<MergeHelper> merge_helper_;
  std::vector<std::string> ks_;
  std::vector<std::string> vs_;
  std::unique_ptr<test::FilterNumber> filter_;
};
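The fixture depends on test::EncodeInt and test::FilterNumber from the shared test utilities, which are not part of this diff. Inferred from how the tests below use it, a plausible sketch of FilterNumber (a hypothetical reconstruction, not the actual testutil code) is a compaction filter that drops any merge operand whose fixed64 encoding equals a given number and remembers the last user key it saw:

  class FilterNumber : public CompactionFilter {
   public:
    explicit FilterNumber(uint64_t num) : num_(num) {}

    // Regular values are always kept.
    bool Filter(int level, const Slice& key, const Slice& value,
                std::string* new_value, bool* value_changed) const override {
      return false;
    }

    // Drop a merge operand iff it decodes to num_.
    bool FilterMergeOperand(int level, const Slice& key,
                            const Slice& value) const override {
      last_merge_operand_key_ = key.ToString();
      return value.size() == sizeof(uint64_t) &&
             DecodeFixed64(value.data()) == num_;
    }

    const char* Name() const override { return "FilterNumber"; }

    std::string last_merge_operand_key() const {
      return last_merge_operand_key_;
    }

   private:
    mutable std::string last_merge_operand_key_;
    uint64_t num_;
  };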
// If MergeHelper encounters a new key on the last level, we know that
// the key has no more history and it can merge keys.
TEST_F(MergeHelperTest, MergeAtBottomSuccess) {
  merge_op_ = MergeOperators::CreateUInt64AddOperator();

  AddKeyVal("a", 20, kTypeMerge, test::EncodeInt(1U));
  AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(3U));
  AddKeyVal("b", 10, kTypeMerge, test::EncodeInt(4U));  // <- iter_ after merge

  ASSERT_TRUE(Run(0, true).ok());
  ASSERT_EQ(ks_[2], iter_->key());
  ASSERT_EQ(test::KeyStr("a", 20, kTypeValue), merge_helper_->keys()[0]);
  ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
  ASSERT_EQ(1U, merge_helper_->keys().size());
  ASSERT_EQ(1U, merge_helper_->values().size());
}
// Merging with a value results in a successful merge.
TEST_F(MergeHelperTest, MergeValue) {
  merge_op_ = MergeOperators::CreateUInt64AddOperator();

  AddKeyVal("a", 40, kTypeMerge, test::EncodeInt(1U));
  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
  AddKeyVal("a", 20, kTypeValue, test::EncodeInt(4U));  // <- iter_ after merge
  AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(1U));

  ASSERT_TRUE(Run(0, false).ok());
  ASSERT_EQ(ks_[3], iter_->key());
  ASSERT_EQ(test::KeyStr("a", 40, kTypeValue), merge_helper_->keys()[0]);
  ASSERT_EQ(test::EncodeInt(8U), merge_helper_->values()[0]);
  ASSERT_EQ(1U, merge_helper_->keys().size());
  ASSERT_EQ(1U, merge_helper_->values().size());
}
// Merging stops before a snapshot.
TEST_F(MergeHelperTest, SnapshotBeforeValue) {
  merge_op_ = MergeOperators::CreateUInt64AddOperator();

  AddKeyVal("a", 50, kTypeMerge, test::EncodeInt(1U));
  AddKeyVal("a", 40, kTypeMerge, test::EncodeInt(3U));  // <- iter_ after merge
  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(1U));
  AddKeyVal("a", 20, kTypeValue, test::EncodeInt(4U));
  AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(1U));

  ASSERT_TRUE(Run(31, true).IsMergeInProgress());
  ASSERT_EQ(ks_[2], iter_->key());
  ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[0]);
  ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
  ASSERT_EQ(1U, merge_helper_->keys().size());
  ASSERT_EQ(1U, merge_helper_->values().size());
}
@@ -118,11 +107,13 @@ TEST_F(MergeHelperTest, SnapshotBeforeValue) {

// MergeHelper preserves the operand stack for merge operators that
// cannot do a partial merge.
TEST_F(MergeHelperTest, NoPartialMerge) {
  merge_op_ = MergeOperators::CreateStringAppendTESTOperator();

  AddKeyVal("a", 50, kTypeMerge, "v2");
  AddKeyVal("a", 40, kTypeMerge, "v");  // <- iter_ after merge
  AddKeyVal("a", 30, kTypeMerge, "v");

  ASSERT_TRUE(Run(31, true).IsMergeInProgress());
  ASSERT_EQ(ks_[2], iter_->key());
  ASSERT_EQ(test::KeyStr("a", 40, kTypeMerge), merge_helper_->keys()[0]);
  ASSERT_EQ("v", merge_helper_->values()[0]);
@@ -134,44 +125,162 @@ TEST_F(MergeHelperTest, NoPartialMerge) {

// A single operand can not be merged.
TEST_F(MergeHelperTest, SingleOperand) {
  merge_op_ = MergeOperators::CreateUInt64AddOperator();

  AddKeyVal("a", 50, kTypeMerge, test::EncodeInt(1U));

  ASSERT_TRUE(Run(31, true).IsMergeInProgress());
  ASSERT_FALSE(iter_->Valid());
  ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[0]);
  ASSERT_EQ(test::EncodeInt(1U), merge_helper_->values()[0]);
  ASSERT_EQ(1U, merge_helper_->keys().size());
  ASSERT_EQ(1U, merge_helper_->values().size());
}
// Merging with a deletion turns the deletion into a value
TEST_F(MergeHelperTest, MergeDeletion) {
  merge_op_ = MergeOperators::CreateUInt64AddOperator();

  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
  AddKeyVal("a", 20, kTypeDeletion, "");

  ASSERT_TRUE(Run(15, false).ok());
  ASSERT_FALSE(iter_->Valid());
  ASSERT_EQ(test::KeyStr("a", 30, kTypeValue), merge_helper_->keys()[0]);
  ASSERT_EQ(test::EncodeInt(3U), merge_helper_->values()[0]);
  ASSERT_EQ(1U, merge_helper_->keys().size());
  ASSERT_EQ(1U, merge_helper_->values().size());
}
// The merge helper stops upon encountering a corrupt key
TEST_F(MergeHelperTest, CorruptKey) {
  merge_op_ = MergeOperators::CreateUInt64AddOperator();

  AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
  AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(1U));
  // Corrupt key
  AddKeyVal("a", 20, kTypeDeletion, "", true);  // <- iter_ after merge

  ASSERT_TRUE(Run(15, false).IsMergeInProgress());
  ASSERT_EQ(ks_[2], iter_->key());
  ASSERT_EQ(test::KeyStr("a", 30, kTypeMerge), merge_helper_->keys()[0]);
  ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
  ASSERT_EQ(1U, merge_helper_->keys().size());
  ASSERT_EQ(1U, merge_helper_->values().size());
}
// The compaction filter is called on every merge operand
TEST_F(MergeHelperTest, FilterMergeOperands) {
merge_op_ = MergeOperators::CreateUInt64AddOperator();
filter_.reset(new test::FilterNumber(5U));
AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(5U)); // Filtered
AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(3U));
AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(1U));
AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U)); // Filtered
AddKeyVal("a", 25, kTypeValue, test::EncodeInt(1U));
ASSERT_TRUE(Run(15, false).ok());
ASSERT_FALSE(iter_->Valid());
MergeOutputIterator merge_output_iter(merge_helper_.get());
merge_output_iter.SeekToFirst();
ASSERT_EQ(test::KeyStr("a", 30, kTypeValue),
merge_output_iter.key().ToString());
ASSERT_EQ(test::EncodeInt(8U), merge_output_iter.value().ToString());
merge_output_iter.Next();
ASSERT_FALSE(merge_output_iter.Valid());
}
TEST_F(MergeHelperTest, FilterAllMergeOperands) {
merge_op_ = MergeOperators::CreateUInt64AddOperator();
filter_.reset(new test::FilterNumber(5U));
AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));
AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(5U));
AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(5U));
AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(5U));
AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));
AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));
// filtered out all
ASSERT_TRUE(Run(15, false).ok());
ASSERT_FALSE(iter_->Valid());
MergeOutputIterator merge_output_iter(merge_helper_.get());
merge_output_iter.SeekToFirst();
ASSERT_FALSE(merge_output_iter.Valid());
// we have one operand that will survive because it's a delete
AddKeyVal("a", 24, kTypeDeletion, test::EncodeInt(5U));
AddKeyVal("b", 23, kTypeValue, test::EncodeInt(5U));
ASSERT_TRUE(Run(15, true).ok());
merge_output_iter = MergeOutputIterator(merge_helper_.get());
ASSERT_TRUE(iter_->Valid());
merge_output_iter.SeekToFirst();
ASSERT_FALSE(merge_output_iter.Valid());
// when all merge operands are filtered out, we leave the iterator pointing to
// the Put/Delete that survived
ASSERT_EQ(test::KeyStr("a", 24, kTypeDeletion), iter_->key().ToString());
ASSERT_EQ(test::EncodeInt(5U), iter_->value().ToString());
}
// Make sure that merge operands are filtered at the beginning
TEST_F(MergeHelperTest, FilterFirstMergeOperand) {
merge_op_ = MergeOperators::CreateUInt64AddOperator();
filter_.reset(new test::FilterNumber(5U));
AddKeyVal("a", 31, kTypeMerge, test::EncodeInt(5U)); // Filtered
AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U)); // Filtered
AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(2U));
AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(1U));
AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(3U));
AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U)); // Filtered
AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U)); // Filtered
AddKeyVal("b", 24, kTypeValue, test::EncodeInt(5U)); // next user key
ASSERT_OK(Run(15, true));
ASSERT_TRUE(iter_->Valid());
MergeOutputIterator merge_output_iter(merge_helper_.get());
merge_output_iter.SeekToFirst();
// sequence number is 29 here, because the first merge operand got filtered
// out
ASSERT_EQ(test::KeyStr("a", 29, kTypeValue),
merge_output_iter.key().ToString());
ASSERT_EQ(test::EncodeInt(6U), merge_output_iter.value().ToString());
merge_output_iter.Next();
ASSERT_FALSE(merge_output_iter.Valid());
// make sure that we're passing user keys into the filter
ASSERT_EQ("a", filter_->last_merge_operand_key());
}
// Make sure that merge operands are not filtered out if there's a snapshot
// pointing at them
TEST_F(MergeHelperTest, DontFilterMergeOperandsBeforeSnapshotTest) {
merge_op_ = MergeOperators::CreateUInt64AddOperator();
filter_.reset(new test::FilterNumber(5U));
AddKeyVal("a", 31, kTypeMerge, test::EncodeInt(5U));
AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));
AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(2U));
AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(1U));
AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(3U));
AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));
AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));
AddKeyVal("b", 24, kTypeValue, test::EncodeInt(5U));
ASSERT_OK(Run(15, true, 32));
ASSERT_TRUE(iter_->Valid());
MergeOutputIterator merge_output_iter(merge_helper_.get());
merge_output_iter.SeekToFirst();
ASSERT_EQ(test::KeyStr("a", 31, kTypeValue),
merge_output_iter.key().ToString());
ASSERT_EQ(test::EncodeInt(26U), merge_output_iter.value().ToString());
merge_output_iter.Next();
ASSERT_FALSE(merge_output_iter.Valid());
}
}  // namespace rocksdb

int main(int argc, char** argv) {

@@ -290,10 +290,11 @@ class Repairer {
      ro.total_order_seek = true;
      Arena arena;
      ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
      status = BuildTable(
          dbname_, env_, ioptions_, env_options_, table_cache_, iter.get(),
          &meta, icmp_, &int_tbl_prop_collector_factories_,
          TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, {},
          kNoCompression, CompressionOptions(), false, nullptr);
    }
    delete mem->Unref();
    delete cf_mems_default;

@@ -41,7 +41,8 @@ class IntTblPropCollectorFactory {
 public:
  virtual ~IntTblPropCollectorFactory() {}
  // has to be thread-safe
  virtual IntTblPropCollector* CreateIntTblPropCollector(
      uint32_t column_family_id) = 0;

  // The name of the properties collector can be used for debugging purpose.
  virtual const char* Name() const = 0;
@@ -69,7 +70,8 @@ class InternalKeyPropertiesCollector : public IntTblPropCollector {
class InternalKeyPropertiesCollectorFactory
    : public IntTblPropCollectorFactory {
 public:
  virtual IntTblPropCollector* CreateIntTblPropCollector(
      uint32_t column_family_id) override {
    return new InternalKeyPropertiesCollector();
  }
@@ -114,9 +116,12 @@ class UserKeyTablePropertiesCollectorFactory
  explicit UserKeyTablePropertiesCollectorFactory(
      std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory)
      : user_collector_factory_(user_collector_factory) {}
  virtual IntTblPropCollector* CreateIntTblPropCollector(
      uint32_t column_family_id) override {
    TablePropertiesCollectorFactory::Context context;
    context.column_family_id = column_family_id;
    return new UserKeyTablePropertiesCollector(
        user_collector_factory_->CreateTablePropertiesCollector(context));
  }

  virtual const char* Name() const override {

@@ -35,6 +35,8 @@ class TablePropertiesTest : public testing::Test,

// Utilities test functions
namespace {

static const uint32_t kTestColumnFamilyId = 66;

void MakeBuilder(const Options& options, const ImmutableCFOptions& ioptions,
                 const InternalKeyComparator& internal_comparator,
                 const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
@@ -46,7 +48,8 @@ void MakeBuilder(const Options& options, const ImmutableCFOptions& ioptions,
  builder->reset(NewTableBuilder(
      ioptions, internal_comparator, int_tbl_prop_collector_factories,
      kTestColumnFamilyId /* column_family_id */, writable->get(),
      options.compression, options.compression_opts));
}
}  // namespace
@@ -178,14 +181,17 @@ class RegularKeysStartWithAFactory : public IntTblPropCollectorFactory,
 public:
  explicit RegularKeysStartWithAFactory(bool backward_mode)
      : backward_mode_(backward_mode) {}
  virtual TablePropertiesCollector* CreateTablePropertiesCollector(
      TablePropertiesCollectorFactory::Context context) override {
    EXPECT_EQ(kTestColumnFamilyId, context.column_family_id);
    if (!backward_mode_) {
      return new RegularKeysStartWithA();
    } else {
      return new RegularKeysStartWithABackwardCompatible();
    }
  }
  virtual IntTblPropCollector* CreateIntTblPropCollector(
      uint32_t column_family_id) override {
    return new RegularKeysStartWithAInternal();
  }
  const char* Name() const override { return "RegularKeysStartWithA"; }

@@ -12,6 +12,7 @@
#include "db/version_set.h"
#include "util/coding.h"
#include "util/event_logger.h"
#include "util/sync_point.h"
#include "rocksdb/slice.h"

namespace rocksdb {
@@ -32,12 +33,22 @@ enum Tag {
  // these are new formats divergent from open source leveldb
  kNewFile2 = 100,
  kNewFile3 = 102,
  kNewFile4 = 103,  // 4th (the latest) format version of adding files

  kColumnFamily = 200,  // specify column family for version edit
  kColumnFamilyAdd = 201,
  kColumnFamilyDrop = 202,
  kMaxColumnFamily = 203,
};
enum CustomTag {
  kTerminate = 1,  // The end of customized fields
  kNeedCompaction = 2,
  kPathId = 65,
};
// If this bit for the custom tag is set, opening DB should fail if
// we don't know this field.
uint32_t kCustomTagNonSafeIgnoreMask = 1 << 6;
uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) {
  assert(number <= kFileNumberMask);
  return number | (path_id * (kFileNumberMask + 1));
@@ -102,7 +113,11 @@ bool VersionEdit::EncodeTo(std::string* dst) const {
    if (!f.smallest.Valid() || !f.largest.Valid()) {
      return false;
    }
    bool has_customized_fields = false;
    if (f.marked_for_compaction) {
      PutVarint32(dst, kNewFile4);
      has_customized_fields = true;
    } else if (f.fd.GetPathId() == 0) {
      // Use older format to make sure user can roll back the build if they
      // don't config multiple DB paths.
      PutVarint32(dst, kNewFile2);
@@ -111,7 +126,8 @@ bool VersionEdit::EncodeTo(std::string* dst) const {
    }
    PutVarint32(dst, new_files_[i].first);  // level
    PutVarint64(dst, f.fd.GetNumber());
    if (f.fd.GetPathId() != 0 && !has_customized_fields) {
      // kNewFile3
      PutVarint32(dst, f.fd.GetPathId());
    }
    PutVarint64(dst, f.fd.GetFileSize());
@@ -119,6 +135,48 @@ bool VersionEdit::EncodeTo(std::string* dst) const {
    PutLengthPrefixedSlice(dst, f.largest.Encode());
    PutVarint64(dst, f.smallest_seqno);
    PutVarint64(dst, f.largest_seqno);
    if (has_customized_fields) {
      // Customized fields' format:
      // +-----------------------------+
      // | 1st field's tag (varint32)  |
      // +-----------------------------+
      // | 1st field's size (varint32) |
      // +-----------------------------+
      // |    bytes for 1st field      |
      // |  (based on size decoded)    |
      // +-----------------------------+
      // |                             |
      // |          ......             |
      // |                             |
      // +-----------------------------+
      // | last field's size (varint32)|
      // +-----------------------------+
      // |    bytes for last field     |
      // |  (based on size decoded)    |
      // +-----------------------------+
      // | terminating tag (varint32)  |
      // +-----------------------------+
      //
      // Customized encoding for fields:
      //   tag kPathId: 1 byte as path_id
      //   tag kNeedCompaction:
      //        now only can take one char value 1 indicating need-compaction
      //
      if (f.fd.GetPathId() != 0) {
        PutVarint32(dst, CustomTag::kPathId);
        char p = static_cast<char>(f.fd.GetPathId());
        PutLengthPrefixedSlice(dst, Slice(&p, 1));
      }
      if (f.marked_for_compaction) {
        PutVarint32(dst, CustomTag::kNeedCompaction);
        char p = static_cast<char>(1);
        PutLengthPrefixedSlice(dst, Slice(&p, 1));
      }
      TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
                               dst);
      PutVarint32(dst, CustomTag::kTerminate);
    }
  }

  // 0 is default and does not need to be explicitly written
@@ -161,6 +219,63 @@ bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) {
  }
}
const char* VersionEdit::DecodeNewFile4From(Slice* input) {
  const char* msg = nullptr;
  int level;
  FileMetaData f;
  uint64_t number;
  uint32_t path_id = 0;
  uint64_t file_size;
  if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) &&
      GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) &&
      GetInternalKey(input, &f.largest) &&
      GetVarint64(input, &f.smallest_seqno) &&
      GetVarint64(input, &f.largest_seqno)) {
    // See comments in VersionEdit::EncodeTo() for format of customized fields
    while (true) {
      uint32_t custom_tag;
      Slice field;
      if (!GetVarint32(input, &custom_tag)) {
        return "new-file4 custom field";
      }
      if (custom_tag == kTerminate) {
        break;
      }
      if (!GetLengthPrefixedSlice(input, &field)) {
        return "new-file4 custom field length prefixed slice error";
      }
      switch (custom_tag) {
        case kPathId:
          if (field.size() != 1) {
            return "path_id field wrong size";
          }
          path_id = field[0];
          if (path_id > 3) {
            return "path_id wrong value";
          }
          break;
        case kNeedCompaction:
          if (field.size() != 1) {
            return "need_compaction field wrong size";
          }
          f.marked_for_compaction = (field[0] == 1);
          break;
        default:
          if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) {
            // Should not proceed if cannot understand it
            return "new-file4 custom field not supported";
          }
          break;
      }
    }
  } else {
    return "new-file4 entry";
  }
  f.fd = FileDescriptor(number, path_id, file_size);
  new_files_.push_back(std::make_pair(level, f));
  return nullptr;
}
Status VersionEdit::DecodeFrom(const Slice& src) {
  Clear();
  Slice input = src;
@@ -304,6 +419,11 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
        break;
      }

      case kNewFile4: {
        msg = DecodeNewFile4From(&input);
        break;
      }

      case kColumnFamily:
        if (!GetVarint32(&input, &column_family_)) {
          if (!msg) {

@@ -237,6 +237,8 @@ class VersionEdit {
  bool EncodeTo(std::string* dst) const;
  Status DecodeFrom(const Slice& src);

  const char* DecodeNewFile4From(Slice* input);

  typedef std::set<std::pair<int, uint64_t>> DeletedFileSet;

  const DeletedFileSet& GetDeletedFiles() { return deleted_files_; }

@@ -8,6 +8,7 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#include "db/version_edit.h"
#include "util/sync_point.h"
#include "util/testharness.h"

namespace rocksdb {
@@ -45,6 +46,121 @@ TEST_F(VersionEditTest, EncodeDecode) {
  TestEncodeDecode(edit);
}
TEST_F(VersionEditTest, EncodeDecodeNewFile4) {
static const uint64_t kBig = 1ull << 50;
VersionEdit edit;
edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
kBig + 600, true);
edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
kBig + 601, false);
edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue),
InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502,
kBig + 602, true);
edit.DeleteFile(4, 700);
edit.SetComparatorName("foo");
edit.SetLogNumber(kBig + 100);
edit.SetNextFile(kBig + 200);
edit.SetLastSequence(kBig + 1000);
TestEncodeDecode(edit);
std::string encoded, encoded2;
edit.EncodeTo(&encoded);
VersionEdit parsed;
Status s = parsed.DecodeFrom(encoded);
ASSERT_TRUE(s.ok()) << s.ToString();
auto& new_files = parsed.GetNewFiles();
ASSERT_TRUE(new_files[0].second.marked_for_compaction);
ASSERT_TRUE(!new_files[1].second.marked_for_compaction);
ASSERT_TRUE(new_files[2].second.marked_for_compaction);
ASSERT_EQ(3, new_files[0].second.fd.GetPathId());
ASSERT_EQ(3, new_files[1].second.fd.GetPathId());
ASSERT_EQ(0, new_files[2].second.fd.GetPathId());
}
TEST_F(VersionEditTest, ForwardCompatibleNewFile4) {
static const uint64_t kBig = 1ull << 50;
VersionEdit edit;
edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
kBig + 600, true);
edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
kBig + 601, false);
edit.DeleteFile(4, 700);
edit.SetComparatorName("foo");
edit.SetLogNumber(kBig + 100);
edit.SetNextFile(kBig + 200);
edit.SetLastSequence(kBig + 1000);
std::string encoded;
// Call back function to add extra customized fields.
bool first = true;
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"VersionEdit::EncodeTo:NewFile4:CustomizeFields", [&](void* arg) {
std::string* str = reinterpret_cast<std::string*>(arg);
PutVarint32(str, 33);
const std::string str1 = "random_string";
PutLengthPrefixedSlice(str, str1);
if (first) {
first = false;
PutVarint32(str, 22);
const std::string str2 = "s";
PutLengthPrefixedSlice(str, str2);
}
});
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
edit.EncodeTo(&encoded);
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
VersionEdit parsed;
Status s = parsed.DecodeFrom(encoded);
ASSERT_TRUE(s.ok()) << s.ToString();
ASSERT_TRUE(!first);
auto& new_files = parsed.GetNewFiles();
ASSERT_TRUE(new_files[0].second.marked_for_compaction);
ASSERT_TRUE(!new_files[1].second.marked_for_compaction);
ASSERT_EQ(3, new_files[0].second.fd.GetPathId());
ASSERT_EQ(3, new_files[1].second.fd.GetPathId());
ASSERT_EQ(1u, parsed.GetDeletedFiles().size());
}
TEST_F(VersionEditTest, NewFile4NotSupportedField) {
static const uint64_t kBig = 1ull << 50;
VersionEdit edit;
edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
kBig + 600, true);
edit.SetComparatorName("foo");
edit.SetLogNumber(kBig + 100);
edit.SetNextFile(kBig + 200);
edit.SetLastSequence(kBig + 1000);
std::string encoded;
// Call back function to add extra customized fields.
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"VersionEdit::EncodeTo:NewFile4:CustomizeFields", [&](void* arg) {
std::string* str = reinterpret_cast<std::string*>(arg);
const std::string str1 = "s";
PutLengthPrefixedSlice(str, str1);
});
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
edit.EncodeTo(&encoded);
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
VersionEdit parsed;
Status s = parsed.DecodeFrom(encoded);
ASSERT_NOK(s);
}
TEST_F(VersionEditTest, EncodeEmptyFile) {
  VersionEdit edit;
  edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false);

@@ -666,6 +666,7 @@ class VersionSet {
  Status GetMetadataForFile(uint64_t number, int* filelevel,
                            FileMetaData** metadata, ColumnFamilyData** cfd);

  // This function doesn't support leveldb SST filenames
  void GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata);

  void GetObsoleteFiles(std::vector<FileMetaData*>* files,

@@ -1,6 +1,7 @@
c_simple_example
column_families_example
compact_files_example
compaction_filter_example
optimistic_transaction_example
simple_example
transaction_example

@@ -10,6 +10,9 @@ simple_example: librocksdb simple_example.cc
column_families_example: librocksdb column_families_example.cc
	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)

compaction_filter_example: librocksdb compaction_filter_example.cc
	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)

compact_files_example: librocksdb compact_files_example.cc
	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)

@@ -26,7 +29,7 @@ transaction_example: librocksdb transaction_example.cc
	$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)

clean:
	rm -rf ./simple_example ./column_families_example ./compact_files_example ./compaction_filter_example ./c_simple_example c_simple_example.o ./optimistic_transaction_example ./transaction_example

librocksdb:
	cd .. && $(MAKE) librocksdb.a

@@ -0,0 +1,84 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include <rocksdb/compaction_filter.h>
#include <rocksdb/db.h>
#include <rocksdb/merge_operator.h>
#include <rocksdb/options.h>
class MyMerge : public rocksdb::MergeOperator {
public:
bool FullMerge(const rocksdb::Slice& key,
const rocksdb::Slice* existing_value,
const std::deque<std::string>& operand_list,
std::string* new_value,
rocksdb::Logger* logger) const override {
new_value->clear();
if (existing_value != nullptr) {
new_value->assign(existing_value->data(), existing_value->size());
}
for (const std::string& m : operand_list) {
fprintf(stderr, "Merge(%s)\n", m.c_str());
assert(m != "bad"); // the compaction filter filters out bad values
new_value->assign(m);
}
return true;
}
const char* Name() const override { return "MyMerge"; }
};
class MyFilter : public rocksdb::CompactionFilter {
public:
bool Filter(int level, const rocksdb::Slice& key,
const rocksdb::Slice& existing_value, std::string* new_value,
bool* value_changed) const override {
fprintf(stderr, "Filter(%s)\n", key.ToString().c_str());
++count_;
assert(*value_changed == false);
return false;
}
bool FilterMergeOperand(int level, const rocksdb::Slice& key,
const rocksdb::Slice& existing_value) const override {
fprintf(stderr, "FilterMerge(%s)\n", key.ToString().c_str());
++merge_count_;
return existing_value == "bad";
}
const char* Name() const override { return "MyFilter"; }
mutable int count_ = 0;
mutable int merge_count_ = 0;
};
int main() {
rocksdb::DB* raw_db;
rocksdb::Status status;
MyFilter filter;
system("rm -rf /tmp/rocksmergetest");
rocksdb::Options options;
options.create_if_missing = true;
options.merge_operator.reset(new MyMerge);
options.compaction_filter = &filter;
status = rocksdb::DB::Open(options, "/tmp/rocksmergetest", &raw_db);
assert(status.ok());
std::unique_ptr<rocksdb::DB> db(raw_db);
rocksdb::WriteOptions wopts;
db->Merge(wopts, "0", "bad"); // This is filtered out
db->Merge(wopts, "1", "data1");
db->Merge(wopts, "1", "bad");
db->Merge(wopts, "1", "data2");
db->Merge(wopts, "1", "bad");
db->Merge(wopts, "3", "data3");
db->CompactRange(rocksdb::CompactRangeOptions(), nullptr, nullptr);
fprintf(stderr, "filter.count_ = %d\n", filter.count_);
assert(filter.count_ == 1);
fprintf(stderr, "filter.merge_count_ = %d\n", filter.merge_count_);
assert(filter.merge_count_ == 5);
}

@@ -29,25 +29,167 @@
#
# Below is an example of a RocksDB options file:

[Version]
rocksdb_version=4.0.0
options_file_version=1.1

# We support "#" style comment.

[DBOptions]
stats_dump_period_sec=600
max_manifest_file_size=18446744073709551615
bytes_per_sync=0
delayed_write_rate=1048576
WAL_ttl_seconds=0
WAL_size_limit_MB=0
max_subcompactions=1
wal_dir=
wal_bytes_per_sync=0
db_write_buffer_size=0
max_total_wal_size=0
skip_stats_update_on_db_open=false
max_open_files=5000
max_file_opening_threads=1
use_fsync=false
max_background_compactions=1
manifest_preallocation_size=4194304
max_background_flushes=1
is_fd_close_on_exec=true
create_if_missing=false
use_adaptive_mutex=false
enable_thread_tracking=false
disableDataSync=false
max_log_file_size=0
advise_random_on_open=true
create_missing_column_families=false
keep_log_file_num=1000
table_cache_numshardbits=4
error_if_exists=false
skip_log_error_on_recovery=false
allow_os_buffer=true
allow_mmap_reads=false
paranoid_checks=true
delete_obsolete_files_period_micros=21600000000
disable_data_sync=false
log_file_time_to_roll=0
compaction_readahead_size=0
db_log_dir=
new_table_reader_for_compaction_inputs=false
allow_mmap_writes=false
[CFOptions "default"] [CFOptions "default"]
# ColumnFamilyOptions section must follow the format of compaction_style=kCompactionStyleLevel
# [CFOptions "cf name"]. If a rocksdb instance compaction_filter=nullptr
# has multiple column families, then its CFOptions must be num_levels=7
# specified in the same order as column family creation order. table_factory=BlockBasedTable
[CFOptions "the second column family"] comparator=leveldb.BytewiseComparator
# Each column family must have one section in the RocksDB option max_sequential_skip_in_iterations=8
# file even all the options of this column family are set to soft_rate_limit=0.000000
# default value. max_bytes_for_level_base=536870912
[CFOptions "the third column family"] memtable_prefix_bloom_probes=6
memtable_prefix_bloom_bits=0
memtable_prefix_bloom_huge_page_tlb_size=0
max_successive_merges=0
arena_block_size=0
min_write_buffer_number_to_merge=2
target_file_size_multiplier=1
source_compaction_factor=1
max_bytes_for_level_multiplier=10
compaction_filter_factory=nullptr
max_write_buffer_number=6
level0_stop_writes_trigger=24
compression=kSnappyCompression
level0_file_num_compaction_trigger=2
purge_redundant_kvs_while_flush=true
max_write_buffer_number_to_maintain=0
memtable_factory=SkipListFactory
max_grandparent_overlap_factor=10
expanded_compaction_factor=25
hard_pending_compaction_bytes_limit=0
inplace_update_num_locks=10000
level_compaction_dynamic_level_bytes=false
level0_slowdown_writes_trigger=20
filter_deletes=false
verify_checksums_in_compaction=true
min_partial_merge_operands=2
paranoid_file_checks=false
target_file_size_base=67108864
optimize_filters_for_hits=false
merge_operator=nullptr
compression_per_level=kNoCompression:kNoCompression:kSnappyCompression:kSnappyCompression:kSnappyCompression:kSnappyCompression:kSnappyCompression
compaction_measure_io_stats=false
prefix_extractor=nullptr
bloom_locality=0
write_buffer_size=134217728
disable_auto_compactions=false
inplace_update_support=false
[TableOptions/BlockBasedTable "default"]
format_version=0
whole_key_filtering=true
block_size_deviation=10
block_size=4096
block_restart_interval=16
filter_policy=nullptr
no_block_cache=false
checksum=kCRC32c
cache_index_and_filter_blocks=false
index_type=kBinarySearch
hash_index_allow_collision=true
flush_block_policy_factory=FlushBlockBySizePolicyFactory
[CFOptions "universal"]
compaction_style=kCompactionStyleUniversal
compaction_filter=nullptr
num_levels=7
table_factory=BlockBasedTable
comparator=leveldb.BytewiseComparator
max_sequential_skip_in_iterations=8
soft_rate_limit=0.000000
max_bytes_for_level_base=10485760
memtable_prefix_bloom_probes=6
memtable_prefix_bloom_bits=0
memtable_prefix_bloom_huge_page_tlb_size=0
max_successive_merges=0
arena_block_size=0
min_write_buffer_number_to_merge=2
target_file_size_multiplier=1
source_compaction_factor=1
max_bytes_for_level_multiplier=10
compaction_filter_factory=nullptr
max_write_buffer_number=6
level0_stop_writes_trigger=24
compression=kSnappyCompression
level0_file_num_compaction_trigger=4
purge_redundant_kvs_while_flush=true
max_write_buffer_number_to_maintain=0
memtable_factory=SkipListFactory
max_grandparent_overlap_factor=10
expanded_compaction_factor=25
hard_pending_compaction_bytes_limit=0
inplace_update_num_locks=10000
level_compaction_dynamic_level_bytes=false
level0_slowdown_writes_trigger=20
filter_deletes=false
verify_checksums_in_compaction=true
min_partial_merge_operands=2
paranoid_file_checks=false
target_file_size_base=2097152
optimize_filters_for_hits=false
merge_operator=nullptr
compression_per_level=
compaction_measure_io_stats=false
prefix_extractor=nullptr
bloom_locality=0
write_buffer_size=134217728
disable_auto_compactions=false
inplace_update_support=false
[TableOptions/BlockBasedTable "universal"]
format_version=0
whole_key_filtering=true
block_size_deviation=10
block_size=4096
block_restart_interval=16
filter_policy=nullptr
no_block_cache=false
checksum=kCRC32c
cache_index_and_filter_blocks=false
index_type=kBinarySearch
hash_index_allow_collision=true
flush_block_policy_factory=FlushBlockBySizePolicyFactory
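
As a usage note (not part of the commit): later RocksDB releases ship helpers in rocksdb/utilities/options_util.h for reading such a file back. Assuming a release where LoadOptionsFromFile is available, loading the example above might look roughly like:

  #include <cassert>
  #include <vector>
  #include <rocksdb/utilities/options_util.h>

  int main() {
    rocksdb::DBOptions db_options;
    std::vector<rocksdb::ColumnFamilyDescriptor> cf_descs;
    // Parses the [DBOptions] section into db_options and each
    // [CFOptions "..."] section into one ColumnFamilyDescriptor.
    rocksdb::Status s = rocksdb::LoadOptionsFromFile(
        "examples/rocksdb_option_file_example.ini", rocksdb::Env::Default(),
        &db_options, &cf_descs);
    assert(s.ok());
  }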

@@ -104,6 +104,9 @@ class Cache {
  // returns the memory size for the entries residing in the cache.
  virtual size_t GetUsage() const = 0;

  // returns the memory size for a specific entry in the cache.
  virtual size_t GetUsage(Handle* handle) const = 0;

  // returns the memory size for the entries in use by the system
  virtual size_t GetPinnedUsage() const = 0;
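A short sketch (not from the commit) of the new per-entry overload next to the existing aggregate one; it assumes the Insert()/Release() signatures of this era, where Insert() returns a Handle* directly:

  #include <cassert>
  #include <memory>
  #include <rocksdb/cache.h>

  void CacheUsageDemo() {
    std::shared_ptr<rocksdb::Cache> cache = rocksdb::NewLRUCache(1 << 20);
    static int value = 42;
    rocksdb::Cache::Handle* h =
        cache->Insert("key", &value, sizeof(value) /* charge */,
                      [](const rocksdb::Slice&, void*) {} /* deleter */);
    size_t entry_bytes = cache->GetUsage(h);  // charge of this one entry
    size_t total_bytes = cache->GetUsage();   // charge of all cached entries
    assert(entry_bytes <= total_bytes);
    cache->Release(h);
  }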

@@ -39,6 +39,8 @@ class CompactionFilter {
    // Is this compaction requested by the client (true),
    // or is it occurring as an automatic compaction process
    bool is_manual_compaction;
    // Which column family this compaction is for.
    uint32_t column_family_id;
  };

  virtual ~CompactionFilter() {}
@@ -51,10 +53,24 @@ class CompactionFilter {
  // output of the compaction. The application can inspect
  // the existing value of the key and make a decision based on it.
  //
  // Key-Values that are results of merge operation during compaction are not
  // passed into this function. Currently, when you have a mix of Put()s and
  // Merge()s on the same key, we only guarantee to process the merge operands
  // through the compaction filters. Put()s might be processed, or might not.
  //
  // When the value is to be preserved, the application has the option
  // to modify the existing_value and pass it back through new_value.
  // value_changed needs to be set to true in this case.
  //
// If you use snapshot feature of RocksDB (i.e. call GetSnapshot() API on a
// DB* object), CompactionFilter might not be very useful for you. Due to
// guarantees we need to maintain, compaction process will not call Filter()
// on any keys that were written before the latest snapshot. In other words,
// compaction will only call Filter() on keys written after your most recent
// call to GetSnapshot(). In most cases, Filter() will not be called very
// often. This is something we're fixing. See the discussion at:
// https://www.facebook.com/groups/mysqlonrocksdb/permalink/999723240091865/
//
// If multithreaded compaction is being used *and* a single CompactionFilter // If multithreaded compaction is being used *and* a single CompactionFilter
// instance was supplied via Options::compaction_filter, this method may be // instance was supplied via Options::compaction_filter, this method may be
// called from different threads concurrently. The application must ensure // called from different threads concurrently. The application must ensure
@@ -64,44 +80,23 @@ class CompactionFilter {
   // be used by a single thread that is doing the compaction run, and this
   // call does not need to be thread-safe.  However, multiple filters may be
   // in existence and operating concurrently.
+  //
+  // The last paragraph is not true if you set max_subcompactions to more than
+  // 1. In that case, subcompaction from multiple threads may call a single
+  // CompactionFilter concurrently.
   virtual bool Filter(int level,
                       const Slice& key,
                       const Slice& existing_value,
                       std::string* new_value,
                       bool* value_changed) const = 0;

-  // Returns a name that identifies this compaction filter.
-  // The name will be printed to LOG file on start up for diagnosis.
-  virtual const char* Name() const = 0;
-};
-
-// CompactionFilterV2 that buffers kv pairs sharing the same prefix and let
-// application layer to make individual decisions for all the kv pairs in the
-// buffer.
-class CompactionFilterV2 {
- public:
-  virtual ~CompactionFilterV2() {}
-  // The compaction process invokes this method for all the kv pairs
-  // sharing the same prefix. It is a "roll-up" version of CompactionFilter.
-  //
-  // Each entry in the return vector indicates if the corresponding kv should
-  // be preserved in the output of this compaction run. The application can
-  // inspect the existing values of the keys and make decision based on it.
-  //
-  // When a value is to be preserved, the application has the option
-  // to modify the entry in existing_values and pass it back through an entry
-  // in new_values. A corresponding values_changed entry needs to be set to
-  // true in this case. Note that the new_values vector contains only changed
-  // values, i.e. new_values.size() <= values_changed.size().
-  //
-  typedef std::vector<Slice> SliceVector;
-  virtual std::vector<bool> Filter(int level,
-                                   const SliceVector& keys,
-                                   const SliceVector& existing_values,
-                                   std::vector<std::string>* new_values,
-                                   std::vector<bool>* values_changed)
-      const = 0;
+  // The compaction process invokes this method on every merge operand. If this
+  // method returns true, the merge operand will be ignored and not written out
+  // in the compaction output
+  virtual bool FilterMergeOperand(int level, const Slice& key,
+                                  const Slice& operand) const {
+    return false;
+  }

   // Returns a name that identifies this compaction filter.
   // The name will be printed to LOG file on start up for diagnosis.
@@ -135,65 +130,6 @@ class DefaultCompactionFilterFactory : public CompactionFilterFactory {
   }
 };

-// Each compaction will create a new CompactionFilterV2
-//
-// CompactionFilterFactoryV2 enables application to specify a prefix and use
-// CompactionFilterV2 to filter kv-pairs in batches. Each batch contains all
-// the kv-pairs sharing the same prefix.
-//
-// This is useful for applications that require grouping kv-pairs in
-// compaction filter to make a purge/no-purge decision. For example, if the
-// key prefix is user id and the rest of key represents the type of value.
-// This batching filter will come in handy if the application's compaction
-// filter requires knowledge of all types of values for any user id.
-//
-class CompactionFilterFactoryV2 {
- public:
-  // NOTE: CompactionFilterFactoryV2 will not delete prefix_extractor
-  explicit CompactionFilterFactoryV2(const SliceTransform* prefix_extractor)
-      : prefix_extractor_(prefix_extractor) { }
-
-  virtual ~CompactionFilterFactoryV2() { }
-
-  virtual std::unique_ptr<CompactionFilterV2> CreateCompactionFilterV2(
-      const CompactionFilterContext& context) = 0;
-
-  // Returns a name that identifies this compaction filter factory.
-  virtual const char* Name() const = 0;
-
-  const SliceTransform* GetPrefixExtractor() const {
-    return prefix_extractor_;
-  }
-
-  void SetPrefixExtractor(const SliceTransform* prefix_extractor) {
-    prefix_extractor_ = prefix_extractor;
-  }
-
- private:
-  // Prefix extractor for compaction filter v2
-  // Keys sharing the same prefix will be buffered internally.
-  // Client can implement a Filter callback function to operate on the buffer
-  const SliceTransform* prefix_extractor_;
-};
-
-// Default implementation of CompactionFilterFactoryV2 which does not
-// return any filter
-class DefaultCompactionFilterFactoryV2 : public CompactionFilterFactoryV2 {
- public:
-  explicit DefaultCompactionFilterFactoryV2()
-      : CompactionFilterFactoryV2(nullptr) { }
-
-  virtual std::unique_ptr<CompactionFilterV2> CreateCompactionFilterV2(
-      const CompactionFilterContext& context) override {
-    return std::unique_ptr<CompactionFilterV2>(nullptr);
-  }
-
-  virtual const char* Name() const override {
-    return "DefaultCompactionFilterFactoryV2";
-  }
-};
-
 } // namespace rocksdb

 #endif  // STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
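
With CompactionFilterV2 removed, per-operand filtering moves onto the base class via the new FilterMergeOperand() hook. A hedged sketch of a filter using both hooks; the "expired:" value convention is invented for illustration:

    #include "rocksdb/compaction_filter.h"

    // Hypothetical TTL-style filter: drops keys and merge operands whose
    // values were tagged as expired by the application.
    class ExpiringFilter : public rocksdb::CompactionFilter {
     public:
      bool Filter(int /*level*/, const rocksdb::Slice& /*key*/,
                  const rocksdb::Slice& existing_value,
                  std::string* /*new_value*/,
                  bool* /*value_changed*/) const override {
        return existing_value.starts_with("expired:");  // true == drop the key
      }
      // New in this commit: merge operands now flow through the filter too.
      bool FilterMergeOperand(int /*level*/, const rocksdb::Slice& /*key*/,
                              const rocksdb::Slice& operand) const override {
        return operand.starts_with("expired:");  // true == drop this operand
      }
      const char* Name() const override { return "ExpiringFilter"; }
    };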

@@ -40,7 +40,8 @@ Status GetDBOptionsFromMap(
 Status GetBlockBasedTableOptionsFromMap(
     const BlockBasedTableOptions& table_options,
     const std::unordered_map<std::string, std::string>& opts_map,
-    BlockBasedTableOptions* new_table_options);
+    BlockBasedTableOptions* new_table_options,
+    bool input_strings_escaped = false);

 // Take a string representation of option names and values, apply them into the
 // base_options, and return the new options as a result. The string has the
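
A small sketch of the extended call; the option keys are standard BlockBasedTableOptions map keys, and the new flag is simply left at its default here:

    #include <string>
    #include <unordered_map>
    #include "rocksdb/convenience.h"
    #include "rocksdb/table.h"

    // Sketch: derive a patched BlockBasedTableOptions from a string map.
    rocksdb::Status PatchTableOptions(rocksdb::BlockBasedTableOptions* out) {
      rocksdb::BlockBasedTableOptions base;
      std::unordered_map<std::string, std::string> opts = {
          {"block_size", "16384"},
          {"cache_index_and_filter_blocks", "true"}};
      // The fourth argument (input_strings_escaped) defaults to false.
      return rocksdb::GetBlockBasedTableOptionsFromMap(base, opts, out);
    }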

@@ -0,0 +1,45 @@
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#ifndef ROCKSDB_LITE
#include <string>
#include "rocksdb/db.h"
namespace rocksdb {
struct DumpOptions {
// Database that will be dumped
std::string db_path;
// File location that will contain dump output
std::string dump_location;
// Don't include db information header in the dump
bool anonymous = false;
};
class DbDumpTool {
public:
bool Run(const DumpOptions& dump_options,
rocksdb::Options options = rocksdb::Options());
};
struct UndumpOptions {
// Database that we will load the dumped file into
std::string db_path;
// File location of the dumped file that will be loaded
std::string dump_location;
// Compact the db after loading the dumped file
bool compact_db = false;
};
class DbUndumpTool {
public:
bool Run(const UndumpOptions& undump_options,
rocksdb::Options options = rocksdb::Options());
};
} // namespace rocksdb
#endif // ROCKSDB_LITE
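
Putting the two new tools together, a sketch of a dump/restore round trip (all paths are placeholders):

    #include "rocksdb/db_dump_tool.h"

    // Sketch: dump an existing db to a file, then load it into a new path.
    bool RoundTrip() {
      rocksdb::DumpOptions dump;
      dump.db_path = "/tmp/source_db";      // placeholder path
      dump.dump_location = "/tmp/db.dump";  // placeholder path
      if (!rocksdb::DbDumpTool().Run(dump)) return false;

      rocksdb::UndumpOptions undump;
      undump.db_path = "/tmp/restored_db";  // placeholder path
      undump.dump_location = "/tmp/db.dump";
      undump.compact_db = true;             // compact after loading
      return rocksdb::DbUndumpTool().Run(undump);
    }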

@@ -68,6 +68,9 @@ struct EnvOptions {
   // If true, then use mmap to write data
   bool use_mmap_writes = true;

+  // If false, fallocate() calls are bypassed
+  bool allow_fallocate = true;
+
   // If true, set the FD_CLOEXEC on open fd.
   bool set_fd_cloexec = true;

@@ -4,7 +4,9 @@
 #pragma once

+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>

 #include "rocksdb/compaction_job_stats.h"
 #include "rocksdb/status.h"
@@ -12,6 +14,9 @@
 namespace rocksdb {

+typedef std::unordered_map<std::string, std::shared_ptr<const TableProperties>>
+    TablePropertiesCollection;
+
 class DB;
 class Status;
 struct CompactionJobStats;
@@ -72,6 +77,8 @@ struct FlushJobInfo {
   SequenceNumber smallest_seqno;
   // The largest sequence number in the newly created file
   SequenceNumber largest_seqno;
+  // Table properties of the table being flushed
+  TableProperties table_properties;
 };

 struct CompactionJobInfo {
@@ -93,8 +100,13 @@ struct CompactionJobInfo {
   int output_level;
   // the names of the compaction input files.
   std::vector<std::string> input_files;

   // the names of the compaction output files.
   std::vector<std::string> output_files;
+  // Table properties for input and output tables.
+  // The map is keyed by values from input_files and output_files.
+  TablePropertiesCollection table_properties;
+
   // If non-null, this variable stores detailed information
   // about this compaction.
   CompactionJobStats stats;
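
A sketch of a listener consuming the new fields; the OnFlushCompleted/OnCompactionCompleted signatures are assumed to match this release's EventListener interface:

    #include <iostream>
    #include "rocksdb/listener.h"

    // Sketch: report properties of freshly-written tables.
    class PropsListener : public rocksdb::EventListener {
     public:
      void OnFlushCompleted(rocksdb::DB* /*db*/,
                            const rocksdb::FlushJobInfo& info) override {
        std::cout << "flushed " << info.table_properties.num_entries
                  << " entries\n";
      }
      void OnCompactionCompleted(
          rocksdb::DB* /*db*/, const rocksdb::CompactionJobInfo& info) override {
        for (const auto& file : info.output_files) {
          auto it = info.table_properties.find(file);  // keyed by file name
          if (it != info.table_properties.end()) {
            std::cout << file << ": " << it->second->data_size
                      << " data bytes\n";
          }
        }
      }
    };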

@@ -30,7 +30,6 @@ namespace rocksdb {
 class Cache;
 class CompactionFilter;
 class CompactionFilterFactory;
-class CompactionFilterFactoryV2;
 class Comparator;
 class Env;
 enum InfoLogLevel : unsigned char;
@@ -227,10 +226,6 @@ struct ColumnFamilyOptions {
   // Default: nullptr
   std::shared_ptr<CompactionFilterFactory> compaction_filter_factory;

-  // This is deprecated. Talk to us if you depend on
-  // compaction_filter_factory_v2 and we'll put it back
-  // std::shared_ptr<CompactionFilterFactoryV2> compaction_filter_factory_v2;
-
   // -------------------
   // Parameters that affect performance
@@ -1007,6 +1002,9 @@ struct DBOptions {
   // Default: false
   bool allow_mmap_writes;

+  // If false, fallocate() calls are bypassed
+  bool allow_fallocate;
+
   // Disable child process inherit open files. Default: true
   bool is_fd_close_on_exec;
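
A sketch of the new knob in use, with everything else left at its defaults:

    #include "rocksdb/options.h"

    // Sketch: bypass fallocate(), e.g. on filesystems that don't support it.
    rocksdb::Options MakeOptions() {
      rocksdb::Options options;
      options.create_if_missing = true;
      options.allow_fallocate = false;  // new knob; defaults to true
      return options;
    }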
@@ -1145,9 +1143,7 @@ struct DBOptions {
 // Options to control the behavior of a database (passed to DB::Open)
 struct Options : public DBOptions, public ColumnFamilyOptions {
   // Create an Options object with default values for all fields.
-  Options() :
-      DBOptions(),
-      ColumnFamilyOptions() {}
+  Options() : DBOptions(), ColumnFamilyOptions() {}

   Options(const DBOptions& db_options,
           const ColumnFamilyOptions& column_family_options)

@@ -83,6 +83,14 @@ struct PerfContext {
   uint64_t block_seek_nanos;
   // Time spent on finding or creating a table reader
   uint64_t find_table_nanos;
+  // total number of mem table bloom hits
+  uint64_t bloom_memtable_hit_count;
+  // total number of mem table bloom misses
+  uint64_t bloom_memtable_miss_count;
+  // total number of SST table bloom hits
+  uint64_t bloom_sst_hit_count;
+  // total number of SST table bloom misses
+  uint64_t bloom_sst_miss_count;
 };

 #if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)
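
A sketch of sampling the new bloom counters around a read-heavy section, assuming the thread-local rocksdb::perf_context global of this release:

    #include <iostream>
    #include "rocksdb/perf_context.h"
    #include "rocksdb/perf_level.h"

    // Sketch: enable counting, reset, then read the new per-thread counters.
    void ReportBloomCounters() {
      rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableCount);
      rocksdb::perf_context.Reset();
      // ... issue Get()/Seek() calls on this thread ...
      std::cout << "sst bloom hit/miss: "
                << rocksdb::perf_context.bloom_sst_hit_count << "/"
                << rocksdb::perf_context.bloom_sst_miss_count << "\n";
    }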

@@ -45,6 +45,10 @@ enum Tickers : uint32_t {
   BLOCK_CACHE_DATA_MISS,
   // # of times cache hit when accessing data block from block cache.
   BLOCK_CACHE_DATA_HIT,
+  // # of bytes read from cache.
+  BLOCK_CACHE_BYTES_READ,
+  // # of bytes written into cache.
+  BLOCK_CACHE_BYTES_WRITE,
   // # of times bloom filter has avoided file reads.
   BLOOM_FILTER_USEFUL,
@@ -177,6 +181,8 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
     {BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"},
     {BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"},
     {BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"},
+    {BLOCK_CACHE_BYTES_READ, "rocksdb.block.cache.bytes.read"},
+    {BLOCK_CACHE_BYTES_WRITE, "rocksdb.block.cache.bytes.write"},
     {BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"},
     {MEMTABLE_HIT, "rocksdb.memtable.hit"},
     {MEMTABLE_MISS, "rocksdb.memtable.miss"},
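
A sketch of reading the new byte-level tickers off a Statistics object, e.g. one created via rocksdb::CreateDBStatistics() and installed in DBOptions::statistics:

    #include <memory>
    #include "rocksdb/statistics.h"

    // Sketch: export block-cache byte traffic to a metrics system.
    void LogCacheTraffic(const std::shared_ptr<rocksdb::Statistics>& stats) {
      if (stats) {
        uint64_t bytes_read =
            stats->getTickerCount(rocksdb::BLOCK_CACHE_BYTES_READ);
        uint64_t bytes_written =
            stats->getTickerCount(rocksdb::BLOCK_CACHE_BYTES_WRITE);
        (void)bytes_read;
        (void)bytes_written;  // e.g. push both into your metrics pipeline
      }
    }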

@@ -375,7 +375,7 @@ class TableFactory {
   // to use in this table.
   virtual TableBuilder* NewTableBuilder(
       const TableBuilderOptions& table_builder_options,
-      WritableFileWriter* file) const = 0;
+      uint32_t column_family_id, WritableFileWriter* file) const = 0;

   // Sanitizes the specified DB Options and ColumnFamilyOptions.
   //

@@ -56,6 +56,7 @@ struct TableProperties {
   // user collected properties
   UserCollectedProperties user_collected_properties;
+  UserCollectedProperties readable_properties;

   // convert this object to a human readable form
   //   @prop_delim: delimiter for each property.
@@ -144,9 +145,15 @@ class TablePropertiesCollector {
 // TablePropertiesCollector for each new table
 class TablePropertiesCollectorFactory {
  public:
+  struct Context {
+    uint32_t column_family_id;
+    static const uint32_t kUnknownColumnFamily;
+  };
+
   virtual ~TablePropertiesCollectorFactory() {}
   // has to be thread-safe
-  virtual TablePropertiesCollector* CreateTablePropertiesCollector() = 0;
+  virtual TablePropertiesCollector* CreateTablePropertiesCollector(
+      TablePropertiesCollectorFactory::Context context) = 0;

   // The name of the properties collector can be used for debugging purpose.
   virtual const char* Name() const = 0;
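
A sketch of a factory against the new context-taking signature. It returns nullptr only to stay free of a collector definition; a real factory would construct its TablePropertiesCollector subclass here, passing context.column_family_id along:

    #include "rocksdb/table_properties.h"

    // Sketch: a factory implementing the new Context-aware signature.
    class MyCollectorFactory : public rocksdb::TablePropertiesCollectorFactory {
     public:
      rocksdb::TablePropertiesCollector* CreateTablePropertiesCollector(
          rocksdb::TablePropertiesCollectorFactory::Context context) override {
        // context.column_family_id identifies the owning column family, or
        // Context::kUnknownColumnFamily (e.g. tables built by SstFileWriter).
        // Construct and return your collector here; nullptr keeps this
        // sketch self-contained.
        return nullptr;
      }
      const char* Name() const override { return "MyCollectorFactory"; }
    };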

@@ -61,10 +61,30 @@ class Transaction {
   // methods.  See Transaction::Get() for more details.
   virtual void SetSnapshot() = 0;

+  // Similar to SetSnapshot(), but will not change the current snapshot
+  // until Put/Merge/Delete/GetForUpdate/MultiGetForUpdate is called.
+  // By calling this function, the transaction will essentially call
+  // SetSnapshot() for you right before performing the next write/GetForUpdate.
+  //
+  // Calling SetSnapshotOnNextOperation() will not affect what snapshot is
+  // returned by GetSnapshot() until the next write/GetForUpdate is executed.
+  //
+  // This is an optimization to reduce the likelihood of conflicts that
+  // could occur in between the time SetSnapshot() is called and the first
+  // write/GetForUpdate operation.  Eg, this prevents the following
+  // race-condition:
+  //
+  //   txn1->SetSnapshot();
+  //   txn2->Put("A", ...);
+  //   txn2->Commit();
+  //   txn1->GetForUpdate(opts, "A", ...);  // FAIL!
+  virtual void SetSnapshotOnNextOperation() = 0;
+
   // Returns the Snapshot created by the last call to SetSnapshot().
   //
   // REQUIRED: The returned Snapshot is only valid up until the next time
-  // SetSnapshot() is called or the Transaction is deleted.
+  // SetSnapshot()/SetSnapshotOnNextOperation() is called or the Transaction
+  // is deleted.
   virtual const Snapshot* GetSnapshot() const = 0;

   // Write all batched keys to the db atomically.
@@ -263,6 +283,21 @@ class Transaction {
   // Similar to WriteBatch::PutLogData
   virtual void PutLogData(const Slice& blob) = 0;

+  // By default, all Put/Merge/Delete operations will be indexed in the
+  // transaction so that Get/GetForUpdate/GetIterator can search for these
+  // keys.
+  //
+  // If the caller does not want to fetch the keys about to be written,
+  // they may want to avoid indexing as a performance optimization.
+  // Calling DisableIndexing() will turn off indexing for all future
+  // Put/Merge/Delete operations until EnableIndexing() is called.
+  //
+  // If a key is Put/Merge/Deleted after DisableIndexing is called and then
+  // is fetched via Get/GetForUpdate/GetIterator, the result of the fetch is
+  // undefined.
+  virtual void DisableIndexing() = 0;
+  virtual void EnableIndexing() = 0;
+
   // Returns the number of distinct Keys being tracked by this transaction.
   // If this transaction was created by a TransactionDB, this is the number of
   // keys that are currently locked by this transaction.
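
A sketch combining the two new facilities on an already-created Transaction* (creation and commit boilerplate omitted):

    #include "rocksdb/utilities/transaction.h"

    // Sketch: blind-write a batch of keys with minimal overhead.
    void BlindWrite(rocksdb::Transaction* txn) {
      // Defer the conflict-checking snapshot until the first write.
      txn->SetSnapshotOnNextOperation();
      // Skip the per-key write index while blind-writing.
      txn->DisableIndexing();
      txn->Put("key1", "value1");
      txn->Put("key2", "value2");
      // Writes from here on are indexed again; the two keys above remain
      // unindexed, so fetching them through this txn stays undefined.
      txn->EnableIndexing();
    }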

@@ -5,7 +5,7 @@
 #pragma once

 #define ROCKSDB_MAJOR 4
-#define ROCKSDB_MINOR 0
+#define ROCKSDB_MINOR 1
 #define ROCKSDB_PATCH 0

 // Do not use these. We made the mistake of declaring macros starting with

@@ -2,14 +2,16 @@
 # install all required packages for rocksdb that are available through yum
 ARCH=$(uname -i)
 sudo yum -y install openssl java-1.7.0-openjdk-devel.$ARCH zlib zlib-devel bzip2 bzip2-devel
+sudo yum -y install epel-release-5-4.noarch
+sudo yum -y install snappy snappy-devel

 # install gcc/g++ 4.8.2 via CERN (http://linux.web.cern.ch/linux/devtoolset/)
 sudo wget -O /etc/yum.repos.d/slc5-devtoolset.repo http://linuxsoft.cern.ch/cern/devtoolset/slc5-devtoolset.repo
 sudo wget -O /etc/pki/rpm-gpg/RPM-GPG-KEY-cern http://ftp.mirrorservice.org/sites/ftp.scientificlinux.org/linux/scientific/51/i386/RPM-GPG-KEYs/RPM-GPG-KEY-cern
 sudo yum -y install devtoolset-2

-wget http://gflags.googlecode.com/files/gflags-1.6.tar.gz
-tar xvfz gflags-1.6.tar.gz; cd gflags-1.6; scl enable devtoolset-2 ./configure; scl enable devtoolset-2 make; sudo make install
+wget http://gflags.googlecode.com/files/gflags-2.0-no-svn-files.tar.gz
+tar xvfz gflags-2.0-no-svn-files.tar.gz; cd gflags-2.0; scl enable devtoolset-2 ./configure; scl enable devtoolset-2 make; sudo make install
 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib

 # set java home so we can build rocksdb jars
@@ -18,7 +20,7 @@ export JAVA_HOME=/usr/lib/jvm/java-1.7.0
 # build rocksdb
 cd /rocksdb
 scl enable devtoolset-2 'make jclean clean'
-scl enable devtoolset-2 'make rocksdbjavastatic'
+scl enable devtoolset-2 'PORTABLE=1 make rocksdbjavastatic'
 cp /rocksdb/java/target/librocksdbjni-* /rocksdb-build
 cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build

@@ -233,13 +233,14 @@ class WinMmapReadableFile : public RandomAccessFile {
                       char* scratch) const override {
     Status s;

-    if (offset + n > length_) {
+    if (offset > length_) {
       *result = Slice();
-      s = IOError(fileName_, EINVAL);
-    } else {
-      *result =
-          Slice(reinterpret_cast<const char*>(mapped_region_) + offset, n);
+      return IOError(fileName_, EINVAL);
+    } else if (offset + n > length_) {
+      n = length_ - offset;
     }

+    *result =
+        Slice(reinterpret_cast<const char*>(mapped_region_) + offset, n);
     return s;
   }

@@ -43,65 +43,50 @@ void gettimeofday(struct timeval* tv, struct timezone* /* tz */) {
   tv->tv_usec = usNow.count() - duration_cast<microseconds>(secNow).count();
 }

-Mutex::Mutex(bool adaptive) : lock(m_mutex, std::defer_lock) {}
-
 Mutex::~Mutex() {}

-void Mutex::Lock() {
-  lock.lock();
-#ifndef NDEBUG
-  locked_ = true;
-#endif
-}
-
-void Mutex::Unlock() {
-#ifndef NDEBUG
-  locked_ = false;
-#endif
-  lock.unlock();
-}
-
-void Mutex::AssertHeld() {
-#ifndef NDEBUG
-  assert(locked_);
-#endif
-}
-
-CondVar::CondVar(Mutex* mu) : mu_(mu) {}
-
 CondVar::~CondVar() {}

 void CondVar::Wait() {
+  // Caller must ensure that mutex is held prior to calling this method
+  std::unique_lock<std::mutex> lk(mu_->getLock(), std::adopt_lock);
 #ifndef NDEBUG
   mu_->locked_ = false;
 #endif
-  cv_.wait(mu_->getLock());
+  cv_.wait(lk);
 #ifndef NDEBUG
   mu_->locked_ = true;
 #endif
+  // Release ownership of the lock as we don't want it to be unlocked when
+  // it goes out of scope (as we adopted the lock and didn't lock it ourselves)
+  lk.release();
 }

 bool CondVar::TimedWait(uint64_t abs_time_us) {
-#ifndef NDEBUG
-  mu_->locked_ = false;
-#endif

   using namespace std::chrono;

   // MSVC++ library implements wait_until in terms of wait_for so
-  // there is not an absolute wait anyway.
+  // we need to convert absolute wait into relative wait.
   microseconds usAbsTime(abs_time_us);
   microseconds usNow(
       duration_cast<microseconds>(system_clock::now().time_since_epoch()));
   microseconds relTimeUs =
       (usAbsTime > usNow) ? (usAbsTime - usNow) : microseconds::zero();

-  std::cv_status cvStatus = cv_.wait_for(mu_->getLock(), relTimeUs);
+  // Caller must ensure that mutex is held prior to calling this method
+  std::unique_lock<std::mutex> lk(mu_->getLock(), std::adopt_lock);
+#ifndef NDEBUG
+  mu_->locked_ = false;
+#endif
+  std::cv_status cvStatus = cv_.wait_for(lk, relTimeUs);
 #ifndef NDEBUG
   mu_->locked_ = true;
 #endif
+  // Release ownership of the lock as we don't want it to be unlocked when
+  // it goes out of scope (as we adopted the lock and didn't lock it ourselves)
+  lk.release();

   if (cvStatus == std::cv_status::timeout) {
     return true;

@@ -113,29 +113,50 @@ class CondVar;
 class Mutex {
  public:
-  /* implicit */ Mutex(bool adaptive = false);
+  /* implicit */ Mutex(bool adaptive = false) : locked_(false) {
+  }

   ~Mutex();

-  void Lock();
-  void Unlock();
+  void Lock() {
+    mutex_.lock();
+#ifndef NDEBUG
+    locked_ = true;
+#endif
+  }
+
+  void Unlock() {
+#ifndef NDEBUG
+    locked_ = false;
+#endif
+    mutex_.unlock();
+  }

   // this will assert if the mutex is not locked
   // it does NOT verify that mutex is held by a calling thread
-  void AssertHeld();
+  void AssertHeld() {
+#ifndef NDEBUG
+    assert(locked_);
+#endif
+  }

-  std::unique_lock<std::mutex>& getLock() { return lock; }
+  // Mutex is move only with lock ownership transfer
+  Mutex(const Mutex&) = delete;
+  void operator=(const Mutex&) = delete;

  private:
   friend class CondVar;
-  std::mutex m_mutex;
-  std::unique_lock<std::mutex> lock;
+
+  std::mutex& getLock() {
+    return mutex_;
+  }
+
+  std::mutex mutex_;
 #ifndef NDEBUG
   bool locked_;
 #endif
-
-  // No copying
-  Mutex(const Mutex&);
-  void operator=(const Mutex&);
 };

 class RWMutex {
@@ -162,13 +183,22 @@ class RWMutex {
 class CondVar {
  public:
-  explicit CondVar(Mutex* mu);
+  explicit CondVar(Mutex* mu) : mu_(mu) {
+  }

   ~CondVar();
   void Wait();
   bool TimedWait(uint64_t expiration_time);
   void Signal();
   void SignalAll();

+  // Condition var is not copy/move constructible
+  CondVar(const CondVar&) = delete;
+  CondVar& operator=(const CondVar&) = delete;
+
+  CondVar(CondVar&&) = delete;
+  CondVar& operator=(CondVar&&) = delete;
+
  private:
   std::condition_variable cv_;
   Mutex* mu_;
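
The port now wraps a plain std::mutex, and the condition variable adopts the already-held mutex for each wait, then detaches again. A minimal standalone sketch of that adopt/release idiom (the function names are illustrative, not RocksDB API):

    #include <condition_variable>
    #include <mutex>

    // Sketch: wrap an already-held std::mutex for the duration of a wait,
    // then give ownership back so the guard's destructor won't unlock it.
    void WaitOn(std::condition_variable& cv, std::mutex& mu) {
      // Precondition (as in CondVar::Wait): mu is locked by this thread.
      std::unique_lock<std::mutex> lk(mu, std::adopt_lock);
      cv.wait(lk);   // unlocks while waiting, relocks on wakeup
      lk.release();  // detach: the caller still owns the lock
    }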

@@ -76,6 +76,7 @@ LIB_SOURCES = \
   table/plain_table_reader.cc \
   table/table_properties.cc \
   table/two_level_iterator.cc \
+  tools/dump/db_dump_tool.cc \
   util/arena.cc \
   util/auto_roll_logger.cc \
   util/bloom.cc \
@@ -204,6 +205,7 @@ TEST_BENCH_SOURCES = \
   db/prefix_test.cc \
   db/skiplist_test.cc \
   db/table_properties_collector_test.cc \
+  db/db_test_util.cc \
   db/version_builder_test.cc \
   db/version_edit_test.cc \
   db/version_set_test.cc \
@@ -232,7 +234,6 @@ TEST_BENCH_SOURCES = \
   util/cache_test.cc \
   util/coding_test.cc \
   util/crc32c_test.cc \
-  util/db_test_util.cc \
   util/dynamic_bloom_test.cc \
   util/env_test.cc \
   util/filelock_test.cc \

@@ -66,9 +66,10 @@ Status AdaptiveTableFactory::NewTableReader(
 }

 TableBuilder* AdaptiveTableFactory::NewTableBuilder(
-    const TableBuilderOptions& table_builder_options,
+    const TableBuilderOptions& table_builder_options, uint32_t column_family_id,
     WritableFileWriter* file) const {
-  return table_factory_to_write_->NewTableBuilder(table_builder_options, file);
+  return table_factory_to_write_->NewTableBuilder(table_builder_options,
+                                                  column_family_id, file);
 }

 std::string AdaptiveTableFactory::GetPrintableTableOptions() const {

@@ -40,7 +40,7 @@ class AdaptiveTableFactory : public TableFactory {
   TableBuilder* NewTableBuilder(
       const TableBuilderOptions& table_builder_options,
-      WritableFileWriter* file) const override;
+      uint32_t column_family_id, WritableFileWriter* file) const override;

   // Sanitizes the specified DB Options.
   Status SanitizeOptions(const DBOptions& db_opts,

@@ -7,12 +7,13 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

-#include <algorithm>
 #include "table/block_based_filter_block.h"
+#include <algorithm>

 #include "db/dbformat.h"
 #include "rocksdb/filter_policy.h"
 #include "util/coding.h"
+#include "util/perf_context_imp.h"
 #include "util/string_util.h"

 namespace rocksdb {
@@ -219,7 +220,14 @@ bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry,
     uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4);
     if (start <= limit && limit <= (uint32_t)(offset_ - data_)) {
       Slice filter = Slice(data_ + start, limit - start);
-      return policy_->KeyMayMatch(entry, filter);
+      bool const may_match = policy_->KeyMayMatch(entry, filter);
+      if (may_match) {
+        PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
+        return true;
+      } else {
+        PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
+        return false;
+      }
     } else if (start == limit) {
       // Empty filters do not match any entries
       return false;

@@ -474,7 +474,8 @@ struct BlockBasedTableBuilder::Rep {
       const InternalKeyComparator& icomparator,
       const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
           int_tbl_prop_collector_factories,
-      WritableFileWriter* f, const CompressionType _compression_type,
+      uint32_t column_family_id, WritableFileWriter* f,
+      const CompressionType _compression_type,
       const CompressionOptions& _compression_opts, const bool skip_filters)
       : ioptions(_ioptions),
         table_options(table_opt),
@@ -494,7 +495,7 @@ struct BlockBasedTableBuilder::Rep {
             table_options, data_block)) {
     for (auto& collector_factories : *int_tbl_prop_collector_factories) {
       table_properties_collectors.emplace_back(
-          collector_factories->CreateIntTblPropCollector());
+          collector_factories->CreateIntTblPropCollector(column_family_id));
     }
     table_properties_collectors.emplace_back(
         new BlockBasedTablePropertiesCollector(
@@ -509,7 +510,8 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
     const InternalKeyComparator& internal_comparator,
     const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
         int_tbl_prop_collector_factories,
-    WritableFileWriter* file, const CompressionType compression_type,
+    uint32_t column_family_id, WritableFileWriter* file,
+    const CompressionType compression_type,
     const CompressionOptions& compression_opts, const bool skip_filters) {
   BlockBasedTableOptions sanitized_table_options(table_options);
   if (sanitized_table_options.format_version == 0 &&
@@ -523,8 +525,8 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
   }

   rep_ = new Rep(ioptions, sanitized_table_options, internal_comparator,
-                 int_tbl_prop_collector_factories, file, compression_type,
-                 compression_opts, skip_filters);
+                 int_tbl_prop_collector_factories, column_family_id, file,
+                 compression_type, compression_opts, skip_filters);

   if (rep_->filter_block != nullptr) {
     rep_->filter_block->StartBlock(0);
@@ -871,8 +873,9 @@ TableProperties BlockBasedTableBuilder::GetTableProperties() const {
   TableProperties ret = rep_->props;
   for (const auto& collector : rep_->table_properties_collectors) {
     for (const auto& prop : collector->GetReadableProperties()) {
-      ret.user_collected_properties.insert(prop);
+      ret.readable_properties.insert(prop);
     }
+    collector->Finish(&ret.user_collected_properties);
   }
   return ret;
 }

@@ -40,7 +40,8 @@ class BlockBasedTableBuilder : public TableBuilder {
       const InternalKeyComparator& internal_comparator,
       const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
           int_tbl_prop_collector_factories,
-      WritableFileWriter* file, const CompressionType compression_type,
+      uint32_t column_family_id, WritableFileWriter* file,
+      const CompressionType compression_type,
       const CompressionOptions& compression_opts, const bool skip_filters);

   // REQUIRES: Either Finish() or Abandon() has been called.

@@ -61,13 +61,13 @@ Status BlockBasedTableFactory::NewTableReader(
 }

 TableBuilder* BlockBasedTableFactory::NewTableBuilder(
-    const TableBuilderOptions& table_builder_options,
+    const TableBuilderOptions& table_builder_options, uint32_t column_family_id,
     WritableFileWriter* file) const {
   auto table_builder = new BlockBasedTableBuilder(
       table_builder_options.ioptions, table_options_,
       table_builder_options.internal_comparator,
-      table_builder_options.int_tbl_prop_collector_factories, file,
-      table_builder_options.compression_type,
+      table_builder_options.int_tbl_prop_collector_factories, column_family_id,
+      file, table_builder_options.compression_type,
       table_builder_options.compression_opts,
       table_builder_options.skip_filters);

@@ -48,7 +48,7 @@ class BlockBasedTableFactory : public TableFactory {
   TableBuilder* NewTableBuilder(
       const TableBuilderOptions& table_builder_options,
-      WritableFileWriter* file) const override;
+      uint32_t column_family_id, WritableFileWriter* file) const override;

   // Sanitizes the specified DB Options.
   Status SanitizeOptions(const DBOptions& db_opts,

@@ -117,6 +117,9 @@ Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
     PERF_COUNTER_ADD(block_cache_hit_count, 1);
     // overall cache hit
     RecordTick(statistics, BLOCK_CACHE_HIT);
+    // total bytes read from cache
+    RecordTick(statistics, BLOCK_CACHE_BYTES_READ,
+               block_cache->GetUsage(cache_handle));
     // block-type specific cache hit
     RecordTick(statistics, block_cache_hit_ticker);
   } else {
@@ -795,6 +798,8 @@ Status BlockBasedTable::PutDataBlockToCache(
                                         block->value->usable_size(),
                                         &DeleteCachedEntry<Block>);
     RecordTick(statistics, BLOCK_CACHE_ADD);
+    RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE,
+               block->value->usable_size());
     assert(reinterpret_cast<Block*>(block_cache->Value(block->cache_handle)) ==
            block->value);
   }
@@ -886,6 +891,7 @@ BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
       cache_handle = block_cache->Insert(key, filter, filter_size,
                                          &DeleteCachedEntry<FilterBlockReader>);
       RecordTick(statistics, BLOCK_CACHE_ADD);
+      RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, filter_size);
     }
   }
@@ -944,6 +950,8 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options,
       block_cache->Insert(key, index_reader, index_reader->usable_size(),
                           &DeleteCachedEntry<IndexReader>);
       RecordTick(statistics, BLOCK_CACHE_ADD);
+      RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE,
+                 index_reader->usable_size());
     }
     assert(cache_handle);

@@ -27,7 +27,7 @@ Status CuckooTableFactory::NewTableReader(
 }

 TableBuilder* CuckooTableFactory::NewTableBuilder(
-    const TableBuilderOptions& table_builder_options,
+    const TableBuilderOptions& table_builder_options, uint32_t column_family_id,
     WritableFileWriter* file) const {
   // Ignore the skip_filters flag. Does not apply to this file format
   //

@@ -62,7 +62,7 @@ class CuckooTableFactory : public TableFactory {
   TableBuilder* NewTableBuilder(
       const TableBuilderOptions& table_builder_options,
-      WritableFileWriter* file) const override;
+      uint32_t column_family_id, WritableFileWriter* file) const override;

   // Sanitizes the specified DB Options.
   Status SanitizeOptions(const DBOptions& db_opts,

@@ -8,6 +8,7 @@
 #include "rocksdb/filter_policy.h"
 #include "port/port.h"
 #include "util/coding.h"
+#include "util/perf_context_imp.h"

 namespace rocksdb {
@@ -89,7 +90,13 @@ bool FullFilterBlockReader::PrefixMayMatch(const Slice& prefix,
 bool FullFilterBlockReader::MayMatch(const Slice& entry) {
   if (contents_.size() != 0) {
-    return filter_bits_reader_->MayMatch(entry);
+    if (filter_bits_reader_->MayMatch(entry)) {
+      PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
+      return true;
+    } else {
+      PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
+      return false;
+    }
   }
   return true;  // remain the same with block_based filter
 }

@@ -92,7 +92,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
             user_key_, &value, merge_context_->GetOperands(), value_,
             logger_);
         RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME,
-                   env_ != nullptr ? timer.ElapsedNanos() : 0);
+                   timer.ElapsedNanosSafe());
       }
       if (!merge_success) {
         RecordTick(statistics_, NUMBER_MERGE_FAILURES);
@@ -118,7 +118,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
             user_key_, nullptr, merge_context_->GetOperands(), value_,
             logger_);
         RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME,
-                   env_ != nullptr ? timer.ElapsedNanos() : 0);
+                   timer.ElapsedNanosSafe());
       }
       if (!merge_success) {
         RecordTick(statistics_, NUMBER_MERGE_FAILURES);

@@ -74,7 +74,7 @@ Status MockTableFactory::NewTableReader(
 }

 TableBuilder* MockTableFactory::NewTableBuilder(
-    const TableBuilderOptions& table_builder_options,
+    const TableBuilderOptions& table_builder_options, uint32_t column_family_id,
     WritableFileWriter* file) const {
   uint32_t id = GetAndWriteNextID(file->writable_file());

@@ -151,7 +151,7 @@ class MockTableFactory : public TableFactory {
                              unique_ptr<TableReader>* table_reader) const override;
   TableBuilder* NewTableBuilder(
       const TableBuilderOptions& table_builder_options,
-      WritableFileWriter* file) const override;
+      uint32_t column_family_id, WritableFileWriter* file) const override;

   // This function will directly create mock table instead of going through
   // MockTableBuilder. file_contents has to have a format of <internal_key,

@@ -60,10 +60,10 @@ PlainTableBuilder::PlainTableBuilder(
     const ImmutableCFOptions& ioptions,
     const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
         int_tbl_prop_collector_factories,
-    WritableFileWriter* file, uint32_t user_key_len, EncodingType encoding_type,
-    size_t index_sparseness, uint32_t bloom_bits_per_key, uint32_t num_probes,
-    size_t huge_page_tlb_size, double hash_table_ratio,
-    bool store_index_in_file)
+    uint32_t column_family_id, WritableFileWriter* file, uint32_t user_key_len,
+    EncodingType encoding_type, size_t index_sparseness,
+    uint32_t bloom_bits_per_key, uint32_t num_probes, size_t huge_page_tlb_size,
+    double hash_table_ratio, bool store_index_in_file)
     : ioptions_(ioptions),
       bloom_block_(num_probes),
       file_(file),
@@ -108,7 +108,7 @@ PlainTableBuilder::PlainTableBuilder(
   for (auto& collector_factories : *int_tbl_prop_collector_factories) {
     table_properties_collectors_.emplace_back(
-        collector_factories->CreateIntTblPropCollector());
+        collector_factories->CreateIntTblPropCollector(column_family_id));
   }
 }

@@ -34,11 +34,11 @@ class PlainTableBuilder: public TableBuilder {
       const ImmutableCFOptions& ioptions,
       const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
           int_tbl_prop_collector_factories,
-      WritableFileWriter* file, uint32_t user_key_size,
-      EncodingType encoding_type, size_t index_sparseness,
-      uint32_t bloom_bits_per_key, uint32_t num_probes = 6,
-      size_t huge_page_tlb_size = 0, double hash_table_ratio = 0,
-      bool store_index_in_file = false);
+      uint32_t column_family_id, WritableFileWriter* file,
+      uint32_t user_key_size, EncodingType encoding_type,
+      size_t index_sparseness, uint32_t bloom_bits_per_key,
+      uint32_t num_probes = 6, size_t huge_page_tlb_size = 0,
+      double hash_table_ratio = 0, bool store_index_in_file = false);

   // REQUIRES: Either Finish() or Abandon() has been called.
   ~PlainTableBuilder();

@@ -26,7 +26,7 @@ Status PlainTableFactory::NewTableReader(
 }

 TableBuilder* PlainTableFactory::NewTableBuilder(
-    const TableBuilderOptions& table_builder_options,
+    const TableBuilderOptions& table_builder_options, uint32_t column_family_id,
     WritableFileWriter* file) const {
   // Ignore the skip_filters flag. PlainTable format is optimized for small
   // in-memory dbs. The skip_filters optimization is not useful for plain
@@ -34,9 +34,10 @@ TableBuilder* PlainTableFactory::NewTableBuilder(
   //
   return new PlainTableBuilder(
       table_builder_options.ioptions,
-      table_builder_options.int_tbl_prop_collector_factories, file,
-      user_key_len_, encoding_type_, index_sparseness_, bloom_bits_per_key_, 6,
-      huge_page_tlb_size_, hash_table_ratio_, store_index_in_file_);
+      table_builder_options.int_tbl_prop_collector_factories, column_family_id,
+      file, user_key_len_, encoding_type_, index_sparseness_,
+      bloom_bits_per_key_, 6, huge_page_tlb_size_, hash_table_ratio_,
+      store_index_in_file_);
 }

 std::string PlainTableFactory::GetPrintableTableOptions() const {

@@ -159,7 +159,7 @@ class PlainTableFactory : public TableFactory {
                              unique_ptr<TableReader>* table) const override;
   TableBuilder* NewTableBuilder(
       const TableBuilderOptions& table_builder_options,
-      WritableFileWriter* file) const override;
+      uint32_t column_family_id, WritableFileWriter* file) const override;

   std::string GetPrintableTableOptions() const override;

@@ -488,7 +488,17 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
 }

 bool PlainTableReader::MatchBloom(uint32_t hash) const {
-  return !enable_bloom_ || bloom_.MayContainHash(hash);
+  if (!enable_bloom_) {
+    return true;
+  }
+
+  if (bloom_.MayContainHash(hash)) {
+    PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
+    return true;
+  } else {
+    PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
+    return false;
+  }
 }

 Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset,

@@ -57,7 +57,8 @@ class SstFileWriter::SstFileWriterPropertiesCollectorFactory
   explicit SstFileWriterPropertiesCollectorFactory(int32_t version)
       : version_(version) {}

-  virtual IntTblPropCollector* CreateIntTblPropCollector() override {
+  virtual IntTblPropCollector* CreateIntTblPropCollector(
+      uint32_t column_family_id) override {
     return new SstFileWriterPropertiesCollector(version_);
   }
@@ -117,7 +118,9 @@ Status SstFileWriter::Open(const std::string& file_path) {
   r->file_writer.reset(
       new WritableFileWriter(std::move(sst_file), r->env_options));
   r->builder.reset(r->ioptions.table_factory->NewTableBuilder(
-      table_builder_options, r->file_writer.get()));
+      table_builder_options,
+      TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
+      r->file_writer.get()));

   r->file_info.file_path = file_path;
   r->file_info.file_size = 0;

@@ -12,6 +12,9 @@
 namespace rocksdb {

+const uint32_t TablePropertiesCollectorFactory::Context::kUnknownColumnFamily =
+    port::kMaxInt32;
+
 namespace {
   void AppendProperty(
       std::string& props,

@@ -98,7 +98,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
         TableBuilderOptions(ioptions, ikc, &int_tbl_prop_collector_factories,
                             CompressionType::kNoCompression,
                             CompressionOptions(), false),
-        file_writer.get());
+        0, file_writer.get());
   } else {
     s = DB::Open(opts, dbname, &db);
     ASSERT_OK(s);

@@ -272,6 +272,7 @@ class TableConstructor: public Constructor {
         TableBuilderOptions(ioptions, internal_comparator,
                             &int_tbl_prop_collector_factories,
                             options.compression, CompressionOptions(), false),
+        TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
         file_writer_.get()));

     for (const auto kv : kv_map) {
for (const auto kv : kv_map) { for (const auto kv : kv_map) {
@@ -1423,6 +1424,9 @@ class BlockCachePropertiesSnapshot {
     filter_block_cache_miss =
         statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS);
     filter_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT);
+    block_cache_bytes_read = statistics->getTickerCount(BLOCK_CACHE_BYTES_READ);
+    block_cache_bytes_write =
+        statistics->getTickerCount(BLOCK_CACHE_BYTES_WRITE);
   }

   void AssertIndexBlockStat(int64_t expected_index_block_cache_miss,
@@ -1453,6 +1457,10 @@ class BlockCachePropertiesSnapshot {
               block_cache_hit);
   }

+  int64_t GetCacheBytesRead() { return block_cache_bytes_read; }
+
+  int64_t GetCacheBytesWrite() { return block_cache_bytes_write; }
+
  private:
   int64_t block_cache_miss = 0;
   int64_t block_cache_hit = 0;
@@ -1462,6 +1470,8 @@ class BlockCachePropertiesSnapshot {
   int64_t data_block_cache_hit = 0;
   int64_t filter_block_cache_miss = 0;
   int64_t filter_block_cache_hit = 0;
+  int64_t block_cache_bytes_read = 0;
+  int64_t block_cache_bytes_write = 0;
 };
// Make sure, by default, index/filter blocks were pre-loaded (meaning we won't // Make sure, by default, index/filter blocks were pre-loaded (meaning we won't
@@ -1537,12 +1547,17 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
   // Since block_cache is disabled, no cache activities will be involved.
   unique_ptr<Iterator> iter;

+  int64_t last_cache_bytes_read = 0;
   // At first, no block will be accessed.
   {
     BlockCachePropertiesSnapshot props(options.statistics.get());
     // index will be added to block cache.
     props.AssertEqual(1,  // index block miss
                       0, 0, 0);
+    ASSERT_EQ(props.GetCacheBytesRead(), 0);
+    ASSERT_EQ(props.GetCacheBytesWrite(),
+              table_options.block_cache->GetUsage());
+    last_cache_bytes_read = props.GetCacheBytesRead();
   }

   // Only index block will be accessed
@@ -1554,6 +1569,11 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
     // value; other numbers remain the same.
     props.AssertEqual(1, 0 + 1,  // index block hit
                       0, 0);
+    // Cache hit, bytes read from cache should increase
+    ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read);
+    ASSERT_EQ(props.GetCacheBytesWrite(),
+              table_options.block_cache->GetUsage());
+    last_cache_bytes_read = props.GetCacheBytesRead();
   }

   // Only data block will be accessed
@@ -1562,6 +1582,11 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
     BlockCachePropertiesSnapshot props(options.statistics.get());
     props.AssertEqual(1, 1, 0 + 1,  // data block miss
                       0);
+    // Cache miss, Bytes read from cache should not change
+    ASSERT_EQ(props.GetCacheBytesRead(), last_cache_bytes_read);
+    ASSERT_EQ(props.GetCacheBytesWrite(),
+              table_options.block_cache->GetUsage());
+    last_cache_bytes_read = props.GetCacheBytesRead();
   }

   // Data block will be in cache
@@ -1571,6 +1596,11 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
     BlockCachePropertiesSnapshot props(options.statistics.get());
     props.AssertEqual(1, 1 + 1, /* index block hit */
                       1, 0 + 1 /* data block hit */);
+    // Cache hit, bytes read from cache should increase
+    ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read);
+    ASSERT_EQ(props.GetCacheBytesWrite(),
+              table_options.block_cache->GetUsage());
+    last_cache_bytes_read = props.GetCacheBytesRead();
   }
   // release the iterator so that the block cache can reset correctly.
   iter.reset();
@@ -1587,6 +1617,8 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
     BlockCachePropertiesSnapshot props(options.statistics.get());
     props.AssertEqual(1,  // index block miss
                       0, 0, 0);
+    // Cache miss, Bytes read from cache should not change
+    ASSERT_EQ(props.GetCacheBytesRead(), 0);
   }

   {
@@ -1598,6 +1630,8 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
     props.AssertEqual(1 + 1,  // index block miss
                       0, 0,  // data block miss
                       0);
+    // Cache hit, bytes read from cache should increase
+    ASSERT_EQ(props.GetCacheBytesRead(), 0);
   }

   {
@@ -1607,6 +1641,8 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
     BlockCachePropertiesSnapshot props(options.statistics.get());
     props.AssertEqual(2, 0, 0 + 1,  // data block miss
                       0);
+    // Cache miss, Bytes read from cache should not change
+    ASSERT_EQ(props.GetCacheBytesRead(), 0);
   }
   iter.reset();
@@ -1789,6 +1825,7 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) {
   std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder(
       TableBuilderOptions(ioptions, ikc, &int_tbl_prop_collector_factories,
                           kNoCompression, CompressionOptions(), false),
+      TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
      file_writer.get()));

   for (char c = 'a'; c <= 'z'; ++c) {

@@ -0,0 +1,261 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef ROCKSDB_LITE
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include <iostream>
#include "rocksdb/db.h"
#include "rocksdb/db_dump_tool.h"
#include "rocksdb/env.h"
#include "util/coding.h"
namespace rocksdb {
bool DbDumpTool::Run(const DumpOptions& dump_options,
rocksdb::Options options) {
rocksdb::DB* dbptr;
rocksdb::Status status;
std::unique_ptr<rocksdb::WritableFile> dumpfile;
char hostname[1024];
int64_t timesec;
std::string abspath;
char json[4096];
static const char* magicstr = "ROCKDUMP";
static const char versionstr[8] = {0, 0, 0, 0, 0, 0, 0, 1};
rocksdb::Env* env = rocksdb::Env::Default();
// Open the database
options.create_if_missing = false;
status = rocksdb::DB::OpenForReadOnly(options, dump_options.db_path, &dbptr);
if (!status.ok()) {
std::cerr << "Unable to open database '" << dump_options.db_path
<< "' for reading: " << status.ToString() << std::endl;
return false;
}
const std::unique_ptr<rocksdb::DB> db(dbptr);
status = env->NewWritableFile(dump_options.dump_location, &dumpfile,
rocksdb::EnvOptions());
if (!status.ok()) {
std::cerr << "Unable to open dump file '" << dump_options.dump_location
<< "' for writing: " << status.ToString() << std::endl;
return false;
}
rocksdb::Slice magicslice(magicstr, 8);
status = dumpfile->Append(magicslice);
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
return false;
}
rocksdb::Slice versionslice(versionstr, 8);
status = dumpfile->Append(versionslice);
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
return false;
}
if (dump_options.anonymous) {
snprintf(json, sizeof(json), "{}");
} else {
status = env->GetHostName(hostname, sizeof(hostname));
status = env->GetCurrentTime(&timesec);
status = env->GetAbsolutePath(dump_options.db_path, &abspath);
snprintf(json, sizeof(json),
"{ \"database-path\": \"%s\", \"hostname\": \"%s\", "
"\"creation-time\": %" PRIi64 " }",
abspath.c_str(), hostname, timesec);
}
rocksdb::Slice infoslice(json, strlen(json));
char infosize[4];
rocksdb::EncodeFixed32(infosize, (uint32_t)infoslice.size());
rocksdb::Slice infosizeslice(infosize, 4);
status = dumpfile->Append(infosizeslice);
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
return false;
}
status = dumpfile->Append(infoslice);
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
return false;
}
const std::unique_ptr<rocksdb::Iterator> it(
db->NewIterator(rocksdb::ReadOptions()));
for (it->SeekToFirst(); it->Valid(); it->Next()) {
char keysize[4];
rocksdb::EncodeFixed32(keysize, (uint32_t)it->key().size());
rocksdb::Slice keysizeslice(keysize, 4);
status = dumpfile->Append(keysizeslice);
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
return false;
}
status = dumpfile->Append(it->key());
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
return false;
}
char valsize[4];
rocksdb::EncodeFixed32(valsize, (uint32_t)it->value().size());
rocksdb::Slice valsizeslice(valsize, 4);
status = dumpfile->Append(valsizeslice);
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
return false;
}
status = dumpfile->Append(it->value());
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
return false;
}
}
if (!it->status().ok()) {
std::cerr << "Database iteration failed: " << status.ToString()
<< std::endl;
return false;
}
return true;
}
bool DbUndumpTool::Run(const UndumpOptions& undump_options,
rocksdb::Options options) {
rocksdb::DB* dbptr;
rocksdb::Status status;
rocksdb::Env* env;
std::unique_ptr<rocksdb::SequentialFile> dumpfile;
rocksdb::Slice slice;
char scratch8[8];
static const char* magicstr = "ROCKDUMP";
static const char versionstr[8] = {0, 0, 0, 0, 0, 0, 0, 1};
env = rocksdb::Env::Default();
status = env->NewSequentialFile(undump_options.dump_location, &dumpfile,
rocksdb::EnvOptions());
if (!status.ok()) {
std::cerr << "Unable to open dump file '" << undump_options.dump_location
<< "' for reading: " << status.ToString() << std::endl;
return false;
}
status = dumpfile->Read(8, &slice, scratch8);
if (!status.ok() || slice.size() != 8 ||
memcmp(slice.data(), magicstr, 8) != 0) {
std::cerr << "File '" << undump_options.dump_location
<< "' is not a recognizable dump file." << std::endl;
return false;
}
status = dumpfile->Read(8, &slice, scratch8);
if (!status.ok() || slice.size() != 8 ||
memcmp(slice.data(), versionstr, 8) != 0) {
std::cerr << "File '" << undump_options.dump_location
<< "' version not recognized." << std::endl;
return false;
}
status = dumpfile->Read(4, &slice, scratch8);
if (!status.ok() || slice.size() != 4) {
std::cerr << "Unable to read info blob size." << std::endl;
return false;
}
uint32_t infosize = rocksdb::DecodeFixed32(slice.data());
status = dumpfile->Skip(infosize);
if (!status.ok()) {
std::cerr << "Unable to skip info blob: " << status.ToString() << std::endl;
return false;
}
options.create_if_missing = true;
status = rocksdb::DB::Open(options, undump_options.db_path, &dbptr);
if (!status.ok()) {
std::cerr << "Unable to open database '" << undump_options.db_path
<< "' for writing: " << status.ToString() << std::endl;
return false;
}
const std::unique_ptr<rocksdb::DB> db(dbptr);
uint32_t last_keysize = 64;
size_t last_valsize = 1 << 20;
std::unique_ptr<char[]> keyscratch(new char[last_keysize]);
std::unique_ptr<char[]> valscratch(new char[last_valsize]);
while (1) {
uint32_t keysize, valsize;
rocksdb::Slice keyslice;
rocksdb::Slice valslice;
status = dumpfile->Read(4, &slice, scratch8);
if (!status.ok() || slice.size() != 4) break;
keysize = rocksdb::DecodeFixed32(slice.data());
if (keysize > last_keysize) {
while (keysize > last_keysize) last_keysize *= 2;
keyscratch = std::unique_ptr<char[]>(new char[last_keysize]);
}
status = dumpfile->Read(keysize, &keyslice, keyscratch.get());
if (!status.ok() || keyslice.size() != keysize) {
std::cerr << "Key read failure: "
<< (status.ok() ? "insufficient data" : status.ToString())
<< std::endl;
return false;
}
status = dumpfile->Read(4, &slice, scratch8);
if (!status.ok() || slice.size() != 4) {
std::cerr << "Unable to read value size: "
<< (status.ok() ? "insufficient data" : status.ToString())
<< std::endl;
return false;
}
valsize = rocksdb::DecodeFixed32(slice.data());
if (valsize > last_valsize) {
while (valsize > last_valsize) last_valsize *= 2;
valscratch = std::unique_ptr<char[]>(new char[last_valsize]);
}
status = dumpfile->Read(valsize, &valslice, valscratch.get());
if (!status.ok() || valslice.size() != valsize) {
std::cerr << "Unable to read value: "
<< (status.ok() ? "insufficient data" : status.ToString())
<< std::endl;
return false;
}
status = db->Put(rocksdb::WriteOptions(), keyslice, valslice);
if (!status.ok()) {
fprintf(stderr, "Unable to write database entry\n");
return false;
}
}
if (undump_options.compact_db) {
status = db->CompactRange(rocksdb::CompactRangeOptions(), nullptr, nullptr);
if (!status.ok()) {
fprintf(stderr,
"Unable to compact the database after loading the dumped file\n");
return false;
}
}
return true;
}
} // namespace rocksdb
#endif // ROCKSDB_LITE
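The writer above fixes the dump layout: an 8-byte magic string ("ROCKDUMP"), an 8-byte version, a fixed32 length followed by a JSON info blob, then repeated fixed32-prefixed key and value records until end of file. A minimal sketch, not part of the patch, that validates just the header with the standard library (DecodeFixed32LE mirrors the little-endian layout rocksdb::EncodeFixed32 produces):

#include <cstdint>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

static uint32_t DecodeFixed32LE(const char* p) {
  // Little-endian fixed32, matching the encoding used by the dump tool.
  return static_cast<uint32_t>(static_cast<unsigned char>(p[0])) |
         (static_cast<uint32_t>(static_cast<unsigned char>(p[1])) << 8) |
         (static_cast<uint32_t>(static_cast<unsigned char>(p[2])) << 16) |
         (static_cast<uint32_t>(static_cast<unsigned char>(p[3])) << 24);
}

int main(int argc, char** argv) {
  if (argc != 2) {
    std::cerr << "usage: " << argv[0] << " <dumpfile>" << std::endl;
    return 1;
  }
  std::ifstream in(argv[1], std::ios::binary);
  char magic[8], version[8], sizebuf[4];
  if (!in.read(magic, 8) || memcmp(magic, "ROCKDUMP", 8) != 0) {
    std::cerr << "not a recognizable dump file" << std::endl;
    return 1;
  }
  if (!in.read(version, 8) || !in.read(sizebuf, 4)) {
    std::cerr << "truncated header" << std::endl;
    return 1;
  }
  uint32_t infosize = DecodeFixed32LE(sizebuf);
  std::vector<char> info(infosize);
  if (infosize > 0 && !in.read(info.data(), infosize)) {
    std::cerr << "truncated info blob" << std::endl;
    return 1;
  }
  std::cout << "info blob: " << std::string(info.data(), infosize) << std::endl;
  return 0;
}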

@@ -1,154 +1,63 @@
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#if !(defined GFLAGS) || defined(ROCKSDB_LITE)
#include <cstdio>
int main() {
#ifndef GFLAGS
  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
#endif
#ifdef ROCKSDB_LITE
  fprintf(stderr, "DbDumpTool is not supported in ROCKSDB_LITE\n");
#endif
  return 1;
}
#else
#include <gflags/gflags.h>
#include "rocksdb/convenience.h"
#include "rocksdb/db_dump_tool.h"
DEFINE_string(db_path, "", "Path to the db that will be dumped");
DEFINE_string(dump_location, "", "Path to where the dump file will be written");
DEFINE_bool(anonymous, false,
            "Remove information like db path, creation time from dumped file");
DEFINE_string(db_options, "",
              "Options string used to open the database that will be dumped");
int main(int argc, char** argv) {
  GFLAGS::ParseCommandLineFlags(&argc, &argv, true);
  if (FLAGS_db_path == "" || FLAGS_dump_location == "") {
    fprintf(stderr, "Please set --db_path and --dump_location\n");
    return 1;
  }
  rocksdb::DumpOptions dump_options;
  dump_options.db_path = FLAGS_db_path;
  dump_options.dump_location = FLAGS_dump_location;
  dump_options.anonymous = FLAGS_anonymous;
  rocksdb::Options db_options;
  if (FLAGS_db_options != "") {
    rocksdb::Options parsed_options;
    rocksdb::Status s = rocksdb::GetOptionsFromString(
        db_options, FLAGS_db_options, &parsed_options);
    if (!s.ok()) {
      fprintf(stderr, "Cannot parse provided db_options\n");
      return 1;
    }
    db_options = parsed_options;
  }
  rocksdb::DbDumpTool tool;
  if (!tool.Run(dump_options, db_options)) {
    return 1;
  }
  return 0;
}
#endif  // !(defined GFLAGS) || defined(ROCKSDB_LITE)

@@ -1,136 +1,62 @@
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#if !(defined GFLAGS) || defined(ROCKSDB_LITE)
#include <cstdio>
int main() {
#ifndef GFLAGS
  fprintf(stderr, "Please install gflags to run rocksdb tools\n");
#endif
#ifdef ROCKSDB_LITE
  fprintf(stderr, "DbUndumpTool is not supported in ROCKSDB_LITE\n");
#endif
  return 1;
}
#else
#include <gflags/gflags.h>
#include "rocksdb/convenience.h"
#include "rocksdb/db_dump_tool.h"
DEFINE_string(dump_location, "", "Path to the dump file that will be loaded");
DEFINE_string(db_path, "", "Path to the db that we will undump the file into");
DEFINE_bool(compact, false, "Compact the db after loading the dumped file");
DEFINE_string(db_options, "",
              "Options string used to open the database that will be loaded");
int main(int argc, char **argv) {
  GFLAGS::ParseCommandLineFlags(&argc, &argv, true);
  if (FLAGS_db_path == "" || FLAGS_dump_location == "") {
    fprintf(stderr, "Please set --db_path and --dump_location\n");
    return 1;
  }
  rocksdb::UndumpOptions undump_options;
  undump_options.db_path = FLAGS_db_path;
  undump_options.dump_location = FLAGS_dump_location;
  undump_options.compact_db = FLAGS_compact;
  rocksdb::Options db_options;
  if (FLAGS_db_options != "") {
    rocksdb::Options parsed_options;
    rocksdb::Status s = rocksdb::GetOptionsFromString(
        db_options, FLAGS_db_options, &parsed_options);
    if (!s.ok()) {
      fprintf(stderr, "Cannot parse provided db_options\n");
      return 1;
    }
    db_options = parsed_options;
  }
  rocksdb::DbUndumpTool tool;
  if (!tool.Run(undump_options, db_options)) {
    return 1;
  }
  return 0;
}
#endif  // !(defined GFLAGS) || defined(ROCKSDB_LITE)

@@ -2,6 +2,6 @@ TESTDIR=`mktemp -d /tmp/rocksdb-dump-test.XXXXX`
 DUMPFILE="tools/sample-dump.dmp"

 # Verify that the sample dump file is undumpable and then redumpable.
-./rocksdb_undump $DUMPFILE $TESTDIR/db
-./rocksdb_dump --anonymous $TESTDIR/db $TESTDIR/dump
+./rocksdb_undump --dump_location=$DUMPFILE --db_path=$TESTDIR/db
+./rocksdb_dump --anonymous --db_path=$TESTDIR/db --dump_location=$TESTDIR/dump
 cmp $DUMPFILE $TESTDIR/dump
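Both binaries are thin wrappers over the library entry points, so the same round trip can be driven programmatically. A hedged sketch (paths are placeholders; the Run signatures follow the tool sources above):

#include "rocksdb/db_dump_tool.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::DumpOptions dump_options;
  dump_options.db_path = "/tmp/example_db";         // placeholder path
  dump_options.dump_location = "/tmp/example.dmp";  // placeholder path
  dump_options.anonymous = true;  // strip hostname/path/creation time

  rocksdb::DbDumpTool dumper;
  if (!dumper.Run(dump_options, rocksdb::Options())) {
    return 1;
  }

  rocksdb::UndumpOptions undump_options;
  undump_options.db_path = "/tmp/example_db_copy";
  undump_options.dump_location = "/tmp/example.dmp";
  undump_options.compact_db = true;  // optional compaction after load

  rocksdb::DbUndumpTool undumper;
  return undumper.Run(undump_options, rocksdb::Options()) ? 0 : 1;
}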

@@ -540,6 +540,11 @@ class ShardedLRUCache : public Cache {
    }
    return usage;
  }
+ virtual size_t GetUsage(Handle* handle) const override {
+   return reinterpret_cast<LRUHandle*>(handle)->charge;
+ }
  virtual size_t GetPinnedUsage() const override {
    // We will not lock the cache when getting the usage from shards.
    int num_shards = 1 << num_shard_bits_;
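The new accessor reports the charge a single cached entry was inserted with, complementing the whole-cache GetUsage(). A hedged sketch against the Cache API as of this change (the no-op deleter and charge value are illustrative):

#include <cassert>
#include "rocksdb/cache.h"

static void NoopDeleter(const rocksdb::Slice& /*key*/, void* /*value*/) {}

int main() {
  std::shared_ptr<rocksdb::Cache> cache = rocksdb::NewLRUCache(1024 * 1024);
  rocksdb::Cache::Handle* h =
      cache->Insert("key", nullptr /* value */, 42 /* charge */, &NoopDeleter);
  assert(cache->GetUsage(h) == 42);  // per-handle charge
  assert(cache->GetUsage() >= 42);   // total usage includes this entry
  cache->Release(h);
  return 0;
}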

@@ -12,7 +12,6 @@
 #include <thread>
 #include "port/port.h"
 #include "port/sys_time.h"
-#include "port/port.h"
 #include "rocksdb/options.h"
 #include "util/arena.h"
@@ -283,6 +282,7 @@ void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) {
  env_options->set_fd_cloexec = options.is_fd_close_on_exec;
  env_options->bytes_per_sync = options.bytes_per_sync;
  env_options->rate_limiter = options.rate_limiter.get();
+ env_options->allow_fallocate = options.allow_fallocate;
}
}

@@ -352,6 +352,7 @@ class PosixMmapFile : public WritableFile {
  char* last_sync_;       // Where have we synced up to
  uint64_t file_offset_;  // Offset of base_ in file
#ifdef ROCKSDB_FALLOCATE_PRESENT
+ bool allow_fallocate_;  // If false, fallocate calls are bypassed
  bool fallocate_with_keep_size_;
#endif
@@ -393,7 +394,7 @@ class PosixMmapFile : public WritableFile {
    TEST_KILL_RANDOM(rocksdb_kill_odds);
    // we can't fallocate with FALLOC_FL_KEEP_SIZE here
-   {
+   if (allow_fallocate_) {
      IOSTATS_TIMER_GUARD(allocate_nanos);
      int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
      if (alloc_status != 0) {
@@ -453,6 +454,7 @@ class PosixMmapFile : public WritableFile {
        last_sync_(nullptr),
        file_offset_(0) {
#ifdef ROCKSDB_FALLOCATE_PRESENT
+   allow_fallocate_ = options.allow_fallocate;
    fallocate_with_keep_size_ = options.fallocate_with_keep_size;
#endif
    assert((page_size & (page_size - 1)) == 0);
@@ -575,8 +577,12 @@ class PosixMmapFile : public WritableFile {
#ifdef ROCKSDB_FALLOCATE_PRESENT
  virtual Status Allocate(off_t offset, off_t len) override {
    TEST_KILL_RANDOM(rocksdb_kill_odds);
-   int alloc_status = fallocate(
-       fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
+   int alloc_status = 0;
+   if (allow_fallocate_) {
+     alloc_status =
+         fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
+                   offset, len);
+   }
    if (alloc_status == 0) {
      return Status::OK();
    } else {
@@ -593,6 +599,7 @@ class PosixWritableFile : public WritableFile {
  int fd_;
  uint64_t filesize_;
#ifdef ROCKSDB_FALLOCATE_PRESENT
+ bool allow_fallocate_;
  bool fallocate_with_keep_size_;
#endif
@@ -600,6 +607,7 @@ class PosixWritableFile : public WritableFile {
  PosixWritableFile(const std::string& fname, int fd, const EnvOptions& options)
      : filename_(fname), fd_(fd), filesize_(0) {
#ifdef ROCKSDB_FALLOCATE_PRESENT
+   allow_fallocate_ = options.allow_fallocate;
    fallocate_with_keep_size_ = options.fallocate_with_keep_size;
#endif
    assert(!options.use_mmap_writes);
@@ -660,8 +668,10 @@ class PosixWritableFile : public WritableFile {
    // We ignore error since failure of this operation does not affect
    // correctness.
    IOSTATS_TIMER_GUARD(allocate_nanos);
-   fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
-             filesize_, block_size * last_allocated_block - filesize_);
+   if (allow_fallocate_) {
+     fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_,
+               block_size * last_allocated_block - filesize_);
+   }
#endif
  }
@@ -714,9 +724,12 @@ class PosixWritableFile : public WritableFile {
  virtual Status Allocate(off_t offset, off_t len) override {
    TEST_KILL_RANDOM(rocksdb_kill_odds);
    IOSTATS_TIMER_GUARD(allocate_nanos);
-   int alloc_status;
-   alloc_status = fallocate(
-       fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
+   int alloc_status = 0;
+   if (allow_fallocate_) {
+     alloc_status =
+         fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
+                   offset, len);
+   }
    if (alloc_status == 0) {
      return Status::OK();
    } else {
@@ -1146,7 +1159,7 @@ class PosixEnv : public Env {
    } else {
      int fd = fileno(f);
#ifdef ROCKSDB_FALLOCATE_PRESENT
-     fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 4 * 1024 * 1024);
+     fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 4 * 1024);
#endif
      SetFD_CLOEXEC(fd, nullptr);
      result->reset(new PosixLogger(f, &PosixEnv::gettid, this));
@@ -1609,10 +1622,11 @@ class PosixEnv : public Env {
};
-PosixEnv::PosixEnv() : checkedDiskForMmap_(false),
-    forceMmapOff(false),
-    page_size_(getpagesize()),
-    thread_pools_(Priority::TOTAL) {
+PosixEnv::PosixEnv()
+    : checkedDiskForMmap_(false),
+      forceMmapOff(false),
+      page_size_(getpagesize()),
+      thread_pools_(Priority::TOTAL) {
  PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr));
  for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) {
    thread_pools_[pool_id].SetThreadPriority(

@@ -240,6 +240,7 @@ DBOptions::DBOptions()
      allow_os_buffer(true),
      allow_mmap_reads(false),
      allow_mmap_writes(false),
+     allow_fallocate(true),
      is_fd_close_on_exec(true),
      skip_log_error_on_recovery(false),
      stats_dump_period_sec(600),
@@ -294,6 +295,7 @@ DBOptions::DBOptions(const Options& options)
      allow_os_buffer(options.allow_os_buffer),
      allow_mmap_reads(options.allow_mmap_reads),
      allow_mmap_writes(options.allow_mmap_writes),
+     allow_fallocate(options.allow_fallocate),
      is_fd_close_on_exec(options.is_fd_close_on_exec),
      skip_log_error_on_recovery(options.skip_log_error_on_recovery),
      stats_dump_period_sec(options.stats_dump_period_sec),
@@ -341,6 +343,7 @@ void DBOptions::Dump(Logger* log) const {
           keep_log_file_num);
  Header(log, " Options.allow_os_buffer: %d", allow_os_buffer);
  Header(log, " Options.allow_mmap_reads: %d", allow_mmap_reads);
+ Header(log, " Options.allow_fallocate: %d", allow_fallocate);
  Header(log, " Options.allow_mmap_writes: %d", allow_mmap_writes);
  Header(log, " Options.create_missing_column_families: %d",
         create_missing_column_families);
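allow_fallocate defaults to true, preserving the old behavior; setting it to false makes the posix file classes skip every fallocate() call, as the env_posix hunks above show. A minimal usage sketch (the db path is a placeholder):

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.allow_fallocate = false;  // bypass fallocate() on file allocation
  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/no_fallocate_db", &db);  // placeholder
  if (!s.ok()) {
    return 1;
  }
  delete db;
  return 0;
}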
