merge from master

main
Praveen Rao 9 years ago
commit eb24178553
  1. 2
      .travis.yml
  2. 9
      CMakeLists.txt
  3. 12
      HISTORY.md
  4. 13
      INSTALL.md
  5. 35
      Makefile
  6. 3
      build_tools/build_detect_platform
  7. 67
      build_tools/rocksdb-lego-determinator
  8. 17
      db/builder.cc
  9. 6
      db/builder.h
  10. 22
      db/compaction.cc
  11. 11
      db/compaction.h
  12. 16
      db/compaction_iterator.cc
  13. 2
      db/compaction_iterator.h
  14. 3
      db/compaction_iterator_test.cc
  15. 35
      db/compaction_job.cc
  16. 120
      db/compaction_job_test.cc
  17. 147
      db/compaction_picker_test.cc
  18. 55
      db/db_compaction_filter_test.cc
  19. 2
      db/db_compaction_test.cc
  20. 2
      db/db_dynamic_level_test.cc
  21. 34
      db/db_impl.cc
  22. 3
      db/db_impl.h
  23. 2
      db/db_inplace_update_test.cc
  24. 2
      db/db_log_iter_test.cc
  25. 2
      db/db_tailing_iter_test.cc
  26. 355
      db/db_test.cc
  27. 23
      db/db_test_util.cc
  28. 4
      db/db_test_util.h
  29. 2
      db/db_universal_compaction_test.cc
  30. 2
      db/db_wal_test.cc
  31. 2
      db/event_helpers.cc
  32. 26
      db/filename.cc
  33. 4
      db/filename.h
  34. 16
      db/flush_job.cc
  35. 2
      db/flush_job.h
  36. 56
      db/listener_test.cc
  37. 21
      db/memtable.cc
  38. 107
      db/merge_helper.cc
  39. 38
      db/merge_helper.h
  40. 229
      db/merge_helper_test.cc
  41. 7
      db/repair.cc
  42. 13
      db/table_properties_collector.h
  43. 12
      db/table_properties_collector_test.cc
  44. 124
      db/version_edit.cc
  45. 2
      db/version_edit.h
  46. 116
      db/version_edit_test.cc
  47. 1
      db/version_set.h
  48. 7
      examples/.gitignore
  49. 5
      examples/Makefile
  50. 84
      examples/compaction_filter_example.cc
  51. 182
      examples/rocksdb_option_file_example.ini
  52. 3
      include/rocksdb/cache.h
  53. 118
      include/rocksdb/compaction_filter.h
  54. 3
      include/rocksdb/convenience.h
  55. 45
      include/rocksdb/db_dump_tool.h
  56. 3
      include/rocksdb/env.h
  57. 12
      include/rocksdb/listener.h
  58. 12
      include/rocksdb/options.h
  59. 8
      include/rocksdb/perf_context.h
  60. 6
      include/rocksdb/statistics.h
  61. 2
      include/rocksdb/table.h
  62. 9
      include/rocksdb/table_properties.h
  63. 37
      include/rocksdb/utilities/transaction.h
  64. 2
      include/rocksdb/version.h
  65. 8
      java/crossbuild/build-linux-centos.sh
  66. 9
      port/win/env_win.cc
  67. 47
      port/win/port_win.cc
  68. 54
      port/win/port_win.h
  69. 3
      src.mk
  70. 5
      table/adaptive_table_factory.cc
  71. 2
      table/adaptive_table_factory.h
  72. 12
      table/block_based_filter_block.cc
  73. 15
      table/block_based_table_builder.cc
  74. 3
      table/block_based_table_builder.h
  75. 6
      table/block_based_table_factory.cc
  76. 2
      table/block_based_table_factory.h
  77. 8
      table/block_based_table_reader.cc
  78. 2
      table/cuckoo_table_factory.cc
  79. 2
      table/cuckoo_table_factory.h
  80. 9
      table/full_filter_block.cc
  81. 4
      table/get_context.cc
  82. 2
      table/mock_table.cc
  83. 2
      table/mock_table.h
  84. 10
      table/plain_table_builder.cc
  85. 10
      table/plain_table_builder.h
  86. 9
      table/plain_table_factory.cc
  87. 2
      table/plain_table_factory.h
  88. 12
      table/plain_table_reader.cc
  89. 7
      table/sst_file_writer.cc
  90. 3
      table/table_properties.cc
  91. 2
      table/table_reader_bench.cc
  92. 37
      table/table_test.cc
  93. 261
      tools/dump/db_dump_tool.cc
  94. 165
      tools/dump/rocksdb_dump.cc
  95. 158
      tools/dump/rocksdb_undump.cc
  96. 4
      tools/rocksdb_dump_test.sh
  97. 5
      util/cache.cc
  98. 2
      util/env.cc
  99. 34
      util/env_posix.cc
  100. 3
      util/options.cc
  101. Some files were not shown because too many files have changed in this diff Show More

@ -33,7 +33,7 @@ before_script:
# Lousy hack to disable use and testing of fallocate, which doesn't behave quite
# as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment.
script:
- if [[ "${TRAVIS_OS_NAME}" == 'linux' ]]; then OPT=-DTRAVIS CLANG_FORMAT_DIFF=/tmp/clang-format-diff.py make format; fi
- if [[ "${TRAVIS_OS_NAME}" == 'linux' ]]; then OPT=-DTRAVIS CLANG_FORMAT_DIFF=/tmp/clang-format-diff.py make format || true; fi
- OPT=-DTRAVIS V=1 make -j4 check && OPT=-DTRAVIS V=1 make clean jclean rocksdbjava jtest
notifications:

@ -13,8 +13,8 @@
# cd build
# 3. Run cmake to generate project files for Windows, add more options to enable required third-party libraries.
# See thirdparty.inc for more information.
# cmake -G "Visual Studio 12 Win64" .. <more options>
# 4. Then build the project in debug mode (you may want to add /m[:<N>] flag to run msbuild in <N> parallel threads)
# sample command: cmake -G "Visual Studio 12 Win64" -DGFLAGS=1 -DSNAPPY=1 -DJEMALLOC=1 ..
# 4. Then build the project in debug mode (you may want to add /m:<N> flag to run msbuild in <N> parallel threads)
# msbuild ALL_BUILD.vcxproj
# 5. And release mode (/m[:<N>] is also supported)
# msbuild ALL_BUILD.vcxproj /p:Configuration=Release
@ -66,13 +66,13 @@ endif()
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Oxt /Zp8 /Gm- /Gy /MD")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DEBUG")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /DEBUG")
add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64)
include_directories(${PROJECT_SOURCE_DIR})
include_directories(${PROJECT_SOURCE_DIR}/include)
include_directories(${PROJECT_SOURCE_DIR}/port)
include_directories(${PROJECT_SOURCE_DIR}/third-party/gtest-1.7.0/fused-src)
set(ROCKSDB_LIBS rocksdblib${ARTIFACT_SUFFIX})
@ -100,6 +100,7 @@ set(SOURCES
db/db_impl_experimental.cc
db/db_impl_readonly.cc
db/db_iter.cc
db/db_test_util.cc
db/event_helpers.cc
db/experimental.cc
db/filename.cc
@ -163,6 +164,7 @@ set(SOURCES
table/plain_table_reader.cc
table/table_properties.cc
table/two_level_iterator.cc
tools/dump/db_dump_tool.cc
util/arena.cc
util/auto_roll_logger.cc
util/bloom.cc
@ -174,7 +176,6 @@ set(SOURCES
util/crc32c.cc
util/db_info_dumper.cc
util/delete_scheduler_impl.cc
util/db_test_util.cc
util/dynamic_bloom.cc
util/env.cc
util/env_hdfs.cc

@ -1,19 +1,29 @@
# Rocksdb Change Log
## Unreleased
### Public API Changes
* CompactionFilter::Context includes information of Column Family ID
* The need-compaction hint given by TablePropertiesCollector::NeedCompact() will be persistent and recoverable after DB recovery. This introduces a breaking format change. If you use this experimental feature, including NewCompactOnDeletionCollectorFactory() in the new version, you may not be able to directly downgrade the DB back to version 4.0 or lower.
* TablePropertiesCollectorFactory::CreateTablePropertiesCollector() now takes an option Context, containing the information of column family ID for the file being written.
## 4.1.0 (10/8/2015)
### New Features
* Added single delete operation as a more efficient way to delete keys that have not been overwritten.
* Added experimental AddFile() to DB interface that allow users to add files created by SstFileWriter into an empty Database, see include/rocksdb/sst_file_writer.h and DB::AddFile() for more info.
* Added support for opening SST files with .ldb suffix which enables opening LevelDB databases.
* CompactionFilter now supports filtering of merge operands and merge results.
### Public API Changes
* Added SingleDelete() to the DB interface.
* Added AddFile() to DB interface.
* Added SstFileWriter class.
* CompactionFilter has a new method FilterMergeOperand() that RocksDB applies to every merge operand during compaction to decide whether to filter the operand.
* We removed CompactionFilterV2 interfaces from include/rocksdb/compaction_filter.h. The functionality was deprecated already in version 3.13.
## 4.0.0 (9/9/2015)
### New Features
* Added support for transactions. See include/rocksdb/utilities/transaction.h for more info.
* DB::GetProperty() now accept "rocksdb.aggregated-table-properties" and "rocksdb.aggregated-table-properties-at-levelN", in which case it returns aggregated table properties of the target column family, or the aggregated table properties of the specified level N if the "at-level" version is used.
* DB::GetProperty() now accepts "rocksdb.aggregated-table-properties" and "rocksdb.aggregated-table-properties-at-levelN", in which case it returns aggregated table properties of the target column family, or the aggregated table properties of the specified level N if the "at-level" version is used.
* Add compression option kZSTDNotFinalCompression for people to experiment ZSTD although its format is not finalized.
* We removed the need for LATEST_BACKUP file in BackupEngine. We still keep writing it when we create new backups (because of backward compatibility), but we don't read it anymore.

@ -1,19 +1,24 @@
## Compilation
**Important**: If you plan to run RocksDB in production, don't compile using default
`make` or `make all`. That will compile RocksDB in debug mode, which is much slower
than release mode.
RocksDB's library should be able to compile without any dependency installed,
although we recommend installing some compression libraries (see below).
We do depend on newer gcc/clang with C++11 support.
There are few options when compiling RocksDB:
* [recommended] `make static_lib` will compile librocksdb.a, RocksDB static library.
* [recommended] `make static_lib` will compile librocksdb.a, RocksDB static library. Compiles static library in release mode.
* `make shared_lib` will compile librocksdb.so, RocksDB shared library.
* `make shared_lib` will compile librocksdb.so, RocksDB shared library. Compiles shared library in release mode.
* `make check` will compile and run all the unit tests
* `make check` will compile and run all the unit tests. `make check` will compile RocksDB in debug mode.
* `make all` will compile our static library, and all our tools and unit tests. Our tools
depend on gflags. You will need to have gflags installed to run `make all`.
depend on gflags. You will need to have gflags installed to run `make all`. This will compile RocksDB in debug mode. Don't
use binaries compiled by `make all` in production.
* By default the binary we produce is optimized for the platform you're compiling on
(-march=native). If you want to build a portable binary, add 'PORTABLE=1' before

@ -76,6 +76,8 @@ endif
ifeq ($(DEBUG_LEVEL),0)
OPT += -DNDEBUG
DISABLE_WARNING_AS_ERROR=1
else
$(warning Warning: Compiling in debug mode. Don't use the resulting binary in production)
endif
#-----------------------------------------------
@ -322,10 +324,14 @@ TOOLS = \
BENCHMARKS = db_bench table_reader_bench cache_bench memtablerep_bench
# The library name is configurable since we are maintaining libraries of both
# debug/release mode.
# if user didn't config LIBNAME, set the default
ifeq ($(LIBNAME),)
# we should only run rocksdb in production with DEBUG_LEVEL 0
ifeq ($(DEBUG_LEVEL),0)
LIBNAME=librocksdb
else
LIBNAME=librocksdb_debug
endif
endif
LIBRARY = ${LIBNAME}.a
@ -602,6 +608,11 @@ unity.a: unity.o
$(AM_V_AR)rm -f $@
$(AM_V_at)$(AR) $(ARFLAGS) $@ unity.o
# try compiling db_test with unity
unity_test: db/db_test.o db/db_test_util.o $(TESTHARNESS) unity.a
$(AM_LINK)
./unity_test
rocksdb.h rocksdb.cc: build_tools/amalgamate.py Makefile $(LIB_SOURCES) unity.cc
build_tools/amalgamate.py -I. -i./include unity.cc -x include/rocksdb/c.h -H rocksdb.h -o rocksdb.cc
@ -698,34 +709,34 @@ crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)
slice_transform_test: util/slice_transform_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
db_test: db/db_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
db_test: db/db_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
db_log_iter_test: db/db_log_iter_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
db_log_iter_test: db/db_log_iter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
db_compaction_filter_test: db/db_compaction_filter_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
db_compaction_filter_test: db/db_compaction_filter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
db_compaction_test: db/db_compaction_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
db_compaction_test: db/db_compaction_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
db_dynamic_level_test: db/db_dynamic_level_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
db_dynamic_level_test: db/db_dynamic_level_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
db_inplace_update_test: db/db_inplace_update_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
db_inplace_update_test: db/db_inplace_update_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
db_tailing_iter_test: db/db_tailing_iter_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
db_tailing_iter_test: db/db_tailing_iter_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
db_iter_test: db/db_iter_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
db_universal_compaction_test: db/db_universal_compaction_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
db_universal_compaction_test: db/db_universal_compaction_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
db_wal_test: db/db_wal_test.o util/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
db_wal_test: db/db_wal_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
log_write_bench: util/log_write_bench.o $(LIBOBJECTS) $(TESTHARNESS)
@ -1037,7 +1048,7 @@ rocksdbjavastatic: $(java_libobjects) libz.a libbz2.a libsnappy.a liblz4.a
$(CXX) $(CXXFLAGS) -I./java/. $(JAVA_INCLUDE) -shared -fPIC \
-o ./java/target/$(ROCKSDBJNILIB) $(JNI_NATIVE_SOURCES) \
$(java_libobjects) $(COVERAGEFLAGS) \
libz.a libbz2.a libsnappy.a liblz4.a $(LDFLAGS)
libz.a libbz2.a libsnappy.a liblz4.a $(JAVA_STATIC_LDFLAGS)
cd java/target;strip -S -x $(ROCKSDBJNILIB)
cd java;jar -cf target/$(ROCKSDB_JAR) HISTORY*.md
cd java/target;jar -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB)

@ -8,6 +8,7 @@
# CXX C++ Compiler path
# PLATFORM_LDFLAGS Linker flags
# JAVA_LDFLAGS Linker flags for RocksDBJava
# JAVA_STATIC_LDFLAGS Linker flags for RocksDBJava static build
# PLATFORM_SHARED_EXT Extension for shared libraries
# PLATFORM_SHARED_LDFLAGS Flags for building shared library
# PLATFORM_SHARED_CFLAGS Flags for compiling objects for shared library
@ -181,6 +182,7 @@ esac
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}"
JAVA_LDFLAGS="$PLATFORM_LDFLAGS"
JAVA_STATIC_LDFLAGS="$PLATFORM_LDFLAGS"
if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then
# Cross-compiling; do not try any compilation tests.
@ -374,6 +376,7 @@ echo "CXX=$CXX" >> "$OUTPUT"
echo "PLATFORM=$PLATFORM" >> "$OUTPUT"
echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> "$OUTPUT"
echo "JAVA_LDFLAGS=$JAVA_LDFLAGS" >> "$OUTPUT"
echo "JAVA_STATIC_LDFLAGS=$JAVA_STATIC_LDFLAGS" >> "$OUTPUT"
echo "VALGRIND_VER=$VALGRIND_VER" >> "$OUTPUT"
echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> "$OUTPUT"
echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> "$OUTPUT"

@ -36,7 +36,6 @@ if [ ! -z $ONCALL ]; then
{
'type':'task',
'triggers':[ 'fail' ],
'assignee':'$ONCALL',
'priority':0,
'subscribers':[ '$SUBSCRIBER' ],
'tags':[ 'rocksdb', 'ci' ],
@ -69,6 +68,7 @@ CLANG="USE_CLANG=1"
LITE="OPT=-DROCKSDB_LITE"
TSAN="COMPILE_WITH_TSAN=1"
DISABLE_JEMALLOC="DISABLE_JEMALLOC=1"
PARSER="'parser':'egrep \'Failure|^#|Abort\''"
#
# A mechanism to disable tests temporarily
@ -101,13 +101,15 @@ PARALLEL_UNIT_TEST_COMMANDS="[
{
'name':'Build and test RocksDB debug version',
'shell':'$DEBUG make -j$(nproc) all && $SHM make check > /dev/null 2>&1 || cat t/log-*',
'user':'root'
'user':'root',
$PARSER
},
$CLEANUP_ENV,
{
'name':'Build and test RocksDB debug version under gcc-4.8.1',
'shell':'$GCC_481 $DEBUG make -j$(nproc) all && $SHM make check > /dev/null 2>&1 || cat t/log-*',
'user':'root'
'user':'root',
$PARSER
},
],
$REPORT
@ -126,7 +128,8 @@ UNIT_TEST_COMMANDS="[
{
'name':'Build and test RocksDB debug version',
'shell':'$SHM $DEBUG make J=1 check',
'user':'root'
'user':'root',
$PARSER
},
],
$REPORT
@ -145,7 +148,8 @@ UNIT_TEST_COMMANDS_481="[
{
'name':'Build and test RocksDB debug version',
'shell':'$SHM $GCC_481 $DEBUG make J=1 check',
'user':'root'
'user':'root',
$PARSER
},
],
$REPORT
@ -164,7 +168,8 @@ CLANG_UNIT_TEST_COMMANDS="[
{
'name':'Build and test RocksDB debug',
'shell':'$CLANG $SHM $DEBUG make J=1 check',
'user':'root'
'user':'root',
$PARSER
},
],
$REPORT
@ -183,7 +188,8 @@ CLANG_ANALYZE_COMMANDS="[
{
'name':'RocksDB build and analyze',
'shell':'$CLANG $SHM $DEBUG make J=1 analyze',
'user':'root'
'user':'root',
$PARSER
},
],
$REPORT
@ -202,7 +208,8 @@ CODE_COV_COMMANDS="[
{
'name':'Build, test and collect code coverage info',
'shell':'$SHM $DEBUG make J=1 coverage',
'user':'root'
'user':'root',
$PARSER
},
],
$REPORT
@ -220,8 +227,9 @@ UNITY_COMMANDS="[
$CLEANUP_ENV,
{
'name':'Build, test unity test',
'shell':'$SHM $DEBUG V=1 make J=1 unity',
'user':'root'
'shell':'$SHM $DEBUG V=1 make J=1 unity_test',
'user':'root',
$PARSER
},
],
$REPORT
@ -240,7 +248,8 @@ LITE_BUILD_COMMANDS="[
{
'name':'Build RocksDB debug version',
'shell':'$LITE $DEBUG make J=1 static_lib',
'user':'root'
'user':'root',
$PARSER
},
],
$REPORT
@ -260,21 +269,21 @@ STRESS_CRASH_TEST_COMMANDS="[
{
'name':'Build and run RocksDB debug stress tests',
'shell':'$SHM $DEBUG make J=1 db_stress',
'user':'root'
'user':'root',
$PARSER
},
{
'name':'Build and run RocksDB debug crash tests',
'timeout': 86400,
'shell':'$SHM $DEBUG make J=1 crash_test',
'user':'root'
'user':'root',
$PARSER
}
],
$REPORT
}
]"
STRESS_CRASH_TEST_COMMANDS=$DISABLE_COMMANDS
#
# RocksDB test under address sanitizer
#
@ -287,7 +296,8 @@ ASAN_TEST_COMMANDS="[
{
'name':'Test RocksDB debug under ASAN',
'shell':'set -o pipefail && $SHM $ASAN $DEBUG make J=1 asan_check |& /usr/facebook/ops/scripts/asan_symbolize.py -d',
'user':'root'
'user':'root',
$PARSER
}
],
$REPORT
@ -308,15 +318,14 @@ ASAN_CRASH_TEST_COMMANDS="[
'name':'Build and run RocksDB debug asan_crash_test',
'timeout': 86400,
'shell':'$SHM $DEBUG make J=1 asan_crash_test',
'user':'root'
'user':'root',
$PARSER
},
],
$REPORT
}
]"
ASAN_CRASH_TEST_COMMANDS=$DISABLE_COMMANDS
#
# RocksDB unit test under valgrind
#
@ -329,7 +338,8 @@ VALGRIND_TEST_COMMANDS="[
{
'name':'Run RocksDB debug unit tests',
'shell':'$DISABLE_JEMALLOC $SHM $DEBUG make valgrind_check',
'user':'root'
'user':'root',
$PARSER
},
],
$REPORT
@ -348,7 +358,8 @@ TSAN_UNIT_TEST_COMMANDS="[
{
'name':'Run RocksDB debug unit test',
'shell':'set -o pipefail && $SHM $DEBUG $TSAN make J=1 check',
'user':'root'
'user':'root',
$PARSER
},
],
$REPORT
@ -369,15 +380,14 @@ TSAN_CRASH_TEST_COMMANDS="[
'name':'Compile and run',
'timeout': 86400,
'shell':'set -o pipefail && $SHM $DEBUG $TSAN make J=1 crash_test',
'user':'root'
'user':'root',
$PARSER
},
],
$REPORT
}
]"
TSAN_CRASH_TEST_COMMANDS=$DISABLE_COMMANDS
#
# RocksDB format compatible
#
@ -432,7 +442,8 @@ FORMAT_COMPATIBLE_COMMANDS="[
{
'name':'Run RocksDB debug unit test',
'shell':'build_tools/rocksdb-lego-determinator run_format_compatible',
'user':'root'
'user':'root',
$PARSER
},
],
$REPORT
@ -464,7 +475,8 @@ NO_COMPRESSION_COMMANDS="[
{
'name':'Run RocksDB debug unit test',
'shell':'build_tools/rocksdb-lego-determinator run_no_compression',
'user':'root'
'user':'root',
$PARSER
},
],
$REPORT
@ -520,7 +532,8 @@ REGRESSION_COMMANDS="[
{
'name':'Make and run script',
'shell':'build_tools/rocksdb-lego-determinator run_regression',
'user':'root'
'user':'root',
$PARSER
},
],
$REPORT

@ -40,13 +40,14 @@ TableBuilder* NewTableBuilder(
const InternalKeyComparator& internal_comparator,
const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
int_tbl_prop_collector_factories,
WritableFileWriter* file, const CompressionType compression_type,
uint32_t column_family_id, WritableFileWriter* file,
const CompressionType compression_type,
const CompressionOptions& compression_opts, const bool skip_filters) {
return ioptions.table_factory->NewTableBuilder(
TableBuilderOptions(ioptions, internal_comparator,
int_tbl_prop_collector_factories, compression_type,
compression_opts, skip_filters),
file);
column_family_id, file);
}
Status BuildTable(
@ -55,7 +56,8 @@ Status BuildTable(
FileMetaData* meta, const InternalKeyComparator& internal_comparator,
const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
int_tbl_prop_collector_factories,
std::vector<SequenceNumber> snapshots, const CompressionType compression,
uint32_t column_family_id, std::vector<SequenceNumber> snapshots,
const CompressionType compression,
const CompressionOptions& compression_opts, bool paranoid_file_checks,
InternalStats* internal_stats, const Env::IOPriority io_priority,
TableProperties* table_properties) {
@ -82,13 +84,14 @@ Status BuildTable(
builder = NewTableBuilder(
ioptions, internal_comparator, int_tbl_prop_collector_factories,
file_writer.get(), compression, compression_opts);
column_family_id, file_writer.get(), compression, compression_opts);
}
MergeHelper merge(internal_comparator.user_comparator(),
ioptions.merge_operator, ioptions.info_log,
MergeHelper merge(env, internal_comparator.user_comparator(),
ioptions.merge_operator, nullptr, ioptions.info_log,
ioptions.min_partial_merge_operands,
true /* internal key corruption is not ok */);
true /* internal key corruption is not ok */,
snapshots.empty() ? 0 : snapshots.back());
CompactionIterator c_iter(iter, internal_comparator.user_comparator(),
&merge, kMaxSequenceNumber, &snapshots, env,

@ -37,7 +37,8 @@ TableBuilder* NewTableBuilder(
const InternalKeyComparator& internal_comparator,
const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
int_tbl_prop_collector_factories,
WritableFileWriter* file, const CompressionType compression_type,
uint32_t column_family_id, WritableFileWriter* file,
const CompressionType compression_type,
const CompressionOptions& compression_opts,
const bool skip_filters = false);
@ -52,7 +53,8 @@ extern Status BuildTable(
FileMetaData* meta, const InternalKeyComparator& internal_comparator,
const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
int_tbl_prop_collector_factories,
std::vector<SequenceNumber> snapshots, const CompressionType compression,
uint32_t column_family_id, std::vector<SequenceNumber> snapshots,
const CompressionType compression,
const CompressionOptions& compression_opts, bool paranoid_file_checks,
InternalStats* internal_stats,
const Env::IOPriority io_priority = Env::IO_HIGH,

@ -50,8 +50,26 @@ void Compaction::GetBoundaryKeys(
if (inputs[i].files.empty()) {
continue;
}
if (inputs[i].level == 0) {
// we need to consider all files on level 0
for (const auto* f : inputs[i].files) {
const Slice& start_user_key = f->smallest.user_key();
if (!initialized ||
ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
*smallest_user_key = start_user_key;
}
const Slice& end_user_key = f->largest.user_key();
if (!initialized ||
ucmp->Compare(end_user_key, *largest_user_key) > 0) {
*largest_user_key = end_user_key;
}
initialized = true;
}
} else {
// we only need to consider the first and last file
const Slice& start_user_key = inputs[i].files[0]->smallest.user_key();
if (!initialized || ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
if (!initialized ||
ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
*smallest_user_key = start_user_key;
}
const Slice& end_user_key = inputs[i].files.back()->largest.user_key();
@ -61,6 +79,7 @@ void Compaction::GetBoundaryKeys(
initialized = true;
}
}
}
// helper function to determine if compaction is creating files at the
// bottommost level
@ -420,6 +439,7 @@ std::unique_ptr<CompactionFilter> Compaction::CreateCompactionFilter() const {
CompactionFilter::Context context;
context.is_full_compaction = is_full_compaction_;
context.is_manual_compaction = is_manual_compaction_;
context.column_family_id = cfd_->GetID();
return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter(
context);
}

@ -210,6 +210,14 @@ class Compaction {
int output_level, VersionStorageInfo* vstorage,
const std::vector<CompactionInputFiles>& inputs);
TablePropertiesCollection GetOutputTableProperties() const {
return output_table_properties_;
}
void SetOutputTableProperties(TablePropertiesCollection tp) {
output_table_properties_ = std::move(tp);
}
private:
// mark (or clear) all files that are being compacted
void MarkFilesBeingCompacted(bool mark_as_compacted);
@ -273,6 +281,9 @@ class Compaction {
// Does input compression match the output compression?
bool InputCompressionMatchesOutput() const;
// table properties of output files
TablePropertiesCollection output_table_properties_;
};
// Utility function

@ -12,16 +12,14 @@ namespace rocksdb {
CompactionIterator::CompactionIterator(
Iterator* input, const Comparator* cmp, MergeHelper* merge_helper,
SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
Env* env, bool expect_valid_internal_key, Statistics* stats,
Compaction* compaction, const CompactionFilter* compaction_filter,
LogBuffer* log_buffer)
Env* env, bool expect_valid_internal_key, Compaction* compaction,
const CompactionFilter* compaction_filter, LogBuffer* log_buffer)
: input_(input),
cmp_(cmp),
merge_helper_(merge_helper),
snapshots_(snapshots),
env_(env),
expect_valid_internal_key_(expect_valid_internal_key),
stats_(stats),
compaction_(compaction),
compaction_filter_(compaction_filter),
log_buffer_(log_buffer),
@ -277,10 +275,10 @@ void CompactionIterator::NextFromInput() {
// have hit (A)
// We encapsulate the merge related state machine in a different
// object to minimize change to the existing flow.
merge_helper_->MergeUntil(input_, prev_snapshot, bottommost_level_,
stats_, env_);
merge_helper_->MergeUntil(input_, prev_snapshot, bottommost_level_);
merge_out_iter_.SeekToFirst();
if (merge_out_iter_.Valid()) {
// NOTE: key, value, and ikey_ refer to old entries.
// These will be correctly set below.
key_ = merge_out_iter_.key();
@ -295,6 +293,12 @@ void CompactionIterator::NextFromInput() {
key_ = current_key_.GetKey();
ikey_.user_key = current_key_.GetUserKey();
valid_ = true;
} else {
// all merge operands were filtered out. reset the user key, since the
// batch consumed by the merge operator should not shadow any keys
// coming after the merges
has_current_user_key_ = false;
}
} else {
valid_ = true;
}

@ -41,7 +41,6 @@ class CompactionIterator {
MergeHelper* merge_helper, SequenceNumber last_sequence,
std::vector<SequenceNumber>* snapshots, Env* env,
bool expect_valid_internal_key,
Statistics* stats = nullptr,
Compaction* compaction = nullptr,
const CompactionFilter* compaction_filter = nullptr,
LogBuffer* log_buffer = nullptr);
@ -91,7 +90,6 @@ class CompactionIterator {
const std::vector<SequenceNumber>* snapshots_;
Env* env_;
bool expect_valid_internal_key_;
Statistics* stats_;
Compaction* compaction_;
const CompactionFilter* compaction_filter_;
LogBuffer* log_buffer_;

@ -16,7 +16,8 @@ class CompactionIteratorTest : public testing::Test {
void InitIterator(const std::vector<std::string>& ks,
const std::vector<std::string>& vs,
SequenceNumber last_sequence) {
merge_helper_.reset(new MergeHelper(cmp_, nullptr, nullptr, 0U, false));
merge_helper_.reset(new MergeHelper(Env::Default(), cmp_, nullptr, nullptr,
nullptr, 0U, false, 0));
iter_.reset(new test::VectorIterator(ks, vs));
iter_->SeekToFirst();
c_iter_.reset(new CompactionIterator(iter_.get(), cmp_, merge_helper_.get(),

@ -77,6 +77,7 @@ struct CompactionJob::SubcompactionState {
struct Output {
FileMetaData meta;
bool finished;
std::shared_ptr<const TableProperties> table_properties;
};
// State kept for output being generated
@ -487,6 +488,16 @@ Status CompactionJob::Run() {
}
}
TablePropertiesCollection tp;
for (const auto& state : compact_->sub_compact_states) {
for (const auto& output : state.outputs) {
auto fn = TableFileName(db_options_.db_paths, output.meta.fd.GetNumber(),
output.meta.fd.GetPathId());
tp[fn] = output.table_properties;
}
}
compact_->compaction->SetOutputTableProperties(std::move(tp));
// Finish up all book-keeping to unify the subcompaction results
AggregateStatistics();
UpdateCompactionStats();
@ -597,10 +608,6 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
}
ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
MergeHelper merge(cfd->user_comparator(), cfd->ioptions()->merge_operator,
db_options_.info_log.get(),
cfd->ioptions()->min_partial_merge_operands,
false /* internal key corruption is expected */);
auto compaction_filter = cfd->ioptions()->compaction_filter;
std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
if (compaction_filter == nullptr) {
@ -608,6 +615,13 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
sub_compact->compaction->CreateCompactionFilter();
compaction_filter = compaction_filter_from_factory.get();
}
MergeHelper merge(
env_, cfd->user_comparator(), cfd->ioptions()->merge_operator,
compaction_filter, db_options_.info_log.get(),
cfd->ioptions()->min_partial_merge_operands,
false /* internal key corruption is expected */,
existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
compact_->compaction->level(), db_options_.statistics.get());
TEST_SYNC_POINT("CompactionJob::Run():Inprogress");
@ -624,8 +638,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
Status status;
sub_compact->c_iter.reset(new CompactionIterator(
input.get(), cfd->user_comparator(), &merge, versions_->LastSequence(),
&existing_snapshots_, env_, false, db_options_.statistics.get(),
sub_compact->compaction, compaction_filter));
&existing_snapshots_, env_, false, sub_compact->compaction,
compaction_filter));
auto c_iter = sub_compact->c_iter.get();
c_iter->SeekToFirst();
const auto& c_iter_stats = c_iter->iter_stats();
@ -811,7 +825,10 @@ Status CompactionJob::FinishCompactionOutputFile(
delete iter;
if (s.ok()) {
TableFileCreationInfo info(sub_compact->builder->GetTableProperties());
auto tp = sub_compact->builder->GetTableProperties();
sub_compact->current_output()->table_properties =
std::make_shared<TableProperties>(tp);
TableFileCreationInfo info(std::move(tp));
info.db_name = dbname_;
info.cf_name = cfd->GetName();
info.file_path =
@ -924,8 +941,8 @@ Status CompactionJob::OpenCompactionOutputFile(
cfd->ioptions()->optimize_filters_for_hits && bottommost_level_;
sub_compact->builder.reset(NewTableBuilder(
*cfd->ioptions(), cfd->internal_comparator(),
cfd->int_tbl_prop_collector_factories(), sub_compact->outfile.get(),
sub_compact->compaction->output_compression(),
cfd->int_tbl_prop_collector_factories(), cfd->GetID(),
sub_compact->outfile.get(), sub_compact->compaction->output_compression(),
cfd->ioptions()->compression_opts, skip_filters));
LogFlush(db_options_.info_log);
return s;

@ -182,7 +182,7 @@ class CompactionJobTest : public testing::Test {
return expected_results;
}
void NewDB(std::shared_ptr<MergeOperator> merge_operator = nullptr) {
void NewDB() {
VersionEdit new_db;
new_db.SetLogNumber(0);
new_db.SetNextFile(2);
@ -207,7 +207,8 @@ class CompactionJobTest : public testing::Test {
std::vector<ColumnFamilyDescriptor> column_families;
cf_options_.table_factory = mock_table_factory_;
cf_options_.merge_operator = merge_operator;
cf_options_.merge_operator = merge_op_;
cf_options_.compaction_filter = compaction_filter_.get();
column_families.emplace_back(kDefaultColumnFamilyName, cf_options_);
EXPECT_OK(versions_->Recover(column_families, false));
@ -258,11 +259,17 @@ class CompactionJobTest : public testing::Test {
&mutex_));
mutex_.Unlock();
if (expected_results.size() == 0) {
ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U);
ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files);
ASSERT_EQ(compaction_job_stats_.num_output_files, 0U);
} else {
ASSERT_GE(compaction_job_stats_.elapsed_micros, 0U);
ASSERT_EQ(compaction_job_stats_.num_input_files, num_input_files);
ASSERT_EQ(compaction_job_stats_.num_output_files, 1U);
mock_table_factory_->AssertLatestFile(expected_results);
}
}
Env* env_;
std::string dbname_;
@ -279,6 +286,8 @@ class CompactionJobTest : public testing::Test {
std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
CompactionJobStats compaction_job_stats_;
ColumnFamilyData* cfd_;
std::unique_ptr<CompactionFilter> compaction_filter_;
std::shared_ptr<MergeOperator> merge_op_;
};
TEST_F(CompactionJobTest, Simple) {
@ -372,8 +381,8 @@ TEST_F(CompactionJobTest, SimpleNonLastLevel) {
}
TEST_F(CompactionJobTest, SimpleMerge) {
auto merge_op = MergeOperators::CreateStringAppendOperator();
NewDB(merge_op);
merge_op_ = MergeOperators::CreateStringAppendOperator();
NewDB();
auto file1 = mock::MakeMockFile({
{KeyStr("a", 5U, kTypeMerge), "5"},
@ -396,8 +405,8 @@ TEST_F(CompactionJobTest, SimpleMerge) {
}
TEST_F(CompactionJobTest, NonAssocMerge) {
auto merge_op = MergeOperators::CreateStringAppendTESTOperator();
NewDB(merge_op);
merge_op_ = MergeOperators::CreateStringAppendTESTOperator();
NewDB();
auto file1 = mock::MakeMockFile({
{KeyStr("a", 5U, kTypeMerge), "5"},
@ -420,6 +429,105 @@ TEST_F(CompactionJobTest, NonAssocMerge) {
RunCompaction({files}, expected_results);
}
// Filters merge operands with value 10.
TEST_F(CompactionJobTest, MergeOperandFilter) {
merge_op_ = MergeOperators::CreateUInt64AddOperator();
compaction_filter_.reset(new test::FilterNumber(10U));
NewDB();
auto file1 = mock::MakeMockFile(
{{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)},
{KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered
{KeyStr("a", 3U, kTypeMerge), test::EncodeInt(3U)}});
AddMockFile(file1);
auto file2 = mock::MakeMockFile({
{KeyStr("b", 2U, kTypeMerge), test::EncodeInt(2U)},
{KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)} // Filtered
});
AddMockFile(file2);
auto expected_results =
mock::MakeMockFile({{KeyStr("a", 0U, kTypeValue), test::EncodeInt(8U)},
{KeyStr("b", 2U, kTypeMerge), test::EncodeInt(2U)}});
SetLastSequence(5U);
auto files = cfd_->current()->storage_info()->LevelFiles(0);
RunCompaction({files}, expected_results);
}
TEST_F(CompactionJobTest, FilterSomeMergeOperands) {
merge_op_ = MergeOperators::CreateUInt64AddOperator();
compaction_filter_.reset(new test::FilterNumber(10U));
NewDB();
auto file1 = mock::MakeMockFile(
{{KeyStr("a", 5U, kTypeMerge), test::EncodeInt(5U)},
{KeyStr("a", 4U, kTypeMerge), test::EncodeInt(10U)}, // Filtered
{KeyStr("a", 3U, kTypeValue), test::EncodeInt(5U)},
{KeyStr("d", 8U, kTypeMerge), test::EncodeInt(10U)}});
AddMockFile(file1);
auto file2 =
mock::MakeMockFile({{KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)},
{KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)},
{KeyStr("c", 2U, kTypeMerge), test::EncodeInt(3U)},
{KeyStr("c", 1U, kTypeValue), test::EncodeInt(7U)},
{KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}});
AddMockFile(file2);
auto file3 =
mock::MakeMockFile({{KeyStr("a", 1U, kTypeMerge), test::EncodeInt(3U)}});
AddMockFile(file3, 2);
auto expected_results = mock::MakeMockFile({
{KeyStr("a", 5U, kTypeValue), test::EncodeInt(10U)},
{KeyStr("c", 2U, kTypeValue), test::EncodeInt(10U)},
{KeyStr("d", 1U, kTypeValue), test::EncodeInt(6U)}
// b does not appear because the operands are filtered
});
SetLastSequence(5U);
auto files = cfd_->current()->storage_info()->LevelFiles(0);
RunCompaction({files}, expected_results);
}
// Test where all operands/merge results are filtered out.
TEST_F(CompactionJobTest, FilterAllMergeOperands) {
merge_op_ = MergeOperators::CreateUInt64AddOperator();
compaction_filter_.reset(new test::FilterNumber(10U));
NewDB();
auto file1 =
mock::MakeMockFile({{KeyStr("a", 11U, kTypeMerge), test::EncodeInt(10U)},
{KeyStr("a", 10U, kTypeMerge), test::EncodeInt(10U)},
{KeyStr("a", 9U, kTypeMerge), test::EncodeInt(10U)}});
AddMockFile(file1);
auto file2 =
mock::MakeMockFile({{KeyStr("b", 8U, kTypeMerge), test::EncodeInt(10U)},
{KeyStr("b", 7U, kTypeMerge), test::EncodeInt(10U)},
{KeyStr("b", 6U, kTypeMerge), test::EncodeInt(10U)},
{KeyStr("b", 5U, kTypeMerge), test::EncodeInt(10U)},
{KeyStr("b", 4U, kTypeMerge), test::EncodeInt(10U)},
{KeyStr("b", 3U, kTypeMerge), test::EncodeInt(10U)},
{KeyStr("b", 2U, kTypeMerge), test::EncodeInt(10U)},
{KeyStr("c", 2U, kTypeMerge), test::EncodeInt(10U)},
{KeyStr("c", 1U, kTypeMerge), test::EncodeInt(10U)}});
AddMockFile(file2);
auto file3 =
mock::MakeMockFile({{KeyStr("a", 2U, kTypeMerge), test::EncodeInt(10U)},
{KeyStr("b", 1U, kTypeMerge), test::EncodeInt(10U)}});
AddMockFile(file3, 2);
SetLastSequence(11U);
auto files = cfd_->current()->storage_info()->LevelFiles(0);
stl_wrappers::KVMap empty_map;
RunCompaction({files}, empty_map);
}
TEST_F(CompactionJobTest, SimpleSingleDelete) {
NewDB();

@ -7,6 +7,8 @@
#include "db/compaction_picker.h"
#include <limits>
#include <string>
#include <utility>
#include "util/logging.h"
#include "util/string_util.h"
#include "util/testharness.h"
@ -36,6 +38,8 @@ class CompactionPickerTest : public testing::Test {
CompactionOptionsFIFO fifo_options_;
std::unique_ptr<VersionStorageInfo> vstorage_;
std::vector<std::unique_ptr<FileMetaData>> files_;
// does not own FileMetaData
std::unordered_map<uint32_t, std::pair<FileMetaData*, int>> file_map_;
// input files to compaction process.
std::vector<CompactionInputFiles> input_files_;
int compaction_level_start_;
@ -70,12 +74,7 @@ class CompactionPickerTest : public testing::Test {
void DeleteVersionStorage() {
vstorage_.reset();
files_.clear();
for (uint32_t i = 0; i < input_files_.size(); ++i) {
for (uint32_t j = 0; j < input_files_[i].files.size(); ++j) {
delete input_files_[i].files[j];
}
input_files_[i].files.clear();
}
file_map_.clear();
input_files_.clear();
}
@ -94,9 +93,10 @@ class CompactionPickerTest : public testing::Test {
f->refs = 0;
vstorage_->AddFile(level, f);
files_.emplace_back(f);
file_map_.insert({file_number, {f, level}});
}
void setCompactionInputFilesLevels(int level_count, int start_level) {
void SetCompactionInputFilesLevels(int level_count, int start_level) {
input_files_.resize(level_count);
for (int i = 0; i < level_count; ++i) {
input_files_[i].level = start_level + i;
@ -104,21 +104,13 @@ class CompactionPickerTest : public testing::Test {
compaction_level_start_ = start_level;
}
void AddToCompactionFiles(int level, uint32_t file_number,
const char* smallest, const char* largest,
uint64_t file_size = 0, uint32_t path_id = 0,
SequenceNumber smallest_seq = 100,
SequenceNumber largest_seq = 100) {
void AddToCompactionFiles(uint32_t file_number) {
auto iter = file_map_.find(file_number);
assert(iter != file_map_.end());
int level = iter->second.second;
assert(level < vstorage_->num_levels());
FileMetaData* f = new FileMetaData;
f->fd = FileDescriptor(file_number, path_id, file_size);
f->smallest = InternalKey(smallest, smallest_seq, kTypeValue);
f->largest = InternalKey(largest, largest_seq, kTypeValue);
f->smallest_seqno = smallest_seq;
f->largest_seqno = largest_seq;
f->compensated_file_size = file_size;
f->refs = 0;
input_files_[level - compaction_level_start_].files.emplace_back(f);
input_files_[level - compaction_level_start_].files.emplace_back(
iter->second.first);
}
void UpdateVersionStorageInfo() {
@ -676,25 +668,24 @@ TEST_F(CompactionPickerTest, EstimateCompactionBytesNeededDynamicLevel) {
TEST_F(CompactionPickerTest, IsBottommostLevelTest) {
// case 1: Higher levels are empty
NewVersionStorage(6, kCompactionStyleLevel);
Add(0, 1U, "a", "c");
Add(0, 2U, "y", "z");
Add(0, 1U, "a", "m");
Add(0, 2U, "c", "z");
Add(1, 3U, "d", "e");
Add(1, 4U, "l", "p");
Add(2, 5U, "g", "i");
Add(2, 6U, "x", "z");
UpdateVersionStorageInfo();
setCompactionInputFilesLevels(2, 1);
AddToCompactionFiles(1, 3U, "d", "e");
AddToCompactionFiles(2, 5U, "g", "i");
SetCompactionInputFilesLevels(2, 1);
AddToCompactionFiles(3U);
AddToCompactionFiles(5U);
bool result =
Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
ASSERT_TRUE(result);
// case 2: Higher levels have no overlap
DeleteVersionStorage();
NewVersionStorage(6, kCompactionStyleLevel);
Add(0, 1U, "a", "c");
Add(0, 2U, "y", "z");
Add(0, 1U, "a", "m");
Add(0, 2U, "c", "z");
Add(1, 3U, "d", "e");
Add(1, 4U, "l", "p");
Add(2, 5U, "g", "i");
@ -704,17 +695,16 @@ TEST_F(CompactionPickerTest, IsBottommostLevelTest) {
Add(4, 9U, "a", "b");
Add(5, 10U, "c", "cc");
UpdateVersionStorageInfo();
setCompactionInputFilesLevels(2, 1);
AddToCompactionFiles(1, 3U, "d", "e");
AddToCompactionFiles(2, 5U, "g", "i");
SetCompactionInputFilesLevels(2, 1);
AddToCompactionFiles(3U);
AddToCompactionFiles(5U);
result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
ASSERT_TRUE(result);
// case 3.1: Higher levels (level 3) have overlap
DeleteVersionStorage();
NewVersionStorage(6, kCompactionStyleLevel);
Add(0, 1U, "a", "c");
Add(0, 2U, "y", "z");
Add(0, 1U, "a", "m");
Add(0, 2U, "c", "z");
Add(1, 3U, "d", "e");
Add(1, 4U, "l", "p");
Add(2, 5U, "g", "i");
@ -724,17 +714,17 @@ TEST_F(CompactionPickerTest, IsBottommostLevelTest) {
Add(4, 9U, "a", "b");
Add(5, 10U, "c", "cc");
UpdateVersionStorageInfo();
setCompactionInputFilesLevels(2, 1);
AddToCompactionFiles(1, 3U, "d", "e");
AddToCompactionFiles(2, 5U, "g", "i");
SetCompactionInputFilesLevels(2, 1);
AddToCompactionFiles(3U);
AddToCompactionFiles(5U);
result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
ASSERT_FALSE(result);
// case 3.1: Higher levels (level 5) have overlap
// case 3.2: Higher levels (level 5) have overlap
DeleteVersionStorage();
NewVersionStorage(6, kCompactionStyleLevel);
Add(0, 1U, "a", "c");
Add(0, 2U, "y", "z");
Add(0, 1U, "a", "m");
Add(0, 2U, "c", "z");
Add(1, 3U, "d", "e");
Add(1, 4U, "l", "p");
Add(2, 5U, "g", "i");
@ -747,17 +737,17 @@ TEST_F(CompactionPickerTest, IsBottommostLevelTest) {
Add(5, 12U, "y", "yy");
Add(5, 13U, "z", "zz");
UpdateVersionStorageInfo();
setCompactionInputFilesLevels(2, 1);
AddToCompactionFiles(1, 3U, "d", "i");
AddToCompactionFiles(2, 5U, "g", "i");
SetCompactionInputFilesLevels(2, 1);
AddToCompactionFiles(3U);
AddToCompactionFiles(5U);
result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
ASSERT_FALSE(result);
// case 3.1: Higher levels (level 5) have overlap
DeleteVersionStorage();
// case 3.3: Higher levels (level 5) have overlap, but it's only overlapping
// one key ("d")
NewVersionStorage(6, kCompactionStyleLevel);
Add(0, 1U, "a", "c");
Add(0, 2U, "y", "z");
Add(0, 1U, "a", "m");
Add(0, 2U, "c", "z");
Add(1, 3U, "d", "e");
Add(1, 4U, "l", "p");
Add(2, 5U, "g", "i");
@ -770,11 +760,66 @@ TEST_F(CompactionPickerTest, IsBottommostLevelTest) {
Add(5, 12U, "y", "yy");
Add(5, 13U, "z", "zz");
UpdateVersionStorageInfo();
setCompactionInputFilesLevels(2, 1);
AddToCompactionFiles(1, 3U, "d", "i");
AddToCompactionFiles(2, 5U, "g", "i");
SetCompactionInputFilesLevels(2, 1);
AddToCompactionFiles(3U);
AddToCompactionFiles(5U);
result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
ASSERT_FALSE(result);
// Level 0 files overlap
NewVersionStorage(6, kCompactionStyleLevel);
Add(0, 1U, "s", "t");
Add(0, 2U, "a", "m");
Add(0, 3U, "b", "z");
Add(0, 4U, "e", "f");
Add(5, 10U, "y", "z");
UpdateVersionStorageInfo();
SetCompactionInputFilesLevels(1, 0);
AddToCompactionFiles(1U);
AddToCompactionFiles(2U);
AddToCompactionFiles(3U);
AddToCompactionFiles(4U);
result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
ASSERT_FALSE(result);
// Level 0 files don't overlap
NewVersionStorage(6, kCompactionStyleLevel);
Add(0, 1U, "s", "t");
Add(0, 2U, "a", "m");
Add(0, 3U, "b", "k");
Add(0, 4U, "e", "f");
Add(5, 10U, "y", "z");
UpdateVersionStorageInfo();
SetCompactionInputFilesLevels(1, 0);
AddToCompactionFiles(1U);
AddToCompactionFiles(2U);
AddToCompactionFiles(3U);
AddToCompactionFiles(4U);
result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
ASSERT_TRUE(result);
// Level 1 files overlap
NewVersionStorage(6, kCompactionStyleLevel);
Add(0, 1U, "s", "t");
Add(0, 2U, "a", "m");
Add(0, 3U, "b", "k");
Add(0, 4U, "e", "f");
Add(1, 5U, "a", "m");
Add(1, 6U, "n", "o");
Add(1, 7U, "w", "y");
Add(5, 10U, "y", "z");
UpdateVersionStorageInfo();
SetCompactionInputFilesLevels(2, 0);
AddToCompactionFiles(1U);
AddToCompactionFiles(2U);
AddToCompactionFiles(3U);
AddToCompactionFiles(4U);
AddToCompactionFiles(5U);
AddToCompactionFiles(6U);
AddToCompactionFiles(7U);
result = Compaction::TEST_IsBottommostLevel(2, vstorage_.get(), input_files_);
ASSERT_FALSE(result);
DeleteVersionStorage();
}

@ -7,8 +7,8 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/db_test_util.h"
#include "port/stack_trace.h"
#include "util/db_test_util.h"
namespace rocksdb {
@ -97,8 +97,11 @@ class ChangeFilter : public CompactionFilter {
class KeepFilterFactory : public CompactionFilterFactory {
public:
explicit KeepFilterFactory(bool check_context = false)
: check_context_(check_context) {}
explicit KeepFilterFactory(bool check_context = false,
bool check_context_cf_id = false)
: check_context_(check_context),
check_context_cf_id_(check_context_cf_id),
compaction_filter_created_(false) {}
virtual std::unique_ptr<CompactionFilter> CreateCompactionFilter(
const CompactionFilter::Context& context) override {
@ -106,13 +109,22 @@ class KeepFilterFactory : public CompactionFilterFactory {
EXPECT_EQ(expect_full_compaction_.load(), context.is_full_compaction);
EXPECT_EQ(expect_manual_compaction_.load(), context.is_manual_compaction);
}
if (check_context_cf_id_) {
EXPECT_EQ(expect_cf_id_.load(), context.column_family_id);
}
compaction_filter_created_ = true;
return std::unique_ptr<CompactionFilter>(new KeepFilter());
}
bool compaction_filter_created() const { return compaction_filter_created_; }
virtual const char* Name() const override { return "KeepFilterFactory"; }
bool check_context_;
bool check_context_cf_id_;
std::atomic_bool expect_full_compaction_;
std::atomic_bool expect_manual_compaction_;
std::atomic<uint32_t> expect_cf_id_;
bool compaction_filter_created_;
};
class DeleteFilterFactory : public CompactionFilterFactory {
@ -482,7 +494,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterWithMergeOperator) {
}
TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) {
KeepFilterFactory* filter = new KeepFilterFactory();
KeepFilterFactory* filter = new KeepFilterFactory(true, true);
Options options = CurrentOptions();
options.compaction_style = kCompactionStyleUniversal;
@ -504,15 +516,17 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) {
// be triggered.
num_keys_per_file /= 2;
}
dbfull()->TEST_WaitForCompact();
// Force a manual compaction
cfilter_count = 0;
filter->expect_manual_compaction_.store(true);
filter->expect_full_compaction_.store(false); // Manual compaction always
// set this flag.
filter->expect_full_compaction_.store(true);
filter->expect_cf_id_.store(0);
dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
ASSERT_EQ(cfilter_count, 700);
ASSERT_EQ(NumSortedRuns(0), 1);
ASSERT_TRUE(filter->compaction_filter_created());
// Verify total number of keys is correct after manual compaction.
{
@ -537,6 +551,35 @@ TEST_F(DBTestCompactionFilter, CompactionFilterContextManual) {
}
}
TEST_F(DBTestCompactionFilter, CompactionFilterContextCfId) {
KeepFilterFactory* filter = new KeepFilterFactory(false, true);
filter->expect_cf_id_.store(1);
Options options = CurrentOptions();
options.compaction_filter_factory.reset(filter);
options.compression = kNoCompression;
options.level0_file_num_compaction_trigger = 2;
CreateAndReopenWithCF({"pikachu"}, options);
int num_keys_per_file = 400;
for (int j = 0; j < 3; j++) {
// Write several keys.
const std::string value(10, 'x');
for (int i = 0; i < num_keys_per_file; i++) {
char key[100];
snprintf(key, sizeof(key), "B%08d%02d", i, j);
Put(1, key, value);
}
Flush(1);
// Make sure next file is much smaller so automatic compaction will not
// be triggered.
num_keys_per_file /= 2;
}
dbfull()->TEST_WaitForCompact();
ASSERT_TRUE(filter->compaction_filter_created());
}
// Compaction filters should only be applied to records that are newer than the
// latest snapshot. This test inserts records and applies a delete filter.
TEST_F(DBTestCompactionFilter, CompactionFilterSnapshot) {

@ -7,9 +7,9 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/db_test_util.h"
#include "port/stack_trace.h"
#include "rocksdb/experimental.h"
#include "util/db_test_util.h"
#include "util/sync_point.h"
namespace rocksdb {

@ -12,8 +12,8 @@
// which is a pity, it is a good test
#if !(defined NDEBUG) || !defined(OS_WIN)
#include "db/db_test_util.h"
#include "port/stack_trace.h"
#include "util/db_test_util.h"
namespace rocksdb {
class DBTestDynamicLevel : public DBTestBase {

@ -1338,8 +1338,8 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd,
s = BuildTable(
dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(),
iter.get(), &meta, cfd->internal_comparator(),
cfd->int_tbl_prop_collector_factories(), snapshots_.GetAll(),
GetCompressionFlush(*cfd->ioptions()),
cfd->int_tbl_prop_collector_factories(), cfd->GetID(),
snapshots_.GetAll(), GetCompressionFlush(*cfd->ioptions()),
cfd->ioptions()->compression_opts, paranoid_file_checks,
cfd->internal_stats(), Env::IO_HIGH, &info.table_properties);
LogFlush(db_options_.info_log);
@ -1433,15 +1433,16 @@ Status DBImpl::FlushMemTableToOutputFile(
if (s.ok()) {
// may temporarily unlock and lock the mutex.
NotifyOnFlushCompleted(cfd, &file_meta, mutable_cf_options,
job_context->job_id);
job_context->job_id, flush_job.GetTableProperties());
}
#endif // ROCKSDB_LITE
return s;
}
void DBImpl::NotifyOnFlushCompleted(
ColumnFamilyData* cfd, FileMetaData* file_meta,
const MutableCFOptions& mutable_cf_options, int job_id) {
void DBImpl::NotifyOnFlushCompleted(ColumnFamilyData* cfd,
FileMetaData* file_meta,
const MutableCFOptions& mutable_cf_options,
int job_id, TableProperties prop) {
#ifndef ROCKSDB_LITE
if (db_options_.listeners.size() == 0U) {
return;
@ -1471,6 +1472,7 @@ void DBImpl::NotifyOnFlushCompleted(
info.triggered_writes_stop = triggered_writes_stop;
info.smallest_seqno = file_meta->smallest_seqno;
info.largest_seqno = file_meta->largest_seqno;
info.table_properties = prop;
for (auto listener : db_options_.listeners) {
listener->OnFlushCompleted(this, info);
}
@ -1816,12 +1818,20 @@ void DBImpl::NotifyOnCompactionCompleted(
info.base_input_level = c->start_level();
info.output_level = c->output_level();
info.stats = compaction_job_stats;
info.table_properties = c->GetOutputTableProperties();
for (size_t i = 0; i < c->num_input_levels(); ++i) {
for (const auto fmd : *c->inputs(i)) {
info.input_files.push_back(
TableFileName(db_options_.db_paths,
fmd->fd.GetNumber(),
fmd->fd.GetPathId()));
auto fn = TableFileName(db_options_.db_paths, fmd->fd.GetNumber(),
fmd->fd.GetPathId());
info.input_files.push_back(fn);
if (info.table_properties.count(fn) == 0) {
std::shared_ptr<const TableProperties> tp;
std::string fname;
auto s = cfd->current()->GetTableProperties(&tp, fmd, &fname);
if (s.ok()) {
info.table_properties[fn] = tp;
}
}
}
}
for (const auto newf : c->edit()->GetNewFiles()) {
@ -4502,6 +4512,10 @@ Status DBImpl::CheckConsistency() {
uint64_t fsize = 0;
Status s = env_->GetFileSize(file_path, &fsize);
if (!s.ok() &&
env_->GetFileSize(Rocks2LevelTableFileName(file_path), &fsize).ok()) {
s = Status::OK();
}
if (!s.ok()) {
corruption_messages +=
"Can't access " + md.name + ": " + s.ToString() + "\n";

@ -51,7 +51,6 @@ class TableCache;
class Version;
class VersionEdit;
class VersionSet;
class CompactionFilterV2;
class Arena;
class WriteCallback;
struct JobContext;
@ -376,7 +375,7 @@ class DBImpl : public DB {
void NotifyOnFlushCompleted(ColumnFamilyData* cfd, FileMetaData* file_meta,
const MutableCFOptions& mutable_cf_options,
int job_id);
int job_id, TableProperties prop);
void NotifyOnCompactionCompleted(ColumnFamilyData* cfd,
Compaction *c, const Status &st,

@ -6,8 +6,8 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/db_test_util.h"
#include "port/stack_trace.h"
#include "util/db_test_util.h"
namespace rocksdb {

@ -12,8 +12,8 @@
// which is a pity, it is a good test
#if !(defined NDEBUG) || !defined(OS_WIN)
#include "db/db_test_util.h"
#include "port/stack_trace.h"
#include "util/db_test_util.h"
namespace rocksdb {

@ -12,9 +12,9 @@
// which is a pity, it is a good test
#if !(defined NDEBUG) || !defined(OS_WIN)
#include "db/db_test_util.h"
#include "db/forward_iterator.h"
#include "port/stack_trace.h"
#include "util/db_test_util.h"
namespace rocksdb {

@ -26,7 +26,7 @@
#include "db/filename.h"
#include "db/dbformat.h"
#include "db/db_impl.h"
#include "db/filename.h"
#include "db/db_test_util.h"
#include "db/job_context.h"
#include "db/version_set.h"
#include "db/write_batch_internal.h"
@ -55,7 +55,6 @@
#include "table/block_based_table_factory.h"
#include "table/mock_table.h"
#include "table/plain_table_factory.h"
#include "util/db_test_util.h"
#include "util/file_reader_writer.h"
#include "util/hash.h"
#include "util/hash_linklist_rep.h"
@ -133,6 +132,46 @@ class DBTestWithParam : public DBTest,
uint32_t max_subcompactions_;
};
class BloomStatsTestWithParam
: public DBTest,
public testing::WithParamInterface<std::tuple<bool, bool>> {
public:
BloomStatsTestWithParam() {
use_block_table_ = std::get<0>(GetParam());
use_block_based_builder_ = std::get<1>(GetParam());
options_.create_if_missing = true;
options_.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(4));
options_.memtable_prefix_bloom_bits = 8 * 1024;
if (use_block_table_) {
BlockBasedTableOptions table_options;
table_options.hash_index_allow_collision = false;
table_options.filter_policy.reset(
NewBloomFilterPolicy(10, use_block_based_builder_));
options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
} else {
PlainTableOptions table_options;
options_.table_factory.reset(NewPlainTableFactory(table_options));
}
perf_context.Reset();
DestroyAndReopen(options_);
}
~BloomStatsTestWithParam() {
perf_context.Reset();
Destroy(options_);
}
// Required if inheriting from testing::WithParamInterface<>
static void SetUpTestCase() {}
static void TearDownTestCase() {}
bool use_block_table_;
bool use_block_based_builder_;
Options options_;
};
TEST_F(DBTest, Empty) {
do {
Options options;
@ -604,10 +643,10 @@ TEST_F(DBTest, AggregatedTableProperties) {
TEST_F(DBTest, ReadLatencyHistogramByLevel) {
Options options = CurrentOptions();
options.write_buffer_size = 110 << 10;
options.level0_file_num_compaction_trigger = 3;
options.level0_file_num_compaction_trigger = 6;
options.num_levels = 4;
options.compression = kNoCompression;
options.max_bytes_for_level_base = 450 << 10;
options.max_bytes_for_level_base = 4500 << 10;
options.target_file_size_base = 98 << 10;
options.max_write_buffer_number = 2;
options.statistics = rocksdb::CreateDBStatistics();
@ -619,10 +658,11 @@ TEST_F(DBTest, ReadLatencyHistogramByLevel) {
DestroyAndReopen(options);
int key_index = 0;
Random rnd(301);
for (int num = 0; num < 5; num++) {
for (int num = 0; num < 7; num++) {
Put("foo", "bar");
GenerateNewFile(&rnd, &key_index);
}
dbfull()->TEST_WaitForCompact();
std::string prop;
ASSERT_TRUE(dbfull()->GetProperty("rocksdb.dbstats", &prop));
@ -638,6 +678,7 @@ TEST_F(DBTest, ReadLatencyHistogramByLevel) {
// Reopen and issue Get(). See thee latency tracked
Reopen(options);
dbfull()->TEST_WaitForCompact();
for (int key = 0; key < 500; key++) {
Get(Key(key));
}
@ -781,21 +822,34 @@ class CoutingUserTblPropCollector : public TablePropertiesCollector {
class CoutingUserTblPropCollectorFactory
: public TablePropertiesCollectorFactory {
public:
virtual TablePropertiesCollector* CreateTablePropertiesCollector() override {
explicit CoutingUserTblPropCollectorFactory(
uint32_t expected_column_family_id)
: expected_column_family_id_(expected_column_family_id),
num_created_(0) {}
virtual TablePropertiesCollector* CreateTablePropertiesCollector(
TablePropertiesCollectorFactory::Context context) override {
EXPECT_EQ(expected_column_family_id_, context.column_family_id);
num_created_++;
return new CoutingUserTblPropCollector();
}
const char* Name() const override {
return "CoutingUserTblPropCollectorFactory";
}
void set_expected_column_family_id(uint32_t v) {
expected_column_family_id_ = v;
}
uint32_t expected_column_family_id_;
uint32_t num_created_;
};
TEST_F(DBTest, GetUserDefinedTablaProperties) {
TEST_F(DBTest, GetUserDefinedTableProperties) {
Options options = CurrentOptions();
options.level0_file_num_compaction_trigger = (1<<30);
options.max_background_flushes = 0;
options.table_properties_collector_factories.resize(1);
options.table_properties_collector_factories[0] =
std::make_shared<CoutingUserTblPropCollectorFactory>();
std::shared_ptr<CoutingUserTblPropCollectorFactory> collector_factory =
std::make_shared<CoutingUserTblPropCollectorFactory>(0);
options.table_properties_collector_factories[0] = collector_factory;
Reopen(options);
// Create 4 tables
for (int table = 0; table < 4; ++table) {
@ -821,6 +875,72 @@ TEST_F(DBTest, GetUserDefinedTablaProperties) {
sum += count;
}
ASSERT_EQ(10u + 11u + 12u + 13u, sum);
ASSERT_GT(collector_factory->num_created_, 0);
collector_factory->num_created_ = 0;
dbfull()->TEST_CompactRange(0, nullptr, nullptr);
ASSERT_GT(collector_factory->num_created_, 0);
}
TEST_F(DBTest, UserDefinedTablePropertiesContext) {
Options options = CurrentOptions();
options.level0_file_num_compaction_trigger = 3;
options.max_background_flushes = 0;
options.table_properties_collector_factories.resize(1);
std::shared_ptr<CoutingUserTblPropCollectorFactory> collector_factory =
std::make_shared<CoutingUserTblPropCollectorFactory>(1);
options.table_properties_collector_factories[0] = collector_factory,
CreateAndReopenWithCF({"pikachu"}, options);
// Create 2 files
for (int table = 0; table < 2; ++table) {
for (int i = 0; i < 10 + table; ++i) {
Put(1, ToString(table * 100 + i), "val");
}
Flush(1);
}
ASSERT_GT(collector_factory->num_created_, 0);
collector_factory->num_created_ = 0;
// Trigger automatic compactions.
for (int table = 0; table < 3; ++table) {
for (int i = 0; i < 10 + table; ++i) {
Put(1, ToString(table * 100 + i), "val");
}
Flush(1);
dbfull()->TEST_WaitForCompact();
}
ASSERT_GT(collector_factory->num_created_, 0);
collector_factory->num_created_ = 0;
dbfull()->TEST_CompactRange(0, nullptr, nullptr, handles_[1]);
ASSERT_GT(collector_factory->num_created_, 0);
// Come back to write to default column family
collector_factory->num_created_ = 0;
collector_factory->set_expected_column_family_id(0); // default CF
// Create 4 tables in default column family
for (int table = 0; table < 2; ++table) {
for (int i = 0; i < 10 + table; ++i) {
Put(ToString(table * 100 + i), "val");
}
Flush();
}
ASSERT_GT(collector_factory->num_created_, 0);
collector_factory->num_created_ = 0;
// Trigger automatic compactions.
for (int table = 0; table < 3; ++table) {
for (int i = 0; i < 10 + table; ++i) {
Put(ToString(table * 100 + i), "val");
}
Flush();
dbfull()->TEST_WaitForCompact();
}
ASSERT_GT(collector_factory->num_created_, 0);
collector_factory->num_created_ = 0;
dbfull()->TEST_CompactRange(0, nullptr, nullptr);
ASSERT_GT(collector_factory->num_created_, 0);
}
TEST_F(DBTest, LevelLimitReopen) {
@ -8244,7 +8364,8 @@ class CountingDeleteTabPropCollector : public TablePropertiesCollector {
class CountingDeleteTabPropCollectorFactory
: public TablePropertiesCollectorFactory {
public:
virtual TablePropertiesCollector* CreateTablePropertiesCollector() override {
virtual TablePropertiesCollector* CreateTablePropertiesCollector(
TablePropertiesCollectorFactory::Context context) override {
return new CountingDeleteTabPropCollector();
}
const char* Name() const override {
@ -8268,8 +8389,8 @@ TEST_F(DBTest, TablePropertiesNeedCompactTest) {
options.soft_rate_limit = 1.1;
options.num_levels = 8;
std::shared_ptr<TablePropertiesCollectorFactory> collector_factory(
new CountingDeleteTabPropCollectorFactory);
std::shared_ptr<TablePropertiesCollectorFactory> collector_factory =
std::make_shared<CountingDeleteTabPropCollectorFactory>();
options.table_properties_collector_factories.resize(1);
options.table_properties_collector_factories[0] = collector_factory;
@ -8328,6 +8449,61 @@ TEST_F(DBTest, TablePropertiesNeedCompactTest) {
}
}
TEST_F(DBTest, NeedCompactHintPersistentTest) {
Random rnd(301);
Options options;
options.create_if_missing = true;
options.max_write_buffer_number = 8;
options.level0_file_num_compaction_trigger = 10;
options.level0_slowdown_writes_trigger = 10;
options.level0_stop_writes_trigger = 10;
options.disable_auto_compactions = true;
std::shared_ptr<TablePropertiesCollectorFactory> collector_factory =
std::make_shared<CountingDeleteTabPropCollectorFactory>();
options.table_properties_collector_factories.resize(1);
options.table_properties_collector_factories[0] = collector_factory;
DestroyAndReopen(options);
const int kMaxKey = 100;
for (int i = 0; i < kMaxKey; i++) {
ASSERT_OK(Put(Key(i), ""));
}
Flush();
dbfull()->TEST_WaitForFlushMemTable();
for (int i = 1; i < kMaxKey - 1; i++) {
Delete(Key(i));
}
Flush();
dbfull()->TEST_WaitForFlushMemTable();
ASSERT_EQ(NumTableFilesAtLevel(0), 2);
// Restart the DB. Although number of files didn't reach
// options.level0_file_num_compaction_trigger, compaction should
// still be triggered because of the need-compaction hint.
options.disable_auto_compactions = false;
Reopen(options);
dbfull()->TEST_WaitForCompact();
ASSERT_EQ(NumTableFilesAtLevel(0), 0);
{
SetPerfLevel(kEnableCount);
perf_context.Reset();
int c = 0;
std::unique_ptr<Iterator> iter(db_->NewIterator(ReadOptions()));
for (iter->Seek(Key(0)); iter->Valid(); iter->Next()) {
c++;
}
ASSERT_EQ(c, 2);
ASSERT_EQ(perf_context.internal_delete_skipped_count, 0);
// We iterate every key twice. Is it a bug?
ASSERT_LE(perf_context.internal_key_skipped_count, 2);
SetPerfLevel(kDisable);
}
}
TEST_F(DBTest, SuggestCompactRangeTest) {
class CompactionFilterFactoryGetContext : public CompactionFilterFactory {
public:
@ -9640,6 +9816,47 @@ TEST_F(DBTest, AddExternalSstFileMultiThreaded) {
kSkipFIFOCompaction));
}
// 1 Create some SST files by inserting K-V pairs into DB
// 2 Close DB and change suffix from ".sst" to ".ldb" for every other SST file
// 3 Open DB and check if all key can be read
TEST_F(DBTest, SSTsWithLdbSuffixHandling) {
Options options = CurrentOptions();
options.write_buffer_size = 110 << 10; // 110KB
options.num_levels = 4;
DestroyAndReopen(options);
Random rnd(301);
int key_id = 0;
for (int i = 0; i < 10; ++i) {
GenerateNewFile(&rnd, &key_id, false);
}
Flush();
Close();
int const num_files = GetSstFileCount(dbname_);
ASSERT_GT(num_files, 0);
std::vector<std::string> filenames;
GetSstFiles(dbname_, &filenames);
int num_ldb_files = 0;
for (unsigned int i = 0; i < filenames.size(); ++i) {
if (i & 1) {
continue;
}
std::string const rdb_name = dbname_ + "/" + filenames[i];
std::string const ldb_name = Rocks2LevelTableFileName(rdb_name);
ASSERT_TRUE(env_->RenameFile(rdb_name, ldb_name).ok());
++num_ldb_files;
}
ASSERT_GT(num_ldb_files, 0);
ASSERT_EQ(num_files, GetSstFileCount(dbname_));
Reopen(options);
for (int k = 0; k < key_id; ++k) {
ASSERT_NE("NOT_FOUND", Get(Key(k)));
}
Destroy(options);
}
INSTANTIATE_TEST_CASE_P(DBTestWithParam, DBTestWithParam,
::testing::Values(1, 4));
@ -9846,6 +10063,120 @@ TEST_F(DBTest, WalFilterTest) {
}
}
}
// 1 Insert 2 K-V pairs into DB
// 2 Call Get() for both keys - expext memtable bloom hit stat to be 2
// 3 Call Get() for nonexisting key - expect memtable bloom miss stat to be 1
// 4 Call Flush() to create SST
// 5 Call Get() for both keys - expext SST bloom hit stat to be 2
// 6 Call Get() for nonexisting key - expect SST bloom miss stat to be 1
// Test both: block and plain SST
TEST_P(BloomStatsTestWithParam, BloomStatsTest) {
std::string key1("AAAA");
std::string key2("RXDB"); // not in DB
std::string key3("ZBRA");
std::string value1("Value1");
std::string value3("Value3");
ASSERT_OK(Put(key1, value1, WriteOptions()));
ASSERT_OK(Put(key3, value3, WriteOptions()));
// check memtable bloom stats
ASSERT_EQ(value1, Get(key1));
ASSERT_EQ(1, perf_context.bloom_memtable_hit_count);
ASSERT_EQ(value3, Get(key3));
ASSERT_EQ(2, perf_context.bloom_memtable_hit_count);
ASSERT_EQ(0, perf_context.bloom_memtable_miss_count);
ASSERT_EQ("NOT_FOUND", Get(key2));
ASSERT_EQ(1, perf_context.bloom_memtable_miss_count);
ASSERT_EQ(2, perf_context.bloom_memtable_hit_count);
// sanity checks
ASSERT_EQ(0, perf_context.bloom_sst_hit_count);
ASSERT_EQ(0, perf_context.bloom_sst_miss_count);
Flush();
// sanity checks
ASSERT_EQ(0, perf_context.bloom_sst_hit_count);
ASSERT_EQ(0, perf_context.bloom_sst_miss_count);
// check SST bloom stats
// NOTE: hits per get differs because of code paths differences
// in BlockBasedTable::Get()
int hits_per_get = use_block_table_ && !use_block_based_builder_ ? 2 : 1;
ASSERT_EQ(value1, Get(key1));
ASSERT_EQ(hits_per_get, perf_context.bloom_sst_hit_count);
ASSERT_EQ(value3, Get(key3));
ASSERT_EQ(2 * hits_per_get, perf_context.bloom_sst_hit_count);
ASSERT_EQ("NOT_FOUND", Get(key2));
ASSERT_EQ(1, perf_context.bloom_sst_miss_count);
}
// Same scenario as in BloomStatsTest but using an iterator
TEST_P(BloomStatsTestWithParam, BloomStatsTestWithIter) {
std::string key1("AAAA");
std::string key2("RXDB"); // not in DB
std::string key3("ZBRA");
std::string value1("Value1");
std::string value3("Value3");
ASSERT_OK(Put(key1, value1, WriteOptions()));
ASSERT_OK(Put(key3, value3, WriteOptions()));
unique_ptr<Iterator> iter(dbfull()->NewIterator(ReadOptions()));
// check memtable bloom stats
iter->Seek(key1);
ASSERT_OK(iter->status());
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(value1, iter->value().ToString());
ASSERT_EQ(1, perf_context.bloom_memtable_hit_count);
ASSERT_EQ(0, perf_context.bloom_memtable_miss_count);
iter->Seek(key3);
ASSERT_OK(iter->status());
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(value3, iter->value().ToString());
ASSERT_EQ(2, perf_context.bloom_memtable_hit_count);
ASSERT_EQ(0, perf_context.bloom_memtable_miss_count);
iter->Seek(key2);
ASSERT_OK(iter->status());
ASSERT_TRUE(!iter->Valid());
ASSERT_EQ(1, perf_context.bloom_memtable_miss_count);
ASSERT_EQ(2, perf_context.bloom_memtable_hit_count);
Flush();
iter.reset(dbfull()->NewIterator(ReadOptions()));
// check SST bloom stats
iter->Seek(key1);
ASSERT_OK(iter->status());
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(value1, iter->value().ToString());
ASSERT_EQ(1, perf_context.bloom_sst_hit_count);
iter->Seek(key3);
ASSERT_OK(iter->status());
ASSERT_TRUE(iter->Valid());
ASSERT_EQ(value3, iter->value().ToString());
ASSERT_EQ(2, perf_context.bloom_sst_hit_count);
iter->Seek(key2);
ASSERT_OK(iter->status());
ASSERT_TRUE(!iter->Valid());
ASSERT_EQ(1, perf_context.bloom_sst_miss_count);
ASSERT_EQ(2, perf_context.bloom_sst_hit_count);
}
INSTANTIATE_TEST_CASE_P(BloomStatsTestWithParam, BloomStatsTestWithParam,
::testing::Values(std::make_tuple(true, true),
std::make_tuple(true, false),
std::make_tuple(false, false)));
} // namespace rocksdb
#endif

@ -7,7 +7,7 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "util/db_test_util.h"
#include "db/db_test_util.h"
namespace rocksdb {
@ -794,19 +794,22 @@ std::string DBTestBase::DumpSSTableList() {
return property;
}
int DBTestBase::GetSstFileCount(std::string path) {
std::vector<std::string> files;
env_->GetChildren(path, &files);
void DBTestBase::GetSstFiles(std::string path,
std::vector<std::string>* files) {
env_->GetChildren(path, files);
int sst_count = 0;
files->erase(
std::remove_if(files->begin(), files->end(), [](std::string name) {
uint64_t number;
FileType type;
for (size_t i = 0; i < files.size(); i++) {
if (ParseFileName(files[i], &number, &type) && type == kTableFile) {
sst_count++;
}
return !(ParseFileName(name, &number, &type) && type == kTableFile);
}), files->end());
}
return sst_count;
int DBTestBase::GetSstFileCount(std::string path) {
std::vector<std::string> files;
GetSstFiles(path, &files);
return static_cast<int>(files.size());
}
// this will generate non-overlapping files since it keeps increasing key_idx

@ -27,6 +27,7 @@
#include <vector>
#include "db/db_impl.h"
#include "db/db_test_util.h"
#include "db/dbformat.h"
#include "db/filename.h"
#include "rocksdb/cache.h"
@ -43,7 +44,6 @@
#include "table/mock_table.h"
#include "table/plain_table_factory.h"
#include "util/compression.h"
#include "util/db_test_util.h"
#include "util/hash_linklist_rep.h"
#include "util/mock_env.h"
#include "util/mutexlock.h"
@ -618,6 +618,8 @@ class DBTestBase : public testing::Test {
std::string DumpSSTableList();
void GetSstFiles(std::string path, std::vector<std::string>* files);
int GetSstFileCount(std::string path);
// this will generate non-overlapping files since it keeps increasing key_idx

@ -7,8 +7,8 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/db_test_util.h"
#include "port/stack_trace.h"
#include "util/db_test_util.h"
#if !(defined NDEBUG) || !defined(OS_WIN)
#include "util/sync_point.h"

@ -7,8 +7,8 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/db_test_util.h"
#include "port/stack_trace.h"
#include "util/db_test_util.h"
#if !(defined NDEBUG) || !defined(OS_WIN)
#include "util/sync_point.h"
#endif

@ -53,7 +53,7 @@ void EventHelpers::LogAndNotifyTableFileCreation(
info.table_properties.filter_policy_name;
// user collected properties
for (const auto& prop : info.table_properties.user_collected_properties) {
for (const auto& prop : info.table_properties.readable_properties) {
jwriter << prop.first << prop.second;
}
jwriter.EndObject();

@ -24,6 +24,9 @@
namespace rocksdb {
static const std::string kRocksDbTFileExt = "sst";
static const std::string kLevelDbTFileExt = "ldb";
// Given a path, flatten the path name by replacing all chars not in
// {[0-9,a-z,A-Z,-,_,.]} with _. And append '_LOG\0' at the end.
// Return the number of chars stored in dest not including the trailing '\0'.
@ -78,7 +81,16 @@ std::string ArchivedLogFileName(const std::string& name, uint64_t number) {
}
std::string MakeTableFileName(const std::string& path, uint64_t number) {
return MakeFileName(path, number, "sst");
return MakeFileName(path, number, kRocksDbTFileExt.c_str());
}
std::string Rocks2LevelTableFileName(const std::string& fullname) {
assert(fullname.size() > kRocksDbTFileExt.size() + 1);
if (fullname.size() <= kRocksDbTFileExt.size() + 1) {
return "";
}
return fullname.substr(0, fullname.size() - kRocksDbTFileExt.size()) +
kLevelDbTFileExt;
}
uint64_t TableFileNameToNumber(const std::string& name) {
@ -273,17 +285,23 @@ bool ParseFileName(const std::string& fname, uint64_t* number,
if (!ConsumeDecimalNumber(&rest, &num)) {
return false;
}
if (rest.size() <= 1 || rest[0] != '.') {
return false;
}
rest.remove_prefix(1);
Slice suffix = rest;
if (suffix == Slice(".log")) {
if (suffix == Slice("log")) {
*type = kLogFile;
if (log_type && !archive_dir_found) {
*log_type = kAliveLogFile;
}
} else if (archive_dir_found) {
return false; // Archive dir can contain only log files
} else if (suffix == Slice(".sst")) {
} else if (suffix == Slice(kRocksDbTFileExt) ||
suffix == Slice(kLevelDbTFileExt)) {
*type = kTableFile;
} else if (suffix == Slice(".dbtmp")) {
} else if (suffix == Slice("dbtmp")) {
*type = kTempFile;
} else {
return false;

@ -55,6 +55,10 @@ extern std::string ArchivedLogFileName(const std::string& dbname,
extern std::string MakeTableFileName(const std::string& name, uint64_t number);
// Return the name of sstable with LevelDB suffix
// created from RocksDB sstable suffixed name
extern std::string Rocks2LevelTableFileName(const std::string& fullname);
// the reverse function of MakeTableFileName
// TODO(yhchiang): could merge this function with ParseFileName()
extern uint64_t TableFileNameToNumber(const std::string& name);

@ -231,13 +231,15 @@ Status FlushJob::WriteLevel0Table(const autovector<MemTable*>& mems,
TEST_SYNC_POINT_CALLBACK("FlushJob::WriteLevel0Table:output_compression",
&output_compression_);
s = BuildTable(
dbname_, db_options_.env, *cfd_->ioptions(), env_options_,
cfd_->table_cache(), iter.get(), meta, cfd_->internal_comparator(),
cfd_->int_tbl_prop_collector_factories(), existing_snapshots_,
output_compression_, cfd_->ioptions()->compression_opts,
mutable_cf_options_.paranoid_file_checks, cfd_->internal_stats(),
Env::IO_HIGH, &info.table_properties);
s = BuildTable(dbname_, db_options_.env, *cfd_->ioptions(), env_options_,
cfd_->table_cache(), iter.get(), meta,
cfd_->internal_comparator(),
cfd_->int_tbl_prop_collector_factories(), cfd_->GetID(),
existing_snapshots_, output_compression_,
cfd_->ioptions()->compression_opts,
mutable_cf_options_.paranoid_file_checks,
cfd_->internal_stats(), Env::IO_HIGH, &table_properties_);
info.table_properties = table_properties_;
LogFlush(db_options_.info_log);
}
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,

@ -66,6 +66,7 @@ class FlushJob {
~FlushJob();
Status Run(FileMetaData* file_meta = nullptr);
TableProperties GetTableProperties() const { return table_properties_; }
private:
void ReportStartedFlush();
@ -89,6 +90,7 @@ class FlushJob {
CompressionType output_compression_;
Statistics* stats_;
EventLogger* event_logger_;
TableProperties table_properties_;
};
} // namespace rocksdb

@ -152,6 +152,40 @@ class EventListenerTest : public testing::Test {
std::vector<ColumnFamilyHandle*> handles_;
};
struct TestPropertiesCollector : public rocksdb::TablePropertiesCollector {
virtual rocksdb::Status AddUserKey(const rocksdb::Slice& key,
const rocksdb::Slice& value,
rocksdb::EntryType type,
rocksdb::SequenceNumber seq,
uint64_t file_size) override {
return Status::OK();
}
virtual rocksdb::Status Finish(
rocksdb::UserCollectedProperties* properties) override {
properties->insert({"0", "1"});
return Status::OK();
}
virtual const char* Name() const override {
return "TestTablePropertiesCollector";
}
rocksdb::UserCollectedProperties GetReadableProperties() const override {
rocksdb::UserCollectedProperties ret;
ret["2"] = "3";
return ret;
}
};
class TestPropertiesCollectorFactory : public TablePropertiesCollectorFactory {
public:
virtual TablePropertiesCollector* CreateTablePropertiesCollector(
TablePropertiesCollectorFactory::Context context) override {
return new TestPropertiesCollector;
}
const char* Name() const override { return "TestTablePropertiesCollector"; }
};
class TestCompactionListener : public EventListener {
public:
void OnCompactionCompleted(DB *db, const CompactionJobInfo& ci) override {
@ -161,6 +195,16 @@ class TestCompactionListener : public EventListener {
ASSERT_GT(ci.output_files.size(), 0U);
ASSERT_EQ(db->GetEnv()->GetThreadID(), ci.thread_id);
ASSERT_GT(ci.thread_id, 0U);
for (auto fl : {ci.input_files, ci.output_files}) {
for (auto fn : fl) {
auto it = ci.table_properties.find(fn);
ASSERT_NE(it, ci.table_properties.end());
auto tp = it->second;
ASSERT_TRUE(tp != nullptr);
ASSERT_EQ(tp->user_collected_properties.find("0")->second, "1");
}
}
}
std::vector<DB*> compacted_dbs_;
@ -186,6 +230,8 @@ TEST_F(EventListenerTest, OnSingleDBCompactionTest) {
options.enable_thread_tracking = true;
#endif // ROCKSDB_USING_THREAD_STATUS
options.level0_file_num_compaction_trigger = kNumL0Files;
options.table_properties_collector_factories.push_back(
std::make_shared<TestPropertiesCollectorFactory>());
TestCompactionListener* listener = new TestCompactionListener();
options.listeners.emplace_back(listener);
@ -274,6 +320,8 @@ class TestFlushListener : public EventListener {
ASSERT_EQ(prev_fc_info_.file_path, info.file_path);
ASSERT_EQ(db->GetEnv()->GetThreadID(), info.thread_id);
ASSERT_GT(info.thread_id, 0U);
ASSERT_EQ(info.table_properties.user_collected_properties.find("0")->second,
"1");
}
std::vector<std::string> flushed_column_family_names_;
@ -299,6 +347,8 @@ TEST_F(EventListenerTest, OnSingleDBFlushTest) {
std::vector<std::string> cf_names = {
"pikachu", "ilya", "muromec", "dobrynia",
"nikitich", "alyosha", "popovich"};
options.table_properties_collector_factories.push_back(
std::make_shared<TestPropertiesCollectorFactory>());
CreateAndReopenWithCF(cf_names, &options);
ASSERT_OK(Put(1, "pikachu", std::string(90000, 'p')));
@ -330,6 +380,8 @@ TEST_F(EventListenerTest, MultiCF) {
#endif // ROCKSDB_USING_THREAD_STATUS
TestFlushListener* listener = new TestFlushListener(options.env);
options.listeners.emplace_back(listener);
options.table_properties_collector_factories.push_back(
std::make_shared<TestPropertiesCollectorFactory>());
std::vector<std::string> cf_names = {
"pikachu", "ilya", "muromec", "dobrynia",
"nikitich", "alyosha", "popovich"};
@ -360,6 +412,8 @@ TEST_F(EventListenerTest, MultiDBMultiListeners) {
#if ROCKSDB_USING_THREAD_STATUS
options.enable_thread_tracking = true;
#endif // ROCKSDB_USING_THREAD_STATUS
options.table_properties_collector_factories.push_back(
std::make_shared<TestPropertiesCollectorFactory>());
std::vector<TestFlushListener*> listeners;
const int kNumDBs = 5;
const int kNumListeners = 10;
@ -454,6 +508,8 @@ TEST_F(EventListenerTest, DisableBGCompaction) {
options.compaction_style = kCompactionStyleNone;
options.compression = kNoCompression;
options.write_buffer_size = 100000; // Small write buffer
options.table_properties_collector_factories.push_back(
std::make_shared<TestPropertiesCollectorFactory>());
CreateAndReopenWithCF({"pikachu"}, &options);
ColumnFamilyMetaData cf_meta;

@ -230,10 +230,15 @@ class MemTableIterator: public Iterator {
virtual void Seek(const Slice& k) override {
PERF_TIMER_GUARD(seek_on_memtable_time);
PERF_COUNTER_ADD(seek_on_memtable_count, 1);
if (bloom_ != nullptr &&
!bloom_->MayContain(prefix_extractor_->Transform(ExtractUserKey(k)))) {
if (bloom_ != nullptr) {
if (!bloom_->MayContain(
prefix_extractor_->Transform(ExtractUserKey(k)))) {
PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
valid_ = false;
return;
} else {
PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
}
}
iter_->Seek(k, nullptr);
valid_ = iter_->Valid();
@ -508,12 +513,18 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
Slice user_key = key.user_key();
bool found_final_value = false;
bool merge_in_progress = s->IsMergeInProgress();
if (prefix_bloom_ &&
!prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key))) {
bool const may_contain =
nullptr == prefix_bloom_
? false
: prefix_bloom_->MayContain(prefix_extractor_->Transform(user_key));
if (prefix_bloom_ && !may_contain) {
// iter is null if prefix bloom says the key does not exist
PERF_COUNTER_ADD(bloom_memtable_miss_count, 1);
*seq = kMaxSequenceNumber;
} else {
if (prefix_bloom_) {
PERF_COUNTER_ADD(bloom_memtable_hit_count, 1);
}
Saver saver;
saver.status = s;
saver.found_final_value = &found_final_value;

@ -14,7 +14,6 @@
#include "rocksdb/merge_operator.h"
#include "util/perf_context_imp.h"
#include "util/statistics.h"
#include "util/stop_watch.h"
namespace rocksdb {
@ -41,8 +40,7 @@ Status MergeHelper::TimedFullMerge(const Slice& key, const Slice* value,
bool success =
merge_operator->FullMerge(key, value, operands, result, logger);
RecordTick(statistics, MERGE_OPERATION_TOTAL_TIME,
env != nullptr ? timer.ElapsedNanos() : 0);
RecordTick(statistics, MERGE_OPERATION_TOTAL_TIME, timer.ElapsedNanosSafe());
if (!success) {
RecordTick(statistics, NUMBER_MERGE_FAILURES);
@ -59,30 +57,33 @@ Status MergeHelper::TimedFullMerge(const Slice& key, const Slice* value,
// operands_ stores the list of merge operands encountered while merging.
// keys_[i] corresponds to operands_[i] for each i.
Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before,
const bool at_bottom, Statistics* stats,
Env* env_) {
const bool at_bottom) {
// Get a copy of the internal key, before it's invalidated by iter->Next()
// Also maintain the list of merge operands seen.
assert(HasOperator());
keys_.clear();
operands_.clear();
keys_.push_front(iter->key().ToString());
operands_.push_front(iter->value().ToString());
assert(user_merge_operator_);
bool first_key = true;
// We need to parse the internal key again as the parsed key is
// backed by the internal key!
// Assume no internal key corruption as it has been successfully parsed
// by the caller.
// Invariant: keys_.back() will not change. Hence, orig_ikey is always valid.
// original_key_is_iter variable is just caching the information:
// original_key_is_iter == (iter->key().ToString() == original_key)
bool original_key_is_iter = true;
std::string original_key = iter->key().ToString();
// Important:
// orig_ikey is backed by original_key if keys_.empty()
// orig_ikey is backed by keys_.back() if !keys_.empty()
ParsedInternalKey orig_ikey;
ParseInternalKey(keys_.back(), &orig_ikey);
ParseInternalKey(original_key, &orig_ikey);
Status s;
bool hit_the_next_user_key = false;
for (iter->Next(); iter->Valid(); iter->Next()) {
for (; iter->Valid(); iter->Next(), original_key_is_iter = false) {
ParsedInternalKey ikey;
assert(operands_.size() >= 1); // Should be invariants!
assert(keys_.size() == operands_.size());
if (!ParseInternalKey(iter->key(), &ikey)) {
@ -92,6 +93,9 @@ Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before,
return Status::Corruption("Corrupted internal key not expected.");
}
break;
} else if (first_key) {
assert(user_comparator_->Equal(ikey.user_key, orig_ikey.user_key));
first_key = false;
} else if (!user_comparator_->Equal(ikey.user_key, orig_ikey.user_key)) {
// hit a different user key, stop right here
hit_the_next_user_key = true;
@ -105,16 +109,29 @@ Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before,
assert(IsValueType(ikey.type));
if (ikey.type != kTypeMerge) {
if (ikey.type != kTypeValue && ikey.type != kTypeDeletion) {
// Merges operands can only be used with puts and deletions, single
// deletions are not supported.
assert(ikey.type == kTypeValue || ikey.type == kTypeDeletion);
assert(false);
// release build doesn't have asserts, so we return error status
return Status::InvalidArgument(
" Merges operands can only be used with puts and deletions, single "
"deletions are not supported.");
}
// hit a put/delete
// => merge the put value or a nullptr with operands_
// => store result in operands_.back() (and update keys_.back())
// => change the entry type to kTypeValue for keys_.back()
// We are done! Success!
//
// If there are no operands, just return the Status::OK(). That will cause
// the compaction iterator to write out the key we're currently at, which
// is the put/delete we just encountered.
if (keys_.empty()) {
return Status::OK();
}
// TODO(noetzli) If the merge operator returns false, we are currently
// (almost) silently dropping the put/delete. That's probably not what we
// want.
@ -122,14 +139,14 @@ Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before,
const Slice* val_ptr = (kTypeValue == ikey.type) ? &val : nullptr;
std::string merge_result;
s = TimedFullMerge(ikey.user_key, val_ptr, operands_,
user_merge_operator_, stats, env_, logger_,
user_merge_operator_, stats_, env_, logger_,
&merge_result);
// We store the result in keys_.back() and operands_.back()
// if nothing went wrong (i.e.: no operand corruption on disk)
if (s.ok()) {
// The original key encountered
std::string original_key = std::move(keys_.back());
original_key = std::move(keys_.back());
orig_ikey.type = kTypeValue;
UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type);
keys_.clear();
@ -143,18 +160,42 @@ Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before,
return s;
} else {
// hit a merge
// => if there is a compaction filter, apply it.
// => merge the operand into the front of the operands_ list
// => use the user's associative merge function to determine how.
// if not filtered
// => then continue because we haven't yet seen a Put/Delete.
assert(!operands_.empty()); // Should have at least one element in it
// keep queuing keys and operands until we either meet a put / delete
//
// Keep queuing keys and operands until we either meet a put / delete
// request or later did a partial merge.
Slice value_slice = iter->value();
// add an operand to the list if:
// 1) it's included in one of the snapshots. in that case we *must* write
// it out, no matter what compaction filter says
// 2) it's not filtered by a compaction filter
if (ikey.sequence <= latest_snapshot_ ||
!FilterMerge(orig_ikey.user_key, value_slice)) {
if (original_key_is_iter) {
// this is just an optimization that saves us one memcpy
keys_.push_front(std::move(original_key));
} else {
keys_.push_front(iter->key().ToString());
operands_.push_front(iter->value().ToString());
}
if (keys_.size() == 1) {
// we need to re-anchor the orig_ikey because it was anchored by
// original_key before
ParseInternalKey(keys_.back(), &orig_ikey);
}
operands_.push_front(value_slice.ToString());
}
}
}
if (operands_.size() == 0) {
// we filtered out all the merge operands
return Status::OK();
}
// We are sure we have seen this key's entire history if we are at the
// last level and exhausted all internal keys of this user key.
// NOTE: !iter->Valid() does not necessarily mean we hit the
@ -179,11 +220,13 @@ Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before,
assert(operands_.size() == keys_.size());
std::string merge_result;
s = TimedFullMerge(orig_ikey.user_key, nullptr, operands_,
user_merge_operator_, stats, env_, logger_,
user_merge_operator_, stats_, env_, logger_,
&merge_result);
if (s.ok()) {
// The original key encountered
std::string original_key = std::move(keys_.back());
// We are certain that keys_ is not empty here (see assertions couple of
// lines before).
original_key = std::move(keys_.back());
orig_ikey.type = kTypeValue;
UpdateInternalKey(&original_key, orig_ikey.sequence, orig_ikey.type);
keys_.clear();
@ -205,14 +248,14 @@ Status MergeHelper::MergeUntil(Iterator* iter, const SequenceNumber stop_before,
bool merge_success = false;
std::string merge_result;
{
StopWatchNano timer(env_, stats != nullptr);
StopWatchNano timer(env_, stats_ != nullptr);
PERF_TIMER_GUARD(merge_operator_time_nanos);
merge_success = user_merge_operator_->PartialMergeMulti(
orig_ikey.user_key,
std::deque<Slice>(operands_.begin(), operands_.end()),
&merge_result, logger_);
RecordTick(stats, MERGE_OPERATION_TOTAL_TIME,
env_ != nullptr ? timer.ElapsedNanos() : 0);
RecordTick(stats_, MERGE_OPERATION_TOTAL_TIME,
timer.ElapsedNanosSafe());
}
if (merge_success) {
// Merging of operands (associative merge) was successful.
@ -236,7 +279,6 @@ MergeOutputIterator::MergeOutputIterator(const MergeHelper* merge_helper)
void MergeOutputIterator::SeekToFirst() {
const auto& keys = merge_helper_->keys();
const auto& values = merge_helper_->values();
assert(keys.size() > 0);
assert(keys.size() == values.size());
it_keys_ = keys.rbegin();
it_values_ = values.rbegin();
@ -247,4 +289,17 @@ void MergeOutputIterator::Next() {
++it_values_;
}
bool MergeHelper::FilterMerge(const Slice& user_key, const Slice& value_slice) {
if (compaction_filter_ == nullptr) {
return false;
}
if (stats_ != nullptr) {
filter_timer_.Start();
}
bool to_delete =
compaction_filter_->FilterMergeOperand(level_, user_key, value_slice);
total_filter_time_ += filter_timer_.ElapsedNanosSafe();
return to_delete;
}
} // namespace rocksdb

@ -10,8 +10,10 @@
#include <string>
#include "db/dbformat.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/env.h"
#include "rocksdb/slice.h"
#include "util/stop_watch.h"
namespace rocksdb {
@ -23,17 +25,26 @@ class Statistics;
class MergeHelper {
public:
MergeHelper(const Comparator* user_comparator,
const MergeOperator* user_merge_operator, Logger* logger,
MergeHelper(Env* env, const Comparator* user_comparator,
const MergeOperator* user_merge_operator,
const CompactionFilter* compaction_filter, Logger* logger,
unsigned min_partial_merge_operands,
bool assert_valid_internal_key)
: user_comparator_(user_comparator),
bool assert_valid_internal_key, SequenceNumber latest_snapshot,
int level = 0, Statistics* stats = nullptr)
: env_(env),
user_comparator_(user_comparator),
user_merge_operator_(user_merge_operator),
compaction_filter_(compaction_filter),
logger_(logger),
min_partial_merge_operands_(min_partial_merge_operands),
assert_valid_internal_key_(assert_valid_internal_key),
latest_snapshot_(latest_snapshot),
level_(level),
keys_(),
operands_() {
operands_(),
filter_timer_(env_),
total_filter_time_(0U),
stats_(stats) {
assert(user_comparator_ != nullptr);
}
@ -62,6 +73,7 @@ class MergeHelper {
// 0 means no restriction
// at_bottom: (IN) true if the iterator covers the bottem level, which means
// we could reach the start of the history of this user key.
//
// Returns one of the following statuses:
// - OK: Entries were successfully merged.
// - MergeInProgress: Put/Delete not encountered and unable to merge operands.
@ -71,8 +83,11 @@ class MergeHelper {
//
// REQUIRED: The first key in the input is not corrupted.
Status MergeUntil(Iterator* iter, const SequenceNumber stop_before = 0,
const bool at_bottom = false, Statistics* stats = nullptr,
Env* env_ = nullptr);
const bool at_bottom = false);
// Filters a merge operand using the compaction filter specified
// in the constructor. Returns true if the operand should be filtered out.
bool FilterMerge(const Slice& user_key, const Slice& value_slice);
// Query the merge result
// These are valid until the next MergeUntil call
@ -101,19 +116,28 @@ class MergeHelper {
// TODO: Re-style this comment to be like the first one
const std::deque<std::string>& keys() const { return keys_; }
const std::deque<std::string>& values() const { return operands_; }
uint64_t TotalFilterTime() const { return total_filter_time_; }
bool HasOperator() const { return user_merge_operator_ != nullptr; }
private:
Env* env_;
const Comparator* user_comparator_;
const MergeOperator* user_merge_operator_;
const CompactionFilter* compaction_filter_;
Logger* logger_;
unsigned min_partial_merge_operands_;
bool assert_valid_internal_key_; // enforce no internal key corruption?
SequenceNumber latest_snapshot_;
int level_;
// the scratch area that holds the result of MergeUntil
// valid up to the next MergeUntil call
std::deque<std::string> keys_; // Keeps track of the sequence of keys seen
std::deque<std::string> operands_; // Parallel with keys_; stores the values
StopWatchNano filter_timer_;
uint64_t total_filter_time_;
Statistics* stats_;
};
// MergeOutputIterator can be used to iterate over the result of a merge.

@ -18,26 +18,18 @@ namespace rocksdb {
class MergeHelperTest : public testing::Test {
public:
MergeHelperTest() = default;
~MergeHelperTest() = default;
MergeHelperTest() { env_ = Env::Default(); }
Status RunUInt64MergeHelper(SequenceNumber stop_before, bool at_bottom) {
InitIterator();
merge_op_ = MergeOperators::CreateUInt64AddOperator();
merge_helper_.reset(new MergeHelper(BytewiseComparator(), merge_op_.get(),
nullptr, 2U, false));
return merge_helper_->MergeUntil(iter_.get(), stop_before, at_bottom,
nullptr, Env::Default());
}
~MergeHelperTest() = default;
Status RunStringAppendMergeHelper(SequenceNumber stop_before,
bool at_bottom) {
InitIterator();
merge_op_ = MergeOperators::CreateStringAppendTESTOperator();
merge_helper_.reset(new MergeHelper(BytewiseComparator(), merge_op_.get(),
nullptr, 2U, false));
return merge_helper_->MergeUntil(iter_.get(), stop_before, at_bottom,
nullptr, Env::Default());
Status Run(SequenceNumber stop_before, bool at_bottom,
SequenceNumber latest_snapshot = 0) {
iter_.reset(new test::VectorIterator(ks_, vs_));
iter_->SeekToFirst();
merge_helper_.reset(new MergeHelper(env_, BytewiseComparator(),
merge_op_.get(), filter_.get(), nullptr,
2U, false, latest_snapshot));
return merge_helper_->MergeUntil(iter_.get(), stop_before, at_bottom);
}
void AddKeyVal(const std::string& user_key, const SequenceNumber& seq,
@ -51,66 +43,63 @@ class MergeHelperTest : public testing::Test {
vs_.push_back(val);
}
void InitIterator() {
iter_.reset(new test::VectorIterator(ks_, vs_));
iter_->SeekToFirst();
}
std::string EncodeInt(uint64_t x) {
std::string result;
PutFixed64(&result, x);
return result;
}
Env* env_;
std::unique_ptr<test::VectorIterator> iter_;
std::shared_ptr<MergeOperator> merge_op_;
std::unique_ptr<MergeHelper> merge_helper_;
std::vector<std::string> ks_;
std::vector<std::string> vs_;
std::unique_ptr<test::FilterNumber> filter_;
};
// If MergeHelper encounters a new key on the last level, we know that
// the key has no more history and it can merge keys.
TEST_F(MergeHelperTest, MergeAtBottomSuccess) {
AddKeyVal("a", 20, kTypeMerge, EncodeInt(1U));
AddKeyVal("a", 10, kTypeMerge, EncodeInt(3U));
AddKeyVal("b", 10, kTypeMerge, EncodeInt(4U)); // <- Iterator after merge
merge_op_ = MergeOperators::CreateUInt64AddOperator();
AddKeyVal("a", 20, kTypeMerge, test::EncodeInt(1U));
AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(3U));
AddKeyVal("b", 10, kTypeMerge, test::EncodeInt(4U)); // <- iter_ after merge
ASSERT_TRUE(RunUInt64MergeHelper(0, true).ok());
ASSERT_TRUE(Run(0, true).ok());
ASSERT_EQ(ks_[2], iter_->key());
ASSERT_EQ(test::KeyStr("a", 20, kTypeValue), merge_helper_->keys()[0]);
ASSERT_EQ(EncodeInt(4U), merge_helper_->values()[0]);
ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
ASSERT_EQ(1U, merge_helper_->keys().size());
ASSERT_EQ(1U, merge_helper_->values().size());
}
// Merging with a value results in a successful merge.
TEST_F(MergeHelperTest, MergeValue) {
AddKeyVal("a", 40, kTypeMerge, EncodeInt(1U));
AddKeyVal("a", 30, kTypeMerge, EncodeInt(3U));
AddKeyVal("a", 20, kTypeValue, EncodeInt(4U)); // <- Iterator after merge
AddKeyVal("a", 10, kTypeMerge, EncodeInt(1U));
merge_op_ = MergeOperators::CreateUInt64AddOperator();
AddKeyVal("a", 40, kTypeMerge, test::EncodeInt(1U));
AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
AddKeyVal("a", 20, kTypeValue, test::EncodeInt(4U)); // <- iter_ after merge
AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(1U));
ASSERT_TRUE(RunUInt64MergeHelper(0, false).ok());
ASSERT_TRUE(Run(0, false).ok());
ASSERT_EQ(ks_[3], iter_->key());
ASSERT_EQ(test::KeyStr("a", 40, kTypeValue), merge_helper_->keys()[0]);
ASSERT_EQ(EncodeInt(8U), merge_helper_->values()[0]);
ASSERT_EQ(test::EncodeInt(8U), merge_helper_->values()[0]);
ASSERT_EQ(1U, merge_helper_->keys().size());
ASSERT_EQ(1U, merge_helper_->values().size());
}
// Merging stops before a snapshot.
TEST_F(MergeHelperTest, SnapshotBeforeValue) {
AddKeyVal("a", 50, kTypeMerge, EncodeInt(1U));
AddKeyVal("a", 40, kTypeMerge, EncodeInt(3U)); // <- Iterator after merge
AddKeyVal("a", 30, kTypeMerge, EncodeInt(1U));
AddKeyVal("a", 20, kTypeValue, EncodeInt(4U));
AddKeyVal("a", 10, kTypeMerge, EncodeInt(1U));
merge_op_ = MergeOperators::CreateUInt64AddOperator();
AddKeyVal("a", 50, kTypeMerge, test::EncodeInt(1U));
AddKeyVal("a", 40, kTypeMerge, test::EncodeInt(3U)); // <- iter_ after merge
AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(1U));
AddKeyVal("a", 20, kTypeValue, test::EncodeInt(4U));
AddKeyVal("a", 10, kTypeMerge, test::EncodeInt(1U));
ASSERT_TRUE(RunUInt64MergeHelper(31, true).IsMergeInProgress());
ASSERT_TRUE(Run(31, true).IsMergeInProgress());
ASSERT_EQ(ks_[2], iter_->key());
ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[0]);
ASSERT_EQ(EncodeInt(4U), merge_helper_->values()[0]);
ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
ASSERT_EQ(1U, merge_helper_->keys().size());
ASSERT_EQ(1U, merge_helper_->values().size());
}
@ -118,11 +107,13 @@ TEST_F(MergeHelperTest, SnapshotBeforeValue) {
// MergeHelper preserves the operand stack for merge operators that
// cannot do a partial merge.
TEST_F(MergeHelperTest, NoPartialMerge) {
merge_op_ = MergeOperators::CreateStringAppendTESTOperator();
AddKeyVal("a", 50, kTypeMerge, "v2");
AddKeyVal("a", 40, kTypeMerge, "v"); // <- Iterator after merge
AddKeyVal("a", 40, kTypeMerge, "v"); // <- iter_ after merge
AddKeyVal("a", 30, kTypeMerge, "v");
ASSERT_TRUE(RunStringAppendMergeHelper(31, true).IsMergeInProgress());
ASSERT_TRUE(Run(31, true).IsMergeInProgress());
ASSERT_EQ(ks_[2], iter_->key());
ASSERT_EQ(test::KeyStr("a", 40, kTypeMerge), merge_helper_->keys()[0]);
ASSERT_EQ("v", merge_helper_->values()[0]);
@ -134,44 +125,162 @@ TEST_F(MergeHelperTest, NoPartialMerge) {
// A single operand can not be merged.
TEST_F(MergeHelperTest, SingleOperand) {
AddKeyVal("a", 50, kTypeMerge, EncodeInt(1U));
merge_op_ = MergeOperators::CreateUInt64AddOperator();
AddKeyVal("a", 50, kTypeMerge, test::EncodeInt(1U));
ASSERT_TRUE(RunUInt64MergeHelper(31, true).IsMergeInProgress());
ASSERT_TRUE(Run(31, true).IsMergeInProgress());
ASSERT_FALSE(iter_->Valid());
ASSERT_EQ(test::KeyStr("a", 50, kTypeMerge), merge_helper_->keys()[0]);
ASSERT_EQ(EncodeInt(1U), merge_helper_->values()[0]);
ASSERT_EQ(test::EncodeInt(1U), merge_helper_->values()[0]);
ASSERT_EQ(1U, merge_helper_->keys().size());
ASSERT_EQ(1U, merge_helper_->values().size());
}
// Merging with a deletion turns the deletion into a value
TEST_F(MergeHelperTest, MergeDeletion) {
AddKeyVal("a", 30, kTypeMerge, EncodeInt(3U));
merge_op_ = MergeOperators::CreateUInt64AddOperator();
AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
AddKeyVal("a", 20, kTypeDeletion, "");
ASSERT_TRUE(RunUInt64MergeHelper(15, false).ok());
ASSERT_TRUE(Run(15, false).ok());
ASSERT_FALSE(iter_->Valid());
ASSERT_EQ(test::KeyStr("a", 30, kTypeValue), merge_helper_->keys()[0]);
ASSERT_EQ(EncodeInt(3U), merge_helper_->values()[0]);
ASSERT_EQ(test::EncodeInt(3U), merge_helper_->values()[0]);
ASSERT_EQ(1U, merge_helper_->keys().size());
ASSERT_EQ(1U, merge_helper_->values().size());
}
// The merge helper stops upon encountering a corrupt key
TEST_F(MergeHelperTest, CorruptKey) {
AddKeyVal("a", 30, kTypeMerge, EncodeInt(3U));
AddKeyVal("a", 25, kTypeMerge, EncodeInt(1U));
merge_op_ = MergeOperators::CreateUInt64AddOperator();
AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(1U));
// Corrupt key
AddKeyVal("a", 20, kTypeDeletion, "", true); // <- Iterator after merge
AddKeyVal("a", 20, kTypeDeletion, "", true); // <- iter_ after merge
ASSERT_TRUE(RunUInt64MergeHelper(15, false).IsMergeInProgress());
ASSERT_TRUE(Run(15, false).IsMergeInProgress());
ASSERT_EQ(ks_[2], iter_->key());
ASSERT_EQ(test::KeyStr("a", 30, kTypeMerge), merge_helper_->keys()[0]);
ASSERT_EQ(EncodeInt(4U), merge_helper_->values()[0]);
ASSERT_EQ(test::EncodeInt(4U), merge_helper_->values()[0]);
ASSERT_EQ(1U, merge_helper_->keys().size());
ASSERT_EQ(1U, merge_helper_->values().size());
}
// The compaction filter is called on every merge operand
TEST_F(MergeHelperTest, FilterMergeOperands) {
merge_op_ = MergeOperators::CreateUInt64AddOperator();
filter_.reset(new test::FilterNumber(5U));
AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(3U));
AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(5U)); // Filtered
AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(3U));
AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(1U));
AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U)); // Filtered
AddKeyVal("a", 25, kTypeValue, test::EncodeInt(1U));
ASSERT_TRUE(Run(15, false).ok());
ASSERT_FALSE(iter_->Valid());
MergeOutputIterator merge_output_iter(merge_helper_.get());
merge_output_iter.SeekToFirst();
ASSERT_EQ(test::KeyStr("a", 30, kTypeValue),
merge_output_iter.key().ToString());
ASSERT_EQ(test::EncodeInt(8U), merge_output_iter.value().ToString());
merge_output_iter.Next();
ASSERT_FALSE(merge_output_iter.Valid());
}
TEST_F(MergeHelperTest, FilterAllMergeOperands) {
merge_op_ = MergeOperators::CreateUInt64AddOperator();
filter_.reset(new test::FilterNumber(5U));
AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));
AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(5U));
AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(5U));
AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(5U));
AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));
AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));
// filtered out all
ASSERT_TRUE(Run(15, false).ok());
ASSERT_FALSE(iter_->Valid());
MergeOutputIterator merge_output_iter(merge_helper_.get());
merge_output_iter.SeekToFirst();
ASSERT_FALSE(merge_output_iter.Valid());
// we have one operand that will survive because it's a delete
AddKeyVal("a", 24, kTypeDeletion, test::EncodeInt(5U));
AddKeyVal("b", 23, kTypeValue, test::EncodeInt(5U));
ASSERT_TRUE(Run(15, true).ok());
merge_output_iter = MergeOutputIterator(merge_helper_.get());
ASSERT_TRUE(iter_->Valid());
merge_output_iter.SeekToFirst();
ASSERT_FALSE(merge_output_iter.Valid());
// when all merge operands are filtered out, we leave the iterator pointing to
// the Put/Delete that survived
ASSERT_EQ(test::KeyStr("a", 24, kTypeDeletion), iter_->key().ToString());
ASSERT_EQ(test::EncodeInt(5U), iter_->value().ToString());
}
// Make sure that merge operands are filtered at the beginning
TEST_F(MergeHelperTest, FilterFirstMergeOperand) {
merge_op_ = MergeOperators::CreateUInt64AddOperator();
filter_.reset(new test::FilterNumber(5U));
AddKeyVal("a", 31, kTypeMerge, test::EncodeInt(5U)); // Filtered
AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U)); // Filtered
AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(2U));
AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(1U));
AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(3U));
AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U)); // Filtered
AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U)); // Filtered
AddKeyVal("b", 24, kTypeValue, test::EncodeInt(5U)); // next user key
ASSERT_OK(Run(15, true));
ASSERT_TRUE(iter_->Valid());
MergeOutputIterator merge_output_iter(merge_helper_.get());
merge_output_iter.SeekToFirst();
// sequence number is 29 here, because the first merge operand got filtered
// out
ASSERT_EQ(test::KeyStr("a", 29, kTypeValue),
merge_output_iter.key().ToString());
ASSERT_EQ(test::EncodeInt(6U), merge_output_iter.value().ToString());
merge_output_iter.Next();
ASSERT_FALSE(merge_output_iter.Valid());
// make sure that we're passing user keys into the filter
ASSERT_EQ("a", filter_->last_merge_operand_key());
}
// Make sure that merge operands are not filtered out if there's a snapshot
// pointing at them
TEST_F(MergeHelperTest, DontFilterMergeOperandsBeforeSnapshotTest) {
merge_op_ = MergeOperators::CreateUInt64AddOperator();
filter_.reset(new test::FilterNumber(5U));
AddKeyVal("a", 31, kTypeMerge, test::EncodeInt(5U));
AddKeyVal("a", 30, kTypeMerge, test::EncodeInt(5U));
AddKeyVal("a", 29, kTypeMerge, test::EncodeInt(2U));
AddKeyVal("a", 28, kTypeMerge, test::EncodeInt(1U));
AddKeyVal("a", 27, kTypeMerge, test::EncodeInt(3U));
AddKeyVal("a", 26, kTypeMerge, test::EncodeInt(5U));
AddKeyVal("a", 25, kTypeMerge, test::EncodeInt(5U));
AddKeyVal("b", 24, kTypeValue, test::EncodeInt(5U));
ASSERT_OK(Run(15, true, 32));
ASSERT_TRUE(iter_->Valid());
MergeOutputIterator merge_output_iter(merge_helper_.get());
merge_output_iter.SeekToFirst();
ASSERT_EQ(test::KeyStr("a", 31, kTypeValue),
merge_output_iter.key().ToString());
ASSERT_EQ(test::EncodeInt(26U), merge_output_iter.value().ToString());
merge_output_iter.Next();
ASSERT_FALSE(merge_output_iter.Valid());
}
} // namespace rocksdb
int main(int argc, char** argv) {

@ -290,9 +290,10 @@ class Repairer {
ro.total_order_seek = true;
Arena arena;
ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
status = BuildTable(dbname_, env_, ioptions_, env_options_, table_cache_,
iter.get(), &meta, icmp_,
&int_tbl_prop_collector_factories_, {},
status = BuildTable(
dbname_, env_, ioptions_, env_options_, table_cache_, iter.get(),
&meta, icmp_, &int_tbl_prop_collector_factories_,
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily, {},
kNoCompression, CompressionOptions(), false, nullptr);
}
delete mem->Unref();

@ -41,7 +41,8 @@ class IntTblPropCollectorFactory {
public:
virtual ~IntTblPropCollectorFactory() {}
// has to be thread-safe
virtual IntTblPropCollector* CreateIntTblPropCollector() = 0;
virtual IntTblPropCollector* CreateIntTblPropCollector(
uint32_t column_family_id) = 0;
// The name of the properties collector can be used for debugging purpose.
virtual const char* Name() const = 0;
@ -69,7 +70,8 @@ class InternalKeyPropertiesCollector : public IntTblPropCollector {
class InternalKeyPropertiesCollectorFactory
: public IntTblPropCollectorFactory {
public:
virtual IntTblPropCollector* CreateIntTblPropCollector() override {
virtual IntTblPropCollector* CreateIntTblPropCollector(
uint32_t column_family_id) override {
return new InternalKeyPropertiesCollector();
}
@ -114,9 +116,12 @@ class UserKeyTablePropertiesCollectorFactory
explicit UserKeyTablePropertiesCollectorFactory(
std::shared_ptr<TablePropertiesCollectorFactory> user_collector_factory)
: user_collector_factory_(user_collector_factory) {}
virtual IntTblPropCollector* CreateIntTblPropCollector() override {
virtual IntTblPropCollector* CreateIntTblPropCollector(
uint32_t column_family_id) override {
TablePropertiesCollectorFactory::Context context;
context.column_family_id = column_family_id;
return new UserKeyTablePropertiesCollector(
user_collector_factory_->CreateTablePropertiesCollector());
user_collector_factory_->CreateTablePropertiesCollector(context));
}
virtual const char* Name() const override {

@ -35,6 +35,8 @@ class TablePropertiesTest : public testing::Test,
// Utilities test functions
namespace {
static const uint32_t kTestColumnFamilyId = 66;
void MakeBuilder(const Options& options, const ImmutableCFOptions& ioptions,
const InternalKeyComparator& internal_comparator,
const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
@ -46,7 +48,8 @@ void MakeBuilder(const Options& options, const ImmutableCFOptions& ioptions,
builder->reset(NewTableBuilder(
ioptions, internal_comparator, int_tbl_prop_collector_factories,
writable->get(), options.compression, options.compression_opts));
kTestColumnFamilyId /* column_family_id */, writable->get(),
options.compression, options.compression_opts));
}
} // namespace
@ -178,14 +181,17 @@ class RegularKeysStartWithAFactory : public IntTblPropCollectorFactory,
public:
explicit RegularKeysStartWithAFactory(bool backward_mode)
: backward_mode_(backward_mode) {}
virtual TablePropertiesCollector* CreateTablePropertiesCollector() override {
virtual TablePropertiesCollector* CreateTablePropertiesCollector(
TablePropertiesCollectorFactory::Context context) override {
EXPECT_EQ(kTestColumnFamilyId, context.column_family_id);
if (!backward_mode_) {
return new RegularKeysStartWithA();
} else {
return new RegularKeysStartWithABackwardCompatible();
}
}
virtual IntTblPropCollector* CreateIntTblPropCollector() override {
virtual IntTblPropCollector* CreateIntTblPropCollector(
uint32_t column_family_id) override {
return new RegularKeysStartWithAInternal();
}
const char* Name() const override { return "RegularKeysStartWithA"; }

@ -12,6 +12,7 @@
#include "db/version_set.h"
#include "util/coding.h"
#include "util/event_logger.h"
#include "util/sync_point.h"
#include "rocksdb/slice.h"
namespace rocksdb {
@ -32,12 +33,22 @@ enum Tag {
// these are new formats divergent from open source leveldb
kNewFile2 = 100,
kNewFile3 = 102,
kNewFile4 = 103, // 4th (the latest) format version of adding files
kColumnFamily = 200, // specify column family for version edit
kColumnFamilyAdd = 201,
kColumnFamilyDrop = 202,
kMaxColumnFamily = 203,
};
enum CustomTag {
kTerminate = 1, // The end of customized fields
kNeedCompaction = 2,
kPathId = 65,
};
// If this bit for the custom tag is set, opening DB should fail if
// we don't know this field.
uint32_t kCustomTagNonSafeIgnoreMask = 1 << 6;
uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id) {
assert(number <= kFileNumberMask);
return number | (path_id * (kFileNumberMask + 1));
@ -102,7 +113,11 @@ bool VersionEdit::EncodeTo(std::string* dst) const {
if (!f.smallest.Valid() || !f.largest.Valid()) {
return false;
}
if (f.fd.GetPathId() == 0) {
bool has_customized_fields = false;
if (f.marked_for_compaction) {
PutVarint32(dst, kNewFile4);
has_customized_fields = true;
} else if (f.fd.GetPathId() == 0) {
// Use older format to make sure user can roll back the build if they
// don't config multiple DB paths.
PutVarint32(dst, kNewFile2);
@ -111,7 +126,8 @@ bool VersionEdit::EncodeTo(std::string* dst) const {
}
PutVarint32(dst, new_files_[i].first); // level
PutVarint64(dst, f.fd.GetNumber());
if (f.fd.GetPathId() != 0) {
if (f.fd.GetPathId() != 0 && !has_customized_fields) {
// kNewFile3
PutVarint32(dst, f.fd.GetPathId());
}
PutVarint64(dst, f.fd.GetFileSize());
@ -119,6 +135,48 @@ bool VersionEdit::EncodeTo(std::string* dst) const {
PutLengthPrefixedSlice(dst, f.largest.Encode());
PutVarint64(dst, f.smallest_seqno);
PutVarint64(dst, f.largest_seqno);
if (has_customized_fields) {
// Customized fields' format:
// +-----------------------------+
// | 1st field's tag (varint32) |
// +-----------------------------+
// | 1st field's size (varint32) |
// +-----------------------------+
// | bytes for 1st field |
// | (based on size decoded) |
// +-----------------------------+
// | |
// | ...... |
// | |
// +-----------------------------+
// | last field's size (varint32)|
// +-----------------------------+
// | bytes for last field |
// | (based on size decoded) |
// +-----------------------------+
// | terminating tag (varint32) |
// +-----------------------------+
//
// Customized encoding for fields:
// tag kPathId: 1 byte as path_id
// tag kNeedCompaction:
// now only can take one char value 1 indicating need-compaction
//
if (f.fd.GetPathId() != 0) {
PutVarint32(dst, CustomTag::kPathId);
char p = static_cast<char>(f.fd.GetPathId());
PutLengthPrefixedSlice(dst, Slice(&p, 1));
}
if (f.marked_for_compaction) {
PutVarint32(dst, CustomTag::kNeedCompaction);
char p = static_cast<char>(1);
PutLengthPrefixedSlice(dst, Slice(&p, 1));
}
TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields",
dst);
PutVarint32(dst, CustomTag::kTerminate);
}
}
// 0 is default and does not need to be explicitly written
@ -161,6 +219,63 @@ bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) {
}
}
const char* VersionEdit::DecodeNewFile4From(Slice* input) {
const char* msg = nullptr;
int level;
FileMetaData f;
uint64_t number;
uint32_t path_id = 0;
uint64_t file_size;
if (GetLevel(input, &level, &msg) && GetVarint64(input, &number) &&
GetVarint64(input, &file_size) && GetInternalKey(input, &f.smallest) &&
GetInternalKey(input, &f.largest) &&
GetVarint64(input, &f.smallest_seqno) &&
GetVarint64(input, &f.largest_seqno)) {
// See comments in VersionEdit::EncodeTo() for format of customized fields
while (true) {
uint32_t custom_tag;
Slice field;
if (!GetVarint32(input, &custom_tag)) {
return "new-file4 custom field";
}
if (custom_tag == kTerminate) {
break;
}
if (!GetLengthPrefixedSlice(input, &field)) {
return "new-file4 custom field lenth prefixed slice error";
}
switch (custom_tag) {
case kPathId:
if (field.size() != 1) {
return "path_id field wrong size";
}
path_id = field[0];
if (path_id > 3) {
return "path_id wrong vaue";
}
break;
case kNeedCompaction:
if (field.size() != 1) {
return "need_compaction field wrong size";
}
f.marked_for_compaction = (field[0] == 1);
break;
default:
if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) {
// Should not proceed if cannot understand it
return "new-file4 custom field not supported";
}
break;
}
}
} else {
return "new-file4 entry";
}
f.fd = FileDescriptor(number, path_id, file_size);
new_files_.push_back(std::make_pair(level, f));
return nullptr;
}
Status VersionEdit::DecodeFrom(const Slice& src) {
Clear();
Slice input = src;
@ -304,6 +419,11 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
break;
}
case kNewFile4: {
msg = DecodeNewFile4From(&input);
break;
}
case kColumnFamily:
if (!GetVarint32(&input, &column_family_)) {
if (!msg) {

@ -237,6 +237,8 @@ class VersionEdit {
bool EncodeTo(std::string* dst) const;
Status DecodeFrom(const Slice& src);
const char* DecodeNewFile4From(Slice* input);
typedef std::set<std::pair<int, uint64_t>> DeletedFileSet;
const DeletedFileSet& GetDeletedFiles() { return deleted_files_; }

@ -8,6 +8,7 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/version_edit.h"
#include "util/sync_point.h"
#include "util/testharness.h"
namespace rocksdb {
@ -45,6 +46,121 @@ TEST_F(VersionEditTest, EncodeDecode) {
TestEncodeDecode(edit);
}
TEST_F(VersionEditTest, EncodeDecodeNewFile4) {
static const uint64_t kBig = 1ull << 50;
VersionEdit edit;
edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
kBig + 600, true);
edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
kBig + 601, false);
edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue),
InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502,
kBig + 602, true);
edit.DeleteFile(4, 700);
edit.SetComparatorName("foo");
edit.SetLogNumber(kBig + 100);
edit.SetNextFile(kBig + 200);
edit.SetLastSequence(kBig + 1000);
TestEncodeDecode(edit);
std::string encoded, encoded2;
edit.EncodeTo(&encoded);
VersionEdit parsed;
Status s = parsed.DecodeFrom(encoded);
ASSERT_TRUE(s.ok()) << s.ToString();
auto& new_files = parsed.GetNewFiles();
ASSERT_TRUE(new_files[0].second.marked_for_compaction);
ASSERT_TRUE(!new_files[1].second.marked_for_compaction);
ASSERT_TRUE(new_files[2].second.marked_for_compaction);
ASSERT_EQ(3, new_files[0].second.fd.GetPathId());
ASSERT_EQ(3, new_files[1].second.fd.GetPathId());
ASSERT_EQ(0, new_files[2].second.fd.GetPathId());
}
TEST_F(VersionEditTest, ForwardCompatibleNewFile4) {
static const uint64_t kBig = 1ull << 50;
VersionEdit edit;
edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
kBig + 600, true);
edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue),
InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501,
kBig + 601, false);
edit.DeleteFile(4, 700);
edit.SetComparatorName("foo");
edit.SetLogNumber(kBig + 100);
edit.SetNextFile(kBig + 200);
edit.SetLastSequence(kBig + 1000);
std::string encoded;
// Call back function to add extra customized builds.
bool first = true;
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"VersionEdit::EncodeTo:NewFile4:CustomizeFields", [&](void* arg) {
std::string* str = reinterpret_cast<std::string*>(arg);
PutVarint32(str, 33);
const std::string str1 = "random_string";
PutLengthPrefixedSlice(str, str1);
if (first) {
first = false;
PutVarint32(str, 22);
const std::string str2 = "s";
PutLengthPrefixedSlice(str, str2);
}
});
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
edit.EncodeTo(&encoded);
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
VersionEdit parsed;
Status s = parsed.DecodeFrom(encoded);
ASSERT_TRUE(s.ok()) << s.ToString();
ASSERT_TRUE(!first);
auto& new_files = parsed.GetNewFiles();
ASSERT_TRUE(new_files[0].second.marked_for_compaction);
ASSERT_TRUE(!new_files[1].second.marked_for_compaction);
ASSERT_EQ(3, new_files[0].second.fd.GetPathId());
ASSERT_EQ(3, new_files[1].second.fd.GetPathId());
ASSERT_EQ(1u, parsed.GetDeletedFiles().size());
}
TEST_F(VersionEditTest, NewFile4NotSupportedField) {
static const uint64_t kBig = 1ull << 50;
VersionEdit edit;
edit.AddFile(3, 300, 3, 100, InternalKey("foo", kBig + 500, kTypeValue),
InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500,
kBig + 600, true);
edit.SetComparatorName("foo");
edit.SetLogNumber(kBig + 100);
edit.SetNextFile(kBig + 200);
edit.SetLastSequence(kBig + 1000);
std::string encoded;
// Call back function to add extra customized builds.
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"VersionEdit::EncodeTo:NewFile4:CustomizeFields", [&](void* arg) {
std::string* str = reinterpret_cast<std::string*>(arg);
const std::string str1 = "s";
PutLengthPrefixedSlice(str, str1);
});
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
edit.EncodeTo(&encoded);
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
VersionEdit parsed;
Status s = parsed.DecodeFrom(encoded);
ASSERT_NOK(s);
}
TEST_F(VersionEditTest, EncodeEmptyFile) {
VersionEdit edit;
edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false);

@ -666,6 +666,7 @@ class VersionSet {
Status GetMetadataForFile(uint64_t number, int* filelevel,
FileMetaData** metadata, ColumnFamilyData** cfd);
// This function doesn't support leveldb SST filenames
void GetLiveFilesMetaData(std::vector<LiveFileMetaData> *metadata);
void GetObsoleteFiles(std::vector<FileMetaData*>* files,

@ -1,6 +1,7 @@
column_families_example
simple_example
c_simple_example
column_families_example
compact_files_example
transaction_example
compaction_filter_example
optimistic_transaction_example
simple_example
transaction_example

@ -10,6 +10,9 @@ simple_example: librocksdb simple_example.cc
column_families_example: librocksdb column_families_example.cc
$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
compaction_filter_example: librocksdb compaction_filter_example.cc
$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
compact_files_example: librocksdb compact_files_example.cc
$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
@ -26,7 +29,7 @@ transaction_example: librocksdb transaction_example.cc
$(CXX) $(CXXFLAGS) $@.cc -o$@ ../librocksdb.a -I../include -O2 -std=c++11 $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
clean:
rm -rf ./simple_example ./column_families_example ./compact_files_example ./c_simple_example c_simple_example.o ./optimistic_transaction_example ./transaction_example
rm -rf ./simple_example ./column_families_example ./compact_files_example ./compaction_filter_example ./c_simple_example c_simple_example.o ./optimistic_transaction_example ./transaction_example
librocksdb:
cd .. && $(MAKE) librocksdb.a

@ -0,0 +1,84 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include <rocksdb/compaction_filter.h>
#include <rocksdb/db.h>
#include <rocksdb/merge_operator.h>
#include <rocksdb/options.h>
class MyMerge : public rocksdb::MergeOperator {
public:
bool FullMerge(const rocksdb::Slice& key,
const rocksdb::Slice* existing_value,
const std::deque<std::string>& operand_list,
std::string* new_value,
rocksdb::Logger* logger) const override {
new_value->clear();
if (existing_value != nullptr) {
new_value->assign(existing_value->data(), existing_value->size());
}
for (const std::string& m : operand_list) {
fprintf(stderr, "Merge(%s)\n", m.c_str());
assert(m != "bad"); // the compaction filter filters out bad values
new_value->assign(m);
}
return true;
}
const char* Name() const override { return "MyMerge"; }
};
class MyFilter : public rocksdb::CompactionFilter {
public:
bool Filter(int level, const rocksdb::Slice& key,
const rocksdb::Slice& existing_value, std::string* new_value,
bool* value_changed) const override {
fprintf(stderr, "Filter(%s)\n", key.ToString().c_str());
++count_;
assert(*value_changed == false);
return false;
}
bool FilterMergeOperand(int level, const rocksdb::Slice& key,
const rocksdb::Slice& existing_value) const override {
fprintf(stderr, "FilterMerge(%s)\n", key.ToString().c_str());
++merge_count_;
return existing_value == "bad";
}
const char* Name() const override { return "MyFilter"; }
mutable int count_ = 0;
mutable int merge_count_ = 0;
};
int main() {
rocksdb::DB* raw_db;
rocksdb::Status status;
MyFilter filter;
system("rm -rf /tmp/rocksmergetest");
rocksdb::Options options;
options.create_if_missing = true;
options.merge_operator.reset(new MyMerge);
options.compaction_filter = &filter;
status = rocksdb::DB::Open(options, "/tmp/rocksmergetest", &raw_db);
assert(status.ok());
std::unique_ptr<rocksdb::DB> db(raw_db);
rocksdb::WriteOptions wopts;
db->Merge(wopts, "0", "bad"); // This is filtered out
db->Merge(wopts, "1", "data1");
db->Merge(wopts, "1", "bad");
db->Merge(wopts, "1", "data2");
db->Merge(wopts, "1", "bad");
db->Merge(wopts, "3", "data3");
db->CompactRange(rocksdb::CompactRangeOptions(), nullptr, nullptr);
fprintf(stderr, "filter.count_ = %d\n", filter.count_);
assert(filter.count_ == 1);
fprintf(stderr, "filter.merge_count_ = %d\n", filter.merge_count_);
assert(filter.merge_count_ == 5);
}

@ -29,25 +29,167 @@
#
# Below is an example of a RocksDB options file:
[Version]
# The Version section stores the version information about rocksdb
# and option file. This is used for handling potential format
# change in the future.
rocksdb_version=4.0.0 # We support "#" style comment.
options_file_version=1.0
rocksdb_version=4.0.0
options_file_version=1.1
[DBOptions]
# Followed by the Version section is the DBOptions section.
# The value of an options can be assigned using a statement.
# Note that for those options that is not set in the options file,
# we will use the default value.
max_open_files=12345
max_background_flushes=301
stats_dump_period_sec=600
max_manifest_file_size=18446744073709551615
bytes_per_sync=0
delayed_write_rate=1048576
WAL_ttl_seconds=0
WAL_size_limit_MB=0
max_subcompactions=1
wal_dir=
wal_bytes_per_sync=0
db_write_buffer_size=0
max_total_wal_size=0
skip_stats_update_on_db_open=false
max_open_files=5000
max_file_opening_threads=1
use_fsync=false
max_background_compactions=1
manifest_preallocation_size=4194304
max_background_flushes=1
is_fd_close_on_exec=true
create_if_missing=false
use_adaptive_mutex=false
enable_thread_tracking=false
disableDataSync=false
max_log_file_size=0
advise_random_on_open=true
create_missing_column_families=false
keep_log_file_num=1000
table_cache_numshardbits=4
error_if_exists=false
skip_log_error_on_recovery=false
allow_os_buffer=true
allow_mmap_reads=false
paranoid_checks=true
delete_obsolete_files_period_micros=21600000000
disable_data_sync=false
log_file_time_to_roll=0
compaction_readahead_size=0
db_log_dir=
new_table_reader_for_compaction_inputs=false
allow_mmap_writes=false
[CFOptions "default"]
# ColumnFamilyOptions section must follow the format of
# [CFOptions "cf name"]. If a rocksdb instance
# has multiple column families, then its CFOptions must be
# specified in the same order as column family creation order.
[CFOptions "the second column family"]
# Each column family must have one section in the RocksDB option
# file even all the options of this column family are set to
# default value.
[CFOptions "the third column family"]
compaction_style=kCompactionStyleLevel
compaction_filter=nullptr
num_levels=7
table_factory=BlockBasedTable
comparator=leveldb.BytewiseComparator
max_sequential_skip_in_iterations=8
soft_rate_limit=0.000000
max_bytes_for_level_base=536870912
memtable_prefix_bloom_probes=6
memtable_prefix_bloom_bits=0
memtable_prefix_bloom_huge_page_tlb_size=0
max_successive_merges=0
arena_block_size=0
min_write_buffer_number_to_merge=2
target_file_size_multiplier=1
source_compaction_factor=1
max_bytes_for_level_multiplier=10
compaction_filter_factory=nullptr
max_write_buffer_number=6
level0_stop_writes_trigger=24
compression=kSnappyCompression
level0_file_num_compaction_trigger=2
purge_redundant_kvs_while_flush=true
max_write_buffer_number_to_maintain=0
memtable_factory=SkipListFactory
max_grandparent_overlap_factor=10
expanded_compaction_factor=25
hard_pending_compaction_bytes_limit=0
inplace_update_num_locks=10000
level_compaction_dynamic_level_bytes=false
level0_slowdown_writes_trigger=20
filter_deletes=false
verify_checksums_in_compaction=true
min_partial_merge_operands=2
paranoid_file_checks=false
target_file_size_base=67108864
optimize_filters_for_hits=false
merge_operator=nullptr
compression_per_level=kNoCompression:kNoCompression:kSnappyCompression:kSnappyCompression:kSnappyCompression:kSnappyCompression:kSnappyCompression
compaction_measure_io_stats=false
prefix_extractor=nullptr
bloom_locality=0
write_buffer_size=134217728
disable_auto_compactions=false
inplace_update_support=false
[TableOptions/BlockBasedTable "default"]
format_version=0
whole_key_filtering=true
block_size_deviation=10
block_size=4096
block_restart_interval=16
filter_policy=nullptr
no_block_cache=false
checksum=kCRC32c
cache_index_and_filter_blocks=false
index_type=kBinarySearch
hash_index_allow_collision=true
flush_block_policy_factory=FlushBlockBySizePolicyFactory
[CFOptions "universal"]
compaction_style=kCompactionStyleUniversal
compaction_filter=nullptr
num_levels=7
table_factory=BlockBasedTable
comparator=leveldb.BytewiseComparator
max_sequential_skip_in_iterations=8
soft_rate_limit=0.000000
max_bytes_for_level_base=10485760
memtable_prefix_bloom_probes=6
memtable_prefix_bloom_bits=0
memtable_prefix_bloom_huge_page_tlb_size=0
max_successive_merges=0
arena_block_size=0
min_write_buffer_number_to_merge=2
target_file_size_multiplier=1
source_compaction_factor=1
max_bytes_for_level_multiplier=10
compaction_filter_factory=nullptr
max_write_buffer_number=6
level0_stop_writes_trigger=24
compression=kSnappyCompression
level0_file_num_compaction_trigger=4
purge_redundant_kvs_while_flush=true
max_write_buffer_number_to_maintain=0
memtable_factory=SkipListFactory
max_grandparent_overlap_factor=10
expanded_compaction_factor=25
hard_pending_compaction_bytes_limit=0
inplace_update_num_locks=10000
level_compaction_dynamic_level_bytes=false
level0_slowdown_writes_trigger=20
filter_deletes=false
verify_checksums_in_compaction=true
min_partial_merge_operands=2
paranoid_file_checks=false
target_file_size_base=2097152
optimize_filters_for_hits=false
merge_operator=nullptr
compression_per_level=
compaction_measure_io_stats=false
prefix_extractor=nullptr
bloom_locality=0
write_buffer_size=134217728
disable_auto_compactions=false
inplace_update_support=false
[TableOptions/BlockBasedTable "universal"]
format_version=0
whole_key_filtering=true
block_size_deviation=10
block_size=4096
block_restart_interval=16
filter_policy=nullptr
no_block_cache=false
checksum=kCRC32c
cache_index_and_filter_blocks=false
index_type=kBinarySearch
hash_index_allow_collision=true
flush_block_policy_factory=FlushBlockBySizePolicyFactory

@ -104,6 +104,9 @@ class Cache {
// returns the memory size for the entries residing in the cache.
virtual size_t GetUsage() const = 0;
// returns the memory size for a specific entry in the cache.
virtual size_t GetUsage(Handle* handle) const = 0;
// returns the memory size for the entries in use by the system
virtual size_t GetPinnedUsage() const = 0;

@ -39,6 +39,8 @@ class CompactionFilter {
// Is this compaction requested by the client (true),
// or is it occurring as an automatic compaction process
bool is_manual_compaction;
// Which column family this compaction is for.
uint32_t column_family_id;
};
virtual ~CompactionFilter() {}
@ -51,10 +53,24 @@ class CompactionFilter {
// output of the compaction. The application can inspect
// the existing value of the key and make decision based on it.
//
// Key-Values that are results of merge operation during compaction are not
// passed into this function. Currently, when you have a mix of Put()s and
// Merge()s on a same key, we only guarantee to process the merge operands
// through the compaction filters. Put()s might be processed, or might not.
//
// When the value is to be preserved, the application has the option
// to modify the existing_value and pass it back through new_value.
// value_changed needs to be set to true in this case.
//
// If you use snapshot feature of RocksDB (i.e. call GetSnapshot() API on a
// DB* object), CompactionFilter might not be very useful for you. Due to
// guarantees we need to maintain, compaction process will not call Filter()
// on any keys that were written before the latest snapshot. In other words,
// compaction will only call Filter() on keys written after your most recent
// call to GetSnapshot(). In most cases, Filter() will not be called very
// often. This is something we're fixing. See the discussion at:
// https://www.facebook.com/groups/mysqlonrocksdb/permalink/999723240091865/
//
// If multithreaded compaction is being used *and* a single CompactionFilter
// instance was supplied via Options::compaction_filter, this method may be
// called from different threads concurrently. The application must ensure
@ -64,44 +80,23 @@ class CompactionFilter {
// be used by a single thread that is doing the compaction run, and this
// call does not need to be thread-safe. However, multiple filters may be
// in existence and operating concurrently.
//
// The last paragraph is not true if you set max_subcompactions to more than
// 1. In that case, subcompaction from multiple threads may call a single
// CompactionFilter concurrently.
virtual bool Filter(int level,
const Slice& key,
const Slice& existing_value,
std::string* new_value,
bool* value_changed) const = 0;
// Returns a name that identifies this compaction filter.
// The name will be printed to LOG file on start up for diagnosis.
virtual const char* Name() const = 0;
};
// CompactionFilterV2 that buffers kv pairs sharing the same prefix and let
// application layer to make individual decisions for all the kv pairs in the
// buffer.
class CompactionFilterV2 {
public:
virtual ~CompactionFilterV2() {}
// The compaction process invokes this method for all the kv pairs
// sharing the same prefix. It is a "roll-up" version of CompactionFilter.
//
// Each entry in the return vector indicates if the corresponding kv should
// be preserved in the output of this compaction run. The application can
// inspect the existing values of the keys and make decision based on it.
//
// When a value is to be preserved, the application has the option
// to modify the entry in existing_values and pass it back through an entry
// in new_values. A corresponding values_changed entry needs to be set to
// true in this case. Note that the new_values vector contains only changed
// values, i.e. new_values.size() <= values_changed.size().
//
typedef std::vector<Slice> SliceVector;
virtual std::vector<bool> Filter(int level,
const SliceVector& keys,
const SliceVector& existing_values,
std::vector<std::string>* new_values,
std::vector<bool>* values_changed)
const = 0;
// The compaction process invokes this method on every merge operand. If this
// method returns true, the merge operand will be ignored and not written out
// in the compaction output
virtual bool FilterMergeOperand(int level, const Slice& key,
const Slice& operand) const {
return false;
}
// Returns a name that identifies this compaction filter.
// The name will be printed to LOG file on start up for diagnosis.
@ -135,65 +130,6 @@ class DefaultCompactionFilterFactory : public CompactionFilterFactory {
}
};
// Each compaction will create a new CompactionFilterV2
//
// CompactionFilterFactoryV2 enables application to specify a prefix and use
// CompactionFilterV2 to filter kv-pairs in batches. Each batch contains all
// the kv-pairs sharing the same prefix.
//
// This is useful for applications that require grouping kv-pairs in
// compaction filter to make a purge/no-purge decision. For example, if the
// key prefix is user id and the rest of key represents the type of value.
// This batching filter will come in handy if the application's compaction
// filter requires knowledge of all types of values for any user id.
//
class CompactionFilterFactoryV2 {
public:
// NOTE: CompactionFilterFactoryV2 will not delete prefix_extractor
explicit CompactionFilterFactoryV2(const SliceTransform* prefix_extractor)
: prefix_extractor_(prefix_extractor) { }
virtual ~CompactionFilterFactoryV2() { }
virtual std::unique_ptr<CompactionFilterV2> CreateCompactionFilterV2(
const CompactionFilterContext& context) = 0;
// Returns a name that identifies this compaction filter factory.
virtual const char* Name() const = 0;
const SliceTransform* GetPrefixExtractor() const {
return prefix_extractor_;
}
void SetPrefixExtractor(const SliceTransform* prefix_extractor) {
prefix_extractor_ = prefix_extractor;
}
private:
// Prefix extractor for compaction filter v2
// Keys sharing the same prefix will be buffered internally.
// Client can implement a Filter callback function to operate on the buffer
const SliceTransform* prefix_extractor_;
};
// Default implementation of CompactionFilterFactoryV2 which does not
// return any filter
class DefaultCompactionFilterFactoryV2 : public CompactionFilterFactoryV2 {
public:
explicit DefaultCompactionFilterFactoryV2()
: CompactionFilterFactoryV2(nullptr) { }
virtual std::unique_ptr<CompactionFilterV2>
CreateCompactionFilterV2(
const CompactionFilterContext& context) override {
return std::unique_ptr<CompactionFilterV2>(nullptr);
}
virtual const char* Name() const override {
return "DefaultCompactionFilterFactoryV2";
}
};
} // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_

@ -40,7 +40,8 @@ Status GetDBOptionsFromMap(
Status GetBlockBasedTableOptionsFromMap(
const BlockBasedTableOptions& table_options,
const std::unordered_map<std::string, std::string>& opts_map,
BlockBasedTableOptions* new_table_options);
BlockBasedTableOptions* new_table_options,
bool input_strings_escaped = false);
// Take a string representation of option names and values, apply them into the
// base_options, and return the new options as a result. The string has the

@ -0,0 +1,45 @@
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#ifndef ROCKSDB_LITE
#include <string>
#include "rocksdb/db.h"
namespace rocksdb {
struct DumpOptions {
// Database that will be dumped
std::string db_path;
// File location that will contain dump output
std::string dump_location;
// Dont include db information header in the dump
bool anonymous = false;
};
class DbDumpTool {
public:
bool Run(const DumpOptions& dump_options,
rocksdb::Options options = rocksdb::Options());
};
struct UndumpOptions {
// Database that we will load the dumped file into
std::string db_path;
// File location of the dumped file that will be loaded
std::string dump_location;
// Compact the db after loading the dumped file
bool compact_db = false;
};
class DbUndumpTool {
public:
bool Run(const UndumpOptions& undump_options,
rocksdb::Options options = rocksdb::Options());
};
} // namespace rocksdb
#endif // ROCKSDB_LITE

@ -68,6 +68,9 @@ struct EnvOptions {
// If true, then use mmap to write data
bool use_mmap_writes = true;
// If false, fallocate() calls are bypassed
bool allow_fallocate = true;
// If true, set the FD_CLOEXEC on open fd.
bool set_fd_cloexec = true;

@ -4,7 +4,9 @@
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "rocksdb/compaction_job_stats.h"
#include "rocksdb/status.h"
@ -12,6 +14,9 @@
namespace rocksdb {
typedef std::unordered_map<std::string, std::shared_ptr<const TableProperties>>
TablePropertiesCollection;
class DB;
class Status;
struct CompactionJobStats;
@ -72,6 +77,8 @@ struct FlushJobInfo {
SequenceNumber smallest_seqno;
// The largest sequence number in the newly created file
SequenceNumber largest_seqno;
// Table properties of the table being flushed
TableProperties table_properties;
};
struct CompactionJobInfo {
@ -93,8 +100,13 @@ struct CompactionJobInfo {
int output_level;
// the names of the compaction input files.
std::vector<std::string> input_files;
// the names of the compaction output files.
std::vector<std::string> output_files;
// Table properties for input and output tables.
// The map is keyed by values from input_files and output_files.
TablePropertiesCollection table_properties;
// If non-null, this variable stores detailed information
// about this compaction.
CompactionJobStats stats;

@ -30,7 +30,6 @@ namespace rocksdb {
class Cache;
class CompactionFilter;
class CompactionFilterFactory;
class CompactionFilterFactoryV2;
class Comparator;
class Env;
enum InfoLogLevel : unsigned char;
@ -227,10 +226,6 @@ struct ColumnFamilyOptions {
// Default: nullptr
std::shared_ptr<CompactionFilterFactory> compaction_filter_factory;
// This is deprecated. Talk to us if you depend on
// compaction_filter_factory_v2 and we'll put it back
// std::shared_ptr<CompactionFilterFactoryV2> compaction_filter_factory_v2;
// -------------------
// Parameters that affect performance
@ -1007,6 +1002,9 @@ struct DBOptions {
// Default: false
bool allow_mmap_writes;
// If false, fallocate() calls are bypassed
bool allow_fallocate;
// Disable child process inherit open files. Default: true
bool is_fd_close_on_exec;
@ -1145,9 +1143,7 @@ struct DBOptions {
// Options to control the behavior of a database (passed to DB::Open)
struct Options : public DBOptions, public ColumnFamilyOptions {
// Create an Options object with default values for all fields.
Options() :
DBOptions(),
ColumnFamilyOptions() {}
Options() : DBOptions(), ColumnFamilyOptions() {}
Options(const DBOptions& db_options,
const ColumnFamilyOptions& column_family_options)

@ -83,6 +83,14 @@ struct PerfContext {
uint64_t block_seek_nanos;
// Time spent on finding or creating a table reader
uint64_t find_table_nanos;
// total number of mem table bloom hits
uint64_t bloom_memtable_hit_count;
// total number of mem table bloom misses
uint64_t bloom_memtable_miss_count;
// total number of SST table bloom hits
uint64_t bloom_sst_hit_count;
// total number of SST table bloom misses
uint64_t bloom_sst_miss_count;
};
#if defined(NPERF_CONTEXT) || defined(IOS_CROSS_COMPILE)

@ -45,6 +45,10 @@ enum Tickers : uint32_t {
BLOCK_CACHE_DATA_MISS,
// # of times cache hit when accessing data block from block cache.
BLOCK_CACHE_DATA_HIT,
// # of bytes read from cache.
BLOCK_CACHE_BYTES_READ,
// # of bytes written into cache.
BLOCK_CACHE_BYTES_WRITE,
// # of times bloom filter has avoided file reads.
BLOOM_FILTER_USEFUL,
@ -177,6 +181,8 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
{BLOCK_CACHE_FILTER_HIT, "rocksdb.block.cache.filter.hit"},
{BLOCK_CACHE_DATA_MISS, "rocksdb.block.cache.data.miss"},
{BLOCK_CACHE_DATA_HIT, "rocksdb.block.cache.data.hit"},
{BLOCK_CACHE_BYTES_READ, "rocksdb.block.cache.bytes.read"},
{BLOCK_CACHE_BYTES_WRITE, "rocksdb.block.cache.bytes.write"},
{BLOOM_FILTER_USEFUL, "rocksdb.bloom.filter.useful"},
{MEMTABLE_HIT, "rocksdb.memtable.hit"},
{MEMTABLE_MISS, "rocksdb.memtable.miss"},

@ -375,7 +375,7 @@ class TableFactory {
// to use in this table.
virtual TableBuilder* NewTableBuilder(
const TableBuilderOptions& table_builder_options,
WritableFileWriter* file) const = 0;
uint32_t column_family_id, WritableFileWriter* file) const = 0;
// Sanitizes the specified DB Options and ColumnFamilyOptions.
//

@ -56,6 +56,7 @@ struct TableProperties {
// user collected properties
UserCollectedProperties user_collected_properties;
UserCollectedProperties readable_properties;
// convert this object to a human readable form
// @prop_delim: delimiter for each property.
@ -144,9 +145,15 @@ class TablePropertiesCollector {
// TablePropertiesCollector for each new table
class TablePropertiesCollectorFactory {
public:
struct Context {
uint32_t column_family_id;
static const uint32_t kUnknownColumnFamily;
};
virtual ~TablePropertiesCollectorFactory() {}
// has to be thread-safe
virtual TablePropertiesCollector* CreateTablePropertiesCollector() = 0;
virtual TablePropertiesCollector* CreateTablePropertiesCollector(
TablePropertiesCollectorFactory::Context context) = 0;
// The name of the properties collector can be used for debugging purpose.
virtual const char* Name() const = 0;

@ -61,10 +61,30 @@ class Transaction {
// methods. See Transaction::Get() for more details.
virtual void SetSnapshot() = 0;
// Similar to SetSnapshot(), but will not change the current snapshot
// until Put/Merge/Delete/GetForUpdate/MultigetForUpdate is called.
// By calling this function, the transaction will essentially call
// SetSnapshot() for you right before performing the next write/GetForUpdate.
//
// Calling SetSnapshotOnNextOperation() will not affect what snapshot is
// returned by GetSnapshot() until the next write/GetForUpdate is executed.
//
// This is an optimization to reduce the likelyhood of conflicts that
// could occur in between the time SetSnapshot() is called and the first
// write/GetForUpdate operation. Eg, this prevents the following
// race-condition:
//
// txn1->SetSnapshot();
// txn2->Put("A", ...);
// txn2->Commit();
// txn1->GetForUpdate(opts, "A", ...); // FAIL!
virtual void SetSnapshotOnNextOperation() = 0;
// Returns the Snapshot created by the last call to SetSnapshot().
//
// REQUIRED: The returned Snapshot is only valid up until the next time
// SetSnapshot() is called or the Transaction is deleted.
// SetSnapshot()/SetSnapshotOnNextSavePoint() is called or the Transaction
// is deleted.
virtual const Snapshot* GetSnapshot() const = 0;
// Write all batched keys to the db atomically.
@ -263,6 +283,21 @@ class Transaction {
// Similar to WriteBatch::PutLogData
virtual void PutLogData(const Slice& blob) = 0;
// By default, all Put/Merge/Delete operations will be indexed in the
// transaction so that Get/GetForUpdate/GetIterator can search for these
// keys.
//
// If the caller does not want to fetch the keys about to be written,
// they may want to avoid indexing as a performance optimization.
// Calling DisableIndexing() will turn off indexing for all future
// Put/Merge/Delete operations until EnableIndexing() is called.
//
// If a key is Put/Merge/Deleted after DisableIndexing is called and then
// is fetched via Get/GetForUpdate/GetIterator, the result of the fetch is
// undefined.
virtual void DisableIndexing() = 0;
virtual void EnableIndexing() = 0;
// Returns the number of distinct Keys being tracked by this transaction.
// If this transaction was created by a TransactinDB, this is the number of
// keys that are currently locked by this transaction.

@ -5,7 +5,7 @@
#pragma once
#define ROCKSDB_MAJOR 4
#define ROCKSDB_MINOR 0
#define ROCKSDB_MINOR 1
#define ROCKSDB_PATCH 0
// Do not use these. We made the mistake of declaring macros starting with

@ -2,14 +2,16 @@
# install all required packages for rocksdb that are available through yum
ARCH=$(uname -i)
sudo yum -y install openssl java-1.7.0-openjdk-devel.$ARCH zlib zlib-devel bzip2 bzip2-devel
sudo yum -y install epel-release-5-4.noarch
sudo yum -y install snappy snappy-devel
# install gcc/g++ 4.8.2 via CERN (http://linux.web.cern.ch/linux/devtoolset/)
sudo wget -O /etc/yum.repos.d/slc5-devtoolset.repo http://linuxsoft.cern.ch/cern/devtoolset/slc5-devtoolset.repo
sudo wget -O /etc/pki/rpm-gpg/RPM-GPG-KEY-cern http://ftp.mirrorservice.org/sites/ftp.scientificlinux.org/linux/scientific/51/i386/RPM-GPG-KEYs/RPM-GPG-KEY-cern
sudo yum -y install devtoolset-2
wget http://gflags.googlecode.com/files/gflags-1.6.tar.gz
tar xvfz gflags-1.6.tar.gz; cd gflags-1.6; scl enable devtoolset-2 ./configure; scl enable devtoolset-2 make; sudo make install
wget http://gflags.googlecode.com/files/gflags-2.0-no-svn-files.tar.gz
tar xvfz gflags-2.0-no-svn-files.tar.gz; cd gflags-2.0; scl enable devtoolset-2 ./configure; scl enable devtoolset-2 make; sudo make install
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
# set java home so we can build rocksdb jars
@ -18,7 +20,7 @@ export JAVA_HOME=/usr/lib/jvm/java-1.7.0
# build rocksdb
cd /rocksdb
scl enable devtoolset-2 'make jclean clean'
scl enable devtoolset-2 'make rocksdbjavastatic'
scl enable devtoolset-2 'PORTABLE=1 make rocksdbjavastatic'
cp /rocksdb/java/target/librocksdbjni-* /rocksdb-build
cp /rocksdb/java/target/rocksdbjni-* /rocksdb-build

@ -233,13 +233,14 @@ class WinMmapReadableFile : public RandomAccessFile {
char* scratch) const override {
Status s;
if (offset + n > length_) {
if (offset > length_) {
*result = Slice();
s = IOError(fileName_, EINVAL);
} else {
return IOError(fileName_, EINVAL);
} else if (offset + n > length_) {
n = length_ - offset;
}
*result =
Slice(reinterpret_cast<const char*>(mapped_region_) + offset, n);
}
return s;
}

@ -43,53 +43,31 @@ void gettimeofday(struct timeval* tv, struct timezone* /* tz */) {
tv->tv_usec = usNow.count() - duration_cast<microseconds>(secNow).count();
}
Mutex::Mutex(bool adaptive) : lock(m_mutex, std::defer_lock) {}
Mutex::~Mutex() {}
void Mutex::Lock() {
lock.lock();
#ifndef NDEBUG
locked_ = true;
#endif
}
void Mutex::Unlock() {
#ifndef NDEBUG
locked_ = false;
#endif
lock.unlock();
}
void Mutex::AssertHeld() {
#ifndef NDEBUG
assert(locked_);
#endif
}
CondVar::CondVar(Mutex* mu) : mu_(mu) {}
CondVar::~CondVar() {}
void CondVar::Wait() {
// Caller must ensure that mutex is held prior to calling this method
std::unique_lock<std::mutex> lk(mu_->getLock(), std::adopt_lock);
#ifndef NDEBUG
mu_->locked_ = false;
#endif
cv_.wait(mu_->getLock());
cv_.wait(lk);
#ifndef NDEBUG
mu_->locked_ = true;
#endif
// Release ownership of the lock as we don't want it to be unlocked when
// it goes out of scope (as we adopted the lock and didn't lock it ourselves)
lk.release();
}
bool CondVar::TimedWait(uint64_t abs_time_us) {
#ifndef NDEBUG
mu_->locked_ = false;
#endif
using namespace std::chrono;
// MSVC++ library implements wait_until in terms of wait_for so
// there is not an absolute wait anyway.
// we need to convert absoulte wait into relative wait.
microseconds usAbsTime(abs_time_us);
microseconds usNow(
@ -97,11 +75,18 @@ bool CondVar::TimedWait(uint64_t abs_time_us) {
microseconds relTimeUs =
(usAbsTime > usNow) ? (usAbsTime - usNow) : microseconds::zero();
std::cv_status cvStatus = cv_.wait_for(mu_->getLock(), relTimeUs);
// Caller must ensure that mutex is held prior to calling this method
std::unique_lock<std::mutex> lk(mu_->getLock(), std::adopt_lock);
#ifndef NDEBUG
mu_->locked_ = false;
#endif
std::cv_status cvStatus = cv_.wait_for(lk, relTimeUs);
#ifndef NDEBUG
mu_->locked_ = true;
#endif
// Release ownership of the lock as we don't want it to be unlocked when
// it goes out of scope (as we adopted the lock and didn't lock it ourselves)
lk.release();
if (cvStatus == std::cv_status::timeout) {
return true;

@ -113,29 +113,50 @@ class CondVar;
class Mutex {
public:
/* implicit */ Mutex(bool adaptive = false);
/* implicit */ Mutex(bool adaptive = false) : locked_(false) {
}
~Mutex();
void Lock();
void Unlock();
void Lock() {
mutex_.lock();
#ifndef NDEBUG
locked_ = true;
#endif
}
void Unlock() {
#ifndef NDEBUG
locked_ = false;
#endif
mutex_.unlock();
}
// this will assert if the mutex is not locked
// it does NOT verify that mutex is held by a calling thread
void AssertHeld();
void AssertHeld() {
#ifndef NDEBUG
assert(locked_);
#endif
}
std::unique_lock<std::mutex>& getLock() { return lock; }
// Mutex is move only with lock ownership transfer
Mutex(const Mutex&) = delete;
void operator=(const Mutex&) = delete;
private:
friend class CondVar;
std::mutex m_mutex;
std::unique_lock<std::mutex> lock;
std::mutex& getLock() {
return mutex_;
}
std::mutex mutex_;
#ifndef NDEBUG
bool locked_;
#endif
// No copying
Mutex(const Mutex&);
void operator=(const Mutex&);
};
class RWMutex {
@ -162,13 +183,22 @@ class RWMutex {
class CondVar {
public:
explicit CondVar(Mutex* mu);
explicit CondVar(Mutex* mu) : mu_(mu) {
}
~CondVar();
void Wait();
bool TimedWait(uint64_t expiration_time);
void Signal();
void SignalAll();
// Condition var is not copy/move constructible
CondVar(const CondVar&) = delete;
CondVar& operator=(const CondVar&) = delete;
CondVar(CondVar&&) = delete;
CondVar& operator=(CondVar&&) = delete;
private:
std::condition_variable cv_;
Mutex* mu_;

@ -76,6 +76,7 @@ LIB_SOURCES = \
table/plain_table_reader.cc \
table/table_properties.cc \
table/two_level_iterator.cc \
tools/dump/db_dump_tool.cc \
util/arena.cc \
util/auto_roll_logger.cc \
util/bloom.cc \
@ -204,6 +205,7 @@ TEST_BENCH_SOURCES = \
db/prefix_test.cc \
db/skiplist_test.cc \
db/table_properties_collector_test.cc \
db/db_test_util.cc \
db/version_builder_test.cc \
db/version_edit_test.cc \
db/version_set_test.cc \
@ -232,7 +234,6 @@ TEST_BENCH_SOURCES = \
util/cache_test.cc \
util/coding_test.cc \
util/crc32c_test.cc \
util/db_test_util.cc \
util/dynamic_bloom_test.cc \
util/env_test.cc \
util/filelock_test.cc \

@ -66,9 +66,10 @@ Status AdaptiveTableFactory::NewTableReader(
}
TableBuilder* AdaptiveTableFactory::NewTableBuilder(
const TableBuilderOptions& table_builder_options,
const TableBuilderOptions& table_builder_options, uint32_t column_family_id,
WritableFileWriter* file) const {
return table_factory_to_write_->NewTableBuilder(table_builder_options, file);
return table_factory_to_write_->NewTableBuilder(table_builder_options,
column_family_id, file);
}
std::string AdaptiveTableFactory::GetPrintableTableOptions() const {

@ -40,7 +40,7 @@ class AdaptiveTableFactory : public TableFactory {
TableBuilder* NewTableBuilder(
const TableBuilderOptions& table_builder_options,
WritableFileWriter* file) const override;
uint32_t column_family_id, WritableFileWriter* file) const override;
// Sanitizes the specified DB Options.
Status SanitizeOptions(const DBOptions& db_opts,

@ -7,12 +7,13 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <algorithm>
#include "table/block_based_filter_block.h"
#include <algorithm>
#include "db/dbformat.h"
#include "rocksdb/filter_policy.h"
#include "util/coding.h"
#include "util/perf_context_imp.h"
#include "util/string_util.h"
namespace rocksdb {
@ -219,7 +220,14 @@ bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry,
uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4);
if (start <= limit && limit <= (uint32_t)(offset_ - data_)) {
Slice filter = Slice(data_ + start, limit - start);
return policy_->KeyMayMatch(entry, filter);
bool const may_match = policy_->KeyMayMatch(entry, filter);
if (may_match) {
PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
return true;
} else {
PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
return false;
}
} else if (start == limit) {
// Empty filters do not match any entries
return false;

@ -474,7 +474,8 @@ struct BlockBasedTableBuilder::Rep {
const InternalKeyComparator& icomparator,
const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
int_tbl_prop_collector_factories,
WritableFileWriter* f, const CompressionType _compression_type,
uint32_t column_family_id, WritableFileWriter* f,
const CompressionType _compression_type,
const CompressionOptions& _compression_opts, const bool skip_filters)
: ioptions(_ioptions),
table_options(table_opt),
@ -494,7 +495,7 @@ struct BlockBasedTableBuilder::Rep {
table_options, data_block)) {
for (auto& collector_factories : *int_tbl_prop_collector_factories) {
table_properties_collectors.emplace_back(
collector_factories->CreateIntTblPropCollector());
collector_factories->CreateIntTblPropCollector(column_family_id));
}
table_properties_collectors.emplace_back(
new BlockBasedTablePropertiesCollector(
@ -509,7 +510,8 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
const InternalKeyComparator& internal_comparator,
const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
int_tbl_prop_collector_factories,
WritableFileWriter* file, const CompressionType compression_type,
uint32_t column_family_id, WritableFileWriter* file,
const CompressionType compression_type,
const CompressionOptions& compression_opts, const bool skip_filters) {
BlockBasedTableOptions sanitized_table_options(table_options);
if (sanitized_table_options.format_version == 0 &&
@ -523,8 +525,8 @@ BlockBasedTableBuilder::BlockBasedTableBuilder(
}
rep_ = new Rep(ioptions, sanitized_table_options, internal_comparator,
int_tbl_prop_collector_factories, file, compression_type,
compression_opts, skip_filters);
int_tbl_prop_collector_factories, column_family_id, file,
compression_type, compression_opts, skip_filters);
if (rep_->filter_block != nullptr) {
rep_->filter_block->StartBlock(0);
@ -871,8 +873,9 @@ TableProperties BlockBasedTableBuilder::GetTableProperties() const {
TableProperties ret = rep_->props;
for (const auto& collector : rep_->table_properties_collectors) {
for (const auto& prop : collector->GetReadableProperties()) {
ret.user_collected_properties.insert(prop);
ret.readable_properties.insert(prop);
}
collector->Finish(&ret.user_collected_properties);
}
return ret;
}

@ -40,7 +40,8 @@ class BlockBasedTableBuilder : public TableBuilder {
const InternalKeyComparator& internal_comparator,
const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
int_tbl_prop_collector_factories,
WritableFileWriter* file, const CompressionType compression_type,
uint32_t column_family_id, WritableFileWriter* file,
const CompressionType compression_type,
const CompressionOptions& compression_opts, const bool skip_filters);
// REQUIRES: Either Finish() or Abandon() has been called.

@ -61,13 +61,13 @@ Status BlockBasedTableFactory::NewTableReader(
}
TableBuilder* BlockBasedTableFactory::NewTableBuilder(
const TableBuilderOptions& table_builder_options,
const TableBuilderOptions& table_builder_options, uint32_t column_family_id,
WritableFileWriter* file) const {
auto table_builder = new BlockBasedTableBuilder(
table_builder_options.ioptions, table_options_,
table_builder_options.internal_comparator,
table_builder_options.int_tbl_prop_collector_factories, file,
table_builder_options.compression_type,
table_builder_options.int_tbl_prop_collector_factories, column_family_id,
file, table_builder_options.compression_type,
table_builder_options.compression_opts,
table_builder_options.skip_filters);

@ -48,7 +48,7 @@ class BlockBasedTableFactory : public TableFactory {
TableBuilder* NewTableBuilder(
const TableBuilderOptions& table_builder_options,
WritableFileWriter* file) const override;
uint32_t column_family_id, WritableFileWriter* file) const override;
// Sanitizes the specified DB Options.
Status SanitizeOptions(const DBOptions& db_opts,

@ -117,6 +117,9 @@ Cache::Handle* GetEntryFromCache(Cache* block_cache, const Slice& key,
PERF_COUNTER_ADD(block_cache_hit_count, 1);
// overall cache hit
RecordTick(statistics, BLOCK_CACHE_HIT);
// total bytes read from cache
RecordTick(statistics, BLOCK_CACHE_BYTES_READ,
block_cache->GetUsage(cache_handle));
// block-type specific cache hit
RecordTick(statistics, block_cache_hit_ticker);
} else {
@ -795,6 +798,8 @@ Status BlockBasedTable::PutDataBlockToCache(
block->value->usable_size(),
&DeleteCachedEntry<Block>);
RecordTick(statistics, BLOCK_CACHE_ADD);
RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE,
block->value->usable_size());
assert(reinterpret_cast<Block*>(block_cache->Value(block->cache_handle)) ==
block->value);
}
@ -886,6 +891,7 @@ BlockBasedTable::CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
cache_handle = block_cache->Insert(key, filter, filter_size,
&DeleteCachedEntry<FilterBlockReader>);
RecordTick(statistics, BLOCK_CACHE_ADD);
RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE, filter_size);
}
}
@ -944,6 +950,8 @@ Iterator* BlockBasedTable::NewIndexIterator(const ReadOptions& read_options,
block_cache->Insert(key, index_reader, index_reader->usable_size(),
&DeleteCachedEntry<IndexReader>);
RecordTick(statistics, BLOCK_CACHE_ADD);
RecordTick(statistics, BLOCK_CACHE_BYTES_WRITE,
index_reader->usable_size());
}
assert(cache_handle);

@ -27,7 +27,7 @@ Status CuckooTableFactory::NewTableReader(
}
TableBuilder* CuckooTableFactory::NewTableBuilder(
const TableBuilderOptions& table_builder_options,
const TableBuilderOptions& table_builder_options, uint32_t column_family_id,
WritableFileWriter* file) const {
// Ignore the skipFIlters flag. Does not apply to this file format
//

@ -62,7 +62,7 @@ class CuckooTableFactory : public TableFactory {
TableBuilder* NewTableBuilder(
const TableBuilderOptions& table_builder_options,
WritableFileWriter* file) const override;
uint32_t column_family_id, WritableFileWriter* file) const override;
// Sanitizes the specified DB Options.
Status SanitizeOptions(const DBOptions& db_opts,

@ -8,6 +8,7 @@
#include "rocksdb/filter_policy.h"
#include "port/port.h"
#include "util/coding.h"
#include "util/perf_context_imp.h"
namespace rocksdb {
@ -89,7 +90,13 @@ bool FullFilterBlockReader::PrefixMayMatch(const Slice& prefix,
bool FullFilterBlockReader::MayMatch(const Slice& entry) {
if (contents_.size() != 0) {
return filter_bits_reader_->MayMatch(entry);
if (filter_bits_reader_->MayMatch(entry)) {
PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
return true;
} else {
PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
return false;
}
}
return true; // remain the same with block_based filter
}

@ -92,7 +92,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
user_key_, &value, merge_context_->GetOperands(), value_,
logger_);
RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME,
env_ != nullptr ? timer.ElapsedNanos() : 0);
timer.ElapsedNanosSafe());
}
if (!merge_success) {
RecordTick(statistics_, NUMBER_MERGE_FAILURES);
@ -118,7 +118,7 @@ bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
user_key_, nullptr, merge_context_->GetOperands(), value_,
logger_);
RecordTick(statistics_, MERGE_OPERATION_TOTAL_TIME,
env_ != nullptr ? timer.ElapsedNanos() : 0);
timer.ElapsedNanosSafe());
}
if (!merge_success) {
RecordTick(statistics_, NUMBER_MERGE_FAILURES);

@ -74,7 +74,7 @@ Status MockTableFactory::NewTableReader(
}
TableBuilder* MockTableFactory::NewTableBuilder(
const TableBuilderOptions& table_builder_options,
const TableBuilderOptions& table_builder_options, uint32_t column_family_id,
WritableFileWriter* file) const {
uint32_t id = GetAndWriteNextID(file->writable_file());

@ -151,7 +151,7 @@ class MockTableFactory : public TableFactory {
unique_ptr<TableReader>* table_reader) const override;
TableBuilder* NewTableBuilder(
const TableBuilderOptions& table_builder_options,
WritableFileWriter* file) const override;
uint32_t column_familly_id, WritableFileWriter* file) const override;
// This function will directly create mock table instead of going through
// MockTableBuilder. file_contents has to have a format of <internal_key,

@ -60,10 +60,10 @@ PlainTableBuilder::PlainTableBuilder(
const ImmutableCFOptions& ioptions,
const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
int_tbl_prop_collector_factories,
WritableFileWriter* file, uint32_t user_key_len, EncodingType encoding_type,
size_t index_sparseness, uint32_t bloom_bits_per_key, uint32_t num_probes,
size_t huge_page_tlb_size, double hash_table_ratio,
bool store_index_in_file)
uint32_t column_family_id, WritableFileWriter* file, uint32_t user_key_len,
EncodingType encoding_type, size_t index_sparseness,
uint32_t bloom_bits_per_key, uint32_t num_probes, size_t huge_page_tlb_size,
double hash_table_ratio, bool store_index_in_file)
: ioptions_(ioptions),
bloom_block_(num_probes),
file_(file),
@ -108,7 +108,7 @@ PlainTableBuilder::PlainTableBuilder(
for (auto& collector_factories : *int_tbl_prop_collector_factories) {
table_properties_collectors_.emplace_back(
collector_factories->CreateIntTblPropCollector());
collector_factories->CreateIntTblPropCollector(column_family_id));
}
}

@ -34,11 +34,11 @@ class PlainTableBuilder: public TableBuilder {
const ImmutableCFOptions& ioptions,
const std::vector<std::unique_ptr<IntTblPropCollectorFactory>>*
int_tbl_prop_collector_factories,
WritableFileWriter* file, uint32_t user_key_size,
EncodingType encoding_type, size_t index_sparseness,
uint32_t bloom_bits_per_key, uint32_t num_probes = 6,
size_t huge_page_tlb_size = 0, double hash_table_ratio = 0,
bool store_index_in_file = false);
uint32_t column_family_id, WritableFileWriter* file,
uint32_t user_key_size, EncodingType encoding_type,
size_t index_sparseness, uint32_t bloom_bits_per_key,
uint32_t num_probes = 6, size_t huge_page_tlb_size = 0,
double hash_table_ratio = 0, bool store_index_in_file = false);
// REQUIRES: Either Finish() or Abandon() has been called.
~PlainTableBuilder();

@ -26,7 +26,7 @@ Status PlainTableFactory::NewTableReader(
}
TableBuilder* PlainTableFactory::NewTableBuilder(
const TableBuilderOptions& table_builder_options,
const TableBuilderOptions& table_builder_options, uint32_t column_family_id,
WritableFileWriter* file) const {
// Ignore the skip_filters flag. PlainTable format is optimized for small
// in-memory dbs. The skip_filters optimization is not useful for plain
@ -34,9 +34,10 @@ TableBuilder* PlainTableFactory::NewTableBuilder(
//
return new PlainTableBuilder(
table_builder_options.ioptions,
table_builder_options.int_tbl_prop_collector_factories, file,
user_key_len_, encoding_type_, index_sparseness_, bloom_bits_per_key_, 6,
huge_page_tlb_size_, hash_table_ratio_, store_index_in_file_);
table_builder_options.int_tbl_prop_collector_factories, column_family_id,
file, user_key_len_, encoding_type_, index_sparseness_,
bloom_bits_per_key_, 6, huge_page_tlb_size_, hash_table_ratio_,
store_index_in_file_);
}
std::string PlainTableFactory::GetPrintableTableOptions() const {

@ -159,7 +159,7 @@ class PlainTableFactory : public TableFactory {
unique_ptr<TableReader>* table) const override;
TableBuilder* NewTableBuilder(
const TableBuilderOptions& table_builder_options,
WritableFileWriter* file) const override;
uint32_t column_family_id, WritableFileWriter* file) const override;
std::string GetPrintableTableOptions() const override;

@ -488,7 +488,17 @@ Status PlainTableReader::GetOffset(const Slice& target, const Slice& prefix,
}
bool PlainTableReader::MatchBloom(uint32_t hash) const {
return !enable_bloom_ || bloom_.MayContainHash(hash);
if (!enable_bloom_) {
return true;
}
if (bloom_.MayContainHash(hash)) {
PERF_COUNTER_ADD(bloom_sst_hit_count, 1);
return true;
} else {
PERF_COUNTER_ADD(bloom_sst_miss_count, 1);
return false;
}
}
Status PlainTableReader::Next(PlainTableKeyDecoder* decoder, uint32_t* offset,

@ -57,7 +57,8 @@ class SstFileWriter::SstFileWriterPropertiesCollectorFactory
explicit SstFileWriterPropertiesCollectorFactory(int32_t version)
: version_(version) {}
virtual IntTblPropCollector* CreateIntTblPropCollector() override {
virtual IntTblPropCollector* CreateIntTblPropCollector(
uint32_t column_family_id) override {
return new SstFileWriterPropertiesCollector(version_);
}
@ -117,7 +118,9 @@ Status SstFileWriter::Open(const std::string& file_path) {
r->file_writer.reset(
new WritableFileWriter(std::move(sst_file), r->env_options));
r->builder.reset(r->ioptions.table_factory->NewTableBuilder(
table_builder_options, r->file_writer.get()));
table_builder_options,
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
r->file_writer.get()));
r->file_info.file_path = file_path;
r->file_info.file_size = 0;

@ -12,6 +12,9 @@
namespace rocksdb {
const uint32_t TablePropertiesCollectorFactory::Context::kUnknownColumnFamily =
port::kMaxInt32;
namespace {
void AppendProperty(
std::string& props,

@ -98,7 +98,7 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
TableBuilderOptions(ioptions, ikc, &int_tbl_prop_collector_factories,
CompressionType::kNoCompression,
CompressionOptions(), false),
file_writer.get());
0, file_writer.get());
} else {
s = DB::Open(opts, dbname, &db);
ASSERT_OK(s);

@ -272,6 +272,7 @@ class TableConstructor: public Constructor {
TableBuilderOptions(ioptions, internal_comparator,
&int_tbl_prop_collector_factories,
options.compression, CompressionOptions(), false),
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
file_writer_.get()));
for (const auto kv : kv_map) {
@ -1423,6 +1424,9 @@ class BlockCachePropertiesSnapshot {
filter_block_cache_miss =
statistics->getTickerCount(BLOCK_CACHE_FILTER_MISS);
filter_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_FILTER_HIT);
block_cache_bytes_read = statistics->getTickerCount(BLOCK_CACHE_BYTES_READ);
block_cache_bytes_write =
statistics->getTickerCount(BLOCK_CACHE_BYTES_WRITE);
}
void AssertIndexBlockStat(int64_t expected_index_block_cache_miss,
@ -1453,6 +1457,10 @@ class BlockCachePropertiesSnapshot {
block_cache_hit);
}
int64_t GetCacheBytesRead() { return block_cache_bytes_read; }
int64_t GetCacheBytesWrite() { return block_cache_bytes_write; }
private:
int64_t block_cache_miss = 0;
int64_t block_cache_hit = 0;
@ -1462,6 +1470,8 @@ class BlockCachePropertiesSnapshot {
int64_t data_block_cache_hit = 0;
int64_t filter_block_cache_miss = 0;
int64_t filter_block_cache_hit = 0;
int64_t block_cache_bytes_read = 0;
int64_t block_cache_bytes_write = 0;
};
// Make sure, by default, index/filter blocks were pre-loaded (meaning we won't
@ -1537,12 +1547,17 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
// Since block_cache is disabled, no cache activities will be involved.
unique_ptr<Iterator> iter;
int64_t last_cache_bytes_read = 0;
// At first, no block will be accessed.
{
BlockCachePropertiesSnapshot props(options.statistics.get());
// index will be added to block cache.
props.AssertEqual(1, // index block miss
0, 0, 0);
ASSERT_EQ(props.GetCacheBytesRead(), 0);
ASSERT_EQ(props.GetCacheBytesWrite(),
table_options.block_cache->GetUsage());
last_cache_bytes_read = props.GetCacheBytesRead();
}
// Only index block will be accessed
@ -1554,6 +1569,11 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
// value; other numbers remain the same.
props.AssertEqual(1, 0 + 1, // index block hit
0, 0);
// Cache hit, bytes read from cache should increase
ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read);
ASSERT_EQ(props.GetCacheBytesWrite(),
table_options.block_cache->GetUsage());
last_cache_bytes_read = props.GetCacheBytesRead();
}
// Only data block will be accessed
@ -1562,6 +1582,11 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
BlockCachePropertiesSnapshot props(options.statistics.get());
props.AssertEqual(1, 1, 0 + 1, // data block miss
0);
// Cache miss, Bytes read from cache should not change
ASSERT_EQ(props.GetCacheBytesRead(), last_cache_bytes_read);
ASSERT_EQ(props.GetCacheBytesWrite(),
table_options.block_cache->GetUsage());
last_cache_bytes_read = props.GetCacheBytesRead();
}
// Data block will be in cache
@ -1571,6 +1596,11 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
BlockCachePropertiesSnapshot props(options.statistics.get());
props.AssertEqual(1, 1 + 1, /* index block hit */
1, 0 + 1 /* data block hit */);
// Cache hit, bytes read from cache should increase
ASSERT_GT(props.GetCacheBytesRead(), last_cache_bytes_read);
ASSERT_EQ(props.GetCacheBytesWrite(),
table_options.block_cache->GetUsage());
last_cache_bytes_read = props.GetCacheBytesRead();
}
// release the iterator so that the block cache can reset correctly.
iter.reset();
@ -1587,6 +1617,8 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
BlockCachePropertiesSnapshot props(options.statistics.get());
props.AssertEqual(1, // index block miss
0, 0, 0);
// Cache miss, Bytes read from cache should not change
ASSERT_EQ(props.GetCacheBytesRead(), 0);
}
{
@ -1598,6 +1630,8 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
props.AssertEqual(1 + 1, // index block miss
0, 0, // data block miss
0);
// Cache hit, bytes read from cache should increase
ASSERT_EQ(props.GetCacheBytesRead(), 0);
}
{
@ -1607,6 +1641,8 @@ TEST_F(BlockBasedTableTest, FilterBlockInBlockCache) {
BlockCachePropertiesSnapshot props(options.statistics.get());
props.AssertEqual(2, 0, 0 + 1, // data block miss
0);
// Cache miss, Bytes read from cache should not change
ASSERT_EQ(props.GetCacheBytesRead(), 0);
}
iter.reset();
@ -1789,6 +1825,7 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) {
std::unique_ptr<TableBuilder> builder(factory.NewTableBuilder(
TableBuilderOptions(ioptions, ikc, &int_tbl_prop_collector_factories,
kNoCompression, CompressionOptions(), false),
TablePropertiesCollectorFactory::Context::kUnknownColumnFamily,
file_writer.get()));
for (char c = 'a'; c <= 'z'; ++c) {

@ -0,0 +1,261 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef ROCKSDB_LITE
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include <iostream>
#include "rocksdb/db.h"
#include "rocksdb/db_dump_tool.h"
#include "rocksdb/env.h"
#include "util/coding.h"
namespace rocksdb {
bool DbDumpTool::Run(const DumpOptions& dump_options,
rocksdb::Options options) {
rocksdb::DB* dbptr;
rocksdb::Status status;
std::unique_ptr<rocksdb::WritableFile> dumpfile;
char hostname[1024];
int64_t timesec;
std::string abspath;
char json[4096];
static const char* magicstr = "ROCKDUMP";
static const char versionstr[8] = {0, 0, 0, 0, 0, 0, 0, 1};
rocksdb::Env* env = rocksdb::Env::Default();
// Open the database
options.create_if_missing = false;
status = rocksdb::DB::OpenForReadOnly(options, dump_options.db_path, &dbptr);
if (!status.ok()) {
std::cerr << "Unable to open database '" << dump_options.db_path
<< "' for reading: " << status.ToString() << std::endl;
return false;
}
const std::unique_ptr<rocksdb::DB> db(dbptr);
status = env->NewWritableFile(dump_options.dump_location, &dumpfile,
rocksdb::EnvOptions());
if (!status.ok()) {
std::cerr << "Unable to open dump file '" << dump_options.dump_location
<< "' for writing: " << status.ToString() << std::endl;
return false;
}
rocksdb::Slice magicslice(magicstr, 8);
status = dumpfile->Append(magicslice);
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
return false;
}
rocksdb::Slice versionslice(versionstr, 8);
status = dumpfile->Append(versionslice);
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
return false;
}
if (dump_options.anonymous) {
snprintf(json, sizeof(json), "{}");
} else {
status = env->GetHostName(hostname, sizeof(hostname));
status = env->GetCurrentTime(&timesec);
status = env->GetAbsolutePath(dump_options.db_path, &abspath);
snprintf(json, sizeof(json),
"{ \"database-path\": \"%s\", \"hostname\": \"%s\", "
"\"creation-time\": %" PRIi64 " }",
abspath.c_str(), hostname, timesec);
}
rocksdb::Slice infoslice(json, strlen(json));
char infosize[4];
rocksdb::EncodeFixed32(infosize, (uint32_t)infoslice.size());
rocksdb::Slice infosizeslice(infosize, 4);
status = dumpfile->Append(infosizeslice);
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
return false;
}
status = dumpfile->Append(infoslice);
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
return false;
}
const std::unique_ptr<rocksdb::Iterator> it(
db->NewIterator(rocksdb::ReadOptions()));
for (it->SeekToFirst(); it->Valid(); it->Next()) {
char keysize[4];
rocksdb::EncodeFixed32(keysize, (uint32_t)it->key().size());
rocksdb::Slice keysizeslice(keysize, 4);
status = dumpfile->Append(keysizeslice);
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
return false;
}
status = dumpfile->Append(it->key());
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
return false;
}
char valsize[4];
rocksdb::EncodeFixed32(valsize, (uint32_t)it->value().size());
rocksdb::Slice valsizeslice(valsize, 4);
status = dumpfile->Append(valsizeslice);
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
return false;
}
status = dumpfile->Append(it->value());
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
return false;
}
}
if (!it->status().ok()) {
std::cerr << "Database iteration failed: " << status.ToString()
<< std::endl;
return false;
}
return true;
}
bool DbUndumpTool::Run(const UndumpOptions& undump_options,
rocksdb::Options options) {
rocksdb::DB* dbptr;
rocksdb::Status status;
rocksdb::Env* env;
std::unique_ptr<rocksdb::SequentialFile> dumpfile;
rocksdb::Slice slice;
char scratch8[8];
static const char* magicstr = "ROCKDUMP";
static const char versionstr[8] = {0, 0, 0, 0, 0, 0, 0, 1};
env = rocksdb::Env::Default();
status = env->NewSequentialFile(undump_options.dump_location, &dumpfile,
rocksdb::EnvOptions());
if (!status.ok()) {
std::cerr << "Unable to open dump file '" << undump_options.dump_location
<< "' for reading: " << status.ToString() << std::endl;
return false;
}
status = dumpfile->Read(8, &slice, scratch8);
if (!status.ok() || slice.size() != 8 ||
memcmp(slice.data(), magicstr, 8) != 0) {
std::cerr << "File '" << undump_options.dump_location
<< "' is not a recognizable dump file." << std::endl;
return false;
}
status = dumpfile->Read(8, &slice, scratch8);
if (!status.ok() || slice.size() != 8 ||
memcmp(slice.data(), versionstr, 8) != 0) {
std::cerr << "File '" << undump_options.dump_location
<< "' version not recognized." << std::endl;
return false;
}
status = dumpfile->Read(4, &slice, scratch8);
if (!status.ok() || slice.size() != 4) {
std::cerr << "Unable to read info blob size." << std::endl;
return false;
}
uint32_t infosize = rocksdb::DecodeFixed32(slice.data());
status = dumpfile->Skip(infosize);
if (!status.ok()) {
std::cerr << "Unable to skip info blob: " << status.ToString() << std::endl;
return false;
}
options.create_if_missing = true;
status = rocksdb::DB::Open(options, undump_options.db_path, &dbptr);
if (!status.ok()) {
std::cerr << "Unable to open database '" << undump_options.db_path
<< "' for writing: " << status.ToString() << std::endl;
return false;
}
const std::unique_ptr<rocksdb::DB> db(dbptr);
uint32_t last_keysize = 64;
size_t last_valsize = 1 << 20;
std::unique_ptr<char[]> keyscratch(new char[last_keysize]);
std::unique_ptr<char[]> valscratch(new char[last_valsize]);
while (1) {
uint32_t keysize, valsize;
rocksdb::Slice keyslice;
rocksdb::Slice valslice;
status = dumpfile->Read(4, &slice, scratch8);
if (!status.ok() || slice.size() != 4) break;
keysize = rocksdb::DecodeFixed32(slice.data());
if (keysize > last_keysize) {
while (keysize > last_keysize) last_keysize *= 2;
keyscratch = std::unique_ptr<char[]>(new char[last_keysize]);
}
status = dumpfile->Read(keysize, &keyslice, keyscratch.get());
if (!status.ok() || keyslice.size() != keysize) {
std::cerr << "Key read failure: "
<< (status.ok() ? "insufficient data" : status.ToString())
<< std::endl;
return false;
}
status = dumpfile->Read(4, &slice, scratch8);
if (!status.ok() || slice.size() != 4) {
std::cerr << "Unable to read value size: "
<< (status.ok() ? "insufficient data" : status.ToString())
<< std::endl;
return false;
}
valsize = rocksdb::DecodeFixed32(slice.data());
if (valsize > last_valsize) {
while (valsize > last_valsize) last_valsize *= 2;
valscratch = std::unique_ptr<char[]>(new char[last_valsize]);
}
status = dumpfile->Read(valsize, &valslice, valscratch.get());
if (!status.ok() || valslice.size() != valsize) {
std::cerr << "Unable to read value: "
<< (status.ok() ? "insufficient data" : status.ToString())
<< std::endl;
return false;
}
status = db->Put(rocksdb::WriteOptions(), keyslice, valslice);
if (!status.ok()) {
fprintf(stderr, "Unable to write database entry\n");
return false;
}
}
if (undump_options.compact_db) {
status = db->CompactRange(rocksdb::CompactRangeOptions(), nullptr, nullptr);
if (!status.ok()) {
fprintf(stderr,
"Unable to compact the database after loading the dumped file\n");
return false;
}
}
return true;
}
} // namespace rocksdb
#endif // ROCKSDB_LITE

@ -1,154 +1,63 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef GFLAGS
#if !(defined GFLAGS) || defined(ROCKSDB_LITE)
#include <cstdio>
int main() {
#ifndef GFLAGS
fprintf(stderr, "Please install gflags to run rocksdb tools\n");
#endif
#ifdef ROCKSDB_LITE
fprintf(stderr, "DbDumpTool is not supported in ROCKSDB_LITE\n");
#endif
return 1;
}
#else
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#else
#include <inttypes.h>
#include <gflags/gflags.h>
#include <iostream>
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "util/coding.h"
#include "rocksdb/convenience.h"
#include "rocksdb/db_dump_tool.h"
DEFINE_bool(anonymous, false, "Output an empty information blob.");
void usage(const char* name) {
std::cout << "usage: " << name << " [--anonymous] <db> <dumpfile>"
<< std::endl;
}
DEFINE_string(db_path, "", "Path to the db that will be dumped");
DEFINE_string(dump_location, "", "Path to where the dump file location");
DEFINE_bool(anonymous, false,
"Remove information like db path, creation time from dumped file");
DEFINE_string(db_options, "",
"Options string used to open the database that will be dumped");
int main(int argc, char** argv) {
rocksdb::DB* dbptr;
rocksdb::Options options;
rocksdb::Status status;
std::unique_ptr<rocksdb::WritableFile> dumpfile;
char hostname[1024];
int64_t timesec;
std::string abspath;
char json[4096];
GFLAGS::ParseCommandLineFlags(&argc, &argv, true);
static const char* magicstr = "ROCKDUMP";
static const char versionstr[8] = {0, 0, 0, 0, 0, 0, 0, 1};
if (argc != 3) {
usage(argv[0]);
exit(1);
}
rocksdb::Env* env = rocksdb::Env::Default();
// Open the database
options.create_if_missing = false;
status = rocksdb::DB::OpenForReadOnly(options, argv[1], &dbptr);
if (!status.ok()) {
std::cerr << "Unable to open database '" << argv[1]
<< "' for reading: " << status.ToString() << std::endl;
exit(1);
}
const std::unique_ptr<rocksdb::DB> db(dbptr);
status = env->NewWritableFile(argv[2], &dumpfile, rocksdb::EnvOptions());
if (!status.ok()) {
std::cerr << "Unable to open dump file '" << argv[2]
<< "' for writing: " << status.ToString() << std::endl;
exit(1);
}
rocksdb::Slice magicslice(magicstr, 8);
status = dumpfile->Append(magicslice);
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
exit(1);
}
rocksdb::Slice versionslice(versionstr, 8);
status = dumpfile->Append(versionslice);
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
exit(1);
if (FLAGS_db_path == "" || FLAGS_dump_location == "") {
fprintf(stderr, "Please set --db_path and --dump_location\n");
return 1;
}
if (FLAGS_anonymous) {
snprintf(json, sizeof(json), "{}");
} else {
status = env->GetHostName(hostname, sizeof(hostname));
status = env->GetCurrentTime(&timesec);
status = env->GetAbsolutePath(argv[1], &abspath);
snprintf(json, sizeof(json),
"{ \"database-path\": \"%s\", \"hostname\": \"%s\", "
"\"creation-time\": %" PRIi64 " }",
abspath.c_str(), hostname, timesec);
}
rocksdb::DumpOptions dump_options;
dump_options.db_path = FLAGS_db_path;
dump_options.dump_location = FLAGS_dump_location;
dump_options.anonymous = FLAGS_anonymous;
rocksdb::Slice infoslice(json, strlen(json));
char infosize[4];
rocksdb::EncodeFixed32(infosize, (uint32_t)infoslice.size());
rocksdb::Slice infosizeslice(infosize, 4);
status = dumpfile->Append(infosizeslice);
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
exit(1);
}
status = dumpfile->Append(infoslice);
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
exit(1);
}
const std::unique_ptr<rocksdb::Iterator> it(
db->NewIterator(rocksdb::ReadOptions()));
for (it->SeekToFirst(); it->Valid(); it->Next()) {
char keysize[4];
rocksdb::EncodeFixed32(keysize, (uint32_t)it->key().size());
rocksdb::Slice keysizeslice(keysize, 4);
status = dumpfile->Append(keysizeslice);
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
exit(1);
rocksdb::Options db_options;
if (FLAGS_db_options != "") {
rocksdb::Options parsed_options;
rocksdb::Status s = rocksdb::GetOptionsFromString(
db_options, FLAGS_db_options, &parsed_options);
if (!s.ok()) {
fprintf(stderr, "Cannot parse provided db_options\n");
return 1;
}
status = dumpfile->Append(it->key());
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
exit(1);
db_options = parsed_options;
}
char valsize[4];
rocksdb::EncodeFixed32(valsize, (uint32_t)it->value().size());
rocksdb::Slice valsizeslice(valsize, 4);
status = dumpfile->Append(valsizeslice);
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
exit(1);
}
status = dumpfile->Append(it->value());
if (!status.ok()) {
std::cerr << "Append failed: " << status.ToString() << std::endl;
exit(1);
}
}
if (!it->status().ok()) {
std::cerr << "Database iteration failed: " << status.ToString()
<< std::endl;
exit(1);
rocksdb::DbDumpTool tool;
if (!tool.Run(dump_options, db_options)) {
return 1;
}
return 0;
}
#endif // GFLAGS
#endif // !(defined GFLAGS) || defined(ROCKSDB_LITE)

@ -1,136 +1,62 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// Copyright (c) 2015, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include <cstring>
#include <iostream>
#if !(defined GFLAGS) || defined(ROCKSDB_LITE)
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "util/coding.h"
void usage(const char *name) {
std::cout << "usage: " << name << " <dumpfile> <rocksdb>" << std::endl;
}
int main(int argc, char **argv) {
rocksdb::DB *dbptr;
rocksdb::Options options;
rocksdb::Status status;
rocksdb::Env *env;
std::unique_ptr<rocksdb::SequentialFile> dumpfile;
rocksdb::Slice slice;
char scratch8[8];
static const char *magicstr = "ROCKDUMP";
static const char versionstr[8] = {0, 0, 0, 0, 0, 0, 0, 1};
if (argc != 3) {
usage(argv[0]);
exit(1);
#include <cstdio>
int main() {
#ifndef GFLAGS
fprintf(stderr, "Please install gflags to run rocksdb tools\n");
#endif
#ifdef ROCKSDB_LITE
fprintf(stderr, "DbUndumpTool is not supported in ROCKSDB_LITE\n");
#endif
return 1;
}
env = rocksdb::Env::Default();
#else
status = env->NewSequentialFile(argv[1], &dumpfile, rocksdb::EnvOptions());
if (!status.ok()) {
std::cerr << "Unable to open dump file '" << argv[1]
<< "' for reading: " << status.ToString() << std::endl;
exit(1);
}
status = dumpfile->Read(8, &slice, scratch8);
if (!status.ok() || slice.size() != 8 ||
memcmp(slice.data(), magicstr, 8) != 0) {
std::cerr << "File '" << argv[1] << "' is not a recognizable dump file."
<< std::endl;
exit(1);
}
#include <gflags/gflags.h>
#include "rocksdb/convenience.h"
#include "rocksdb/db_dump_tool.h"
status = dumpfile->Read(8, &slice, scratch8);
if (!status.ok() || slice.size() != 8 ||
memcmp(slice.data(), versionstr, 8) != 0) {
std::cerr << "File '" << argv[1] << "' version not recognized."
<< std::endl;
exit(1);
}
DEFINE_string(dump_location, "", "Path to the dump file that will be loaded");
DEFINE_string(db_path, "", "Path to the db that we will undump the file into");
DEFINE_bool(compact, false, "Compact the db after loading the dumped file");
DEFINE_string(db_options, "",
"Options string used to open the database that will be loaded");
status = dumpfile->Read(4, &slice, scratch8);
if (!status.ok() || slice.size() != 4) {
std::cerr << "Unable to read info blob size." << std::endl;
exit(1);
}
uint32_t infosize = rocksdb::DecodeFixed32(slice.data());
status = dumpfile->Skip(infosize);
if (!status.ok()) {
std::cerr << "Unable to skip info blob: " << status.ToString() << std::endl;
exit(1);
}
int main(int argc, char **argv) {
GFLAGS::ParseCommandLineFlags(&argc, &argv, true);
options.create_if_missing = true;
status = rocksdb::DB::Open(options, argv[2], &dbptr);
if (!status.ok()) {
std::cerr << "Unable to open database '" << argv[2]
<< "' for writing: " << status.ToString() << std::endl;
exit(1);
if (FLAGS_db_path == "" || FLAGS_dump_location == "") {
fprintf(stderr, "Please set --db_path and --dump_location\n");
return 1;
}
const std::unique_ptr<rocksdb::DB> db(dbptr);
uint32_t last_keysize = 64;
size_t last_valsize = 1 << 20;
std::unique_ptr<char[]> keyscratch(new char[last_keysize]);
std::unique_ptr<char[]> valscratch(new char[last_valsize]);
while (1) {
uint32_t keysize, valsize;
rocksdb::Slice keyslice;
rocksdb::Slice valslice;
rocksdb::UndumpOptions undump_options;
undump_options.db_path = FLAGS_db_path;
undump_options.dump_location = FLAGS_dump_location;
undump_options.compact_db = FLAGS_compact;
status = dumpfile->Read(4, &slice, scratch8);
if (!status.ok() || slice.size() != 4) break;
keysize = rocksdb::DecodeFixed32(slice.data());
if (keysize > last_keysize) {
while (keysize > last_keysize) last_keysize *= 2;
keyscratch = std::unique_ptr<char[]>(new char[last_keysize]);
rocksdb::Options db_options;
if (FLAGS_db_options != "") {
rocksdb::Options parsed_options;
rocksdb::Status s = rocksdb::GetOptionsFromString(
db_options, FLAGS_db_options, &parsed_options);
if (!s.ok()) {
fprintf(stderr, "Cannot parse provided db_options\n");
return 1;
}
status = dumpfile->Read(keysize, &keyslice, keyscratch.get());
if (!status.ok() || keyslice.size() != keysize) {
std::cerr << "Key read failure: "
<< (status.ok() ? "insufficient data" : status.ToString())
<< std::endl;
exit(1);
db_options = parsed_options;
}
status = dumpfile->Read(4, &slice, scratch8);
if (!status.ok() || slice.size() != 4) {
std::cerr << "Unable to read value size: "
<< (status.ok() ? "insufficient data" : status.ToString())
<< std::endl;
exit(1);
}
valsize = rocksdb::DecodeFixed32(slice.data());
if (valsize > last_valsize) {
while (valsize > last_valsize) last_valsize *= 2;
valscratch = std::unique_ptr<char[]>(new char[last_valsize]);
rocksdb::DbUndumpTool tool;
if (!tool.Run(undump_options, db_options)) {
return 1;
}
status = dumpfile->Read(valsize, &valslice, valscratch.get());
if (!status.ok() || valslice.size() != valsize) {
std::cerr << "Unable to read value: "
<< (status.ok() ? "insufficient data" : status.ToString())
<< std::endl;
exit(1);
}
status = db->Put(rocksdb::WriteOptions(), keyslice, valslice);
if (!status.ok()) {
fprintf(stderr, "Unable to write database entry\n");
exit(1);
}
}
return 0;
}
#endif // !(defined GFLAGS) || defined(ROCKSDB_LITE)

@ -2,6 +2,6 @@ TESTDIR=`mktemp -d /tmp/rocksdb-dump-test.XXXXX`
DUMPFILE="tools/sample-dump.dmp"
# Verify that the sample dump file is undumpable and then redumpable.
./rocksdb_undump $DUMPFILE $TESTDIR/db
./rocksdb_dump --anonymous $TESTDIR/db $TESTDIR/dump
./rocksdb_undump --dump_location=$DUMPFILE --db_path=$TESTDIR/db
./rocksdb_dump --anonymous --db_path=$TESTDIR/db --dump_location=$TESTDIR/dump
cmp $DUMPFILE $TESTDIR/dump

@ -540,6 +540,11 @@ class ShardedLRUCache : public Cache {
}
return usage;
}
virtual size_t GetUsage(Handle* handle) const override {
return reinterpret_cast<LRUHandle*>(handle)->charge;
}
virtual size_t GetPinnedUsage() const override {
// We will not lock the cache when getting the usage from shards.
int num_shards = 1 << num_shard_bits_;

@ -12,7 +12,6 @@
#include <thread>
#include "port/port.h"
#include "port/sys_time.h"
#include "port/port.h"
#include "rocksdb/options.h"
#include "util/arena.h"
@ -283,6 +282,7 @@ void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) {
env_options->set_fd_cloexec = options.is_fd_close_on_exec;
env_options->bytes_per_sync = options.bytes_per_sync;
env_options->rate_limiter = options.rate_limiter.get();
env_options->allow_fallocate = options.allow_fallocate;
}
}

@ -352,6 +352,7 @@ class PosixMmapFile : public WritableFile {
char* last_sync_; // Where have we synced up to
uint64_t file_offset_; // Offset of base_ in file
#ifdef ROCKSDB_FALLOCATE_PRESENT
bool allow_fallocate_; // If false, fallocate calls are bypassed
bool fallocate_with_keep_size_;
#endif
@ -393,7 +394,7 @@ class PosixMmapFile : public WritableFile {
TEST_KILL_RANDOM(rocksdb_kill_odds);
// we can't fallocate with FALLOC_FL_KEEP_SIZE here
{
if (allow_fallocate_) {
IOSTATS_TIMER_GUARD(allocate_nanos);
int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
if (alloc_status != 0) {
@ -453,6 +454,7 @@ class PosixMmapFile : public WritableFile {
last_sync_(nullptr),
file_offset_(0) {
#ifdef ROCKSDB_FALLOCATE_PRESENT
allow_fallocate_ = options.allow_fallocate;
fallocate_with_keep_size_ = options.fallocate_with_keep_size;
#endif
assert((page_size & (page_size - 1)) == 0);
@ -575,8 +577,12 @@ class PosixMmapFile : public WritableFile {
#ifdef ROCKSDB_FALLOCATE_PRESENT
virtual Status Allocate(off_t offset, off_t len) override {
TEST_KILL_RANDOM(rocksdb_kill_odds);
int alloc_status = fallocate(
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
int alloc_status = 0;
if (allow_fallocate_) {
alloc_status =
fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
offset, len);
}
if (alloc_status == 0) {
return Status::OK();
} else {
@ -593,6 +599,7 @@ class PosixWritableFile : public WritableFile {
int fd_;
uint64_t filesize_;
#ifdef ROCKSDB_FALLOCATE_PRESENT
bool allow_fallocate_;
bool fallocate_with_keep_size_;
#endif
@ -600,6 +607,7 @@ class PosixWritableFile : public WritableFile {
PosixWritableFile(const std::string& fname, int fd, const EnvOptions& options)
: filename_(fname), fd_(fd), filesize_(0) {
#ifdef ROCKSDB_FALLOCATE_PRESENT
allow_fallocate_ = options.allow_fallocate;
fallocate_with_keep_size_ = options.fallocate_with_keep_size;
#endif
assert(!options.use_mmap_writes);
@ -660,8 +668,10 @@ class PosixWritableFile : public WritableFile {
// We ignore error since failure of this operation does not affect
// correctness.
IOSTATS_TIMER_GUARD(allocate_nanos);
fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
filesize_, block_size * last_allocated_block - filesize_);
if (allow_fallocate_) {
fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_,
block_size * last_allocated_block - filesize_);
}
#endif
}
@ -714,9 +724,12 @@ class PosixWritableFile : public WritableFile {
virtual Status Allocate(off_t offset, off_t len) override {
TEST_KILL_RANDOM(rocksdb_kill_odds);
IOSTATS_TIMER_GUARD(allocate_nanos);
int alloc_status;
alloc_status = fallocate(
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
int alloc_status = 0;
if (allow_fallocate_) {
alloc_status =
fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
offset, len);
}
if (alloc_status == 0) {
return Status::OK();
} else {
@ -1146,7 +1159,7 @@ class PosixEnv : public Env {
} else {
int fd = fileno(f);
#ifdef ROCKSDB_FALLOCATE_PRESENT
fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 4 * 1024 * 1024);
fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 4 * 1024);
#endif
SetFD_CLOEXEC(fd, nullptr);
result->reset(new PosixLogger(f, &PosixEnv::gettid, this));
@ -1609,7 +1622,8 @@ class PosixEnv : public Env {
};
PosixEnv::PosixEnv() : checkedDiskForMmap_(false),
PosixEnv::PosixEnv()
: checkedDiskForMmap_(false),
forceMmapOff(false),
page_size_(getpagesize()),
thread_pools_(Priority::TOTAL) {

@ -240,6 +240,7 @@ DBOptions::DBOptions()
allow_os_buffer(true),
allow_mmap_reads(false),
allow_mmap_writes(false),
allow_fallocate(true),
is_fd_close_on_exec(true),
skip_log_error_on_recovery(false),
stats_dump_period_sec(600),
@ -294,6 +295,7 @@ DBOptions::DBOptions(const Options& options)
allow_os_buffer(options.allow_os_buffer),
allow_mmap_reads(options.allow_mmap_reads),
allow_mmap_writes(options.allow_mmap_writes),
allow_fallocate(options.allow_fallocate),
is_fd_close_on_exec(options.is_fd_close_on_exec),
skip_log_error_on_recovery(options.skip_log_error_on_recovery),
stats_dump_period_sec(options.stats_dump_period_sec),
@ -341,6 +343,7 @@ void DBOptions::Dump(Logger* log) const {
keep_log_file_num);
Header(log, " Options.allow_os_buffer: %d", allow_os_buffer);
Header(log, " Options.allow_mmap_reads: %d", allow_mmap_reads);
Header(log, " Options.allow_fallocate: %d", allow_fallocate);
Header(log, " Options.allow_mmap_writes: %d", allow_mmap_writes);
Header(log, " Options.create_missing_column_families: %d",
create_missing_column_families);

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save