Compare commits

No commits in common. 'oxigraph-main' and 'main' have entirely different histories.

Files changed (changed-line counts in parentheses):

  1. .circleci/config.yml (70)
  2. .github/workflows/sanity_check.yml (4)
  3. .gitignore (2)
  4. CMakeLists.txt (99)
  5. HISTORY.md (170)
  6. INSTALL.md (22)
  7. Makefile (124)
  8. PLUGINS.md (1)
  9. README.md (2)
  10. ROCKSDB_LITE.md (21)
  11. TARGETS (362)
  12. USERS.md (34)
  13. buckifier/buckify_rocksdb.py (13)
  14. build_tools/build_detect_platform (169)
  15. build_tools/dependencies_platform009.sh (22)
  16. build_tools/fbcode_config.sh (2)
  17. build_tools/fbcode_config_platform009.sh (170)
  18. build_tools/fbcode_config_platform010.sh (2)
  19. build_tools/regression_build_test.sh (2)
  20. build_tools/update_dependencies.sh (43)
  21. cache/cache.cc (59)
  22. cache/cache_bench_tool.cc (259)
  23. cache/cache_entry_roles.cc (30)
  24. cache/cache_entry_roles.h (83)
  25. cache/cache_entry_stats.h (27)
  26. cache/cache_helpers.cc (40)
  27. cache/cache_helpers.h (36)
  28. cache/cache_key.cc (2)
  29. cache/cache_reservation_manager.cc (17)
  30. cache/cache_reservation_manager.h (9)
  31. cache/cache_test.cc (428)
  32. cache/charged_cache.cc (70)
  33. cache/charged_cache.h (80)
  34. cache/clock_cache.cc (971)
  35. cache/clock_cache.h (345)
  36. cache/compressed_secondary_cache.cc (141)
  37. cache/compressed_secondary_cache.h (41)
  38. cache/compressed_secondary_cache_test.cc (563)
  39. cache/lru_cache.cc (510)
  40. cache/lru_cache.h (179)
  41. cache/lru_cache_test.cc (971)
  42. cache/secondary_cache.cc (20)
  43. cache/secondary_cache_adapter.cc (433)
  44. cache/secondary_cache_adapter.h (76)
  45. cache/sharded_cache.cc (51)
  46. cache/sharded_cache.h (97)
  47. cache/typed_cache.h (375)
  48. coverage/coverage_test.sh (2)
  49. crash_test.mk (14)
  50. db/arena_wrapped_db_iter.cc (5)
  51. db/blob/blob_contents.cc (48)
  52. db/blob/blob_contents.h (37)
  53. db/blob/blob_counting_iterator.h (7)
  54. db/blob/blob_file_builder.cc (32)
  55. db/blob/blob_file_cache.cc (23)
  56. db/blob/blob_file_cache.h (14)
  57. db/blob/blob_file_cache_test.cc (25)
  58. db/blob/blob_file_completion_callback.h (17)
  59. db/blob/blob_file_reader.cc (74)
  60. db/blob/blob_file_reader.h (6)
  61. db/blob/blob_file_reader_test.cc (83)
  62. db/blob/blob_log_sequential_reader.cc (2)
  63. db/blob/blob_log_writer.cc (2)
  64. db/blob/blob_source.cc (77)
  65. db/blob/blob_source.h (22)
  66. db/blob/blob_source_test.cc (83)
  67. db/blob/db_blob_basic_test.cc (464)
  68. db/blob/db_blob_compaction_test.cc (14)
  69. db/blob/db_blob_corruption_test.cc (2)
  70. db/blob/db_blob_index_test.cc (6)
  71. db/builder.cc (79)
  72. db/builder.h (4)
  73. db/c.cc (389)
  74. db/c_test.c (265)
  75. db/column_family.cc (206)
  76. db/column_family.h (39)
  77. db/column_family_test.cc (286)
  78. db/compact_files_test.cc (19)
  79. db/compaction/compaction.cc (124)
  80. db/compaction/compaction.h (57)
  81. db/compaction/compaction_iterator.cc (215)
  82. db/compaction/compaction_iterator.h (24)
  83. db/compaction/compaction_iterator_test.cc (4)
  84. db/compaction/compaction_job.cc (237)
  85. db/compaction/compaction_job.h (23)
  86. db/compaction/compaction_job_stats_test.cc (16)
  87. db/compaction/compaction_job_test.cc (30)
  88. db/compaction/compaction_outputs.cc (580)
  89. db/compaction/compaction_outputs.h (34)
  90. db/compaction/compaction_picker.cc (26)
  91. db/compaction/compaction_picker.h (6)
  92. db/compaction/compaction_picker_fifo.cc (129)
  93. db/compaction/compaction_picker_fifo.h (11)
  94. db/compaction/compaction_picker_level.cc (63)
  95. db/compaction/compaction_picker_test.cc (219)
  96. db/compaction/compaction_picker_universal.cc (59)
  97. db/compaction/compaction_picker_universal.h (2)
  98. db/compaction/compaction_service_job.cc (2)
  99. db/compaction/compaction_service_test.cc (14)
  100. db/compaction/sst_partitioner.cc (9)
Some files were not shown because too many files have changed in this diff.

@ -152,15 +152,15 @@ commands:
steps:
- run:
name: "Test low-variance benchmarks"
command: ./tools/benchmark_ci.py --db_dir /tmp/rocksdb-benchmark-datadir --output_dir /tmp/benchmark-results --num_keys 20000000
command: ./tools/benchmark_ci.py --db_dir /tmp/rocksdb-benchmark-datadir --output_dir /tmp/benchmark-results --num_keys 10000000
environment:
LD_LIBRARY_PATH: /usr/local/lib
# How long to run parts of the test(s)
DURATION_RO: 300
DURATION_RW: 500
DURATION_RO: 400
DURATION_RW: 700
# Keep threads within physical capacity of server (much lower than default)
NUM_THREADS: 1
MAX_BACKGROUND_JOBS: 4
MAX_BACKGROUND_JOBS: 3
# Don't run a couple of "optional" initial tests
CI_TESTS_ONLY: "true"
# Reduce configured size of levels to ensure more levels in the leveled compaction LSM tree
@ -170,11 +170,7 @@ commands:
# The benchmark host has 32GB memory
# The following values are tailored to work with that
# Note, tests may not exercise the targeted issues if the memory is increased on new test hosts.
COMPRESSION_TYPE: "none"
CACHE_INDEX_AND_FILTER_BLOCKS: 1
MIN_LEVEL_TO_COMPRESS: 3
CACHE_SIZE_MB: 10240
MB_WRITE_PER_SEC: 2
post-benchmarks:
steps:
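
For context, a hedged sketch of how the benchmark step above might be reproduced outside CI, using only the command and the second set of environment values shown in this hunk (the directories are CI's scratch paths, and whether benchmark_ci.py consumes every variable exactly this way is an assumption):

```sh
# Illustrative only: values copied from the environment block above.
export LD_LIBRARY_PATH=/usr/local/lib
export DURATION_RO=400 DURATION_RW=700       # run lengths for the read-only / read-write parts
export NUM_THREADS=1 MAX_BACKGROUND_JOBS=3   # keep within the benchmark host's capacity
export CI_TESTS_ONLY="true"                  # skip the "optional" initial tests
./tools/benchmark_ci.py --db_dir /tmp/rocksdb-benchmark-datadir \
  --output_dir /tmp/benchmark-results --num_keys 10000000
```
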
@ -273,12 +269,12 @@ jobs:
./sst_dump --help | grep -E -q 'Supported compression types: kNoCompression$' # Verify no compiled in compression
- post-steps
build-linux-static_lib-alt_namespace-status_checked:
build-linux-shared_lib-alt_namespace-status_checked:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- run: ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 ROCKSDB_MODIFY_NPHASH=1 LIB_MODE=static OPT="-DROCKSDB_NAMESPACE=alternative_rocksdb_ns" make V=1 -j24 check
- run: ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 ROCKSDB_MODIFY_NPHASH=1 LIB_MODE=shared OPT="-DROCKSDB_NAMESPACE=alternative_rocksdb_ns" make V=1 -j32 check
- post-steps
build-linux-release:
@ -286,21 +282,11 @@ jobs:
resource_class: 2xlarge
steps:
- checkout # check out the code in the project directory
- run: make V=1 -j32 LIB_MODE=shared release
- run: ls librocksdb.so # ensure shared lib built
- run: ./db_stress --version # ensure with gflags
- run: make clean
- run: make V=1 -j32 release
- run: ls librocksdb.a # ensure static lib built
- run: ./db_stress --version # ensure with gflags
- run: make clean
- run: apt-get remove -y libgflags-dev
- run: make V=1 -j32 LIB_MODE=shared release
- run: ls librocksdb.so # ensure shared lib built
- run: if ./db_stress --version; then false; else true; fi # ensure without gflags
- run: make clean
- run: make V=1 -j32 release
- run: ls librocksdb.a # ensure static lib built
- run: if ./db_stress --version; then false; else true; fi # ensure without gflags
- post-steps
@ -316,6 +302,27 @@ jobs:
- run: USE_RTTI=1 DEBUG_LEVEL=0 make V=1 -j16 static_lib tools db_bench
- run: if ./db_stress --version; then false; else true; fi # ensure without gflags
build-linux-lite:
executor: linux-docker
resource_class: large
steps:
- pre-steps
- run: LITE=1 make V=1 J=8 -j8 check
- post-steps
build-linux-lite-release:
executor: linux-docker
resource_class: large
steps:
- checkout # check out the code in the project directory
- run: LITE=1 make V=1 -j8 release
- run: ./db_stress --version # ensure with gflags
- run: make clean
- run: apt-get remove -y libgflags-dev
- run: LITE=1 make V=1 -j8 release
- run: if ./db_stress --version; then false; else true; fi # ensure without gflags
- post-steps
build-linux-clang-no_test_run:
executor: linux-docker
resource_class: xlarge
@ -420,10 +427,7 @@ jobs:
steps:
- checkout # check out the code in the project directory
- run: apt-get update -y && apt-get install -y libgflags-dev
- run:
name: "Unity build"
command: make V=1 -j8 unity_test
no_output_timeout: 20m
- run: make V=1 -j8 unity_test
- run: make V=1 -j8 -k check-headers # could be moved to a different build
- post-steps
@ -434,7 +438,7 @@ jobs:
- pre-steps
- setup-folly
- build-folly
- run: USE_FOLLY=1 LIB_MODE=static CC=gcc-7 CXX=g++-7 V=1 make -j32 check # TODO: LIB_MODE only to work around unresolved linker failures
- run: USE_FOLLY=1 CC=gcc-7 CXX=g++-7 V=1 make -j32 check
- post-steps
build-linux-gcc-7-with-folly-lite-no-test:
@ -480,7 +484,7 @@ jobs:
resource_class: 2xlarge
steps:
- pre-steps
- run: LIB_MODE=static CC=gcc-11 CXX=g++-11 V=1 make -j32 all microbench # TODO: LIB_MODE only to work around unresolved linker failures
- run: CC=gcc-11 CXX=g++-11 V=1 make -j32 all microbench
- post-steps
build-linux-clang-13-no_test_run:
@ -499,7 +503,7 @@ jobs:
- pre-steps
- setup-folly
- build-folly
- run: CC=clang-13 CXX=clang++-13 LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check # TODO: LIB_MODE only to work around unresolved linker failures
- run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check
- post-steps
# This job is only to make sure the microbench tests are able to run, the benchmark result is not meaningful as the CI host is changing.
@ -516,7 +520,7 @@ jobs:
resource_class: large
steps:
- pre-steps
- run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=960 --max_key=2500000 --use_io_uring=0' blackbox_crash_test_with_atomic_flush
- run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=960 --max_key=2500000' blackbox_crash_test_with_atomic_flush
- post-steps
build-linux-crashtest-tiered-storage-bb:
@ -526,7 +530,7 @@ jobs:
- pre-steps
- run:
name: "run crashtest"
command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS='--duration=10800 --use_io_uring=0' blackbox_crash_test_with_tiered_storage
command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS=--duration=10800 blackbox_crash_test_with_tiered_storage
no_output_timeout: 100m
- post-steps
@ -537,7 +541,7 @@ jobs:
- pre-steps
- run:
name: "run crashtest"
command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS='--duration=10800 --use_io_uring=0' whitebox_crash_test_with_tiered_storage
command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS=--duration=10800 whitebox_crash_test_with_tiered_storage
no_output_timeout: 100m
- post-steps
@ -817,16 +821,18 @@ workflows:
- build-linux-cmake-with-folly-coroutines
- build-linux-cmake-with-benchmark
- build-linux-encrypted_env-no_compression
- build-linux-lite
jobs-linux-run-tests-san:
jobs:
- build-linux-clang10-asan
- build-linux-clang10-ubsan
- build-linux-clang10-mini-tsan
- build-linux-static_lib-alt_namespace-status_checked
- build-linux-shared_lib-alt_namespace-status_checked
jobs-linux-no-test-run:
jobs:
- build-linux-release
- build-linux-release-rtti
- build-linux-lite-release
- build-examples
- build-fuzzers
- build-linux-clang-no_test_run

@ -33,7 +33,9 @@ jobs:
run: pip install argparse
- name: Download clang-format-diff.py
run: wget https://raw.githubusercontent.com/llvm/llvm-project/release/12.x/clang/tools/clang-format/clang-format-diff.py
uses: wei/wget@v1
with:
args: https://raw.githubusercontent.com/llvm/llvm-project/release/12.x/clang/tools/clang-format/clang-format-diff.py
- name: Check format
run: VERBOSE_CHECK=1 make check-format

.gitignore (vendored): 2 changed lines

@ -95,5 +95,3 @@ fuzz/crash-*
cmake-build-*
third-party/folly/
.cache
*.sublime-*

@ -245,40 +245,41 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
endif(HAS_S390X_MARCH_NATIVE)
endif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
if(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64")
CHECK_C_COMPILER_FLAG("-march=loongarch64" HAS_LOONGARCH64)
if(HAS_LOONGARCH64)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=loongarch64 -mtune=loongarch64")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=loongarch64 -mtune=loongarch64")
endif(HAS_LOONGARCH64)
endif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64")
set(PORTABLE 0 CACHE STRING "Minimum CPU arch to support, or 0 = current CPU, 1 = baseline CPU")
if(PORTABLE STREQUAL 1)
# Usually nothing to do; compiler default is typically the most general
if(NOT MSVC)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z196")
endif()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^loongarch64")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=loongarch64")
endif()
option(PORTABLE "build a portable binary" OFF)
option(FORCE_SSE42 "force building with SSE4.2, even when PORTABLE=ON" OFF)
option(FORCE_AVX "force building with AVX, even when PORTABLE=ON" OFF)
option(FORCE_AVX2 "force building with AVX2, even when PORTABLE=ON" OFF)
if(PORTABLE)
add_definitions(-DROCKSDB_PORTABLE)
# MSVC does not need a separate compiler flag to enable SSE4.2; if nmmintrin.h
# is available, it is available by default.
if(FORCE_SSE42 AND NOT MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mpclmul")
endif()
elseif(PORTABLE MATCHES [^0]+)
# Name of a CPU arch spec or feature set to require
if(MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:${PORTABLE}")
if(FORCE_AVX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
endif()
# MSVC automatically enables BMI / lzcnt with AVX2.
if(FORCE_AVX2)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
endif()
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${PORTABLE}")
if(FORCE_AVX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx")
endif()
if(FORCE_AVX2)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -mbmi -mlzcnt")
endif()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z196")
endif()
endif()
else()
if(MSVC)
# NOTE: No auto-detection of current CPU, but instead assume some useful
# level of optimization is supported
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
else()
# Require instruction set from current CPU (with some legacy or opt-out
# exceptions)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x" AND NOT HAS_S390X_MARCH_NATIVE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z196")
elseif(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64" AND NOT HAS_ARMV8_CRC)
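
One side of this hunk treats PORTABLE as a string-valued cache variable rather than a boolean option. A hedged sketch of configure invocations under that scheme (the `haswell` value is the example INSTALL.md uses elsewhere in this diff; nothing here is prescribed by the hunk itself):

```sh
# PORTABLE as a string: 0 = optimize for the build machine's CPU,
# 1 = most general baseline, anything else = a compiler-recognized arch name.
cmake .. -DPORTABLE=0          # current-CPU build
cmake .. -DPORTABLE=1          # portable baseline build
cmake .. -DPORTABLE=haswell    # becomes -march=haswell on non-MSVC compilers
```

The other side of the hunk keeps the boolean `PORTABLE` option with `FORCE_SSE42`/`FORCE_AVX`/`FORCE_AVX2` escape hatches.
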
@ -293,6 +294,25 @@ if(NOT MSVC)
set(CMAKE_REQUIRED_FLAGS "-msse4.2 -mpclmul")
endif()
CHECK_CXX_SOURCE_COMPILES("
#include <cstdint>
#include <nmmintrin.h>
#include <wmmintrin.h>
int main() {
volatile uint32_t x = _mm_crc32_u32(0, 0);
const auto a = _mm_set_epi64x(0, 0);
const auto b = _mm_set_epi64x(0, 0);
const auto c = _mm_clmulepi64_si128(a, b, 0x00);
auto d = _mm_cvtsi128_si64(c);
}
" HAVE_SSE42)
if(HAVE_SSE42)
add_definitions(-DHAVE_SSE42)
add_definitions(-DHAVE_PCLMUL)
elseif(FORCE_SSE42)
message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled")
endif()
# Check if -latomic is required or not
if (NOT MSVC)
set(CMAKE_REQUIRED_FLAGS "--std=c++17")
@ -465,6 +485,12 @@ if(CMAKE_COMPILER_IS_GNUCXX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-builtin-memcmp")
endif()
option(ROCKSDB_LITE "Build RocksDBLite version" OFF)
if(ROCKSDB_LITE)
add_definitions(-DROCKSDB_LITE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -Os")
endif()
if(CMAKE_SYSTEM_NAME MATCHES "Cygwin")
add_definitions(-fno-builtin-memcmp -DCYGWIN)
elseif(CMAKE_SYSTEM_NAME MATCHES "Darwin")
@ -547,7 +573,7 @@ if(HAVE_SCHED_GETCPU)
add_definitions(-DROCKSDB_SCHED_GETCPU_PRESENT)
endif()
check_cxx_symbol_exists(getauxval "sys/auxv.h" HAVE_AUXV_GETAUXVAL)
check_cxx_symbol_exists(getauxval auvx.h HAVE_AUXV_GETAUXVAL)
if(HAVE_AUXV_GETAUXVAL)
add_definitions(-DROCKSDB_AUXV_GETAUXVAL_PRESENT)
endif()
@ -623,14 +649,12 @@ set(SOURCES
cache/cache.cc
cache/cache_entry_roles.cc
cache/cache_key.cc
cache/cache_helpers.cc
cache/cache_reservation_manager.cc
cache/charged_cache.cc
cache/clock_cache.cc
cache/compressed_secondary_cache.cc
cache/lru_cache.cc
cache/secondary_cache.cc
cache/secondary_cache_adapter.cc
cache/sharded_cache.cc
db/arena_wrapped_db_iter.cc
db/blob/blob_contents.cc
@ -717,7 +741,6 @@ set(SOURCES
db/write_batch.cc
db/write_batch_base.cc
db/write_controller.cc
db/write_stall_stats.cc
db/write_thread.cc
env/composite_env.cc
env/env.cc
@ -783,7 +806,6 @@ set(SOURCES
table/block_based/block_based_table_iterator.cc
table/block_based/block_based_table_reader.cc
table/block_based/block_builder.cc
table/block_based/block_cache.cc
table/block_based/block_prefetcher.cc
table/block_based/block_prefix_index.cc
table/block_based/data_block_hash_index.cc
@ -851,7 +873,6 @@ set(SOURCES
util/compression_context_cache.cc
util/concurrent_task_limiter_impl.cc
util/crc32c.cc
util/data_structure.cc
util/dynamic_bloom.cc
util/hash.cc
util/murmurhash.cc
@ -865,8 +886,6 @@ set(SOURCES
util/string_util.cc
util/thread_local.cc
util/threadpool_imp.cc
util/udt_util.cc
util/write_batch_util.cc
util/xxhash.cc
utilities/agg_merge/agg_merge.cc
utilities/backup/backup_engine.cc
@ -981,6 +1000,12 @@ if ( ROCKSDB_PLUGINS )
endforeach()
endif()
if(HAVE_SSE42 AND NOT MSVC)
set_source_files_properties(
util/crc32c.cc
PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul")
endif()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64")
list(APPEND SOURCES
util/crc32c_ppc.c
@ -1246,7 +1271,6 @@ if(WITH_TESTS OR WITH_BENCHMARK_TOOLS)
add_subdirectory(third-party/gtest-1.8.1/fused-src/gtest)
add_library(testharness STATIC
test_util/mock_time_env.cc
test_util/secondary_cache_test_util.cc
test_util/testharness.cc)
target_link_libraries(testharness gtest)
endif()
@ -1292,7 +1316,6 @@ if(WITH_TESTS)
db/db_bloom_filter_test.cc
db/db_compaction_filter_test.cc
db/db_compaction_test.cc
db/db_clip_test.cc
db/db_dynamic_level_test.cc
db/db_encryption_test.cc
db/db_flush_test.cc
@ -1423,7 +1446,6 @@ if(WITH_TESTS)
util/timer_test.cc
util/thread_list_test.cc
util/thread_local_test.cc
util/udt_util_test.cc
util/work_queue_test.cc
utilities/agg_merge/agg_merge_test.cc
utilities/backup/backup_engine_test.cc
@ -1592,6 +1614,3 @@ option(WITH_BENCHMARK "build benchmark tests" OFF)
if(WITH_BENCHMARK)
add_subdirectory(${PROJECT_SOURCE_DIR}/microbench/)
endif()
target_include_directories(${PROJECT_NAME} PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>)
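
The `WITH_BENCHMARK` option touched above is the CMake-side switch for the microbenchmarks. A hedged example of enabling it, mirroring the `cmake -DWITH_BENCHMARK=1` form that INSTALL.md mentions later in this diff (build directory and job count are illustrative):

```sh
mkdir -p build && cd build
cmake .. -DWITH_BENCHMARK=1    # needs Google benchmark >= 1.6.0 per INSTALL.md
make -j8
```
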

@ -1,164 +1,9 @@
# Rocksdb Change Log
> NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
## 8.5.0 (07/21/2023)
### Public API Changes
* Removed recently added APIs `GeneralCache` and `MakeSharedGeneralCache()` as our plan changed to stop exposing a general-purpose cache interface. The old forms of these APIs, `Cache` and `NewLRUCache()`, are still available, although general-purpose caching support will be dropped eventually.
### Behavior Changes
* Option `periodic_compaction_seconds` no longer supports FIFO compaction: setting it has no effect on FIFO compactions. FIFO compaction users should only set option `ttl` instead.
* Move prefetching responsibility to the page cache for compaction reads in the non-direct-IO use case
### Performance Improvements
* In case of direct_io, if the buffer passed by the callee is already aligned, RandomAccessFileRead::Read will avoid allocating a new buffer, reducing memcpy by using the already-aligned buffer that was passed in.
* Small efficiency improvement to HyperClockCache by reducing chance of compiler-generated heap allocations
### Bug Fixes
* Fix a use-after-free bug in async_io MultiReads when the underlying FS enabled kFSBuffer. kFSBuffer is when the underlying FS passes its own buffer instead of using the RocksDB scratch in FSReadRequest. Right now it's an experimental feature.
## 8.4.0 (06/26/2023)
### New Features
* Add FSReadRequest::fs_scratch, a data buffer allocated and provided by the underlying FileSystem to RocksDB during reads, for when the FS wants to provide its own buffer with data instead of using the RocksDB-provided FSReadRequest::scratch. This can help CPU efficiency by avoiding a copy from the file system's buffer to the RocksDB buffer. More details on how to use/enable it are in file_system.h. Right now it's supported only for MultiReads (async + sync) with non-direct IO.
* Start logging non-zero user-defined timestamp sizes in WAL to signal user key format in subsequent records and use it during recovery. This change will break recovery from WAL files written by early versions that contain user-defined timestamps. The workaround is to ensure there are no WAL files to recover (i.e. by flushing before close) before upgrade.
* Added new property "rocksdb.obsolete-sst-files-size-property" that reports the size of SST files that have become obsolete but have not yet been deleted or scheduled for deletion
* Start to record the value of the flag `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` in the Manifest and table properties for an SST file when it is created, and use the recorded flag when creating a table reader for the SST file. This flag is only explicitly recorded if it's false.
* Add a new option OptimisticTransactionDBOptions::shared_lock_buckets that enables sharing mutexes for validating transactions between DB instances, for better balancing memory efficiency and validation contention across DB instances. Different column families and DBs also now use different hash seeds in this validation, so that the same set of key names will not contend across DBs or column families.
* Add a new ticker `rocksdb.files.marked.trash.deleted` to track the number of trash files deleted by background thread from the trash queue.
* Add an API NewTieredVolatileCache() in include/rocksdb/cache.h to allocate an instance of a block cache with a primary block cache tier and a compressed secondary cache tier. A cache of this type distributes memory reservations against the block cache, such as WriteBufferManager, table reader memory etc., proportionally across both the primary and compressed secondary cache.
* Add `WaitForCompact()` to wait for all flush and compactions jobs to finish. Jobs to wait include the unscheduled (queued, but not scheduled yet).
* Add `WriteBatch::Release()` that releases the batch's serialized data to the caller.
### Public API Changes
* Add C API `rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio`.
* change the FileSystem::use_async_io() API to SupportedOps API in order to extend it to various operations supported by underlying FileSystem. Right now it contains FSSupportedOps::kAsyncIO and FSSupportedOps::kFSBuffer. More details about FSSupportedOps in filesystem.h
* Add new tickers: `rocksdb.error.handler.bg.error.count`, `rocksdb.error.handler.bg.io.error.count`, `rocksdb.error.handler.bg.retryable.io.error.count` to replace the misspelled ones: `rocksdb.error.handler.bg.errro.count`, `rocksdb.error.handler.bg.io.errro.count`, `rocksdb.error.handler.bg.retryable.io.errro.count` ('error' instead of 'errro'). Users should switch to use the new tickers before 9.0 release as the misspelled old tickers will be completely removed then.
* Overload the API CreateColumnFamilyWithImport() to support creating a ColumnFamily by importing multiple ColumnFamilies. It requires that the CFs do not overlap in user key range.
### Behavior Changes
* Change the default value for option `level_compaction_dynamic_level_bytes` to true. This affects users who use leveled compaction and do not set this option explicitly. These users may see additional background compactions following DB open. These compactions help to shape the LSM according to `level_compaction_dynamic_level_bytes` such that the size of each level Ln is approximately size of Ln-1 * `max_bytes_for_level_multiplier`. Turning on this option has other benefits too: see more detail in wiki: https://github.com/facebook/rocksdb/wiki/Leveled-Compaction#option-level_compaction_dynamic_level_bytes-and-levels-target-size and in option comment in advanced_options.h (#11525).
* For Leveled Compaction users, `CompactRange()` will now always try to compact to the last non-empty level. (#11468)
For Leveled Compaction users, `CompactRange()` with `bottommost_level_compaction = BottommostLevelCompaction::kIfHaveCompactionFilter` will behave similar to `kForceOptimized` in that it will skip files created during this manual compaction when compacting files in the bottommost level. (#11468)
* RocksDB will try to drop range tombstones during non-bottommost compaction when it is safe to do so. (#11459)
* When a DB is opened with `allow_ingest_behind=true` (currently only Universal compaction is supported), files in the last level, i.e. the ingested files, will not be included in any compaction. (#11489)
* Statistics `rocksdb.sst.read.micros` scope is expanded to all SST reads except for file ingestion and column family import (some compaction reads were previously excluded).
### Bug Fixes
* Reduced cases of illegally using Env::Default() during static destruction by never destroying the internal PosixEnv itself (except for builds checking for memory leaks). (#11538)
* Fix extra prefetching during seek in async_io when BlockBasedTableOptions.num_file_reads_for_auto_readahead is 1, which led to more reads than required.
* Fix a bug where compactions that are qualified to be run as 2 subcompactions were only run as one subcompaction.
* Fix a use-after-move bug in block.cc.
## 8.3.0 (05/19/2023)
### New Features
* Introduced a new option `block_protection_bytes_per_key`, which can be used to enable per key-value integrity protection for in-memory blocks in block cache (#11287).
* Added `JemallocAllocatorOptions::num_arenas`. Setting `num_arenas > 1` may mitigate mutex contention in the allocator, particularly in scenarios where block allocations commonly bypass jemalloc tcache.
* Improve the operational safety of publishing a DB or SST files to many hosts by using different block cache hash seeds on different hosts. The exact behavior is controlled by new option `ShardedCacheOptions::hash_seed`, which also documents the solved problem in more detail.
* Introduced a new option `CompactionOptionsFIFO::file_temperature_age_thresholds` that allows FIFO compaction to compact files to different temperatures based on key age (#11428).
* Added a new ticker stat to count how many times RocksDB detected a corruption while verifying a block checksum: `BLOCK_CHECKSUM_MISMATCH_COUNT`.
* New statistics `rocksdb.file.read.db.open.micros` that measures read time of block-based SST tables or blob files during db open.
* New statistics tickers for various iterator seek behaviors and relevant filtering, as \*`_LEVEL_SEEK_`\*. (#11460)
### Public API Changes
* EXPERIMENTAL: Add new API `DB::ClipColumnFamily` to clip the keys in a CF to a certain range. It will physically delete all keys outside the range, including tombstones.
* Add `MakeSharedCache()` construction functions to various cache Options objects, and deprecated the `NewWhateverCache()` functions with long parameter lists.
* Changed the meaning of various Bloom filter stats (prefix vs. whole key), with iterator-related filtering only being tracked in the new \*`_LEVEL_SEEK_`\*. stats. (#11460)
### Behavior changes
* For x86, CPU features are no longer detected at runtime nor in build scripts, but in source code using common preprocessor defines. This will likely unlock some small performance improvements on some newer hardware, but could hurt performance of the kCRC32c checksum, which is no longer the default, on some "portable" builds. See PR #11419 for details.
### Bug Fixes
* Delete an empty WAL file on DB open if the log number is less than the min log number to keep
* Delete temp OPTIONS file on DB open if there is a failure to write it out or rename it
### Performance Improvements
* Improved the I/O efficiency of prefetching SST metadata by recording more information in the DB manifest. Opening files written with previous versions will still rely on heuristics for how much to prefetch (#11406).
## 8.2.0 (04/24/2023)
### Public API Changes
* `SstFileWriter::DeleteRange()` now returns `Status::InvalidArgument` if the range's end key comes before its start key according to the user comparator. Previously the behavior was undefined.
* Add `multi_get_for_update` to C API.
* Remove unnecessary constructor for CompressionOptions.
### Behavior changes
* Changed default block cache size from an 8MB to 32MB LRUCache, which increases the default number of cache shards from 16 to 64. This change is intended to minimize cache mutex contention under stress conditions. See https://github.com/facebook/rocksdb/wiki/Block-Cache for more information.
* For level compaction with `level_compaction_dynamic_level_bytes=true`, RocksDB now trivially moves levels down to fill LSM starting from bottommost level during DB open. See more in comments for option `level_compaction_dynamic_level_bytes` (#11321).
* User-provided `ReadOptions` take effect for more reads of non-`CacheEntryRole::kDataBlock` blocks.
* For level compaction with `level_compaction_dynamic_level_bytes=true`, RocksDB now drains unnecessary levels through background compaction automatically (#11340). This together with #11321 makes it automatic to migrate other compaction settings to level compaction with `level_compaction_dynamic_level_bytes=true`. In addition, a live DB that becomes smaller will now have unnecessary levels drained which can help to reduce read and space amp.
* If `CompactRange()` is called with `CompactRangeOptions::bottommost_level_compaction=kForce*` to compact from L0 to L1, RocksDB now will try to do trivial move from L0 to L1 and then do an intra L1 compaction, instead of a L0 to L1 compaction with trivial move disabled (#11375)).
### Bug Fixes
* In the DB::VerifyFileChecksums API, ensure that file system reads of SST files are equal to the readahead_size in ReadOptions, if specified. Previously, each read was 2x the readahead_size.
* In block cache tracing, fixed some cases of bad hit/miss information (and more) with MultiGet.
### New Features
* Add experimental `PerfContext` counters `iter_{next|prev|seek}_count` for db iterator, each counting the times of corresponding API being called.
* Allow runtime changes to whether `WriteBufferManager` allows stall or not by calling `SetAllowStall()`
* Added statistics tickers BYTES_COMPRESSED_FROM, BYTES_COMPRESSED_TO, BYTES_COMPRESSION_BYPASSED, BYTES_COMPRESSION_REJECTED, NUMBER_BLOCK_COMPRESSION_BYPASSED, and NUMBER_BLOCK_COMPRESSION_REJECTED. Disabled/deprecated histograms BYTES_COMPRESSED and BYTES_DECOMPRESSED, and ticker NUMBER_BLOCK_NOT_COMPRESSED. The new tickers offer more insight into compression ratios, rejected vs. disabled compression, etc. (#11388)
* New statistics `rocksdb.file.read.{flush|compaction}.micros` that measure read time of block-based SST tables or blob files during flush or compaction.
## 8.1.0 (03/18/2023)
### Behavior changes
* Compaction output file cutting logic now considers range tombstone start keys. For example, SST partitioner now may receive ParitionRequest for range tombstone start keys.
* If the async_io ReadOption is specified for MultiGet or NewIterator on a platform that doesn't support IO uring, the option is ignored and synchronous IO is used.
### Bug Fixes
* Fixed an issue for backward iteration when user defined timestamp is enabled in combination with BlobDB.
* Fixed a couple of cases where a Merge operand encountered during iteration wasn't reflected in the `internal_merge_count` PerfContext counter.
* Fixed a bug in CreateColumnFamilyWithImport()/ExportColumnFamily() which did not support range tombstones (#11252).
* Fixed a bug where an excluded column family from an atomic flush contains unflushed data that should've been included in this atomic flush (i.e, data of seqno less than the max seqno of this atomic flush), leading to potential data loss in this excluded column family when `WriteOptions::disableWAL == true` (#11148).
### New Features
* Add statistics rocksdb.secondary.cache.filter.hits, rocksdb.secondary.cache.index.hits, and rocksdb.secondary.cache.filter.hits
* Added a new PerfContext counter `internal_merge_point_lookup_count` which tracks the number of Merge operands applied while serving point lookup queries.
* Add new statistics rocksdb.table.open.prefetch.tail.read.bytes, rocksdb.table.open.prefetch.tail.{miss|hit}
* Add support for SecondaryCache with HyperClockCache (`HyperClockCacheOptions` inherits `secondary_cache` option from `ShardedCacheOptions`)
* Add new db properties `rocksdb.cf-write-stall-stats`, `rocksdb.db-write-stall-stats`and APIs to examine them in a structured way. In particular, users of `GetMapProperty()` with property `kCFWriteStallStats`/`kDBWriteStallStats` can now use the functions in `WriteStallStatsMapKeys` to find stats in the map.
### Public API Changes
* Changed various functions and features in `Cache` that are mostly relevant to custom implementations or wrappers. In particular, asynchronous lookup functionality is moved from `Lookup()` to a new `StartAsyncLookup()` function.
## 8.0.0 (02/19/2023)
### Behavior changes
* `ReadOptions::verify_checksums=false` disables checksum verification for more reads of non-`CacheEntryRole::kDataBlock` blocks.
* In case of scan with async_io enabled, if posix doesn't support IOUring, Status::NotSupported error will be returned to the users. Initially that error was swallowed and reads were switched to synchronous reads.
### Bug Fixes
* Fixed a data race on `ColumnFamilyData::flush_reason` caused by concurrent flushes.
* Fixed an issue in `Get` and `MultiGet` when user-defined timestamps are enabled in combination with BlobDB.
* Fixed some atypical behaviors for `LockWAL()` such as allowing concurrent/recursive use and not expecting `UnlockWAL()` after non-OK result. See API comments.
* Fixed a feature interaction bug where for blobs `GetEntity` would expose the blob reference instead of the blob value.
* Fixed `DisableManualCompaction()` and `CompactRangeOptions::canceled` to cancel compactions even when they are waiting on conflicting compactions to finish
* Fixed a bug in which a successful `GetMergeOperands()` could transiently return `Status::MergeInProgress()`
* Return the correct error (Status::NotSupported()) to MultiGet caller when ReadOptions::async_io flag is true and IO uring is not enabled. Previously, Status::Corruption() was being returned when the actual failure was lack of async IO support.
* Fixed a bug in DB open/recovery from a compressed WAL that was caused due to incorrect handling of certain record fragments with the same offset within a WAL block.
### Feature Removal
* Remove RocksDB Lite.
* The feature block_cache_compressed is removed. Statistics related to it are removed too.
* Remove deprecated Env::LoadEnv(). Use Env::CreateFromString() instead.
* Remove deprecated FileSystem::Load(). Use FileSystem::CreateFromString() instead.
* Removed the deprecated version of these utility functions and the corresponding Java bindings: `LoadOptionsFromFile`, `LoadLatestOptions`, `CheckOptionsCompatibility`.
* Remove the FactoryFunc from the LoadObject method from the Customizable helper methods.
### Public API Changes
* Moved rarely-needed Cache class definition to new advanced_cache.h, and added a CacheWrapper class to advanced_cache.h. Minor changes to SimCache API definitions.
* Completely removed the following deprecated/obsolete statistics: the tickers `BLOCK_CACHE_INDEX_BYTES_EVICT`, `BLOCK_CACHE_FILTER_BYTES_EVICT`, `BLOOM_FILTER_MICROS`, `NO_FILE_CLOSES`, `STALL_L0_SLOWDOWN_MICROS`, `STALL_MEMTABLE_COMPACTION_MICROS`, `STALL_L0_NUM_FILES_MICROS`, `RATE_LIMIT_DELAY_MILLIS`, `NO_ITERATORS`, `NUMBER_FILTERED_DELETES`, `WRITE_TIMEDOUT`, `BLOB_DB_GC_NUM_KEYS_OVERWRITTEN`, `BLOB_DB_GC_NUM_KEYS_EXPIRED`, `BLOB_DB_GC_BYTES_OVERWRITTEN`, `BLOB_DB_GC_BYTES_EXPIRED`, `BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT` as well as the histograms `STALL_L0_SLOWDOWN_COUNT`, `STALL_MEMTABLE_COMPACTION_COUNT`, `STALL_L0_NUM_FILES_COUNT`, `HARD_RATE_LIMIT_DELAY_COUNT`, `SOFT_RATE_LIMIT_DELAY_COUNT`, `BLOB_DB_GC_MICROS`, and `NUM_DATA_BLOCKS_READ_PER_LEVEL`. Note that as a result, the C++ enum values of the still supported statistics have changed. Developers are advised to not rely on the actual numeric values.
* Deprecated IngestExternalFileOptions::write_global_seqno and change default to false. This option only needs to be set to true to generate a DB compatible with RocksDB versions before 5.16.0.
* Remove deprecated APIs `GetColumnFamilyOptionsFrom{Map|String}(const ColumnFamilyOptions&, ..)`, `GetDBOptionsFrom{Map|String}(const DBOptions&, ..)`, `GetBlockBasedTableOptionsFrom{Map|String}(const BlockBasedTableOptions& table_options, ..)` and ` GetPlainTableOptionsFrom{Map|String}(const PlainTableOptions& table_options,..)`.
* Added a subcode of `Status::Corruption`, `Status::SubCode::kMergeOperatorFailed`, for users to identify corruption failures originating in the merge operator, as opposed to RocksDB's internally identified data corruptions
### Build Changes
* The `make` build now builds a shared library by default instead of a static library. Use `LIB_MODE=static` to override.
### New Features
* Compaction filters are now supported for wide-column entities by means of the `FilterV3` API. See the comment of the API for more details.
* Added `do_not_compress_roles` to `CompressedSecondaryCacheOptions` to disable compression on certain kinds of block. Filter blocks are now not compressed by CompressedSecondaryCache by default.
* Added a new `MultiGetEntity` API that enables batched wide-column point lookups. See the API comments for more details.
## 7.10.0 (01/23/2023)
## Unreleased
### Behavior changes
* Make best-efforts recovery verify SST unique ID before Version construction (#10962)
* Introduce `epoch_number` and sort L0 files by `epoch_number` instead of `largest_seqno`. `epoch_number` represents the order of a file being flushed or ingested/imported. Compaction output file will be assigned with the minimum `epoch_number` among input files'. For L0, larger `epoch_number` indicates newer L0 file.
* Compaction output file cutting logic now considers range tombstone start keys. For example, SST partitioner now may receive ParitionRequest for range tombstone start keys.
### Bug Fixes
* Fixed a regression in iterator where range tombstones after `iterate_upper_bound` are processed.
@ -171,19 +16,12 @@ For Leveled Compaction users, `CompactRange()` with `bottommost_level_compaction
* Fixed a heap use after free bug in async scan prefetching when the scan thread and another thread try to read and load the same seek block into cache.
* Fixed a heap use after free in async scan prefetching if dictionary compression is enabled, in which case sync read of the compression dictionary gets mixed with async prefetching
* Fixed a data race bug where `CompactRange()` under `change_level=true` acts on a range overlapping with an ongoing file ingestion for level compaction. This will either result in overlapping file range corruption at a certain level, caught by `force_consistency_checks=true`, or potentially two identical keys, both with seqno 0, in two different levels (i.e., new data ends up in a lower/older level). The latter will be caught by an assertion in debug builds but goes silently and results in reads returning wrong results in release builds. This fix is general, so it also replaced previous fixes to a similar problem for `CompactFiles()` (#4665), general `CompactRange()`, and auto compaction (commits 5c64fb6 and 87dfc1d).
* Fixed a bug in compaction output cutting where small output files were produced because TTL file cutting states were not being updated (#11075).
### New Features
* When an SstPartitionerFactory is configured, CompactRange() now automatically selects for compaction any files overlapping a partition boundary that is in the compaction range, even if no actual entries are in the requested compaction range. With this feature, manual compaction can be used to (re-)establish SST partition points when SstPartitioner changes, without a full compaction.
* Add BackupEngine feature to exclude files from backup that are known to be backed up elsewhere, using `CreateBackupOptions::exclude_files_callback`. To restore the DB, the excluded files must be provided in alternative backup directories using `RestoreOptions::alternate_dirs`.
### Public API Changes
* Substantial changes have been made to the Cache class to support internal development goals. Direct use of Cache class members is discouraged and further breaking modifications are expected in the future. SecondaryCache has some related changes and implementations will need to be updated. (Unlike Cache, SecondaryCache is still intended to support user implementations, and disruptive changes will be avoided.) (#10975)
* Add `MergeOperationOutput::op_failure_scope` for merge operator users to control the blast radius of merge operator failures. Existing merge operator users do not need to make any change to preserve the old behavior
### Performance Improvements
* Updated xxHash source code, which should improve kXXH3 checksum speed, at least on ARM (#11098).
* Improved CPU efficiency of DB reads, from block cache access improvements (#10975).
### New Features
* Add BackupEngine feature to exclude files from backup that are known to be backed up elsewhere, using `CreateBackupOptions::exclude_files_callback`. To restore the DB, the excluded files must be provided in alternative backup directories using `RestoreOptions::alternate_dirs`.
## 7.9.0 (11/21/2022)
### Performance Improvements

@ -20,15 +20,12 @@ There are few options when compiling RocksDB:
depend on gflags. You will need to have gflags installed to run `make all`. This will compile RocksDB in debug mode. Don't
use binaries compiled by `make all` in production.
* By default the binary we produce is optimized for the CPU you're compiling on
(`-march=native` or the equivalent). To build a binary compatible with the most
general architecture supported by your CPU and compiler, set `PORTABLE=1` for
the build, but performance will suffer as many operations benefit from newer
and wider instructions. In addition to `PORTABLE=0` (default) and `PORTABLE=1`,
it can be set to an architecture name recognized by your compiler. For example,
on 64-bit x86, a reasonable compromise is `PORTABLE=haswell` which supports
many or most of the available optimizations while still being compatible with
most processors made since roughly 2013.
* By default the binary we produce is optimized for the platform you're compiling on
(`-march=native` or the equivalent). SSE4.2 will thus be enabled automatically if your
CPU supports it. To print a warning if your CPU does not support SSE4.2, build with
`USE_SSE=1 make static_lib` or, if using CMake, `cmake -DFORCE_SSE42=ON`. If you want
to build a portable binary, add `PORTABLE=1` before your make commands, like this:
`PORTABLE=1 make static_lib`.
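
Both wordings above boil down to the same Make-level switch; two examples built only from values that appear in the paragraphs themselves:

```sh
# Portable baseline build (both wordings document this form):
PORTABLE=1 make static_lib
# Named-architecture compromise from the first wording
# (haswell is the example given; any arch name your compiler accepts works):
PORTABLE=haswell make static_lib
```
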
## Dependencies
@ -51,11 +48,6 @@ most processors made since roughly 2013.
* If you wish to build the RocksJava static target, then cmake is required for building Snappy.
* If you wish to run microbench (e.g, `make microbench`, `make ribbon_bench` or `cmake -DWITH_BENCHMARK=1`), Google benchmark >= 1.6.0 is needed.
* You can do the following to install Google benchmark. These commands are copied from `./build_tools/ubuntu20_image/Dockerfile`:
`$ git clone --depth 1 --branch v1.7.0 https://github.com/google/benchmark.git ~/benchmark`
`$ cd ~/benchmark && mkdir build && cd build && cmake .. -GNinja -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_GTEST_TESTS=0 && ninja && ninja install`
## Supported platforms
@ -186,7 +178,7 @@ most processors made since roughly 2013.
gmake rocksdbjava
* **iOS**:
* Run: `TARGET_OS=IOS make static_lib`. When building a project which uses the rocksdb iOS library, make sure to define an important pre-processing macro: `IOS_CROSS_COMPILE`.
* Run: `TARGET_OS=IOS make static_lib`. When building the project which uses rocksdb iOS library, make sure to define two important pre-processing macros: `ROCKSDB_LITE` and `IOS_CROSS_COMPILE`.
* **Windows** (Visual Studio 2017 to up):
* Read and follow the instructions at CMakeLists.txt

@ -44,6 +44,13 @@ quoted_perl_command = $(subst ','\'',$(perl_command))
# Set the default DEBUG_LEVEL to 1
DEBUG_LEVEL?=1
# LIB_MODE says whether or not to use/build "shared" or "static" libraries.
# Mode "static" means to link against static libraries (.a)
# Mode "shared" means to link against shared libraries (.so, .sl, .dylib, etc)
#
# Set the default LIB_MODE to static
LIB_MODE?=static
# OBJ_DIR is where the object files reside. Default to the current directory
OBJ_DIR?=.
@ -74,42 +81,29 @@ else ifneq ($(filter jtest rocksdbjava%, $(MAKECMDGOALS)),)
endif
endif
# LIB_MODE says whether or not to use/build "shared" or "static" libraries.
# Mode "static" means to link against static libraries (.a)
# Mode "shared" means to link against shared libraries (.so, .sl, .dylib, etc)
#
ifeq ($(DEBUG_LEVEL), 0)
# For optimized, set the default LIB_MODE to static for code size/efficiency
LIB_MODE?=static
else
# For debug, set the default LIB_MODE to shared for efficient `make check` etc.
LIB_MODE?=shared
endif
$(info $$DEBUG_LEVEL is ${DEBUG_LEVEL})
$(info $$DEBUG_LEVEL is $(DEBUG_LEVEL), $$LIB_MODE is $(LIB_MODE))
# Detect what platform we're building on.
# Export some common variables that might have been passed as Make variables
# instead of environment variables.
dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; \
export CXXFLAGS="$(EXTRA_CXXFLAGS)"; \
export LDFLAGS="$(EXTRA_LDFLAGS)"; \
export COMPILE_WITH_ASAN="$(COMPILE_WITH_ASAN)"; \
export COMPILE_WITH_TSAN="$(COMPILE_WITH_TSAN)"; \
export COMPILE_WITH_UBSAN="$(COMPILE_WITH_UBSAN)"; \
export PORTABLE="$(PORTABLE)"; \
export ROCKSDB_NO_FBCODE="$(ROCKSDB_NO_FBCODE)"; \
export USE_CLANG="$(USE_CLANG)"; \
export LIB_MODE="$(LIB_MODE)"; \
export ROCKSDB_CXX_STANDARD="$(ROCKSDB_CXX_STANDARD)"; \
export USE_FOLLY="$(USE_FOLLY)"; \
"$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk"))
# this file is generated by the previous line to set build flags and sources
include make_config.mk
# Lite build flag.
LITE ?= 0
ifeq ($(LITE), 0)
ifneq ($(filter -DROCKSDB_LITE,$(OPT)),)
# Be backward compatible and support older format where OPT=-DROCKSDB_LITE is
# specified instead of LITE=1 on the command line.
LITE=1
endif
else ifeq ($(LITE), 1)
ifeq ($(filter -DROCKSDB_LITE,$(OPT)),)
OPT += -DROCKSDB_LITE
endif
endif
# Figure out optimize level.
ifneq ($(DEBUG_LEVEL), 2)
ifeq ($(LITE), 0)
OPTIMIZE_LEVEL ?= -O2
else
OPTIMIZE_LEVEL ?= -Os
endif
endif
# `OPTIMIZE_LEVEL` is empty when the user does not set it and `DEBUG_LEVEL=2`.
# In that case, the compiler default (`-O0` for gcc and clang) will be used.
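
To make the knobs in this hunk concrete, a hedged sketch of command lines exercising them (variable names come from this Makefile; targets and -j counts are illustrative):

```sh
LIB_MODE=static make -j8 check      # force static linking regardless of the default
DEBUG_LEVEL=0 make -j8 static_lib   # optimized build (the default LIB_MODE differs by side of this diff)
LITE=1 make -j8 static_lib          # lite build; treated as equivalent to OPT=-DROCKSDB_LITE above
```
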
@ -242,6 +236,25 @@ am__v_AR_1 =
AM_LINK = $(AM_V_CCLD)$(CXX) -L. $(patsubst lib%.a, -l%, $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^)) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
AM_SHARE = $(AM_V_CCLD) $(CXX) $(PLATFORM_SHARED_LDFLAGS)$@ -L. $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^) $(EXEC_LDFLAGS) $(LDFLAGS) -o $@
# Detect what platform we're building on.
# Export some common variables that might have been passed as Make variables
# instead of environment variables.
dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; \
export CXXFLAGS="$(EXTRA_CXXFLAGS)"; \
export LDFLAGS="$(EXTRA_LDFLAGS)"; \
export COMPILE_WITH_ASAN="$(COMPILE_WITH_ASAN)"; \
export COMPILE_WITH_TSAN="$(COMPILE_WITH_TSAN)"; \
export COMPILE_WITH_UBSAN="$(COMPILE_WITH_UBSAN)"; \
export PORTABLE="$(PORTABLE)"; \
export ROCKSDB_NO_FBCODE="$(ROCKSDB_NO_FBCODE)"; \
export USE_CLANG="$(USE_CLANG)"; \
export LIB_MODE="$(LIB_MODE)"; \
export ROCKSDB_CXX_STANDARD="$(ROCKSDB_CXX_STANDARD)"; \
export USE_FOLLY="$(USE_FOLLY)"; \
"$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk"))
# this file is generated by the previous line to set build flags and sources
include make_config.mk
ROCKSDB_PLUGIN_MKS = $(foreach plugin, $(ROCKSDB_PLUGINS), plugin/$(plugin)/*.mk)
include $(ROCKSDB_PLUGIN_MKS)
ROCKSDB_PLUGIN_PROTO =ROCKSDB_NAMESPACE::ObjectLibrary\&, const std::string\&
@ -324,6 +337,13 @@ endif
ifeq ($(PLATFORM), OS_SOLARIS)
PLATFORM_CXXFLAGS += -D _GLIBCXX_USE_C99
endif
ifneq ($(filter -DROCKSDB_LITE,$(OPT)),)
# found
CFLAGS += -fno-exceptions
CXXFLAGS += -fno-exceptions
# LUA is not supported under ROCKSDB_LITE
LUA_PATH =
endif
ifeq ($(LIB_MODE),shared)
# So that binaries are executable from build location, in addition to install location
@ -337,8 +357,8 @@ ifneq ($(MACHINE), arm64)
# linking with jemalloc (as it won't be arm64-compatible) and remove some other options
# set during platform detection
DISABLE_JEMALLOC=1
PLATFORM_CCFLAGS := $(filter-out -march=native, $(PLATFORM_CCFLAGS))
PLATFORM_CXXFLAGS := $(filter-out -march=native, $(PLATFORM_CXXFLAGS))
PLATFORM_CFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CFLAGS))
PLATFORM_CXXFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CXXFLAGS))
endif
endif
endif
@ -539,7 +559,7 @@ endif
ifdef USE_CLANG
# Used by some teams in Facebook
WARNING_FLAGS += -Wshift-sign-overflow -Wambiguous-reversed-operator
WARNING_FLAGS += -Wshift-sign-overflow
endif
ifeq ($(PLATFORM), OS_OPENBSD)
@ -1062,11 +1082,13 @@ check: all
rm -rf $(TEST_TMPDIR)
ifneq ($(PLATFORM), OS_AIX)
$(PYTHON) tools/check_all_python.py
ifeq ($(filter -DROCKSDB_LITE,$(OPT)),)
ifndef ASSERT_STATUS_CHECKED # not yet working with these tests
$(PYTHON) tools/ldb_test.py
sh tools/rocksdb_dump_test.sh
endif
endif
endif
ifndef SKIP_FORMAT_BUCK_CHECKS
$(MAKE) check-format
$(MAKE) check-buck-targets
@ -1222,9 +1244,9 @@ clean: clean-ext-libraries-all clean-rocks clean-rocksjava
clean-not-downloaded: clean-ext-libraries-bin clean-rocks clean-not-downloaded-rocksjava
clean-rocks:
# Not practical to exactly match all versions/variants in naming (e.g. debug or not)
rm -f ${LIBNAME}*.so* ${LIBNAME}*.a
rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(PARALLEL_TEST) $(MICROBENCHS)
echo shared=$(ALL_SHARED_LIBS)
echo static=$(ALL_STATIC_LIBS)
rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(PARALLEL_TEST) $(ALL_STATIC_LIBS) $(ALL_SHARED_LIBS) $(MICROBENCHS)
rm -rf $(CLEAN_FILES) ios-x86 ios-arm scan_build_report
$(FIND) . -name "*.[oda]" -exec rm -f {} \;
$(FIND) . -type f \( -name "*.gcda" -o -name "*.gcno" \) -exec rm -f {} \;
@ -1417,9 +1439,6 @@ thread_local_test: $(OBJ_DIR)/util/thread_local_test.o $(TEST_LIBRARY) $(LIBRARY
work_queue_test: $(OBJ_DIR)/util/work_queue_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
udt_util_test: $(OBJ_DIR)/util/udt_util_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
corruption_test: $(OBJ_DIR)/db/corruption_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
@ -1483,9 +1502,6 @@ db_compaction_filter_test: $(OBJ_DIR)/db/db_compaction_filter_test.o $(TEST_LIBR
db_compaction_test: $(OBJ_DIR)/db/db_compaction_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
db_clip_test: $(OBJ_DIR)/db/db_clip_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
db_dynamic_level_test: $(OBJ_DIR)/db/db_dynamic_level_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
@ -2054,7 +2070,7 @@ JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux
ifeq ($(PLATFORM), OS_SOLARIS)
ARCH := $(shell isainfo -b)
else ifeq ($(PLATFORM), OS_OPENBSD)
ifneq (,$(filter amd64 ppc64 ppc64le s390x arm64 aarch64 sparc64 loongarch64, $(MACHINE)))
ifneq (,$(filter amd64 ppc64 ppc64le s390x arm64 aarch64 sparc64, $(MACHINE)))
ARCH := 64
else
ARCH := 32
@ -2075,7 +2091,7 @@ ifneq ($(origin JNI_LIBC), undefined)
endif
ifeq (,$(ROCKSDBJNILIB))
ifneq (,$(filter ppc% s390x arm64 aarch64 sparc64 loongarch64, $(MACHINE)))
ifneq (,$(filter ppc% s390x arm64 aarch64 sparc64, $(MACHINE)))
ROCKSDBJNILIB = librocksdbjni-linux-$(MACHINE)$(JNI_LIBC_POSTFIX).so
else
ROCKSDBJNILIB = librocksdbjni-linux$(ARCH)$(JNI_LIBC_POSTFIX).so
@ -2441,8 +2457,6 @@ checkout_folly:
@# NOTE: this hack is required for gcc in some cases
perl -pi -e 's/(__has_include.<experimental.memory_resource>.)/__cpp_rtti && $$1/' third-party/folly/folly/memory/MemoryResource.h
CXX_M_FLAGS = $(filter -m%, $(CXXFLAGS))
build_folly:
FOLLY_INST_PATH=`cd third-party/folly; $(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir`; \
if [ "$$FOLLY_INST_PATH" ]; then \
@ -2453,8 +2467,8 @@ build_folly:
fi
# Restore the original version of Invoke.h with boost dependency
cd third-party/folly && ${GIT_COMMAND} checkout folly/functional/Invoke.h
cd third-party/folly && \
CXXFLAGS=" $(CXX_M_FLAGS) -DHAVE_CXX11_ATOMIC " $(PYTHON) build/fbcode_builder/getdeps.py build --no-tests
cd third-party/folly && MAYBE_AVX2=`echo $(CXXFLAGS) | grep -o -- -DHAVE_AVX2 | sed 's/-DHAVE_AVX2/-mavx2/g' || true` && \
CXXFLAGS=" $$MAYBE_AVX2 -DHAVE_CXX11_ATOMIC " $(PYTHON) build/fbcode_builder/getdeps.py build --no-tests
# ---------------------------------------------------------------------------
# Build size testing
@ -2475,6 +2489,18 @@ build_size:
$(REPORT_BUILD_STATISTIC) rocksdb.build_size.shared_lib $$(stat --printf="%s" `readlink -f librocksdb.so`)
strip `readlink -f librocksdb.so`
$(REPORT_BUILD_STATISTIC) rocksdb.build_size.shared_lib_stripped $$(stat --printf="%s" `readlink -f librocksdb.so`)
# === lite build, static ===
$(MAKE) clean
$(MAKE) LITE=1 static_lib
$(REPORT_BUILD_STATISTIC) rocksdb.build_size.static_lib_lite $$(stat --printf="%s" librocksdb.a)
strip librocksdb.a
$(REPORT_BUILD_STATISTIC) rocksdb.build_size.static_lib_lite_stripped $$(stat --printf="%s" librocksdb.a)
# === lite build, shared ===
$(MAKE) clean
$(MAKE) LITE=1 shared_lib
$(REPORT_BUILD_STATISTIC) rocksdb.build_size.shared_lib_lite $$(stat --printf="%s" `readlink -f librocksdb.so`)
strip `readlink -f librocksdb.so`
$(REPORT_BUILD_STATISTIC) rocksdb.build_size.shared_lib_lite_stripped $$(stat --printf="%s" `readlink -f librocksdb.so`)
# ---------------------------------------------------------------------------
# Platform-specific compilation

@ -5,4 +5,3 @@ This is the list of all known third-party plugins for RocksDB. If something is m
* [ZenFS](https://github.com/westerndigitalcorporation/zenfs): a file system for zoned block devices
* [RADOS](https://github.com/riversand963/rocksdb-rados-env): an Env used for interacting with RADOS. Migrated from RocksDB main repo.
* [PMEM](https://github.com/pmem/pmem-rocksdb-plugin): a collection of plugins to enable Persistent Memory on RocksDB.
* [IPPCP](https://github.com/intel/ippcp-plugin-rocksdb): a plugin to enable encryption on RocksDB based on Intel optimized open source IPP-Crypto library.

@ -1,6 +1,8 @@
## RocksDB: A Persistent Key-Value Store for Flash and RAM Storage
[![CircleCI Status](https://circleci.com/gh/facebook/rocksdb.svg?style=svg)](https://circleci.com/gh/facebook/rocksdb)
[![Appveyor Build status](https://ci.appveyor.com/api/projects/status/fbgfu0so3afcno78/branch/main?svg=true)](https://ci.appveyor.com/project/Facebook/rocksdb/branch/main)
[![PPC64le Build Status](http://140-211-168-68-openstack.osuosl.org:8080/buildStatus/icon?job=rocksdb&style=plastic)](http://140-211-168-68-openstack.osuosl.org:8080/job/rocksdb)
RocksDB is developed and maintained by Facebook Database Engineering Team.
It is built on earlier work on [LevelDB](https://github.com/google/leveldb) by Sanjay Ghemawat (sanjay@google.com)

@ -0,0 +1,21 @@
# RocksDBLite
RocksDBLite is a project focused on mobile use cases, which don't need a lot of the fancy things we've built for server workloads and which are very sensitive to binary size. For that reason, we added a compile flag, ROCKSDB_LITE, that comments out a lot of the nonessential code and keeps the binary lean.
Some examples of the features disabled by ROCKSDB_LITE:
* compiled-in support for LDB tool
* No backup engine
* No support for replication (which we provide in the form of TransactionalIterator)
* No advanced monitoring tools
* No special-purpose memtables that are highly optimized for specific use cases
* No Transactions
When adding a new big feature to RocksDB, please add ROCKSDB_LITE compile guard if:
* Nobody from mobile really needs your feature,
* Your feature is adding a lot of weight to the binary.
Don't add ROCKSDB_LITE compile guard if:
* It would introduce a lot of code complexity. Compile guards make code harder to read. It's a trade-off.
* Your feature is not adding a lot of weight.
If unsure, ask. :)
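
As a usage note, a hedged pair of build commands for producing a lite binary, using only flags that appear elsewhere in this diff:

```sh
# Makefile path: LITE=1 implies OPT+=-DROCKSDB_LITE (plus -fno-exceptions and -Os per this diff)
LITE=1 make static_lib
# CMake path: the ROCKSDB_LITE option shown in the CMakeLists.txt hunk above
mkdir -p build && cd build && cmake .. -DROCKSDB_LITE=ON && make
```
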

@ -11,7 +11,6 @@ load("//rocks/buckifier:defs.bzl", "cpp_library_wrapper","rocks_cpp_library_wrap
cpp_library_wrapper(name="rocksdb_lib", srcs=[
"cache/cache.cc",
"cache/cache_entry_roles.cc",
"cache/cache_helpers.cc",
"cache/cache_key.cc",
"cache/cache_reservation_manager.cc",
"cache/charged_cache.cc",
@ -19,7 +18,6 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"cache/compressed_secondary_cache.cc",
"cache/lru_cache.cc",
"cache/secondary_cache.cc",
"cache/secondary_cache_adapter.cc",
"cache/sharded_cache.cc",
"db/arena_wrapped_db_iter.cc",
"db/blob/blob_contents.cc",
@ -106,7 +104,6 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"db/write_batch.cc",
"db/write_batch_base.cc",
"db/write_controller.cc",
"db/write_stall_stats.cc",
"db/write_thread.cc",
"env/composite_env.cc",
"env/env.cc",
@ -183,7 +180,6 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"table/block_based/block_based_table_iterator.cc",
"table/block_based/block_based_table_reader.cc",
"table/block_based/block_builder.cc",
"table/block_based/block_cache.cc",
"table/block_based/block_prefetcher.cc",
"table/block_based/block_prefix_index.cc",
"table/block_based/data_block_footer.cc",
@ -250,7 +246,6 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"util/concurrent_task_limiter_impl.cc",
"util/crc32c.cc",
"util/crc32c_arm64.cc",
"util/data_structure.cc",
"util/dynamic_bloom.cc",
"util/file_checksum_helper.cc",
"util/hash.cc",
@ -264,8 +259,6 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"util/string_util.cc",
"util/thread_local.cc",
"util/threadpool_imp.cc",
"util/udt_util.cc",
"util/write_batch_util.cc",
"util/xxhash.cc",
"utilities/agg_merge/agg_merge.cc",
"utilities/backup/backup_engine.cc",
@ -356,14 +349,352 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"//folly/synchronization:distributed_mutex",
], headers=None, link_whole=False, extra_test_libs=False)
cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[], deps=[":rocksdb_lib"], headers=None, link_whole=True, extra_test_libs=False)
cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[
"cache/cache.cc",
"cache/cache_entry_roles.cc",
"cache/cache_key.cc",
"cache/cache_reservation_manager.cc",
"cache/charged_cache.cc",
"cache/clock_cache.cc",
"cache/compressed_secondary_cache.cc",
"cache/lru_cache.cc",
"cache/secondary_cache.cc",
"cache/sharded_cache.cc",
"db/arena_wrapped_db_iter.cc",
"db/blob/blob_contents.cc",
"db/blob/blob_fetcher.cc",
"db/blob/blob_file_addition.cc",
"db/blob/blob_file_builder.cc",
"db/blob/blob_file_cache.cc",
"db/blob/blob_file_garbage.cc",
"db/blob/blob_file_meta.cc",
"db/blob/blob_file_reader.cc",
"db/blob/blob_garbage_meter.cc",
"db/blob/blob_log_format.cc",
"db/blob/blob_log_sequential_reader.cc",
"db/blob/blob_log_writer.cc",
"db/blob/blob_source.cc",
"db/blob/prefetch_buffer_collection.cc",
"db/builder.cc",
"db/c.cc",
"db/column_family.cc",
"db/compaction/compaction.cc",
"db/compaction/compaction_iterator.cc",
"db/compaction/compaction_job.cc",
"db/compaction/compaction_outputs.cc",
"db/compaction/compaction_picker.cc",
"db/compaction/compaction_picker_fifo.cc",
"db/compaction/compaction_picker_level.cc",
"db/compaction/compaction_picker_universal.cc",
"db/compaction/compaction_service_job.cc",
"db/compaction/compaction_state.cc",
"db/compaction/sst_partitioner.cc",
"db/compaction/subcompaction_state.cc",
"db/convenience.cc",
"db/db_filesnapshot.cc",
"db/db_impl/compacted_db_impl.cc",
"db/db_impl/db_impl.cc",
"db/db_impl/db_impl_compaction_flush.cc",
"db/db_impl/db_impl_debug.cc",
"db/db_impl/db_impl_experimental.cc",
"db/db_impl/db_impl_files.cc",
"db/db_impl/db_impl_open.cc",
"db/db_impl/db_impl_readonly.cc",
"db/db_impl/db_impl_secondary.cc",
"db/db_impl/db_impl_write.cc",
"db/db_info_dumper.cc",
"db/db_iter.cc",
"db/dbformat.cc",
"db/error_handler.cc",
"db/event_helpers.cc",
"db/experimental.cc",
"db/external_sst_file_ingestion_job.cc",
"db/file_indexer.cc",
"db/flush_job.cc",
"db/flush_scheduler.cc",
"db/forward_iterator.cc",
"db/import_column_family_job.cc",
"db/internal_stats.cc",
"db/log_reader.cc",
"db/log_writer.cc",
"db/logs_with_prep_tracker.cc",
"db/malloc_stats.cc",
"db/memtable.cc",
"db/memtable_list.cc",
"db/merge_helper.cc",
"db/merge_operator.cc",
"db/output_validator.cc",
"db/periodic_task_scheduler.cc",
"db/range_del_aggregator.cc",
"db/range_tombstone_fragmenter.cc",
"db/repair.cc",
"db/seqno_to_time_mapping.cc",
"db/snapshot_impl.cc",
"db/table_cache.cc",
"db/table_properties_collector.cc",
"db/transaction_log_impl.cc",
"db/trim_history_scheduler.cc",
"db/version_builder.cc",
"db/version_edit.cc",
"db/version_edit_handler.cc",
"db/version_set.cc",
"db/wal_edit.cc",
"db/wal_manager.cc",
"db/wide/wide_column_serialization.cc",
"db/wide/wide_columns.cc",
"db/write_batch.cc",
"db/write_batch_base.cc",
"db/write_controller.cc",
"db/write_thread.cc",
"env/composite_env.cc",
"env/env.cc",
"env/env_chroot.cc",
"env/env_encryption.cc",
"env/env_posix.cc",
"env/file_system.cc",
"env/file_system_tracer.cc",
"env/fs_posix.cc",
"env/fs_remap.cc",
"env/io_posix.cc",
"env/mock_env.cc",
"env/unique_id_gen.cc",
"file/delete_scheduler.cc",
"file/file_prefetch_buffer.cc",
"file/file_util.cc",
"file/filename.cc",
"file/line_file_reader.cc",
"file/random_access_file_reader.cc",
"file/read_write_util.cc",
"file/readahead_raf.cc",
"file/sequence_file_reader.cc",
"file/sst_file_manager_impl.cc",
"file/writable_file_writer.cc",
"logging/auto_roll_logger.cc",
"logging/event_logger.cc",
"logging/log_buffer.cc",
"memory/arena.cc",
"memory/concurrent_arena.cc",
"memory/jemalloc_nodump_allocator.cc",
"memory/memkind_kmem_allocator.cc",
"memory/memory_allocator.cc",
"memtable/alloc_tracker.cc",
"memtable/hash_linklist_rep.cc",
"memtable/hash_skiplist_rep.cc",
"memtable/skiplistrep.cc",
"memtable/vectorrep.cc",
"memtable/write_buffer_manager.cc",
"monitoring/histogram.cc",
"monitoring/histogram_windowing.cc",
"monitoring/in_memory_stats_history.cc",
"monitoring/instrumented_mutex.cc",
"monitoring/iostats_context.cc",
"monitoring/perf_context.cc",
"monitoring/perf_level.cc",
"monitoring/persistent_stats_history.cc",
"monitoring/statistics.cc",
"monitoring/thread_status_impl.cc",
"monitoring/thread_status_updater.cc",
"monitoring/thread_status_updater_debug.cc",
"monitoring/thread_status_util.cc",
"monitoring/thread_status_util_debug.cc",
"options/cf_options.cc",
"options/configurable.cc",
"options/customizable.cc",
"options/db_options.cc",
"options/options.cc",
"options/options_helper.cc",
"options/options_parser.cc",
"port/mmap.cc",
"port/port_posix.cc",
"port/stack_trace.cc",
"port/win/env_default.cc",
"port/win/env_win.cc",
"port/win/io_win.cc",
"port/win/port_win.cc",
"port/win/win_logger.cc",
"port/win/win_thread.cc",
"table/adaptive/adaptive_table_factory.cc",
"table/block_based/binary_search_index_reader.cc",
"table/block_based/block.cc",
"table/block_based/block_based_table_builder.cc",
"table/block_based/block_based_table_factory.cc",
"table/block_based/block_based_table_iterator.cc",
"table/block_based/block_based_table_reader.cc",
"table/block_based/block_builder.cc",
"table/block_based/block_prefetcher.cc",
"table/block_based/block_prefix_index.cc",
"table/block_based/data_block_footer.cc",
"table/block_based/data_block_hash_index.cc",
"table/block_based/filter_block_reader_common.cc",
"table/block_based/filter_policy.cc",
"table/block_based/flush_block_policy.cc",
"table/block_based/full_filter_block.cc",
"table/block_based/hash_index_reader.cc",
"table/block_based/index_builder.cc",
"table/block_based/index_reader_common.cc",
"table/block_based/parsed_full_filter_block.cc",
"table/block_based/partitioned_filter_block.cc",
"table/block_based/partitioned_index_iterator.cc",
"table/block_based/partitioned_index_reader.cc",
"table/block_based/reader_common.cc",
"table/block_based/uncompression_dict_reader.cc",
"table/block_fetcher.cc",
"table/compaction_merging_iterator.cc",
"table/cuckoo/cuckoo_table_builder.cc",
"table/cuckoo/cuckoo_table_factory.cc",
"table/cuckoo/cuckoo_table_reader.cc",
"table/format.cc",
"table/get_context.cc",
"table/iterator.cc",
"table/merging_iterator.cc",
"table/meta_blocks.cc",
"table/persistent_cache_helper.cc",
"table/plain/plain_table_bloom.cc",
"table/plain/plain_table_builder.cc",
"table/plain/plain_table_factory.cc",
"table/plain/plain_table_index.cc",
"table/plain/plain_table_key_coding.cc",
"table/plain/plain_table_reader.cc",
"table/sst_file_dumper.cc",
"table/sst_file_reader.cc",
"table/sst_file_writer.cc",
"table/table_factory.cc",
"table/table_properties.cc",
"table/two_level_iterator.cc",
"table/unique_id.cc",
"test_util/sync_point.cc",
"test_util/sync_point_impl.cc",
"test_util/transaction_test_util.cc",
"tools/dump/db_dump_tool.cc",
"tools/io_tracer_parser_tool.cc",
"tools/ldb_cmd.cc",
"tools/ldb_tool.cc",
"tools/sst_dump_tool.cc",
"trace_replay/block_cache_tracer.cc",
"trace_replay/io_tracer.cc",
"trace_replay/trace_record.cc",
"trace_replay/trace_record_handler.cc",
"trace_replay/trace_record_result.cc",
"trace_replay/trace_replay.cc",
"util/async_file_reader.cc",
"util/build_version.cc",
"util/cleanable.cc",
"util/coding.cc",
"util/compaction_job_stats_impl.cc",
"util/comparator.cc",
"util/compression.cc",
"util/compression_context_cache.cc",
"util/concurrent_task_limiter_impl.cc",
"util/crc32c.cc",
"util/crc32c_arm64.cc",
"util/dynamic_bloom.cc",
"util/file_checksum_helper.cc",
"util/hash.cc",
"util/murmurhash.cc",
"util/random.cc",
"util/rate_limiter.cc",
"util/ribbon_config.cc",
"util/slice.cc",
"util/status.cc",
"util/stderr_logger.cc",
"util/string_util.cc",
"util/thread_local.cc",
"util/threadpool_imp.cc",
"util/xxhash.cc",
"utilities/agg_merge/agg_merge.cc",
"utilities/backup/backup_engine.cc",
"utilities/blob_db/blob_compaction_filter.cc",
"utilities/blob_db/blob_db.cc",
"utilities/blob_db/blob_db_impl.cc",
"utilities/blob_db/blob_db_impl_filesnapshot.cc",
"utilities/blob_db/blob_dump_tool.cc",
"utilities/blob_db/blob_file.cc",
"utilities/cache_dump_load.cc",
"utilities/cache_dump_load_impl.cc",
"utilities/cassandra/cassandra_compaction_filter.cc",
"utilities/cassandra/format.cc",
"utilities/cassandra/merge_operator.cc",
"utilities/checkpoint/checkpoint_impl.cc",
"utilities/compaction_filters.cc",
"utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc",
"utilities/convenience/info_log_finder.cc",
"utilities/counted_fs.cc",
"utilities/debug.cc",
"utilities/env_mirror.cc",
"utilities/env_timed.cc",
"utilities/fault_injection_env.cc",
"utilities/fault_injection_fs.cc",
"utilities/fault_injection_secondary_cache.cc",
"utilities/leveldb_options/leveldb_options.cc",
"utilities/memory/memory_util.cc",
"utilities/merge_operators.cc",
"utilities/merge_operators/bytesxor.cc",
"utilities/merge_operators/max.cc",
"utilities/merge_operators/put.cc",
"utilities/merge_operators/sortlist.cc",
"utilities/merge_operators/string_append/stringappend.cc",
"utilities/merge_operators/string_append/stringappend2.cc",
"utilities/merge_operators/uint64add.cc",
"utilities/object_registry.cc",
"utilities/option_change_migration/option_change_migration.cc",
"utilities/options/options_util.cc",
"utilities/persistent_cache/block_cache_tier.cc",
"utilities/persistent_cache/block_cache_tier_file.cc",
"utilities/persistent_cache/block_cache_tier_metadata.cc",
"utilities/persistent_cache/persistent_cache_tier.cc",
"utilities/persistent_cache/volatile_tier_impl.cc",
"utilities/simulator_cache/cache_simulator.cc",
"utilities/simulator_cache/sim_cache.cc",
"utilities/table_properties_collectors/compact_on_deletion_collector.cc",
"utilities/trace/file_trace_reader_writer.cc",
"utilities/trace/replayer_impl.cc",
"utilities/transactions/lock/lock_manager.cc",
"utilities/transactions/lock/point/point_lock_manager.cc",
"utilities/transactions/lock/point/point_lock_tracker.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc",
"utilities/transactions/lock/range/range_tree/lib/standalone_port.cc",
"utilities/transactions/lock/range/range_tree/lib/util/dbt.cc",
"utilities/transactions/lock/range/range_tree/lib/util/memarena.cc",
"utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc",
"utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc",
"utilities/transactions/optimistic_transaction.cc",
"utilities/transactions/optimistic_transaction_db_impl.cc",
"utilities/transactions/pessimistic_transaction.cc",
"utilities/transactions/pessimistic_transaction_db.cc",
"utilities/transactions/snapshot_checker.cc",
"utilities/transactions/transaction_base.cc",
"utilities/transactions/transaction_db_mutex_impl.cc",
"utilities/transactions/transaction_util.cc",
"utilities/transactions/write_prepared_txn.cc",
"utilities/transactions/write_prepared_txn_db.cc",
"utilities/transactions/write_unprepared_txn.cc",
"utilities/transactions/write_unprepared_txn_db.cc",
"utilities/ttl/db_ttl_impl.cc",
"utilities/wal_filter.cc",
"utilities/write_batch_with_index/write_batch_with_index.cc",
"utilities/write_batch_with_index/write_batch_with_index_internal.cc",
], deps=[
"//folly/container:f14_hash",
"//folly/experimental/coro:blocking_wait",
"//folly/experimental/coro:collect",
"//folly/experimental/coro:coroutine",
"//folly/experimental/coro:task",
"//folly/synchronization:distributed_mutex",
], headers=None, link_whole=True, extra_test_libs=False)
cpp_library_wrapper(name="rocksdb_test_lib", srcs=[
"db/db_test_util.cc",
"db/db_with_timestamp_test_util.cc",
"table/mock_table.cc",
"test_util/mock_time_env.cc",
"test_util/secondary_cache_test_util.cc",
"test_util/testharness.cc",
"test_util/testutil.cc",
"tools/block_cache_analyzer/block_cache_trace_analyzer.cc",
@ -394,7 +725,6 @@ rocks_cpp_library_wrapper(name="rocksdb_stress_lib", srcs=[
"db_stress_tool/db_stress_test_base.cc",
"db_stress_tool/db_stress_tool.cc",
"db_stress_tool/expected_state.cc",
"db_stress_tool/expected_value.cc",
"db_stress_tool/multi_ops_txns_stress.cc",
"db_stress_tool/no_batched_ops_stress.cc",
"test_util/testutil.cc",
@ -4752,12 +5082,6 @@ cpp_unittest_wrapper(name="db_bloom_filter_test",
extra_compiler_flags=[])
cpp_unittest_wrapper(name="db_clip_test",
srcs=["db/db_clip_test.cc"],
deps=[":rocksdb_test_lib"],
extra_compiler_flags=[])
cpp_unittest_wrapper(name="db_compaction_filter_test",
srcs=["db/db_compaction_filter_test.cc"],
deps=[":rocksdb_test_lib"],
@ -5510,12 +5834,6 @@ cpp_unittest_wrapper(name="ttl_test",
extra_compiler_flags=[])
cpp_unittest_wrapper(name="udt_util_test",
srcs=["util/udt_util_test.cc"],
deps=[":rocksdb_test_lib"],
extra_compiler_flags=[])
cpp_unittest_wrapper(name="util_merge_operators_test",
srcs=["utilities/util_merge_operators_test.cc"],
deps=[":rocksdb_test_lib"],

@ -15,28 +15,6 @@ At Facebook, we use RocksDB as storage engines in multiple data management servi
[2] https://code.facebook.com/posts/357056558062811/logdevice-a-distributed-data-store-for-logs/
## Bilibili
[Bilibili](bilibili.com) [uses](https://www.alluxio.io/blog/when-ai-meets-alluxio-at-bilibili-building-an-efficient-ai-platform-for-data-preprocessing-and-model-training/) Alluxio to speed up its ML training workloads, and Alluxio uses RocksDB to store its filesystem metadata, so Bilibili uses RocksDB.
Bilibili's [real-time platform](https://www.alibabacloud.com/blog/architecture-and-practices-of-bilibilis-real-time-platform_596676) uses Flink, and uses RocksDB as Flink's state store.
## TikTok
TikTok, or its parent company ByteDance, uses RocksDB as the storage engine for some storage systems, such as its distributed graph database [ByteGraph](https://vldb.org/pvldb/vol15/p3306-li.pdf).
Also, TikTok uses [Alluxio](alluxio.io) to [speed up Presto queries](https://www.alluxio.io/resources/videos/improving-presto-performance-with-alluxio-at-tiktok/), and Alluxio stores the files' metadata in RocksDB.
## FoundationDB
[FoundationDB](https://www.foundationdb.org/) [uses](https://github.com/apple/foundationdb/blob/377f1f692da6ab2fe5bdac57035651db3e5fb66d/fdbserver/KeyValueStoreRocksDB.actor.cpp) RocksDB to implement a [key-value store interface](https://github.com/apple/foundationdb/blob/377f1f692da6ab2fe5bdac57035651db3e5fb66d/fdbserver/KeyValueStoreRocksDB.actor.cpp#L1127) in its server backend.
## Apple
Apple [uses](https://opensource.apple.com/projects/foundationdb/) FoundationDB, so it also uses RocksDB.
## Snowflake
Snowflake [uses](https://www.snowflake.com/blog/how-foundationdb-powers-snowflake-metadata-forward/) FoundationDB, so it also uses RocksDB.
## Microsoft
The Bing search engine from Microsoft uses RocksDB as the storage engine for its web data platform: https://blogs.bing.com/Engineering-Blog/october-2021/RocksDB-in-Microsoft-Bing
## LinkedIn
Two different use cases at LinkedIn use RocksDB as a storage engine:
@ -48,9 +26,6 @@ Learn more about those use cases in a Tech Talk by Ankit Gupta and Naveen Somasu
## Yahoo
Yahoo uses RocksDB as the storage engine for its biggest distributed data store, Sherpa. Learn more about it here: http://yahooeng.tumblr.com/post/120730204806/sherpa-scales-new-heights
## Tencent
[PaxosStore](https://github.com/Tencent/paxosstore) is a distributed database supporting WeChat. It uses RocksDB as its storage engine.
## Baidu
[Apache Doris](http://doris.apache.org/master/en/) is an MPP analytical database engine released by Baidu. It [uses RocksDB](http://doris.apache.org/master/en/administrator-guide/operation/tablet-meta-tool.html) to manage its tablet's metadata.
@ -104,18 +79,9 @@ quasardb uses a heavily tuned RocksDB as its persistence layer.
## TiKV
[TiKV](https://github.com/pingcap/tikv) is a GEO-replicated, high-performance, distributed, transactional key-value database. TiKV is powered by Rust and Raft. TiKV uses RocksDB as its persistence layer.
## TiDB
[TiDB](https://github.com/pingcap/tidb) uses the TiKV distributed key-value database, so it uses RocksDB.
## PingCAP
[PingCAP](https://www.pingcap.com/) is the company behind TiDB; its cloud database service uses RocksDB.
## Apache Spark
[Spark Structured Streaming](https://docs.databricks.com/structured-streaming/rocksdb-state-store.html) uses RocksDB as the local state store.
## Databricks
[Databricks](https://www.databricks.com/) [replaces AWS RDS with TiDB](https://www.pingcap.com/case-study/how-databricks-tackles-the-scalability-limit-with-a-mysql-alternative/) for scalability, so it uses RocksDB.
## Apache Flink
[Apache Flink](https://flink.apache.org/news/2016/03/08/release-1.0.0.html) uses RocksDB to store state locally on a machine.

@ -26,7 +26,7 @@ from util import ColorString
# $python3 buckifier/buckify_rocksdb.py \
# '{"fake": {
# "extra_deps": [":test_dep", "//fakes/module:mock1"],
# "extra_compiler_flags": ["-DFOO_BAR", "-Os"]
# "extra_compiler_flags": ["-DROCKSDB_LITE", "-Os"]
# }
# }'
# (Generated TARGETS file has test_dep and mock1 as dependencies for RocksDB
@ -154,9 +154,16 @@ def generate_targets(repo_path, deps_map):
# rocksdb_whole_archive_lib
TARGETS.add_library(
"rocksdb_whole_archive_lib",
[],
src_mk["LIB_SOURCES"] +
# always add range_tree, it's only excluded on ppc64, which we don't use internally
src_mk["RANGE_TREE_SOURCES"] + src_mk["TOOL_LIB_SOURCES"],
deps=[
":rocksdb_lib",
"//folly/container:f14_hash",
"//folly/experimental/coro:blocking_wait",
"//folly/experimental/coro:collect",
"//folly/experimental/coro:coroutine",
"//folly/experimental/coro:task",
"//folly/synchronization:distributed_mutex",
],
headers=None,
extra_external_deps="",

@ -63,7 +63,13 @@ if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then
if [ "$LIB_MODE" == "shared" ]; then
PIC_BUILD=1
fi
source "$PWD/build_tools/fbcode_config_platform010.sh"
if [ -n "$ROCKSDB_FBCODE_BUILD_WITH_PLATFORM010" ]; then
source "$PWD/build_tools/fbcode_config_platform010.sh"
elif [ -n "$ROCKSDB_FBCODE_BUILD_WITH_PLATFORM009" ]; then
source "$PWD/build_tools/fbcode_config_platform009.sh"
else
source "$PWD/build_tools/fbcode_config_platform009.sh"
fi
fi
# Delete existing output, if it exists
@ -148,7 +154,7 @@ case "$TARGET_OS" in
;;
IOS)
PLATFORM=IOS
COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE "
COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE -DROCKSDB_LITE"
PLATFORM_SHARED_EXT=dylib
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
CROSS_COMPILE=true
@ -419,7 +425,7 @@ EOF
if ! test $ROCKSDB_DISABLE_JEMALLOC; then
# Test whether jemalloc is available
if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS $LDFLAGS -x c++ - -o test.o -ljemalloc \
if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o -ljemalloc \
2>/dev/null; then
# This will enable some preprocessor identifiers in the Makefile
JEMALLOC=1
@ -428,19 +434,12 @@ EOF
WITH_JEMALLOC_FLAG=1
# check for JEMALLOC installed with HomeBrew
if [ "$PLATFORM" == "OS_MACOSX" ]; then
if [ "$TARGET_ARCHITECTURE" = "arm64" ]; then
# on M1 Macs, homebrew installs here instead of /usr/local
JEMALLOC_PREFIX="/opt/homebrew"
else
JEMALLOC_PREFIX="/usr/local"
fi
if hash brew 2>/dev/null && brew ls --versions jemalloc > /dev/null; then
JEMALLOC_VER=$(brew ls --versions jemalloc | tail -n 1 | cut -f 2 -d ' ')
JEMALLOC_INCLUDE="-I${JEMALLOC_PREFIX}/Cellar/jemalloc/${JEMALLOC_VER}/include"
JEMALLOC_LIB="${JEMALLOC_PREFIX}/Cellar/jemalloc/${JEMALLOC_VER}/lib/libjemalloc_pic.a"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -L${JEMALLOC_PREFIX}/lib $JEMALLOC_LIB"
JAVA_LDFLAGS="$JAVA_LDFLAGS -L${JEMALLOC_PREFIX}/lib $JEMALLOC_LIB"
JAVA_STATIC_LDFLAGS="$JAVA_STATIC_LDFLAGS -L${JEMALLOC_PREFIX}/lib $JEMALLOC_LIB"
JEMALLOC_INCLUDE="-I/usr/local/Cellar/jemalloc/${JEMALLOC_VER}/include"
JEMALLOC_LIB="/usr/local/Cellar/jemalloc/${JEMALLOC_VER}/lib/libjemalloc_pic.a"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS $JEMALLOC_LIB"
JAVA_STATIC_LDFLAGS="$JAVA_STATIC_LDFLAGS $JEMALLOC_LIB"
fi
fi
fi
@ -628,7 +627,7 @@ EOF
fi
fi
if [ "$PORTABLE" == "" ] || [ "$PORTABLE" == 0 ]; then
if test "0$PORTABLE" -eq 0; then
if test -n "`echo $TARGET_ARCHITECTURE | grep ^ppc64`"; then
# Tune for this POWER processor, treating '+' models as base models
POWER=`LD_SHOW_AUXV=1 /bin/true | grep AT_PLATFORM | grep -E -o power[0-9]+`
@ -651,36 +650,37 @@ if [ "$PORTABLE" == "" ] || [ "$PORTABLE" == 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -march=${RISC_ISA}"
elif [ "$TARGET_OS" == "IOS" ]; then
COMMON_FLAGS="$COMMON_FLAGS"
elif [ "$TARGET_OS" == "AIX" ] || [ "$TARGET_OS" == "SunOS" ]; then
# TODO: Not sure why we don't use -march=native on these OSes
if test "$USE_SSE"; then
TRY_SSE_ETC="1"
fi
else
COMMON_FLAGS="$COMMON_FLAGS -march=native "
fi
else
# PORTABLE specified
if [ "$PORTABLE" == 1 ]; then
if test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then
COMMON_FLAGS="$COMMON_FLAGS -march=z196 "
elif test -n "`echo $TARGET_ARCHITECTURE | grep ^riscv64`"; then
RISC_ISA=$(cat /proc/cpuinfo | grep isa | head -1 | cut --delimiter=: -f 2 | cut -b 2-)
COMMON_FLAGS="$COMMON_FLAGS -march=${RISC_ISA}"
elif test "$USE_SSE"; then
# USE_SSE is DEPRECATED
# This is a rough approximation of the old USE_SSE behavior
COMMON_FLAGS="$COMMON_FLAGS -march=haswell"
fi
# Other than those cases, not setting -march= here.
else
# Assume PORTABLE is a minimum assumed cpu type, e.g. PORTABLE=haswell
COMMON_FLAGS="$COMMON_FLAGS -march=${PORTABLE}"
# PORTABLE=1
if test "$USE_SSE"; then
TRY_SSE_ETC="1"
fi
if test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then
COMMON_FLAGS="$COMMON_FLAGS -march=z196 "
fi
if test -n "`echo $TARGET_ARCHITECTURE | grep ^riscv64`"; then
RISC_ISA=$(cat /proc/cpuinfo | grep isa | head -1 | cut --delimiter=: -f 2 | cut -b 2-)
COMMON_FLAGS="$COMMON_FLAGS -march=${RISC_ISA}"
fi
if [[ "${PLATFORM}" == "OS_MACOSX" ]]; then
# For portability compile for macOS 10.14 or newer
COMMON_FLAGS="$COMMON_FLAGS -mmacosx-version-min=10.14"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -mmacosx-version-min=10.14"
# For portability compile for macOS 10.13 (2017) or newer
COMMON_FLAGS="$COMMON_FLAGS -mmacosx-version-min=10.13"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -mmacosx-version-min=10.13"
# -mmacosx-version-min must come first here.
PLATFORM_SHARED_LDFLAGS="-mmacosx-version-min=10.14 $PLATFORM_SHARED_LDFLAGS"
PLATFORM_CMAKE_FLAGS="-DCMAKE_OSX_DEPLOYMENT_TARGET=10.14"
JAVA_STATIC_DEPS_COMMON_FLAGS="-mmacosx-version-min=10.14"
PLATFORM_SHARED_LDFLAGS="-mmacosx-version-min=10.13 $PLATFORM_SHARED_LDFLAGS"
PLATFORM_CMAKE_FLAGS="-DCMAKE_OSX_DEPLOYMENT_TARGET=10.13"
JAVA_STATIC_DEPS_COMMON_FLAGS="-mmacosx-version-min=10.13"
JAVA_STATIC_DEPS_LDFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS"
JAVA_STATIC_DEPS_CCFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS"
JAVA_STATIC_DEPS_CXXFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS"
@ -704,6 +704,101 @@ EOF
fi
fi
if test "$TRY_SSE_ETC"; then
# The USE_SSE flag now means "attempt to compile with widely-available
# Intel architecture extensions utilized by specific optimizations in the
# source code." It's a qualifier on PORTABLE=1 that means "mostly portable."
# It doesn't even really check that your current CPU is compatible.
#
# SSE4.2 available since nehalem, ca. 2008-2010
# Includes POPCNT for BitsSetToOne, BitParity
TRY_SSE42="-msse4.2"
# PCLMUL available since westmere, ca. 2010-2011
TRY_PCLMUL="-mpclmul"
# AVX2 available since haswell, ca. 2013-2015
TRY_AVX2="-mavx2"
# BMI available since haswell, ca. 2013-2015
# Primarily for TZCNT for CountTrailingZeroBits
TRY_BMI="-mbmi"
# LZCNT available since haswell, ca. 2013-2015
# For FloorLog2
TRY_LZCNT="-mlzcnt"
fi
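For orientation, a hedged C++ sketch of how a detected flag such as HAVE_BMI is typically consumed downstream; the function name and the portable fallback here are illustrative, not the exact RocksDB implementation.
#include <cstdint>
#ifdef HAVE_BMI
#include <immintrin.h>
#endif
// Count trailing zero bits of a 64-bit value, preferring the TZCNT intrinsic
// when the build detected BMI support (-mbmi -DHAVE_BMI above).
inline int CountTrailingZeroBits64(uint64_t v) {
#ifdef HAVE_BMI
  return static_cast<int>(_tzcnt_u64(v));  // TZCNT returns 64 for an input of 0
#else
  if (v == 0) return 64;  // match TZCNT's defined behavior for 0
  int n = 0;
  while ((v & 1) == 0) {
    v >>= 1;
    ++n;
  }
  return n;
#endif
}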
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_SSE42 -x c++ - -o test.o 2>/dev/null <<EOF
#include <cstdint>
#include <nmmintrin.h>
int main() {
volatile uint32_t x = _mm_crc32_u32(0, 0);
(void)x;
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS $TRY_SSE42 -DHAVE_SSE42"
elif test "$USE_SSE"; then
echo "warning: USE_SSE specified but compiler could not use SSE intrinsics, disabling" >&2
fi
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_PCLMUL -x c++ - -o test.o 2>/dev/null <<EOF
#include <cstdint>
#include <wmmintrin.h>
int main() {
const auto a = _mm_set_epi64x(0, 0);
const auto b = _mm_set_epi64x(0, 0);
const auto c = _mm_clmulepi64_si128(a, b, 0x00);
auto d = _mm_cvtsi128_si64(c);
(void)d;
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS $TRY_PCLMUL -DHAVE_PCLMUL"
elif test "$USE_SSE"; then
echo "warning: USE_SSE specified but compiler could not use PCLMUL intrinsics, disabling" >&2
fi
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_AVX2 -x c++ - -o test.o 2>/dev/null <<EOF
#include <cstdint>
#include <immintrin.h>
int main() {
const auto a = _mm256_setr_epi32(0, 1, 2, 3, 4, 7, 6, 5);
const auto b = _mm256_permutevar8x32_epi32(a, a);
(void)b;
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS $TRY_AVX2 -DHAVE_AVX2"
elif test "$USE_SSE"; then
echo "warning: USE_SSE specified but compiler could not use AVX2 intrinsics, disabling" >&2
fi
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_BMI -x c++ - -o test.o 2>/dev/null <<EOF
#include <cstdint>
#include <immintrin.h>
int main(int argc, char *argv[]) {
(void)argv;
return (int)_tzcnt_u64((uint64_t)argc);
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS $TRY_BMI -DHAVE_BMI"
elif test "$USE_SSE"; then
echo "warning: USE_SSE specified but compiler could not use BMI intrinsics, disabling" >&2
fi
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_LZCNT -x c++ - -o test.o 2>/dev/null <<EOF
#include <cstdint>
#include <immintrin.h>
int main(int argc, char *argv[]) {
(void)argv;
return (int)_lzcnt_u64((uint64_t)argc);
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS $TRY_LZCNT -DHAVE_LZCNT"
elif test "$USE_SSE"; then
echo "warning: USE_SSE specified but compiler could not use LZCNT intrinsics, disabling" >&2
fi
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o test.o 2>/dev/null <<EOF
#include <cstdint>
int main() {

@ -0,0 +1,22 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
GCC_BASE=/mnt/gvfs/third-party2/gcc/1795efe5f06778c15a92c8f9a2aba5dc496d9d4d/9.x/centos7-native/3bed279
CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/7318eaac22659b6ff2fe43918e4b69fd0772a8a7/9.0.0/platform009/651ee30
LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/4959b39cfbe5965a37c861c4c327fa7c5c759b87/9.x/platform009/9202ce7
GLIBC_BASE=/mnt/gvfs/third-party2/glibc/45ce3375cdc77ecb2520bbf8f0ecddd3f98efd7a/2.30/platform009/f259413
SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/be4de3205e029101b18aa8103daa696c2bef3b19/1.1.3/platform009/7f3b187
ZLIB_BASE=/mnt/gvfs/third-party2/zlib/3c160ac5c67e257501e24c6c1d00ad5e01d73db6/1.2.8/platform009/7f3b187
BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/73a237ac5bc0a5f5d67b39b8d253cfebaab88684/1.0.6/platform009/7f3b187
LZ4_BASE=/mnt/gvfs/third-party2/lz4/6ca38d3c390be2774d61a300f151464bbd632d62/1.9.1/platform009/7f3b187
ZSTD_BASE=/mnt/gvfs/third-party2/zstd/64c58a207d2495e83abc57a500a956df09b79a7c/1.4.x/platform009/ba86d1f
GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/824d0a8a5abb5b121afd1b35fc3896407ea50092/2.2.0/platform009/7f3b187
JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/b62912d333ef33f9760efa6219dbe3fe6abb3b0e/master/platform009/c305944
NUMA_BASE=/mnt/gvfs/third-party2/numa/0af65f71e23a67bf65dc91b11f95caa39325c432/2.0.11/platform009/7f3b187
LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/02486dac347645d31dce116f44e1de3177315be2/1.4/platform009/5191652
TBB_BASE=/mnt/gvfs/third-party2/tbb/2e0ec671e550bfca347300bf3f789d9c0fff24ad/2018_U5/platform009/7f3b187
LIBURING_BASE=/mnt/gvfs/third-party2/liburing/70dbd9cfee63a25611417d09433a86d7711b3990/20200729/platform009/7f3b187
KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/32b8a2407b634df3f8f948ba373fc4acc6a18296/fb/platform009/da39a3e
BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/08634589372fa5f237bfd374e8c644a8364e78c1/2.32/platform009/ba86d1f/
VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/6ae525939ad02e5e676855082fbbc7828dbafeac/3.15.0/platform009/7f3b187
LUA_BASE=/mnt/gvfs/third-party2/lua/162efd9561a3d21f6869f4814011e9cf1b3ff4dc/5.3.4/platform009/a6271c4
BENCHMARK_BASE=/mnt/gvfs/third-party2/benchmark/30bf49ad6414325e17f3425b0edcb64239427ae3/1.6.1/platform009/7f3b187
GLOG_BASE=/mnt/gvfs/third-party2/glog/32d751bd5673375b438158717ab6a57c1cc57e3d/0.3.2_fb/platform009/10a364d

@ -147,7 +147,7 @@ else
fi
CFLAGS+=" $DEPS_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DHAVE_SSE42"
CXXFLAGS+=" $CFLAGS"
EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS"

@ -0,0 +1,170 @@
#!/bin/sh
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#
# Set environment variables so that we can compile rocksdb using
# fbcode settings. It uses the latest g++ and clang compilers and also
# uses jemalloc
# Environment variables that change the behavior of this script:
# PIC_BUILD -- if true, it will only take pic versions of libraries from fbcode. Libraries that don't have a pic variant will not be included
BASEDIR=`dirname $BASH_SOURCE`
source "$BASEDIR/dependencies_platform009.sh"
CFLAGS=""
# libgcc
LIBGCC_INCLUDE="$LIBGCC_BASE/include/c++/9.3.0 -I $LIBGCC_BASE/include/c++/9.3.0/backward"
LIBGCC_LIBS=" -L $LIBGCC_BASE/lib"
# glibc
GLIBC_INCLUDE="$GLIBC_BASE/include"
GLIBC_LIBS=" -L $GLIBC_BASE/lib"
if test -z $PIC_BUILD; then
MAYBE_PIC=
else
MAYBE_PIC=_pic
fi
if ! test $ROCKSDB_DISABLE_SNAPPY; then
# snappy
SNAPPY_INCLUDE=" -I $SNAPPY_BASE/include/"
SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy${MAYBE_PIC}.a"
CFLAGS+=" -DSNAPPY"
fi
if ! test $ROCKSDB_DISABLE_ZLIB; then
# location of zlib headers and libraries
ZLIB_INCLUDE=" -I $ZLIB_BASE/include/"
ZLIB_LIBS=" $ZLIB_BASE/lib/libz${MAYBE_PIC}.a"
CFLAGS+=" -DZLIB"
fi
if ! test $ROCKSDB_DISABLE_BZIP; then
# location of bzip headers and libraries
BZIP_INCLUDE=" -I $BZIP2_BASE/include/"
BZIP_LIBS=" $BZIP2_BASE/lib/libbz2${MAYBE_PIC}.a"
CFLAGS+=" -DBZIP2"
fi
if ! test $ROCKSDB_DISABLE_LZ4; then
LZ4_INCLUDE=" -I $LZ4_BASE/include/"
LZ4_LIBS=" $LZ4_BASE/lib/liblz4${MAYBE_PIC}.a"
CFLAGS+=" -DLZ4"
fi
if ! test $ROCKSDB_DISABLE_ZSTD; then
ZSTD_INCLUDE=" -I $ZSTD_BASE/include/"
ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd${MAYBE_PIC}.a"
CFLAGS+=" -DZSTD"
fi
# location of gflags headers and libraries
GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/"
GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags${MAYBE_PIC}.a"
CFLAGS+=" -DGFLAGS=gflags"
BENCHMARK_INCLUDE=" -I $BENCHMARK_BASE/include/"
BENCHMARK_LIBS=" $BENCHMARK_BASE/lib/libbenchmark${MAYBE_PIC}.a"
GLOG_INCLUDE=" -I $GLOG_BASE/include/"
GLOG_LIBS=" $GLOG_BASE/lib/libglog${MAYBE_PIC}.a"
# location of jemalloc
JEMALLOC_INCLUDE=" -I $JEMALLOC_BASE/include/"
JEMALLOC_LIB=" $JEMALLOC_BASE/lib/libjemalloc${MAYBE_PIC}.a"
# location of numa
NUMA_INCLUDE=" -I $NUMA_BASE/include/"
NUMA_LIB=" $NUMA_BASE/lib/libnuma${MAYBE_PIC}.a"
CFLAGS+=" -DNUMA"
# location of libunwind
LIBUNWIND="$LIBUNWIND_BASE/lib/libunwind${MAYBE_PIC}.a"
# location of TBB
TBB_INCLUDE=" -isystem $TBB_BASE/include/"
TBB_LIBS="$TBB_BASE/lib/libtbb${MAYBE_PIC}.a"
CFLAGS+=" -DTBB"
# location of LIBURING
LIBURING_INCLUDE=" -isystem $LIBURING_BASE/include/"
LIBURING_LIBS="$LIBURING_BASE/lib/liburing${MAYBE_PIC}.a"
CFLAGS+=" -DLIBURING"
test "$USE_SSE" || USE_SSE=1
export USE_SSE
test "$PORTABLE" || PORTABLE=1
export PORTABLE
BINUTILS="$BINUTILS_BASE/bin"
AR="$BINUTILS/ar"
AS="$BINUTILS/as"
DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE $TBB_INCLUDE $LIBURING_INCLUDE $BENCHMARK_INCLUDE $GLOG_INCLUDE"
STDLIBS="-L $GCC_BASE/lib64"
CLANG_BIN="$CLANG_BASE/bin"
CLANG_LIB="$CLANG_BASE/lib"
CLANG_SRC="$CLANG_BASE/../../src"
CLANG_ANALYZER="$CLANG_BIN/clang++"
CLANG_SCAN_BUILD="$CLANG_SRC/llvm/clang/tools/scan-build/bin/scan-build"
if [ -z "$USE_CLANG" ]; then
# gcc
CC="$GCC_BASE/bin/gcc"
CXX="$GCC_BASE/bin/g++"
AR="$GCC_BASE/bin/gcc-ar"
CFLAGS+=" -B$BINUTILS"
CFLAGS+=" -isystem $LIBGCC_INCLUDE"
CFLAGS+=" -isystem $GLIBC_INCLUDE"
JEMALLOC=1
else
# clang
CLANG_INCLUDE="$CLANG_LIB/clang/stable/include"
CC="$CLANG_BIN/clang"
CXX="$CLANG_BIN/clang++"
AR="$CLANG_BIN/llvm-ar"
KERNEL_HEADERS_INCLUDE="$KERNEL_HEADERS_BASE/include"
CFLAGS+=" -B$BINUTILS -nostdinc -nostdlib"
CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/9.x "
CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/9.x/x86_64-facebook-linux "
CFLAGS+=" -isystem $GLIBC_INCLUDE"
CFLAGS+=" -isystem $LIBGCC_INCLUDE"
CFLAGS+=" -isystem $CLANG_INCLUDE"
CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux "
CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE "
CFLAGS+=" -Wno-expansion-to-defined "
CXXFLAGS="-nostdinc++"
fi
CFLAGS+=" $DEPS_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DHAVE_SSE42 -DROCKSDB_IOURING_PRESENT"
CXXFLAGS+=" $CFLAGS"
EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS $LIBURING_LIBS $BENCHMARK_LIBS"
EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/platform009/lib/ld.so"
EXEC_LDFLAGS+=" $LIBUNWIND"
EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/platform009/lib"
EXEC_LDFLAGS+=" -Wl,-rpath=$GCC_BASE/lib64"
# required by libtbb
EXEC_LDFLAGS+=" -ldl"
PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++"
PLATFORM_LDFLAGS+=" -B$BINUTILS"
EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $TBB_LIBS $LIBURING_LIBS $BENCHMARK_LIBS"
VALGRIND_VER="$VALGRIND_BASE/bin/"
# lua not supported because it's on track for deprecation, I think
LUA_PATH=
LUA_LIB=
export CC CXX AR AS CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB

@ -154,7 +154,7 @@ CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux "
CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE "
CFLAGS+=" $DEPS_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DROCKSDB_IOURING_PRESENT"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DHAVE_SSE42 -DROCKSDB_IOURING_PRESENT"
CXXFLAGS+=" $CFLAGS"
EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS $LIBURING_LIBS $BENCHMARK_LIBS"

@ -360,7 +360,7 @@ function send_to_ods {
echo >&2 "ERROR: Key $key doesn't have a value."
return
fi
curl --silent "https://www.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=$key&value=$value" \
curl --silent "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=$key&value=$value" \
--connect-timeout 60
}

@ -104,3 +104,46 @@ get_lib_base valgrind LATEST platform010
get_lib_base lua 5.3.4 platform010
git diff $OUTPUT
###########################################################
# platform009 dependencies #
###########################################################
OUTPUT="$BASEDIR/dependencies_platform009.sh"
rm -f "$OUTPUT"
touch "$OUTPUT"
echo "Writing dependencies to $OUTPUT"
# Compilers locations
GCC_BASE=`readlink -f $TP2_LATEST/gcc/9.x/centos7-native/*/`
CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/9.0.0/platform009/*/`
log_header
log_variable GCC_BASE
log_variable CLANG_BASE
# Libraries locations
get_lib_base libgcc 9.x platform009
get_lib_base glibc 2.30 platform009
get_lib_base snappy LATEST platform009
get_lib_base zlib LATEST platform009
get_lib_base bzip2 LATEST platform009
get_lib_base lz4 LATEST platform009
get_lib_base zstd LATEST platform009
get_lib_base gflags LATEST platform009
get_lib_base jemalloc LATEST platform009
get_lib_base numa LATEST platform009
get_lib_base libunwind LATEST platform009
get_lib_base tbb 2018_U5 platform009
get_lib_base liburing LATEST platform009
get_lib_base benchmark LATEST platform009
get_lib_base kernel-headers fb platform009
get_lib_base binutils LATEST centos7-native
get_lib_base valgrind LATEST platform009
get_lib_base lua 5.3.4 platform009
git diff $OUTPUT

cache/cache.cc

@ -16,8 +16,7 @@
#include "util/string_util.h"
namespace ROCKSDB_NAMESPACE {
const Cache::CacheItemHelper kNoopCacheItemHelper{};
#ifndef ROCKSDB_LITE
static std::unordered_map<std::string, OptionTypeInfo>
lru_cache_options_type_info = {
{"capacity",
@ -65,6 +64,7 @@ static std::unordered_map<std::string, OptionTypeInfo>
OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
};
#endif // ROCKSDB_LITE
Status SecondaryCache::CreateFromString(
const ConfigOptions& config_options, const std::string& value,
@ -75,6 +75,7 @@ Status SecondaryCache::CreateFromString(
Status status;
std::shared_ptr<SecondaryCache> sec_cache;
#ifndef ROCKSDB_LITE
CompressedSecondaryCacheOptions sec_cache_opts;
status = OptionTypeInfo::ParseStruct(config_options, "",
&comp_sec_cache_options_type_info, "",
@ -83,13 +84,19 @@ Status SecondaryCache::CreateFromString(
sec_cache = NewCompressedSecondaryCache(sec_cache_opts);
}
#else
(void)config_options;
status = Status::NotSupported(
"Cannot load compressed secondary cache in LITE mode ", args);
#endif //! ROCKSDB_LITE
if (status.ok()) {
result->swap(sec_cache);
}
return status;
} else {
return LoadSharedObject<SecondaryCache>(config_options, value, result);
return LoadSharedObject<SecondaryCache>(config_options, value, nullptr,
result);
}
}
@ -101,6 +108,7 @@ Status Cache::CreateFromString(const ConfigOptions& config_options,
if (value.find('=') == std::string::npos) {
cache = NewLRUCache(ParseSizeT(value));
} else {
#ifndef ROCKSDB_LITE
LRUCacheOptions cache_opts;
status = OptionTypeInfo::ParseStruct(config_options, "",
&lru_cache_options_type_info, "",
@ -108,51 +116,14 @@ Status Cache::CreateFromString(const ConfigOptions& config_options,
if (status.ok()) {
cache = NewLRUCache(cache_opts);
}
#else
(void)config_options;
status = Status::NotSupported("Cannot load cache in LITE mode ", value);
#endif //! ROCKSDB_LITE
}
if (status.ok()) {
result->swap(cache);
}
return status;
}
bool Cache::AsyncLookupHandle::IsReady() {
return pending_handle == nullptr || pending_handle->IsReady();
}
bool Cache::AsyncLookupHandle::IsPending() { return pending_handle != nullptr; }
Cache::Handle* Cache::AsyncLookupHandle::Result() {
assert(!IsPending());
return result_handle;
}
void Cache::StartAsyncLookup(AsyncLookupHandle& async_handle) {
async_handle.found_dummy_entry = false; // in case re-used
assert(!async_handle.IsPending());
async_handle.result_handle =
Lookup(async_handle.key, async_handle.helper, async_handle.create_context,
async_handle.priority, async_handle.stats);
}
Cache::Handle* Cache::Wait(AsyncLookupHandle& async_handle) {
WaitAll(&async_handle, 1);
return async_handle.Result();
}
void Cache::WaitAll(AsyncLookupHandle* async_handles, size_t count) {
for (size_t i = 0; i < count; ++i) {
if (async_handles[i].IsPending()) {
// If a pending handle gets here, it should be marked as "to be handled
// by a caller" by that caller erasing the pending_cache on it.
assert(async_handles[i].pending_cache == nullptr);
}
}
}
void Cache::SetEvictionCallback(EvictionCallback&& fn) {
// Overwriting non-empty with non-empty could indicate a bug
assert(!eviction_callback_ || !fn);
eviction_callback_ = std::move(fn);
}
} // namespace ROCKSDB_NAMESPACE
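For context, a hedged usage sketch of the CreateFromString path shown above: a bare number goes through ParseSizeT and NewLRUCache, while an option string is parsed against lru_cache_options_type_info. Only the "capacity" key is confirmed by this diff; any other option names would be assumptions.
#include <memory>
#include "rocksdb/cache.h"
#include "rocksdb/convenience.h"
int main() {
  using namespace ROCKSDB_NAMESPACE;
  ConfigOptions config_options;
  std::shared_ptr<Cache> cache;
  // A bare size string takes the ParseSizeT -> NewLRUCache branch.
  Status s = Cache::CreateFromString(config_options, "1073741824", &cache);
  // An option string is parsed via the LRU cache option type info.
  s = Cache::CreateFromString(config_options, "capacity=1073741824", &cache);
  return s.ok() ? 0 : 1;
}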

@ -16,7 +16,7 @@
#include "db/db_impl/db_impl.h"
#include "monitoring/histogram.h"
#include "port/port.h"
#include "rocksdb/advanced_cache.h"
#include "rocksdb/cache.h"
#include "rocksdb/convenience.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
@ -50,7 +50,7 @@ DEFINE_double(resident_ratio, 0.25,
DEFINE_uint64(ops_per_thread, 2000000U, "Number of operations per thread.");
DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added.");
DEFINE_uint32(skew, 5, "Degree of skew in key selection. 0 = no skew");
DEFINE_uint32(skew, 5, "Degree of skew in key selection");
DEFINE_bool(populate_cache, true, "Populate cache before operations");
DEFINE_uint32(lookup_insert_percent, 87,
@ -71,23 +71,17 @@ DEFINE_uint32(
DEFINE_uint32(gather_stats_entries_per_lock, 256,
"For Cache::ApplyToAllEntries");
DEFINE_bool(skewed, false, "If true, skew the key access distribution");
DEFINE_bool(lean, false,
"If true, no additional computation is performed besides cache "
"operations.");
DEFINE_bool(early_exit, false,
"Exit before deallocating most memory. Good for malloc stats, e.g."
"MALLOC_CONF=\"stats_print:true\"");
DEFINE_bool(histograms, true,
"Whether to track and print histogram statistics.");
DEFINE_uint32(seed, 0, "Hashing/random seed to use. 0 = choose at random");
#ifndef ROCKSDB_LITE
DEFINE_string(secondary_cache_uri, "",
"Full URI for creating a custom secondary cache object");
static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
#endif // ROCKSDB_LITE
DEFINE_string(cache_type, "lru_cache", "Type of block cache.");
@ -153,6 +147,9 @@ class SharedState {
public:
explicit SharedState(CacheBench* cache_bench)
: cv_(&mu_),
num_initialized_(0),
start_(false),
num_done_(0),
cache_bench_(cache_bench) {}
~SharedState() {}
@ -175,27 +172,15 @@ class SharedState {
bool Started() const { return start_; }
void AddLookupStats(uint64_t hits, uint64_t misses) {
MutexLock l(&mu_);
lookup_count_ += hits + misses;
lookup_hits_ += hits;
}
double GetLookupHitRatio() const {
return 1.0 * lookup_hits_ / lookup_count_;
}
private:
port::Mutex mu_;
port::CondVar cv_;
CacheBench* cache_bench_;
uint64_t num_initialized_;
bool start_;
uint64_t num_done_;
uint64_t num_initialized_ = 0;
bool start_ = false;
uint64_t num_done_ = 0;
uint64_t lookup_count_ = 0;
uint64_t lookup_hits_ = 0;
CacheBench* cache_bench_;
};
// Per-thread state for concurrent executions of the same benchmark.
@ -207,19 +192,27 @@ struct ThreadState {
uint64_t duration_us = 0;
ThreadState(uint32_t index, SharedState* _shared)
: tid(index), rnd(FLAGS_seed + 1 + index), shared(_shared) {}
: tid(index), rnd(1000 + index), shared(_shared) {}
};
struct KeyGen {
char key_data[27];
Slice GetRand(Random64& rnd, uint64_t max_key, uint32_t skew) {
uint64_t raw = rnd.Next();
// Skew according to setting
for (uint32_t i = 0; i < skew; ++i) {
raw = std::min(raw, rnd.Next());
Slice GetRand(Random64& rnd, uint64_t max_key, int max_log) {
uint64_t key = 0;
if (!FLAGS_skewed) {
uint64_t raw = rnd.Next();
// Skew according to setting
for (uint32_t i = 0; i < FLAGS_skew; ++i) {
raw = std::min(raw, rnd.Next());
}
key = FastRange64(raw, max_key);
} else {
key = rnd.Skewed(max_log);
if (key > max_key) {
key -= max_key;
}
}
uint64_t key = FastRange64(raw, max_key);
// Variable size and alignment
size_t off = key % 8;
key_data[0] = char{42};
@ -233,7 +226,7 @@ struct KeyGen {
}
};
Cache::ObjectPtr createValue(Random64& rnd) {
char* createValue(Random64& rnd) {
char* rv = new char[FLAGS_value_bytes];
// Fill with some filler data, and take some CPU time
for (uint32_t i = 0; i < FLAGS_value_bytes; i += 8) {
@ -243,36 +236,28 @@ Cache::ObjectPtr createValue(Random64& rnd) {
}
// Callbacks for secondary cache
size_t SizeFn(Cache::ObjectPtr /*obj*/) { return FLAGS_value_bytes; }
size_t SizeFn(void* /*obj*/) { return FLAGS_value_bytes; }
Status SaveToFn(Cache::ObjectPtr from_obj, size_t /*from_offset*/,
size_t length, char* out) {
memcpy(out, from_obj, length);
Status SaveToFn(void* obj, size_t /*offset*/, size_t size, void* out) {
memcpy(out, obj, size);
return Status::OK();
}
Status CreateFn(const Slice& data, Cache::CreateContext* /*context*/,
MemoryAllocator* /*allocator*/, Cache::ObjectPtr* out_obj,
size_t* out_charge) {
*out_obj = new char[data.size()];
memcpy(*out_obj, data.data(), data.size());
*out_charge = data.size();
return Status::OK();
};
void DeleteFn(Cache::ObjectPtr value, MemoryAllocator* /*alloc*/) {
// Different deleters to simulate using deleter to gather
// stats on the code origin and kind of cache entries.
void deleter1(const Slice& /*key*/, void* value) {
delete[] static_cast<char*>(value);
}
void deleter2(const Slice& /*key*/, void* value) {
delete[] static_cast<char*>(value);
}
void deleter3(const Slice& /*key*/, void* value) {
delete[] static_cast<char*>(value);
}
Cache::CacheItemHelper helper1_wos(CacheEntryRole::kDataBlock, DeleteFn);
Cache::CacheItemHelper helper1(CacheEntryRole::kDataBlock, DeleteFn, SizeFn,
SaveToFn, CreateFn, &helper1_wos);
Cache::CacheItemHelper helper2_wos(CacheEntryRole::kIndexBlock, DeleteFn);
Cache::CacheItemHelper helper2(CacheEntryRole::kIndexBlock, DeleteFn, SizeFn,
SaveToFn, CreateFn, &helper2_wos);
Cache::CacheItemHelper helper3_wos(CacheEntryRole::kFilterBlock, DeleteFn);
Cache::CacheItemHelper helper3(CacheEntryRole::kFilterBlock, DeleteFn, SizeFn,
SaveToFn, CreateFn, &helper3_wos);
Cache::CacheItemHelper helper1(SizeFn, SaveToFn, deleter1);
Cache::CacheItemHelper helper2(SizeFn, SaveToFn, deleter2);
Cache::CacheItemHelper helper3(SizeFn, SaveToFn, deleter3);
} // namespace
class CacheBench {
@ -290,25 +275,32 @@ class CacheBench {
lookup_threshold_(insert_threshold_ +
kHundredthUint64 * FLAGS_lookup_percent),
erase_threshold_(lookup_threshold_ +
kHundredthUint64 * FLAGS_erase_percent) {
kHundredthUint64 * FLAGS_erase_percent),
skewed_(FLAGS_skewed) {
if (erase_threshold_ != 100U * kHundredthUint64) {
fprintf(stderr, "Percentages must add to 100.\n");
exit(1);
}
max_log_ = 0;
if (skewed_) {
uint64_t max_key = max_key_;
while (max_key >>= 1) max_log_++;
if (max_key > (static_cast<uint64_t>(1) << max_log_)) max_log_++;
}
if (FLAGS_cache_type == "clock_cache") {
fprintf(stderr, "Old clock cache implementation has been removed.\n");
exit(1);
} else if (FLAGS_cache_type == "hyper_clock_cache") {
HyperClockCacheOptions opts(FLAGS_cache_size, FLAGS_value_bytes,
FLAGS_num_shard_bits);
opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX);
cache_ = opts.MakeSharedCache();
cache_ = HyperClockCacheOptions(FLAGS_cache_size, FLAGS_value_bytes,
FLAGS_num_shard_bits)
.MakeSharedCache();
} else if (FLAGS_cache_type == "lru_cache") {
LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits,
false /* strict_capacity_limit */,
0.5 /* high_pri_pool_ratio */);
opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX);
#ifndef ROCKSDB_LITE
if (!FLAGS_secondary_cache_uri.empty()) {
Status s = SecondaryCache::CreateFromString(
ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache);
@ -321,6 +313,7 @@ class CacheBench {
}
opts.secondary_cache = secondary_cache;
}
#endif // ROCKSDB_LITE
cache_ = NewLRUCache(opts);
} else {
@ -332,50 +325,13 @@ class CacheBench {
~CacheBench() {}
void PopulateCache() {
Random64 rnd(FLAGS_seed);
Random64 rnd(1);
KeyGen keygen;
size_t max_occ = 0;
size_t inserts_since_max_occ_increase = 0;
size_t keys_since_last_not_found = 0;
// Avoid redundant insertions by checking Lookup before Insert.
// Loop until insertions consistently fail to increase max occupancy or
// it becomes difficult to find keys not already inserted.
while (inserts_since_max_occ_increase < 100 &&
keys_since_last_not_found < 100) {
Slice key = keygen.GetRand(rnd, max_key_, FLAGS_skew);
Cache::Handle* handle = cache_->Lookup(key);
if (handle != nullptr) {
cache_->Release(handle);
++keys_since_last_not_found;
continue;
}
keys_since_last_not_found = 0;
Status s =
cache_->Insert(key, createValue(rnd), &helper1, FLAGS_value_bytes);
for (uint64_t i = 0; i < 2 * FLAGS_cache_size; i += FLAGS_value_bytes) {
Status s = cache_->Insert(keygen.GetRand(rnd, max_key_, max_log_),
createValue(rnd), &helper1, FLAGS_value_bytes);
assert(s.ok());
handle = cache_->Lookup(key);
if (!handle) {
fprintf(stderr, "Failed to lookup key just inserted.\n");
assert(false);
exit(42);
} else {
cache_->Release(handle);
}
size_t occ = cache_->GetOccupancyCount();
if (occ > max_occ) {
max_occ = occ;
inserts_since_max_occ_increase = 0;
} else {
++inserts_since_max_occ_increase;
}
}
printf("Population complete (%zu entries, %g average charge)\n", max_occ,
1.0 * FLAGS_cache_size / max_occ);
}
bool Run() {
@ -434,21 +390,18 @@ class CacheBench {
FLAGS_ops_per_thread / elapsed_secs);
printf("Thread ops/sec = %u\n", ops_per_sec);
printf("Lookup hit ratio: %g\n", shared.GetLookupHitRatio());
if (FLAGS_histograms) {
printf("\nOperation latency (ns):\n");
HistogramImpl combined;
for (uint32_t i = 0; i < FLAGS_threads; i++) {
combined.Merge(threads[i]->latency_ns_hist);
}
printf("%s", combined.ToString().c_str());
printf("\nOperation latency (ns):\n");
HistogramImpl combined;
for (uint32_t i = 0; i < FLAGS_threads; i++) {
combined.Merge(threads[i]->latency_ns_hist);
}
printf("%s", combined.ToString().c_str());
if (FLAGS_gather_stats) {
printf("\nGather stats latency (us):\n");
printf("%s", stats_hist.ToString().c_str());
}
if (FLAGS_gather_stats) {
printf("\nGather stats latency (us):\n");
printf("%s", stats_hist.ToString().c_str());
}
printf("\n%s", stats_report.c_str());
return true;
@ -462,6 +415,8 @@ class CacheBench {
const uint64_t insert_threshold_;
const uint64_t lookup_threshold_;
const uint64_t erase_threshold_;
const bool skewed_;
int max_log_;
// A benchmark version of gathering stats on an active block cache by
// iterating over it. The primary purpose is to measure the impact of
@ -481,7 +436,7 @@ class CacheBench {
uint64_t total_entry_count = 0;
uint64_t table_occupancy = 0;
uint64_t table_size = 0;
std::set<const Cache::CacheItemHelper*> helpers;
std::set<Cache::DeleterFn> deleters;
StopWatchNano timer(clock);
for (;;) {
@ -506,7 +461,7 @@ class CacheBench {
<< BytesToHumanString(static_cast<uint64_t>(
1.0 * total_charge / total_entry_count))
<< "\n"
<< "Unique helpers: " << helpers.size() << "\n";
<< "Unique deleters: " << deleters.size() << "\n";
*stats_report = ostr.str();
return;
}
@ -522,26 +477,22 @@ class CacheBench {
total_key_size = 0;
total_charge = 0;
total_entry_count = 0;
helpers.clear();
auto fn = [&](const Slice& key, Cache::ObjectPtr /*value*/, size_t charge,
const Cache::CacheItemHelper* helper) {
deleters.clear();
auto fn = [&](const Slice& key, void* /*value*/, size_t charge,
Cache::DeleterFn deleter) {
total_key_size += key.size();
total_charge += charge;
++total_entry_count;
// Something slightly more expensive as in stats by category
helpers.insert(helper);
// Something slightly more expensive as in (future) stats by category
deleters.insert(deleter);
};
if (FLAGS_histograms) {
timer.Start();
}
timer.Start();
Cache::ApplyToAllEntriesOptions opts;
opts.average_entries_per_lock = FLAGS_gather_stats_entries_per_lock;
shared->GetCacheBench()->cache_->ApplyToAllEntries(fn, opts);
table_occupancy = shared->GetCacheBench()->cache_->GetOccupancyCount();
table_size = shared->GetCacheBench()->cache_->GetTableAddressCount();
if (FLAGS_histograms) {
stats_hist->Add(timer.ElapsedNanos() / 1000);
}
stats_hist->Add(timer.ElapsedNanos() / 1000);
}
}
@ -572,8 +523,6 @@ class CacheBench {
void OperateCache(ThreadState* thread) {
// To use looked-up values
uint64_t result = 0;
uint64_t lookup_misses = 0;
uint64_t lookup_hits = 0;
// To hold handles for a non-trivial amount of time
Cache::Handle* handle = nullptr;
KeyGen gen;
@ -582,12 +531,18 @@ class CacheBench {
StopWatchNano timer(clock);
for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
Slice key = gen.GetRand(thread->rnd, max_key_, FLAGS_skew);
Slice key = gen.GetRand(thread->rnd, max_key_, max_log_);
uint64_t random_op = thread->rnd.Next();
Cache::CreateCallback create_cb = [](const void* buf, size_t size,
void** out_obj,
size_t* charge) -> Status {
*out_obj = reinterpret_cast<void*>(new char[size]);
memcpy(*out_obj, buf, size);
*charge = size;
return Status::OK();
};
if (FLAGS_histograms) {
timer.Start();
}
timer.Start();
if (random_op < lookup_insert_threshold_) {
if (handle) {
@ -595,17 +550,15 @@ class CacheBench {
handle = nullptr;
}
// do lookup
handle = cache_->Lookup(key, &helper2, /*context*/ nullptr,
Cache::Priority::LOW);
handle = cache_->Lookup(key, &helper2, create_cb, Cache::Priority::LOW,
true);
if (handle) {
++lookup_hits;
if (!FLAGS_lean) {
// do something with the data
result += NPHash64(static_cast<char*>(cache_->Value(handle)),
FLAGS_value_bytes);
}
} else {
++lookup_misses;
// do insert
Status s = cache_->Insert(key, createValue(thread->rnd), &helper2,
FLAGS_value_bytes, &handle);
@ -626,17 +579,14 @@ class CacheBench {
handle = nullptr;
}
// do lookup
handle = cache_->Lookup(key, &helper2, /*context*/ nullptr,
Cache::Priority::LOW);
handle = cache_->Lookup(key, &helper2, create_cb, Cache::Priority::LOW,
true);
if (handle) {
++lookup_hits;
if (!FLAGS_lean) {
// do something with the data
result += NPHash64(static_cast<char*>(cache_->Value(handle)),
FLAGS_value_bytes);
}
} else {
++lookup_misses;
}
} else if (random_op < erase_threshold_) {
// do erase
@ -645,14 +595,7 @@ class CacheBench {
// Should be extremely unlikely (noop)
assert(random_op >= kHundredthUint64 * 100U);
}
if (FLAGS_histograms) {
thread->latency_ns_hist.Add(timer.ElapsedNanos());
}
thread->shared->AddLookupStats(lookup_hits, lookup_misses);
}
if (FLAGS_early_exit) {
MutexLock l(thread->shared->GetMutex());
exit(0);
thread->latency_ns_hist.Add(timer.ElapsedNanos());
}
if (handle) {
cache_->Release(handle);
@ -674,7 +617,6 @@ class CacheBench {
#ifndef NDEBUG
printf("WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
#endif
printf("----------------------------\n");
printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion);
printf("DMutex impl name : %s\n", DMutex::kName());
printf("Number of threads : %u\n", FLAGS_threads);
@ -1014,14 +956,11 @@ int cache_bench_tool(int argc, char** argv) {
exit(1);
}
if (FLAGS_seed == 0) {
FLAGS_seed = static_cast<uint32_t>(port::GetProcessID());
printf("Using seed = %" PRIu32 "\n", FLAGS_seed);
}
ROCKSDB_NAMESPACE::CacheBench bench;
if (FLAGS_populate_cache) {
bench.PopulateCache();
printf("Population complete\n");
printf("----------------------------\n");
}
if (bench.Run()) {
return 0;

@ -101,4 +101,34 @@ std::string BlockCacheEntryStatsMapKeys::UsedPercent(CacheEntryRole role) {
return GetPrefixedCacheEntryRoleName(kPrefix, role);
}
namespace {
struct Registry {
std::mutex mutex;
UnorderedMap<Cache::DeleterFn, CacheEntryRole> role_map;
void Register(Cache::DeleterFn fn, CacheEntryRole role) {
std::lock_guard<std::mutex> lock(mutex);
role_map[fn] = role;
}
UnorderedMap<Cache::DeleterFn, CacheEntryRole> Copy() {
std::lock_guard<std::mutex> lock(mutex);
return role_map;
}
};
Registry& GetRegistry() {
STATIC_AVOID_DESTRUCTION(Registry, registry);
return registry;
}
} // namespace
void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role) {
GetRegistry().Register(fn, role);
}
UnorderedMap<Cache::DeleterFn, CacheEntryRole> CopyCacheDeleterRoleMap() {
return GetRegistry().Copy();
}
} // namespace ROCKSDB_NAMESPACE

@ -7,8 +7,11 @@
#include <array>
#include <cstdint>
#include <memory>
#include <type_traits>
#include "rocksdb/cache.h"
#include "util/hash_containers.h"
namespace ROCKSDB_NAMESPACE {
@ -17,4 +20,84 @@ extern std::array<std::string, kNumCacheEntryRoles>
extern std::array<std::string, kNumCacheEntryRoles>
kCacheEntryRoleToHyphenString;
// To associate cache entries with their role, we use a hack on the
// existing Cache interface. Because the deleter of an entry can authenticate
// the code origin of an entry, we can elaborate the choice of deleter to
// also encode role information, without inferring false role information
// from entries not choosing to encode a role.
//
// The rest of this file is for handling mappings between deleters and
// roles.
// To infer a role from a deleter, the deleter must be registered. This
// can be done "manually" with this function. This function is thread-safe,
// and the registration mappings go into private but static storage. (Note
// that DeleterFn is a function pointer, not std::function. Registrations
// should not be too many.)
void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role);
// Gets a copy of the registered deleter -> role mappings. This is the only
// function for reading the mappings made with RegisterCacheDeleterRole.
// Why only this interface for reading?
// * This function has to be thread safe, which could incur substantial
// overhead. We should not pay this overhead for every deleter look-up.
// * This is suitable for preparing for batch operations, like with
// CacheEntryStatsCollector.
// * The number of mappings should be sufficiently small (dozens).
UnorderedMap<Cache::DeleterFn, CacheEntryRole> CopyCacheDeleterRoleMap();
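As a rough illustration of how these two entry points fit together (everything below is an editor's sketch with invented names, assuming the includes and ROCKSDB_NAMESPACE scope of this header; it is not part of the diff), a custom deleter might be registered once and then discovered through the copied map:
// Sketch only: a hypothetical deleter for std::string values.
static void DeleteStringValue(const Slice& /*key*/, void* value) {
  delete static_cast<std::string*>(value);
}
static void ExampleRegistration() {
  // Register the deleter -> role mapping once (thread-safe).
  RegisterCacheDeleterRole(&DeleteStringValue, CacheEntryRole::kMisc);
  // Later, take a snapshot of the mappings before a batch scan of entries.
  UnorderedMap<Cache::DeleterFn, CacheEntryRole> role_map =
      CopyCacheDeleterRoleMap();
  assert(role_map[&DeleteStringValue] == CacheEntryRole::kMisc);
}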
// ************************************************************** //
// An automatic registration infrastructure. This enables code
// to simply ask for a deleter associated with a particular type
// and role, and registration is automatic. In a sense, this is
// a small dependency injection infrastructure, because linking
// in new deleter instantiations is essentially sufficient for
// making stats collection (using CopyCacheDeleterRoleMap) aware
// of them.
namespace cache_entry_roles_detail {
template <typename T, CacheEntryRole R>
struct RegisteredDeleter {
RegisteredDeleter() { RegisterCacheDeleterRole(Delete, R); }
// These have global linkage to help ensure compiler optimizations do not
// break uniqueness for each <T,R>
static void Delete(const Slice& /* key */, void* value) {
// Supports T == Something[], unlike delete operator
std::default_delete<T>()(
static_cast<typename std::remove_extent<T>::type*>(value));
}
};
template <CacheEntryRole R>
struct RegisteredNoopDeleter {
RegisteredNoopDeleter() { RegisterCacheDeleterRole(Delete, R); }
static void Delete(const Slice& /* key */, void* /* value */) {
// Previously this asserted `value == nullptr`, but pointers to static
// data may also be put in Cache, at least for testing.
}
};
} // namespace cache_entry_roles_detail
// Get an automatically registered deleter for value type T and role R.
// Based on C++ semantics, registration is invoked exactly once in a
// thread-safe way on first call to this function, for each <T, R>.
template <typename T, CacheEntryRole R>
Cache::DeleterFn GetCacheEntryDeleterForRole() {
static cache_entry_roles_detail::RegisteredDeleter<T, R> reg;
return reg.Delete;
}
// Get an automatically registered no-op deleter (value should be nullptr)
// and associated with role R. This is used for Cache "reservation" entries
// such as for WriteBufferManager.
template <CacheEntryRole R>
Cache::DeleterFn GetNoopDeleterForRole() {
static cache_entry_roles_detail::RegisteredNoopDeleter<R> reg;
return reg.Delete;
}
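A minimal caller-side sketch of the automatic registration (hypothetical function and value type, assuming this header's includes and namespace; the deleter-based Insert overload used here is the one appearing throughout this diff):
// Sketch only: insert a heap-allocated value whose deleter is automatically
// registered under CacheEntryRole::kMisc on first use.
static Status InsertWithRole(Cache* cache, const Slice& key,
                             std::string* value) {
  Cache::DeleterFn deleter =
      GetCacheEntryDeleterForRole<std::string, CacheEntryRole::kMisc>();
  return cache->Insert(key, value, /*charge=*/value->size(), deleter);
}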
} // namespace ROCKSDB_NAMESPACE

@ -10,8 +10,8 @@
#include <memory>
#include <mutex>
#include "cache/cache_helpers.h"
#include "cache/cache_key.h"
#include "cache/typed_cache.h"
#include "port/lang.h"
#include "rocksdb/cache.h"
#include "rocksdb/status.h"
@ -111,14 +111,11 @@ class CacheEntryStatsCollector {
// Gets or creates a shared instance of CacheEntryStatsCollector in the
// cache itself, and saves into `ptr`. This shared_ptr will hold the
// entry in cache until all refs are destroyed.
static Status GetShared(Cache *raw_cache, SystemClock *clock,
static Status GetShared(Cache *cache, SystemClock *clock,
std::shared_ptr<CacheEntryStatsCollector> *ptr) {
assert(raw_cache);
BasicTypedCacheInterface<CacheEntryStatsCollector, CacheEntryRole::kMisc>
cache{raw_cache};
const Slice &cache_key = GetCacheKey();
auto h = cache.Lookup(cache_key);
Cache::Handle *h = cache->Lookup(cache_key);
if (h == nullptr) {
// Not yet in cache, but Cache doesn't provide a built-in way to
// avoid racing insert. So we double-check under a shared mutex,
@ -126,15 +123,15 @@ class CacheEntryStatsCollector {
STATIC_AVOID_DESTRUCTION(std::mutex, static_mutex);
std::lock_guard<std::mutex> lock(static_mutex);
h = cache.Lookup(cache_key);
h = cache->Lookup(cache_key);
if (h == nullptr) {
auto new_ptr = new CacheEntryStatsCollector(cache.get(), clock);
auto new_ptr = new CacheEntryStatsCollector(cache, clock);
// TODO: non-zero charge causes some tests that count block cache
// usage to go flaky. Fix the problem somehow so we can use an
// accurate charge.
size_t charge = 0;
Status s =
cache.Insert(cache_key, new_ptr, charge, &h, Cache::Priority::HIGH);
Status s = cache->Insert(cache_key, new_ptr, charge, Deleter, &h,
Cache::Priority::HIGH);
if (!s.ok()) {
assert(h == nullptr);
delete new_ptr;
@ -143,11 +140,11 @@ class CacheEntryStatsCollector {
}
}
// If we reach here, shared entry is in cache with handle `h`.
assert(cache.get()->GetCacheItemHelper(h) == cache.GetBasicHelper());
assert(cache->GetDeleter(h) == Deleter);
// Build an aliasing shared_ptr that keeps `ptr` in cache while there
// are references.
*ptr = cache.SharedGuard(h);
*ptr = MakeSharedCacheHandleGuard<CacheEntryStatsCollector>(cache, h);
return Status::OK();
}
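A caller-side sketch of obtaining the shared collector (the stats type and function below are hypothetical and simplified; the real Stats interface has more requirements than shown):
// Sketch only: a trivially constructible stand-in for a stats type.
struct MyStats {};
static Status GetCollectorExample(
    Cache* cache,
    std::shared_ptr<CacheEntryStatsCollector<MyStats>>* collector) {
  // The returned shared_ptr keeps the collector entry pinned in `cache`
  // until all copies are destroyed.
  return CacheEntryStatsCollector<MyStats>::GetShared(
      cache, SystemClock::Default().get(), collector);
}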
@ -160,6 +157,10 @@ class CacheEntryStatsCollector {
cache_(cache),
clock_(clock) {}
static void Deleter(const Slice &, void *value) {
delete static_cast<CacheEntryStatsCollector *>(value);
}
static const Slice &GetCacheKey() {
// For each template instantiation
static CacheKey ckey = CacheKey::CreateUniqueForProcessLifetime();

@ -1,40 +0,0 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "cache/cache_helpers.h"
namespace ROCKSDB_NAMESPACE {
void ReleaseCacheHandleCleanup(void* arg1, void* arg2) {
Cache* const cache = static_cast<Cache*>(arg1);
assert(cache);
Cache::Handle* const cache_handle = static_cast<Cache::Handle*>(arg2);
assert(cache_handle);
cache->Release(cache_handle);
}
Status WarmInCache(Cache* cache, const Slice& key, const Slice& saved,
Cache::CreateContext* create_context,
const Cache::CacheItemHelper* helper,
Cache::Priority priority, size_t* out_charge) {
assert(helper);
assert(helper->create_cb);
Cache::ObjectPtr value;
size_t charge;
Status st = helper->create_cb(saved, create_context,
cache->memory_allocator(), &value, &charge);
if (st.ok()) {
st =
cache->Insert(key, value, helper, charge, /*handle*/ nullptr, priority);
if (out_charge) {
*out_charge = charge;
}
}
return st;
}
} // namespace ROCKSDB_NAMESPACE

@ -7,7 +7,7 @@
#include <cassert>
#include "rocksdb/advanced_cache.h"
#include "rocksdb/cache.h"
#include "rocksdb/rocksdb_namespace.h"
namespace ROCKSDB_NAMESPACE {
@ -17,17 +17,22 @@ template <typename T>
T* GetFromCacheHandle(Cache* cache, Cache::Handle* handle) {
assert(cache);
assert(handle);
return static_cast<T*>(cache->Value(handle));
}
// Simple generic deleter for Cache (to be used with Cache::Insert).
template <typename T>
void DeleteCacheEntry(const Slice& /* key */, void* value) {
delete static_cast<T*>(value);
}
// Turns a T* into a Slice so it can be used as a key with Cache.
template <typename T>
Slice GetSliceForKey(const T* t) {
Slice GetSlice(const T* t) {
return Slice(reinterpret_cast<const char*>(t), sizeof(T));
}
void ReleaseCacheHandleCleanup(void* arg1, void* arg2);
// Generic resource management object for cache handles that releases the handle
// when destroyed. Has unique ownership of the handle, so copying it is not
// allowed, while moving it transfers ownership.
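A brief usage sketch of the guard (hypothetical function and value type; assumes this header's includes and namespace):
// Sketch only: pin a looked-up entry for the duration of a scope.
static void UseCachedValue(Cache* cache, const Slice& key) {
  Cache::Handle* h = cache->Lookup(key);
  if (h != nullptr) {
    CacheHandleGuard<std::string> guard(cache, h);
    // ... read the cached value while the guard keeps the handle pinned ...
  }  // guard destroyed here; the handle is released back to the cache
}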
@ -83,7 +88,7 @@ class CacheHandleGuard {
if (cleanable) {
if (handle_ != nullptr) {
assert(cache_);
cleanable->RegisterCleanup(&ReleaseCacheHandleCleanup, cache_, handle_);
cleanable->RegisterCleanup(&ReleaseCacheHandle, cache_, handle_);
}
}
ResetFields();
@ -110,6 +115,16 @@ class CacheHandleGuard {
value_ = nullptr;
}
static void ReleaseCacheHandle(void* arg1, void* arg2) {
Cache* const cache = static_cast<Cache*>(arg1);
assert(cache);
Cache::Handle* const cache_handle = static_cast<Cache::Handle*>(arg2);
assert(cache_handle);
cache->Release(cache_handle);
}
private:
Cache* cache_ = nullptr;
Cache::Handle* handle_ = nullptr;
@ -124,16 +139,7 @@ template <typename T>
std::shared_ptr<T> MakeSharedCacheHandleGuard(Cache* cache,
Cache::Handle* handle) {
auto wrapper = std::make_shared<CacheHandleGuard<T>>(cache, handle);
return std::shared_ptr<T>(wrapper, GetFromCacheHandle<T>(cache, handle));
return std::shared_ptr<T>(wrapper, static_cast<T*>(cache->Value(handle)));
}
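For reference, an illustrative sketch of consuming the aliasing shared_ptr built here (hypothetical function and value type):
// Sketch only: the returned shared_ptr points at the cached value; its
// control block owns a CacheHandleGuard that releases `handle` once the
// last copy is destroyed.
static std::shared_ptr<std::string> PinAsShared(Cache* cache,
                                                Cache::Handle* handle) {
  return MakeSharedCacheHandleGuard<std::string>(cache, handle);
}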
// Given the persistable data (saved) for a block cache entry, parse that
// into a cache entry object and insert it into the given cache. The charge
// of the new entry can be returned to the caller through `out_charge`.
Status WarmInCache(Cache* cache, const Slice& key, const Slice& saved,
Cache::CreateContext* create_context,
const Cache::CacheItemHelper* helper,
Cache::Priority priority = Cache::Priority::LOW,
size_t* out_charge = nullptr);
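A hedged caller-side sketch of WarmInCache (hypothetical function; assumes the supplied helper's create_cb can parse `saved` and tolerates a null CreateContext):
// Sketch only: parse previously saved block data back into the cache.
static Status WarmExample(Cache* cache, const Slice& key, const Slice& saved,
                          const Cache::CacheItemHelper* helper) {
  size_t charge = 0;
  Status s = WarmInCache(cache, key, saved, /*create_context=*/nullptr,
                         helper, Cache::Priority::LOW, &charge);
  // On success, the parsed object now resides in `cache` with `charge`
  // bytes accounted to it.
  return s;
}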
} // namespace ROCKSDB_NAMESPACE

@ -8,7 +8,7 @@
#include <algorithm>
#include <atomic>
#include "rocksdb/advanced_cache.h"
#include "rocksdb/cache.h"
#include "table/unique_id_impl.h"
#include "util/hash.h"
#include "util/math.h"

@ -13,6 +13,7 @@
#include <cstring>
#include <memory>
#include "cache/cache_entry_roles.h"
#include "rocksdb/cache.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
@ -40,17 +41,17 @@ CacheReservationManagerImpl<
template <CacheEntryRole R>
CacheReservationManagerImpl<R>::CacheReservationManagerImpl(
std::shared_ptr<Cache> cache, bool delayed_decrease)
: cache_(cache),
delayed_decrease_(delayed_decrease),
: delayed_decrease_(delayed_decrease),
cache_allocated_size_(0),
memory_used_(0) {
assert(cache != nullptr);
cache_ = cache;
}
template <CacheEntryRole R>
CacheReservationManagerImpl<R>::~CacheReservationManagerImpl() {
for (auto* handle : dummy_handles_) {
cache_.ReleaseAndEraseIfLastRef(handle);
cache_->Release(handle, true);
}
}
@ -114,7 +115,8 @@ Status CacheReservationManagerImpl<R>::IncreaseCacheReservation(
Status return_status = Status::OK();
while (new_mem_used > cache_allocated_size_.load(std::memory_order_relaxed)) {
Cache::Handle* handle = nullptr;
return_status = cache_.Insert(GetNextCacheKey(), kSizeDummyEntry, &handle);
return_status = cache_->Insert(GetNextCacheKey(), nullptr, kSizeDummyEntry,
GetNoopDeleterForRole<R>(), &handle);
if (return_status != Status::OK()) {
return return_status;
@ -139,7 +141,7 @@ Status CacheReservationManagerImpl<R>::DecreaseCacheReservation(
cache_allocated_size_.load(std::memory_order_relaxed)) {
assert(!dummy_handles_.empty());
auto* handle = dummy_handles_.back();
cache_.ReleaseAndEraseIfLastRef(handle);
cache_->Release(handle, true);
dummy_handles_.pop_back();
cache_allocated_size_ -= kSizeDummyEntry;
}
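To make the accounting concrete (assuming, as the decrease path above suggests, that each successful dummy insert also adds kSizeDummyEntry to cache_allocated_size_): with the 256 KiB dummy-entry size declared in the header below, raising the tracked memory to 1 MiB plus one byte would insert five dummy entries (5 x 256 KiB = 1.25 MiB, the smallest multiple of the dummy size that covers the request), and a later decrease releases those handles one at a time from dummy_handles_ as shown above.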
@ -167,9 +169,8 @@ Slice CacheReservationManagerImpl<R>::GetNextCacheKey() {
}
template <CacheEntryRole R>
const Cache::CacheItemHelper*
CacheReservationManagerImpl<R>::TEST_GetCacheItemHelperForRole() {
return CacheInterface::GetHelper();
Cache::DeleterFn CacheReservationManagerImpl<R>::TEST_GetNoopDeleterForRole() {
return GetNoopDeleterForRole<R>();
}
template class CacheReservationManagerImpl<

@ -18,7 +18,7 @@
#include "cache/cache_entry_roles.h"
#include "cache/cache_key.h"
#include "cache/typed_cache.h"
#include "rocksdb/cache.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "util/coding.h"
@ -197,10 +197,10 @@ class CacheReservationManagerImpl
static constexpr std::size_t GetDummyEntrySize() { return kSizeDummyEntry; }
// For testing only - it is to help ensure the CacheItemHelperForRole<R>
// For testing only - it is to help ensure the NoopDeleterForRole<R>
// accessed from CacheReservationManagerImpl and the one accessed from the
// test are from the same translation units
static const Cache::CacheItemHelper *TEST_GetCacheItemHelperForRole();
static Cache::DeleterFn TEST_GetNoopDeleterForRole();
private:
static constexpr std::size_t kSizeDummyEntry = 256 * 1024;
@ -211,8 +211,7 @@ class CacheReservationManagerImpl
Status IncreaseCacheReservation(std::size_t new_mem_used);
Status DecreaseCacheReservation(std::size_t new_mem_used);
using CacheInterface = PlaceholderSharedCacheInterface<R>;
CacheInterface cache_;
std::shared_ptr<Cache> cache_;
bool delayed_decrease_;
std::atomic<std::size_t> cache_allocated_size_;
std::size_t memory_used_;

428 cache/cache_test.cc

@ -16,12 +16,9 @@
#include <vector>
#include "cache/lru_cache.h"
#include "cache/typed_cache.h"
#include "port/stack_trace.h"
#include "test_util/secondary_cache_test_util.h"
#include "test_util/testharness.h"
#include "util/coding.h"
#include "util/hash_containers.h"
#include "util/string_util.h"
// HyperClockCache only supports 16-byte keys, so some of the tests
@ -58,43 +55,42 @@ int DecodeKey32Bits(const Slice& k) {
return DecodeFixed32(k.data());
}
Cache::ObjectPtr EncodeValue(uintptr_t v) {
return reinterpret_cast<Cache::ObjectPtr>(v);
}
void* EncodeValue(uintptr_t v) { return reinterpret_cast<void*>(v); }
int DecodeValue(void* v) {
return static_cast<int>(reinterpret_cast<uintptr_t>(v));
}
const Cache::CacheItemHelper kDumbHelper{
CacheEntryRole::kMisc,
[](Cache::ObjectPtr /*value*/, MemoryAllocator* /*alloc*/) {}};
const Cache::CacheItemHelper kEraseOnDeleteHelper1{
CacheEntryRole::kMisc,
[](Cache::ObjectPtr value, MemoryAllocator* /*alloc*/) {
Cache* cache = static_cast<Cache*>(value);
cache->Erase("foo");
}};
const Cache::CacheItemHelper kEraseOnDeleteHelper2{
CacheEntryRole::kMisc,
[](Cache::ObjectPtr value, MemoryAllocator* /*alloc*/) {
Cache* cache = static_cast<Cache*>(value);
cache->Erase(EncodeKey16Bytes(1234));
}};
void DumbDeleter(const Slice& /*key*/, void* /*value*/) {}
void EraseDeleter1(const Slice& /*key*/, void* value) {
Cache* cache = reinterpret_cast<Cache*>(value);
cache->Erase("foo");
}
void EraseDeleter2(const Slice& /*key*/, void* value) {
Cache* cache = reinterpret_cast<Cache*>(value);
cache->Erase(EncodeKey16Bytes(1234));
}
const std::string kLRU = "lru";
const std::string kHyperClock = "hyper_clock";
} // anonymous namespace
class CacheTest : public testing::Test,
public secondary_cache_test_util::WithCacheTypeParam {
class CacheTest : public testing::TestWithParam<std::string> {
public:
static CacheTest* current_;
static std::string type_;
static void Deleter(Cache::ObjectPtr v, MemoryAllocator*) {
static void Deleter(const Slice& key, void* v) {
if (type_ == kHyperClock) {
current_->deleted_keys_.push_back(DecodeKey16Bytes(key));
} else {
current_->deleted_keys_.push_back(DecodeKey32Bits(key));
}
current_->deleted_values_.push_back(DecodeValue(v));
}
static const Cache::CacheItemHelper kHelper;
static const int kCacheSize = 1000;
static const int kNumShardBits = 4;
@ -102,10 +98,13 @@ class CacheTest : public testing::Test,
static const int kCacheSize2 = 100;
static const int kNumShardBits2 = 2;
std::vector<int> deleted_keys_;
std::vector<int> deleted_values_;
std::shared_ptr<Cache> cache_;
std::shared_ptr<Cache> cache2_;
size_t estimated_value_size_ = 1;
CacheTest()
: cache_(NewCache(kCacheSize, kNumShardBits, false)),
cache2_(NewCache(kCacheSize2, kNumShardBits2, false)) {
@ -115,6 +114,41 @@ class CacheTest : public testing::Test,
~CacheTest() override {}
std::shared_ptr<Cache> NewCache(size_t capacity) {
auto type = GetParam();
if (type == kLRU) {
return NewLRUCache(capacity);
}
if (type == kHyperClock) {
return HyperClockCacheOptions(
capacity, estimated_value_size_ /*estimated_value_size*/)
.MakeSharedCache();
}
return nullptr;
}
std::shared_ptr<Cache> NewCache(
size_t capacity, int num_shard_bits, bool strict_capacity_limit,
CacheMetadataChargePolicy charge_policy = kDontChargeCacheMetadata) {
auto type = GetParam();
if (type == kLRU) {
LRUCacheOptions co;
co.capacity = capacity;
co.num_shard_bits = num_shard_bits;
co.strict_capacity_limit = strict_capacity_limit;
co.high_pri_pool_ratio = 0;
co.metadata_charge_policy = charge_policy;
return NewLRUCache(co);
}
if (type == kHyperClock) {
return HyperClockCacheOptions(capacity, 1 /*estimated_value_size*/,
num_shard_bits, strict_capacity_limit,
nullptr /*allocator*/, charge_policy)
.MakeSharedCache();
}
return nullptr;
}
// These functions encode/decode keys in tests cases that use
// int keys.
// Currently, HyperClockCache requires keys to be 16B long, whereas
@ -148,8 +182,8 @@ class CacheTest : public testing::Test,
void Insert(std::shared_ptr<Cache> cache, int key, int value,
int charge = 1) {
EXPECT_OK(cache->Insert(EncodeKey(key), EncodeValue(value), &kHelper,
charge, /*handle*/ nullptr, Cache::Priority::HIGH));
EXPECT_OK(cache->Insert(EncodeKey(key), EncodeValue(value), charge,
&CacheTest::Deleter));
}
void Erase(std::shared_ptr<Cache> cache, int key) {
@ -173,9 +207,6 @@ class CacheTest : public testing::Test,
void Erase2(int key) { Erase(cache2_, key); }
};
const Cache::CacheItemHelper CacheTest::kHelper{CacheEntryRole::kMisc,
&CacheTest::Deleter};
CacheTest* CacheTest::current_;
std::string CacheTest::type_;
@ -205,8 +236,10 @@ TEST_P(CacheTest, UsageTest) {
key = EncodeKey(i);
}
auto kv_size = key.size() + 5;
ASSERT_OK(cache->Insert(key, value, &kDumbHelper, kv_size));
ASSERT_OK(precise_cache->Insert(key, value, &kDumbHelper, kv_size));
ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), kv_size,
DumbDeleter));
ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
kv_size, DumbDeleter));
usage += kv_size;
ASSERT_EQ(usage, cache->GetUsage());
if (type == kHyperClock) {
@ -229,8 +262,10 @@ TEST_P(CacheTest, UsageTest) {
} else {
key = EncodeKey(static_cast<int>(1000 + i));
}
ASSERT_OK(cache->Insert(key, value, &kDumbHelper, key.size() + 5));
ASSERT_OK(precise_cache->Insert(key, value, &kDumbHelper, key.size() + 5));
ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5,
DumbDeleter));
ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
key.size() + 5, DumbDeleter));
}
// the usage should be close to the capacity
@ -285,9 +320,11 @@ TEST_P(CacheTest, PinnedUsageTest) {
auto kv_size = key.size() + 5;
Cache::Handle* handle;
Cache::Handle* handle_in_precise_cache;
ASSERT_OK(cache->Insert(key, value, &kDumbHelper, kv_size, &handle));
ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), kv_size,
DumbDeleter, &handle));
assert(handle);
ASSERT_OK(precise_cache->Insert(key, value, &kDumbHelper, kv_size,
ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
kv_size, DumbDeleter,
&handle_in_precise_cache));
assert(handle_in_precise_cache);
pinned_usage += kv_size;
@ -328,8 +365,10 @@ TEST_P(CacheTest, PinnedUsageTest) {
} else {
key = EncodeKey(static_cast<int>(1000 + i));
}
ASSERT_OK(cache->Insert(key, value, &kDumbHelper, key.size() + 5));
ASSERT_OK(precise_cache->Insert(key, value, &kDumbHelper, key.size() + 5));
ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5,
DumbDeleter));
ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
key.size() + 5, DumbDeleter));
}
ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage());
@ -377,7 +416,8 @@ TEST_P(CacheTest, HitAndMiss) {
ASSERT_EQ(201, Lookup(200));
ASSERT_EQ(-1, Lookup(300));
ASSERT_EQ(1U, deleted_values_.size());
ASSERT_EQ(1U, deleted_keys_.size());
ASSERT_EQ(100, deleted_keys_[0]);
if (GetParam() == kHyperClock) {
ASSERT_EQ(102, deleted_values_[0]);
} else {
@ -398,20 +438,21 @@ TEST_P(CacheTest, InsertSameKey) {
TEST_P(CacheTest, Erase) {
Erase(200);
ASSERT_EQ(0U, deleted_values_.size());
ASSERT_EQ(0U, deleted_keys_.size());
Insert(100, 101);
Insert(200, 201);
Erase(100);
ASSERT_EQ(-1, Lookup(100));
ASSERT_EQ(201, Lookup(200));
ASSERT_EQ(1U, deleted_values_.size());
ASSERT_EQ(1U, deleted_keys_.size());
ASSERT_EQ(100, deleted_keys_[0]);
ASSERT_EQ(101, deleted_values_[0]);
Erase(100);
ASSERT_EQ(-1, Lookup(100));
ASSERT_EQ(201, Lookup(200));
ASSERT_EQ(1U, deleted_values_.size());
ASSERT_EQ(1U, deleted_keys_.size());
}
TEST_P(CacheTest, EntriesArePinned) {
@ -428,21 +469,23 @@ TEST_P(CacheTest, EntriesArePinned) {
Insert(100, 102);
Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
ASSERT_EQ(102, DecodeValue(cache_->Value(h2)));
ASSERT_EQ(0U, deleted_values_.size());
ASSERT_EQ(0U, deleted_keys_.size());
ASSERT_EQ(2U, cache_->GetUsage());
cache_->Release(h1);
ASSERT_EQ(1U, deleted_values_.size());
ASSERT_EQ(1U, deleted_keys_.size());
ASSERT_EQ(100, deleted_keys_[0]);
ASSERT_EQ(101, deleted_values_[0]);
ASSERT_EQ(1U, cache_->GetUsage());
Erase(100);
ASSERT_EQ(-1, Lookup(100));
ASSERT_EQ(1U, deleted_values_.size());
ASSERT_EQ(1U, deleted_keys_.size());
ASSERT_EQ(1U, cache_->GetUsage());
cache_->Release(h2);
ASSERT_EQ(2U, deleted_values_.size());
ASSERT_EQ(2U, deleted_keys_.size());
ASSERT_EQ(100, deleted_keys_[1]);
ASSERT_EQ(102, deleted_values_[1]);
ASSERT_EQ(0U, cache_->GetUsage());
}
@ -545,9 +588,9 @@ TEST_P(CacheTest, EvictEmptyCache) {
// Insert an item larger than the capacity to trigger eviction on an empty cache.
auto cache = NewCache(1, 0, false);
if (type == kLRU) {
ASSERT_OK(cache->Insert("foo", nullptr, &kDumbHelper, 10));
ASSERT_OK(cache->Insert("foo", nullptr, 10, DumbDeleter));
} else {
ASSERT_OK(cache->Insert(EncodeKey(1000), nullptr, &kDumbHelper, 10));
ASSERT_OK(cache->Insert(EncodeKey(1000), nullptr, 10, DumbDeleter));
}
}
@ -558,19 +601,19 @@ TEST_P(CacheTest, EraseFromDeleter) {
// the cache at that point.
std::shared_ptr<Cache> cache = NewCache(10, 0, false);
std::string foo, bar;
const Cache::CacheItemHelper* erase_helper;
Cache::DeleterFn erase_deleter;
if (type == kLRU) {
foo = "foo";
bar = "bar";
erase_helper = &kEraseOnDeleteHelper1;
erase_deleter = EraseDeleter1;
} else {
foo = EncodeKey(1234);
bar = EncodeKey(5678);
erase_helper = &kEraseOnDeleteHelper2;
erase_deleter = EraseDeleter2;
}
ASSERT_OK(cache->Insert(foo, nullptr, &kDumbHelper, 1));
ASSERT_OK(cache->Insert(bar, cache.get(), erase_helper, 1));
ASSERT_OK(cache->Insert(foo, nullptr, 1, DumbDeleter));
ASSERT_OK(cache->Insert(bar, cache.get(), 1, erase_deleter));
cache->Erase(bar);
ASSERT_EQ(nullptr, cache->Lookup(foo));
@ -632,51 +675,50 @@ TEST_P(CacheTest, NewId) {
ASSERT_NE(a, b);
}
class Value {
public:
explicit Value(int v) : v_(v) {}
int v_;
};
namespace {
void deleter(const Slice& /*key*/, void* value) {
delete static_cast<Value*>(value);
}
} // namespace
TEST_P(CacheTest, ReleaseAndErase) {
std::shared_ptr<Cache> cache = NewCache(5, 0, false);
Cache::Handle* handle;
Status s =
cache->Insert(EncodeKey(100), EncodeValue(100), &kHelper, 1, &handle);
Status s = cache->Insert(EncodeKey(100), EncodeValue(100), 1,
&CacheTest::Deleter, &handle);
ASSERT_TRUE(s.ok());
ASSERT_EQ(5U, cache->GetCapacity());
ASSERT_EQ(1U, cache->GetUsage());
ASSERT_EQ(0U, deleted_values_.size());
ASSERT_EQ(0U, deleted_keys_.size());
auto erased = cache->Release(handle, true);
ASSERT_TRUE(erased);
// This tests that the deleter has been called
ASSERT_EQ(1U, deleted_values_.size());
ASSERT_EQ(1U, deleted_keys_.size());
}
TEST_P(CacheTest, ReleaseWithoutErase) {
std::shared_ptr<Cache> cache = NewCache(5, 0, false);
Cache::Handle* handle;
Status s =
cache->Insert(EncodeKey(100), EncodeValue(100), &kHelper, 1, &handle);
Status s = cache->Insert(EncodeKey(100), EncodeValue(100), 1,
&CacheTest::Deleter, &handle);
ASSERT_TRUE(s.ok());
ASSERT_EQ(5U, cache->GetCapacity());
ASSERT_EQ(1U, cache->GetUsage());
ASSERT_EQ(0U, deleted_values_.size());
ASSERT_EQ(0U, deleted_keys_.size());
auto erased = cache->Release(handle);
ASSERT_FALSE(erased);
// This tests that the deleter is not called. When the cache has free
// capacity, it is not expected to immediately erase the released items.
ASSERT_EQ(0U, deleted_values_.size());
ASSERT_EQ(0U, deleted_keys_.size());
}
namespace {
class Value {
public:
explicit Value(int v) : v_(v) {}
int v_;
static constexpr auto kCacheEntryRole = CacheEntryRole::kMisc;
};
using SharedCache = BasicTypedSharedCacheInterface<Value>;
using TypedHandle = SharedCache::TypedHandle;
} // namespace
TEST_P(CacheTest, SetCapacity) {
auto type = GetParam();
if (type == kHyperClock) {
@ -689,19 +731,19 @@ TEST_P(CacheTest, SetCapacity) {
// lets create a cache with capacity 5,
// then, insert 5 elements, then increase capacity
// to 10, returned capacity should be 10, usage=5
SharedCache cache{NewCache(5, 0, false)};
std::vector<TypedHandle*> handles(10);
std::shared_ptr<Cache> cache = NewCache(5, 0, false);
std::vector<Cache::Handle*> handles(10);
// Insert 5 entries, but not releasing.
for (int i = 0; i < 5; i++) {
std::string key = EncodeKey(i + 1);
Status s = cache.Insert(key, new Value(i + 1), 1, &handles[i]);
Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
ASSERT_TRUE(s.ok());
}
ASSERT_EQ(5U, cache.get()->GetCapacity());
ASSERT_EQ(5U, cache.get()->GetUsage());
cache.get()->SetCapacity(10);
ASSERT_EQ(10U, cache.get()->GetCapacity());
ASSERT_EQ(5U, cache.get()->GetUsage());
ASSERT_EQ(5U, cache->GetCapacity());
ASSERT_EQ(5U, cache->GetUsage());
cache->SetCapacity(10);
ASSERT_EQ(10U, cache->GetCapacity());
ASSERT_EQ(5U, cache->GetUsage());
// test2: decrease capacity
// insert 5 more elements to cache, then release 5,
@ -709,77 +751,77 @@ TEST_P(CacheTest, SetCapacity) {
// and usage should be 7
for (int i = 5; i < 10; i++) {
std::string key = EncodeKey(i + 1);
Status s = cache.Insert(key, new Value(i + 1), 1, &handles[i]);
Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
ASSERT_TRUE(s.ok());
}
ASSERT_EQ(10U, cache.get()->GetCapacity());
ASSERT_EQ(10U, cache.get()->GetUsage());
ASSERT_EQ(10U, cache->GetCapacity());
ASSERT_EQ(10U, cache->GetUsage());
for (int i = 0; i < 5; i++) {
cache.Release(handles[i]);
cache->Release(handles[i]);
}
ASSERT_EQ(10U, cache.get()->GetCapacity());
ASSERT_EQ(10U, cache.get()->GetUsage());
cache.get()->SetCapacity(7);
ASSERT_EQ(7, cache.get()->GetCapacity());
ASSERT_EQ(7, cache.get()->GetUsage());
ASSERT_EQ(10U, cache->GetCapacity());
ASSERT_EQ(10U, cache->GetUsage());
cache->SetCapacity(7);
ASSERT_EQ(7, cache->GetCapacity());
ASSERT_EQ(7, cache->GetUsage());
// release remaining 5 to keep valgrind happy
for (int i = 5; i < 10; i++) {
cache.Release(handles[i]);
cache->Release(handles[i]);
}
// Make sure this doesn't crash or upset ASAN/valgrind
cache.get()->DisownData();
cache->DisownData();
}
TEST_P(LRUCacheTest, SetStrictCapacityLimit) {
// test1: set the flag to false. Insert more keys than capacity. See if they
// all go through.
SharedCache cache{NewCache(5, 0, false)};
std::vector<TypedHandle*> handles(10);
std::shared_ptr<Cache> cache = NewCache(5, 0, false);
std::vector<Cache::Handle*> handles(10);
Status s;
for (int i = 0; i < 10; i++) {
std::string key = EncodeKey(i + 1);
s = cache.Insert(key, new Value(i + 1), 1, &handles[i]);
s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
ASSERT_OK(s);
ASSERT_NE(nullptr, handles[i]);
}
ASSERT_EQ(10, cache.get()->GetUsage());
ASSERT_EQ(10, cache->GetUsage());
// test2: set the flag to true. Insert and check if it fails.
std::string extra_key = EncodeKey(100);
Value* extra_value = new Value(0);
cache.get()->SetStrictCapacityLimit(true);
TypedHandle* handle;
s = cache.Insert(extra_key, extra_value, 1, &handle);
cache->SetStrictCapacityLimit(true);
Cache::Handle* handle;
s = cache->Insert(extra_key, extra_value, 1, &deleter, &handle);
ASSERT_TRUE(s.IsMemoryLimit());
ASSERT_EQ(nullptr, handle);
ASSERT_EQ(10, cache.get()->GetUsage());
ASSERT_EQ(10, cache->GetUsage());
for (int i = 0; i < 10; i++) {
cache.Release(handles[i]);
cache->Release(handles[i]);
}
// test3: init with flag being true.
SharedCache cache2{NewCache(5, 0, true)};
std::shared_ptr<Cache> cache2 = NewCache(5, 0, true);
for (int i = 0; i < 5; i++) {
std::string key = EncodeKey(i + 1);
s = cache2.Insert(key, new Value(i + 1), 1, &handles[i]);
s = cache2->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
ASSERT_OK(s);
ASSERT_NE(nullptr, handles[i]);
}
s = cache2.Insert(extra_key, extra_value, 1, &handle);
s = cache2->Insert(extra_key, extra_value, 1, &deleter, &handle);
ASSERT_TRUE(s.IsMemoryLimit());
ASSERT_EQ(nullptr, handle);
// test insert without handle
s = cache2.Insert(extra_key, extra_value, 1);
s = cache2->Insert(extra_key, extra_value, 1, &deleter);
// As if the key had been inserted into the cache but evicted immediately.
ASSERT_OK(s);
ASSERT_EQ(5, cache2.get()->GetUsage());
ASSERT_EQ(nullptr, cache2.Lookup(extra_key));
ASSERT_EQ(5, cache2->GetUsage());
ASSERT_EQ(nullptr, cache2->Lookup(extra_key));
for (int i = 0; i < 5; i++) {
cache2.Release(handles[i]);
cache2->Release(handles[i]);
}
}
@ -787,54 +829,55 @@ TEST_P(CacheTest, OverCapacity) {
size_t n = 10;
// a LRUCache with n entries and one shard only
SharedCache cache{NewCache(n, 0, false)};
std::vector<TypedHandle*> handles(n + 1);
std::shared_ptr<Cache> cache = NewCache(n, 0, false);
std::vector<Cache::Handle*> handles(n + 1);
// Insert n+1 entries, but not releasing.
for (int i = 0; i < static_cast<int>(n + 1); i++) {
std::string key = EncodeKey(i + 1);
Status s = cache.Insert(key, new Value(i + 1), 1, &handles[i]);
Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
ASSERT_TRUE(s.ok());
}
// Guess what's in the cache now?
for (int i = 0; i < static_cast<int>(n + 1); i++) {
std::string key = EncodeKey(i + 1);
auto h = cache.Lookup(key);
auto h = cache->Lookup(key);
ASSERT_TRUE(h != nullptr);
if (h) cache.Release(h);
if (h) cache->Release(h);
}
// the cache is over capacity since nothing could be evicted
ASSERT_EQ(n + 1U, cache.get()->GetUsage());
ASSERT_EQ(n + 1U, cache->GetUsage());
for (int i = 0; i < static_cast<int>(n + 1); i++) {
cache.Release(handles[i]);
cache->Release(handles[i]);
}
if (GetParam() == kHyperClock) {
// Make sure eviction is triggered.
ASSERT_OK(cache.Insert(EncodeKey(-1), nullptr, 1, &handles[0]));
ASSERT_OK(cache->Insert(EncodeKey(-1), nullptr, 1, &deleter, &handles[0]));
// cache is under capacity now since elements were released
ASSERT_GE(n, cache.get()->GetUsage());
ASSERT_GE(n, cache->GetUsage());
// clean up
cache.Release(handles[0]);
cache->Release(handles[0]);
} else {
// LRUCache checks for over-capacity in Release.
// cache is exactly at capacity now with minimal eviction
ASSERT_EQ(n, cache.get()->GetUsage());
ASSERT_EQ(n, cache->GetUsage());
// element 0 is evicted and the rest is there
// This is consistent with the LRU policy since the element 0
// was released first
for (int i = 0; i < static_cast<int>(n + 1); i++) {
std::string key = EncodeKey(i + 1);
auto h = cache.Lookup(key);
auto h = cache->Lookup(key);
if (h) {
ASSERT_NE(static_cast<size_t>(i), 0U);
cache.Release(h);
cache->Release(h);
} else {
ASSERT_EQ(static_cast<size_t>(i), 0U);
}
@ -842,15 +885,40 @@ TEST_P(CacheTest, OverCapacity) {
}
}
namespace {
std::vector<std::pair<int, int>> legacy_callback_state;
void legacy_callback(void* value, size_t charge) {
legacy_callback_state.push_back(
{DecodeValue(value), static_cast<int>(charge)});
}
}; // namespace
TEST_P(CacheTest, ApplyToAllCacheEntriesTest) {
std::vector<std::pair<int, int>> inserted;
legacy_callback_state.clear();
for (int i = 0; i < 10; ++i) {
Insert(i, i * 2, i + 1);
inserted.push_back({i * 2, i + 1});
}
cache_->ApplyToAllCacheEntries(legacy_callback, true);
std::sort(inserted.begin(), inserted.end());
std::sort(legacy_callback_state.begin(), legacy_callback_state.end());
ASSERT_EQ(inserted.size(), legacy_callback_state.size());
for (int i = 0; i < static_cast<int>(inserted.size()); ++i) {
EXPECT_EQ(inserted[i], legacy_callback_state[i]);
}
}
TEST_P(CacheTest, ApplyToAllEntriesTest) {
std::vector<std::string> callback_state;
const auto callback = [&](const Slice& key, Cache::ObjectPtr value,
size_t charge,
const Cache::CacheItemHelper* helper) {
const auto callback = [&](const Slice& key, void* value, size_t charge,
Cache::DeleterFn deleter) {
callback_state.push_back(std::to_string(DecodeKey(key)) + "," +
std::to_string(DecodeValue(value)) + "," +
std::to_string(charge));
assert(helper == &CacheTest::kHelper);
assert(deleter == &CacheTest::Deleter);
};
std::vector<std::string> inserted;
@ -889,8 +957,8 @@ TEST_P(CacheTest, ApplyToAllEntriesDuringResize) {
// For callback
int special_count = 0;
const auto callback = [&](const Slice&, Cache::ObjectPtr, size_t charge,
const Cache::CacheItemHelper*) {
const auto callback = [&](const Slice&, void*, size_t charge,
Cache::DeleterFn) {
if (charge == static_cast<size_t>(kSpecialCharge)) {
++special_count;
}
@ -952,105 +1020,13 @@ TEST_P(CacheTest, GetChargeAndDeleter) {
Cache::Handle* h1 = cache_->Lookup(EncodeKey(1));
ASSERT_EQ(2, DecodeValue(cache_->Value(h1)));
ASSERT_EQ(1, cache_->GetCharge(h1));
ASSERT_EQ(&CacheTest::kHelper, cache_->GetCacheItemHelper(h1));
ASSERT_EQ(&CacheTest::Deleter, cache_->GetDeleter(h1));
cache_->Release(h1);
}
namespace {
bool AreTwoCacheKeysOrdered(Cache* cache) {
std::vector<std::string> keys;
const auto callback = [&](const Slice& key, Cache::ObjectPtr /*value*/,
size_t /*charge*/,
const Cache::CacheItemHelper* /*helper*/) {
keys.push_back(key.ToString());
};
cache->ApplyToAllEntries(callback, /*opts*/ {});
EXPECT_EQ(keys.size(), 2U);
EXPECT_NE(keys[0], keys[1]);
return keys[0] < keys[1];
}
} // namespace
TEST_P(CacheTest, CacheUniqueSeeds) {
// kQuasiRandomHashSeed should generate unique seeds (up to 2 billion before
// repeating)
UnorderedSet<uint32_t> seeds_seen;
// Roughly sqrt(number of possible values) for a decent chance at detecting
// a random collision if it's possible (shouldn't be)
uint16_t kSamples = 20000;
seeds_seen.reserve(kSamples);
// Hash seed should affect ordering of entries in the table, so we should
// have an extremely high chance of seeing two entries ordered both ways.
bool seen_forward_order = false;
bool seen_reverse_order = false;
for (int i = 0; i < kSamples; ++i) {
auto cache = NewCache(2, [=](ShardedCacheOptions& opts) {
opts.hash_seed = LRUCacheOptions::kQuasiRandomHashSeed;
opts.num_shard_bits = 0;
opts.metadata_charge_policy = kDontChargeCacheMetadata;
});
auto val = cache->GetHashSeed();
ASSERT_TRUE(seeds_seen.insert(val).second);
ASSERT_OK(cache->Insert(EncodeKey(1), nullptr, &kHelper, /*charge*/ 1));
ASSERT_OK(cache->Insert(EncodeKey(2), nullptr, &kHelper, /*charge*/ 1));
if (AreTwoCacheKeysOrdered(cache.get())) {
seen_forward_order = true;
} else {
seen_reverse_order = true;
}
}
ASSERT_TRUE(seen_forward_order);
ASSERT_TRUE(seen_reverse_order);
}
TEST_P(CacheTest, CacheHostSeed) {
// kHostHashSeed should generate a consistent seed within this process
// (and other processes on the same host, but not unit testing that).
// And we should be able to use that chosen seed as an explicit option
// (for debugging).
// And we should verify consistent ordering of entries.
uint32_t expected_seed = 0;
bool expected_order = false;
// 10 iterations -> chance of a random seed falsely appearing consistent
// should be low, just 1 in 2^9.
for (int i = 0; i < 10; ++i) {
auto cache = NewCache(2, [=](ShardedCacheOptions& opts) {
if (i != 5) {
opts.hash_seed = LRUCacheOptions::kHostHashSeed;
} else {
// Can be used as explicit seed
opts.hash_seed = static_cast<int32_t>(expected_seed);
ASSERT_GE(opts.hash_seed, 0);
}
opts.num_shard_bits = 0;
opts.metadata_charge_policy = kDontChargeCacheMetadata;
});
ASSERT_OK(cache->Insert(EncodeKey(1), nullptr, &kHelper, /*charge*/ 1));
ASSERT_OK(cache->Insert(EncodeKey(2), nullptr, &kHelper, /*charge*/ 1));
uint32_t val = cache->GetHashSeed();
bool order = AreTwoCacheKeysOrdered(cache.get());
if (i != 0) {
ASSERT_EQ(val, expected_seed);
ASSERT_EQ(order, expected_order);
} else {
expected_seed = val;
expected_order = order;
}
}
// Printed for reference in case it's needed to reproduce other unit test
// failures on another host
fprintf(stderr, "kHostHashSeed -> %u\n", (unsigned)expected_seed);
}
INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest,
secondary_cache_test_util::GetTestingCacheTypes());
INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest,
testing::Values(secondary_cache_test_util::kLRU));
testing::Values(kLRU, kHyperClock));
INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest, testing::Values(kLRU));
} // namespace ROCKSDB_NAMESPACE

@ -11,57 +11,65 @@ namespace ROCKSDB_NAMESPACE {
ChargedCache::ChargedCache(std::shared_ptr<Cache> cache,
std::shared_ptr<Cache> block_cache)
: CacheWrapper(cache),
: cache_(cache),
cache_res_mgr_(std::make_shared<ConcurrentCacheReservationManager>(
std::make_shared<
CacheReservationManagerImpl<CacheEntryRole::kBlobCache>>(
block_cache))) {}
Status ChargedCache::Insert(const Slice& key, ObjectPtr obj,
Status ChargedCache::Insert(const Slice& key, void* value, size_t charge,
DeleterFn deleter, Handle** handle,
Priority priority) {
Status s = cache_->Insert(key, value, charge, deleter, handle, priority);
if (s.ok()) {
// Insert may cause the cache entry eviction if the cache is full. So we
// directly call the reservation manager to update the total memory used
// in the cache.
assert(cache_res_mgr_);
cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
.PermitUncheckedError();
}
return s;
}
Status ChargedCache::Insert(const Slice& key, void* value,
const CacheItemHelper* helper, size_t charge,
Handle** handle, Priority priority) {
Status s = target_->Insert(key, obj, helper, charge, handle, priority);
Status s = cache_->Insert(key, value, helper, charge, handle, priority);
if (s.ok()) {
// Insert may cause the cache entry eviction if the cache is full. So we
// directly call the reservation manager to update the total memory used
// in the cache.
assert(cache_res_mgr_);
cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
.PermitUncheckedError();
}
return s;
}
Cache::Handle* ChargedCache::Lookup(const Slice& key, Statistics* stats) {
return cache_->Lookup(key, stats);
}
Cache::Handle* ChargedCache::Lookup(const Slice& key,
const CacheItemHelper* helper,
CreateContext* create_context,
Priority priority, Statistics* stats) {
auto handle = target_->Lookup(key, helper, create_context, priority, stats);
const CreateCallback& create_cb,
Priority priority, bool wait,
Statistics* stats) {
auto handle = cache_->Lookup(key, helper, create_cb, priority, wait, stats);
// Lookup may promote the KV pair from the secondary cache to the primary
// cache. So we directly call the reservation manager to update the total
// memory used in the cache.
if (helper && helper->create_cb) {
assert(cache_res_mgr_);
cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
.PermitUncheckedError();
}
return handle;
}
void ChargedCache::WaitAll(AsyncLookupHandle* async_handles, size_t count) {
target_->WaitAll(async_handles, count);
// Update in case of any promotions. Although some could finish by the
// return of StartAsyncLookup, Wait/WaitAll will generally be used, so it
// is simpler to update here.
assert(cache_res_mgr_);
cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
.PermitUncheckedError();
return handle;
}
bool ChargedCache::Release(Cache::Handle* handle, bool useful,
bool erase_if_last_ref) {
size_t memory_used_delta = target_->GetUsage(handle);
bool erased = target_->Release(handle, useful, erase_if_last_ref);
size_t memory_used_delta = cache_->GetUsage(handle);
bool erased = cache_->Release(handle, useful, erase_if_last_ref);
if (erased) {
assert(cache_res_mgr_);
cache_res_mgr_
@ -72,8 +80,8 @@ bool ChargedCache::Release(Cache::Handle* handle, bool useful,
}
bool ChargedCache::Release(Cache::Handle* handle, bool erase_if_last_ref) {
size_t memory_used_delta = target_->GetUsage(handle);
bool erased = target_->Release(handle, erase_if_last_ref);
size_t memory_used_delta = cache_->GetUsage(handle);
bool erased = cache_->Release(handle, erase_if_last_ref);
if (erased) {
assert(cache_res_mgr_);
cache_res_mgr_
@ -84,25 +92,25 @@ bool ChargedCache::Release(Cache::Handle* handle, bool erase_if_last_ref) {
}
void ChargedCache::Erase(const Slice& key) {
target_->Erase(key);
cache_->Erase(key);
assert(cache_res_mgr_);
cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
.PermitUncheckedError();
}
void ChargedCache::EraseUnRefEntries() {
target_->EraseUnRefEntries();
cache_->EraseUnRefEntries();
assert(cache_res_mgr_);
cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
.PermitUncheckedError();
}
void ChargedCache::SetCapacity(size_t capacity) {
target_->SetCapacity(capacity);
cache_->SetCapacity(capacity);
// SetCapacity can result in evictions when the cache capacity is decreased,
// so we would want to update the cache reservation here as well.
assert(cache_res_mgr_);
cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
.PermitUncheckedError();
}

@ -8,7 +8,7 @@
#include <string>
#include "port/port.h"
#include "rocksdb/advanced_cache.h"
#include "rocksdb/cache.h"
namespace ROCKSDB_NAMESPACE {
@ -17,21 +17,22 @@ class ConcurrentCacheReservationManager;
// A cache interface which wraps around another cache and takes care of
// reserving space in block cache towards a single global memory limit, and
// forwards all the calls to the underlying cache.
class ChargedCache : public CacheWrapper {
class ChargedCache : public Cache {
public:
ChargedCache(std::shared_ptr<Cache> cache,
std::shared_ptr<Cache> block_cache);
~ChargedCache() override = default;
Status Insert(const Slice& key, ObjectPtr obj, const CacheItemHelper* helper,
Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter,
Handle** handle, Priority priority) override;
Status Insert(const Slice& key, void* value, const CacheItemHelper* helper,
size_t charge, Handle** handle = nullptr,
Priority priority = Priority::LOW) override;
Cache::Handle* Lookup(const Slice& key, Statistics* stats) override;
Cache::Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
CreateContext* create_context,
Priority priority = Priority::LOW,
Statistics* stats = nullptr) override;
void WaitAll(AsyncLookupHandle* async_handles, size_t count) override;
const CreateCallback& create_cb, Priority priority,
bool wait, Statistics* stats = nullptr) override;
bool Release(Cache::Handle* handle, bool useful,
bool erase_if_last_ref = false) override;
@ -43,9 +44,69 @@ class ChargedCache : public CacheWrapper {
static const char* kClassName() { return "ChargedCache"; }
const char* Name() const override { return kClassName(); }
uint64_t NewId() override { return cache_->NewId(); }
void SetCapacity(size_t capacity) override;
inline Cache* GetCache() const { return target_.get(); }
void SetStrictCapacityLimit(bool strict_capacity_limit) override {
cache_->SetStrictCapacityLimit(strict_capacity_limit);
}
bool HasStrictCapacityLimit() const override {
return cache_->HasStrictCapacityLimit();
}
void* Value(Cache::Handle* handle) override { return cache_->Value(handle); }
bool IsReady(Cache::Handle* handle) override {
return cache_->IsReady(handle);
}
void Wait(Cache::Handle* handle) override { cache_->Wait(handle); }
void WaitAll(std::vector<Handle*>& handles) override {
cache_->WaitAll(handles);
}
bool Ref(Cache::Handle* handle) override { return cache_->Ref(handle); }
size_t GetCapacity() const override { return cache_->GetCapacity(); }
size_t GetUsage() const override { return cache_->GetUsage(); }
size_t GetUsage(Cache::Handle* handle) const override {
return cache_->GetUsage(handle);
}
size_t GetPinnedUsage() const override { return cache_->GetPinnedUsage(); }
size_t GetCharge(Cache::Handle* handle) const override {
return cache_->GetCharge(handle);
}
Cache::DeleterFn GetDeleter(Cache::Handle* handle) const override {
return cache_->GetDeleter(handle);
}
void ApplyToAllEntries(
const std::function<void(const Slice& key, void* value, size_t charge,
Cache::DeleterFn deleter)>& callback,
const Cache::ApplyToAllEntriesOptions& opts) override {
cache_->ApplyToAllEntries(callback, opts);
}
void ApplyToAllCacheEntries(void (*callback)(void* value, size_t charge),
bool thread_safe) override {
cache_->ApplyToAllCacheEntries(callback, thread_safe);
}
std::string GetPrintableOptions() const override {
return cache_->GetPrintableOptions();
}
void DisownData() override { return cache_->DisownData(); }
inline Cache* GetCache() const { return cache_.get(); }
inline ConcurrentCacheReservationManager* TEST_GetCacheReservationManager()
const {
@ -53,6 +114,7 @@ class ChargedCache : public CacheWrapper {
}
private:
std::shared_ptr<Cache> cache_;
std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr_;
};
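Putting the wrapper together, an illustrative sketch only (capacities, names, and direct construction are invented here; real setups typically wire this through cache options):
// Sketch only: charge blob-cache usage against the block cache's budget.
static std::shared_ptr<Cache> MakeChargedBlobCache() {
  std::shared_ptr<Cache> block_cache = NewLRUCache(64 << 20);  // 64 MiB
  std::shared_ptr<Cache> blob_cache = NewLRUCache(32 << 20);   // 32 MiB
  return std::make_shared<ChargedCache>(blob_cache, block_cache);
}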

971 cache/clock_cache.cc

File diff suppressed because it is too large.

345 cache/clock_cache.h

@ -24,7 +24,6 @@
#include "rocksdb/cache.h"
#include "rocksdb/secondary_cache.h"
#include "util/autovector.h"
#include "util/math.h"
namespace ROCKSDB_NAMESPACE {
@ -146,7 +145,7 @@ class ClockCacheTest;
// (erased by user) but can be read by existing references, and ref count
// changed by Ref and Release.
//
// A special case is "standalone" entries, which are heap-allocated handles
// A special case is "detached" entries, which are heap-allocated handles
// not in the table. They are always Invisible and freed on zero refs.
//
// State transitions:
@ -201,8 +200,8 @@ class ClockCacheTest;
// table occupancy limit has been reached. If strict_capacity_limit=false,
// we must never fail Insert, and if a Handle* is provided, we have to return
// a usable Cache handle on success. The solution to this (typically rare)
// problem is "standalone" handles, which are usable by the caller but not
// actually available for Lookup in the Cache. Standalone handles are allocated
// problem is "detached" handles, which are usable by the caller but not
// actually available for Lookup in the Cache. Detached handles are allocated
// independently on the heap and specially marked so that they are freed on
// the heap when their last reference is released.
//
@ -306,17 +305,23 @@ constexpr double kLoadFactor = 0.7;
constexpr double kStrictLoadFactor = 0.84;
struct ClockHandleBasicData {
Cache::ObjectPtr value = nullptr;
const Cache::CacheItemHelper* helper = nullptr;
void* value = nullptr;
Cache::DeleterFn deleter = nullptr;
// A lossless, reversible hash of the fixed-size (16 byte) cache key. This
// eliminates the need to store a hash separately.
UniqueId64x2 hashed_key = kNullUniqueId64x2;
size_t total_charge = 0;
// For total_charge_and_flags
// "Detached" means the handle is allocated separately from hash table.
static constexpr uint64_t kFlagDetached = uint64_t{1} << 63;
// Extract just the total charge
static constexpr uint64_t kTotalChargeMask = kFlagDetached - 1;
inline size_t GetTotalCharge() const { return total_charge; }
// Calls deleter (if non-null) on cache key and value
void FreeData(MemoryAllocator* allocator) const;
void FreeData() const;
// Required by concept HandleImpl
const UniqueId64x2& GetHash() const { return hashed_key; }
@ -372,134 +377,14 @@ struct ClockHandle : public ClockHandleBasicData {
static constexpr uint8_t kMaxCountdown = kHighCountdown;
// TODO: make these countdown values tuning parameters for eviction?
// See above. Mutable for read reference counting.
mutable std::atomic<uint64_t> meta{};
// Whether this is a "detached" handle that is independently allocated
// with `new` (so must be deleted with `delete`).
// TODO: ideally this would be packed into some other data field, such
// as upper bits of total_charge, but that incurs a measurable performance
// regression.
bool standalone = false;
// See above
std::atomic<uint64_t> meta{};
inline bool IsStandalone() const { return standalone; }
inline void SetStandalone() { standalone = true; }
// Anticipating use for SecondaryCache support
void* reserved_for_future_use = nullptr;
}; // struct ClockHandle
class BaseClockTable {
public:
BaseClockTable(CacheMetadataChargePolicy metadata_charge_policy,
MemoryAllocator* allocator,
const Cache::EvictionCallback* eviction_callback,
const uint32_t* hash_seed)
: metadata_charge_policy_(metadata_charge_policy),
allocator_(allocator),
eviction_callback_(*eviction_callback),
hash_seed_(*hash_seed) {}
template <class Table>
typename Table::HandleImpl* CreateStandalone(ClockHandleBasicData& proto,
size_t capacity,
bool strict_capacity_limit,
bool allow_uncharged);
template <class Table>
Status Insert(const ClockHandleBasicData& proto,
typename Table::HandleImpl** handle, Cache::Priority priority,
size_t capacity, bool strict_capacity_limit);
void Ref(ClockHandle& handle);
size_t GetOccupancy() const {
return occupancy_.load(std::memory_order_relaxed);
}
size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); }
size_t GetStandaloneUsage() const {
return standalone_usage_.load(std::memory_order_relaxed);
}
uint32_t GetHashSeed() const { return hash_seed_; }
struct EvictionData {
size_t freed_charge = 0;
size_t freed_count = 0;
};
void TrackAndReleaseEvictedEntry(ClockHandle* h, EvictionData* data);
#ifndef NDEBUG
// Acquire N references
void TEST_RefN(ClockHandle& handle, size_t n);
// Helper for TEST_ReleaseN
void TEST_ReleaseNMinus1(ClockHandle* handle, size_t n);
#endif
private: // fns
// Creates a "standalone" handle for returning from an Insert operation that
// cannot be completed by actually inserting into the table.
// Updates `standalone_usage_` but not `usage_` nor `occupancy_`.
template <class HandleImpl>
HandleImpl* StandaloneInsert(const ClockHandleBasicData& proto);
// Helper for updating `usage_` for new entry with given `total_charge`
// and evicting if needed under strict_capacity_limit=true rules. This
// means the operation might fail with Status::MemoryLimit. If
// `need_evict_for_occupancy`, then eviction of at least one entry is
// required, and the operation should fail if not possible.
// NOTE: Otherwise, occupancy_ is not managed in this function
template <class Table>
Status ChargeUsageMaybeEvictStrict(size_t total_charge, size_t capacity,
bool need_evict_for_occupancy,
typename Table::InsertState& state);
// Helper for updating `usage_` for new entry with given `total_charge`
// and evicting if needed under strict_capacity_limit=false rules. This
// means that updating `usage_` always succeeds even if forced to exceed
// capacity. If `need_evict_for_occupancy`, then eviction of at least one
// entry is required, and the operation should return false if such eviction
// is not possible. `usage_` is not updated in that case. Otherwise, returns
// true, indicating success.
// NOTE: occupancy_ is not managed in this function
template <class Table>
bool ChargeUsageMaybeEvictNonStrict(size_t total_charge, size_t capacity,
bool need_evict_for_occupancy,
typename Table::InsertState& state);
protected: // data
// We partition the following members into different cache lines
// to avoid false sharing among Lookup, Release, Erase and Insert
// operations in ClockCacheShard.
// Clock algorithm sweep pointer.
std::atomic<uint64_t> clock_pointer_{};
ALIGN_AS(CACHE_LINE_SIZE)
// Number of elements in the table.
std::atomic<size_t> occupancy_{};
// Memory usage by entries tracked by the cache (including standalone)
std::atomic<size_t> usage_{};
// Part of usage by standalone entries (not in table)
std::atomic<size_t> standalone_usage_{};
ALIGN_AS(CACHE_LINE_SIZE)
const CacheMetadataChargePolicy metadata_charge_policy_;
// From Cache, for deleter
MemoryAllocator* const allocator_;
// A reference to Cache::eviction_callback_
const Cache::EvictionCallback& eviction_callback_;
// A reference to ShardedCacheBase::hash_seed_
const uint32_t& hash_seed_;
};
class HyperClockTable : public BaseClockTable {
class HyperClockTable {
public:
// Target size to be exactly a common cache line size (see static_assert in
// clock_cache.cc)
@ -508,6 +393,16 @@ class HyperClockTable : public BaseClockTable {
// up in this slot or a higher one.
std::atomic<uint32_t> displacements{};
// Whether this is a "detached" handle that is independently allocated
// with `new` (so must be deleted with `delete`).
// TODO: ideally this would be packed into some other data field, such
// as upper bits of total_charge, but that incurs a measurable performance
// regression.
bool detached = false;
inline bool IsDetached() const { return detached; }
inline void SetDetached() { detached = true; }
}; // struct HandleImpl
struct Opts {
@ -516,70 +411,75 @@ class HyperClockTable : public BaseClockTable {
HyperClockTable(size_t capacity, bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy,
MemoryAllocator* allocator,
const Cache::EvictionCallback* eviction_callback,
const uint32_t* hash_seed, const Opts& opts);
const Opts& opts);
~HyperClockTable();
// For BaseClockTable::Insert
struct InsertState {};
void StartInsert(InsertState& state);
// Returns true iff there is room for the proposed number of entries.
bool GrowIfNeeded(size_t new_occupancy, InsertState& state);
HandleImpl* DoInsert(const ClockHandleBasicData& proto,
uint64_t initial_countdown, bool take_ref,
InsertState& state);
// Runs the clock eviction algorithm trying to reclaim at least
// requested_charge. Returns how much is evicted, which could be less
// if it appears impossible to evict the requested amount without blocking.
void Evict(size_t requested_charge, InsertState& state, EvictionData* data);
Status Insert(const ClockHandleBasicData& proto, HandleImpl** handle,
Cache::Priority priority, size_t capacity,
bool strict_capacity_limit);
HandleImpl* Lookup(const UniqueId64x2& hashed_key);
bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref);
void Ref(HandleImpl& handle);
void Erase(const UniqueId64x2& hashed_key);
void ConstApplyToEntriesRange(std::function<void(const HandleImpl&)> func,
size_t index_begin, size_t index_end,
bool apply_if_will_be_deleted) const;
void EraseUnRefEntries();
size_t GetTableSize() const { return size_t{1} << length_bits_; }
int GetLengthBits() const { return length_bits_; }
size_t GetOccupancy() const {
return occupancy_.load(std::memory_order_relaxed);
}
size_t GetOccupancyLimit() const { return occupancy_limit_; }
const HandleImpl* HandlePtr(size_t idx) const { return &array_[idx]; }
size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); }
#ifndef NDEBUG
size_t& TEST_MutableOccupancyLimit() const {
return const_cast<size_t&>(occupancy_limit_);
size_t GetDetachedUsage() const {
return detached_usage_.load(std::memory_order_relaxed);
}
// Release N references
// Acquire/release N references
void TEST_RefN(HandleImpl& handle, size_t n);
void TEST_ReleaseN(HandleImpl* handle, size_t n);
#endif
private: // functions
// Returns x mod 2^{length_bits_}.
inline size_t ModTableSize(uint64_t x) {
return BitwiseAnd(x, length_bits_mask_);
return static_cast<size_t>(x) & length_bits_mask_;
}
// Returns the first slot in the probe sequence with a handle e such that
// match_fn(e) is true. At every step, the function first tests whether
// match_fn(e) holds. If this is false, it evaluates abort_fn(e) to decide
// whether the search should be aborted, and if so, FindSlot immediately
// returns nullptr. For every handle e that is not a match and not aborted,
// FindSlot runs update_fn(e, is_last) where is_last is set to true iff that
// slot will be the last probed because the next would cycle back to the first
// slot probed. This function uses templates instead of std::function to
// minimize the risk of heap-allocated closures being created.
template <typename MatchFn, typename AbortFn, typename UpdateFn>
// Runs the clock eviction algorithm trying to reclaim at least
// requested_charge. Returns how much is evicted, which could be less
// if it appears impossible to evict the requested amount without blocking.
inline void Evict(size_t requested_charge, size_t* freed_charge,
size_t* freed_count);
// Returns the first slot in the probe sequence, starting from the given
// probe number, with a handle e such that match(e) is true. At every
// step, the function first tests whether match(e) holds. If this is false,
// it evaluates abort(e) to decide whether the search should be aborted,
// and in the affirmative returns -1. For every handle e probed except
// the last one, the function runs update(e).
// The probe parameter is modified as follows. We say a probe to a handle
// e is aborting if match(e) is false and abort(e) is true. Then the final
// value of probe is one more than the last non-aborting probe during the
// call. This is so that the variable can be used to keep track of
// progress across consecutive calls to FindSlot.
inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key,
const MatchFn& match_fn, const AbortFn& abort_fn,
const UpdateFn& update_fn);
std::function<bool(HandleImpl*)> match,
std::function<bool(HandleImpl*)> stop,
std::function<void(HandleImpl*)> update,
size_t& probe);
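As an aside, a minimal sketch of the probe-sequence contract documented above, using illustrative names and simple linear probing rather than the table's real probing scheme; it only demonstrates the match/abort/update roles and how `probe` resumes across calls:

#include <cstddef>

struct Slot {};  // stand-in for HandleImpl

// Probe at most table_size slots starting at (start + probe). Returns the
// first slot where match(e) holds; abort(e) stops the search early; update(e)
// runs on every probed slot that neither matched nor aborted. On return,
// probe is one past the last non-aborting probe, so a later call can resume.
template <typename MatchFn, typename AbortFn, typename UpdateFn>
Slot* FindSlotSketch(Slot* table, size_t table_size, size_t start,
                     const MatchFn& match, const AbortFn& abort,
                     const UpdateFn& update, size_t& probe) {
  while (probe < table_size) {
    Slot* e = &table[(start + probe) % table_size];
    if (match(e)) {
      ++probe;         // count this (non-aborting) probe
      return e;
    }
    if (abort(e)) {
      return nullptr;  // aborting probe: probe is not advanced past it
    }
    update(e);
    ++probe;
  }
  return nullptr;      // exhausted the table without a match
}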
// Re-decrement all displacements in probe path starting from beginning
// until (not including) the given handle
@ -592,7 +492,32 @@ class HyperClockTable : public BaseClockTable {
// before releasing it so that it can be provided to this function.
inline void ReclaimEntryUsage(size_t total_charge);
MemoryAllocator* GetAllocator() const { return allocator_; }
// Helper for updating `usage_` for new entry with given `total_charge`
// and evicting if needed under strict_capacity_limit=true rules. This
// means the operation might fail with Status::MemoryLimit. If
// `need_evict_for_occupancy`, then eviction of at least one entry is
// required, and the operation should fail if not possible.
// NOTE: Otherwise, occupancy_ is not managed in this function
inline Status ChargeUsageMaybeEvictStrict(size_t total_charge,
size_t capacity,
bool need_evict_for_occupancy);
// Helper for updating `usage_` for new entry with given `total_charge`
// and evicting if needed under strict_capacity_limit=false rules. This
// means that updating `usage_` always succeeds even if forced to exceed
// capacity. If `need_evict_for_occupancy`, then eviction of at least one
// entry is required, and the operation should return false if such eviction
// is not possible. `usage_` is not updated in that case. Otherwise, returns
// true, indicating success.
// NOTE: occupancy_ is not managed in this function
inline bool ChargeUsageMaybeEvictNonStrict(size_t total_charge,
size_t capacity,
bool need_evict_for_occupancy);
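A single-threaded sketch of the two charging rules described above, with illustrative names; it ignores the `need_evict_for_occupancy` case and the atomics/CAS that the real helpers need:

#include <cstddef>

// evict_some stands in for running the clock sweep; it returns bytes freed,
// which may be less than requested.
using EvictFn = size_t (*)(size_t wanted);

// strict_capacity_limit = true: charge only if the entry fits after eviction;
// the caller would map a false return to Status::MemoryLimit.
inline bool ChargeStrictSketch(size_t& usage, size_t total_charge,
                               size_t capacity, EvictFn evict_some) {
  if (usage + total_charge > capacity) {
    usage -= evict_some(usage + total_charge - capacity);
    if (usage + total_charge > capacity) {
      return false;  // still over capacity: do not charge usage
    }
  }
  usage += total_charge;
  return true;
}

// strict_capacity_limit = false: always charge, even if capacity is exceeded;
// eviction is best-effort.
inline void ChargeNonStrictSketch(size_t& usage, size_t total_charge,
                                  size_t capacity, EvictFn evict_some) {
  usage += total_charge;
  if (usage > capacity) {
    usage -= evict_some(usage - capacity);
  }
}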
// Creates a "detached" handle for returning from an Insert operation that
// cannot be completed by actually inserting into the table.
// Updates `detached_usage_` but not `usage_` nor `occupancy_`.
inline HandleImpl* DetachedInsert(const ClockHandleBasicData& proto);
// Returns the number of bits used to hash an element in the hash
// table.
@ -612,6 +537,24 @@ class HyperClockTable : public BaseClockTable {
// Array of slots comprising the hash table.
const std::unique_ptr<HandleImpl[]> array_;
// We partition the following members into different cache lines
// to avoid false sharing among Lookup, Release, Erase and Insert
// operations in ClockCacheShard.
ALIGN_AS(CACHE_LINE_SIZE)
// Clock algorithm sweep pointer.
std::atomic<uint64_t> clock_pointer_{};
ALIGN_AS(CACHE_LINE_SIZE)
// Number of elements in the table.
std::atomic<size_t> occupancy_{};
// Memory usage by entries tracked by the cache (including detached)
std::atomic<size_t> usage_{};
// Part of usage by detached entries (not in table)
std::atomic<size_t> detached_usage_{};
}; // class HyperClockTable
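For context on the clock_pointer_ member above, here is a self-contained toy version of CLOCK eviction over a ring of slots; the real table packs the countdown and state bits into each handle's atomic metadata and runs concurrently, which this sketch does not attempt:

#include <cstddef>
#include <vector>

struct ToySlot {
  bool occupied = false;
  unsigned countdown = 0;  // bumped on use; decremented by the sweep
  size_t charge = 0;
};

// Sweep the ring from *clock_pointer until at least requested_charge bytes
// are reclaimed or one full pass has been made. Returns bytes evicted, which
// may be less than requested (mirroring the Evict() contract above).
inline size_t ClockEvictSketch(std::vector<ToySlot>& ring,
                               size_t* clock_pointer,
                               size_t requested_charge) {
  size_t freed = 0;
  if (ring.empty()) {
    return freed;
  }
  for (size_t steps = 0; steps < ring.size() && freed < requested_charge;
       ++steps) {
    ToySlot& s = ring[*clock_pointer];
    *clock_pointer = (*clock_pointer + 1) % ring.size();
    if (!s.occupied) {
      continue;
    }
    if (s.countdown > 0) {
      --s.countdown;      // recently used: give it another chance
    } else {
      freed += s.charge;  // clock hand caught it unused: evict
      s.occupied = false;
      s.charge = 0;
    }
  }
  return freed;
}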
// A single shard of sharded cache.
@ -620,9 +563,7 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
public:
ClockCacheShard(size_t capacity, bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy,
MemoryAllocator* allocator,
const Cache::EvictionCallback* eviction_callback,
const uint32_t* hash_seed, const typename Table::Opts& opts);
const typename Table::Opts& opts);
// For CacheShard concept
using HandleImpl = typename Table::HandleImpl;
@ -632,23 +573,22 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
static inline uint32_t HashPieceForSharding(HashCref hash) {
return Upper32of64(hash[0]);
}
static inline HashVal ComputeHash(const Slice& key, uint32_t seed) {
static inline HashVal ComputeHash(const Slice& key) {
assert(key.size() == kCacheKeySize);
HashVal in;
HashVal out;
// NOTE: endian dependence
// TODO: use GetUnaligned?
std::memcpy(&in, key.data(), kCacheKeySize);
BijectiveHash2x64(in[1], in[0] ^ seed, &out[1], &out[0]);
BijectiveHash2x64(in[1], in[0], &out[1], &out[0]);
return out;
}
// For reconstructing key from hashed_key. Requires the caller to provide
// backing storage for the Slice in `unhashed`
static inline Slice ReverseHash(const UniqueId64x2& hashed,
UniqueId64x2* unhashed, uint32_t seed) {
UniqueId64x2* unhashed) {
BijectiveUnhash2x64(hashed[1], hashed[0], &(*unhashed)[1], &(*unhashed)[0]);
(*unhashed)[0] ^= seed;
// NOTE: endian dependence
return Slice(reinterpret_cast<const char*>(unhashed), kCacheKeySize);
}
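The ComputeHash/ReverseHash pair above relies on the 2x64 hash being a bijection (and, in the seeded variant shown in the diff, on XOR-mixing the seed into one input word), so the original key bytes are exactly recoverable. A toy demonstration of that round-trip property with a trivially invertible mixer, not RocksDB's BijectiveHash2x64:

#include <cassert>
#include <cstdint>

static uint64_t Rotl(uint64_t x, int r) { return (x << r) | (x >> (64 - r)); }

// Toy 128-bit bijection built from XOR/rotate steps; anything invertible works.
static void ToyHash(uint64_t in_hi, uint64_t in_lo, uint64_t* out_hi,
                    uint64_t* out_lo) {
  *out_hi = in_hi ^ Rotl(in_lo, 13);
  *out_lo = in_lo ^ Rotl(*out_hi, 29);
}
static void ToyUnhash(uint64_t out_hi, uint64_t out_lo, uint64_t* in_hi,
                      uint64_t* in_lo) {
  *in_lo = out_lo ^ Rotl(out_hi, 29);
  *in_hi = out_hi ^ Rotl(*in_lo, 13);
}

int main() {
  const uint64_t key_hi = 0x1234567890ABCDEFULL, key_lo = 0x0FEDCBA987654321ULL;
  const uint64_t seed = 0xDEADBEEFULL;
  uint64_t h_hi, h_lo, r_hi, r_lo;
  // Mirrors the seeded ComputeHash: XOR the seed into one input word, hash...
  ToyHash(key_hi, key_lo ^ seed, &h_hi, &h_lo);
  // ...and ReverseHash: unhash, then XOR the seed back out.
  ToyUnhash(h_hi, h_lo, &r_hi, &r_lo);
  r_lo ^= seed;
  assert(r_hi == key_hi && r_lo == key_lo);  // key recovered exactly
  return 0;
}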
@ -660,14 +600,9 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
void SetStrictCapacityLimit(bool strict_capacity_limit);
Status Insert(const Slice& key, const UniqueId64x2& hashed_key,
Cache::ObjectPtr value, const Cache::CacheItemHelper* helper,
size_t charge, HandleImpl** handle, Cache::Priority priority);
HandleImpl* CreateStandalone(const Slice& key, const UniqueId64x2& hashed_key,
Cache::ObjectPtr obj,
const Cache::CacheItemHelper* helper,
size_t charge, bool allow_uncharged);
Status Insert(const Slice& key, const UniqueId64x2& hashed_key, void* value,
size_t charge, Cache::DeleterFn deleter, HandleImpl** handle,
Cache::Priority priority);
HandleImpl* Lookup(const Slice& key, const UniqueId64x2& hashed_key);
@ -683,7 +618,7 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
size_t GetUsage() const;
size_t GetStandaloneUsage() const;
size_t GetDetachedUsage() const;
size_t GetPinnedUsage() const;
@ -694,30 +629,37 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
size_t GetTableAddressCount() const;
void ApplyToSomeEntries(
const std::function<void(const Slice& key, Cache::ObjectPtr obj,
size_t charge,
const Cache::CacheItemHelper* helper)>& callback,
const std::function<void(const Slice& key, void* value, size_t charge,
DeleterFn deleter)>& callback,
size_t average_entries_per_lock, size_t* state);
void EraseUnRefEntries();
std::string GetPrintableOptions() const { return std::string{}; }
// SecondaryCache not yet supported
Status Insert(const Slice& key, const UniqueId64x2& hashed_key, void* value,
const Cache::CacheItemHelper* helper, size_t charge,
HandleImpl** handle, Cache::Priority priority) {
return Insert(key, hashed_key, value, charge, helper->del_cb, handle,
priority);
}
HandleImpl* Lookup(const Slice& key, const UniqueId64x2& hashed_key,
const Cache::CacheItemHelper* /*helper*/,
Cache::CreateContext* /*create_context*/,
Cache::Priority /*priority*/, Statistics* /*stats*/) {
const Cache::CreateCallback& /*create_cb*/,
Cache::Priority /*priority*/, bool /*wait*/,
Statistics* /*stats*/) {
return Lookup(key, hashed_key);
}
#ifndef NDEBUG
size_t& TEST_MutableOccupancyLimit() const {
return table_.TEST_MutableOccupancyLimit();
}
bool IsReady(HandleImpl* /*handle*/) { return true; }
void Wait(HandleImpl* /*handle*/) {}
// Acquire/release N references
void TEST_RefN(HandleImpl* handle, size_t n);
void TEST_ReleaseN(HandleImpl* handle, size_t n);
#endif
private: // data
Table table_;
@ -737,15 +679,18 @@ class HyperClockCache
public:
using Shard = ClockCacheShard<HyperClockTable>;
explicit HyperClockCache(const HyperClockCacheOptions& opts);
HyperClockCache(size_t capacity, size_t estimated_value_size,
int num_shard_bits, bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy,
std::shared_ptr<MemoryAllocator> memory_allocator);
const char* Name() const override { return "HyperClockCache"; }
Cache::ObjectPtr Value(Handle* handle) override;
void* Value(Handle* handle) override;
size_t GetCharge(Handle* handle) const override;
const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override;
DeleterFn GetDeleter(Handle* handle) const override;
void ReportProblems(
const std::shared_ptr<Logger>& /*info_log*/) const override;

@ -9,7 +9,7 @@
#include <cstdint>
#include <memory>
#include "memory/memory_allocator_impl.h"
#include "memory/memory_allocator.h"
#include "monitoring/perf_context_imp.h"
#include "util/compression.h"
#include "util/string_util.h"
@ -17,24 +17,30 @@
namespace ROCKSDB_NAMESPACE {
CompressedSecondaryCache::CompressedSecondaryCache(
const CompressedSecondaryCacheOptions& opts)
: cache_(opts.LRUCacheOptions::MakeSharedCache()),
cache_options_(opts),
cache_res_mgr_(std::make_shared<ConcurrentCacheReservationManager>(
std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
cache_))) {}
CompressedSecondaryCache::~CompressedSecondaryCache() {
assert(cache_res_mgr_->GetTotalReservedCacheSize() == 0);
size_t capacity, int num_shard_bits, bool strict_capacity_limit,
double high_pri_pool_ratio, double low_pri_pool_ratio,
std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy,
CompressionType compression_type, uint32_t compress_format_version,
bool enable_custom_split_merge)
: cache_options_(capacity, num_shard_bits, strict_capacity_limit,
high_pri_pool_ratio, low_pri_pool_ratio, memory_allocator,
use_adaptive_mutex, metadata_charge_policy,
compression_type, compress_format_version,
enable_custom_split_merge) {
cache_ =
NewLRUCache(capacity, num_shard_bits, strict_capacity_limit,
high_pri_pool_ratio, memory_allocator, use_adaptive_mutex,
metadata_charge_policy, low_pri_pool_ratio);
}
CompressedSecondaryCache::~CompressedSecondaryCache() { cache_.reset(); }
std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
const Slice& key, const Cache::CacheItemHelper* helper,
Cache::CreateContext* create_context, bool /*wait*/, bool advise_erase,
bool& kept_in_sec_cache) {
assert(helper);
const Slice& key, const Cache::CreateCallback& create_cb, bool /*wait*/,
bool advise_erase, bool& is_in_sec_cache) {
std::unique_ptr<SecondaryCacheResultHandle> handle;
kept_in_sec_cache = false;
is_in_sec_cache = false;
Cache::Handle* lru_handle = cache_->Lookup(key);
if (lru_handle == nullptr) {
return nullptr;
@ -58,15 +64,12 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
ptr = reinterpret_cast<CacheAllocationPtr*>(handle_value);
handle_value_charge = cache_->GetCharge(lru_handle);
}
MemoryAllocator* allocator = cache_options_.memory_allocator.get();
Status s;
Cache::ObjectPtr value{nullptr};
void* value{nullptr};
size_t charge{0};
if (cache_options_.compression_type == kNoCompression ||
cache_options_.do_not_compress_roles.Contains(helper->role)) {
s = helper->create_cb(Slice(ptr->get(), handle_value_charge),
create_context, allocator, &value, &charge);
if (cache_options_.compression_type == kNoCompression) {
s = create_cb(ptr->get(), handle_value_charge, &value, &charge);
} else {
UncompressionContext uncompression_context(cache_options_.compression_type);
UncompressionInfo uncompression_info(uncompression_context,
@ -76,14 +79,14 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
size_t uncompressed_size{0};
CacheAllocationPtr uncompressed = UncompressData(
uncompression_info, (char*)ptr->get(), handle_value_charge,
&uncompressed_size, cache_options_.compress_format_version, allocator);
&uncompressed_size, cache_options_.compress_format_version,
cache_options_.memory_allocator.get());
if (!uncompressed) {
cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
return nullptr;
}
s = helper->create_cb(Slice(uncompressed.get(), uncompressed_size),
create_context, allocator, &value, &charge);
s = create_cb(uncompressed.get(), uncompressed_size, &value, &charge);
}
if (!s.ok()) {
@ -95,32 +98,30 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
// Insert a dummy handle.
cache_
->Insert(key, /*obj=*/nullptr,
GetHelper(cache_options_.enable_custom_split_merge),
/*charge=*/0)
->Insert(key, /*value=*/nullptr, /*charge=*/0,
GetDeletionCallback(cache_options_.enable_custom_split_merge))
.PermitUncheckedError();
} else {
kept_in_sec_cache = true;
is_in_sec_cache = true;
cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
}
handle.reset(new CompressedSecondaryCacheResultHandle(value, charge));
return handle;
}
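To summarize the control flow of Lookup above: fetch the stored (possibly compressed) bytes from the wrapped cache, decompress if needed, rebuild the object via the create callback, then either erase the entry and leave a zero-charge dummy behind (advise_erase) or keep it and report it as resident. A stand-alone sketch of that flow with toy types (the names and the string-based "object" are illustrative only):

#include <optional>
#include <string>
#include <unordered_map>

// Toy stand-in: stored bytes live in an inner map; uncompress and create_cb
// are supplied by the caller. A zero-length entry plays the role of the
// "dummy handle" left behind when advise_erase is set.
struct ToySecondaryCache {
  std::unordered_map<std::string, std::string> inner;
  bool compression_enabled = true;

  template <typename CreateFn, typename UncompressFn>
  std::optional<std::string> Lookup(const std::string& key, bool advise_erase,
                                    bool& kept_in_sec_cache,
                                    const CreateFn& create_cb,
                                    const UncompressFn& uncompress) {
    kept_in_sec_cache = false;
    auto it = inner.find(key);
    if (it == inner.end() || it->second.empty()) {
      return std::nullopt;  // miss (or only a dummy is present)
    }
    const std::string bytes =
        compression_enabled ? uncompress(it->second) : it->second;
    std::string obj = create_cb(bytes);
    if (advise_erase) {
      it->second.clear();   // drop the payload, keep a zero-charge dummy
    } else {
      kept_in_sec_cache = true;
    }
    return obj;
  }
};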
Status CompressedSecondaryCache::Insert(const Slice& key,
Cache::ObjectPtr value,
Status CompressedSecondaryCache::Insert(const Slice& key, void* value,
const Cache::CacheItemHelper* helper) {
if (value == nullptr) {
return Status::InvalidArgument();
}
Cache::Handle* lru_handle = cache_->Lookup(key);
auto internal_helper = GetHelper(cache_options_.enable_custom_split_merge);
Cache::DeleterFn del_cb =
GetDeletionCallback(cache_options_.enable_custom_split_merge);
if (lru_handle == nullptr) {
PERF_COUNTER_ADD(compressed_sec_cache_insert_dummy_count, 1);
// Insert a dummy handle if the handle is evicted for the first time.
return cache_->Insert(key, /*obj=*/nullptr, internal_helper,
/*charge=*/0);
return cache_->Insert(key, /*value=*/nullptr, /*charge=*/0, del_cb);
} else {
cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
}
@ -136,8 +137,7 @@ Status CompressedSecondaryCache::Insert(const Slice& key,
Slice val(ptr.get(), size);
std::string compressed_val;
if (cache_options_.compression_type != kNoCompression &&
!cache_options_.do_not_compress_roles.Contains(helper->role)) {
if (cache_options_.compression_type != kNoCompression) {
PERF_COUNTER_ADD(compressed_sec_cache_uncompressed_bytes, size);
CompressionOptions compression_opts;
CompressionContext compression_context(cache_options_.compression_type);
@ -169,10 +169,10 @@ Status CompressedSecondaryCache::Insert(const Slice& key,
size_t charge{0};
CacheValueChunk* value_chunks_head =
SplitValueIntoChunks(val, cache_options_.compression_type, charge);
return cache_->Insert(key, value_chunks_head, internal_helper, charge);
return cache_->Insert(key, value_chunks_head, charge, del_cb);
} else {
CacheAllocationPtr* buf = new CacheAllocationPtr(std::move(ptr));
return cache_->Insert(key, buf, internal_helper, size);
return cache_->Insert(key, buf, size, del_cb);
}
}
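The Insert above implements a two-touch admission policy: the first time a key is offered, only a zero-charge dummy handle is recorded, and the compressed payload is stored only when the key is offered again while its dummy is still resident. A toy sketch of just that policy (types and names are illustrative):

#include <string>
#include <unordered_map>

struct ToyAdmission {
  std::unordered_map<std::string, std::string> inner;  // key -> payload ("" = dummy)

  // Returns true only when the payload is actually admitted.
  bool Insert(const std::string& key, const std::string& payload) {
    auto it = inner.find(key);
    if (it == inner.end()) {
      inner.emplace(key, std::string());  // first sighting: dummy only
      return false;                       // nothing cached yet
    }
    it->second = payload;                 // repeat sighting: admit for real
    return true;
  }
};

This is why the tests below insert each item twice before expecting a real hit: the first Insert only bumps compressed_sec_cache_insert_dummy_count, the second bumps compressed_sec_cache_insert_real_count.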
@ -276,43 +276,50 @@ CacheAllocationPtr CompressedSecondaryCache::MergeChunksIntoValue(
return ptr;
}
const Cache::CacheItemHelper* CompressedSecondaryCache::GetHelper(
bool enable_custom_split_merge) const {
Cache::DeleterFn CompressedSecondaryCache::GetDeletionCallback(
bool enable_custom_split_merge) {
if (enable_custom_split_merge) {
static const Cache::CacheItemHelper kHelper{
CacheEntryRole::kMisc,
[](Cache::ObjectPtr obj, MemoryAllocator* /*alloc*/) {
CacheValueChunk* chunks_head = static_cast<CacheValueChunk*>(obj);
while (chunks_head != nullptr) {
CacheValueChunk* tmp_chunk = chunks_head;
chunks_head = chunks_head->next;
tmp_chunk->Free();
obj = nullptr;
};
}};
return &kHelper;
return [](const Slice& /*key*/, void* obj) {
CacheValueChunk* chunks_head = reinterpret_cast<CacheValueChunk*>(obj);
while (chunks_head != nullptr) {
CacheValueChunk* tmp_chunk = chunks_head;
chunks_head = chunks_head->next;
tmp_chunk->Free();
obj = nullptr;
};
};
} else {
static const Cache::CacheItemHelper kHelper{
CacheEntryRole::kMisc,
[](Cache::ObjectPtr obj, MemoryAllocator* /*alloc*/) {
delete static_cast<CacheAllocationPtr*>(obj);
obj = nullptr;
}};
return &kHelper;
return [](const Slice& /*key*/, void* obj) {
delete reinterpret_cast<CacheAllocationPtr*>(obj);
obj = nullptr;
};
}
}
std::shared_ptr<SecondaryCache>
CompressedSecondaryCacheOptions::MakeSharedSecondaryCache() const {
return std::make_shared<CompressedSecondaryCache>(*this);
}
Status CompressedSecondaryCache::Deflate(size_t decrease) {
return cache_res_mgr_->UpdateCacheReservation(decrease, /*increase=*/true);
std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
size_t capacity, int num_shard_bits, bool strict_capacity_limit,
double high_pri_pool_ratio, double low_pri_pool_ratio,
std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy,
CompressionType compression_type, uint32_t compress_format_version,
bool enable_custom_split_merge) {
return std::make_shared<CompressedSecondaryCache>(
capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio,
low_pri_pool_ratio, memory_allocator, use_adaptive_mutex,
metadata_charge_policy, compression_type, compress_format_version,
enable_custom_split_merge);
}
Status CompressedSecondaryCache::Inflate(size_t increase) {
return cache_res_mgr_->UpdateCacheReservation(increase, /*increase=*/false);
std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
const CompressedSecondaryCacheOptions& opts) {
// The secondary_cache is disabled for this LRUCache instance.
assert(opts.secondary_cache == nullptr);
return NewCompressedSecondaryCache(
opts.capacity, opts.num_shard_bits, opts.strict_capacity_limit,
opts.high_pri_pool_ratio, opts.low_pri_pool_ratio, opts.memory_allocator,
opts.use_adaptive_mutex, opts.metadata_charge_policy,
opts.compression_type, opts.compress_format_version,
opts.enable_custom_split_merge);
}
} // namespace ROCKSDB_NAMESPACE

@ -9,9 +9,8 @@
#include <cstddef>
#include <memory>
#include "cache/cache_reservation_manager.h"
#include "cache/lru_cache.h"
#include "memory/memory_allocator_impl.h"
#include "memory/memory_allocator.h"
#include "rocksdb/secondary_cache.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
@ -22,7 +21,7 @@ namespace ROCKSDB_NAMESPACE {
class CompressedSecondaryCacheResultHandle : public SecondaryCacheResultHandle {
public:
CompressedSecondaryCacheResultHandle(Cache::ObjectPtr value, size_t size)
CompressedSecondaryCacheResultHandle(void* value, size_t size)
: value_(value), size_(size) {}
~CompressedSecondaryCacheResultHandle() override = default;
@ -35,12 +34,12 @@ class CompressedSecondaryCacheResultHandle : public SecondaryCacheResultHandle {
void Wait() override {}
Cache::ObjectPtr Value() override { return value_; }
void* Value() override { return value_; }
size_t Size() override { return size_; }
private:
Cache::ObjectPtr value_;
void* value_;
size_t size_;
};
@ -70,19 +69,26 @@ class CompressedSecondaryCacheResultHandle : public SecondaryCacheResultHandle {
class CompressedSecondaryCache : public SecondaryCache {
public:
explicit CompressedSecondaryCache(
const CompressedSecondaryCacheOptions& opts);
CompressedSecondaryCache(
size_t capacity, int num_shard_bits, bool strict_capacity_limit,
double high_pri_pool_ratio, double low_pri_pool_ratio,
std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
CacheMetadataChargePolicy metadata_charge_policy =
kDefaultCacheMetadataChargePolicy,
CompressionType compression_type = CompressionType::kLZ4Compression,
uint32_t compress_format_version = 2,
bool enable_custom_split_merge = false);
~CompressedSecondaryCache() override;
const char* Name() const override { return "CompressedSecondaryCache"; }
Status Insert(const Slice& key, Cache::ObjectPtr value,
Status Insert(const Slice& key, void* value,
const Cache::CacheItemHelper* helper) override;
std::unique_ptr<SecondaryCacheResultHandle> Lookup(
const Slice& key, const Cache::CacheItemHelper* helper,
Cache::CreateContext* create_context, bool /*wait*/, bool advise_erase,
bool& kept_in_sec_cache) override;
const Slice& key, const Cache::CreateCallback& create_cb, bool /*wait*/,
bool advise_erase, bool& is_in_sec_cache) override;
bool SupportForceErase() const override { return true; }
@ -94,16 +100,10 @@ class CompressedSecondaryCache : public SecondaryCache {
Status GetCapacity(size_t& capacity) override;
Status Deflate(size_t decrease) override;
Status Inflate(size_t increase) override;
std::string GetPrintableOptions() const override;
size_t TEST_GetUsage() { return cache_->GetUsage(); }
private:
friend class CompressedSecondaryCacheTestBase;
friend class CompressedSecondaryCacheTest;
static constexpr std::array<uint16_t, 8> malloc_bin_sizes_{
128, 256, 512, 1024, 2048, 4096, 8192, 16384};
@ -129,12 +129,11 @@ class CompressedSecondaryCache : public SecondaryCache {
CacheAllocationPtr MergeChunksIntoValue(const void* chunks_head,
size_t& charge);
// TODO: clean up to use cleaner interfaces in typed_cache.h
const Cache::CacheItemHelper* GetHelper(bool enable_custom_split_merge) const;
// An implementation of Cache::DeleterFn.
static Cache::DeleterFn GetDeletionCallback(bool enable_custom_split_merge);
std::shared_ptr<Cache> cache_;
CompressedSecondaryCacheOptions cache_options_;
mutable port::Mutex capacity_mutex_;
std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr_;
};
} // namespace ROCKSDB_NAMESPACE

@ -5,45 +5,86 @@
#include "cache/compressed_secondary_cache.h"
#include <array>
#include <iterator>
#include <memory>
#include <tuple>
#include "cache/secondary_cache_adapter.h"
#include "memory/jemalloc_nodump_allocator.h"
#include "rocksdb/convenience.h"
#include "test_util/secondary_cache_test_util.h"
#include "test_util/testharness.h"
#include "test_util/testutil.h"
#include "util/cast_util.h"
namespace ROCKSDB_NAMESPACE {
using secondary_cache_test_util::GetTestingCacheTypes;
using secondary_cache_test_util::WithCacheType;
// 16 bytes for HCC compatibility
const std::string key0 = "____ ____key0";
const std::string key1 = "____ ____key1";
const std::string key2 = "____ ____key2";
const std::string key3 = "____ ____key3";
class CompressedSecondaryCacheTestBase : public testing::Test,
public WithCacheType {
class CompressedSecondaryCacheTest : public testing::Test {
public:
CompressedSecondaryCacheTestBase() {}
~CompressedSecondaryCacheTestBase() override = default;
CompressedSecondaryCacheTest() : fail_create_(false) {}
~CompressedSecondaryCacheTest() override = default;
protected:
class TestItem {
public:
TestItem(const char* buf, size_t size) : buf_(new char[size]), size_(size) {
memcpy(buf_.get(), buf, size);
}
~TestItem() = default;
char* Buf() { return buf_.get(); }
[[nodiscard]] size_t Size() const { return size_; }
private:
std::unique_ptr<char[]> buf_;
size_t size_;
};
static size_t SizeCallback(void* obj) {
return reinterpret_cast<TestItem*>(obj)->Size();
}
static Status SaveToCallback(void* from_obj, size_t from_offset,
size_t length, void* out) {
auto item = reinterpret_cast<TestItem*>(from_obj);
const char* buf = item->Buf();
EXPECT_EQ(length, item->Size());
EXPECT_EQ(from_offset, 0);
memcpy(out, buf, length);
return Status::OK();
}
static void DeletionCallback(const Slice& /*key*/, void* obj) {
delete reinterpret_cast<TestItem*>(obj);
obj = nullptr;
}
static Cache::CacheItemHelper helper_;
static Status SaveToCallbackFail(void* /*obj*/, size_t /*offset*/,
size_t /*size*/, void* /*out*/) {
return Status::NotSupported();
}
static Cache::CacheItemHelper helper_fail_;
Cache::CreateCallback test_item_creator = [&](const void* buf, size_t size,
void** out_obj,
size_t* charge) -> Status {
if (fail_create_) {
return Status::NotSupported();
}
*out_obj = reinterpret_cast<void*>(new TestItem((char*)buf, size));
*charge = size;
return Status::OK();
};
void SetFailCreate(bool fail) { fail_create_ = fail; }
void BasicTestHelper(std::shared_ptr<SecondaryCache> sec_cache,
bool sec_cache_is_compressed) {
get_perf_context()->Reset();
bool kept_in_sec_cache{true};
bool is_in_sec_cache{true};
// Lookup a non-existent key.
std::unique_ptr<SecondaryCacheResultHandle> handle0 =
sec_cache->Lookup(key0, GetHelper(), this, true, /*advise_erase=*/true,
kept_in_sec_cache);
std::unique_ptr<SecondaryCacheResultHandle> handle0 = sec_cache->Lookup(
"k0", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
ASSERT_EQ(handle0, nullptr);
Random rnd(301);
@ -51,25 +92,25 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
std::string str1(rnd.RandomString(1000));
TestItem item1(str1.data(), str1.length());
// A dummy handle is inserted if the item is inserted for the first time.
ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper()));
ASSERT_OK(sec_cache->Insert("k1", &item1,
&CompressedSecondaryCacheTest::helper_));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 1);
ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
std::unique_ptr<SecondaryCacheResultHandle> handle1_1 =
sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/false,
kept_in_sec_cache);
std::unique_ptr<SecondaryCacheResultHandle> handle1_1 = sec_cache->Lookup(
"k1", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
ASSERT_EQ(handle1_1, nullptr);
// Insert and Lookup the item k1 for the second time and advise erasing it.
ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper()));
ASSERT_OK(sec_cache->Insert("k1", &item1,
&CompressedSecondaryCacheTest::helper_));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1);
std::unique_ptr<SecondaryCacheResultHandle> handle1_2 =
sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/true,
kept_in_sec_cache);
std::unique_ptr<SecondaryCacheResultHandle> handle1_2 = sec_cache->Lookup(
"k1", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
ASSERT_NE(handle1_2, nullptr);
ASSERT_FALSE(kept_in_sec_cache);
ASSERT_FALSE(is_in_sec_cache);
if (sec_cache_is_compressed) {
ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
1000);
@ -86,22 +127,22 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
ASSERT_EQ(memcmp(val1->Buf(), item1.Buf(), item1.Size()), 0);
// Lookup the item k1 again.
std::unique_ptr<SecondaryCacheResultHandle> handle1_3 =
sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/true,
kept_in_sec_cache);
std::unique_ptr<SecondaryCacheResultHandle> handle1_3 = sec_cache->Lookup(
"k1", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
ASSERT_EQ(handle1_3, nullptr);
// Insert and Lookup the item k2.
std::string str2(rnd.RandomString(1000));
TestItem item2(str2.data(), str2.length());
ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper()));
ASSERT_OK(sec_cache->Insert("k2", &item2,
&CompressedSecondaryCacheTest::helper_));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 2);
std::unique_ptr<SecondaryCacheResultHandle> handle2_1 =
sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/false,
kept_in_sec_cache);
std::unique_ptr<SecondaryCacheResultHandle> handle2_1 = sec_cache->Lookup(
"k2", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
ASSERT_EQ(handle2_1, nullptr);
ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper()));
ASSERT_OK(sec_cache->Insert("k2", &item2,
&CompressedSecondaryCacheTest::helper_));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2);
if (sec_cache_is_compressed) {
ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
@ -112,9 +153,8 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
}
std::unique_ptr<SecondaryCacheResultHandle> handle2_2 =
sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/false,
kept_in_sec_cache);
std::unique_ptr<SecondaryCacheResultHandle> handle2_2 = sec_cache->Lookup(
"k2", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
ASSERT_NE(handle2_2, nullptr);
std::unique_ptr<TestItem> val2 =
std::unique_ptr<TestItem>(static_cast<TestItem*>(handle2_2->Value()));
@ -183,26 +223,28 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
std::string str1(rnd.RandomString(1000));
TestItem item1(str1.data(), str1.length());
// Insert a dummy handle.
ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper()));
ASSERT_OK(sec_cache->Insert("k1", &item1,
&CompressedSecondaryCacheTest::helper_));
// Insert k1.
ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper()));
ASSERT_OK(sec_cache->Insert("k1", &item1,
&CompressedSecondaryCacheTest::helper_));
// Insert and Lookup the second item.
std::string str2(rnd.RandomString(200));
TestItem item2(str2.data(), str2.length());
// Insert a dummy handle, k1 is not evicted.
ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper()));
bool kept_in_sec_cache{false};
std::unique_ptr<SecondaryCacheResultHandle> handle1 =
sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/false,
kept_in_sec_cache);
ASSERT_OK(sec_cache->Insert("k2", &item2,
&CompressedSecondaryCacheTest::helper_));
bool is_in_sec_cache{false};
std::unique_ptr<SecondaryCacheResultHandle> handle1 = sec_cache->Lookup(
"k1", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
ASSERT_EQ(handle1, nullptr);
// Insert k2 and k1 is evicted.
ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper()));
std::unique_ptr<SecondaryCacheResultHandle> handle2 =
sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/false,
kept_in_sec_cache);
ASSERT_OK(sec_cache->Insert("k2", &item2,
&CompressedSecondaryCacheTest::helper_));
std::unique_ptr<SecondaryCacheResultHandle> handle2 = sec_cache->Lookup(
"k2", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
ASSERT_NE(handle2, nullptr);
std::unique_ptr<TestItem> val2 =
std::unique_ptr<TestItem>(static_cast<TestItem*>(handle2->Value()));
@ -210,26 +252,27 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
ASSERT_EQ(memcmp(val2->Buf(), item2.Buf(), item2.Size()), 0);
// Insert k1 again and a dummy handle is inserted.
ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper()));
ASSERT_OK(sec_cache->Insert("k1", &item1,
&CompressedSecondaryCacheTest::helper_));
std::unique_ptr<SecondaryCacheResultHandle> handle1_1 =
sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/false,
kept_in_sec_cache);
std::unique_ptr<SecondaryCacheResultHandle> handle1_1 = sec_cache->Lookup(
"k1", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
ASSERT_EQ(handle1_1, nullptr);
// Create Fails.
SetFailCreate(true);
std::unique_ptr<SecondaryCacheResultHandle> handle2_1 =
sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/true,
kept_in_sec_cache);
std::unique_ptr<SecondaryCacheResultHandle> handle2_1 = sec_cache->Lookup(
"k2", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
ASSERT_EQ(handle2_1, nullptr);
// Save Fails.
std::string str3 = rnd.RandomString(10);
TestItem item3(str3.data(), str3.length());
// The Status is OK because a dummy handle is inserted.
ASSERT_OK(sec_cache->Insert(key3, &item3, GetHelperFail()));
ASSERT_NOK(sec_cache->Insert(key3, &item3, GetHelperFail()));
ASSERT_OK(sec_cache->Insert("k3", &item3,
&CompressedSecondaryCacheTest::helper_fail_));
ASSERT_NOK(sec_cache->Insert("k3", &item3,
&CompressedSecondaryCacheTest::helper_fail_));
sec_cache.reset();
}
@ -253,22 +296,28 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
secondary_cache_opts.enable_custom_split_merge = enable_custom_split_merge;
std::shared_ptr<SecondaryCache> secondary_cache =
NewCompressedSecondaryCache(secondary_cache_opts);
std::shared_ptr<Cache> cache = NewCache(
LRUCacheOptions lru_cache_opts(
/*_capacity =*/1300, /*_num_shard_bits =*/0,
/*_strict_capacity_limit =*/true, secondary_cache);
/*_strict_capacity_limit =*/false, /*_high_pri_pool_ratio =*/0.5,
/*_memory_allocator =*/nullptr, kDefaultToAdaptiveMutex,
kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio =*/0.0);
lru_cache_opts.secondary_cache = secondary_cache;
std::shared_ptr<Cache> cache = NewLRUCache(lru_cache_opts);
std::shared_ptr<Statistics> stats = CreateDBStatistics();
get_perf_context()->Reset();
Random rnd(301);
std::string str1 = rnd.RandomString(1001);
auto item1_1 = new TestItem(str1.data(), str1.length());
ASSERT_OK(cache->Insert(key1, item1_1, GetHelper(), str1.length()));
ASSERT_OK(cache->Insert(
"k1", item1_1, &CompressedSecondaryCacheTest::helper_, str1.length()));
std::string str2 = rnd.RandomString(1012);
auto item2_1 = new TestItem(str2.data(), str2.length());
// After this Insert, primary cache contains k2 and secondary cache contains
// k1's dummy item.
ASSERT_OK(cache->Insert(key2, item2_1, GetHelper(), str2.length()));
ASSERT_OK(cache->Insert(
"k2", item2_1, &CompressedSecondaryCacheTest::helper_, str2.length()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 1);
ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
@ -277,19 +326,22 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
auto item3_1 = new TestItem(str3.data(), str3.length());
// After this Insert, primary cache contains k3 and secondary cache contains
// k1's dummy item and k2's dummy item.
ASSERT_OK(cache->Insert(key3, item3_1, GetHelper(), str3.length()));
ASSERT_OK(cache->Insert(
"k3", item3_1, &CompressedSecondaryCacheTest::helper_, str3.length()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 2);
// After this Insert, primary cache contains k1 and secondary cache contains
// k1's dummy item, k2's dummy item, and k3's dummy item.
auto item1_2 = new TestItem(str1.data(), str1.length());
ASSERT_OK(cache->Insert(key1, item1_2, GetHelper(), str1.length()));
ASSERT_OK(cache->Insert(
"k1", item1_2, &CompressedSecondaryCacheTest::helper_, str1.length()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 3);
// After this Insert, primary cache contains k2 and secondary cache contains
// k1's item, k2's dummy item, and k3's dummy item.
auto item2_2 = new TestItem(str2.data(), str2.length());
ASSERT_OK(cache->Insert(key2, item2_2, GetHelper(), str2.length()));
ASSERT_OK(cache->Insert(
"k2", item2_2, &CompressedSecondaryCacheTest::helper_, str2.length()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1);
if (sec_cache_is_compressed) {
ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
@ -304,7 +356,8 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
// After this Insert, primary cache contains k3 and secondary cache contains
// k1's item and k2's item.
auto item3_2 = new TestItem(str3.data(), str3.length());
ASSERT_OK(cache->Insert(key3, item3_2, GetHelper(), str3.length()));
ASSERT_OK(cache->Insert(
"k3", item3_2, &CompressedSecondaryCacheTest::helper_, str3.length()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2);
if (sec_cache_is_compressed) {
ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
@ -317,7 +370,8 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
}
Cache::Handle* handle;
handle = cache->Lookup(key3, GetHelper(), this, Cache::Priority::LOW,
handle = cache->Lookup("k3", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true,
stats.get());
ASSERT_NE(handle, nullptr);
auto val3 = static_cast<TestItem*>(cache->Value(handle));
@ -326,13 +380,15 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
cache->Release(handle);
// Lookup a non-existent key.
handle = cache->Lookup(key0, GetHelper(), this, Cache::Priority::LOW,
handle = cache->Lookup("k0", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true,
stats.get());
ASSERT_EQ(handle, nullptr);
// This Lookup should just insert a dummy handle in the primary cache
// and k1 is still in the secondary cache.
handle = cache->Lookup(key1, GetHelper(), this, Cache::Priority::LOW,
handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true,
stats.get());
ASSERT_NE(handle, nullptr);
ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 1);
@ -344,7 +400,8 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
// This Lookup should erase k1 from the secondary cache and insert
// it into primary cache; then k3 is demoted.
// k2 and k3 are in secondary cache.
handle = cache->Lookup(key1, GetHelper(), this, Cache::Priority::LOW,
handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true,
stats.get());
ASSERT_NE(handle, nullptr);
ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 1);
@ -352,7 +409,8 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
cache->Release(handle);
// k2 is still in secondary cache.
handle = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW,
handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true,
stats.get());
ASSERT_NE(handle, nullptr);
ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 2);
@ -360,7 +418,8 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
// Testing SetCapacity().
ASSERT_OK(secondary_cache->SetCapacity(0));
handle = cache->Lookup(key3, GetHelper(), this, Cache::Priority::LOW,
handle = cache->Lookup("k3", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true,
stats.get());
ASSERT_EQ(handle, nullptr);
@ -370,30 +429,35 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
ASSERT_EQ(capacity, 7000);
auto item1_3 = new TestItem(str1.data(), str1.length());
// After this Insert, primary cache contains k1.
ASSERT_OK(cache->Insert(key1, item1_3, GetHelper(), str2.length()));
ASSERT_OK(cache->Insert(
"k1", item1_3, &CompressedSecondaryCacheTest::helper_, str2.length()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 3);
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 4);
auto item2_3 = new TestItem(str2.data(), str2.length());
// After this Insert, primary cache contains k2 and secondary cache contains
// k1's dummy item.
ASSERT_OK(cache->Insert(key2, item2_3, GetHelper(), str1.length()));
ASSERT_OK(cache->Insert(
"k2", item2_3, &CompressedSecondaryCacheTest::helper_, str1.length()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 4);
auto item1_4 = new TestItem(str1.data(), str1.length());
// After this Insert, primary cache contains k1 and secondary cache contains
// k1's dummy item and k2's dummy item.
ASSERT_OK(cache->Insert(key1, item1_4, GetHelper(), str2.length()));
ASSERT_OK(cache->Insert(
"k1", item1_4, &CompressedSecondaryCacheTest::helper_, str2.length()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 5);
auto item2_4 = new TestItem(str2.data(), str2.length());
// After this Insert, primary cache contains k2 and secondary cache contains
// k1's real item and k2's dummy item.
ASSERT_OK(cache->Insert(key2, item2_4, GetHelper(), str2.length()));
ASSERT_OK(cache->Insert(
"k2", item2_4, &CompressedSecondaryCacheTest::helper_, str2.length()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 5);
// This Lookup should just insert a dummy handle in the primary cache
// and k1 is still in the secondary cache.
handle = cache->Lookup(key1, GetHelper(), this, Cache::Priority::LOW,
handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true,
stats.get());
ASSERT_NE(handle, nullptr);
@ -421,31 +485,31 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
std::shared_ptr<SecondaryCache> secondary_cache =
NewCompressedSecondaryCache(secondary_cache_opts);
std::shared_ptr<Cache> cache = NewCache(
LRUCacheOptions opts(
/*_capacity=*/1300, /*_num_shard_bits=*/0,
/*_strict_capacity_limit=*/false, secondary_cache);
/*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5,
/*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex,
kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0);
opts.secondary_cache = secondary_cache;
std::shared_ptr<Cache> cache = NewLRUCache(opts);
Random rnd(301);
std::string str1 = rnd.RandomString(1001);
auto item1 = std::make_unique<TestItem>(str1.data(), str1.length());
ASSERT_OK(cache->Insert(key1, item1.get(), GetHelper(), str1.length()));
ASSERT_NOK(cache->Insert("k1", item1.get(), nullptr, str1.length()));
ASSERT_OK(cache->Insert("k1", item1.get(),
&CompressedSecondaryCacheTest::helper_,
str1.length()));
item1.release(); // Appease clang-analyze "potential memory leak"
Cache::Handle* handle;
handle = cache->Lookup(key2, nullptr, this, Cache::Priority::LOW);
handle = cache->Lookup("k2", nullptr, test_item_creator,
Cache::Priority::LOW, true);
ASSERT_EQ(handle, nullptr);
handle = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW);
handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, false);
ASSERT_EQ(handle, nullptr);
Cache::AsyncLookupHandle ah;
ah.key = key2;
ah.helper = GetHelper();
ah.create_context = this;
ah.priority = Cache::Priority::LOW;
cache->StartAsyncLookup(ah);
cache->Wait(ah);
ASSERT_EQ(ah.Result(), nullptr);
cache.reset();
secondary_cache.reset();
}
@ -468,29 +532,40 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
std::shared_ptr<SecondaryCache> secondary_cache =
NewCompressedSecondaryCache(secondary_cache_opts);
std::shared_ptr<Cache> cache = NewCache(
LRUCacheOptions opts(
/*_capacity=*/1300, /*_num_shard_bits=*/0,
/*_strict_capacity_limit=*/true, secondary_cache);
/*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5,
/*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex,
kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0);
opts.secondary_cache = secondary_cache;
std::shared_ptr<Cache> cache = NewLRUCache(opts);
Random rnd(301);
std::string str1 = rnd.RandomString(1001);
auto item1 = new TestItem(str1.data(), str1.length());
ASSERT_OK(cache->Insert(key1, item1, GetHelperFail(), str1.length()));
ASSERT_OK(cache->Insert("k1", item1,
&CompressedSecondaryCacheTest::helper_fail_,
str1.length()));
std::string str2 = rnd.RandomString(1002);
auto item2 = new TestItem(str2.data(), str2.length());
// k1 should be demoted to the secondary cache.
ASSERT_OK(cache->Insert(key2, item2, GetHelperFail(), str2.length()));
ASSERT_OK(cache->Insert("k2", item2,
&CompressedSecondaryCacheTest::helper_fail_,
str2.length()));
Cache::Handle* handle;
handle = cache->Lookup(key2, GetHelperFail(), this, Cache::Priority::LOW);
handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_fail_,
test_item_creator, Cache::Priority::LOW, true);
ASSERT_NE(handle, nullptr);
cache->Release(handle);
// This lookup should fail, since k1 demotion would have failed.
handle = cache->Lookup(key1, GetHelperFail(), this, Cache::Priority::LOW);
handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_fail_,
test_item_creator, Cache::Priority::LOW, true);
ASSERT_EQ(handle, nullptr);
// Since k1 was not promoted, k2 should still be in cache.
handle = cache->Lookup(key2, GetHelperFail(), this, Cache::Priority::LOW);
handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_fail_,
test_item_creator, Cache::Priority::LOW, true);
ASSERT_NE(handle, nullptr);
cache->Release(handle);
@ -516,30 +591,39 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
std::shared_ptr<SecondaryCache> secondary_cache =
NewCompressedSecondaryCache(secondary_cache_opts);
std::shared_ptr<Cache> cache = NewCache(
LRUCacheOptions opts(
/*_capacity=*/1300, /*_num_shard_bits=*/0,
/*_strict_capacity_limit=*/true, secondary_cache);
/*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5,
/*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex,
kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0);
opts.secondary_cache = secondary_cache;
std::shared_ptr<Cache> cache = NewLRUCache(opts);
Random rnd(301);
std::string str1 = rnd.RandomString(1001);
auto item1 = new TestItem(str1.data(), str1.length());
ASSERT_OK(cache->Insert(key1, item1, GetHelper(), str1.length()));
ASSERT_OK(cache->Insert("k1", item1, &CompressedSecondaryCacheTest::helper_,
str1.length()));
std::string str2 = rnd.RandomString(1002);
auto item2 = new TestItem(str2.data(), str2.length());
// k1 should be demoted to the secondary cache.
ASSERT_OK(cache->Insert(key2, item2, GetHelper(), str2.length()));
ASSERT_OK(cache->Insert("k2", item2, &CompressedSecondaryCacheTest::helper_,
str2.length()));
Cache::Handle* handle;
SetFailCreate(true);
handle = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW);
handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true);
ASSERT_NE(handle, nullptr);
cache->Release(handle);
// This lookup should fail, since k1 creation would have failed
handle = cache->Lookup(key1, GetHelper(), this, Cache::Priority::LOW);
handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true);
ASSERT_EQ(handle, nullptr);
// Since k1 didn't get promoted, k2 should still be in cache
handle = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW);
handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true);
ASSERT_NE(handle, nullptr);
cache->Release(handle);
@ -565,34 +649,43 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
std::shared_ptr<SecondaryCache> secondary_cache =
NewCompressedSecondaryCache(secondary_cache_opts);
std::shared_ptr<Cache> cache = NewCache(
LRUCacheOptions opts(
/*_capacity=*/1300, /*_num_shard_bits=*/0,
/*_strict_capacity_limit=*/false, secondary_cache);
/*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5,
/*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex,
kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0);
opts.secondary_cache = secondary_cache;
std::shared_ptr<Cache> cache = NewLRUCache(opts);
Random rnd(301);
std::string str1 = rnd.RandomString(1001);
auto item1_1 = new TestItem(str1.data(), str1.length());
ASSERT_OK(cache->Insert(key1, item1_1, GetHelper(), str1.length()));
ASSERT_OK(cache->Insert(
"k1", item1_1, &CompressedSecondaryCacheTest::helper_, str1.length()));
std::string str2 = rnd.RandomString(1002);
std::string str2_clone{str2};
auto item2 = new TestItem(str2.data(), str2.length());
// After this Insert, primary cache contains k2 and secondary cache contains
// k1's dummy item.
ASSERT_OK(cache->Insert(key2, item2, GetHelper(), str2.length()));
ASSERT_OK(cache->Insert("k2", item2, &CompressedSecondaryCacheTest::helper_,
str2.length()));
// After this Insert, primary cache contains k1 and secondary cache contains
// k1's dummy item and k2's dummy item.
auto item1_2 = new TestItem(str1.data(), str1.length());
ASSERT_OK(cache->Insert(key1, item1_2, GetHelper(), str1.length()));
ASSERT_OK(cache->Insert(
"k1", item1_2, &CompressedSecondaryCacheTest::helper_, str1.length()));
auto item2_2 = new TestItem(str2.data(), str2.length());
// After this Insert, primary cache contains k2 and secondary cache contains
// k1's item and k2's dummy item.
ASSERT_OK(cache->Insert(key2, item2_2, GetHelper(), str2.length()));
ASSERT_OK(cache->Insert(
"k2", item2_2, &CompressedSecondaryCacheTest::helper_, str2.length()));
Cache::Handle* handle2;
handle2 = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW);
handle2 = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true);
ASSERT_NE(handle2, nullptr);
cache->Release(handle2);
@ -600,12 +693,14 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
// strict_capacity_limit is true, but the lookup should still succeed.
// A k1's dummy item is inserted into primary cache.
Cache::Handle* handle1;
handle1 = cache->Lookup(key1, GetHelper(), this, Cache::Priority::LOW);
handle1 = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true);
ASSERT_NE(handle1, nullptr);
cache->Release(handle1);
// Since k1 didn't get inserted, k2 should still be in cache
handle2 = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW);
handle2 = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true);
ASSERT_NE(handle2, nullptr);
cache->Release(handle2);
@ -628,9 +723,8 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
using CacheValueChunk = CompressedSecondaryCache::CacheValueChunk;
std::unique_ptr<CompressedSecondaryCache> sec_cache =
std::make_unique<CompressedSecondaryCache>(
CompressedSecondaryCacheOptions(1000, 0, true, 0.5, 0.0,
allocator));
std::make_unique<CompressedSecondaryCache>(1000, 0, true, 0.5, 0.0,
allocator);
Random rnd(301);
// 8500 = 8169 + 233 + 98, so there should be 3 chunks after split.
size_t str_size{8500};
@ -647,7 +741,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
current_chunk = current_chunk->next;
ASSERT_EQ(current_chunk->size, 98);
sec_cache->GetHelper(true)->del_cb(chunks_head, /*alloc*/ nullptr);
sec_cache->GetDeletionCallback(true)("dummy", chunks_head);
}
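A sketch of the greedy bin-based split that the 8500 = 8169 + 233 + 98 arithmetic above exercises; the 23-byte per-chunk header here is inferred from that arithmetic (8192 - 8169 = 256 - 233 = 23) rather than taken from the real CacheValueChunk layout:

#include <cstddef>
#include <vector>

// Greedily carve `total` payload bytes into chunks sized against the
// 128..16384 bin table, assuming 23 bytes of per-chunk header.
std::vector<size_t> SplitSketch(size_t total) {
  static const size_t bins[] = {128, 256, 512, 1024, 2048, 4096, 8192, 16384};
  const size_t kHeader = 23;
  std::vector<size_t> payloads;
  while (total > 0) {
    const size_t need = total + kHeader;
    // Largest bin not exceeding what is left (plus header); else the smallest bin.
    size_t bin = bins[0];
    for (size_t b : bins) {
      if (b <= need) bin = b;
    }
    size_t payload = bin - kHeader;
    if (payload > total) payload = total;
    payloads.push_back(payload);
    total -= payload;
  }
  return payloads;  // for total = 8500 this yields {8169, 233, 98}
}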
void MergeChunksIntoValueTest() {
@ -681,8 +775,7 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
std::string str = str1 + str2 + str3;
std::unique_ptr<CompressedSecondaryCache> sec_cache =
std::make_unique<CompressedSecondaryCache>(
CompressedSecondaryCacheOptions(1000, 0, true, 0.5, 0.0));
std::make_unique<CompressedSecondaryCache>(1000, 0, true, 0.5, 0.0);
size_t charge{0};
CacheAllocationPtr value =
sec_cache->MergeChunksIntoValue(chunks_head, charge);
@ -712,9 +805,8 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
using CacheValueChunk = CompressedSecondaryCache::CacheValueChunk;
std::unique_ptr<CompressedSecondaryCache> sec_cache =
std::make_unique<CompressedSecondaryCache>(
CompressedSecondaryCacheOptions(1000, 0, true, 0.5, 0.0,
allocator));
std::make_unique<CompressedSecondaryCache>(1000, 0, true, 0.5, 0.0,
allocator);
Random rnd(301);
// 8500 = 8169 + 233 + 98, so there should be 3 chunks after split.
size_t str_size{8500};
@ -730,29 +822,31 @@ class CompressedSecondaryCacheTestBase : public testing::Test,
std::string value_str{value.get(), charge};
ASSERT_EQ(strcmp(value_str.data(), str.data()), 0);
sec_cache->GetHelper(true)->del_cb(chunks_head, /*alloc*/ nullptr);
sec_cache->GetDeletionCallback(true)("dummy", chunks_head);
}
};
class CompressedSecondaryCacheTest
: public CompressedSecondaryCacheTestBase,
public testing::WithParamInterface<std::string> {
const std::string& Type() override { return GetParam(); }
private:
bool fail_create_;
};
INSTANTIATE_TEST_CASE_P(CompressedSecondaryCacheTest,
CompressedSecondaryCacheTest, GetTestingCacheTypes());
Cache::CacheItemHelper CompressedSecondaryCacheTest::helper_(
CompressedSecondaryCacheTest::SizeCallback,
CompressedSecondaryCacheTest::SaveToCallback,
CompressedSecondaryCacheTest::DeletionCallback);
Cache::CacheItemHelper CompressedSecondaryCacheTest::helper_fail_(
CompressedSecondaryCacheTest::SizeCallback,
CompressedSecondaryCacheTest::SaveToCallbackFail,
CompressedSecondaryCacheTest::DeletionCallback);
class CompressedSecCacheTestWithCompressAndAllocatorParam
: public CompressedSecondaryCacheTestBase,
public ::testing::WithParamInterface<
std::tuple<bool, bool, std::string>> {
: public CompressedSecondaryCacheTest,
public ::testing::WithParamInterface<std::tuple<bool, bool>> {
public:
CompressedSecCacheTestWithCompressAndAllocatorParam() {
sec_cache_is_compressed_ = std::get<0>(GetParam());
use_jemalloc_ = std::get<1>(GetParam());
}
const std::string& Type() override { return std::get<2>(GetParam()); }
bool sec_cache_is_compressed_;
bool use_jemalloc_;
};
@ -763,20 +857,20 @@ TEST_P(CompressedSecCacheTestWithCompressAndAllocatorParam, BasicTes) {
INSTANTIATE_TEST_CASE_P(CompressedSecCacheTests,
CompressedSecCacheTestWithCompressAndAllocatorParam,
::testing::Combine(testing::Bool(), testing::Bool(),
GetTestingCacheTypes()));
::testing::Combine(testing::Bool(), testing::Bool()));
class CompressedSecondaryCacheTestWithCompressionParam
: public CompressedSecondaryCacheTestBase,
public ::testing::WithParamInterface<std::tuple<bool, std::string>> {
: public CompressedSecondaryCacheTest,
public ::testing::WithParamInterface<bool> {
public:
CompressedSecondaryCacheTestWithCompressionParam() {
sec_cache_is_compressed_ = std::get<0>(GetParam());
sec_cache_is_compressed_ = GetParam();
}
const std::string& Type() override { return std::get<1>(GetParam()); }
bool sec_cache_is_compressed_;
};
#ifndef ROCKSDB_LITE
TEST_P(CompressedSecondaryCacheTestWithCompressionParam, BasicTestFromString) {
std::shared_ptr<SecondaryCache> sec_cache{nullptr};
std::string sec_cache_uri;
@ -840,6 +934,7 @@ TEST_P(CompressedSecondaryCacheTestWithCompressionParam,
BasicTestHelper(sec_cache, sec_cache_is_compressed_);
}
#endif // ROCKSDB_LITE
TEST_P(CompressedSecondaryCacheTestWithCompressionParam, FailsTest) {
FailsTest(sec_cache_is_compressed_);
@ -865,92 +960,18 @@ TEST_P(CompressedSecondaryCacheTestWithCompressionParam,
IntegrationFullCapacityTest(sec_cache_is_compressed_);
}
TEST_P(CompressedSecondaryCacheTestWithCompressionParam, EntryRoles) {
CompressedSecondaryCacheOptions opts;
opts.capacity = 2048;
opts.num_shard_bits = 0;
if (sec_cache_is_compressed_) {
if (!LZ4_Supported()) {
ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
return;
}
} else {
opts.compression_type = CompressionType::kNoCompression;
}
// Select a random subset to include, for fast test
Random& r = *Random::GetTLSInstance();
CacheEntryRoleSet do_not_compress;
for (uint32_t i = 0; i < kNumCacheEntryRoles; ++i) {
// A few included on average, but decent chance of zero
if (r.OneIn(5)) {
do_not_compress.Add(static_cast<CacheEntryRole>(i));
}
}
opts.do_not_compress_roles = do_not_compress;
std::shared_ptr<SecondaryCache> sec_cache = NewCompressedSecondaryCache(opts);
// Fixed seed to ensure consistent compressibility (doesn't compress)
std::string junk(Random(301).RandomString(1000));
for (uint32_t i = 0; i < kNumCacheEntryRoles; ++i) {
CacheEntryRole role = static_cast<CacheEntryRole>(i);
// Uniquify `junk`
junk[0] = static_cast<char>(i);
TestItem item{junk.data(), junk.length()};
Slice ith_key = Slice(junk.data(), 16);
get_perf_context()->Reset();
ASSERT_OK(sec_cache->Insert(ith_key, &item, GetHelper(role)));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 1U);
ASSERT_OK(sec_cache->Insert(ith_key, &item, GetHelper(role)));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1U);
bool kept_in_sec_cache{true};
std::unique_ptr<SecondaryCacheResultHandle> handle =
sec_cache->Lookup(ith_key, GetHelper(role), this, true,
/*advise_erase=*/true, kept_in_sec_cache);
ASSERT_NE(handle, nullptr);
// Lookup returns the right data
std::unique_ptr<TestItem> val =
std::unique_ptr<TestItem>(static_cast<TestItem*>(handle->Value()));
ASSERT_NE(val, nullptr);
ASSERT_EQ(memcmp(val->Buf(), item.Buf(), item.Size()), 0);
bool compressed =
sec_cache_is_compressed_ && !do_not_compress.Contains(role);
if (compressed) {
ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
1000);
ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes,
1007);
} else {
ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
}
}
}
INSTANTIATE_TEST_CASE_P(CompressedSecCacheTests,
CompressedSecondaryCacheTestWithCompressionParam,
testing::Combine(testing::Bool(),
GetTestingCacheTypes()));
testing::Bool());
class CompressedSecCacheTestWithCompressAndSplitParam
: public CompressedSecondaryCacheTestBase,
public ::testing::WithParamInterface<
std::tuple<bool, bool, std::string>> {
: public CompressedSecondaryCacheTest,
public ::testing::WithParamInterface<std::tuple<bool, bool>> {
public:
CompressedSecCacheTestWithCompressAndSplitParam() {
sec_cache_is_compressed_ = std::get<0>(GetParam());
enable_custom_split_merge_ = std::get<1>(GetParam());
}
const std::string& Type() override { return std::get<2>(GetParam()); }
bool sec_cache_is_compressed_;
bool enable_custom_split_merge_;
};
@ -961,112 +982,20 @@ TEST_P(CompressedSecCacheTestWithCompressAndSplitParam, BasicIntegrationTest) {
INSTANTIATE_TEST_CASE_P(CompressedSecCacheTests,
CompressedSecCacheTestWithCompressAndSplitParam,
::testing::Combine(testing::Bool(), testing::Bool(),
GetTestingCacheTypes()));
::testing::Combine(testing::Bool(), testing::Bool()));
TEST_P(CompressedSecondaryCacheTest, SplitValueIntoChunksTest) {
TEST_F(CompressedSecondaryCacheTest, SplitValueIntoChunksTest) {
SplitValueIntoChunksTest();
}
TEST_P(CompressedSecondaryCacheTest, MergeChunksIntoValueTest) {
TEST_F(CompressedSecondaryCacheTest, MergeChunksIntoValueTest) {
MergeChunksIntoValueTest();
}
TEST_P(CompressedSecondaryCacheTest, SplictValueAndMergeChunksTest) {
TEST_F(CompressedSecondaryCacheTest, SplictValueAndMergeChunksTest) {
SplictValueAndMergeChunksTest();
}
class CompressedSecCacheTestWithTiered : public ::testing::Test {
public:
CompressedSecCacheTestWithTiered() {
LRUCacheOptions lru_opts;
TieredVolatileCacheOptions opts;
lru_opts.capacity = 70 << 20;
opts.cache_opts = &lru_opts;
opts.cache_type = PrimaryCacheType::kCacheTypeLRU;
opts.comp_cache_opts.capacity = 30 << 20;
cache_ = NewTieredVolatileCache(opts);
cache_res_mgr_ =
std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
cache_);
}
protected:
CacheReservationManager* cache_res_mgr() { return cache_res_mgr_.get(); }
Cache* GetCache() {
return static_cast_with_check<CacheWithSecondaryAdapter, Cache>(
cache_.get())
->TEST_GetCache();
}
SecondaryCache* GetSecondaryCache() {
return static_cast_with_check<CacheWithSecondaryAdapter, Cache>(
cache_.get())
->TEST_GetSecondaryCache();
}
size_t GetPercent(size_t val, unsigned int percent) {
return static_cast<size_t>(val * percent / 100);
}
private:
std::shared_ptr<Cache> cache_;
std::shared_ptr<CacheReservationManager> cache_res_mgr_;
};
bool CacheUsageWithinBounds(size_t val1, size_t val2, size_t error) {
return ((val1 < (val2 + error)) && (val1 > (val2 - error)));
}
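A concrete reading of this helper (the arithmetic is mine, not from the diff): the bounds are exclusive and scale with the expected value.
// e.g. CacheUsageWithinBounds(usage, 30 << 20, GetPercent(30 << 20, 1))
//      accepts any usage strictly within 31457280 +/- 314572 bytes,
//      i.e. within about 1% of 30 MiB.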
TEST_F(CompressedSecCacheTestWithTiered, CacheReservationManager) {
CompressedSecondaryCache* sec_cache =
reinterpret_cast<CompressedSecondaryCache*>(GetSecondaryCache());
// Use EXPECT_PRED3 instead of EXPECT_NEAR to avoid too many size_t to
// double explicit casts
EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (30 << 20),
GetPercent(30 << 20, 1));
EXPECT_EQ(sec_cache->TEST_GetUsage(), 0);
ASSERT_OK(cache_res_mgr()->UpdateCacheReservation(10 << 20));
EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (37 << 20),
GetPercent(37 << 20, 1));
EXPECT_PRED3(CacheUsageWithinBounds, sec_cache->TEST_GetUsage(), (3 << 20),
GetPercent(3 << 20, 1));
ASSERT_OK(cache_res_mgr()->UpdateCacheReservation(0));
EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (30 << 20),
GetPercent(30 << 20, 1));
EXPECT_EQ(sec_cache->TEST_GetUsage(), 0);
}
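The expected values follow from the 70/30 budget split managed by CacheWithSecondaryAdapter (see its explanatory comment later in this diff). A rough accounting, assuming the reservation is apportioned at the secondary/total capacity ratio:
sec/total ratio            = 30 MiB / (70 MiB + 30 MiB) = 0.3
initial state              : 30 MiB standing reservation in primary, secondary empty
UpdateCacheReservation(10 MiB):
  primary                  : +10 MiB placeholder, -3 MiB of the standing reservation
                             -> usage ~ 30 + 10 - 3 = 37 MiB
  secondary                : Deflate(0.3 * 10 MiB = 3 MiB), reported as ~3 MiB usage
UpdateCacheReservation(0)  : both adjustments are reversed -> ~30 MiB primary, 0 secondary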
TEST_F(CompressedSecCacheTestWithTiered,
CacheReservationManagerMultipleUpdate) {
CompressedSecondaryCache* sec_cache =
reinterpret_cast<CompressedSecondaryCache*>(GetSecondaryCache());
EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (30 << 20),
GetPercent(30 << 20, 1));
EXPECT_EQ(sec_cache->TEST_GetUsage(), 0);
int i;
for (i = 0; i < 10; ++i) {
ASSERT_OK(cache_res_mgr()->UpdateCacheReservation((1 + i) << 20));
}
EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (37 << 20),
GetPercent(37 << 20, 1));
EXPECT_PRED3(CacheUsageWithinBounds, sec_cache->TEST_GetUsage(), (3 << 20),
GetPercent(3 << 20, 1));
for (i = 10; i > 0; --i) {
ASSERT_OK(cache_res_mgr()->UpdateCacheReservation(((i - 1) << 20)));
}
EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (30 << 20),
GetPercent(30 << 20, 1));
EXPECT_EQ(sec_cache->TEST_GetUsage(), 0);
}
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {

510
cache/lru_cache.cc vendored

@ -14,29 +14,28 @@
#include <cstdio>
#include <cstdlib>
#include "cache/secondary_cache_adapter.h"
#include "monitoring/perf_context_imp.h"
#include "monitoring/statistics_impl.h"
#include "monitoring/statistics.h"
#include "port/lang.h"
#include "util/distributed_mutex.h"
namespace ROCKSDB_NAMESPACE {
namespace lru_cache {
LRUHandleTable::LRUHandleTable(int max_upper_hash_bits,
MemoryAllocator* allocator)
// A distinct pointer value for marking "dummy" cache entries
void* const kDummyValueMarker = const_cast<char*>("kDummyValueMarker");
LRUHandleTable::LRUHandleTable(int max_upper_hash_bits)
: length_bits_(/* historical starting size*/ 4),
list_(new LRUHandle* [size_t{1} << length_bits_] {}),
elems_(0),
max_length_bits_(max_upper_hash_bits),
allocator_(allocator) {}
max_length_bits_(max_upper_hash_bits) {}
LRUHandleTable::~LRUHandleTable() {
auto alloc = allocator_;
ApplyToEntriesRange(
[alloc](LRUHandle* h) {
[](LRUHandle* h) {
if (!h->HasRefs()) {
h->Free(alloc);
h->Free();
}
},
0, size_t{1} << length_bits_);
@ -96,7 +95,7 @@ void LRUHandleTable::Resize() {
std::unique_ptr<LRUHandle* []> new_list {
new LRUHandle* [size_t{1} << new_length_bits] {}
};
[[maybe_unused]] uint32_t count = 0;
uint32_t count = 0;
for (uint32_t i = 0; i < old_length; i++) {
LRUHandle* h = list_[i];
while (h != nullptr) {
@ -119,8 +118,7 @@ LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit,
double low_pri_pool_ratio, bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy,
int max_upper_hash_bits,
MemoryAllocator* allocator,
const Cache::EvictionCallback* eviction_callback)
SecondaryCache* secondary_cache)
: CacheShardBase(metadata_charge_policy),
capacity_(0),
high_pri_pool_usage_(0),
@ -130,11 +128,11 @@ LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit,
high_pri_pool_capacity_(0),
low_pri_pool_ratio_(low_pri_pool_ratio),
low_pri_pool_capacity_(0),
table_(max_upper_hash_bits, allocator),
table_(max_upper_hash_bits),
usage_(0),
lru_usage_(0),
mutex_(use_adaptive_mutex),
eviction_callback_(*eviction_callback) {
secondary_cache_(secondary_cache) {
// Make empty circular linked list.
lru_.next = &lru_;
lru_.prev = &lru_;
@ -161,14 +159,13 @@ void LRUCacheShard::EraseUnRefEntries() {
}
for (auto entry : last_reference_list) {
entry->Free(table_.GetAllocator());
entry->Free();
}
}
void LRUCacheShard::ApplyToSomeEntries(
const std::function<void(const Slice& key, Cache::ObjectPtr value,
size_t charge,
const Cache::CacheItemHelper* helper)>& callback,
const std::function<void(const Slice& key, void* value, size_t charge,
DeleterFn deleter)>& callback,
size_t average_entries_per_lock, size_t* state) {
// The state is essentially going to be the starting hash, which works
// nicely even if we resize between calls because we use upper-most
@ -195,8 +192,11 @@ void LRUCacheShard::ApplyToSomeEntries(
table_.ApplyToEntriesRange(
[callback,
metadata_charge_policy = metadata_charge_policy_](LRUHandle* h) {
DeleterFn deleter = h->IsSecondaryCacheCompatible()
? h->info_.helper->del_cb
: h->info_.deleter;
callback(h->key(), h->value, h->GetCharge(metadata_charge_policy),
h->helper);
deleter);
},
index_begin, index_end);
}
@ -334,19 +334,16 @@ void LRUCacheShard::EvictFromLRU(size_t charge,
}
}
void LRUCacheShard::NotifyEvicted(
const autovector<LRUHandle*>& evicted_handles) {
MemoryAllocator* alloc = table_.GetAllocator();
for (LRUHandle* entry : evicted_handles) {
if (eviction_callback_ &&
eviction_callback_(entry->key(),
reinterpret_cast<Cache::Handle*>(entry))) {
// Callback took ownership of obj; just free handle
free(entry);
} else {
// Free the entries here outside of mutex for performance reasons.
entry->Free(alloc);
void LRUCacheShard::TryInsertIntoSecondaryCache(
autovector<LRUHandle*> evicted_handles) {
for (auto entry : evicted_handles) {
if (secondary_cache_ && entry->IsSecondaryCacheCompatible() &&
!entry->IsInSecondaryCache()) {
secondary_cache_->Insert(entry->key(), entry->value, entry->info_.helper)
.PermitUncheckedError();
}
// Free the entries here outside of mutex for performance reasons.
entry->Free();
}
}
@ -360,7 +357,7 @@ void LRUCacheShard::SetCapacity(size_t capacity) {
EvictFromLRU(0, &last_reference_list);
}
NotifyEvicted(last_reference_list);
TryInsertIntoSecondaryCache(last_reference_list);
}
void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
@ -368,7 +365,8 @@ void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
strict_capacity_limit_ = strict_capacity_limit;
}
Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle) {
Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle,
bool free_handle_on_fail) {
Status s = Status::OK();
autovector<LRUHandle*> last_reference_list;
@ -387,9 +385,10 @@ Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle) {
// into cache and get evicted immediately.
last_reference_list.push_back(e);
} else {
free(e);
e = nullptr;
*handle = nullptr;
if (free_handle_on_fail) {
free(e);
*handle = nullptr;
}
s = Status::MemoryLimit("Insert failed due to LRU cache being full.");
}
} else {
@ -421,27 +420,192 @@ Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle) {
}
}
NotifyEvicted(last_reference_list);
TryInsertIntoSecondaryCache(last_reference_list);
return s;
}
void LRUCacheShard::Promote(LRUHandle* e) {
SecondaryCacheResultHandle* secondary_handle = e->sec_handle;
assert(secondary_handle->IsReady());
// e is not thread-shared here; OK to modify "immutable" fields as well as
// "mutable" (normally requiring mutex)
e->SetIsPending(false);
e->value = secondary_handle->Value();
assert(e->total_charge == 0);
size_t value_size = secondary_handle->Size();
delete secondary_handle;
if (e->value) {
e->CalcTotalCharge(value_size, metadata_charge_policy_);
Status s;
if (e->IsStandalone()) {
assert(secondary_cache_ && secondary_cache_->SupportForceErase());
// Insert a dummy handle and return a standalone handle to caller.
// Charge the standalone handle.
autovector<LRUHandle*> last_reference_list;
bool free_standalone_handle{false};
{
DMutexLock l(mutex_);
// Free the space following strict LRU policy until enough space
// is freed or the lru list is empty.
EvictFromLRU(e->total_charge, &last_reference_list);
if ((usage_ + e->total_charge) > capacity_ && strict_capacity_limit_) {
free_standalone_handle = true;
} else {
usage_ += e->total_charge;
}
}
TryInsertIntoSecondaryCache(last_reference_list);
if (free_standalone_handle) {
e->Unref();
e->Free();
e = nullptr;
} else {
PERF_COUNTER_ADD(block_cache_standalone_handle_count, 1);
}
// Insert a dummy handle into the primary cache. This dummy handle is
// not IsSecondaryCacheCompatible().
// FIXME? This should not overwrite an existing non-dummy entry in the
// rare case that one exists
Cache::Priority priority =
e->IsHighPri() ? Cache::Priority::HIGH : Cache::Priority::LOW;
s = Insert(e->key(), e->hash, kDummyValueMarker, /*charge=*/0,
/*deleter=*/nullptr, /*helper=*/nullptr, /*handle=*/nullptr,
priority);
} else {
e->SetInCache(true);
LRUHandle* handle = e;
// This InsertItem() could fail if the cache is over capacity and
// strict_capacity_limit_ is true. In such a case, we don't want
// InsertItem() to free the handle, since the item is already in memory
// and the caller will most likely just read it from disk if we erase it
// here.
s = InsertItem(e, &handle, /*free_handle_on_fail=*/false);
if (s.ok()) {
PERF_COUNTER_ADD(block_cache_real_handle_count, 1);
}
}
if (!s.ok()) {
// Item is in memory, but not accounted against the cache capacity.
// When the handle is released, the item should get deleted.
assert(!e->InCache());
}
} else {
// Secondary cache lookup failed. The caller will take care of detecting
// this and eventually releasing e.
assert(!e->value);
assert(!e->InCache());
}
}
LRUHandle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash,
const Cache::CacheItemHelper* /*helper*/,
Cache::CreateContext* /*create_context*/,
Cache::Priority /*priority*/,
Statistics* /*stats*/) {
DMutexLock l(mutex_);
LRUHandle* e = table_.Lookup(key, hash);
if (e != nullptr) {
assert(e->InCache());
if (!e->HasRefs()) {
// The entry is in LRU since it's in hash and has no external
// references.
LRU_Remove(e);
const Cache::CacheItemHelper* helper,
const Cache::CreateCallback& create_cb,
Cache::Priority priority, bool wait,
Statistics* stats) {
LRUHandle* e = nullptr;
bool found_dummy_entry{false};
{
DMutexLock l(mutex_);
e = table_.Lookup(key, hash);
if (e != nullptr) {
assert(e->InCache());
if (e->value == kDummyValueMarker) {
// For a dummy handle, if it was retrieved from secondary cache,
// it may still exist in secondary cache.
// If the handle exists in secondary cache, the value should be
// erased from sec cache and be inserted into primary cache.
found_dummy_entry = true;
// Let the dummy entry be overwritten
e = nullptr;
} else {
if (!e->HasRefs()) {
// The entry is in LRU since it's in hash and has no external
// references.
LRU_Remove(e);
}
e->Ref();
e->SetHit();
}
}
}
// If handle table lookup failed or the handle is a dummy one, allocate
// a handle outside the mutex if we're going to look up in the secondary cache.
//
// When a block is first looked up from CompressedSecondaryCache, we just
// insert a dummy block into the primary cache (charging the actual size of
// the block) and don't erase the block from CompressedSecondaryCache. A
// standalone handle is returned to the caller. Only if the block is hit
// again, we erase it from CompressedSecondaryCache and add it into the
// primary cache.
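// To summarize the flow described above: a first secondary-cache hit returns
// a standalone handle and inserts only a zero-charge dummy entry into the
// primary cache; a later lookup that finds that dummy entry passes
// found_dummy_entry=true below, so the block is erased from the secondary
// cache and promoted into the primary cache as a real entry.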
if (!e && secondary_cache_ && helper && helper->saveto_cb) {
// For objects from the secondary cache, we expect the caller to provide
// a way to create/delete the primary cache object. The only case where
// a deleter would not be required is for dummy entries inserted for
// accounting purposes, which we won't demote to the secondary cache
// anyway.
assert(create_cb && helper->del_cb);
bool is_in_sec_cache{false};
std::unique_ptr<SecondaryCacheResultHandle> secondary_handle =
secondary_cache_->Lookup(key, create_cb, wait, found_dummy_entry,
is_in_sec_cache);
if (secondary_handle != nullptr) {
e = static_cast<LRUHandle*>(malloc(sizeof(LRUHandle) - 1 + key.size()));
e->m_flags = 0;
e->im_flags = 0;
e->SetSecondaryCacheCompatible(true);
e->info_.helper = helper;
e->key_length = key.size();
e->hash = hash;
e->refs = 0;
e->next = e->prev = nullptr;
e->SetPriority(priority);
memcpy(e->key_data, key.data(), key.size());
e->value = nullptr;
e->sec_handle = secondary_handle.release();
e->total_charge = 0;
e->Ref();
e->SetIsInSecondaryCache(is_in_sec_cache);
e->SetIsStandalone(secondary_cache_->SupportForceErase() &&
!found_dummy_entry);
if (wait) {
Promote(e);
if (e) {
if (!e->value) {
// The secondary cache returned a handle, but the lookup failed.
e->Unref();
e->Free();
e = nullptr;
} else {
PERF_COUNTER_ADD(secondary_cache_hit_count, 1);
RecordTick(stats, SECONDARY_CACHE_HITS);
}
}
} else {
// If wait is false, we always return a handle and let the caller
// release the handle after checking for success or failure.
e->SetIsPending(true);
// This may be slightly inaccurate, if the lookup eventually fails.
// But the probability is very low.
PERF_COUNTER_ADD(secondary_cache_hit_count, 1);
RecordTick(stats, SECONDARY_CACHE_HITS);
}
} else {
// Caller will most likely overwrite the dummy entry with an Insert
// after this Lookup fails
assert(e == nullptr);
}
e->Ref();
e->SetHit();
}
return e;
}
@ -450,6 +614,8 @@ bool LRUCacheShard::Ref(LRUHandle* e) {
DMutexLock l(mutex_);
// To create another reference - entry must be already externally referenced.
assert(e->HasRefs());
// Pending handles are not for sharing
assert(!e->IsPending());
e->Ref();
return true;
}
@ -473,13 +639,14 @@ bool LRUCacheShard::Release(LRUHandle* e, bool /*useful*/,
if (e == nullptr) {
return false;
}
bool must_free;
bool was_in_cache;
bool last_reference = false;
// Must Wait or WaitAll first on pending handles. Otherwise, would leak
// a secondary cache handle.
assert(!e->IsPending());
{
DMutexLock l(mutex_);
must_free = e->Unref();
was_in_cache = e->InCache();
if (must_free && was_in_cache) {
last_reference = e->Unref();
if (last_reference && e->InCache()) {
// The item is still in cache, and nobody else holds a reference to it.
if (usage_ > capacity_ || erase_if_last_ref) {
// The LRU list must be empty since the cache is full.
@ -490,39 +657,28 @@ bool LRUCacheShard::Release(LRUHandle* e, bool /*useful*/,
} else {
// Put the item back on the LRU list, and don't free it.
LRU_Insert(e);
must_free = false;
last_reference = false;
}
}
// If about to be freed, then decrement the cache usage.
if (must_free) {
// If it was the last reference, then decrement the cache usage.
if (last_reference) {
assert(usage_ >= e->total_charge);
usage_ -= e->total_charge;
}
}
// Free the entry here outside of mutex for performance reasons.
if (must_free) {
// Only call eviction callback if we're sure no one requested erasure
// FIXME: disabled because of test churn
if (false && was_in_cache && !erase_if_last_ref && eviction_callback_ &&
eviction_callback_(e->key(), reinterpret_cast<Cache::Handle*>(e))) {
// Callback took ownership of obj; just free handle
free(e);
} else {
e->Free(table_.GetAllocator());
}
if (last_reference) {
e->Free();
}
return must_free;
return last_reference;
}
LRUHandle* LRUCacheShard::CreateHandle(const Slice& key, uint32_t hash,
Cache::ObjectPtr value,
const Cache::CacheItemHelper* helper,
size_t charge) {
assert(helper);
// value == nullptr is reserved for indicating failure in SecondaryCache
assert(!(helper->IsSecondaryCacheCompatible() && value == nullptr));
Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
size_t charge,
void (*deleter)(const Slice& key, void* value),
const Cache::CacheItemHelper* helper,
LRUHandle** handle, Cache::Priority priority) {
// Allocate the memory here outside of the mutex.
// If the cache is full, we'll have to release it.
// It shouldn't happen very often though.
@ -532,58 +688,27 @@ LRUHandle* LRUCacheShard::CreateHandle(const Slice& key, uint32_t hash,
e->value = value;
e->m_flags = 0;
e->im_flags = 0;
e->helper = helper;
if (helper) {
// Use only one of the two parameters
assert(deleter == nullptr);
// value == nullptr is reserved for indicating failure for when secondary
// cache compatible
assert(value != nullptr);
e->SetSecondaryCacheCompatible(true);
e->info_.helper = helper;
} else {
e->info_.deleter = deleter;
}
e->key_length = key.size();
e->hash = hash;
e->refs = 0;
e->next = e->prev = nullptr;
e->SetInCache(true);
e->SetPriority(priority);
memcpy(e->key_data, key.data(), key.size());
e->CalcTotalCharge(charge, metadata_charge_policy_);
return e;
}
Status LRUCacheShard::Insert(const Slice& key, uint32_t hash,
Cache::ObjectPtr value,
const Cache::CacheItemHelper* helper,
size_t charge, LRUHandle** handle,
Cache::Priority priority) {
LRUHandle* e = CreateHandle(key, hash, value, helper, charge);
e->SetPriority(priority);
e->SetInCache(true);
return InsertItem(e, handle);
}
LRUHandle* LRUCacheShard::CreateStandalone(const Slice& key, uint32_t hash,
Cache::ObjectPtr value,
const Cache::CacheItemHelper* helper,
size_t charge,
bool allow_uncharged) {
LRUHandle* e = CreateHandle(key, hash, value, helper, charge);
e->SetIsStandalone(true);
e->Ref();
autovector<LRUHandle*> last_reference_list;
{
DMutexLock l(mutex_);
EvictFromLRU(e->total_charge, &last_reference_list);
if (strict_capacity_limit_ && (usage_ + e->total_charge) > capacity_) {
if (allow_uncharged) {
e->total_charge = 0;
} else {
free(e);
e = nullptr;
}
} else {
usage_ += e->total_charge;
}
}
NotifyEvicted(last_reference_list);
return e;
return InsertItem(e, handle, /* free_handle_on_fail */ true);
}
void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {
@ -608,8 +733,18 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {
// Free the entry here outside of mutex for performance reasons.
// last_reference will only be true if e != nullptr.
if (last_reference) {
e->Free(table_.GetAllocator());
e->Free();
}
}
bool LRUCacheShard::IsReady(LRUHandle* e) {
bool ready = true;
if (e->IsPending()) {
assert(secondary_cache_);
assert(e->sec_handle);
ready = e->sec_handle->IsReady();
}
return ready;
}
size_t LRUCacheShard::GetUsage() const {
@ -646,20 +781,30 @@ void LRUCacheShard::AppendPrintableOptions(std::string& str) const {
str.append(buffer);
}
LRUCache::LRUCache(const LRUCacheOptions& opts) : ShardedCache(opts) {
LRUCache::LRUCache(size_t capacity, int num_shard_bits,
bool strict_capacity_limit, double high_pri_pool_ratio,
double low_pri_pool_ratio,
std::shared_ptr<MemoryAllocator> allocator,
bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy,
std::shared_ptr<SecondaryCache> _secondary_cache)
: ShardedCache(capacity, num_shard_bits, strict_capacity_limit,
std::move(allocator)),
secondary_cache_(std::move(_secondary_cache)) {
size_t per_shard = GetPerShardCapacity();
MemoryAllocator* alloc = memory_allocator();
InitShards([&](LRUCacheShard* cs) {
new (cs) LRUCacheShard(per_shard, opts.strict_capacity_limit,
opts.high_pri_pool_ratio, opts.low_pri_pool_ratio,
opts.use_adaptive_mutex, opts.metadata_charge_policy,
/* max_upper_hash_bits */ 32 - opts.num_shard_bits,
alloc, &eviction_callback_);
SecondaryCache* secondary_cache = secondary_cache_.get();
InitShards([=](LRUCacheShard* cs) {
new (cs) LRUCacheShard(
per_shard, strict_capacity_limit, high_pri_pool_ratio,
low_pri_pool_ratio, use_adaptive_mutex, metadata_charge_policy,
/* max_upper_hash_bits */ 32 - num_shard_bits, secondary_cache);
});
}
Cache::ObjectPtr LRUCache::Value(Handle* handle) {
void* LRUCache::Value(Handle* handle) {
auto h = reinterpret_cast<const LRUHandle*>(handle);
assert(!h->IsPending() || h->value == nullptr);
assert(h->value != kDummyValueMarker);
return h->value;
}
@ -668,10 +813,13 @@ size_t LRUCache::GetCharge(Handle* handle) const {
GetShard(0).metadata_charge_policy_);
}
const Cache::CacheItemHelper* LRUCache::GetCacheItemHelper(
Handle* handle) const {
Cache::DeleterFn LRUCache::GetDeleter(Handle* handle) const {
auto h = reinterpret_cast<const LRUHandle*>(handle);
return h->helper;
if (h->IsSecondaryCacheCompatible()) {
return h->info_.helper->del_cb;
} else {
return h->info_.deleter;
}
}
size_t LRUCache::TEST_GetLRUSize() {
@ -682,9 +830,51 @@ double LRUCache::GetHighPriPoolRatio() {
return GetShard(0).GetHighPriPoolRatio();
}
void LRUCache::WaitAll(std::vector<Handle*>& handles) {
if (secondary_cache_) {
std::vector<SecondaryCacheResultHandle*> sec_handles;
sec_handles.reserve(handles.size());
for (Handle* handle : handles) {
if (!handle) {
continue;
}
LRUHandle* lru_handle = reinterpret_cast<LRUHandle*>(handle);
if (!lru_handle->IsPending()) {
continue;
}
sec_handles.emplace_back(lru_handle->sec_handle);
}
secondary_cache_->WaitAll(sec_handles);
for (Handle* handle : handles) {
if (!handle) {
continue;
}
LRUHandle* lru_handle = reinterpret_cast<LRUHandle*>(handle);
if (!lru_handle->IsPending()) {
continue;
}
GetShard(lru_handle->hash).Promote(lru_handle);
}
}
}
void LRUCache::AppendPrintableOptions(std::string& str) const {
ShardedCache::AppendPrintableOptions(str); // options from shard
if (secondary_cache_) {
str.append(" secondary_cache:\n");
str.append(secondary_cache_->GetPrintableOptions());
}
}
} // namespace lru_cache
std::shared_ptr<Cache> LRUCacheOptions::MakeSharedCache() const {
std::shared_ptr<Cache> NewLRUCache(
size_t capacity, int num_shard_bits, bool strict_capacity_limit,
double high_pri_pool_ratio,
std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy,
const std::shared_ptr<SecondaryCache>& secondary_cache,
double low_pri_pool_ratio) {
if (num_shard_bits >= 20) {
return nullptr; // The cache cannot be sharded into too many fine pieces.
}
@ -700,24 +890,32 @@ std::shared_ptr<Cache> LRUCacheOptions::MakeSharedCache() const {
// Invalid high_pri_pool_ratio and low_pri_pool_ratio combination
return nullptr;
}
// For sanitized options
LRUCacheOptions opts = *this;
if (opts.num_shard_bits < 0) {
opts.num_shard_bits = GetDefaultCacheShardBits(capacity);
}
std::shared_ptr<Cache> cache = std::make_shared<LRUCache>(opts);
if (secondary_cache) {
cache = std::make_shared<CacheWithSecondaryAdapter>(cache, secondary_cache);
}
return cache;
}
std::shared_ptr<RowCache> LRUCacheOptions::MakeSharedRowCache() const {
if (secondary_cache) {
// Not allowed for a RowCache
return nullptr;
}
// Works while RowCache is an alias for Cache
return MakeSharedCache();
if (num_shard_bits < 0) {
num_shard_bits = GetDefaultCacheShardBits(capacity);
}
return std::make_shared<LRUCache>(
capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio,
low_pri_pool_ratio, std::move(memory_allocator), use_adaptive_mutex,
metadata_charge_policy, secondary_cache);
}
std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts) {
return NewLRUCache(cache_opts.capacity, cache_opts.num_shard_bits,
cache_opts.strict_capacity_limit,
cache_opts.high_pri_pool_ratio,
cache_opts.memory_allocator, cache_opts.use_adaptive_mutex,
cache_opts.metadata_charge_policy,
cache_opts.secondary_cache, cache_opts.low_pri_pool_ratio);
}
std::shared_ptr<Cache> NewLRUCache(
size_t capacity, int num_shard_bits, bool strict_capacity_limit,
double high_pri_pool_ratio,
std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy,
double low_pri_pool_ratio) {
return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit,
high_pri_pool_ratio, memory_allocator, use_adaptive_mutex,
metadata_charge_policy, nullptr, low_pri_pool_ratio);
}
} // namespace ROCKSDB_NAMESPACE

179
cache/lru_cache.h vendored

@ -13,9 +13,9 @@
#include "cache/sharded_cache.h"
#include "port/lang.h"
#include "port/likely.h"
#include "port/malloc.h"
#include "port/port.h"
#include "rocksdb/secondary_cache.h"
#include "util/autovector.h"
#include "util/distributed_mutex.h"
@ -48,9 +48,19 @@ namespace lru_cache {
// While refs > 0, public properties like value and deleter must not change.
struct LRUHandle {
Cache::ObjectPtr value;
const Cache::CacheItemHelper* helper;
LRUHandle* next_hash;
void* value;
union Info {
Info() {}
~Info() {}
Cache::DeleterFn deleter;
const Cache::CacheItemHelper* helper;
} info_;
// An entry is not added to the LRUHandleTable until the secondary cache
// lookup is complete, so it's safe to have this union.
union {
LRUHandle* next_hash;
SecondaryCacheResultHandle* sec_handle;
};
LRUHandle* next;
LRUHandle* prev;
size_t total_charge; // TODO(opt): Only allow uint32_t?
@ -83,8 +93,14 @@ struct LRUHandle {
IM_IS_HIGH_PRI = (1 << 0),
// Whether this entry is low priority entry.
IM_IS_LOW_PRI = (1 << 1),
// Can this be inserted into the secondary cache.
IM_IS_SECONDARY_CACHE_COMPATIBLE = (1 << 2),
// Is the handle still being read from a lower tier.
IM_IS_PENDING = (1 << 3),
// Whether this handle is still in a lower tier
IM_IS_IN_SECONDARY_CACHE = (1 << 4),
// Marks result handles that should not be inserted into cache
IM_IS_STANDALONE = (1 << 2),
IM_IS_STANDALONE = (1 << 5),
};
// Beginning of the key (MUST BE THE LAST FIELD IN THIS STRUCT!)
@ -114,6 +130,13 @@ struct LRUHandle {
bool IsLowPri() const { return im_flags & IM_IS_LOW_PRI; }
bool InLowPriPool() const { return m_flags & M_IN_LOW_PRI_POOL; }
bool HasHit() const { return m_flags & M_HAS_HIT; }
bool IsSecondaryCacheCompatible() const {
return im_flags & IM_IS_SECONDARY_CACHE_COMPATIBLE;
}
bool IsPending() const { return im_flags & IM_IS_PENDING; }
bool IsInSecondaryCache() const {
return im_flags & IM_IS_IN_SECONDARY_CACHE;
}
bool IsStandalone() const { return im_flags & IM_IS_STANDALONE; }
void SetInCache(bool in_cache) {
@ -155,6 +178,30 @@ struct LRUHandle {
void SetHit() { m_flags |= M_HAS_HIT; }
void SetSecondaryCacheCompatible(bool compat) {
if (compat) {
im_flags |= IM_IS_SECONDARY_CACHE_COMPATIBLE;
} else {
im_flags &= ~IM_IS_SECONDARY_CACHE_COMPATIBLE;
}
}
void SetIsPending(bool pending) {
if (pending) {
im_flags |= IM_IS_PENDING;
} else {
im_flags &= ~IM_IS_PENDING;
}
}
void SetIsInSecondaryCache(bool is_in_secondary_cache) {
if (is_in_secondary_cache) {
im_flags |= IM_IS_IN_SECONDARY_CACHE;
} else {
im_flags &= ~IM_IS_IN_SECONDARY_CACHE;
}
}
void SetIsStandalone(bool is_standalone) {
if (is_standalone) {
im_flags |= IM_IS_STANDALONE;
@ -163,11 +210,22 @@ struct LRUHandle {
}
}
void Free(MemoryAllocator* allocator) {
void Free() {
assert(refs == 0);
assert(helper);
if (helper->del_cb) {
helper->del_cb(value, allocator);
if (!IsSecondaryCacheCompatible() && info_.deleter) {
(*info_.deleter)(key(), value);
} else if (IsSecondaryCacheCompatible()) {
if (IsPending()) {
assert(sec_handle != nullptr);
SecondaryCacheResultHandle* tmp_sec_handle = sec_handle;
tmp_sec_handle->Wait();
value = tmp_sec_handle->Value();
delete tmp_sec_handle;
}
if (value) {
(*info_.helper->del_cb)(key(), value);
}
}
free(this);
@ -209,7 +267,7 @@ struct LRUHandle {
// 4.4.3's builtin hashtable.
class LRUHandleTable {
public:
explicit LRUHandleTable(int max_upper_hash_bits, MemoryAllocator* allocator);
explicit LRUHandleTable(int max_upper_hash_bits);
~LRUHandleTable();
LRUHandle* Lookup(const Slice& key, uint32_t hash);
@ -233,8 +291,6 @@ class LRUHandleTable {
size_t GetOccupancyCount() const { return elems_; }
MemoryAllocator* GetAllocator() const { return allocator_; }
private:
// Return a pointer to slot that points to a cache entry that
// matches key/hash. If there is no such cache entry, return a
@ -256,22 +312,16 @@ class LRUHandleTable {
// Set from max_upper_hash_bits (see constructor).
const int max_length_bits_;
// From Cache, needed for delete
MemoryAllocator* const allocator_;
};
// A single shard of sharded cache.
class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
public:
// NOTE: the eviction_callback ptr is saved, as it is assumed to be kept
// alive in Cache.
LRUCacheShard(size_t capacity, bool strict_capacity_limit,
double high_pri_pool_ratio, double low_pri_pool_ratio,
bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy,
int max_upper_hash_bits, MemoryAllocator* allocator,
const Cache::EvictionCallback* eviction_callback);
int max_upper_hash_bits, SecondaryCache* secondary_cache);
public: // Type definitions expected as parameter to ShardedCache
using HandleImpl = LRUHandle;
@ -279,8 +329,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
using HashCref = uint32_t;
public: // Function definitions expected as parameter to ShardedCache
static inline HashVal ComputeHash(const Slice& key, uint32_t seed) {
return Lower32of64(GetSliceNPHash64(key, seed));
static inline HashVal ComputeHash(const Slice& key) {
return Lower32of64(GetSliceNPHash64(key));
}
// Separate from constructor so caller can easily make an array of LRUCache
@ -298,21 +348,29 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
void SetLowPriorityPoolRatio(double low_pri_pool_ratio);
// Like Cache methods, but with an extra "hash" parameter.
Status Insert(const Slice& key, uint32_t hash, Cache::ObjectPtr value,
const Cache::CacheItemHelper* helper, size_t charge,
LRUHandle** handle, Cache::Priority priority);
LRUHandle* CreateStandalone(const Slice& key, uint32_t hash,
Cache::ObjectPtr obj,
const Cache::CacheItemHelper* helper,
size_t charge, bool allow_uncharged);
inline Status Insert(const Slice& key, uint32_t hash, void* value,
size_t charge, Cache::DeleterFn deleter,
LRUHandle** handle, Cache::Priority priority) {
return Insert(key, hash, value, charge, deleter, nullptr, handle, priority);
}
inline Status Insert(const Slice& key, uint32_t hash, void* value,
const Cache::CacheItemHelper* helper, size_t charge,
LRUHandle** handle, Cache::Priority priority) {
assert(helper);
return Insert(key, hash, value, charge, nullptr, helper, handle, priority);
}
// If helper_cb is null, the values of the following arguments don't matter.
LRUHandle* Lookup(const Slice& key, uint32_t hash,
const Cache::CacheItemHelper* helper,
Cache::CreateContext* create_context,
Cache::Priority priority, Statistics* stats);
const Cache::CreateCallback& create_cb,
Cache::Priority priority, bool wait, Statistics* stats);
inline LRUHandle* Lookup(const Slice& key, uint32_t hash) {
return Lookup(key, hash, nullptr, nullptr, Cache::Priority::LOW, true,
nullptr);
}
bool Release(LRUHandle* handle, bool useful, bool erase_if_last_ref);
bool IsReady(LRUHandle* /*handle*/);
void Wait(LRUHandle* /*handle*/) {}
bool Ref(LRUHandle* handle);
void Erase(const Slice& key, uint32_t hash);
@ -326,9 +384,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
size_t GetTableAddressCount() const;
void ApplyToSomeEntries(
const std::function<void(const Slice& key, Cache::ObjectPtr value,
size_t charge,
const Cache::CacheItemHelper* helper)>& callback,
const std::function<void(const Slice& key, void* value, size_t charge,
DeleterFn deleter)>& callback,
size_t average_entries_per_lock, size_t* state);
void EraseUnRefEntries();
@ -352,10 +409,23 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
private:
friend class LRUCache;
// Insert an item into the hash table and, if handle is null, insert into
// the LRU list. Older items are evicted as necessary. Frees `item` on
// non-OK status.
Status InsertItem(LRUHandle* item, LRUHandle** handle);
// the LRU list. Older items are evicted as necessary. If the cache is full
// and free_handle_on_fail is true, the item is deleted and handle is set to
// nullptr.
Status InsertItem(LRUHandle* item, LRUHandle** handle,
bool free_handle_on_fail);
Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
DeleterFn deleter, const Cache::CacheItemHelper* helper,
LRUHandle** handle, Cache::Priority priority);
// Promote an item looked up from the secondary cache to the LRU cache.
// The item may be still in the secondary cache.
// It is only inserted into the hash table and not the LRU list, and only
// if the cache is not at full capacity, as is the case during Insert. The
// caller should hold a reference on the LRUHandle. When the caller releases
// the last reference, the item is added to the LRU list.
// The item is promoted to the high pri or low pri pool as specified by the
// caller in Lookup.
void Promote(LRUHandle* e);
void LRU_Remove(LRUHandle* e);
void LRU_Insert(LRUHandle* e);
@ -369,11 +439,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
// holding the mutex_.
void EvictFromLRU(size_t charge, autovector<LRUHandle*>* deleted);
void NotifyEvicted(const autovector<LRUHandle*>& evicted_handles);
LRUHandle* CreateHandle(const Slice& key, uint32_t hash,
Cache::ObjectPtr value,
const Cache::CacheItemHelper* helper, size_t charge);
// Try to insert the evicted handles into the secondary cache.
void TryInsertIntoSecondaryCache(autovector<LRUHandle*> evicted_handles);
// Initialized before use.
size_t capacity_;
@ -436,8 +503,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
// don't mind mutex_ invoking the non-const actions.
mutable DMutex mutex_;
// A reference to Cache::eviction_callback_
const Cache::EvictionCallback& eviction_callback_;
// Owned by LRUCache
SecondaryCache* secondary_cache_;
};
class LRUCache
@ -446,16 +513,28 @@ class LRUCache
#endif
: public ShardedCache<LRUCacheShard> {
public:
explicit LRUCache(const LRUCacheOptions& opts);
LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
double high_pri_pool_ratio, double low_pri_pool_ratio,
std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
CacheMetadataChargePolicy metadata_charge_policy =
kDontChargeCacheMetadata,
std::shared_ptr<SecondaryCache> secondary_cache = nullptr);
const char* Name() const override { return "LRUCache"; }
ObjectPtr Value(Handle* handle) override;
void* Value(Handle* handle) override;
size_t GetCharge(Handle* handle) const override;
const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override;
DeleterFn GetDeleter(Handle* handle) const override;
void WaitAll(std::vector<Handle*>& handles) override;
// Retrieves number of elements in LRU, for unit test purpose only.
size_t TEST_GetLRUSize();
// Retrieves high pri pool ratio.
double GetHighPriPoolRatio();
void AppendPrintableOptions(std::string& str) const override;
private:
std::shared_ptr<SecondaryCache> secondary_cache_;
};
} // namespace lru_cache

File diff suppressed because it is too large

@ -11,32 +11,20 @@ namespace ROCKSDB_NAMESPACE {
namespace {
void NoopDelete(Cache::ObjectPtr, MemoryAllocator*) {}
size_t SliceSize(void* obj) { return static_cast<Slice*>(obj)->size(); }
size_t SliceSize(Cache::ObjectPtr obj) {
return static_cast<Slice*>(obj)->size();
}
Status SliceSaveTo(Cache::ObjectPtr from_obj, size_t from_offset, size_t length,
char* out) {
Status SliceSaveTo(void* from_obj, size_t from_offset, size_t length,
void* out) {
const Slice& slice = *static_cast<Slice*>(from_obj);
std::memcpy(out, slice.data() + from_offset, length);
return Status::OK();
}
Status FailCreate(const Slice&, Cache::CreateContext*, MemoryAllocator*,
Cache::ObjectPtr*, size_t*) {
return Status::NotSupported("Only for dumping data into SecondaryCache");
}
} // namespace
Status SecondaryCache::InsertSaved(const Slice& key, const Slice& saved) {
static Cache::CacheItemHelper helper_no_secondary{CacheEntryRole::kMisc,
&NoopDelete};
static Cache::CacheItemHelper helper{
CacheEntryRole::kMisc, &NoopDelete, &SliceSize,
&SliceSaveTo, &FailCreate, &helper_no_secondary};
&SliceSize, &SliceSaveTo, GetNoopDeleterForRole<CacheEntryRole::kMisc>()};
// NOTE: depends on Insert() being synchronous, not keeping pointer `&saved`
return Insert(key, const_cast<Slice*>(&saved), &helper);
}

@ -1,433 +0,0 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "cache/secondary_cache_adapter.h"
#include "monitoring/perf_context_imp.h"
#include "util/cast_util.h"
namespace ROCKSDB_NAMESPACE {
namespace {
// A distinct pointer value for marking "dummy" cache entries
struct Dummy {
char val[7] = "kDummy";
};
const Dummy kDummy{};
Cache::ObjectPtr const kDummyObj = const_cast<Dummy*>(&kDummy);
} // namespace
// When CacheWithSecondaryAdapter is constructed with the distribute_cache_res
// parameter set to true, it manages the entire memory budget across the
// primary and secondary cache. The secondary cache is assumed to be in
// memory, such as the CompressedSecondaryCache. When a placeholder entry
// is inserted by a CacheReservationManager instance to reserve memory,
// the CacheWithSecondaryAdapter ensures that the reservation is distributed
// proportionally across the primary/secondary caches.
//
// The primary block cache is initially sized to the sum of the primary cache
// budget + the secondary cache budget, as follows -
// |--------- Primary Cache Configured Capacity -----------|
// |---Secondary Cache Budget----|----Primary Cache Budget-----|
//
// A ConcurrentCacheReservationManager member in the CacheWithSecondaryAdapter,
// pri_cache_res_,
// is used to help with tracking the distribution of memory reservations.
// Initially, it accounts for the entire secondary cache budget as a
// reservation against the primary cache. This shrinks the usable capacity of
// the primary cache to the budget that the user originally desired.
//
// |--Reservation for Sec Cache--|-Pri Cache Usable Capacity---|
//
// When a reservation placeholder is inserted into the adapter, it is inserted
// directly into the primary cache. This means the entire charge of the
// placeholder is counted against the primary cache. To compensate and count
// a portion of it against the secondary cache, the secondary cache Deflate()
// method is called to shrink it. Since the Deflate() causes the secondary
// cache's actual usage to shrink, it is reflected here by releasing an equal amount
// from the pri_cache_res_ reservation. The Deflate() in the secondary cache
// can be, but is not required to be, implemented using its own cache
// reservation manager.
//
// For example, if the pri/sec ratio is 70/30, and the combined capacity is
// 100MB, the intermediate and final state after inserting a reservation
// placeholder for 10MB would be as follows -
//
// |-Reservation for Sec Cache-|-Pri Cache Usable Capacity-|---R---|
// 1. After inserting the placeholder in primary
// |------- 30MB -------------|------- 60MB -------------|-10MB--|
// 2. After deflating the secondary and adjusting the reservation for
// secondary against the primary
// |------- 27MB -------------|------- 63MB -------------|-10MB--|
//
// Likewise, when the user inserted placeholder is released, the secondary
// cache Inflate() method is called to grow it, and the pri_cache_res_
// reservation is increased by an equal amount.
//
// Another way of implementing this would have been to simply split the user
// reservation into primary and seconary components. However, this would
// require allocating a structure to track the associated secondary cache
// reservation, which adds some complexity and overhead.
//
CacheWithSecondaryAdapter::CacheWithSecondaryAdapter(
std::shared_ptr<Cache> target,
std::shared_ptr<SecondaryCache> secondary_cache, bool distribute_cache_res)
: CacheWrapper(std::move(target)),
secondary_cache_(std::move(secondary_cache)),
distribute_cache_res_(distribute_cache_res) {
target_->SetEvictionCallback([this](const Slice& key, Handle* handle) {
return EvictionHandler(key, handle);
});
if (distribute_cache_res_) {
size_t sec_capacity = 0;
pri_cache_res_ = std::make_shared<ConcurrentCacheReservationManager>(
std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
target_));
Status s = secondary_cache_->GetCapacity(sec_capacity);
assert(s.ok());
// Initially, the primary cache is sized to the uncompressed cache budget plus
// the compressed secondary cache budget. The secondary cache budget is then
// taken away from the primary cache through cache reservations. Later,
// when a placeholder entry is inserted by the caller, it is inserted
// into the primary cache and the portion that should be assigned to the
// secondary cache is freed from the reservation.
s = pri_cache_res_->UpdateCacheReservation(sec_capacity);
assert(s.ok());
sec_cache_res_ratio_ = (double)sec_capacity / target_->GetCapacity();
}
}
CacheWithSecondaryAdapter::~CacheWithSecondaryAdapter() {
// `*this` will be destroyed before `*target_`, so we have to prevent
// use after free
target_->SetEvictionCallback({});
#ifndef NDEBUG
if (distribute_cache_res_) {
size_t sec_capacity = 0;
Status s = secondary_cache_->GetCapacity(sec_capacity);
assert(s.ok());
assert(pri_cache_res_->GetTotalReservedCacheSize() == sec_capacity);
}
#endif // NDEBUG
}
bool CacheWithSecondaryAdapter::EvictionHandler(const Slice& key,
Handle* handle) {
auto helper = GetCacheItemHelper(handle);
if (helper->IsSecondaryCacheCompatible()) {
auto obj = target_->Value(handle);
// Ignore dummy entry
if (obj != kDummyObj) {
// Spill into secondary cache.
secondary_cache_->Insert(key, obj, helper).PermitUncheckedError();
}
}
// Never takes ownership of obj
return false;
}
bool CacheWithSecondaryAdapter::ProcessDummyResult(Cache::Handle** handle,
bool erase) {
if (*handle && target_->Value(*handle) == kDummyObj) {
target_->Release(*handle, erase);
*handle = nullptr;
return true;
} else {
return false;
}
}
void CacheWithSecondaryAdapter::CleanupCacheObject(
ObjectPtr obj, const CacheItemHelper* helper) {
if (helper->del_cb) {
helper->del_cb(obj, memory_allocator());
}
}
Cache::Handle* CacheWithSecondaryAdapter::Promote(
std::unique_ptr<SecondaryCacheResultHandle>&& secondary_handle,
const Slice& key, const CacheItemHelper* helper, Priority priority,
Statistics* stats, bool found_dummy_entry, bool kept_in_sec_cache) {
assert(secondary_handle->IsReady());
ObjectPtr obj = secondary_handle->Value();
if (!obj) {
// Nothing found.
return nullptr;
}
// Found something.
switch (helper->role) {
case CacheEntryRole::kFilterBlock:
RecordTick(stats, SECONDARY_CACHE_FILTER_HITS);
break;
case CacheEntryRole::kIndexBlock:
RecordTick(stats, SECONDARY_CACHE_INDEX_HITS);
break;
case CacheEntryRole::kDataBlock:
RecordTick(stats, SECONDARY_CACHE_DATA_HITS);
break;
default:
break;
}
PERF_COUNTER_ADD(secondary_cache_hit_count, 1);
RecordTick(stats, SECONDARY_CACHE_HITS);
// Note: SecondaryCache::Size() is really charge (from the CreateCallback)
size_t charge = secondary_handle->Size();
Handle* result = nullptr;
// Insert into primary cache, possibly as a standalone+dummy entries.
if (secondary_cache_->SupportForceErase() && !found_dummy_entry) {
// Create standalone and insert dummy
// Allow standalone to be created even if cache is full, to avoid
// reading the entry from storage.
result =
CreateStandalone(key, obj, helper, charge, /*allow_uncharged*/ true);
assert(result);
PERF_COUNTER_ADD(block_cache_standalone_handle_count, 1);
// Insert dummy to record recent use
// TODO: try to avoid case where inserting this dummy could overwrite a
// regular entry
Status s = Insert(key, kDummyObj, &kNoopCacheItemHelper, /*charge=*/0,
/*handle=*/nullptr, priority);
s.PermitUncheckedError();
// Nothing to do or clean up on dummy insertion failure
} else {
// Insert regular entry into primary cache.
// Don't allow it to spill into secondary cache again if it was kept there.
Status s = Insert(
key, obj, kept_in_sec_cache ? helper->without_secondary_compat : helper,
charge, &result, priority);
if (s.ok()) {
assert(result);
PERF_COUNTER_ADD(block_cache_real_handle_count, 1);
} else {
// Create standalone result instead, even if cache is full, to avoid
// reading the entry from storage.
result =
CreateStandalone(key, obj, helper, charge, /*allow_uncharged*/ true);
assert(result);
PERF_COUNTER_ADD(block_cache_standalone_handle_count, 1);
}
}
return result;
}
Status CacheWithSecondaryAdapter::Insert(const Slice& key, ObjectPtr value,
const CacheItemHelper* helper,
size_t charge, Handle** handle,
Priority priority) {
Status s = target_->Insert(key, value, helper, charge, handle, priority);
if (s.ok() && value == nullptr && distribute_cache_res_) {
size_t sec_charge = static_cast<size_t>(charge * (sec_cache_res_ratio_));
s = secondary_cache_->Deflate(sec_charge);
assert(s.ok());
s = pri_cache_res_->UpdateCacheReservation(sec_charge, /*increase=*/false);
assert(s.ok());
}
return s;
}
Cache::Handle* CacheWithSecondaryAdapter::Lookup(const Slice& key,
const CacheItemHelper* helper,
CreateContext* create_context,
Priority priority,
Statistics* stats) {
// NOTE: we could just StartAsyncLookup() and Wait(), but this should be a bit
// more efficient
Handle* result =
target_->Lookup(key, helper, create_context, priority, stats);
bool secondary_compatible = helper && helper->IsSecondaryCacheCompatible();
bool found_dummy_entry =
ProcessDummyResult(&result, /*erase=*/secondary_compatible);
if (!result && secondary_compatible) {
// Try our secondary cache
bool kept_in_sec_cache = false;
std::unique_ptr<SecondaryCacheResultHandle> secondary_handle =
secondary_cache_->Lookup(key, helper, create_context, /*wait*/ true,
found_dummy_entry, /*out*/ kept_in_sec_cache);
if (secondary_handle) {
result = Promote(std::move(secondary_handle), key, helper, priority,
stats, found_dummy_entry, kept_in_sec_cache);
}
}
return result;
}
bool CacheWithSecondaryAdapter::Release(Handle* handle,
bool erase_if_last_ref) {
if (erase_if_last_ref) {
ObjectPtr v = target_->Value(handle);
if (v == nullptr && distribute_cache_res_) {
size_t charge = target_->GetCharge(handle);
size_t sec_charge = static_cast<size_t>(charge * (sec_cache_res_ratio_));
Status s = secondary_cache_->Inflate(sec_charge);
assert(s.ok());
s = pri_cache_res_->UpdateCacheReservation(sec_charge, /*increase=*/true);
assert(s.ok());
}
}
return target_->Release(handle, erase_if_last_ref);
}
Cache::ObjectPtr CacheWithSecondaryAdapter::Value(Handle* handle) {
ObjectPtr v = target_->Value(handle);
// TODO with stacked secondaries: might fail in EvictionHandler
assert(v != kDummyObj);
return v;
}
void CacheWithSecondaryAdapter::StartAsyncLookupOnMySecondary(
AsyncLookupHandle& async_handle) {
assert(!async_handle.IsPending());
assert(async_handle.result_handle == nullptr);
std::unique_ptr<SecondaryCacheResultHandle> secondary_handle =
secondary_cache_->Lookup(async_handle.key, async_handle.helper,
async_handle.create_context, /*wait*/ false,
async_handle.found_dummy_entry,
/*out*/ async_handle.kept_in_sec_cache);
if (secondary_handle) {
// TODO with stacked secondaries: Check & process if already ready?
async_handle.pending_handle = secondary_handle.release();
async_handle.pending_cache = secondary_cache_.get();
}
}
void CacheWithSecondaryAdapter::StartAsyncLookup(
AsyncLookupHandle& async_handle) {
target_->StartAsyncLookup(async_handle);
if (!async_handle.IsPending()) {
bool secondary_compatible =
async_handle.helper &&
async_handle.helper->IsSecondaryCacheCompatible();
async_handle.found_dummy_entry |= ProcessDummyResult(
&async_handle.result_handle, /*erase=*/secondary_compatible);
if (async_handle.Result() == nullptr && secondary_compatible) {
// Not found and not pending on another secondary cache
StartAsyncLookupOnMySecondary(async_handle);
}
}
}
void CacheWithSecondaryAdapter::WaitAll(AsyncLookupHandle* async_handles,
size_t count) {
if (count == 0) {
// Nothing to do
return;
}
// Requests that are pending on *my* secondary cache, at the start of this
// function
std::vector<AsyncLookupHandle*> my_pending;
// Requests that are pending on an "inner" secondary cache (managed somewhere
// under target_), as of the start of this function
std::vector<AsyncLookupHandle*> inner_pending;
// Initial accounting of pending handles, excluding those already handled
// by "outer" secondary caches. (See cur->pending_cache = nullptr.)
for (size_t i = 0; i < count; ++i) {
AsyncLookupHandle* cur = async_handles + i;
if (cur->pending_cache) {
assert(cur->IsPending());
assert(cur->helper);
assert(cur->helper->IsSecondaryCacheCompatible());
if (cur->pending_cache == secondary_cache_.get()) {
my_pending.push_back(cur);
// Mark as "to be handled by this caller"
cur->pending_cache = nullptr;
} else {
// Remember as potentially needing a lookup in my secondary
inner_pending.push_back(cur);
}
}
}
// Wait on inner-most cache lookups first
// TODO with stacked secondaries: because we are not using proper
// async/await constructs here yet, there is a false synchronization point
// here where all the results at one level are needed before initiating
// any lookups at the next level. Probably not a big deal, but worth noting.
if (!inner_pending.empty()) {
target_->WaitAll(async_handles, count);
}
// For those that failed to find something, convert to lookup in my
// secondary cache.
for (AsyncLookupHandle* cur : inner_pending) {
if (cur->Result() == nullptr) {
// Not found, try my secondary
StartAsyncLookupOnMySecondary(*cur);
if (cur->IsPending()) {
assert(cur->pending_cache == secondary_cache_.get());
my_pending.push_back(cur);
// Mark as "to be handled by this caller"
cur->pending_cache = nullptr;
}
}
}
// Wait on all lookups on my secondary cache
{
std::vector<SecondaryCacheResultHandle*> my_secondary_handles;
for (AsyncLookupHandle* cur : my_pending) {
my_secondary_handles.push_back(cur->pending_handle);
}
secondary_cache_->WaitAll(std::move(my_secondary_handles));
}
// Process results
for (AsyncLookupHandle* cur : my_pending) {
std::unique_ptr<SecondaryCacheResultHandle> secondary_handle(
cur->pending_handle);
cur->pending_handle = nullptr;
cur->result_handle = Promote(
std::move(secondary_handle), cur->key, cur->helper, cur->priority,
cur->stats, cur->found_dummy_entry, cur->kept_in_sec_cache);
assert(cur->pending_cache == nullptr);
}
}
std::string CacheWithSecondaryAdapter::GetPrintableOptions() const {
std::string str = target_->GetPrintableOptions();
str.append(" secondary_cache:\n");
str.append(secondary_cache_->GetPrintableOptions());
return str;
}
const char* CacheWithSecondaryAdapter::Name() const {
// To the user, at least for now, this appears to be the underlying cache
// configured with a secondary cache, so we pretend to be that cache.
return target_->Name();
}
std::shared_ptr<Cache> NewTieredVolatileCache(
TieredVolatileCacheOptions& opts) {
if (!opts.cache_opts) {
return nullptr;
}
std::shared_ptr<Cache> cache;
if (opts.cache_type == PrimaryCacheType::kCacheTypeLRU) {
LRUCacheOptions cache_opts =
*(static_cast_with_check<LRUCacheOptions, ShardedCacheOptions>(
opts.cache_opts));
cache_opts.capacity += opts.comp_cache_opts.capacity;
cache = cache_opts.MakeSharedCache();
} else if (opts.cache_type == PrimaryCacheType::kCacheTypeHCC) {
HyperClockCacheOptions cache_opts =
*(static_cast_with_check<HyperClockCacheOptions, ShardedCacheOptions>(
opts.cache_opts));
cache = cache_opts.MakeSharedCache();
} else {
return nullptr;
}
std::shared_ptr<SecondaryCache> sec_cache;
sec_cache = NewCompressedSecondaryCache(opts.comp_cache_opts);
return std::make_shared<CacheWithSecondaryAdapter>(cache, sec_cache, true);
}
} // namespace ROCKSDB_NAMESPACE
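For reference, a hypothetical caller-side sketch of the NewTieredVolatileCache factory shown above, mirroring the CompressedSecCacheTestWithTiered fixture earlier in this diff (all names are taken from that fixture and from the factory itself):
LRUCacheOptions lru_opts;
lru_opts.capacity = 70 << 20;                     // primary (uncompressed) budget
TieredVolatileCacheOptions tiered_opts;
tiered_opts.cache_opts = &lru_opts;               // primary cache configuration
tiered_opts.cache_type = PrimaryCacheType::kCacheTypeLRU;
tiered_opts.comp_cache_opts.capacity = 30 << 20;  // compressed secondary budget
std::shared_ptr<Cache> tiered = NewTieredVolatileCache(tiered_opts);
// The returned cache is a CacheWithSecondaryAdapter whose primary capacity is
// 70 + 30 = 100 MiB, with the 30 MiB secondary budget tracked through the
// reservation scheme described in the comment above.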

@ -1,76 +0,0 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include "cache/cache_reservation_manager.h"
#include "rocksdb/secondary_cache.h"
namespace ROCKSDB_NAMESPACE {
class CacheWithSecondaryAdapter : public CacheWrapper {
public:
explicit CacheWithSecondaryAdapter(
std::shared_ptr<Cache> target,
std::shared_ptr<SecondaryCache> secondary_cache,
bool distribute_cache_res = false);
~CacheWithSecondaryAdapter() override;
Status Insert(const Slice& key, ObjectPtr value,
const CacheItemHelper* helper, size_t charge,
Handle** handle = nullptr,
Priority priority = Priority::LOW) override;
Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
CreateContext* create_context,
Priority priority = Priority::LOW,
Statistics* stats = nullptr) override;
using Cache::Release;
bool Release(Handle* handle, bool erase_if_last_ref = false) override;
ObjectPtr Value(Handle* handle) override;
void StartAsyncLookup(AsyncLookupHandle& async_handle) override;
void WaitAll(AsyncLookupHandle* async_handles, size_t count) override;
std::string GetPrintableOptions() const override;
const char* Name() const override;
Cache* TEST_GetCache() { return target_.get(); }
SecondaryCache* TEST_GetSecondaryCache() { return secondary_cache_.get(); }
private:
bool EvictionHandler(const Slice& key, Handle* handle);
void StartAsyncLookupOnMySecondary(AsyncLookupHandle& async_handle);
Handle* Promote(
std::unique_ptr<SecondaryCacheResultHandle>&& secondary_handle,
const Slice& key, const CacheItemHelper* helper, Priority priority,
Statistics* stats, bool found_dummy_entry, bool kept_in_sec_cache);
bool ProcessDummyResult(Cache::Handle** handle, bool erase);
void CleanupCacheObject(ObjectPtr obj, const CacheItemHelper* helper);
std::shared_ptr<SecondaryCache> secondary_cache_;
// Whether to proportionally distribute cache memory reservations, i.e
// placeholder entries with null value and a non-zero charge, across
// the primary and secondary caches.
bool distribute_cache_res_;
// A cache reservation manager to keep track of secondary cache memory
// usage by reserving equivalent capacity against the primary cache
std::shared_ptr<ConcurrentCacheReservationManager> pri_cache_res_;
// Fraction of a cache memory reservation to be assigned to the secondary
// cache
double sec_cache_res_ratio_;
};
} // namespace ROCKSDB_NAMESPACE

@ -13,57 +13,20 @@
#include <cstdint>
#include <memory>
#include "env/unique_id_gen.h"
#include "rocksdb/env.h"
#include "util/hash.h"
#include "util/math.h"
#include "util/mutexlock.h"
namespace ROCKSDB_NAMESPACE {
namespace {
// The generated seeds must fit in 31 bits so that
// ShardedCacheOptions::hash_seed can be set to it explicitly, for
// diagnostic/debugging purposes.
constexpr uint32_t kSeedMask = 0x7fffffff;
uint32_t DetermineSeed(int32_t hash_seed_option) {
if (hash_seed_option >= 0) {
// User-specified exact seed
return static_cast<uint32_t>(hash_seed_option);
}
static SemiStructuredUniqueIdGen gen;
if (hash_seed_option == ShardedCacheOptions::kHostHashSeed) {
std::string hostname;
Status s = Env::Default()->GetHostNameString(&hostname);
if (s.ok()) {
return GetSliceHash(hostname) & kSeedMask;
} else {
// Fall back on something stable within the process.
return BitwiseAnd(gen.GetBaseUpper(), kSeedMask);
}
} else {
// for kQuasiRandomHashSeed and fallback
uint32_t val = gen.GenerateNext<uint32_t>() & kSeedMask;
// Perform some 31-bit bijective transformations so that we get
// quasirandom, not just incrementing. (An incrementing seed from a
// random starting point would be fine, but hard to describe in a name.)
// See https://en.wikipedia.org/wiki/Quasirandom. We use a murmur-like
// transformation here for our bijection in the lower 31 bits.
// See https://en.wikipedia.org/wiki/MurmurHash
val *= /*31-bit prime*/ 1150630961;
val ^= (val & kSeedMask) >> 17;
val *= /*31-bit prime*/ 1320603883;
return val & kSeedMask;
}
}
} // namespace
ShardedCacheBase::ShardedCacheBase(const ShardedCacheOptions& opts)
: Cache(opts.memory_allocator),
ShardedCacheBase::ShardedCacheBase(size_t capacity, int num_shard_bits,
bool strict_capacity_limit,
std::shared_ptr<MemoryAllocator> allocator)
: Cache(std::move(allocator)),
last_id_(1),
shard_mask_((uint32_t{1} << opts.num_shard_bits) - 1),
hash_seed_(DetermineSeed(opts.hash_seed)),
strict_capacity_limit_(opts.strict_capacity_limit),
capacity_(opts.capacity) {}
shard_mask_((uint32_t{1} << num_shard_bits) - 1),
strict_capacity_limit_(strict_capacity_limit),
capacity_(capacity) {}
size_t ShardedCacheBase::ComputePerShardCapacity(size_t capacity) const {
uint32_t num_shards = GetNumShards();

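The seed derivation above masks values to 31 bits and applies a murmur-like mixing step so that consecutive generator outputs become quasirandom seeds. A standalone reproduction of just that mixing step (same primes and shift as in the hunk), shown only to illustrate that the result always stays within kSeedMask:

#include <cstdint>
#include <iostream>

constexpr uint32_t kSeedMask = 0x7fffffff;

// Toy reproduction of the 31-bit murmur-like bijection above; not a
// replacement for DetermineSeed, just the mixing arithmetic in isolation.
uint32_t MixSeed31(uint32_t val) {
  val &= kSeedMask;
  val *= 1150630961;  // 31-bit prime
  val ^= (val & kSeedMask) >> 17;
  val *= 1320603883;  // 31-bit prime
  return val & kSeedMask;
}

int main() {
  for (uint32_t i = 0; i < 4; ++i) {
    std::cout << MixSeed31(i) << '\n';  // distinct values, all <= kSeedMask
  }
  return 0;
}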
@ -15,7 +15,7 @@
#include "port/lang.h"
#include "port/port.h"
#include "rocksdb/advanced_cache.h"
#include "rocksdb/cache.h"
#include "util/hash.h"
#include "util/mutexlock.h"
@ -34,8 +34,8 @@ class CacheShardBase {
std::string GetPrintableOptions() const { return ""; }
using HashVal = uint64_t;
using HashCref = uint64_t;
static inline HashVal ComputeHash(const Slice& key, uint32_t seed) {
return GetSliceNPHash64(key, seed);
static inline HashVal ComputeHash(const Slice& key) {
return GetSliceNPHash64(key);
}
static inline uint32_t HashPieceForSharding(HashCref hash) {
return Lower32of64(hash);
@ -49,19 +49,21 @@ class CacheShardBase {
HashCref GetHash() const;
...
};
Status Insert(const Slice& key, HashCref hash, Cache::ObjectPtr value,
Status Insert(const Slice& key, HashCref hash, void* value, size_t charge,
DeleterFn deleter, HandleImpl** handle,
Cache::Priority priority) = 0;
Status Insert(const Slice& key, HashCref hash, void* value,
const Cache::CacheItemHelper* helper, size_t charge,
HandleImpl** handle, Cache::Priority priority,
bool standalone) = 0;
Handle* CreateStandalone(const Slice& key, HashCref hash, ObjectPtr obj,
const CacheItemHelper* helper,
size_t charge, bool allow_uncharged) = 0;
HandleImpl** handle, Cache::Priority priority) = 0;
HandleImpl* Lookup(const Slice& key, HashCref hash) = 0;
HandleImpl* Lookup(const Slice& key, HashCref hash,
const Cache::CacheItemHelper* helper,
Cache::CreateContext* create_context,
Cache::Priority priority,
const Cache::CreateCallback& create_cb,
Cache::Priority priority, bool wait,
Statistics* stats) = 0;
bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref) = 0;
bool IsReady(HandleImpl* handle) = 0;
void Wait(HandleImpl* handle) = 0;
bool Ref(HandleImpl* handle) = 0;
void Erase(const Slice& key, HashCref hash) = 0;
void SetCapacity(size_t capacity) = 0;
@ -75,9 +77,8 @@ class CacheShardBase {
// *state == 0 and implementation sets *state = SIZE_MAX to indicate
// completion.
void ApplyToSomeEntries(
const std::function<void(const Slice& key, ObjectPtr value,
size_t charge,
const Cache::CacheItemHelper* helper)>& callback,
const std::function<void(const Slice& key, void* value, size_t charge,
DeleterFn deleter)>& callback,
size_t average_entries_per_lock, size_t* state) = 0;
void EraseUnRefEntries() = 0;
*/
@ -89,7 +90,9 @@ class CacheShardBase {
// Portions of ShardedCache that do not depend on the template parameter
class ShardedCacheBase : public Cache {
public:
explicit ShardedCacheBase(const ShardedCacheOptions& opts);
ShardedCacheBase(size_t capacity, int num_shard_bits,
bool strict_capacity_limit,
std::shared_ptr<MemoryAllocator> memory_allocator);
virtual ~ShardedCacheBase() = default;
int GetNumShardBits() const;
@ -104,8 +107,6 @@ class ShardedCacheBase : public Cache {
size_t GetUsage(Handle* handle) const override;
std::string GetPrintableOptions() const override;
uint32_t GetHashSeed() const override { return hash_seed_; }
protected: // fns
virtual void AppendPrintableOptions(std::string& str) const = 0;
size_t GetPerShardCapacity() const;
@ -114,7 +115,6 @@ class ShardedCacheBase : public Cache {
protected: // data
std::atomic<uint64_t> last_id_; // For NewId
const uint32_t shard_mask_;
const uint32_t hash_seed_;
// Dynamic configuration parameters, guarded by config_mutex_
bool strict_capacity_limit_;
@ -135,8 +135,10 @@ class ShardedCache : public ShardedCacheBase {
using HashCref = typename CacheShard::HashCref;
using HandleImpl = typename CacheShard::HandleImpl;
explicit ShardedCache(const ShardedCacheOptions& opts)
: ShardedCacheBase(opts),
ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
std::shared_ptr<MemoryAllocator> allocator)
: ShardedCacheBase(capacity, num_shard_bits, strict_capacity_limit,
allocator),
shards_(reinterpret_cast<CacheShard*>(port::cacheline_aligned_alloc(
sizeof(CacheShard) * GetNumShards()))),
destroy_shards_in_dtor_(false) {}
@ -170,38 +172,41 @@ class ShardedCache : public ShardedCacheBase {
[s_c_l](CacheShard* cs) { cs->SetStrictCapacityLimit(s_c_l); });
}
Status Insert(const Slice& key, ObjectPtr obj, const CacheItemHelper* helper,
Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter,
Handle** handle, Priority priority) override {
HashVal hash = CacheShard::ComputeHash(key);
auto h_out = reinterpret_cast<HandleImpl**>(handle);
return GetShard(hash).Insert(key, hash, value, charge, deleter, h_out,
priority);
}
Status Insert(const Slice& key, void* value, const CacheItemHelper* helper,
size_t charge, Handle** handle = nullptr,
Priority priority = Priority::LOW) override {
assert(helper);
HashVal hash = CacheShard::ComputeHash(key, hash_seed_);
if (!helper) {
return Status::InvalidArgument();
}
HashVal hash = CacheShard::ComputeHash(key);
auto h_out = reinterpret_cast<HandleImpl**>(handle);
return GetShard(hash).Insert(key, hash, obj, helper, charge, h_out,
return GetShard(hash).Insert(key, hash, value, helper, charge, h_out,
priority);
}
Handle* CreateStandalone(const Slice& key, ObjectPtr obj,
const CacheItemHelper* helper, size_t charge,
bool allow_uncharged) override {
assert(helper);
HashVal hash = CacheShard::ComputeHash(key, hash_seed_);
HandleImpl* result = GetShard(hash).CreateStandalone(
key, hash, obj, helper, charge, allow_uncharged);
Handle* Lookup(const Slice& key, Statistics* /*stats*/) override {
HashVal hash = CacheShard::ComputeHash(key);
HandleImpl* result = GetShard(hash).Lookup(key, hash);
return reinterpret_cast<Handle*>(result);
}
Handle* Lookup(const Slice& key, const CacheItemHelper* helper = nullptr,
CreateContext* create_context = nullptr,
Priority priority = Priority::LOW,
Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
const CreateCallback& create_cb, Priority priority, bool wait,
Statistics* stats = nullptr) override {
HashVal hash = CacheShard::ComputeHash(key, hash_seed_);
HandleImpl* result = GetShard(hash).Lookup(key, hash, helper,
create_context, priority, stats);
HashVal hash = CacheShard::ComputeHash(key);
HandleImpl* result = GetShard(hash).Lookup(key, hash, helper, create_cb,
priority, wait, stats);
return reinterpret_cast<Handle*>(result);
}
void Erase(const Slice& key) override {
HashVal hash = CacheShard::ComputeHash(key, hash_seed_);
HashVal hash = CacheShard::ComputeHash(key);
GetShard(hash).Erase(key, hash);
}
@ -210,6 +215,14 @@ class ShardedCache : public ShardedCacheBase {
auto h = reinterpret_cast<HandleImpl*>(handle);
return GetShard(h->GetHash()).Release(h, useful, erase_if_last_ref);
}
bool IsReady(Handle* handle) override {
auto h = reinterpret_cast<HandleImpl*>(handle);
return GetShard(h->GetHash()).IsReady(h);
}
void Wait(Handle* handle) override {
auto h = reinterpret_cast<HandleImpl*>(handle);
GetShard(h->GetHash()).Wait(h);
}
bool Ref(Handle* handle) override {
auto h = reinterpret_cast<HandleImpl*>(handle);
return GetShard(h->GetHash()).Ref(h);
@ -225,14 +238,14 @@ class ShardedCache : public ShardedCacheBase {
return SumOverShards2(&CacheShard::GetPinnedUsage);
}
size_t GetOccupancyCount() const override {
return SumOverShards2(&CacheShard::GetOccupancyCount);
return SumOverShards2(&CacheShard::GetPinnedUsage);
}
size_t GetTableAddressCount() const override {
return SumOverShards2(&CacheShard::GetTableAddressCount);
}
void ApplyToAllEntries(
const std::function<void(const Slice& key, ObjectPtr value, size_t charge,
const CacheItemHelper* helper)>& callback,
const std::function<void(const Slice& key, void* value, size_t charge,
DeleterFn deleter)>& callback,
const ApplyToAllEntriesOptions& opts) override {
uint32_t num_shards = GetNumShards();
// Iterate over part of each shard, rotating between shards, to

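The ApplyToSomeEntries contract quoted in the hunk above (the caller starts with *state == 0 and the shard sets *state = SIZE_MAX when it has covered everything) implies a simple per-shard drain loop. A hedged sketch of that loop, where scan_some stands in for the shard's ApplyToSomeEntries:

#include <cstddef>
#include <cstdint>
#include <functional>

// Illustrative driver for the *state protocol described above: keep calling
// the shard until it reports completion by setting state to SIZE_MAX.
void DrainOneShard(
    const std::function<void(std::size_t average_entries_per_lock,
                             std::size_t* state)>& scan_some,
    std::size_t average_entries_per_lock) {
  std::size_t state = 0;  // protocol: start at 0
  while (state != SIZE_MAX) {
    scan_some(average_entries_per_lock, &state);
  }
}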
375
cache/typed_cache.h vendored

@ -1,375 +0,0 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
// APIs for accessing Cache in a type-safe and convenient way. Cache is kept
// at a low, thin level of abstraction so that different implementations can
// be plugged in, but these wrappers provide clean, convenient access to the
// most common operations.
//
// A number of template classes are needed for sharing common structure. The
// key classes are these:
//
// * PlaceholderCacheInterface - Used for making cache reservations, with
// entries that have a charge but no value.
// * BasicTypedCacheInterface<TValue> - Used for primary cache storage of
// objects of type TValue.
// * FullTypedCacheHelper<TValue, TCreateContext> - Used for secondary cache
// compatible storage of objects of type TValue.
// * For each of these, there's a "Shared" version
// (e.g. FullTypedSharedCacheInterface) that holds a shared_ptr to the Cache,
// rather than assuming external ownership by holding only a raw `Cache*`.
#pragma once
#include <algorithm>
#include <cstdint>
#include <memory>
#include <type_traits>
#include "cache/cache_helpers.h"
#include "rocksdb/advanced_cache.h"
#include "rocksdb/advanced_options.h"
namespace ROCKSDB_NAMESPACE {
// For future consideration:
// * Pass in value to Insert with std::unique_ptr& to simplify ownership
// transfer logic in callers
// * Make key type a template parameter (e.g. useful for table cache)
// * Closer integration with CacheHandleGuard (opt-in, so not always
// paying the extra overhead)
#define CACHE_TYPE_DEFS() \
using Priority = Cache::Priority; \
using Handle = Cache::Handle; \
using ObjectPtr = Cache::ObjectPtr; \
using CreateContext = Cache::CreateContext; \
using CacheItemHelper = Cache::CacheItemHelper /* caller ; */
template <typename CachePtr>
class BaseCacheInterface {
public:
CACHE_TYPE_DEFS();
/*implicit*/ BaseCacheInterface(CachePtr cache) : cache_(std::move(cache)) {}
inline void Release(Handle* handle) { cache_->Release(handle); }
inline void ReleaseAndEraseIfLastRef(Handle* handle) {
cache_->Release(handle, /*erase_if_last_ref*/ true);
}
inline void RegisterReleaseAsCleanup(Handle* handle, Cleanable& cleanable) {
cleanable.RegisterCleanup(&ReleaseCacheHandleCleanup, get(), handle);
}
inline Cache* get() const { return &*cache_; }
explicit inline operator bool() const noexcept { return cache_ != nullptr; }
protected:
CachePtr cache_;
};
// PlaceholderCacheInterface - Used for making cache reservations, with
// entries that have a charge but no value. CacheEntryRole is required as
// a template parameter.
template <CacheEntryRole kRole, typename CachePtr = Cache*>
class PlaceholderCacheInterface : public BaseCacheInterface<CachePtr> {
public:
CACHE_TYPE_DEFS();
using BaseCacheInterface<CachePtr>::BaseCacheInterface;
inline Status Insert(const Slice& key, size_t charge, Handle** handle) {
return this->cache_->Insert(key, /*value=*/nullptr, GetHelper(), charge,
handle);
}
static const Cache::CacheItemHelper* GetHelper() {
static const Cache::CacheItemHelper kHelper{kRole};
return &kHelper;
}
};
template <CacheEntryRole kRole>
using PlaceholderSharedCacheInterface =
PlaceholderCacheInterface<kRole, std::shared_ptr<Cache>>;
template <class TValue>
class BasicTypedCacheHelperFns {
public:
CACHE_TYPE_DEFS();
// E.g. char* for char[]
using TValuePtr = std::remove_extent_t<TValue>*;
protected:
inline static ObjectPtr UpCastValue(TValuePtr value) { return value; }
inline static TValuePtr DownCastValue(ObjectPtr value) {
return static_cast<TValuePtr>(value);
}
static void Delete(ObjectPtr value, MemoryAllocator* allocator) {
// FIXME: Currently, no callers actually allocate the ObjectPtr objects
// using the custom allocator, just subobjects that keep a reference to
// the allocator themselves (with CacheAllocationPtr).
if (/*DISABLED*/ false && allocator) {
if constexpr (std::is_destructible_v<TValue>) {
DownCastValue(value)->~TValue();
}
allocator->Deallocate(value);
} else {
// Like delete but properly handles TValue=char[] etc.
std::default_delete<TValue>{}(DownCastValue(value));
}
}
};
// In its own class to try to minimize the number of distinct CacheItemHelper
// instances (e.g. don't vary by CachePtr)
template <class TValue, CacheEntryRole kRole>
class BasicTypedCacheHelper : public BasicTypedCacheHelperFns<TValue> {
public:
static const Cache::CacheItemHelper* GetBasicHelper() {
static const Cache::CacheItemHelper kHelper{kRole,
&BasicTypedCacheHelper::Delete};
return &kHelper;
}
};
// BasicTypedCacheInterface - Used for primary cache storage of objects of
// type TValue, which can be cleaned up with std::default_delete<TValue>. The
// role is provided by TValue::kCacheEntryRole or given in an optional
// template parameter.
template <class TValue, CacheEntryRole kRole = TValue::kCacheEntryRole,
typename CachePtr = Cache*>
class BasicTypedCacheInterface : public BaseCacheInterface<CachePtr>,
public BasicTypedCacheHelper<TValue, kRole> {
public:
CACHE_TYPE_DEFS();
using typename BasicTypedCacheHelperFns<TValue>::TValuePtr;
struct TypedHandle : public Handle {};
using BasicTypedCacheHelper<TValue, kRole>::GetBasicHelper;
// ctor
using BaseCacheInterface<CachePtr>::BaseCacheInterface;
struct TypedAsyncLookupHandle : public Cache::AsyncLookupHandle {
TypedHandle* Result() {
return reinterpret_cast<TypedHandle*>(Cache::AsyncLookupHandle::Result());
}
};
inline Status Insert(const Slice& key, TValuePtr value, size_t charge,
TypedHandle** handle = nullptr,
Priority priority = Priority::LOW) {
auto untyped_handle = reinterpret_cast<Handle**>(handle);
return this->cache_->Insert(
key, BasicTypedCacheHelperFns<TValue>::UpCastValue(value),
GetBasicHelper(), charge, untyped_handle, priority);
}
inline TypedHandle* Lookup(const Slice& key, Statistics* stats = nullptr) {
return reinterpret_cast<TypedHandle*>(
this->cache_->BasicLookup(key, stats));
}
inline void StartAsyncLookup(TypedAsyncLookupHandle& async_handle) {
assert(async_handle.helper == nullptr);
this->cache_->StartAsyncLookup(async_handle);
}
inline CacheHandleGuard<TValue> Guard(TypedHandle* handle) {
if (handle) {
return CacheHandleGuard<TValue>(&*this->cache_, handle);
} else {
return {};
}
}
inline std::shared_ptr<TValue> SharedGuard(TypedHandle* handle) {
if (handle) {
return MakeSharedCacheHandleGuard<TValue>(&*this->cache_, handle);
} else {
return {};
}
}
inline TValuePtr Value(TypedHandle* handle) {
return BasicTypedCacheHelperFns<TValue>::DownCastValue(
this->cache_->Value(handle));
}
};
// BasicTypedSharedCacheInterface - Like BasicTypedCacheInterface but with a
// shared_ptr<Cache> for keeping Cache alive.
template <class TValue, CacheEntryRole kRole = TValue::kCacheEntryRole>
using BasicTypedSharedCacheInterface =
BasicTypedCacheInterface<TValue, kRole, std::shared_ptr<Cache>>;
// TValue must implement ContentSlice() and ~TValue
// TCreateContext must implement Create(std::unique_ptr<TValue>*, ...)
template <class TValue, class TCreateContext>
class FullTypedCacheHelperFns : public BasicTypedCacheHelperFns<TValue> {
public:
CACHE_TYPE_DEFS();
protected:
using typename BasicTypedCacheHelperFns<TValue>::TValuePtr;
using BasicTypedCacheHelperFns<TValue>::DownCastValue;
using BasicTypedCacheHelperFns<TValue>::UpCastValue;
static size_t Size(ObjectPtr v) {
TValuePtr value = DownCastValue(v);
auto slice = value->ContentSlice();
return slice.size();
}
static Status SaveTo(ObjectPtr v, size_t from_offset, size_t length,
char* out) {
TValuePtr value = DownCastValue(v);
auto slice = value->ContentSlice();
assert(from_offset < slice.size());
assert(from_offset + length <= slice.size());
std::copy_n(slice.data() + from_offset, length, out);
return Status::OK();
}
static Status Create(const Slice& data, CreateContext* context,
MemoryAllocator* allocator, ObjectPtr* out_obj,
size_t* out_charge) {
std::unique_ptr<TValue> value = nullptr;
if constexpr (sizeof(TCreateContext) > 0) {
TCreateContext* tcontext = static_cast<TCreateContext*>(context);
tcontext->Create(&value, out_charge, data, allocator);
} else {
TCreateContext::Create(&value, out_charge, data, allocator);
}
*out_obj = UpCastValue(value.release());
return Status::OK();
}
};
// In its own class to try to minimize the number of distinct CacheItemHelper
// instances (e.g. don't vary by CachePtr)
template <class TValue, class TCreateContext, CacheEntryRole kRole>
class FullTypedCacheHelper
: public FullTypedCacheHelperFns<TValue, TCreateContext> {
public:
static const Cache::CacheItemHelper* GetFullHelper() {
static const Cache::CacheItemHelper kHelper{
kRole,
&FullTypedCacheHelper::Delete,
&FullTypedCacheHelper::Size,
&FullTypedCacheHelper::SaveTo,
&FullTypedCacheHelper::Create,
BasicTypedCacheHelper<TValue, kRole>::GetBasicHelper()};
return &kHelper;
}
};
// FullTypedCacheHelper - Used for secondary cache compatible storage of
// objects of type TValue. In addition to BasicTypedCacheInterface constraints,
// we require TValue::ContentSlice() to return persistable data. This
// simplifies usage for the normal case of simple secondary cache compatibility
// (can give you a Slice to the data already in memory). In addition to
// TCreateContext performing the role of Cache::CreateContext, it is also
// expected to provide a function Create(std::unique_ptr<TValue>* value,
// size_t* out_charge, const Slice& data, MemoryAllocator* allocator) for
// creating new TValue.
template <class TValue, class TCreateContext,
CacheEntryRole kRole = TValue::kCacheEntryRole,
typename CachePtr = Cache*>
class FullTypedCacheInterface
: public BasicTypedCacheInterface<TValue, kRole, CachePtr>,
public FullTypedCacheHelper<TValue, TCreateContext, kRole> {
public:
CACHE_TYPE_DEFS();
using typename BasicTypedCacheInterface<TValue, kRole, CachePtr>::TypedHandle;
using typename BasicTypedCacheInterface<TValue, kRole,
CachePtr>::TypedAsyncLookupHandle;
using typename BasicTypedCacheHelperFns<TValue>::TValuePtr;
using BasicTypedCacheHelper<TValue, kRole>::GetBasicHelper;
using FullTypedCacheHelper<TValue, TCreateContext, kRole>::GetFullHelper;
using BasicTypedCacheHelperFns<TValue>::UpCastValue;
using BasicTypedCacheHelperFns<TValue>::DownCastValue;
// ctor
using BasicTypedCacheInterface<TValue, kRole,
CachePtr>::BasicTypedCacheInterface;
// Insert with SecondaryCache compatibility (subject to CacheTier).
// (Basic Insert() also inherited.)
inline Status InsertFull(
const Slice& key, TValuePtr value, size_t charge,
TypedHandle** handle = nullptr, Priority priority = Priority::LOW,
CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier) {
auto untyped_handle = reinterpret_cast<Handle**>(handle);
auto helper = lowest_used_cache_tier == CacheTier::kNonVolatileBlockTier
? GetFullHelper()
: GetBasicHelper();
return this->cache_->Insert(key, UpCastValue(value), helper, charge,
untyped_handle, priority);
}
// Like SecondaryCache::InsertSaved, with SecondaryCache compatibility
// (subject to CacheTier).
inline Status InsertSaved(
const Slice& key, const Slice& data, TCreateContext* create_context,
Priority priority = Priority::LOW,
CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier,
size_t* out_charge = nullptr) {
ObjectPtr value;
size_t charge;
Status st = GetFullHelper()->create_cb(data, create_context,
this->cache_->memory_allocator(),
&value, &charge);
if (out_charge) {
*out_charge = charge;
}
if (st.ok()) {
st = InsertFull(key, DownCastValue(value), charge, nullptr /*handle*/,
priority, lowest_used_cache_tier);
} else {
GetFullHelper()->del_cb(value, this->cache_->memory_allocator());
}
return st;
}
// Lookup with SecondaryCache support (subject to CacheTier).
// (Basic Lookup() also inherited.)
inline TypedHandle* LookupFull(
const Slice& key, TCreateContext* create_context = nullptr,
Priority priority = Priority::LOW, Statistics* stats = nullptr,
CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier) {
if (lowest_used_cache_tier == CacheTier::kNonVolatileBlockTier) {
return reinterpret_cast<TypedHandle*>(this->cache_->Lookup(
key, GetFullHelper(), create_context, priority, stats));
} else {
return BasicTypedCacheInterface<TValue, kRole, CachePtr>::Lookup(key,
stats);
}
}
inline void StartAsyncLookupFull(
TypedAsyncLookupHandle& async_handle,
CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier) {
if (lowest_used_cache_tier == CacheTier::kNonVolatileBlockTier) {
async_handle.helper = GetFullHelper();
this->cache_->StartAsyncLookup(async_handle);
} else {
BasicTypedCacheInterface<TValue, kRole, CachePtr>::StartAsyncLookup(
async_handle);
}
}
};
// FullTypedSharedCacheInterface - Like FullTypedCacheInterface but with a
// shared_ptr<Cache> for keeping Cache alive.
template <class TValue, class TCreateContext,
CacheEntryRole kRole = TValue::kCacheEntryRole>
using FullTypedSharedCacheInterface =
FullTypedCacheInterface<TValue, TCreateContext, kRole,
std::shared_ptr<Cache>>;
#undef CACHE_TYPE_DEFS
} // namespace ROCKSDB_NAMESPACE

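The header comment at the top of the removed typed_cache.h describes three wrapper families. A hedged usage sketch of the basic one, built only from the declarations visible in this hunk; MyBlob and the capacity are invented for illustration, and error handling is reduced to a status check:

#include <memory>
#include <string>

#include "cache/typed_cache.h"
#include "rocksdb/cache.h"

namespace ROCKSDB_NAMESPACE {

struct MyBlob {
  std::string payload;
};

void TypedCacheSketch() {
  std::shared_ptr<Cache> cache = NewLRUCache(/*capacity=*/1 << 20);
  // Role given explicitly because MyBlob defines no kCacheEntryRole member.
  BasicTypedSharedCacheInterface<MyBlob, CacheEntryRole::kMisc> typed(cache);

  auto* value = new MyBlob{"hello"};
  decltype(typed)::TypedHandle* handle = nullptr;
  Status s =
      typed.Insert("key1", value, /*charge=*/value->payload.size(), &handle);
  if (s.ok() && handle != nullptr) {
    MyBlob* cached = typed.Value(handle);  // typed access, no casts needed
    (void)cached;
    typed.Release(handle);
  }
}

}  // namespace ROCKSDB_NAMESPACE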
@ -12,7 +12,7 @@ fi
ROOT=".."
# Fetch right version of gcov
if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
source $ROOT/build_tools/fbcode_config_platform010.sh
source $ROOT/build_tools/fbcode_config_platform009.sh
GCOV=$GCC_BASE/bin/gcov
else
GCOV=$(which gcov)

@ -21,8 +21,6 @@ CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) --
blackbox_crash_test_with_multiops_wp_txn \
crash_test_with_tiered_storage blackbox_crash_test_with_tiered_storage \
whitebox_crash_test_with_tiered_storage \
whitebox_crash_test_with_optimistic_txn \
blackbox_crash_test_with_optimistic_txn \
crash_test: $(DB_STRESS_CMD)
# Do not parallelize
@ -39,11 +37,6 @@ crash_test_with_txn: $(DB_STRESS_CMD)
$(CRASHTEST_MAKE) whitebox_crash_test_with_txn
$(CRASHTEST_MAKE) blackbox_crash_test_with_txn
crash_test_with_optimistic_txn: $(DB_STRESS_CMD)
# Do not parallelize
$(CRASHTEST_MAKE) whitebox_crash_test_with_optimistic_txn
$(CRASHTEST_MAKE) blackbox_crash_test_with_optimistic_txn
crash_test_with_best_efforts_recovery: blackbox_crash_test_with_best_efforts_recovery
crash_test_with_ts: $(DB_STRESS_CMD)
@ -87,9 +80,6 @@ blackbox_crash_test_with_multiops_wp_txn: $(DB_STRESS_CMD)
blackbox_crash_test_with_tiered_storage: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --test_tiered_storage blackbox $(CRASH_TEST_EXT_ARGS)
blackbox_crash_test_with_optimistic_txn: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --optimistic_txn blackbox $(CRASH_TEST_EXT_ARGS)
ifeq ($(CRASH_TEST_KILL_ODD),)
CRASH_TEST_KILL_ODD=888887
endif
@ -115,7 +105,3 @@ whitebox_crash_test_with_ts: $(DB_STRESS_CMD)
whitebox_crash_test_with_tiered_storage: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --test_tiered_storage whitebox --random_kill_odd \
$(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
whitebox_crash_test_with_optimistic_txn: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --optimistic_txn whitebox --random_kill_odd \
$(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)

@ -47,11 +47,6 @@ void ArenaWrappedDBIter::Init(
read_options_ = read_options;
allow_refresh_ = allow_refresh;
memtable_range_tombstone_iter_ = nullptr;
if (!CheckFSFeatureSupport(env->GetFileSystem().get(),
FSSupportedOps::kAsyncIO)) {
read_options_.async_io = false;
}
}
Status ArenaWrappedDBIter::Refresh() {

@ -13,6 +13,12 @@
namespace ROCKSDB_NAMESPACE {
std::unique_ptr<BlobContents> BlobContents::Create(
CacheAllocationPtr&& allocation, size_t size) {
return std::unique_ptr<BlobContents>(
new BlobContents(std::move(allocation), size));
}
size_t BlobContents::ApproximateMemoryUsage() const {
size_t usage = 0;
@ -39,4 +45,46 @@ size_t BlobContents::ApproximateMemoryUsage() const {
return usage;
}
size_t BlobContents::SizeCallback(void* obj) {
assert(obj);
return static_cast<const BlobContents*>(obj)->size();
}
Status BlobContents::SaveToCallback(void* from_obj, size_t from_offset,
size_t length, void* out) {
assert(from_obj);
const BlobContents* buf = static_cast<const BlobContents*>(from_obj);
assert(buf->size() >= from_offset + length);
memcpy(out, buf->data().data() + from_offset, length);
return Status::OK();
}
Cache::CacheItemHelper* BlobContents::GetCacheItemHelper() {
static Cache::CacheItemHelper cache_helper(
&SizeCallback, &SaveToCallback,
GetCacheEntryDeleterForRole<BlobContents, CacheEntryRole::kBlobValue>());
return &cache_helper;
}
Status BlobContents::CreateCallback(CacheAllocationPtr&& allocation,
const void* buf, size_t size,
void** out_obj, size_t* charge) {
assert(allocation);
memcpy(allocation.get(), buf, size);
std::unique_ptr<BlobContents> obj = Create(std::move(allocation), size);
BlobContents* const contents = obj.release();
*out_obj = contents;
*charge = contents->ApproximateMemoryUsage();
return Status::OK();
}
} // namespace ROCKSDB_NAMESPACE

@ -7,8 +7,8 @@
#include <memory>
#include "memory/memory_allocator_impl.h"
#include "rocksdb/advanced_cache.h"
#include "memory/memory_allocator.h"
#include "rocksdb/cache.h"
#include "rocksdb/rocksdb_namespace.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
@ -18,8 +18,8 @@ namespace ROCKSDB_NAMESPACE {
// A class representing a single uncompressed value read from a blob file.
class BlobContents {
public:
BlobContents(CacheAllocationPtr&& allocation, size_t size)
: allocation_(std::move(allocation)), data_(allocation_.get(), size) {}
static std::unique_ptr<BlobContents> Create(CacheAllocationPtr&& allocation,
size_t size);
BlobContents(const BlobContents&) = delete;
BlobContents& operator=(const BlobContents&) = delete;
@ -34,26 +34,23 @@ class BlobContents {
size_t ApproximateMemoryUsage() const;
// For TypedCacheInterface
const Slice& ContentSlice() const { return data_; }
static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kBlobValue;
// Callbacks for secondary cache
static size_t SizeCallback(void* obj);
static Status SaveToCallback(void* from_obj, size_t from_offset,
size_t length, void* out);
static Cache::CacheItemHelper* GetCacheItemHelper();
static Status CreateCallback(CacheAllocationPtr&& allocation, const void* buf,
size_t size, void** out_obj, size_t* charge);
private:
BlobContents(CacheAllocationPtr&& allocation, size_t size)
: allocation_(std::move(allocation)), data_(allocation_.get(), size) {}
CacheAllocationPtr allocation_;
Slice data_;
};
class BlobContentsCreator : public Cache::CreateContext {
public:
static void Create(std::unique_ptr<BlobContents>* out, size_t* out_charge,
const Slice& contents, MemoryAllocator* alloc) {
auto raw = new BlobContents(AllocateAndCopyBlock(contents, alloc),
contents.size());
out->reset(raw);
if (out_charge) {
*out_charge = raw->ApproximateMemoryUsage();
}
}
};
} // namespace ROCKSDB_NAMESPACE

@ -134,6 +134,13 @@ class BlobCountingIterator : public InternalIterator {
if (!iter_->Valid()) {
status_ = iter_->status();
return;
} else if (iter_->IsDeleteRangeSentinelKey()) {
// CompactionMergingIterator emits range tombstones, and range tombstone
// keys can be truncated at file boundaries. This means the range
// tombstone keys can have op_type kTypeBlobIndex.
// This could crash the ProcessInFlow() call below since
// value is empty for these keys.
return;
}
TEST_SYNC_POINT(

@ -13,7 +13,6 @@
#include "db/blob/blob_index.h"
#include "db/blob/blob_log_format.h"
#include "db/blob/blob_log_writer.h"
#include "db/blob/blob_source.h"
#include "db/event_helpers.h"
#include "db/version_set.h"
#include "file/filename.h"
@ -259,7 +258,6 @@ Status BlobFileBuilder::CompressBlobIfNeeded(
return Status::OK();
}
// TODO: allow user CompressionOptions, including max_compressed_bytes_per_kb
CompressionOptions opts;
CompressionContext context(blob_compression_type_);
constexpr uint64_t sample_for_compression = 0;
@ -395,7 +393,7 @@ Status BlobFileBuilder::PutBlobIntoCacheIfNeeded(const Slice& blob,
uint64_t blob_offset) const {
Status s = Status::OK();
BlobSource::SharedCacheInterface blob_cache{immutable_options_->blob_cache};
auto blob_cache = immutable_options_->blob_cache;
auto statistics = immutable_options_->statistics.get();
bool warm_cache =
prepopulate_blob_cache_ == PrepopulateBlobCache::kFlushOnly &&
@ -409,12 +407,34 @@ Status BlobFileBuilder::PutBlobIntoCacheIfNeeded(const Slice& blob,
const Cache::Priority priority = Cache::Priority::BOTTOM;
s = blob_cache.InsertSaved(key, blob, nullptr /*context*/, priority,
immutable_options_->lowest_used_cache_tier);
// Objects to be put into the cache have to be heap-allocated and
// self-contained, i.e. own their contents. The Cache has to be able to
// take unique ownership of them.
CacheAllocationPtr allocation =
AllocateBlock(blob.size(), blob_cache->memory_allocator());
memcpy(allocation.get(), blob.data(), blob.size());
std::unique_ptr<BlobContents> buf =
BlobContents::Create(std::move(allocation), blob.size());
Cache::CacheItemHelper* const cache_item_helper =
BlobContents::GetCacheItemHelper();
assert(cache_item_helper);
if (immutable_options_->lowest_used_cache_tier ==
CacheTier::kNonVolatileBlockTier) {
s = blob_cache->Insert(key, buf.get(), cache_item_helper,
buf->ApproximateMemoryUsage(),
nullptr /* cache_handle */, priority);
} else {
s = blob_cache->Insert(key, buf.get(), buf->ApproximateMemoryUsage(),
cache_item_helper->del_cb,
nullptr /* cache_handle */, priority);
}
if (s.ok()) {
RecordTick(statistics, BLOB_DB_CACHE_ADD);
RecordTick(statistics, BLOB_DB_CACHE_BYTES_WRITE, blob.size());
RecordTick(statistics, BLOB_DB_CACHE_BYTES_WRITE, buf->size());
buf.release();
} else {
RecordTick(statistics, BLOB_DB_CACHE_ADD_FAILURES);
}

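The comment restored in this hunk states the invariant behind the copy: anything handed to the cache must be heap-allocated and own its contents, since the cache takes unique ownership. A minimal stand-alone illustration of that copy step, with std::string standing in for the CacheAllocationPtr/BlobContents pair:

#include <cstddef>
#include <memory>
#include <string>

// Non-owning view, standing in for Slice.
struct View {
  const char* data;
  std::size_t size;
};

// "Self-contained copy before caching": the returned object owns its bytes,
// so a cache could take unique ownership and outlive the original buffer.
std::unique_ptr<std::string> MakeOwnedCopy(const View& v) {
  auto owned = std::make_unique<std::string>();
  owned->assign(v.data, v.size);  // deep copy; nothing points back into v
  return owned;
}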
@ -25,7 +25,7 @@ BlobFileCache::BlobFileCache(Cache* cache,
HistogramImpl* blob_file_read_hist,
const std::shared_ptr<IOTracer>& io_tracer)
: cache_(cache),
mutex_(kNumberOfMutexStripes),
mutex_(kNumberOfMutexStripes, kGetSliceNPHash64UnseededFnPtr),
immutable_options_(immutable_options),
file_options_(file_options),
column_family_id_(column_family_id),
@ -37,29 +37,29 @@ BlobFileCache::BlobFileCache(Cache* cache,
}
Status BlobFileCache::GetBlobFileReader(
const ReadOptions& read_options, uint64_t blob_file_number,
uint64_t blob_file_number,
CacheHandleGuard<BlobFileReader>* blob_file_reader) {
assert(blob_file_reader);
assert(blob_file_reader->IsEmpty());
const Slice key = GetSliceForKey(&blob_file_number);
const Slice key = GetSlice(&blob_file_number);
assert(cache_);
TypedHandle* handle = cache_.Lookup(key);
Cache::Handle* handle = cache_->Lookup(key);
if (handle) {
*blob_file_reader = cache_.Guard(handle);
*blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
return Status::OK();
}
TEST_SYNC_POINT("BlobFileCache::GetBlobFileReader:DoubleCheck");
// Check again while holding mutex
MutexLock lock(&mutex_.Get(key));
MutexLock lock(mutex_.get(key));
handle = cache_.Lookup(key);
handle = cache_->Lookup(key);
if (handle) {
*blob_file_reader = cache_.Guard(handle);
*blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
return Status::OK();
}
@ -73,7 +73,7 @@ Status BlobFileCache::GetBlobFileReader(
{
assert(file_options_);
const Status s = BlobFileReader::Create(
*immutable_options_, read_options, *file_options_, column_family_id_,
*immutable_options_, *file_options_, column_family_id_,
blob_file_read_hist_, blob_file_number, io_tracer_, &reader);
if (!s.ok()) {
RecordTick(statistics, NO_FILE_ERRORS);
@ -84,7 +84,8 @@ Status BlobFileCache::GetBlobFileReader(
{
constexpr size_t charge = 1;
const Status s = cache_.Insert(key, reader.get(), charge, &handle);
const Status s = cache_->Insert(key, reader.get(), charge,
&DeleteCacheEntry<BlobFileReader>, &handle);
if (!s.ok()) {
RecordTick(statistics, NO_FILE_ERRORS);
return s;
@ -93,7 +94,7 @@ Status BlobFileCache::GetBlobFileReader(
reader.release();
*blob_file_reader = cache_.Guard(handle);
*blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
return Status::OK();
}

@ -7,8 +7,7 @@
#include <cinttypes>
#include "cache/typed_cache.h"
#include "db/blob/blob_file_reader.h"
#include "cache/cache_helpers.h"
#include "rocksdb/rocksdb_namespace.h"
#include "util/mutexlock.h"
@ -19,6 +18,7 @@ struct ImmutableOptions;
struct FileOptions;
class HistogramImpl;
class Status;
class BlobFileReader;
class Slice;
class IOTracer;
@ -32,18 +32,14 @@ class BlobFileCache {
BlobFileCache(const BlobFileCache&) = delete;
BlobFileCache& operator=(const BlobFileCache&) = delete;
Status GetBlobFileReader(const ReadOptions& read_options,
uint64_t blob_file_number,
Status GetBlobFileReader(uint64_t blob_file_number,
CacheHandleGuard<BlobFileReader>* blob_file_reader);
private:
using CacheInterface =
BasicTypedCacheInterface<BlobFileReader, CacheEntryRole::kMisc>;
using TypedHandle = CacheInterface::TypedHandle;
CacheInterface cache_;
Cache* cache_;
// Note: mutex_ below is used to guard against multiple threads racing to open
// the same file.
Striped<CacheAlignedWrapper<port::Mutex>> mutex_;
Striped<port::Mutex, Slice> mutex_;
const ImmutableOptions* immutable_options_;
const FileOptions* file_options_;
uint32_t column_family_id_;

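The GetBlobFileReader changes above keep the same double-checked shape on both sides of the diff: look up without the open lock, take the per-key mutex stripe, look up again, and only then open and insert the reader. A generic sketch of that pattern with invented names (ReaderRegistry, and a plain std::string standing in for the reader):

#include <cstdint>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

// Double-checked fill, as in the blob file cache above: the re-check under
// open_mu_ ensures only one thread pays for opening a given file.
class ReaderRegistry {
 public:
  using Reader = std::string;

  std::shared_ptr<Reader> GetOrOpen(uint64_t file_number) {
    if (auto r = Find(file_number)) return r;  // first check, no open lock
    std::lock_guard<std::mutex> open_guard(open_mu_);
    if (auto r = Find(file_number)) return r;  // re-check while holding it
    auto r = std::make_shared<Reader>("reader for file #" +
                                      std::to_string(file_number));
    std::lock_guard<std::mutex> map_guard(map_mu_);
    readers_.emplace(file_number, r);
    return r;
  }

 private:
  std::shared_ptr<Reader> Find(uint64_t file_number) {
    std::lock_guard<std::mutex> map_guard(map_mu_);
    auto it = readers_.find(file_number);
    return it == readers_.end() ? nullptr : it->second;
  }

  std::mutex open_mu_;  // corresponds to one stripe of mutex_ above
  std::mutex map_mu_;   // stands in for the cache's own synchronization
  std::unordered_map<uint64_t, std::shared_ptr<Reader>> readers_;
};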
@ -118,9 +118,7 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader) {
// First try: reader should be opened and put in cache
CacheHandleGuard<BlobFileReader> first;
const ReadOptions read_options;
ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, blob_file_number,
&first));
ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first));
ASSERT_NE(first.GetValue(), nullptr);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
@ -128,8 +126,7 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader) {
// Second try: reader should be served from cache
CacheHandleGuard<BlobFileReader> second;
ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, blob_file_number,
&second));
ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second));
ASSERT_NE(second.GetValue(), nullptr);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
@ -166,21 +163,19 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader_Race) {
CacheHandleGuard<BlobFileReader> first;
CacheHandleGuard<BlobFileReader> second;
const ReadOptions read_options;
SyncPoint::GetInstance()->SetCallBack(
"BlobFileCache::GetBlobFileReader:DoubleCheck", [&](void* /* arg */) {
// Disabling sync points to prevent infinite recursion
SyncPoint::GetInstance()->DisableProcessing();
ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options,
blob_file_number, &second));
ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second));
ASSERT_NE(second.GetValue(), nullptr);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
});
SyncPoint::GetInstance()->EnableProcessing();
ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, blob_file_number,
&first));
ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first));
ASSERT_NE(first.GetValue(), nullptr);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
@ -218,10 +213,8 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader_IOError) {
CacheHandleGuard<BlobFileReader> reader;
const ReadOptions read_options;
ASSERT_TRUE(
blob_file_cache.GetBlobFileReader(read_options, blob_file_number, &reader)
.IsIOError());
blob_file_cache.GetBlobFileReader(blob_file_number, &reader).IsIOError());
ASSERT_EQ(reader.GetValue(), nullptr);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1);
@ -260,10 +253,8 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader_CacheFull) {
// strict_capacity_limit is set
CacheHandleGuard<BlobFileReader> reader;
const ReadOptions read_options;
ASSERT_TRUE(
blob_file_cache.GetBlobFileReader(read_options, blob_file_number, &reader)
.IsMemoryLimit());
ASSERT_TRUE(blob_file_cache.GetBlobFileReader(blob_file_number, &reader)
.IsMemoryLimit());
ASSERT_EQ(reader.GetValue(), nullptr);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1);

@ -23,19 +23,32 @@ class BlobFileCompletionCallback {
const std::vector<std::shared_ptr<EventListener>>& listeners,
const std::string& dbname)
: event_logger_(event_logger), listeners_(listeners), dbname_(dbname) {
#ifndef ROCKSDB_LITE
sst_file_manager_ = sst_file_manager;
mutex_ = mutex;
error_handler_ = error_handler;
#else
(void)sst_file_manager;
(void)mutex;
(void)error_handler;
#endif // ROCKSDB_LITE
}
void OnBlobFileCreationStarted(const std::string& file_name,
const std::string& column_family_name,
int job_id,
BlobFileCreationReason creation_reason) {
#ifndef ROCKSDB_LITE
// Notify the listeners.
EventHelpers::NotifyBlobFileCreationStarted(listeners_, dbname_,
column_family_name, file_name,
job_id, creation_reason);
#else
(void)file_name;
(void)column_family_name;
(void)job_id;
(void)creation_reason;
#endif
}
Status OnBlobFileCompleted(const std::string& file_name,
@ -48,6 +61,7 @@ class BlobFileCompletionCallback {
uint64_t blob_count, uint64_t blob_bytes) {
Status s;
#ifndef ROCKSDB_LITE
auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager_);
if (sfm) {
// Report new blob files to SstFileManagerImpl
@ -60,6 +74,7 @@ class BlobFileCompletionCallback {
error_handler_->SetBGError(s, BackgroundErrorReason::kFlush);
}
}
#endif // !ROCKSDB_LITE
// Notify the listeners.
EventHelpers::LogAndNotifyBlobFileCreationFinished(
@ -74,9 +89,11 @@ class BlobFileCompletionCallback {
}
private:
#ifndef ROCKSDB_LITE
SstFileManager* sst_file_manager_;
InstrumentedMutex* mutex_;
ErrorHandler* error_handler_;
#endif // ROCKSDB_LITE
EventLogger* event_logger_;
std::vector<std::shared_ptr<EventListener>> listeners_;
std::string dbname_;

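The restored ROCKSDB_LITE guards above stub out the listener and SstFileManager plumbing in the lite build, with (void) casts to silence unused-parameter warnings. A tiny self-contained version of the same guard-and-stub pattern, using an invented MYLIB_LITE macro:

#include <iostream>
#include <string>

// Same pattern as the ROCKSDB_LITE blocks above: the full build notifies,
// the lite build compiles the function down to a no-op.
void NotifyFileCreated(const std::string& file_name, int job_id) {
#ifndef MYLIB_LITE
  std::cout << "created " << file_name << " (job " << job_id << ")\n";
#else
  (void)file_name;  // suppress unused-parameter warnings in the lite build
  (void)job_id;
#endif
}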
@ -12,7 +12,7 @@
#include "db/blob/blob_log_format.h"
#include "file/file_prefetch_buffer.h"
#include "file/filename.h"
#include "monitoring/statistics_impl.h"
#include "monitoring/statistics.h"
#include "options/cf_options.h"
#include "rocksdb/file_system.h"
#include "rocksdb/slice.h"
@ -26,10 +26,9 @@
namespace ROCKSDB_NAMESPACE {
Status BlobFileReader::Create(
const ImmutableOptions& immutable_options, const ReadOptions& read_options,
const FileOptions& file_options, uint32_t column_family_id,
HistogramImpl* blob_file_read_hist, uint64_t blob_file_number,
const std::shared_ptr<IOTracer>& io_tracer,
const ImmutableOptions& immutable_options, const FileOptions& file_options,
uint32_t column_family_id, HistogramImpl* blob_file_read_hist,
uint64_t blob_file_number, const std::shared_ptr<IOTracer>& io_tracer,
std::unique_ptr<BlobFileReader>* blob_file_reader) {
assert(blob_file_reader);
assert(!*blob_file_reader);
@ -53,17 +52,15 @@ Status BlobFileReader::Create(
CompressionType compression_type = kNoCompression;
{
const Status s =
ReadHeader(file_reader.get(), read_options, column_family_id,
statistics, &compression_type);
const Status s = ReadHeader(file_reader.get(), column_family_id, statistics,
&compression_type);
if (!s.ok()) {
return s;
}
}
{
const Status s =
ReadFooter(file_reader.get(), read_options, file_size, statistics);
const Status s = ReadFooter(file_reader.get(), file_size, statistics);
if (!s.ok()) {
return s;
}
@ -137,7 +134,6 @@ Status BlobFileReader::OpenFile(
}
Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader,
const ReadOptions& read_options,
uint32_t column_family_id,
Statistics* statistics,
CompressionType* compression_type) {
@ -155,10 +151,9 @@ Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader,
constexpr size_t read_size = BlobLogHeader::kSize;
// TODO: rate limit reading headers from blob files.
const Status s =
ReadFromFile(file_reader, read_options, read_offset, read_size,
statistics, &header_slice, &buf, &aligned_buf,
Env::IO_TOTAL /* rate_limiter_priority */);
const Status s = ReadFromFile(file_reader, read_offset, read_size,
statistics, &header_slice, &buf, &aligned_buf,
Env::IO_TOTAL /* rate_limiter_priority */);
if (!s.ok()) {
return s;
}
@ -192,7 +187,6 @@ Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader,
}
Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader,
const ReadOptions& read_options,
uint64_t file_size, Statistics* statistics) {
assert(file_size >= BlobLogHeader::kSize + BlobLogFooter::kSize);
assert(file_reader);
@ -208,10 +202,9 @@ Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader,
constexpr size_t read_size = BlobLogFooter::kSize;
// TODO: rate limit reading footers from blob files.
const Status s =
ReadFromFile(file_reader, read_options, read_offset, read_size,
statistics, &footer_slice, &buf, &aligned_buf,
Env::IO_TOTAL /* rate_limiter_priority */);
const Status s = ReadFromFile(file_reader, read_offset, read_size,
statistics, &footer_slice, &buf, &aligned_buf,
Env::IO_TOTAL /* rate_limiter_priority */);
if (!s.ok()) {
return s;
}
@ -239,7 +232,6 @@ Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader,
}
Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader,
const ReadOptions& read_options,
uint64_t read_offset, size_t read_size,
Statistics* statistics, Slice* slice,
Buffer* buf, AlignedBuf* aligned_buf,
@ -254,23 +246,17 @@ Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader,
Status s;
IOOptions io_options;
s = file_reader->PrepareIOOptions(read_options, io_options);
if (!s.ok()) {
return s;
}
if (file_reader->use_direct_io()) {
constexpr char* scratch = nullptr;
s = file_reader->Read(io_options, read_offset, read_size, slice, scratch,
s = file_reader->Read(IOOptions(), read_offset, read_size, slice, scratch,
aligned_buf, rate_limiter_priority);
} else {
buf->reset(new char[read_size]);
constexpr AlignedBuf* aligned_scratch = nullptr;
s = file_reader->Read(io_options, read_offset, read_size, slice, buf->get(),
aligned_scratch, rate_limiter_priority);
s = file_reader->Read(IOOptions(), read_offset, read_size, slice,
buf->get(), aligned_scratch, rate_limiter_priority);
}
if (!s.ok()) {
@ -338,13 +324,8 @@ Status BlobFileReader::GetBlob(
Status s;
constexpr bool for_compaction = true;
IOOptions io_options;
s = file_reader_->PrepareIOOptions(read_options, io_options);
if (!s.ok()) {
return s;
}
prefetched = prefetch_buffer->TryReadFromCache(
io_options, file_reader_.get(), record_offset,
IOOptions(), file_reader_.get(), record_offset,
static_cast<size_t>(record_size), &record_slice, &s,
read_options.rate_limiter_priority, for_compaction);
if (!s.ok()) {
@ -357,10 +338,10 @@ Status BlobFileReader::GetBlob(
PERF_COUNTER_ADD(blob_read_count, 1);
PERF_COUNTER_ADD(blob_read_byte, record_size);
PERF_TIMER_GUARD(blob_read_time);
const Status s = ReadFromFile(
file_reader_.get(), read_options, record_offset,
static_cast<size_t>(record_size), statistics_, &record_slice, &buf,
&aligned_buf, read_options.rate_limiter_priority);
const Status s = ReadFromFile(file_reader_.get(), record_offset,
static_cast<size_t>(record_size), statistics_,
&record_slice, &buf, &aligned_buf,
read_options.rate_limiter_priority);
if (!s.ok()) {
return s;
}
@ -439,11 +420,11 @@ void BlobFileReader::MultiGetBlob(
assert(req->offset >= adjustment);
adjustments.push_back(adjustment);
FSReadRequest read_req;
FSReadRequest read_req = {};
read_req.offset = req->offset - adjustment;
read_req.len = req->len + adjustment;
read_reqs.emplace_back(read_req);
total_len += read_req.len;
read_reqs.emplace_back(std::move(read_req));
}
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, total_len);
@ -588,7 +569,12 @@ Status BlobFileReader::UncompressBlobIfNeeded(
assert(result);
if (compression_type == kNoCompression) {
BlobContentsCreator::Create(result, nullptr, value_slice, allocator);
CacheAllocationPtr allocation =
AllocateBlock(value_slice.size(), allocator);
memcpy(allocation.get(), value_slice.data(), value_slice.size());
*result = BlobContents::Create(std::move(allocation), value_slice.size());
return Status::OK();
}
@ -616,7 +602,7 @@ Status BlobFileReader::UncompressBlobIfNeeded(
return Status::Corruption("Unable to uncompress blob");
}
result->reset(new BlobContents(std::move(output), uncompressed_size));
*result = BlobContents::Create(std::move(output), uncompressed_size);
return Status::OK();
}

@ -29,7 +29,6 @@ class Statistics;
class BlobFileReader {
public:
static Status Create(const ImmutableOptions& immutable_options,
const ReadOptions& read_options,
const FileOptions& file_options,
uint32_t column_family_id,
HistogramImpl* blob_file_read_hist,
@ -75,18 +74,15 @@ class BlobFileReader {
std::unique_ptr<RandomAccessFileReader>* file_reader);
static Status ReadHeader(const RandomAccessFileReader* file_reader,
const ReadOptions& read_options,
uint32_t column_family_id, Statistics* statistics,
CompressionType* compression_type);
static Status ReadFooter(const RandomAccessFileReader* file_reader,
const ReadOptions& read_options, uint64_t file_size,
Statistics* statistics);
uint64_t file_size, Statistics* statistics);
using Buffer = std::unique_ptr<char[]>;
static Status ReadFromFile(const RandomAccessFileReader* file_reader,
const ReadOptions& read_options,
uint64_t read_offset, size_t read_size,
Statistics* statistics, Slice* slice, Buffer* buf,
AlignedBuf* aligned_buf,

@ -172,12 +172,12 @@ TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) {
std::unique_ptr<BlobFileReader> reader;
ReadOptions read_options;
ASSERT_OK(BlobFileReader::Create(
immutable_options, read_options, FileOptions(), column_family_id,
blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader));
immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
blob_file_number, nullptr /*IOTracer*/, &reader));
// Make sure the blob can be retrieved with and without checksum verification
ReadOptions read_options;
read_options.verify_checksums = false;
constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
@ -479,11 +479,11 @@ TEST_F(BlobFileReaderTest, Malformed) {
constexpr HistogramImpl* blob_file_read_hist = nullptr;
std::unique_ptr<BlobFileReader> reader;
const ReadOptions read_options;
ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options,
FileOptions(), column_family_id,
blob_file_read_hist, blob_file_number,
nullptr /*IOTracer*/, &reader)
ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
column_family_id, blob_file_read_hist,
blob_file_number, nullptr /*IOTracer*/,
&reader)
.IsCorruption());
}
@ -513,11 +513,11 @@ TEST_F(BlobFileReaderTest, TTL) {
constexpr HistogramImpl* blob_file_read_hist = nullptr;
std::unique_ptr<BlobFileReader> reader;
const ReadOptions read_options;
ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options,
FileOptions(), column_family_id,
blob_file_read_hist, blob_file_number,
nullptr /*IOTracer*/, &reader)
ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
column_family_id, blob_file_read_hist,
blob_file_number, nullptr /*IOTracer*/,
&reader)
.IsCorruption());
}
@ -552,11 +552,11 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInHeader) {
constexpr HistogramImpl* blob_file_read_hist = nullptr;
std::unique_ptr<BlobFileReader> reader;
const ReadOptions read_options;
ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options,
FileOptions(), column_family_id,
blob_file_read_hist, blob_file_number,
nullptr /*IOTracer*/, &reader)
ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
column_family_id, blob_file_read_hist,
blob_file_number, nullptr /*IOTracer*/,
&reader)
.IsCorruption());
}
@ -591,11 +591,11 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInFooter) {
constexpr HistogramImpl* blob_file_read_hist = nullptr;
std::unique_ptr<BlobFileReader> reader;
const ReadOptions read_options;
ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options,
FileOptions(), column_family_id,
blob_file_read_hist, blob_file_number,
nullptr /*IOTracer*/, &reader)
ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
column_family_id, blob_file_read_hist,
blob_file_number, nullptr /*IOTracer*/,
&reader)
.IsCorruption());
}
@ -629,9 +629,9 @@ TEST_F(BlobFileReaderTest, IncorrectColumnFamily) {
std::unique_ptr<BlobFileReader> reader;
constexpr uint32_t incorrect_column_family_id = 2;
const ReadOptions read_options;
ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options,
FileOptions(), incorrect_column_family_id,
ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
incorrect_column_family_id,
blob_file_read_hist, blob_file_number,
nullptr /*IOTracer*/, &reader)
.IsCorruption());
@ -664,10 +664,10 @@ TEST_F(BlobFileReaderTest, BlobCRCError) {
constexpr HistogramImpl* blob_file_read_hist = nullptr;
std::unique_ptr<BlobFileReader> reader;
const ReadOptions read_options;
ASSERT_OK(BlobFileReader::Create(
immutable_options, read_options, FileOptions(), column_family_id,
blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader));
immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
blob_file_number, nullptr /*IOTracer*/, &reader));
SyncPoint::GetInstance()->SetCallBack(
"BlobFileReader::VerifyBlob:CheckBlobCRC", [](void* arg) {
@ -728,12 +728,13 @@ TEST_F(BlobFileReaderTest, Compression) {
constexpr HistogramImpl* blob_file_read_hist = nullptr;
std::unique_ptr<BlobFileReader> reader;
ReadOptions read_options;
ASSERT_OK(BlobFileReader::Create(
immutable_options, read_options, FileOptions(), column_family_id,
blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader));
immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
blob_file_number, nullptr /*IOTracer*/, &reader));
// Make sure the blob can be retrieved with and without checksum verification
ReadOptions read_options;
read_options.verify_checksums = false;
constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
@ -802,10 +803,10 @@ TEST_F(BlobFileReaderTest, UncompressionError) {
constexpr HistogramImpl* blob_file_read_hist = nullptr;
std::unique_ptr<BlobFileReader> reader;
const ReadOptions read_options;
ASSERT_OK(BlobFileReader::Create(
immutable_options, read_options, FileOptions(), column_family_id,
blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader));
immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
blob_file_number, nullptr /*IOTracer*/, &reader));
SyncPoint::GetInstance()->SetCallBack(
"BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", [](void* arg) {
@ -894,10 +895,10 @@ TEST_P(BlobFileReaderIOErrorTest, IOError) {
constexpr HistogramImpl* blob_file_read_hist = nullptr;
std::unique_ptr<BlobFileReader> reader;
const ReadOptions read_options;
const Status s = BlobFileReader::Create(
immutable_options, read_options, FileOptions(), column_family_id,
blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader);
immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
blob_file_number, nullptr /*IOTracer*/, &reader);
const bool fail_during_create =
(sync_point_ != "BlobFileReader::GetBlob:ReadFromFile");
@ -982,10 +983,10 @@ TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) {
constexpr HistogramImpl* blob_file_read_hist = nullptr;
std::unique_ptr<BlobFileReader> reader;
const ReadOptions read_options;
const Status s = BlobFileReader::Create(
immutable_options, read_options, FileOptions(), column_family_id,
blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader);
immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
blob_file_number, nullptr /*IOTracer*/, &reader);
const bool fail_during_create =
sync_point_ != "BlobFileReader::GetBlob:TamperWithResult";

@ -7,7 +7,7 @@
#include "db/blob/blob_log_sequential_reader.h"
#include "file/random_access_file_reader.h"
#include "monitoring/statistics_impl.h"
#include "monitoring/statistics.h"
#include "util/stop_watch.h"
namespace ROCKSDB_NAMESPACE {

@ -10,7 +10,7 @@
#include "db/blob/blob_log_format.h"
#include "file/writable_file_writer.h"
#include "monitoring/statistics_impl.h"
#include "monitoring/statistics.h"
#include "rocksdb/system_clock.h"
#include "test_util/sync_point.h"
#include "util/coding.h"

@ -13,7 +13,7 @@
#include "db/blob/blob_contents.h"
#include "db/blob/blob_file_reader.h"
#include "db/blob/blob_log_format.h"
#include "monitoring/statistics_impl.h"
#include "monitoring/statistics.h"
#include "options/cf_options.h"
#include "table/get_context.h"
#include "table/multiget_context.h"
@ -30,14 +30,16 @@ BlobSource::BlobSource(const ImmutableOptions* immutable_options,
blob_file_cache_(blob_file_cache),
blob_cache_(immutable_options->blob_cache),
lowest_used_cache_tier_(immutable_options->lowest_used_cache_tier) {
#ifndef ROCKSDB_LITE
auto bbto =
immutable_options->table_factory->GetOptions<BlockBasedTableOptions>();
if (bbto &&
bbto->cache_usage_options.options_overrides.at(CacheEntryRole::kBlobCache)
.charged == CacheEntryRoleOptions::Decision::kEnabled) {
blob_cache_ = SharedCacheInterface{std::make_shared<ChargedCache>(
immutable_options->blob_cache, bbto->block_cache)};
blob_cache_ = std::make_shared<ChargedCache>(immutable_options->blob_cache,
bbto->block_cache);
}
#endif // ROCKSDB_LITE
}
BlobSource::~BlobSource() = default;
@ -80,8 +82,9 @@ Status BlobSource::PutBlobIntoCache(
assert(cached_blob);
assert(cached_blob->IsEmpty());
TypedHandle* cache_handle = nullptr;
Cache::Handle* cache_handle = nullptr;
const Status s = InsertEntryIntoCache(cache_key, blob->get(),
(*blob)->ApproximateMemoryUsage(),
&cache_handle, Cache::Priority::BOTTOM);
if (s.ok()) {
blob->release();
@ -103,10 +106,26 @@ Status BlobSource::PutBlobIntoCache(
return s;
}
BlobSource::TypedHandle* BlobSource::GetEntryFromCache(const Slice& key) const {
return blob_cache_.LookupFull(key, nullptr /* context */,
Cache::Priority::BOTTOM, statistics_,
lowest_used_cache_tier_);
Cache::Handle* BlobSource::GetEntryFromCache(const Slice& key) const {
Cache::Handle* cache_handle = nullptr;
if (lowest_used_cache_tier_ == CacheTier::kNonVolatileBlockTier) {
Cache::CreateCallback create_cb =
[allocator = blob_cache_->memory_allocator()](
const void* buf, size_t size, void** out_obj,
size_t* charge) -> Status {
return BlobContents::CreateCallback(AllocateBlock(size, allocator), buf,
size, out_obj, charge);
};
cache_handle = blob_cache_->Lookup(key, BlobContents::GetCacheItemHelper(),
create_cb, Cache::Priority::BOTTOM,
true /* wait_for_cache */, statistics_);
} else {
cache_handle = blob_cache_->Lookup(key, statistics_);
}
return cache_handle;
}
void BlobSource::PinCachedBlob(CacheHandleGuard<BlobContents>* cached_blob,
@ -147,11 +166,24 @@ void BlobSource::PinOwnedBlob(std::unique_ptr<BlobContents>* owned_blob,
}
Status BlobSource::InsertEntryIntoCache(const Slice& key, BlobContents* value,
TypedHandle** cache_handle,
size_t charge,
Cache::Handle** cache_handle,
Cache::Priority priority) const {
return blob_cache_.InsertFull(key, value, value->ApproximateMemoryUsage(),
cache_handle, priority,
lowest_used_cache_tier_);
Status s;
Cache::CacheItemHelper* const cache_item_helper =
BlobContents::GetCacheItemHelper();
assert(cache_item_helper);
if (lowest_used_cache_tier_ == CacheTier::kNonVolatileBlockTier) {
s = blob_cache_->Insert(key, value, cache_item_helper, charge, cache_handle,
priority);
} else {
s = blob_cache_->Insert(key, value, charge, cache_item_helper->del_cb,
cache_handle, priority);
}
return s;
}
Status BlobSource::GetBlob(const ReadOptions& read_options,
@ -209,8 +241,7 @@ Status BlobSource::GetBlob(const ReadOptions& read_options,
{
CacheHandleGuard<BlobFileReader> blob_file_reader;
s = blob_file_cache_->GetBlobFileReader(read_options, file_number,
&blob_file_reader);
s = blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader);
if (!s.ok()) {
return s;
}
@ -221,10 +252,9 @@ Status BlobSource::GetBlob(const ReadOptions& read_options,
return Status::Corruption("Compression type mismatch when reading blob");
}
MemoryAllocator* const allocator =
(blob_cache_ && read_options.fill_cache)
? blob_cache_.get()->memory_allocator()
: nullptr;
MemoryAllocator* const allocator = (blob_cache_ && read_options.fill_cache)
? blob_cache_->memory_allocator()
: nullptr;
uint64_t read_size = 0;
s = blob_file_reader.GetValue()->GetBlob(
@ -373,8 +403,8 @@ void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options,
}
CacheHandleGuard<BlobFileReader> blob_file_reader;
Status s = blob_file_cache_->GetBlobFileReader(read_options, file_number,
&blob_file_reader);
Status s =
blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader);
if (!s.ok()) {
for (size_t i = 0; i < _blob_reqs.size(); ++i) {
BlobReadRequest* const req = _blob_reqs[i].first;
@ -388,10 +418,9 @@ void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options,
assert(blob_file_reader.GetValue());
MemoryAllocator* const allocator =
(blob_cache_ && read_options.fill_cache)
? blob_cache_.get()->memory_allocator()
: nullptr;
MemoryAllocator* const allocator = (blob_cache_ && read_options.fill_cache)
? blob_cache_->memory_allocator()
: nullptr;
blob_file_reader.GetValue()->MultiGetBlob(read_options, allocator,
_blob_reqs, &_bytes_read);

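The blob_source.cc hunks above show GetEntryFromCache() and InsertEntryIntoCache() branching on lowest_used_cache_tier_: when the non-volatile (secondary) tier is in use, the lookup carries a create callback / cache item helper so evicted entries can be reconstructed and promoted, otherwise a plain primary-cache lookup is issued. A minimal standalone sketch of that dispatch, using hypothetical toy types (ToyTieredCache, ToyTier) rather than RocksDB's actual Cache API:

// --- illustrative sketch, not part of the diff ---
#include <iostream>
#include <optional>
#include <string>
#include <unordered_map>

enum class ToyTier { kVolatileTier, kNonVolatileBlockTier };

struct ToyTieredCache {
  std::unordered_map<std::string, std::string> primary;
  std::unordered_map<std::string, std::string> secondary;

  // Plain lookup: primary tier only.
  std::optional<std::string> Lookup(const std::string& key) const {
    auto it = primary.find(key);
    if (it == primary.end()) return std::nullopt;
    return it->second;
  }

  // Lookup with secondary-cache support: on a primary miss, "re-create" the
  // entry from the secondary tier and promote it, analogous to the create
  // callback passed to Cache::Lookup() in the diff above.
  std::optional<std::string> LookupWithSecondary(const std::string& key) {
    if (auto hit = Lookup(key)) return hit;
    auto it = secondary.find(key);
    if (it == secondary.end()) return std::nullopt;
    primary[key] = it->second;  // promotion into the volatile tier
    return it->second;
  }
};

// Mirrors the shape of GetEntryFromCache(): the configured tier decides
// which lookup flavor is used.
std::optional<std::string> GetEntry(ToyTieredCache& cache, ToyTier tier,
                                    const std::string& key) {
  return tier == ToyTier::kNonVolatileBlockTier ? cache.LookupWithSecondary(key)
                                                : cache.Lookup(key);
}

int main() {
  ToyTieredCache cache;
  cache.secondary["blob#1"] = "payload";
  std::cout << GetEntry(cache, ToyTier::kVolatileTier, "blob#1").has_value()
            << "\n";  // 0: volatile-only lookup misses
  std::cout << GetEntry(cache, ToyTier::kNonVolatileBlockTier, "blob#1")
                   .has_value()
            << "\n";  // 1: secondary hit promoted to primary
  return 0;
}
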
@ -8,9 +8,8 @@
#include <cinttypes>
#include <memory>
#include "cache/cache_helpers.h"
#include "cache/cache_key.h"
#include "cache/typed_cache.h"
#include "db/blob/blob_contents.h"
#include "db/blob/blob_file_cache.h"
#include "db/blob/blob_read_request.h"
#include "rocksdb/cache.h"
@ -24,6 +23,7 @@ struct ImmutableOptions;
class Status;
class FilePrefetchBuffer;
class Slice;
class BlobContents;
// BlobSource is a class that provides universal access to blobs, regardless of
// whether they are in the blob cache, secondary cache, or (remote) storage.
@ -95,9 +95,9 @@ class BlobSource {
uint64_t* bytes_read);
inline Status GetBlobFileReader(
const ReadOptions& read_options, uint64_t blob_file_number,
uint64_t blob_file_number,
CacheHandleGuard<BlobFileReader>* blob_file_reader) {
return blob_file_cache_->GetBlobFileReader(read_options, blob_file_number,
return blob_file_cache_->GetBlobFileReader(blob_file_number,
blob_file_reader);
}
@ -106,14 +106,6 @@ class BlobSource {
bool TEST_BlobInCache(uint64_t file_number, uint64_t file_size,
uint64_t offset, size_t* charge = nullptr) const;
// For TypedSharedCacheInterface
void Create(BlobContents** out, const char* buf, size_t size,
MemoryAllocator* alloc);
using SharedCacheInterface =
FullTypedSharedCacheInterface<BlobContents, BlobContentsCreator>;
using TypedHandle = SharedCacheInterface::TypedHandle;
private:
Status GetBlobFromCache(const Slice& cache_key,
CacheHandleGuard<BlobContents>* cached_blob) const;
@ -128,10 +120,10 @@ class BlobSource {
static void PinOwnedBlob(std::unique_ptr<BlobContents>* owned_blob,
PinnableSlice* value);
TypedHandle* GetEntryFromCache(const Slice& key) const;
Cache::Handle* GetEntryFromCache(const Slice& key) const;
Status InsertEntryIntoCache(const Slice& key, BlobContents* value,
TypedHandle** cache_handle,
size_t charge, Cache::Handle** cache_handle,
Cache::Priority priority) const;
inline CacheKey GetCacheKey(uint64_t file_number, uint64_t /*file_size*/,
@ -149,7 +141,7 @@ class BlobSource {
BlobFileCache* blob_file_cache_;
// A cache to store uncompressed blobs.
mutable SharedCacheInterface blob_cache_;
std::shared_ptr<Cache> blob_cache_;
// The control option of how the cache tiers will be used. Currently rocksdb
// supports block/blob cache (volatile tier) and secondary cache (this tier

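The blob_source.h hunks above trade the typed cache layer (SharedCacheInterface, TypedHandle) for a plain std::shared_ptr<Cache> and untyped Cache::Handle; the typed layer mainly centralizes the cast from the cached void* back to BlobContents*. A small illustration of that idea with made-up types (RawHandle, TypedHandleView), not RocksDB's typed_cache.h templates:

// --- illustrative sketch, not part of the diff ---
#include <iostream>
#include <string>

// Hypothetical untyped handle, standing in for Cache::Handle.
struct RawHandle {
  void* value = nullptr;
};

// A minimal typed view over an untyped handle, in the spirit of the
// TypedHandle / SharedCacheInterface layer referenced in the header diff.
template <typename T>
class TypedHandleView {
 public:
  explicit TypedHandleView(RawHandle* h) : handle_(h) {}
  T* Value() const {
    return handle_ ? static_cast<T*>(handle_->value) : nullptr;
  }

 private:
  RawHandle* handle_;
};

struct BlobPayload {
  std::string data;
};

int main() {
  BlobPayload blob{"hello"};
  RawHandle raw{&blob};
  TypedHandleView<BlobPayload> typed(&raw);
  // The typed layer performs the cast once, instead of every caller of the
  // raw handle repeating static_cast<BlobPayload*>(...) at each use site.
  std::cout << typed.Value()->data << "\n";
  return 0;
}
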
@ -517,8 +517,7 @@ TEST_F(BlobSourceTest, GetCompressedBlobs) {
compression, blob_offsets, blob_sizes);
CacheHandleGuard<BlobFileReader> blob_file_reader;
ASSERT_OK(blob_source.GetBlobFileReader(read_options, file_number,
&blob_file_reader));
ASSERT_OK(blob_source.GetBlobFileReader(file_number, &blob_file_reader));
ASSERT_NE(blob_file_reader.GetValue(), nullptr);
const uint64_t file_size = blob_file_reader.GetValue()->GetFileSize();
@ -1140,18 +1139,26 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
blob_file_cache.get());
CacheHandleGuard<BlobFileReader> file_reader;
ReadOptions read_options;
ASSERT_OK(
blob_source.GetBlobFileReader(read_options, file_number, &file_reader));
ASSERT_OK(blob_source.GetBlobFileReader(file_number, &file_reader));
ASSERT_NE(file_reader.GetValue(), nullptr);
const uint64_t file_size = file_reader.GetValue()->GetFileSize();
ASSERT_EQ(file_reader.GetValue()->GetCompressionType(), kNoCompression);
ReadOptions read_options;
read_options.verify_checksums = true;
auto blob_cache = options_.blob_cache;
auto secondary_cache = lru_cache_opts_.secondary_cache;
Cache::CreateCallback create_cb = [](const void* buf, size_t size,
void** out_obj,
size_t* charge) -> Status {
CacheAllocationPtr allocation(new char[size]);
return BlobContents::CreateCallback(std::move(allocation), buf, size,
out_obj, charge);
};
{
// GetBlob
std::vector<PinnableSlice> values(keys.size());
@ -1212,16 +1219,15 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
{
CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[0]);
const Slice key0 = cache_key.AsSlice();
auto handle0 = blob_cache->BasicLookup(key0, statistics);
auto handle0 = blob_cache->Lookup(key0, statistics);
ASSERT_EQ(handle0, nullptr);
// key0's item should be in the secondary cache.
bool kept_in_sec_cache = false;
auto sec_handle0 = secondary_cache->Lookup(
key0, BlobSource::SharedCacheInterface::GetFullHelper(),
/*context*/ nullptr, true,
/*advise_erase=*/true, kept_in_sec_cache);
ASSERT_FALSE(kept_in_sec_cache);
bool is_in_sec_cache = false;
auto sec_handle0 =
secondary_cache->Lookup(key0, create_cb, true,
/*advise_erase=*/true, is_in_sec_cache);
ASSERT_FALSE(is_in_sec_cache);
ASSERT_NE(sec_handle0, nullptr);
ASSERT_TRUE(sec_handle0->IsReady());
auto value = static_cast<BlobContents*>(sec_handle0->Value());
@ -1240,16 +1246,15 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
{
CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[1]);
const Slice key1 = cache_key.AsSlice();
auto handle1 = blob_cache->BasicLookup(key1, statistics);
auto handle1 = blob_cache->Lookup(key1, statistics);
ASSERT_NE(handle1, nullptr);
blob_cache->Release(handle1);
bool kept_in_sec_cache = false;
auto sec_handle1 = secondary_cache->Lookup(
key1, BlobSource::SharedCacheInterface::GetFullHelper(),
/*context*/ nullptr, true,
/*advise_erase=*/true, kept_in_sec_cache);
ASSERT_FALSE(kept_in_sec_cache);
bool is_in_sec_cache = false;
auto sec_handle1 =
secondary_cache->Lookup(key1, create_cb, true,
/*advise_erase=*/true, is_in_sec_cache);
ASSERT_FALSE(is_in_sec_cache);
ASSERT_EQ(sec_handle1, nullptr);
ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
@ -1271,7 +1276,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
// key0 should be in the primary cache.
CacheKey cache_key0 = base_cache_key.WithOffset(blob_offsets[0]);
const Slice key0 = cache_key0.AsSlice();
auto handle0 = blob_cache->BasicLookup(key0, statistics);
auto handle0 = blob_cache->Lookup(key0, statistics);
ASSERT_NE(handle0, nullptr);
auto value = static_cast<BlobContents*>(blob_cache->Value(handle0));
ASSERT_NE(value, nullptr);
@ -1281,12 +1286,12 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
// key1 is not in the primary cache and is in the secondary cache.
CacheKey cache_key1 = base_cache_key.WithOffset(blob_offsets[1]);
const Slice key1 = cache_key1.AsSlice();
auto handle1 = blob_cache->BasicLookup(key1, statistics);
auto handle1 = blob_cache->Lookup(key1, statistics);
ASSERT_EQ(handle1, nullptr);
// erase key0 from the primary cache.
blob_cache->Erase(key0);
handle0 = blob_cache->BasicLookup(key0, statistics);
handle0 = blob_cache->Lookup(key0, statistics);
ASSERT_EQ(handle0, nullptr);
// key1 promotion should succeed due to the primary cache being empty. we
@ -1302,7 +1307,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
// in the secondary cache. So, the primary cache's Lookup() without
// secondary cache support cannot see it. (NOTE: The dummy handle used
// to be a leaky abstraction but not anymore.)
handle1 = blob_cache->BasicLookup(key1, statistics);
handle1 = blob_cache->Lookup(key1, statistics);
ASSERT_EQ(handle1, nullptr);
// But after another access, it is promoted to primary cache
@ -1310,7 +1315,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
blob_offsets[1]));
// And Lookup() can find it (without secondary cache support)
handle1 = blob_cache->BasicLookup(key1, statistics);
handle1 = blob_cache->Lookup(key1, statistics);
ASSERT_NE(handle1, nullptr);
ASSERT_NE(blob_cache->Value(handle1), nullptr);
blob_cache->Release(handle1);
@ -1374,7 +1379,7 @@ class BlobSourceCacheReservationTest : public DBTestBase {
static constexpr std::size_t kSizeDummyEntry = CacheReservationManagerImpl<
CacheEntryRole::kBlobCache>::GetDummyEntrySize();
static constexpr std::size_t kCacheCapacity = 2 * kSizeDummyEntry;
static constexpr std::size_t kCacheCapacity = 1 * kSizeDummyEntry;
static constexpr int kNumShardBits = 0; // 2^0 shard
static constexpr uint32_t kColumnFamilyId = 1;
@ -1393,6 +1398,7 @@ class BlobSourceCacheReservationTest : public DBTestBase {
std::string db_session_id_;
};
#ifndef ROCKSDB_LITE
TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) {
options_.cf_paths.emplace_back(
test::PerThreadDBPath(
@ -1507,10 +1513,11 @@ TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) {
}
}
TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservation) {
TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservationOnFullCache) {
options_.cf_paths.emplace_back(
test::PerThreadDBPath(
env_, "BlobSourceCacheReservationTest_IncreaseCacheReservation"),
env_,
"BlobSourceCacheReservationTest_IncreaseCacheReservationOnFullCache"),
0);
GenerateKeysAndBlobs();
@ -1518,7 +1525,7 @@ TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservation) {
DestroyAndReopen(options_);
ImmutableOptions immutable_options(options_);
constexpr size_t blob_size = 24 << 10; // 24KB
constexpr size_t blob_size = kSizeDummyEntry / (kNumBlobs / 2);
for (size_t i = 0; i < kNumBlobs; ++i) {
blob_file_size_ -= blobs_[i].size(); // old blob size
blob_strs_[i].resize(blob_size, '@');
@ -1576,6 +1583,11 @@ TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservation) {
std::vector<PinnableSlice> values(keys_.size());
// Since we resized each blob to be kSizeDummyEntry / (num_blobs / 2), we
// can't fit all the blobs in the cache at the same time, which means we
// should observe cache evictions once we reach the cache's capacity.
// Due to the overhead of the cache and the BlobContents objects, as well as
// jemalloc bin sizes, this happens after inserting seven blobs.
uint64_t blob_bytes = 0;
for (size_t i = 0; i < kNumBlobs; ++i) {
ASSERT_OK(blob_source.GetBlob(
@ -1586,21 +1598,22 @@ TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservation) {
// Release cache handle
values[i].Reset();
size_t charge = 0;
ASSERT_TRUE(blob_source.TEST_BlobInCache(kBlobFileNumber, blob_file_size_,
blob_offsets[i], &charge));
if (i < kNumBlobs / 2 - 1) {
size_t charge = 0;
ASSERT_TRUE(blob_source.TEST_BlobInCache(
kBlobFileNumber, blob_file_size_, blob_offsets[i], &charge));
blob_bytes += charge;
blob_bytes += charge;
}
ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(),
(blob_bytes <= kSizeDummyEntry) ? kSizeDummyEntry
: (2 * kSizeDummyEntry));
ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry);
ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes);
ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(),
options_.blob_cache->GetUsage());
}
}
}
#endif // ROCKSDB_LITE
} // namespace ROCKSDB_NAMESPACE

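The cache-reservation test above pins kCacheCapacity at one dummy entry and sizes each blob to kSizeDummyEntry / (kNumBlobs / 2), so the reserved size should stay at a single dummy entry while actual usage grows. Assuming the reservation manager simply rounds usage up to whole dummy entries (my reading of the assertions, not a quote of the implementation), the arithmetic looks like this; the constants below are illustrative, not RocksDB's real values:

// --- illustrative sketch, not part of the diff ---
#include <cstddef>
#include <iostream>

// Assumed rounding rule: reservations are made in whole dummy entries.
std::size_t ReservedFor(std::size_t used_bytes, std::size_t dummy_entry_size) {
  if (used_bytes == 0) return 0;
  const std::size_t entries =
      (used_bytes + dummy_entry_size - 1) / dummy_entry_size;  // ceil division
  return entries * dummy_entry_size;
}

int main() {
  const std::size_t kSizeDummyEntry = 256 * 1024;  // illustrative value only
  const std::size_t kNumBlobs = 16;                // illustrative value only
  const std::size_t blob_size = kSizeDummyEntry / (kNumBlobs / 2);

  std::size_t used = 0;
  for (std::size_t i = 0; i < kNumBlobs / 2; ++i) {
    used += blob_size;
    std::cout << "blobs=" << (i + 1) << " used=" << used
              << " reserved=" << ReservedFor(used, kSizeDummyEntry) << "\n";
  }
  // With kNumBlobs / 2 blobs of this size, usage reaches exactly one dummy
  // entry, so the reservation never needs to grow past kSizeDummyEntry --
  // matching ASSERT_EQ(GetTotalReservedCacheSize(), kSizeDummyEntry) above
  // (real cache charges also include per-entry overhead, which the test's
  // own comment mentions).
  return 0;
}
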
@ -11,7 +11,6 @@
#include "db/blob/blob_index.h"
#include "db/blob/blob_log_format.h"
#include "db/db_test_util.h"
#include "db/db_with_timestamp_test_util.h"
#include "port/stack_trace.h"
#include "test_util/sync_point.h"
#include "utilities/fault_injection_env.h"
@ -585,6 +584,7 @@ TEST_F(DBBlobBasicTest, MultiGetBlobsFromCache) {
}
}
#ifndef ROCKSDB_LITE
TEST_F(DBBlobBasicTest, MultiGetWithDirectIO) {
Options options = GetDefaultOptions();
@ -773,6 +773,7 @@ TEST_F(DBBlobBasicTest, MultiGetWithDirectIO) {
ASSERT_EQ(values[2], second_blob);
}
}
#endif // !ROCKSDB_LITE
TEST_F(DBBlobBasicTest, MultiGetBlobsFromMultipleFiles) {
Options options = GetDefaultOptions();
@ -1061,6 +1062,7 @@ TEST_F(DBBlobBasicTest, GetBlob_IndexWithInvalidFileNumber) {
.IsCorruption());
}
#ifndef ROCKSDB_LITE
TEST_F(DBBlobBasicTest, GenerateIOTracing) {
Options options = GetDefaultOptions();
options.enable_blob_files = true;
@ -1115,6 +1117,7 @@ TEST_F(DBBlobBasicTest, GenerateIOTracing) {
ASSERT_GT(blob_files_op_count, 2);
}
}
#endif // !ROCKSDB_LITE
TEST_F(DBBlobBasicTest, BestEffortsRecovery_MissingNewestBlobFile) {
Options options = GetDefaultOptions();
@ -1216,6 +1219,7 @@ TEST_F(DBBlobBasicTest, MultiGetMergeBlobWithPut) {
ASSERT_EQ(values[2], "v2_0");
}
#ifndef ROCKSDB_LITE
TEST_F(DBBlobBasicTest, Properties) {
Options options = GetDefaultOptions();
options.enable_blob_files = true;
@ -1378,6 +1382,7 @@ TEST_F(DBBlobBasicTest, PropertiesMultiVersion) {
BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) +
blob_size + BlobLogFooter::kSize));
}
#endif // !ROCKSDB_LITE
class DBBlobBasicIOErrorTest : public DBBlobBasicTest,
public testing::WithParamInterface<std::string> {
@ -1627,6 +1632,7 @@ TEST_F(DBBlobBasicTest, WarmCacheWithBlobsDuringFlush) {
options.statistics->getTickerCount(BLOB_DB_CACHE_ADD));
}
#ifndef ROCKSDB_LITE
TEST_F(DBBlobBasicTest, DynamicallyWarmCacheDuringFlush) {
Options options = GetDefaultOptions();
@ -1694,6 +1700,7 @@ TEST_F(DBBlobBasicTest, DynamicallyWarmCacheDuringFlush) {
/*end=*/nullptr));
EXPECT_EQ(0, options.statistics->getTickerCount(BLOB_DB_CACHE_ADD));
}
#endif // !ROCKSDB_LITE
TEST_F(DBBlobBasicTest, WarmCacheWithBlobsSecondary) {
CompressedSecondaryCacheOptions secondary_cache_opts;
@ -1772,461 +1779,6 @@ TEST_F(DBBlobBasicTest, WarmCacheWithBlobsSecondary) {
1);
}
TEST_F(DBBlobBasicTest, GetEntityBlob) {
Options options = GetDefaultOptions();
options.enable_blob_files = true;
options.min_blob_size = 0;
Reopen(options);
constexpr char key[] = "key";
constexpr char blob_value[] = "blob_value";
constexpr char other_key[] = "other_key";
constexpr char other_blob_value[] = "other_blob_value";
ASSERT_OK(Put(key, blob_value));
ASSERT_OK(Put(other_key, other_blob_value));
ASSERT_OK(Flush());
WideColumns expected_columns{{kDefaultWideColumnName, blob_value}};
WideColumns other_expected_columns{
{kDefaultWideColumnName, other_blob_value}};
{
PinnableWideColumns result;
ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), key,
&result));
ASSERT_EQ(result.columns(), expected_columns);
}
{
PinnableWideColumns result;
ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
other_key, &result));
ASSERT_EQ(result.columns(), other_expected_columns);
}
{
constexpr size_t num_keys = 2;
std::array<Slice, num_keys> keys{{key, other_key}};
std::array<PinnableWideColumns, num_keys> results;
std::array<Status, num_keys> statuses;
db_->MultiGetEntity(ReadOptions(), db_->DefaultColumnFamily(), num_keys,
&keys[0], &results[0], &statuses[0]);
ASSERT_OK(statuses[0]);
ASSERT_EQ(results[0].columns(), expected_columns);
ASSERT_OK(statuses[1]);
ASSERT_EQ(results[1].columns(), other_expected_columns);
}
}
class DBBlobWithTimestampTest : public DBBasicTestWithTimestampBase {
protected:
DBBlobWithTimestampTest()
: DBBasicTestWithTimestampBase("db_blob_with_timestamp_test") {}
};
TEST_F(DBBlobWithTimestampTest, GetBlob) {
Options options = GetDefaultOptions();
options.create_if_missing = true;
options.enable_blob_files = true;
options.min_blob_size = 0;
const size_t kTimestampSize = Timestamp(0, 0).size();
TestComparator test_cmp(kTimestampSize);
options.comparator = &test_cmp;
DestroyAndReopen(options);
WriteOptions write_opts;
const std::string ts = Timestamp(1, 0);
constexpr char key[] = "key";
constexpr char blob_value[] = "blob_value";
ASSERT_OK(db_->Put(write_opts, key, ts, blob_value));
ASSERT_OK(Flush());
const std::string read_ts = Timestamp(2, 0);
Slice read_ts_slice(read_ts);
ReadOptions read_opts;
read_opts.timestamp = &read_ts_slice;
std::string value;
ASSERT_OK(db_->Get(read_opts, key, &value));
ASSERT_EQ(value, blob_value);
}
TEST_F(DBBlobWithTimestampTest, MultiGetBlobs) {
constexpr size_t min_blob_size = 6;
Options options = GetDefaultOptions();
options.enable_blob_files = true;
options.min_blob_size = min_blob_size;
options.create_if_missing = true;
const size_t kTimestampSize = Timestamp(0, 0).size();
TestComparator test_cmp(kTimestampSize);
options.comparator = &test_cmp;
DestroyAndReopen(options);
// Put then retrieve three key-values. The first value is below the size limit
// and is thus stored inline; the other two are stored separately as blobs.
constexpr size_t num_keys = 3;
constexpr char first_key[] = "first_key";
constexpr char first_value[] = "short";
static_assert(sizeof(first_value) - 1 < min_blob_size,
"first_value too long to be inlined");
DestroyAndReopen(options);
WriteOptions write_opts;
const std::string ts = Timestamp(1, 0);
ASSERT_OK(db_->Put(write_opts, first_key, ts, first_value));
constexpr char second_key[] = "second_key";
constexpr char second_value[] = "long_value";
static_assert(sizeof(second_value) - 1 >= min_blob_size,
"second_value too short to be stored as blob");
ASSERT_OK(db_->Put(write_opts, second_key, ts, second_value));
constexpr char third_key[] = "third_key";
constexpr char third_value[] = "other_long_value";
static_assert(sizeof(third_value) - 1 >= min_blob_size,
"third_value too short to be stored as blob");
ASSERT_OK(db_->Put(write_opts, third_key, ts, third_value));
ASSERT_OK(Flush());
ReadOptions read_options;
const std::string read_ts = Timestamp(2, 0);
Slice read_ts_slice(read_ts);
read_options.timestamp = &read_ts_slice;
std::array<Slice, num_keys> keys{{first_key, second_key, third_key}};
{
std::array<PinnableSlice, num_keys> values;
std::array<Status, num_keys> statuses;
db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
&values[0], &statuses[0]);
ASSERT_OK(statuses[0]);
ASSERT_EQ(values[0], first_value);
ASSERT_OK(statuses[1]);
ASSERT_EQ(values[1], second_value);
ASSERT_OK(statuses[2]);
ASSERT_EQ(values[2], third_value);
}
}
TEST_F(DBBlobWithTimestampTest, GetMergeBlobWithPut) {
Options options = GetDefaultOptions();
options.merge_operator = MergeOperators::CreateStringAppendOperator();
options.enable_blob_files = true;
options.min_blob_size = 0;
options.create_if_missing = true;
const size_t kTimestampSize = Timestamp(0, 0).size();
TestComparator test_cmp(kTimestampSize);
options.comparator = &test_cmp;
DestroyAndReopen(options);
WriteOptions write_opts;
const std::string ts = Timestamp(1, 0);
ASSERT_OK(db_->Put(write_opts, "Key1", ts, "v1"));
ASSERT_OK(Flush());
ASSERT_OK(
db_->Merge(write_opts, db_->DefaultColumnFamily(), "Key1", ts, "v2"));
ASSERT_OK(Flush());
ASSERT_OK(
db_->Merge(write_opts, db_->DefaultColumnFamily(), "Key1", ts, "v3"));
ASSERT_OK(Flush());
std::string value;
const std::string read_ts = Timestamp(2, 0);
Slice read_ts_slice(read_ts);
ReadOptions read_opts;
read_opts.timestamp = &read_ts_slice;
ASSERT_OK(db_->Get(read_opts, "Key1", &value));
ASSERT_EQ(value, "v1,v2,v3");
}
TEST_F(DBBlobWithTimestampTest, MultiGetMergeBlobWithPut) {
constexpr size_t num_keys = 3;
Options options = GetDefaultOptions();
options.merge_operator = MergeOperators::CreateStringAppendOperator();
options.enable_blob_files = true;
options.min_blob_size = 0;
options.create_if_missing = true;
const size_t kTimestampSize = Timestamp(0, 0).size();
TestComparator test_cmp(kTimestampSize);
options.comparator = &test_cmp;
DestroyAndReopen(options);
WriteOptions write_opts;
const std::string ts = Timestamp(1, 0);
ASSERT_OK(db_->Put(write_opts, "Key0", ts, "v0_0"));
ASSERT_OK(db_->Put(write_opts, "Key1", ts, "v1_0"));
ASSERT_OK(db_->Put(write_opts, "Key2", ts, "v2_0"));
ASSERT_OK(Flush());
ASSERT_OK(
db_->Merge(write_opts, db_->DefaultColumnFamily(), "Key0", ts, "v0_1"));
ASSERT_OK(
db_->Merge(write_opts, db_->DefaultColumnFamily(), "Key1", ts, "v1_1"));
ASSERT_OK(Flush());
ASSERT_OK(
db_->Merge(write_opts, db_->DefaultColumnFamily(), "Key0", ts, "v0_2"));
ASSERT_OK(Flush());
const std::string read_ts = Timestamp(2, 0);
Slice read_ts_slice(read_ts);
ReadOptions read_opts;
read_opts.timestamp = &read_ts_slice;
std::array<Slice, num_keys> keys{{"Key0", "Key1", "Key2"}};
std::array<PinnableSlice, num_keys> values;
std::array<Status, num_keys> statuses;
db_->MultiGet(read_opts, db_->DefaultColumnFamily(), num_keys, &keys[0],
&values[0], &statuses[0]);
ASSERT_OK(statuses[0]);
ASSERT_EQ(values[0], "v0_0,v0_1,v0_2");
ASSERT_OK(statuses[1]);
ASSERT_EQ(values[1], "v1_0,v1_1");
ASSERT_OK(statuses[2]);
ASSERT_EQ(values[2], "v2_0");
}
TEST_F(DBBlobWithTimestampTest, IterateBlobs) {
Options options = GetDefaultOptions();
options.enable_blob_files = true;
options.create_if_missing = true;
const size_t kTimestampSize = Timestamp(0, 0).size();
TestComparator test_cmp(kTimestampSize);
options.comparator = &test_cmp;
DestroyAndReopen(options);
int num_blobs = 5;
std::vector<std::string> keys;
std::vector<std::string> blobs;
WriteOptions write_opts;
std::vector<std::string> write_timestamps = {Timestamp(1, 0),
Timestamp(2, 0)};
// For each key in ["key0", ... "keyi", ...], write two versions:
// Timestamp(1, 0), "blobi0"
// Timestamp(2, 0), "blobi1"
for (int i = 0; i < num_blobs; i++) {
keys.push_back("key" + std::to_string(i));
blobs.push_back("blob" + std::to_string(i));
for (size_t j = 0; j < write_timestamps.size(); j++) {
ASSERT_OK(db_->Put(write_opts, keys[i], write_timestamps[j],
blobs[i] + std::to_string(j)));
}
}
ASSERT_OK(Flush());
ReadOptions read_options;
std::vector<std::string> read_timestamps = {Timestamp(0, 0), Timestamp(3, 0)};
Slice ts_upper_bound(read_timestamps[1]);
read_options.timestamp = &ts_upper_bound;
auto check_iter_entry =
[](const Iterator* iter, const std::string& expected_key,
const std::string& expected_ts, const std::string& expected_value,
bool key_is_internal = true) {
ASSERT_OK(iter->status());
if (key_is_internal) {
std::string expected_ukey_and_ts;
expected_ukey_and_ts.assign(expected_key.data(), expected_key.size());
expected_ukey_and_ts.append(expected_ts.data(), expected_ts.size());
ParsedInternalKey parsed_ikey;
ASSERT_OK(ParseInternalKey(iter->key(), &parsed_ikey,
true /* log_err_key */));
ASSERT_EQ(parsed_ikey.user_key, expected_ukey_and_ts);
} else {
ASSERT_EQ(iter->key(), expected_key);
}
ASSERT_EQ(iter->timestamp(), expected_ts);
ASSERT_EQ(iter->value(), expected_value);
};
// Forward iterating one version of each key, get in this order:
// [("key0", Timestamp(2, 0), "blob01"),
// ("key1", Timestamp(2, 0), "blob11")...]
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_OK(iter->status());
iter->SeekToFirst();
for (int i = 0; i < num_blobs; i++) {
check_iter_entry(iter.get(), keys[i], write_timestamps[1],
blobs[i] + std::to_string(1), /*key_is_internal*/ false);
iter->Next();
}
}
// Forward iteration, then reverse to backward.
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_OK(iter->status());
iter->SeekToFirst();
for (int i = 0; i < num_blobs * 2 - 1; i++) {
if (i < num_blobs) {
check_iter_entry(iter.get(), keys[i], write_timestamps[1],
blobs[i] + std::to_string(1),
/*key_is_internal*/ false);
if (i != num_blobs - 1) {
iter->Next();
}
} else {
if (i != num_blobs) {
check_iter_entry(iter.get(), keys[num_blobs * 2 - 1 - i],
write_timestamps[1],
blobs[num_blobs * 2 - 1 - i] + std::to_string(1),
/*key_is_internal*/ false);
}
iter->Prev();
}
}
}
// Backward iterating one version of each key, get in this order:
// [("key4", Timestamp(2, 0), "blob41"),
// ("key3", Timestamp(2, 0), "blob31")...]
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_OK(iter->status());
iter->SeekToLast();
for (int i = 0; i < num_blobs; i++) {
check_iter_entry(iter.get(), keys[num_blobs - 1 - i], write_timestamps[1],
blobs[num_blobs - 1 - i] + std::to_string(1),
/*key_is_internal*/ false);
iter->Prev();
}
}
// Backward iteration, then reverse to forward.
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_OK(iter->status());
iter->SeekToLast();
for (int i = 0; i < num_blobs * 2 - 1; i++) {
if (i < num_blobs) {
check_iter_entry(iter.get(), keys[num_blobs - 1 - i],
write_timestamps[1],
blobs[num_blobs - 1 - i] + std::to_string(1),
/*key_is_internal*/ false);
if (i != num_blobs - 1) {
iter->Prev();
}
} else {
if (i != num_blobs) {
check_iter_entry(iter.get(), keys[i - num_blobs], write_timestamps[1],
blobs[i - num_blobs] + std::to_string(1),
/*key_is_internal*/ false);
}
iter->Next();
}
}
}
Slice ts_lower_bound(read_timestamps[0]);
read_options.iter_start_ts = &ts_lower_bound;
// Forward iterating multiple versions of the same key, get in this order:
// [("key0", Timestamp(2, 0), "blob01"),
// ("key0", Timestamp(1, 0), "blob00"),
// ("key1", Timestamp(2, 0), "blob11")...]
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_OK(iter->status());
iter->SeekToFirst();
for (int i = 0; i < num_blobs; i++) {
for (size_t j = write_timestamps.size(); j > 0; --j) {
check_iter_entry(iter.get(), keys[i], write_timestamps[j - 1],
blobs[i] + std::to_string(j - 1));
iter->Next();
}
}
}
// Backward iterating multiple versions of the same key, get in this order:
// [("key4", Timestamp(1, 0), "blob00"),
// ("key4", Timestamp(2, 0), "blob01"),
// ("key3", Timestamp(1, 0), "blob10")...]
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_OK(iter->status());
iter->SeekToLast();
for (int i = num_blobs; i > 0; i--) {
for (size_t j = 0; j < write_timestamps.size(); j++) {
check_iter_entry(iter.get(), keys[i - 1], write_timestamps[j],
blobs[i - 1] + std::to_string(j));
iter->Prev();
}
}
}
int upper_bound_idx = num_blobs - 2;
int lower_bound_idx = 1;
Slice upper_bound_slice(keys[upper_bound_idx]);
Slice lower_bound_slice(keys[lower_bound_idx]);
read_options.iterate_upper_bound = &upper_bound_slice;
read_options.iterate_lower_bound = &lower_bound_slice;
// Forward iteration with upper and lower bound.
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_OK(iter->status());
iter->SeekToFirst();
for (int i = lower_bound_idx; i < upper_bound_idx; i++) {
for (size_t j = write_timestamps.size(); j > 0; --j) {
check_iter_entry(iter.get(), keys[i], write_timestamps[j - 1],
blobs[i] + std::to_string(j - 1));
iter->Next();
}
}
}
// Backward iteration with upper and lower bound.
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_OK(iter->status());
iter->SeekToLast();
for (int i = upper_bound_idx; i > lower_bound_idx; i--) {
for (size_t j = 0; j < write_timestamps.size(); j++) {
check_iter_entry(iter.get(), keys[i - 1], write_timestamps[j],
blobs[i - 1] + std::to_string(j));
iter->Prev();
}
}
}
}
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {

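The timestamped iterator tests above assert a specific visiting order: without iter_start_ts only the newest visible version of each key is returned, and with a lower bound every version in range appears, newest first within a key during forward iteration. A toy model of that ordering over plain C++ containers (not the RocksDB iterator):

// --- illustrative sketch, not part of the diff ---
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main() {
  // key -> (timestamp, value), two versions per key as in the test.
  std::map<std::string, std::vector<std::pair<int, std::string>>> db;
  for (int i = 0; i < 3; ++i) {
    const std::string key = "key" + std::to_string(i);
    const std::string blob = "blob" + std::to_string(i);
    db[key] = {{1, blob + "0"}, {2, blob + "1"}};
  }

  const int read_ts = 3;

  // Default read: newest version <= read_ts per key, keys in ascending order.
  std::cout << "one version per key:\n";
  for (const auto& [key, versions] : db) {
    for (auto it = versions.rbegin(); it != versions.rend(); ++it) {
      if (it->first <= read_ts) {
        std::cout << "  " << key << " @ts" << it->first << " = " << it->second
                  << "\n";
        break;
      }
    }
  }

  // With a start timestamp: all versions in range, newest first within a key.
  const int iter_start_ts = 0;
  std::cout << "all versions per key:\n";
  for (const auto& [key, versions] : db) {
    for (auto it = versions.rbegin(); it != versions.rend(); ++it) {
      if (it->first <= read_ts && it->first >= iter_start_ts) {
        std::cout << "  " << key << " @ts" << it->first << " = " << it->second
                  << "\n";
      }
    }
  }
  return 0;
}
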
@ -16,6 +16,7 @@ class DBBlobCompactionTest : public DBTestBase {
explicit DBBlobCompactionTest()
: DBTestBase("db_blob_compaction_test", /*env_do_fsync=*/false) {}
#ifndef ROCKSDB_LITE
const std::vector<InternalStats::CompactionStats>& GetCompactionStats() {
VersionSet* const versions = dbfull()->GetVersionSet();
assert(versions);
@ -29,6 +30,7 @@ class DBBlobCompactionTest : public DBTestBase {
return internal_stats->TEST_GetCompactionStats();
}
#endif // ROCKSDB_LITE
};
namespace {
@ -248,6 +250,7 @@ TEST_F(DBBlobCompactionTest, FilterByKeyLength) {
ASSERT_OK(db_->Get(ReadOptions(), long_key, &value));
ASSERT_EQ("value", value);
#ifndef ROCKSDB_LITE
const auto& compaction_stats = GetCompactionStats();
ASSERT_GE(compaction_stats.size(), 2);
@ -255,6 +258,7 @@ TEST_F(DBBlobCompactionTest, FilterByKeyLength) {
// this involves neither reading nor writing blobs
ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
#endif // ROCKSDB_LITE
Close();
}
@ -295,6 +299,7 @@ TEST_F(DBBlobCompactionTest, FilterByValueLength) {
ASSERT_EQ(long_value, value);
}
#ifndef ROCKSDB_LITE
const auto& compaction_stats = GetCompactionStats();
ASSERT_GE(compaction_stats.size(), 2);
@ -302,10 +307,12 @@ TEST_F(DBBlobCompactionTest, FilterByValueLength) {
// this involves reading but not writing blobs
ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
#endif // ROCKSDB_LITE
Close();
}
#ifndef ROCKSDB_LITE
TEST_F(DBBlobCompactionTest, BlobCompactWithStartingLevel) {
Options options = GetDefaultOptions();
@ -381,6 +388,7 @@ TEST_F(DBBlobCompactionTest, BlobCompactWithStartingLevel) {
Close();
}
#endif
TEST_F(DBBlobCompactionTest, BlindWriteFilter) {
Options options = GetDefaultOptions();
@ -405,6 +413,7 @@ TEST_F(DBBlobCompactionTest, BlindWriteFilter) {
ASSERT_EQ(new_blob_value, Get(key));
}
#ifndef ROCKSDB_LITE
const auto& compaction_stats = GetCompactionStats();
ASSERT_GE(compaction_stats.size(), 2);
@ -412,6 +421,7 @@ TEST_F(DBBlobCompactionTest, BlindWriteFilter) {
// this involves writing but not reading blobs
ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
ASSERT_GT(compaction_stats[1].bytes_written_blob, 0);
#endif // ROCKSDB_LITE
Close();
}
@ -530,6 +540,7 @@ TEST_F(DBBlobCompactionTest, CompactionFilter) {
ASSERT_EQ(kv.second + std::string(padding), Get(kv.first));
}
#ifndef ROCKSDB_LITE
const auto& compaction_stats = GetCompactionStats();
ASSERT_GE(compaction_stats.size(), 2);
@ -537,6 +548,7 @@ TEST_F(DBBlobCompactionTest, CompactionFilter) {
// this involves reading and writing blobs
ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
ASSERT_GT(compaction_stats[1].bytes_written_blob, 0);
#endif // ROCKSDB_LITE
Close();
}
@ -594,6 +606,7 @@ TEST_F(DBBlobCompactionTest, CompactionFilterReadBlobAndKeep) {
/*end=*/nullptr));
ASSERT_EQ(blob_files, GetBlobFileNumbers());
#ifndef ROCKSDB_LITE
const auto& compaction_stats = GetCompactionStats();
ASSERT_GE(compaction_stats.size(), 2);
@ -601,6 +614,7 @@ TEST_F(DBBlobCompactionTest, CompactionFilterReadBlobAndKeep) {
// this involves reading but not writing blobs
ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
#endif // ROCKSDB_LITE
Close();
}

@ -34,6 +34,7 @@ class DBBlobCorruptionTest : public DBTestBase {
}
};
#ifndef ROCKSDB_LITE
TEST_F(DBBlobCorruptionTest, VerifyWholeBlobFileChecksum) {
Options options = GetDefaultOptions();
options.enable_blob_files = true;
@ -70,6 +71,7 @@ TEST_F(DBBlobCorruptionTest, VerifyWholeBlobFileChecksum) {
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
}
#endif // !ROCKSDB_LITE
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {

@ -131,7 +131,9 @@ class DBBlobIndexTest : public DBTestBase {
ASSERT_OK(Flush());
ASSERT_OK(
dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
#ifndef ROCKSDB_LITE
ASSERT_EQ("0,1", FilesPerLevel());
#endif // !ROCKSDB_LITE
break;
}
}
@ -457,6 +459,7 @@ TEST_F(DBBlobIndexTest, Iterate) {
verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
create_blob_iterator, check_is_blob(false));
#ifndef ROCKSDB_LITE
// Iterator with blob support and using seek.
ASSERT_OK(dbfull()->SetOptions(
cfh(), {{"max_sequential_skip_in_iterations", "0"}}));
@ -481,6 +484,7 @@ TEST_F(DBBlobIndexTest, Iterate) {
create_blob_iterator, check_is_blob(false));
verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
create_blob_iterator, check_is_blob(false));
#endif // !ROCKSDB_LITE
for (auto* snapshot : snapshots) {
dbfull()->ReleaseSnapshot(snapshot);
@ -580,10 +584,12 @@ TEST_F(DBBlobIndexTest, IntegratedBlobIterate) {
Status expected_status;
verify(1, expected_status, expected_value);
#ifndef ROCKSDB_LITE
// Test DBIter::FindValueForCurrentKeyUsingSeek flow.
ASSERT_OK(dbfull()->SetOptions(cfh(),
{{"max_sequential_skip_in_iterations", "0"}}));
verify(1, expected_status, expected_value);
#endif // !ROCKSDB_LITE
}
} // namespace ROCKSDB_NAMESPACE

@ -15,7 +15,6 @@
#include "db/blob/blob_file_builder.h"
#include "db/compaction/compaction_iterator.h"
#include "db/dbformat.h"
#include "db/event_helpers.h"
#include "db/internal_stats.h"
#include "db/merge_helper.h"
@ -57,8 +56,8 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
Status BuildTable(
const std::string& dbname, VersionSet* versions,
const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
const FileOptions& file_options, const ReadOptions& read_options,
TableCache* table_cache, InternalIterator* iter,
const FileOptions& file_options, TableCache* table_cache,
InternalIterator* iter,
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
range_del_iters,
FileMetaData* meta, std::vector<BlobFileAddition>* blob_file_additions,
@ -108,9 +107,11 @@ Status BuildTable(
std::vector<std::string> blob_file_paths;
std::string file_checksum = kUnknownFileChecksum;
std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
#ifndef ROCKSDB_LITE
EventHelpers::NotifyTableFileCreationStarted(ioptions.listeners, dbname,
tboptions.column_family_name,
fname, job_id, tboptions.reason);
#endif // !ROCKSDB_LITE
Env* env = db_options.env;
assert(env);
FileSystem* fs = db_options.fs.get();
@ -175,10 +176,10 @@ Status BuildTable(
builder = NewTableBuilder(tboptions, file_writer.get());
}
auto ucmp = tboptions.internal_comparator.user_comparator();
MergeHelper merge(
env, ucmp, ioptions.merge_operator.get(), compaction_filter.get(),
ioptions.logger, true /* internal key corruption is not ok */,
env, tboptions.internal_comparator.user_comparator(),
ioptions.merge_operator.get(), compaction_filter.get(), ioptions.logger,
true /* internal key corruption is not ok */,
snapshots.empty() ? 0 : snapshots.back(), snapshot_checker);
std::unique_ptr<BlobFileBuilder> blob_file_builder(
@ -196,49 +197,32 @@ Status BuildTable(
const std::atomic<bool> kManualCompactionCanceledFalse{false};
CompactionIterator c_iter(
iter, ucmp, &merge, kMaxSequenceNumber, &snapshots,
earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env,
iter, tboptions.internal_comparator.user_comparator(), &merge,
kMaxSequenceNumber, &snapshots, earliest_write_conflict_snapshot,
job_snapshot, snapshot_checker, env,
ShouldReportDetailedTime(env, ioptions.stats),
true /* internal key corruption is not ok */, range_del_agg.get(),
blob_file_builder.get(), ioptions.allow_data_in_errors,
ioptions.enforce_single_del_contracts,
/*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
true /* must_count_input_entries */,
/*compaction=*/nullptr, compaction_filter.get(),
/*shutting_down=*/nullptr, db_options.info_log, full_history_ts_low);
const size_t ts_sz = ucmp->timestamp_size();
const bool strip_timestamp =
ts_sz > 0 && !ioptions.persist_user_defined_timestamps;
std::string key_after_flush_buf;
c_iter.SeekToFirst();
for (; c_iter.Valid(); c_iter.Next()) {
const Slice& key = c_iter.key();
const Slice& value = c_iter.value();
const ParsedInternalKey& ikey = c_iter.ikey();
Slice key_after_flush = key;
// If user defined timestamps will be stripped from user key after flush,
// the in memory version of the key acts logically the same as one with a
// minimum timestamp. We update the timestamp here so the file boundary,
// output validator, and block builder all see the effect of the stripping.
if (strip_timestamp) {
key_after_flush_buf.clear();
ReplaceInternalKeyWithMinTimestamp(&key_after_flush_buf, key, ts_sz);
key_after_flush = key_after_flush_buf;
}
// Generate a rolling 64-bit hash of the key and values
// Note :
// Here "key" integrates 'sequence_number'+'kType'+'user key'.
s = output_validator.Add(key_after_flush, value);
// Generate a rolling 64-bit hash of the key and values
// Note :
// Here "key" integrates 'sequence_number'+'kType'+'user key'.
s = output_validator.Add(key, value);
if (!s.ok()) {
break;
}
builder->Add(key_after_flush, value);
builder->Add(key, value);
s = meta->UpdateBoundaries(key_after_flush, value, ikey.sequence,
ikey.type);
s = meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type);
if (!s.ok()) {
break;
}
@ -258,28 +242,21 @@ Status BuildTable(
if (s.ok()) {
auto range_del_it = range_del_agg->NewIterator();
Slice last_tombstone_start_user_key{};
for (range_del_it->SeekToFirst(); range_del_it->Valid();
range_del_it->Next()) {
auto tombstone = range_del_it->Tombstone();
auto kv = tombstone.Serialize();
// TODO(yuzhangyu): handle range deletion for UDT in memtables only.
builder->Add(kv.first.Encode(), kv.second);
InternalKey tombstone_end = tombstone.SerializeEndKey();
meta->UpdateBoundariesForRange(kv.first, tombstone_end, tombstone.seq_,
tboptions.internal_comparator);
if (version) {
if (last_tombstone_start_user_key.empty() ||
ucmp->CompareWithoutTimestamp(last_tombstone_start_user_key,
range_del_it->start_key()) < 0) {
SizeApproximationOptions approx_opts;
approx_opts.files_size_error_margin = 0.1;
meta->compensated_range_deletion_size += versions->ApproximateSize(
approx_opts, read_options, version, kv.first.Encode(),
tombstone_end.Encode(), 0 /* start_level */, -1 /* end_level */,
TableReaderCaller::kFlush);
}
last_tombstone_start_user_key = range_del_it->start_key();
SizeApproximationOptions approx_opts;
approx_opts.files_size_error_margin = 0.1;
meta->compensated_range_deletion_size += versions->ApproximateSize(
approx_opts, version, kv.first.Encode(), tombstone_end.Encode(),
0 /* start_level */, -1 /* end_level */,
TableReaderCaller::kFlush);
}
}
}
@ -287,9 +264,8 @@ Status BuildTable(
TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable");
const bool empty = builder->IsEmpty();
if (num_input_entries != nullptr) {
assert(c_iter.HasNumInputEntryScanned());
*num_input_entries =
c_iter.NumInputEntryScanned() + num_unfragmented_tombstones;
c_iter.num_input_entry_scanned() + num_unfragmented_tombstones;
}
if (!s.ok() || empty) {
builder->Abandon();
@ -312,10 +288,7 @@ Status BuildTable(
if (s.ok() && !empty) {
uint64_t file_size = builder->FileSize();
meta->fd.file_size = file_size;
meta->tail_size = builder->GetTailSize();
meta->marked_for_compaction = builder->NeedCompact();
meta->user_defined_timestamps_persisted =
ioptions.persist_user_defined_timestamps;
assert(meta->fd.GetFileSize() > 0);
tp = builder
->GetTableProperties(); // refresh now that builder is finished
@ -375,8 +348,6 @@ Status BuildTable(
s = *io_status;
}
// TODO(yuzhangyu): handle the key copy in the blob when ts should be
// stripped.
if (blob_file_builder) {
if (s.ok()) {
s = blob_file_builder->Finish();
@ -395,6 +366,7 @@ Status BuildTable(
// here because this is a special case after we finish the table building.
// No matter whether use_direct_io_for_flush_and_compaction is true,
// the goal is to cache it here for further user reads.
ReadOptions read_options;
std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
read_options, file_options, tboptions.internal_comparator, *meta,
nullptr /* range_del_agg */, mutable_cf_options.prefix_extractor,
@ -406,8 +378,7 @@ Status BuildTable(
MaxFileSizeForL0MetaPin(mutable_cf_options),
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key*/ nullptr,
/*allow_unprepared_value*/ false,
mutable_cf_options.block_protection_bytes_per_key));
/*allow_unprepared_value*/ false));
s = it->status();
if (s.ok() && paranoid_file_checks) {
OutputValidator file_validator(tboptions.internal_comparator,

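The flush-path hunks above add timestamp stripping: when user-defined timestamps are not persisted, each key is rewritten as if it carried the minimum timestamp before being fed to boundary tracking, the output validator, and the block builder. A standalone sketch of that replacement, assuming a fixed-width timestamp suffix on the user key (the real helper, ReplaceInternalKeyWithMinTimestamp, operates on internal keys and is not reproduced here):

// --- illustrative sketch, not part of the diff ---
#include <cstddef>
#include <iostream>
#include <string>

// Toy model: a "user key" carries a fixed-width timestamp suffix. Replacing
// the suffix with zero bytes makes the key compare as if it were written at
// the minimum timestamp.
std::string ReplaceWithMinTimestamp(const std::string& user_key_with_ts,
                                    std::size_t ts_sz) {
  std::string out = user_key_with_ts;
  if (ts_sz > 0 && out.size() >= ts_sz) {
    out.replace(out.size() - ts_sz, ts_sz, std::string(ts_sz, '\0'));
  }
  return out;
}

int main() {
  const std::size_t ts_sz = 8;
  std::string key = "user_key";
  key.append(7, '\0');
  key.push_back('\x2a');  // pretend this 8-byte suffix encodes timestamp 42
  const std::string stripped = ReplaceWithMinTimestamp(key, ts_sz);
  std::cout << (stripped == "user_key" + std::string(ts_sz, '\0')) << "\n";  // 1
  return 0;
}
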
@ -53,8 +53,8 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
extern Status BuildTable(
const std::string& dbname, VersionSet* versions,
const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
const FileOptions& file_options, const ReadOptions& read_options,
TableCache* table_cache, InternalIterator* iter,
const FileOptions& file_options, TableCache* table_cache,
InternalIterator* iter,
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
range_del_iters,
FileMetaData* meta, std::vector<BlobFileAddition>* blob_file_additions,

@ -7,6 +7,8 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef ROCKSDB_LITE
#include "rocksdb/c.h"
#include <cstdlib>
@ -15,7 +17,7 @@
#include <vector>
#include "port/port.h"
#include "rocksdb/advanced_cache.h"
#include "rocksdb/cache.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/comparator.h"
#include "rocksdb/convenience.h"
@ -46,7 +48,6 @@
#include "rocksdb/utilities/write_batch_with_index.h"
#include "rocksdb/write_batch.h"
#include "utilities/merge_operators.h"
#include "rocksdb/env_encryption.h"
using ROCKSDB_NAMESPACE::BackupEngine;
using ROCKSDB_NAMESPACE::BackupEngineOptions;
@ -68,7 +69,6 @@ using ROCKSDB_NAMESPACE::CompactionOptionsFIFO;
using ROCKSDB_NAMESPACE::CompactRangeOptions;
using ROCKSDB_NAMESPACE::Comparator;
using ROCKSDB_NAMESPACE::CompressionType;
using ROCKSDB_NAMESPACE::ConfigOptions;
using ROCKSDB_NAMESPACE::CuckooTableOptions;
using ROCKSDB_NAMESPACE::DB;
using ROCKSDB_NAMESPACE::DBOptions;
@ -78,8 +78,6 @@ using ROCKSDB_NAMESPACE::EnvOptions;
using ROCKSDB_NAMESPACE::FileLock;
using ROCKSDB_NAMESPACE::FilterPolicy;
using ROCKSDB_NAMESPACE::FlushOptions;
using ROCKSDB_NAMESPACE::HistogramData;
using ROCKSDB_NAMESPACE::HyperClockCacheOptions;
using ROCKSDB_NAMESPACE::InfoLogLevel;
using ROCKSDB_NAMESPACE::IngestExternalFileOptions;
using ROCKSDB_NAMESPACE::Iterator;
@ -210,9 +208,6 @@ struct rocksdb_logger_t {
struct rocksdb_lru_cache_options_t {
LRUCacheOptions rep;
};
struct rocksdb_hyper_clock_cache_options_t {
HyperClockCacheOptions rep;
};
struct rocksdb_memory_allocator_t {
std::shared_ptr<MemoryAllocator> rep;
};
@ -281,11 +276,6 @@ struct rocksdb_compactionfiltercontext_t {
CompactionFilter::Context rep;
};
struct rocksdb_statistics_histogram_data_t {
rocksdb_statistics_histogram_data_t() : rep() {}
HistogramData rep;
};
struct rocksdb_compactionfilter_t : public CompactionFilter {
void* state_;
void (*destructor_)(void*);
@ -1064,36 +1054,6 @@ rocksdb_column_family_handle_t* rocksdb_create_column_family(
return handle;
}
rocksdb_column_family_handle_t** rocksdb_create_column_families(
rocksdb_t* db, const rocksdb_options_t* column_family_options,
int num_column_families, const char* const* column_family_names,
size_t* lencfs, char** errptr) {
std::vector<ColumnFamilyHandle*> handles;
std::vector<std::string> names;
for (int i = 0; i != num_column_families; ++i) {
names.push_back(std::string(column_family_names[i]));
}
SaveError(errptr, db->rep->CreateColumnFamilies(
ColumnFamilyOptions(column_family_options->rep), names,
&handles));
*lencfs = handles.size();
rocksdb_column_family_handle_t** c_handles =
static_cast<rocksdb_column_family_handle_t**>(
malloc(sizeof(rocksdb_column_family_handle_t*) * handles.size()));
for (size_t i = 0; i != handles.size(); ++i) {
c_handles[i] = new rocksdb_column_family_handle_t;
c_handles[i]->rep = handles[i];
}
return c_handles;
}
void rocksdb_create_column_families_destroy(
rocksdb_column_family_handle_t** list) {
free(list);
}
rocksdb_column_family_handle_t* rocksdb_create_column_family_with_ttl(
rocksdb_t* db, const rocksdb_options_t* column_family_options,
const char* column_family_name, int ttl, char** errptr) {
@ -1845,17 +1805,6 @@ void rocksdb_flush_cf(rocksdb_t* db, const rocksdb_flushoptions_t* options,
SaveError(errptr, db->rep->Flush(options->rep, column_family->rep));
}
void rocksdb_flush_cfs(rocksdb_t* db, const rocksdb_flushoptions_t* options,
rocksdb_column_family_handle_t** column_families,
int num_column_families, char** errptr) {
std::vector<ColumnFamilyHandle*> column_family_handles;
for (int i = 0; i < num_column_families; i++) {
column_family_handles.push_back(column_families[i]->rep);
}
SaveError(errptr, db->rep->Flush(options->rep, column_family_handles));
}
void rocksdb_flush_wal(rocksdb_t* db, unsigned char sync, char** errptr) {
SaveError(errptr, db->rep->FlushWAL(sync));
}
@ -2549,12 +2498,8 @@ void rocksdb_load_latest_options(
rocksdb_options_t*** list_column_family_options, char** errptr) {
DBOptions db_opt;
std::vector<ColumnFamilyDescriptor> cf_descs;
ConfigOptions config_opts;
config_opts.ignore_unknown_options = ignore_unknown_options;
config_opts.input_strings_escaped = true;
config_opts.env = env->rep;
Status s = LoadLatestOptions(config_opts, std::string(db_path), &db_opt,
&cf_descs, &cache->rep);
Status s = LoadLatestOptions(std::string(db_path), env->rep, &db_opt,
&cf_descs, ignore_unknown_options, &cache->rep);
if (s.ok()) {
char** cf_names = (char**)malloc(cf_descs.size() * sizeof(char*));
rocksdb_options_t** cf_options = (rocksdb_options_t**)malloc(
@ -2675,6 +2620,14 @@ void rocksdb_block_based_options_set_block_cache(
}
}
void rocksdb_block_based_options_set_block_cache_compressed(
rocksdb_block_based_table_options_t* options,
rocksdb_cache_t* block_cache_compressed) {
if (block_cache_compressed) {
options->rep.block_cache_compressed = block_cache_compressed->rep;
}
}
void rocksdb_block_based_options_set_whole_key_filtering(
rocksdb_block_based_table_options_t* options, unsigned char v) {
options->rep.whole_key_filtering = v;
@ -3030,29 +2983,6 @@ void rocksdb_options_enable_statistics(rocksdb_options_t* opt) {
opt->rep.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
}
void rocksdb_options_set_statistics_level(rocksdb_options_t* opt, int level) {
if (!opt->rep.statistics) {
return;
}
if (level < rocksdb_statistics_level_disable_all) {
level = rocksdb_statistics_level_disable_all;
}
if (level > rocksdb_statistics_level_all) {
level = rocksdb_statistics_level_all;
}
opt->rep.statistics->set_stats_level(
static_cast<ROCKSDB_NAMESPACE::StatsLevel>(level));
}
int rocksdb_options_get_statistics_level(rocksdb_options_t* opt) {
if (!opt->rep.statistics) {
return ROCKSDB_NAMESPACE::StatsLevel::kDisableAll;
}
return static_cast<int>(opt->rep.statistics->get_stats_level());
}
void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
unsigned char val) {
opt->rep.skip_stats_update_on_db_open = val;
@ -3800,21 +3730,16 @@ void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t* opt,
ROCKSDB_NAMESPACE::NewHashLinkListRepFactory(bucket_count));
}
void rocksdb_options_set_plain_table_factory(
rocksdb_options_t* opt, uint32_t user_key_len, int bloom_bits_per_key,
double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size,
char encoding_type, unsigned char full_scan_mode,
unsigned char store_index_in_file) {
void rocksdb_options_set_plain_table_factory(rocksdb_options_t* opt,
uint32_t user_key_len,
int bloom_bits_per_key,
double hash_table_ratio,
size_t index_sparseness) {
ROCKSDB_NAMESPACE::PlainTableOptions options;
options.user_key_len = user_key_len;
options.bloom_bits_per_key = bloom_bits_per_key;
options.hash_table_ratio = hash_table_ratio;
options.index_sparseness = index_sparseness;
options.huge_page_tlb_size = huge_page_tlb_size;
options.encoding_type =
static_cast<ROCKSDB_NAMESPACE::EncodingType>(encoding_type);
options.full_scan_mode = full_scan_mode;
options.store_index_in_file = store_index_in_file;
ROCKSDB_NAMESPACE::TableFactory* factory =
ROCKSDB_NAMESPACE::NewPlainTableFactory(options);
@ -3892,26 +3817,6 @@ char* rocksdb_options_statistics_get_string(rocksdb_options_t* opt) {
return nullptr;
}
uint64_t rocksdb_options_statistics_get_ticker_count(rocksdb_options_t* opt,
uint32_t ticker_type) {
ROCKSDB_NAMESPACE::Statistics* statistics = opt->rep.statistics.get();
if (statistics) {
return statistics->getTickerCount(ticker_type);
}
return 0;
}
void rocksdb_options_statistics_get_histogram_data(
rocksdb_options_t* opt, uint32_t type,
rocksdb_statistics_histogram_data_t* const data) {
ROCKSDB_NAMESPACE::Statistics* statistics = opt->rep.statistics.get();
if (statistics) {
statistics->histogramData(type, &data->rep);
} else {
*data = rocksdb_statistics_histogram_data_t{};
}
}
void rocksdb_options_set_ratelimiter(rocksdb_options_t* opt,
rocksdb_ratelimiter_t* limiter) {
if (limiter) {
@ -3973,15 +3878,6 @@ void rocksdb_options_add_compact_on_deletion_collector_factory(
opt->rep.table_properties_collector_factories.emplace_back(compact_on_del);
}
void rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio(
rocksdb_options_t* opt, size_t window_size, size_t num_dels_trigger,
double deletion_ratio) {
std::shared_ptr<ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory>
compact_on_del = NewCompactOnDeletionCollectorFactory(
window_size, num_dels_trigger, deletion_ratio);
opt->rep.table_properties_collector_factories.emplace_back(compact_on_del);
}
void rocksdb_set_perf_level(int v) {
PerfLevel level = static_cast<PerfLevel>(v);
SetPerfLevel(level);
@ -4158,8 +4054,6 @@ uint64_t rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context,
return rep->blob_decompress_time;
case rocksdb_internal_range_del_reseek_count:
return rep->internal_range_del_reseek_count;
case rocksdb_block_read_cpu_time:
return rep->block_read_cpu_time;
default:
break;
}
@ -4555,15 +4449,6 @@ rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t* opt) {
return opt->rep.io_timeout.count();
}
void rocksdb_readoptions_set_async_io(rocksdb_readoptions_t* opt,
unsigned char v) {
opt->rep.async_io = v;
}
unsigned char rocksdb_readoptions_get_async_io(rocksdb_readoptions_t* opt) {
return opt->rep.async_io;
}
void rocksdb_readoptions_set_timestamp(rocksdb_readoptions_t* opt,
const char* ts, size_t tslen) {
if (ts == nullptr) {
@ -4775,59 +4660,12 @@ rocksdb_cache_t* rocksdb_cache_create_lru_with_strict_capacity_limit(
}
rocksdb_cache_t* rocksdb_cache_create_lru_opts(
const rocksdb_lru_cache_options_t* opt) {
rocksdb_lru_cache_options_t* opt) {
rocksdb_cache_t* c = new rocksdb_cache_t;
c->rep = NewLRUCache(opt->rep);
return c;
}
rocksdb_hyper_clock_cache_options_t* rocksdb_hyper_clock_cache_options_create(
size_t capacity, size_t estimated_entry_charge) {
return new rocksdb_hyper_clock_cache_options_t{
HyperClockCacheOptions(capacity, estimated_entry_charge)};
}
void rocksdb_hyper_clock_cache_options_destroy(
rocksdb_hyper_clock_cache_options_t* opt) {
delete opt;
}
void rocksdb_hyper_clock_cache_options_set_capacity(
rocksdb_hyper_clock_cache_options_t* opts, size_t capacity) {
opts->rep.capacity = capacity;
}
void rocksdb_hyper_clock_cache_options_set_estimated_entry_charge(
rocksdb_hyper_clock_cache_options_t* opts, size_t estimated_entry_charge) {
opts->rep.estimated_entry_charge = estimated_entry_charge;
}
void rocksdb_hyper_clock_cache_options_set_num_shard_bits(
rocksdb_hyper_clock_cache_options_t* opts, int num_shard_bits) {
opts->rep.num_shard_bits = num_shard_bits;
}
void rocksdb_hyper_clock_cache_options_set_memory_allocator(
rocksdb_hyper_clock_cache_options_t* opts,
rocksdb_memory_allocator_t* memory_allocator) {
opts->rep.memory_allocator = memory_allocator->rep;
}
rocksdb_cache_t* rocksdb_cache_create_hyper_clock(
size_t capacity, size_t estimated_entry_charge) {
HyperClockCacheOptions opts(capacity, estimated_entry_charge);
rocksdb_cache_t* c = new rocksdb_cache_t;
c->rep = opts.MakeSharedCache();
return c;
}
rocksdb_cache_t* rocksdb_cache_create_hyper_clock_opts(
const rocksdb_hyper_clock_cache_options_t* opts) {
rocksdb_cache_t* c = new rocksdb_cache_t;
c->rep = opts->rep.MakeSharedCache();
return c;
}
void rocksdb_cache_destroy(rocksdb_cache_t* cache) { delete cache; }
void rocksdb_cache_disown_data(rocksdb_cache_t* cache) {
@ -4838,26 +4676,18 @@ void rocksdb_cache_set_capacity(rocksdb_cache_t* cache, size_t capacity) {
cache->rep->SetCapacity(capacity);
}
size_t rocksdb_cache_get_capacity(const rocksdb_cache_t* cache) {
size_t rocksdb_cache_get_capacity(rocksdb_cache_t* cache) {
return cache->rep->GetCapacity();
}
size_t rocksdb_cache_get_usage(const rocksdb_cache_t* cache) {
size_t rocksdb_cache_get_usage(rocksdb_cache_t* cache) {
return cache->rep->GetUsage();
}
size_t rocksdb_cache_get_pinned_usage(const rocksdb_cache_t* cache) {
size_t rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache) {
return cache->rep->GetPinnedUsage();
}
size_t rocksdb_cache_get_table_address_count(const rocksdb_cache_t* cache) {
return cache->rep->GetTableAddressCount();
}
size_t rocksdb_cache_get_occupancy_count(const rocksdb_cache_t* cache) {
return cache->rep->GetOccupancyCount();
}
rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path,
uint64_t target_size) {
rocksdb_dbpath_t* result = new rocksdb_dbpath_t;
@ -4875,20 +4705,6 @@ rocksdb_env_t* rocksdb_create_default_env() {
return result;
}
rocksdb_env_t* rocksdb_create_encrypted_env(const char* key) {
rocksdb_env_t* result = new rocksdb_env_t;
std::shared_ptr<rocksdb::EncryptionProvider> provider;
Status status = rocksdb::EncryptionProvider::CreateFromString(
ConfigOptions(), "ippcp", &provider);
assert(status.ok());
status =
provider->AddCipher("", key, 32, false);
assert(status.ok());
result->rep = NewEncryptedEnv(Env::Default(), provider);
result->is_default = true;
return result;
}
rocksdb_env_t* rocksdb_create_mem_env() {
rocksdb_env_t* result = new rocksdb_env_t;
result->rep = ROCKSDB_NAMESPACE::NewMemEnv(Env::Default());
@ -5090,12 +4906,6 @@ void rocksdb_ingestexternalfileoptions_set_ingest_behind(
opt->rep.ingest_behind = ingest_behind;
}
void rocksdb_ingestexternalfileoptions_set_fail_if_not_bottommost_level(
rocksdb_ingestexternalfileoptions_t* opt,
unsigned char fail_if_not_bottommost_level) {
opt->rep.fail_if_not_bottommost_level = fail_if_not_bottommost_level;
}
void rocksdb_ingestexternalfileoptions_destroy(
rocksdb_ingestexternalfileoptions_t* opt) {
delete opt;
@ -5257,17 +5067,6 @@ rocksdb_fifo_compaction_options_t* rocksdb_fifo_compaction_options_create() {
return result;
}
void rocksdb_fifo_compaction_options_set_allow_compaction(
rocksdb_fifo_compaction_options_t* fifo_opts,
unsigned char allow_compaction) {
fifo_opts->rep.allow_compaction = allow_compaction;
}
unsigned char rocksdb_fifo_compaction_options_get_allow_compaction(
rocksdb_fifo_compaction_options_t* fifo_opts) {
return fifo_opts->rep.allow_compaction;
}
void rocksdb_fifo_compaction_options_set_max_table_files_size(
rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size) {
fifo_opts->rep.max_table_files_size = size;
@ -5688,20 +5487,6 @@ int rocksdb_transactiondb_property_int(rocksdb_transactiondb_t* db,
}
}
rocksdb_t* rocksdb_transactiondb_get_base_db(rocksdb_transactiondb_t* txn_db) {
DB* base_db = txn_db->rep->GetBaseDB();
if (base_db != nullptr) {
rocksdb_t* result = new rocksdb_t;
result->rep = base_db;
return result;
}
return nullptr;
}
void rocksdb_transactiondb_close_base_db(rocksdb_t* base_db) { delete base_db; }
rocksdb_transaction_t* rocksdb_transaction_begin(
rocksdb_transactiondb_t* txn_db,
const rocksdb_writeoptions_t* write_options,
@ -5986,35 +5771,6 @@ void rocksdb_transaction_multi_get(rocksdb_transaction_t* txn,
}
}
void rocksdb_transaction_multi_get_for_update(
rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
size_t num_keys, const char* const* keys_list,
const size_t* keys_list_sizes, char** values_list,
size_t* values_list_sizes, char** errs) {
std::vector<Slice> keys(num_keys);
for (size_t i = 0; i < num_keys; i++) {
keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
}
std::vector<std::string> values(num_keys);
std::vector<Status> statuses =
txn->rep->MultiGetForUpdate(options->rep, keys, &values);
for (size_t i = 0; i < num_keys; i++) {
if (statuses[i].ok()) {
values_list[i] = CopyString(values[i]);
values_list_sizes[i] = values[i].size();
errs[i] = nullptr;
} else {
values_list[i] = nullptr;
values_list_sizes[i] = 0;
if (!statuses[i].IsNotFound()) {
errs[i] = strdup(statuses[i].ToString().c_str());
} else {
errs[i] = nullptr;
}
}
}
}
void rocksdb_transaction_multi_get_cf(
rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
const rocksdb_column_family_handle_t* const* column_families,
@ -6047,38 +5803,6 @@ void rocksdb_transaction_multi_get_cf(
}
}
void rocksdb_transaction_multi_get_for_update_cf(
rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
const rocksdb_column_family_handle_t* const* column_families,
size_t num_keys, const char* const* keys_list,
const size_t* keys_list_sizes, char** values_list,
size_t* values_list_sizes, char** errs) {
std::vector<Slice> keys(num_keys);
std::vector<ColumnFamilyHandle*> cfs(num_keys);
for (size_t i = 0; i < num_keys; i++) {
keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
cfs[i] = column_families[i]->rep;
}
std::vector<std::string> values(num_keys);
std::vector<Status> statuses =
txn->rep->MultiGetForUpdate(options->rep, cfs, keys, &values);
for (size_t i = 0; i < num_keys; i++) {
if (statuses[i].ok()) {
values_list[i] = CopyString(values[i]);
values_list_sizes[i] = values[i].size();
errs[i] = nullptr;
} else {
values_list[i] = nullptr;
values_list_sizes[i] = 0;
if (!statuses[i].IsNotFound()) {
errs[i] = strdup(statuses[i].ToString().c_str());
} else {
errs[i] = nullptr;
}
}
}
}
// Read a key outside a transaction
char* rocksdb_transactiondb_get(rocksdb_transactiondb_t* txn_db,
const rocksdb_readoptions_t* options,
@ -6380,18 +6104,6 @@ void rocksdb_transactiondb_flush_cf(
SaveError(errptr, txn_db->rep->Flush(options->rep, column_family->rep));
}
void rocksdb_transactiondb_flush_cfs(
rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options,
rocksdb_column_family_handle_t** column_families, int num_column_families,
char** errptr) {
std::vector<ColumnFamilyHandle*> column_family_handles;
for (int i = 0; i < num_column_families; i++) {
column_family_handles.push_back(column_families[i]->rep);
}
SaveError(errptr, txn_db->rep->Flush(options->rep, column_family_handles));
}
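A short usage sketch for the multi-CF flush wrapper above, reusing the flush-options helpers exercised in c_test.c below and assuming the usual <stdio.h>/<stdlib.h> includes; cf_default and cf_other are placeholders for handles the caller already owns, and the trailing destroy call is the C API's usual counterpart to rocksdb_flushoptions_create:
rocksdb_flushoptions_t* fo = rocksdb_flushoptions_create();
rocksdb_flushoptions_set_wait(fo, 1); /* block until the flush completes */
rocksdb_column_family_handle_t* cfs[2] = {cf_default, cf_other};
char* err = NULL;
rocksdb_transactiondb_flush_cfs(txn_db, fo, cfs, 2, &err);
if (err != NULL) {
  fprintf(stderr, "flush failed: %s\n", err);
  free(err);
}
rocksdb_flushoptions_destroy(fo);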
rocksdb_checkpoint_t* rocksdb_transactiondb_checkpoint_object_create(
rocksdb_transactiondb_t* txn_db, char** errptr) {
Checkpoint* checkpoint;
@ -6679,59 +6391,6 @@ void rocksdb_enable_manual_compaction(rocksdb_t* db) {
db->rep->EnableManualCompaction();
}
rocksdb_statistics_histogram_data_t*
rocksdb_statistics_histogram_data_create() {
return new rocksdb_statistics_histogram_data_t{};
}
void rocksdb_statistics_histogram_data_destroy(
rocksdb_statistics_histogram_data_t* data) {
delete data;
}
double rocksdb_statistics_histogram_data_get_median(
rocksdb_statistics_histogram_data_t* data) {
return data->rep.median;
}
double rocksdb_statistics_histogram_data_get_p95(
rocksdb_statistics_histogram_data_t* data) {
return data->rep.percentile95;
}
double rocksdb_statistics_histogram_data_get_p99(
rocksdb_statistics_histogram_data_t* data) {
return data->rep.percentile99;
}
double rocksdb_statistics_histogram_data_get_average(
rocksdb_statistics_histogram_data_t* data) {
return data->rep.average;
}
double rocksdb_statistics_histogram_data_get_std_dev(
rocksdb_statistics_histogram_data_t* data) {
return data->rep.standard_deviation;
}
double rocksdb_statistics_histogram_data_get_max(
rocksdb_statistics_histogram_data_t* data) {
return data->rep.max;
}
uint64_t rocksdb_statistics_histogram_data_get_count(
rocksdb_statistics_histogram_data_t* data) {
return data->rep.count;
}
uint64_t rocksdb_statistics_histogram_data_get_sum(
rocksdb_statistics_histogram_data_t* data) {
return data->rep.sum;
}
double rocksdb_statistics_histogram_data_get_min(
rocksdb_statistics_histogram_data_t* data) {
return data->rep.min;
}
} // end extern "C"
#endif // !ROCKSDB_LITE
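Putting the histogram accessors above together with the statistics plumbing exercised in c_test.c below, a typical read-out looks like this; `options` is assumed to be the rocksdb_options_t* the DB was opened with, and 1 mirrors the DB_WRITE histogram id used by the test:
rocksdb_options_enable_statistics(options);
rocksdb_options_set_statistics_level(options, rocksdb_statistics_level_all);
/* ... open the DB with `options` and perform some writes ... */
rocksdb_statistics_histogram_data_t* hist =
    rocksdb_statistics_histogram_data_create();
rocksdb_options_statistics_get_histogram_data(options, 1 /* DB_WRITE */, hist);
printf("db.write p95=%f p99=%f count=%llu\n",
       rocksdb_statistics_histogram_data_get_p95(hist),
       rocksdb_statistics_histogram_data_get_p99(hist),
       (unsigned long long)rocksdb_statistics_histogram_data_get_count(hist));
rocksdb_statistics_histogram_data_destroy(hist);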

@ -3,14 +3,17 @@
found in the LICENSE file. See the AUTHORS file for names of contributors. */
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include "rocksdb/c.h"
#include <stdio.h>
#ifndef ROCKSDB_LITE // Lite does not support C API
#include <assert.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include "rocksdb/c.h"
#ifndef OS_WIN
#include <unistd.h>
#endif
@ -487,19 +490,6 @@ static void CheckTxnPinGetCF(rocksdb_transaction_t* txn,
rocksdb_pinnableslice_destroy(p);
}
static void CheckTxnGetForUpdate(rocksdb_transaction_t* txn,
const rocksdb_readoptions_t* options,
const char* key, const char* expected) {
char* err = NULL;
size_t val_len;
char* val;
val = rocksdb_transaction_get_for_update(txn, options, key, strlen(key),
&val_len, true, &err);
CheckNoError(err);
CheckEqual(expected, val, val_len);
Free(&val);
}
static void CheckTxnDBGet(rocksdb_transactiondb_t* txn_db,
const rocksdb_readoptions_t* options, const char* key,
const char* expected) {
@ -527,20 +517,6 @@ static void CheckTxnDBGetCF(rocksdb_transactiondb_t* txn_db,
Free(&val);
}
static void CheckTxnGetForUpdateCF(
rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
rocksdb_column_family_handle_t* column_family, const char* key,
const char* expected) {
char* err = NULL;
size_t val_len;
char* val;
val = rocksdb_transaction_get_for_update_cf(
txn, options, column_family, key, strlen(key), &val_len, true, &err);
CheckNoError(err);
CheckEqual(expected, val, val_len);
Free(&val);
}
static void CheckTxnDBPinGet(rocksdb_transactiondb_t* txn_db,
const rocksdb_readoptions_t* options,
const char* key, const char* expected) {
@ -720,8 +696,6 @@ int main(int argc, char** argv) {
rocksdb_options_add_compact_on_deletion_collector_factory(options, 10000,
10001);
rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio(
options, 10000, 10001, 0.0);
StartPhase("destroy");
rocksdb_destroy_db(options, dbname, &err);
@ -1673,8 +1647,7 @@ int main(int argc, char** argv) {
rocksdb_options_set_prefix_extractor(
options, rocksdb_slicetransform_create_fixed_prefix(3));
rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4);
rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16, 0, 0, 0,
0);
rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16);
rocksdb_options_set_allow_concurrent_memtable_write(options, 0);
db = rocksdb_open(options, dbname, &err);
@ -2060,15 +2033,6 @@ int main(int argc, char** argv) {
CheckCondition(29.0 ==
rocksdb_options_get_experimental_mempurge_threshold(o));
CheckCondition(rocksdb_statistics_level_disable_all ==
rocksdb_options_get_statistics_level(o));
rocksdb_options_enable_statistics(o);
CheckCondition(rocksdb_statistics_level_disable_all !=
rocksdb_options_get_statistics_level(o));
rocksdb_options_set_statistics_level(o, rocksdb_statistics_level_all);
CheckCondition(rocksdb_statistics_level_all ==
rocksdb_options_get_statistics_level(o));
/* Blob Options */
rocksdb_options_set_enable_blob_files(o, 1);
CheckCondition(1 == rocksdb_options_get_enable_blob_files(o));
@ -2608,9 +2572,6 @@ int main(int argc, char** argv) {
rocksdb_readoptions_set_io_timeout(ro, 400);
CheckCondition(400 == rocksdb_readoptions_get_io_timeout(ro));
rocksdb_readoptions_set_async_io(ro, 1);
CheckCondition(1 == rocksdb_readoptions_get_async_io(ro));
rocksdb_readoptions_destroy(ro);
}
@ -3130,17 +3091,6 @@ int main(int argc, char** argv) {
CheckTxnDBGetCF(txn_db, roptions, cfh, "cf_foo", NULL);
CheckTxnDBPinGetCF(txn_db, roptions, cfh, "cf_foo", NULL);
// memory usage
rocksdb_t* base_db = rocksdb_transactiondb_get_base_db(txn_db);
rocksdb_memory_consumers_t* consumers = rocksdb_memory_consumers_create();
rocksdb_memory_consumers_add_db(consumers, base_db);
rocksdb_memory_usage_t* usage =
rocksdb_approximate_memory_usage_create(consumers, &err);
CheckNoError(err);
rocksdb_approximate_memory_usage_destroy(usage);
rocksdb_memory_consumers_destroy(consumers);
rocksdb_transactiondb_close_base_db(base_db);
// flush
rocksdb_flushoptions_t* flush_options = rocksdb_flushoptions_create();
rocksdb_flushoptions_set_wait(flush_options, 1);
@ -3253,120 +3203,6 @@ int main(int argc, char** argv) {
rocksdb_transactiondb_options_destroy(txn_db_options);
}
StartPhase("transactions_multi_get_for_update");
{
// open a TransactionDB
txn_db_options = rocksdb_transactiondb_options_create();
rocksdb_transactiondb_options_set_transaction_lock_timeout(txn_db_options,
0);
txn_options = rocksdb_transaction_options_create();
rocksdb_options_set_create_if_missing(options, 1);
txn_db = rocksdb_transactiondb_open(options, txn_db_options, dbname, &err);
CheckNoError(err);
rocksdb_transactiondb_put(txn_db, woptions, "foo", 3, "hey", 3, &err);
CheckNoError(err);
rocksdb_transactiondb_put(txn_db, woptions, "bar", 3, "hello", 5, &err);
CheckNoError(err);
// begin transactions
txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
rocksdb_transaction_t* txn2 =
rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
// multi get
{
const char* keys[2] = {"foo", "bar"};
const size_t keys_sizes[2] = {3, 3};
char* vals[2];
size_t vals_sizes[2];
char* errs[2];
const char* expected[2] = {"hey", "hello"};
rocksdb_transaction_multi_get_for_update(
txn, roptions, 2, keys, keys_sizes, vals, vals_sizes, errs);
CheckMultiGetValues(2, vals, vals_sizes, errs, expected);
}
char* conflict_err = NULL;
size_t val_len;
rocksdb_transaction_get_for_update(txn2, roptions, "foo", 3, &val_len, true,
&conflict_err);
// get-for-update conflict
CheckCondition(conflict_err != NULL);
Free(&conflict_err);
// commit
rocksdb_transaction_commit(txn, &err);
CheckNoError(err);
// should work after the first tx is committed
CheckTxnGetForUpdate(txn2, roptions, "foo", "hey");
// commit the second one
rocksdb_transaction_commit(txn2, &err);
CheckNoError(err);
// destroy txns
rocksdb_transaction_destroy(txn);
rocksdb_transaction_destroy(txn2);
// same for column families
rocksdb_column_family_handle_t* cfh;
cfh = rocksdb_transactiondb_create_column_family(txn_db, options,
"txn_db_cf", &err);
CheckNoError(err);
rocksdb_transactiondb_put_cf(txn_db, woptions, cfh, "cf_foo", 6, "cf_hello",
8, &err);
CheckNoError(err);
rocksdb_transactiondb_put_cf(txn_db, woptions, cfh, "cf_bar", 6, "cf_hey",
6, &err);
CheckNoError(err);
txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
txn2 = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
{
const rocksdb_column_family_handle_t* get_handles[2] = {cfh, cfh};
const char* keys[2] = {"cf_foo", "cf_bar"};
const size_t keys_sizes[2] = {6, 6};
char* vals[2];
size_t vals_sizes[2];
char* errs[2];
const char* expected[2] = {"cf_hello", "cf_hey"};
rocksdb_transaction_multi_get_for_update_cf(txn, roptions, get_handles, 2,
keys, keys_sizes, vals,
vals_sizes, errs);
CheckMultiGetValues(2, vals, vals_sizes, errs, expected);
}
char* conflict_err_cf = NULL;
size_t val_len_cf;
rocksdb_transaction_get_for_update_cf(txn2, roptions, cfh, "cf_foo", 6,
&val_len_cf, true, &conflict_err_cf);
CheckCondition(conflict_err_cf != NULL);
Free(&conflict_err_cf);
rocksdb_transaction_commit(txn, &err);
CheckNoError(err);
CheckTxnGetForUpdateCF(txn2, roptions, cfh, "cf_foo", "cf_hello");
rocksdb_transaction_commit(txn2, &err);
CheckNoError(err);
// close and destroy
rocksdb_column_family_handle_destroy(cfh);
rocksdb_transaction_destroy(txn);
rocksdb_transaction_destroy(txn2);
rocksdb_transactiondb_close(txn_db);
rocksdb_destroy_db(options, dbname, &err);
CheckNoError(err);
rocksdb_transaction_options_destroy(txn_options);
rocksdb_transactiondb_options_destroy(txn_db_options);
}
StartPhase("optimistic_transactions");
{
rocksdb_options_t* db_options = rocksdb_options_create();
@ -3396,19 +3232,8 @@ int main(int argc, char** argv) {
rocksdb_put(db, woptions, "key", 3, "value", 5, &err);
CheckNoError(err);
rocksdb_column_family_handle_t *cfh1, *cfh2;
char** list_const_cf_names = (char**)malloc(2 * sizeof(char*));
list_const_cf_names[0] = "txn_db_cf1";
list_const_cf_names[1] = "txn_db_cf2";
size_t cflen;
rocksdb_column_family_handle_t** list_cfh = rocksdb_create_column_families(
db, db_options, 2, (const char* const*)list_const_cf_names, &cflen,
&err);
free(list_const_cf_names);
CheckNoError(err);
assert(cflen == 2);
cfh1 = list_cfh[0];
cfh2 = list_cfh[1];
rocksdb_create_column_families_destroy(list_cfh);
cfh1 = rocksdb_create_column_family(db, db_options, "txn_db_cf1", &err);
cfh2 = rocksdb_create_column_family(db, db_options, "txn_db_cf2", &err);
txn = rocksdb_optimistictransaction_begin(otxn_db, woptions, otxn_options,
NULL);
rocksdb_transaction_put_cf(txn, cfh1, "key_cf1", 7, "val_cf1", 7, &err);
@ -3622,71 +3447,6 @@ int main(int argc, char** argv) {
rocksdb_readoptions_destroy(ropts);
}
StartPhase("statistics");
{
const uint32_t BYTES_WRITTEN_TICKER = 40;
const uint32_t DB_WRITE_HIST = 1;
rocksdb_statistics_histogram_data_t* hist =
rocksdb_statistics_histogram_data_create();
{
// zero by default
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_median(hist));
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_p95(hist));
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_p99(hist));
CheckCondition(0.0 ==
rocksdb_statistics_histogram_data_get_average(hist));
CheckCondition(0.0 ==
rocksdb_statistics_histogram_data_get_std_dev(hist));
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_max(hist));
CheckCondition(0 == rocksdb_statistics_histogram_data_get_count(hist));
CheckCondition(0 == rocksdb_statistics_histogram_data_get_sum(hist));
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_min(hist));
}
rocksdb_close(db);
rocksdb_destroy_db(options, dbname, &err);
CheckNoError(err);
rocksdb_options_enable_statistics(options);
rocksdb_options_set_statistics_level(options, rocksdb_statistics_level_all);
db = rocksdb_open(options, dbname, &err);
CheckNoError(err);
CheckCondition(0 == rocksdb_options_statistics_get_ticker_count(
options, BYTES_WRITTEN_TICKER));
rocksdb_options_statistics_get_histogram_data(options, DB_WRITE_HIST, hist);
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_median(hist));
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_p95(hist));
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_p99(hist));
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_average(hist));
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_std_dev(hist));
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_max(hist));
CheckCondition(0 == rocksdb_statistics_histogram_data_get_count(hist));
CheckCondition(0 == rocksdb_statistics_histogram_data_get_sum(hist));
int i;
for (i = 0; i < 10; ++i) {
char key = '0' + (char)i;
rocksdb_put(db, woptions, &key, 1, "", 1, &err);
CheckNoError(err);
}
CheckCondition(0 != rocksdb_options_statistics_get_ticker_count(
options, BYTES_WRITTEN_TICKER));
rocksdb_options_statistics_get_histogram_data(options, DB_WRITE_HIST, hist);
CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_median(hist));
CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_p95(hist));
CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_p99(hist));
CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_average(hist));
CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_std_dev(hist));
CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_max(hist));
CheckCondition(0 != rocksdb_statistics_histogram_data_get_count(hist));
CheckCondition(0 != rocksdb_statistics_histogram_data_get_sum(hist));
rocksdb_statistics_histogram_data_destroy(hist);
}
StartPhase("cancel_all_background_work");
rocksdb_cancel_all_background_work(db, 1);
@ -3705,3 +3465,12 @@ int main(int argc, char** argv) {
fprintf(stderr, "PASS\n");
return 0;
}
#else
int main(void) {
fprintf(stderr, "SKIPPED\n");
return 0;
}
#endif // !ROCKSDB_LITE

@ -53,9 +53,11 @@ ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(
ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
if (cfd_ != nullptr) {
#ifndef ROCKSDB_LITE
for (auto& listener : cfd_->ioptions()->listeners) {
listener->OnColumnFamilyHandleDeletionStarted(this);
}
#endif // ROCKSDB_LITE
// Job id == 0 means that this is not our background process, but rather
// user thread
// Need to hold some shared pointers owned by the initial_cf_options
@ -86,10 +88,15 @@ const std::string& ColumnFamilyHandleImpl::GetName() const {
}
Status ColumnFamilyHandleImpl::GetDescriptor(ColumnFamilyDescriptor* desc) {
#ifndef ROCKSDB_LITE
// accessing mutable cf-options requires db mutex.
InstrumentedMutexLock l(mutex_);
*desc = ColumnFamilyDescriptor(cfd()->GetName(), cfd()->GetLatestCFOptions());
return Status::OK();
#else
(void)desc;
return Status::NotSupported();
#endif // !ROCKSDB_LITE
}
const Comparator* ColumnFamilyHandleImpl::GetComparator() const {
@ -340,6 +347,7 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
result.hard_pending_compaction_bytes_limit;
}
#ifndef ROCKSDB_LITE
// When the DB is stopped, it's possible that there are some .trash files that
// were not deleted yet, when we open the DB we will find these .trash files
// and schedule them to be deleted (or delete immediately if SstFileManager
@ -351,6 +359,7 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
result.cf_paths[i].path)
.PermitUncheckedError();
}
#endif
if (result.cf_paths.empty()) {
result.cf_paths = db_options.db_paths;
@ -382,9 +391,8 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
const uint64_t kAdjustedTtl = 30 * 24 * 60 * 60;
if (result.ttl == kDefaultTtl) {
if (is_block_based_table) {
// FIFO also requires max_open_files=-1, which is checked in
// ValidateOptions().
if (is_block_based_table &&
result.compaction_style != kCompactionStyleFIFO) {
result.ttl = kAdjustedTtl;
} else {
result.ttl = 0;
@ -392,35 +400,40 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
}
const uint64_t kAdjustedPeriodicCompSecs = 30 * 24 * 60 * 60;
if (result.compaction_style == kCompactionStyleLevel) {
// Turn on periodic compactions and set them to occur once every 30 days if
// compaction filters are used and periodic_compaction_seconds is set to the
// default value.
if (result.compaction_style != kCompactionStyleFIFO) {
if ((result.compaction_filter != nullptr ||
result.compaction_filter_factory != nullptr) &&
result.periodic_compaction_seconds == kDefaultPeriodicCompSecs &&
is_block_based_table) {
result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
}
} else if (result.compaction_style == kCompactionStyleUniversal) {
if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs &&
is_block_based_table) {
result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
}
} else if (result.compaction_style == kCompactionStyleFIFO) {
if (result.periodic_compaction_seconds != kDefaultPeriodicCompSecs) {
ROCKS_LOG_WARN(
db_options.info_log.get(),
"periodic_compaction_seconds does not support FIFO compaction. You"
"may want to set option TTL instead.");
} else {
// result.compaction_style == kCompactionStyleFIFO
if (result.ttl == 0) {
if (is_block_based_table) {
if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) {
result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
}
result.ttl = result.periodic_compaction_seconds;
}
} else if (result.periodic_compaction_seconds != 0) {
result.ttl = std::min(result.ttl, result.periodic_compaction_seconds);
}
}
// For universal compaction, `ttl` and `periodic_compaction_seconds` mean the
// same thing, take the stricter value.
if (result.compaction_style == kCompactionStyleUniversal) {
if (result.periodic_compaction_seconds == 0) {
result.periodic_compaction_seconds = result.ttl;
} else if (result.ttl != 0) {
// TTL compactions work similarly to Periodic Compactions in Universal in
// most cases. So, if ttl is set, execute the periodic compaction
// codepath.
if (result.compaction_style == kCompactionStyleUniversal && result.ttl != 0) {
if (result.periodic_compaction_seconds != 0) {
result.periodic_compaction_seconds =
std::min(result.ttl, result.periodic_compaction_seconds);
} else {
result.periodic_compaction_seconds = result.ttl;
}
}
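Both versions of the universal-compaction branch above reduce to the same reconciliation rule: a zero value means "disabled", a single non-zero value wins outright, and when both ttl and periodic_compaction_seconds are set the stricter (smaller) one is used. A standalone sketch of that rule, with a hypothetical helper name:
#include <stdint.h>
static uint64_t effective_periodic_compaction_seconds(uint64_t ttl,
                                                      uint64_t periodic) {
  if (ttl == 0) return periodic;           /* ttl disabled: keep periodic as-is */
  if (periodic == 0) return ttl;           /* ttl doubles as the periodic interval */
  return ttl < periodic ? ttl : periodic;  /* both set: take the stricter value */
}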
@ -544,6 +557,7 @@ ColumnFamilyData::ColumnFamilyData(
next_(nullptr),
prev_(nullptr),
log_number_(0),
flush_reason_(FlushReason::kOthers),
column_family_set_(column_family_set),
queued_for_flush_(false),
queued_for_compaction_(false),
@ -589,6 +603,7 @@ ColumnFamilyData::ColumnFamilyData(
if (ioptions_.compaction_style == kCompactionStyleLevel) {
compaction_picker_.reset(
new LevelCompactionPicker(ioptions_, &internal_comparator_));
#ifndef ROCKSDB_LITE
} else if (ioptions_.compaction_style == kCompactionStyleUniversal) {
compaction_picker_.reset(
new UniversalCompactionPicker(ioptions_, &internal_comparator_));
@ -602,6 +617,7 @@ ColumnFamilyData::ColumnFamilyData(
"Column family %s does not use any background compaction. "
"Compactions can only be done via CompactFiles\n",
GetName().c_str());
#endif // !ROCKSDB_LITE
} else {
ROCKS_LOG_ERROR(ioptions_.logger,
"Unable to recognize the specified compaction style %d. "
@ -865,7 +881,7 @@ int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger,
}
} // anonymous namespace
std::pair<WriteStallCondition, WriteStallCause>
std::pair<WriteStallCondition, ColumnFamilyData::WriteStallCause>
ColumnFamilyData::GetWriteStallConditionAndCause(
int num_unflushed_memtables, int num_l0_files,
uint64_t num_compaction_needed_bytes,
@ -938,8 +954,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions(
internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_STOPS, 1);
if (compaction_picker_->IsLevel0CompactionInProgress()) {
internal_stats_->AddCFStats(
InternalStats::L0_FILE_COUNT_LIMIT_STOPS_WITH_ONGOING_COMPACTION,
1);
InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_STOPS, 1);
}
ROCKS_LOG_WARN(ioptions_.logger,
"[%s] Stopping writes because we have %d level-0 files",
@ -960,7 +975,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions(
SetupDelay(write_controller, compaction_needed_bytes,
prev_compaction_needed_bytes_, was_stopped,
mutable_cf_options.disable_auto_compactions);
internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_DELAYS, 1);
internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_SLOWDOWNS, 1);
ROCKS_LOG_WARN(
ioptions_.logger,
"[%s] Stalling writes because we have %d immutable memtables "
@ -978,11 +993,11 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions(
SetupDelay(write_controller, compaction_needed_bytes,
prev_compaction_needed_bytes_, was_stopped || near_stop,
mutable_cf_options.disable_auto_compactions);
internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_DELAYS, 1);
internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_SLOWDOWNS,
1);
if (compaction_picker_->IsLevel0CompactionInProgress()) {
internal_stats_->AddCFStats(
InternalStats::L0_FILE_COUNT_LIMIT_DELAYS_WITH_ONGOING_COMPACTION,
1);
InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS, 1);
}
ROCKS_LOG_WARN(ioptions_.logger,
"[%s] Stalling writes because we have %d level-0 files "
@ -1008,7 +1023,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions(
prev_compaction_needed_bytes_, was_stopped || near_stop,
mutable_cf_options.disable_auto_compactions);
internal_stats_->AddCFStats(
InternalStats::PENDING_COMPACTION_BYTES_LIMIT_DELAYS, 1);
InternalStats::PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, 1);
ROCKS_LOG_WARN(
ioptions_.logger,
"[%s] Stalling writes because of estimated pending compaction "
@ -1137,7 +1152,6 @@ Status ColumnFamilyData::RangesOverlapWithMemtables(
*overlap = false;
// Create an InternalIterator over all unflushed memtables
Arena arena;
// TODO: plumb Env::IOActivity
ReadOptions read_opts;
read_opts.total_order_seek = true;
MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena);
@ -1241,11 +1255,30 @@ SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(DBImpl* db) {
// (if no Scrape happens).
assert(ptr != SuperVersion::kSVInUse);
SuperVersion* sv = static_cast<SuperVersion*>(ptr);
if (sv == SuperVersion::kSVObsolete) {
if (sv == SuperVersion::kSVObsolete ||
sv->version_number != super_version_number_.load()) {
RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_ACQUIRES);
db->mutex()->Lock();
SuperVersion* sv_to_delete = nullptr;
if (sv && sv->Unref()) {
RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_CLEANUPS);
db->mutex()->Lock();
// NOTE: underlying resources held by superversion (sst files) might
// not be released until the next background job.
sv->Cleanup();
if (db->immutable_db_options().avoid_unnecessary_blocking_io) {
db->AddSuperVersionsToFreeQueue(sv);
db->SchedulePurge();
} else {
sv_to_delete = sv;
}
} else {
db->mutex()->Lock();
}
sv = super_version_->Ref();
db->mutex()->Unlock();
delete sv_to_delete;
}
assert(sv != nullptr);
return sv;
@ -1376,33 +1409,6 @@ Status ColumnFamilyData::ValidateOptions(
}
}
const auto* ucmp = cf_options.comparator;
assert(ucmp);
if (ucmp->timestamp_size() > 0 &&
!cf_options.persist_user_defined_timestamps) {
if (db_options.atomic_flush) {
return Status::NotSupported(
"Not persisting user-defined timestamps feature is not supported"
"in combination with atomic flush.");
}
if (db_options.allow_concurrent_memtable_write) {
return Status::NotSupported(
"Not persisting user-defined timestamps feature is not supported"
" in combination with concurrent memtable write.");
}
const char* comparator_name = cf_options.comparator->Name();
size_t name_size = strlen(comparator_name);
const char* suffix = ".u64ts";
size_t suffix_size = strlen(suffix);
if (name_size <= suffix_size ||
strcmp(comparator_name + name_size - suffix_size, suffix) != 0) {
return Status::NotSupported(
"Not persisting user-defined timestamps"
"feature only support user-defined timestamps formatted as "
"uint64_t.");
}
}
if (cf_options.enable_blob_garbage_collection) {
if (cf_options.blob_garbage_collection_age_cutoff < 0.0 ||
cf_options.blob_garbage_collection_age_cutoff > 1.0) {
@ -1432,40 +1438,10 @@ Status ColumnFamilyData::ValidateOptions(
"Memtable per key-value checksum protection only supports 0, 1, 2, 4 "
"or 8 bytes per key.");
}
if (std::find(supported.begin(), supported.end(),
cf_options.block_protection_bytes_per_key) == supported.end()) {
return Status::NotSupported(
"Block per key-value checksum protection only supports 0, 1, 2, 4 "
"or 8 bytes per key.");
}
if (!cf_options.compaction_options_fifo.file_temperature_age_thresholds
.empty()) {
if (cf_options.compaction_style != kCompactionStyleFIFO) {
return Status::NotSupported(
"Option file_temperature_age_thresholds only supports FIFO "
"compaction.");
} else if (cf_options.num_levels > 1) {
return Status::NotSupported(
"Option file_temperature_age_thresholds is only supported when "
"num_levels = 1.");
} else {
const auto& ages =
cf_options.compaction_options_fifo.file_temperature_age_thresholds;
assert(ages.size() >= 1);
// check that age is sorted
for (size_t i = 0; i < ages.size() - 1; ++i) {
if (ages[i].age >= ages[i + 1].age) {
return Status::NotSupported(
"Option file_temperature_age_thresholds requires elements to be "
"sorted in increasing order with respect to `age` field.");
}
}
}
}
return s;
}
#ifndef ROCKSDB_LITE
Status ColumnFamilyData::SetOptions(
const DBOptions& db_opts,
const std::unordered_map<std::string, std::string>& options_map) {
@ -1484,6 +1460,7 @@ Status ColumnFamilyData::SetOptions(
}
return s;
}
#endif // ROCKSDB_LITE
// REQUIRES: DB mutex held
Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) {
@ -1542,43 +1519,6 @@ FSDirectory* ColumnFamilyData::GetDataDir(size_t path_id) const {
return data_dirs_[path_id].get();
}
bool ColumnFamilyData::ShouldPostponeFlushToRetainUDT(
uint64_t max_memtable_id) {
const Comparator* ucmp = user_comparator();
const size_t ts_sz = ucmp->timestamp_size();
if (ts_sz == 0 || ioptions_.persist_user_defined_timestamps) {
return false;
}
// If users set the `persist_user_defined_timestamps` flag to false, they
// should also set the `full_history_ts_low` flag to indicate the range of
// user-defined timestamps to retain in memory. Otherwise, we do not
// explicitly postpone flush to retain UDTs.
const std::string& full_history_ts_low = GetFullHistoryTsLow();
if (full_history_ts_low.empty()) {
return false;
}
#ifndef NDEBUG
Slice last_table_newest_udt;
#endif /* !NDEBUG */
for (const Slice& table_newest_udt :
imm()->GetTablesNewestUDT(max_memtable_id)) {
assert(table_newest_udt.size() == full_history_ts_low.size());
assert(last_table_newest_udt.empty() ||
ucmp->CompareTimestamp(table_newest_udt, last_table_newest_udt) >=
0);
// Checking the newest UDT contained in MemTable with ascending ID up to
// `max_memtable_id`. MemTable with bigger ID will have newer UDT, return
// immediately on finding the first MemTable that needs postponing.
if (ucmp->CompareTimestamp(table_newest_udt, full_history_ts_low) >= 0) {
return true;
}
#ifndef NDEBUG
last_table_newest_udt = table_newest_udt;
#endif /* !NDEBUG */
}
return false;
}
void ColumnFamilyData::RecoverEpochNumbers() {
assert(current_);
auto* vstorage = current_->storage_info();
@ -1681,13 +1621,6 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
db_id_, db_session_id_);
column_families_.insert({name, id});
column_family_data_.insert({id, new_cfd});
auto ucmp = new_cfd->user_comparator();
assert(ucmp);
size_t ts_sz = ucmp->timestamp_size();
running_ts_sz_.insert({id, ts_sz});
if (ts_sz > 0) {
ts_sz_for_record_.insert({id, ts_sz});
}
max_column_family_ = std::max(max_column_family_, id);
// add to linked list
new_cfd->next_ = dummy_cfd_;
@ -1703,13 +1636,10 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
// under a DB mutex AND from a write thread
void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
uint32_t cf_id = cfd->GetID();
auto cfd_iter = column_family_data_.find(cf_id);
auto cfd_iter = column_family_data_.find(cfd->GetID());
assert(cfd_iter != column_family_data_.end());
column_family_data_.erase(cfd_iter);
column_families_.erase(cfd->GetName());
running_ts_sz_.erase(cf_id);
ts_sz_for_record_.erase(cf_id);
}
// under a DB mutex OR from a write thread

@ -310,6 +310,10 @@ class ColumnFamilyData {
void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
uint64_t GetLogNumber() const { return log_number_; }
void SetFlushReason(FlushReason flush_reason) {
flush_reason_ = flush_reason;
}
FlushReason GetFlushReason() const { return flush_reason_; }
// thread-safe
const FileOptions* soptions() const;
const ImmutableOptions* ioptions() const { return &ioptions_; }
@ -335,10 +339,12 @@ class ColumnFamilyData {
// Validate CF options against DB options
static Status ValidateOptions(const DBOptions& db_options,
const ColumnFamilyOptions& cf_options);
#ifndef ROCKSDB_LITE
// REQUIRES: DB mutex held
Status SetOptions(
const DBOptions& db_options,
const std::unordered_map<std::string, std::string>& options_map);
#endif // ROCKSDB_LITE
InternalStats* internal_stats() { return internal_stats_.get(); }
@ -462,6 +468,12 @@ class ColumnFamilyData {
bool queued_for_flush() { return queued_for_flush_; }
bool queued_for_compaction() { return queued_for_compaction_; }
enum class WriteStallCause {
kNone,
kMemtableLimit,
kL0FileCountLimit,
kPendingCompactionBytes,
};
static std::pair<WriteStallCondition, WriteStallCause>
GetWriteStallConditionAndCause(
int num_unflushed_memtables, int num_l0_files,
@ -506,12 +518,6 @@ class ColumnFamilyData {
return full_history_ts_low_;
}
// REQUIRES: DB mutex held.
// Return true if flushing up to MemTables with ID `max_memtable_id`
// should be postponed to retain user-defined timestamps according to the
// user's setting. Called by background flush job.
bool ShouldPostponeFlushToRetainUDT(uint64_t max_memtable_id);
ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; }
std::shared_ptr<CacheReservationManager>
@ -610,6 +616,8 @@ class ColumnFamilyData {
// recovered from
uint64_t log_number_;
std::atomic<FlushReason> flush_reason_;
// An object that keeps all the compaction stats
// and picks the next compaction
std::unique_ptr<CompactionPicker> compaction_picker_;
@ -711,16 +719,6 @@ class ColumnFamilySet {
Version* dummy_version,
const ColumnFamilyOptions& options);
const UnorderedMap<uint32_t, size_t>& GetRunningColumnFamiliesTimestampSize()
const {
return running_ts_sz_;
}
const UnorderedMap<uint32_t, size_t>&
GetColumnFamiliesTimestampSizeForRecord() const {
return ts_sz_for_record_;
}
iterator begin() { return iterator(dummy_cfd_->next_); }
iterator end() { return iterator(dummy_cfd_); }
@ -746,15 +744,6 @@ class ColumnFamilySet {
UnorderedMap<std::string, uint32_t> column_families_;
UnorderedMap<uint32_t, ColumnFamilyData*> column_family_data_;
// Mutating / reading `running_ts_sz_` and `ts_sz_for_record_` follow
// the same requirements as `column_families_` and `column_family_data_`.
// Mapping from column family id to user-defined timestamp size for all
// running column families.
UnorderedMap<uint32_t, size_t> running_ts_sz_;
// Mapping from column family id to user-defined timestamp size for
// column families with non-zero user-defined timestamp size.
UnorderedMap<uint32_t, size_t> ts_sz_for_record_;
uint32_t max_column_family_;
const FileOptions file_options_;

@ -17,7 +17,6 @@
#include "options/options_parser.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/comparator.h"
#include "rocksdb/convenience.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
@ -64,9 +63,6 @@ class ColumnFamilyTestBase : public testing::Test {
db_options_.create_if_missing = true;
db_options_.fail_if_options_file_error = true;
db_options_.env = env_;
}
void SetUp() override {
EXPECT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_)));
}
@ -75,7 +71,11 @@ class ColumnFamilyTestBase : public testing::Test {
for (auto h : handles_) {
ColumnFamilyDescriptor cfdescriptor;
Status s = h->GetDescriptor(&cfdescriptor);
#ifdef ROCKSDB_LITE
EXPECT_TRUE(s.IsNotSupported());
#else
EXPECT_OK(s);
#endif // ROCKSDB_LITE
column_families.push_back(cfdescriptor);
}
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
@ -197,10 +197,12 @@ class ColumnFamilyTestBase : public testing::Test {
&db_);
}
#ifndef ROCKSDB_LITE // ReadOnlyDB is not supported
void AssertOpenReadOnly(std::vector<std::string> cf,
std::vector<ColumnFamilyOptions> options = {}) {
ASSERT_OK(OpenReadOnly(cf, options));
}
#endif // !ROCKSDB_LITE
void Open(std::vector<std::string> cf,
std::vector<ColumnFamilyOptions> options = {}) {
@ -222,16 +224,27 @@ class ColumnFamilyTestBase : public testing::Test {
}
bool IsDbWriteStopped() {
#ifndef ROCKSDB_LITE
uint64_t v;
EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.is-write-stopped", &v));
return (v == 1);
#else
return dbfull()->TEST_write_controler().IsStopped();
#endif // !ROCKSDB_LITE
}
uint64_t GetDbDelayedWriteRate() {
#ifndef ROCKSDB_LITE
uint64_t v;
EXPECT_TRUE(
dbfull()->GetIntProperty("rocksdb.actual-delayed-write-rate", &v));
return v;
#else
if (!dbfull()->TEST_write_controler().NeedsDelay()) {
return 0;
}
return dbfull()->TEST_write_controler().delayed_write_rate();
#endif // !ROCKSDB_LITE
}
void Destroy(const std::vector<ColumnFamilyDescriptor>& column_families =
@ -254,6 +267,7 @@ class ColumnFamilyTestBase : public testing::Test {
db_->CreateColumnFamily(current_cf_opt, cfs[i], &handles_[cfi]));
names_[cfi] = cfs[i];
#ifndef ROCKSDB_LITE // RocksDBLite does not support GetDescriptor
// Verify the CF options of the returned CF handle.
ColumnFamilyDescriptor desc;
ASSERT_OK(handles_[cfi]->GetDescriptor(&desc));
@ -262,6 +276,7 @@ class ColumnFamilyTestBase : public testing::Test {
ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
ConfigOptions(), desc.options,
SanitizeOptions(dbfull()->immutable_db_options(), current_cf_opt)));
#endif // !ROCKSDB_LITE
cfi++;
}
}
@ -310,6 +325,7 @@ class ColumnFamilyTestBase : public testing::Test {
ASSERT_OK(db_->FlushWAL(/*sync=*/false));
}
#ifndef ROCKSDB_LITE // TEST functions in DB are not supported in lite
void WaitForFlush(int cf) {
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
}
@ -323,6 +339,7 @@ class ColumnFamilyTestBase : public testing::Test {
void AssertMaxTotalInMemoryState(uint64_t value) {
ASSERT_EQ(value, MaxTotalInMemoryState());
}
#endif // !ROCKSDB_LITE
Status Put(int cf, const std::string& key, const std::string& value) {
return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value));
@ -360,6 +377,7 @@ class ColumnFamilyTestBase : public testing::Test {
"rocksdb.num-files-at-level" + std::to_string(level));
}
#ifndef ROCKSDB_LITE
// Return spread of files per level
std::string FilesPerLevel(int cf) {
std::string result;
@ -376,19 +394,31 @@ class ColumnFamilyTestBase : public testing::Test {
result.resize(last_non_zero_offset);
return result;
}
#endif
void AssertFilesPerLevel(const std::string& value, int cf) {
#ifndef ROCKSDB_LITE
ASSERT_EQ(value, FilesPerLevel(cf));
#else
(void)value;
(void)cf;
#endif
}
#ifndef ROCKSDB_LITE // GetLiveFilesMetaData is not supported
int CountLiveFiles() {
std::vector<LiveFileMetaData> metadata;
db_->GetLiveFilesMetaData(&metadata);
return static_cast<int>(metadata.size());
}
#endif // !ROCKSDB_LITE
void AssertCountLiveFiles(int expected_value) {
#ifndef ROCKSDB_LITE
ASSERT_EQ(expected_value, CountLiveFiles());
#else
(void)expected_value;
#endif
}
// Do n memtable flushes, each of which produces an sstable
@ -402,6 +432,7 @@ class ColumnFamilyTestBase : public testing::Test {
}
}
#ifndef ROCKSDB_LITE // GetSortedWalFiles is not supported
int CountLiveLogFiles() {
int micros_wait_for_log_deletion = 20000;
env_->SleepForMicroseconds(micros_wait_for_log_deletion);
@ -430,18 +461,25 @@ class ColumnFamilyTestBase : public testing::Test {
return ret;
return 0;
}
#endif // !ROCKSDB_LITE
void AssertCountLiveLogFiles(int value) {
#ifndef ROCKSDB_LITE // GetSortedWalFiles is not supported
ASSERT_EQ(value, CountLiveLogFiles());
#else
(void)value;
#endif // !ROCKSDB_LITE
}
void AssertNumberOfImmutableMemtables(std::vector<int> num_per_cf) {
assert(num_per_cf.size() == handles_.size());
#ifndef ROCKSDB_LITE // GetProperty is not supported in lite
for (size_t i = 0; i < num_per_cf.size(); ++i) {
ASSERT_EQ(num_per_cf[i], GetProperty(static_cast<int>(i),
"rocksdb.num-immutable-mem-table"));
}
#endif // !ROCKSDB_LITE
}
void CopyFile(const std::string& source, const std::string& destination,
@ -537,6 +575,7 @@ TEST_P(ColumnFamilyTest, DontReuseColumnFamilyID) {
}
}
#ifndef ROCKSDB_LITE
TEST_P(ColumnFamilyTest, CreateCFRaceWithGetAggProperty) {
Open();
@ -559,6 +598,7 @@ TEST_P(ColumnFamilyTest, CreateCFRaceWithGetAggProperty) {
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}
#endif // !ROCKSDB_LITE
class FlushEmptyCFTestWithParam
: public ColumnFamilyTestBase,
@ -902,6 +942,7 @@ TEST_P(ColumnFamilyTest, IgnoreRecoveredLog) {
}
}
#ifndef ROCKSDB_LITE // TEST functions used are not supported
TEST_P(ColumnFamilyTest, FlushTest) {
Open();
CreateColumnFamiliesAndReopen({"one", "two"});
@ -1016,6 +1057,7 @@ TEST_P(ColumnFamilyTest, LogDeletionTest) {
AssertCountLiveLogFiles(4);
Close();
}
#endif // !ROCKSDB_LITE
TEST_P(ColumnFamilyTest, CrashAfterFlush) {
std::unique_ptr<FaultInjectionTestEnv> fault_env(
@ -1055,6 +1097,7 @@ TEST_P(ColumnFamilyTest, OpenNonexistentColumnFamily) {
ASSERT_TRUE(TryOpen({"default", "dne"}).IsInvalidArgument());
}
#ifndef ROCKSDB_LITE // WaitForFlush() is not supported
// Makes sure that obsolete log files get deleted
TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) {
// disable flushing stale column families
@ -1162,12 +1205,14 @@ TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) {
AssertCountLiveLogFiles(7);
Close();
}
#endif // !ROCKSDB_LITE
// The test is commented out because we want to test that snapshot is
// not created for memtables that don't support it, but there isn't a memtable
// that doesn't support snapshots right now. If we have one later, we can
// re-enable the test.
//
// #ifndef ROCKSDB_LITE // Cuckoo is not supported in lite
// TEST_P(ColumnFamilyTest, MemtableNotSupportSnapshot) {
// db_options_.allow_concurrent_memtable_write = false;
// Open();
@ -1187,6 +1232,7 @@ TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) {
// {second}); auto* s3 = dbfull()->GetSnapshot(); ASSERT_TRUE(s3 == nullptr);
// Close();
// }
// #endif // !ROCKSDB_LITE
class TestComparator : public Comparator {
int Compare(const ROCKSDB_NAMESPACE::Slice& /*a*/,
@ -1253,13 +1299,13 @@ TEST_P(ColumnFamilyTest, DifferentMergeOperators) {
Close();
}
#ifndef ROCKSDB_LITE // WaitForFlush() is not supported
TEST_P(ColumnFamilyTest, DifferentCompactionStyles) {
Open();
CreateColumnFamilies({"one", "two"});
ColumnFamilyOptions default_cf, one, two;
db_options_.max_open_files = 20; // only 10 files in file cache
default_cf.level_compaction_dynamic_level_bytes = false;
default_cf.compaction_style = kCompactionStyleLevel;
default_cf.num_levels = 3;
default_cf.write_buffer_size = 64 << 10; // 64KB
@ -1277,7 +1323,6 @@ TEST_P(ColumnFamilyTest, DifferentCompactionStyles) {
one.level0_file_num_compaction_trigger = 4;
one.write_buffer_size = 120000;
two.level_compaction_dynamic_level_bytes = false;
two.compaction_style = kCompactionStyleLevel;
two.num_levels = 4;
two.level0_file_num_compaction_trigger = 3;
@ -1322,7 +1367,9 @@ TEST_P(ColumnFamilyTest, DifferentCompactionStyles) {
Close();
}
#endif // !ROCKSDB_LITE
#ifndef ROCKSDB_LITE
// Sync points not supported in RocksDB Lite
TEST_P(ColumnFamilyTest, MultipleManualCompactions) {
@ -1332,7 +1379,6 @@ TEST_P(ColumnFamilyTest, MultipleManualCompactions) {
db_options_.max_open_files = 20; // only 10 files in file cache
db_options_.max_background_compactions = 3;
default_cf.level_compaction_dynamic_level_bytes = false;
default_cf.compaction_style = kCompactionStyleLevel;
default_cf.num_levels = 3;
default_cf.write_buffer_size = 64 << 10; // 64KB
@ -1349,7 +1395,6 @@ TEST_P(ColumnFamilyTest, MultipleManualCompactions) {
one.level0_file_num_compaction_trigger = 4;
one.write_buffer_size = 120000;
two.level_compaction_dynamic_level_bytes = false;
two.compaction_style = kCompactionStyleLevel;
two.num_levels = 4;
two.level0_file_num_compaction_trigger = 3;
@ -1432,14 +1477,13 @@ TEST_P(ColumnFamilyTest, AutomaticAndManualCompactions) {
db_options_.max_open_files = 20; // only 10 files in file cache
db_options_.max_background_compactions = 3;
default_cf.level_compaction_dynamic_level_bytes = false;
default_cf.compaction_style = kCompactionStyleLevel;
default_cf.num_levels = 3;
default_cf.write_buffer_size = 64 << 10; // 64KB
default_cf.target_file_size_base = 30 << 10;
default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
table_options.no_block_cache = true;
default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
@ -1450,7 +1494,6 @@ TEST_P(ColumnFamilyTest, AutomaticAndManualCompactions) {
one.level0_file_num_compaction_trigger = 4;
one.write_buffer_size = 120000;
two.level_compaction_dynamic_level_bytes = false;
two.compaction_style = kCompactionStyleLevel;
two.num_levels = 4;
two.level0_file_num_compaction_trigger = 3;
@ -1529,14 +1572,13 @@ TEST_P(ColumnFamilyTest, ManualAndAutomaticCompactions) {
db_options_.max_open_files = 20; // only 10 files in file cache
db_options_.max_background_compactions = 3;
default_cf.level_compaction_dynamic_level_bytes = false;
default_cf.compaction_style = kCompactionStyleLevel;
default_cf.num_levels = 3;
default_cf.write_buffer_size = 64 << 10; // 64KB
default_cf.target_file_size_base = 30 << 10;
default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
table_options.no_block_cache = true;
default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
@ -1547,7 +1589,6 @@ TEST_P(ColumnFamilyTest, ManualAndAutomaticCompactions) {
one.level0_file_num_compaction_trigger = 4;
one.write_buffer_size = 120000;
two.level_compaction_dynamic_level_bytes = false;
two.compaction_style = kCompactionStyleLevel;
two.num_levels = 4;
two.level0_file_num_compaction_trigger = 3;
@ -1992,7 +2033,9 @@ TEST_P(ColumnFamilyTest, SameCFAutomaticManualCompactions) {
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
}
#endif // !ROCKSDB_LITE
#ifndef ROCKSDB_LITE // Tailing iterator not supported
namespace {
std::string IterStatus(Iterator* iter) {
std::string result;
@ -2050,7 +2093,9 @@ TEST_P(ColumnFamilyTest, NewIteratorsTest) {
Destroy();
}
}
#endif // !ROCKSDB_LITE
#ifndef ROCKSDB_LITE // ReadOnlyDB is not supported
TEST_P(ColumnFamilyTest, ReadOnlyDBTest) {
Open();
CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
@ -2099,7 +2144,9 @@ TEST_P(ColumnFamilyTest, ReadOnlyDBTest) {
s = OpenReadOnly({"one", "four"});
ASSERT_TRUE(!s.ok());
}
#endif // !ROCKSDB_LITE
#ifndef ROCKSDB_LITE // WaitForFlush() is not supported in lite
TEST_P(ColumnFamilyTest, DontRollEmptyLogs) {
Open();
CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
@ -2121,7 +2168,9 @@ TEST_P(ColumnFamilyTest, DontRollEmptyLogs) {
ASSERT_EQ(static_cast<size_t>(total_new_writable_files), handles_.size() + 1);
Close();
}
#endif // !ROCKSDB_LITE
#ifndef ROCKSDB_LITE // WaitForCompaction() is not supported in lite
TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) {
Open();
CreateColumnFamilies({"one", "two"});
@ -2168,6 +2217,7 @@ TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) {
ASSERT_EQ(0, dbfull()->TEST_total_log_size());
Close();
}
#endif // !ROCKSDB_LITE
TEST_P(ColumnFamilyTest, CreateMissingColumnFamilies) {
Status s = TryOpen({"one", "two"});
@ -2407,6 +2457,8 @@ TEST_P(ColumnFamilyTest, FlushAndDropRaceCondition) {
Destroy();
}
#ifndef ROCKSDB_LITE
// skipped as persisting options is not supported in ROCKSDB_LITE
namespace {
std::atomic<int> test_stage(0);
std::atomic<bool> ordered_by_writethread(false);
@ -2488,6 +2540,7 @@ TEST_P(ColumnFamilyTest, CreateAndDropRace) {
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
}
#endif // !ROCKSDB_LITE
TEST_P(ColumnFamilyTest, WriteStallSingleColumnFamily) {
const uint64_t kBaseRate = 800000u;
@ -2897,6 +2950,7 @@ TEST_P(ColumnFamilyTest, CreateDropAndDestroy) {
ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh));
}
#ifndef ROCKSDB_LITE
TEST_P(ColumnFamilyTest, CreateDropAndDestroyWithoutFileDeletion) {
ColumnFamilyHandle* cfh;
Open();
@ -2951,7 +3005,9 @@ TEST_P(ColumnFamilyTest, FlushCloseWALFiles) {
db_options_.env = env_;
Close();
}
#endif // !ROCKSDB_LITE
#ifndef ROCKSDB_LITE // WaitForFlush() is not supported
TEST_P(ColumnFamilyTest, IteratorCloseWALFile1) {
SpecialEnv env(Env::Default());
db_options_.env = &env;
@ -3058,7 +3114,9 @@ TEST_P(ColumnFamilyTest, IteratorCloseWALFile2) {
db_options_.env = env_;
Close();
}
#endif // !ROCKSDB_LITE
#ifndef ROCKSDB_LITE // TEST functions are not supported in lite
TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) {
SpecialEnv env(Env::Default());
// Allow both of flush and purge job to schedule.
@ -3134,6 +3192,7 @@ TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) {
db_options_.env = env_;
Close();
}
#endif // !ROCKSDB_LITE
// Disable on windows because SyncWAL requires env->IsSyncThreadSafe()
// to return true which is not so in unbuffered mode.
@ -3384,205 +3443,6 @@ TEST(ColumnFamilyTest, ValidateMemtableKVChecksumOption) {
ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
}
// Tests the flushing behavior of a column family to retain user-defined
// timestamp when `persist_user_defined_timestamp` is false.
class ColumnFamilyRetainUDTTest : public ColumnFamilyTestBase {
public:
ColumnFamilyRetainUDTTest() : ColumnFamilyTestBase(kLatestFormatVersion) {}
void SetUp() override {
db_options_.allow_concurrent_memtable_write = false;
column_family_options_.comparator =
test::BytewiseComparatorWithU64TsWrapper();
column_family_options_.persist_user_defined_timestamps = false;
ColumnFamilyTestBase::SetUp();
}
Status Put(int cf, const std::string& key, const std::string& ts,
const std::string& value) {
return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(ts),
Slice(value));
}
};
class TestTsComparator : public Comparator {
public:
TestTsComparator() : Comparator(8 /*ts_sz*/) {}
int Compare(const ROCKSDB_NAMESPACE::Slice& /*a*/,
const ROCKSDB_NAMESPACE::Slice& /*b*/) const override {
return 0;
}
const char* Name() const override { return "TestTs"; }
void FindShortestSeparator(
std::string* /*start*/,
const ROCKSDB_NAMESPACE::Slice& /*limit*/) const override {}
void FindShortSuccessor(std::string* /*key*/) const override {}
};
TEST_F(ColumnFamilyRetainUDTTest, SanityCheck) {
Open();
ColumnFamilyOptions cf_options;
cf_options.persist_user_defined_timestamps = false;
TestTsComparator test_comparator;
cf_options.comparator = &test_comparator;
ColumnFamilyHandle* handle;
// Not persisting user-defined timestamps feature only supports user-defined
// timestamps formatted as uint64_t.
ASSERT_TRUE(
db_->CreateColumnFamily(cf_options, "pikachu", &handle).IsNotSupported());
Destroy();
// Not persisting user-defined timestamps feature doesn't work in combination
// with atomic flush.
db_options_.atomic_flush = true;
ASSERT_TRUE(TryOpen({"default"}).IsNotSupported());
// Not persisting user-defined timestamps feature doesn't work in combination
// with concurrent memtable write.
db_options_.atomic_flush = false;
db_options_.allow_concurrent_memtable_write = true;
ASSERT_TRUE(TryOpen({"default"}).IsNotSupported());
Close();
}
TEST_F(ColumnFamilyRetainUDTTest, FullHistoryTsLowNotSet) {
SyncPoint::GetInstance()->SetCallBack(
"DBImpl::BackgroundFlush:CheckFlushRequest:cb", [&](void* arg) {
ASSERT_NE(nullptr, arg);
auto reschedule_count = *static_cast<int*>(arg);
ASSERT_EQ(1, reschedule_count);
});
SyncPoint::GetInstance()->EnableProcessing();
Open();
std::string write_ts;
PutFixed64(&write_ts, 1);
ASSERT_OK(Put(0, "foo", write_ts, "v1"));
// No `full_history_ts_low` was explicitly set by the user, so the flush
// continues without checking whether its UDTs have expired.
ASSERT_OK(Flush(0));
// After flush, `full_history_ts_low` should be automatically advanced to
// the effective cutoff timestamp: write_ts + 1
std::string cutoff_ts;
PutFixed64(&cutoff_ts, 2);
std::string effective_full_history_ts_low;
ASSERT_OK(
db_->GetFullHistoryTsLow(handles_[0], &effective_full_history_ts_low));
ASSERT_EQ(cutoff_ts, effective_full_history_ts_low);
Close();
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
}
TEST_F(ColumnFamilyRetainUDTTest, AllKeysExpired) {
SyncPoint::GetInstance()->SetCallBack(
"DBImpl::BackgroundFlush:CheckFlushRequest:cb", [&](void* arg) {
ASSERT_NE(nullptr, arg);
auto reschedule_count = *static_cast<int*>(arg);
ASSERT_EQ(1, reschedule_count);
});
SyncPoint::GetInstance()->EnableProcessing();
Open();
std::string write_ts;
PutFixed64(&write_ts, 1);
ASSERT_OK(Put(0, "foo", write_ts, "v1"));
std::string cutoff_ts;
PutFixed64(&cutoff_ts, 3);
ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], cutoff_ts));
// All keys expired w.r.t. the configured `full_history_ts_low`; flush continues
// without the need for a re-schedule.
ASSERT_OK(Flush(0));
// `full_history_ts_low` stays unchanged after flush.
std::string effective_full_history_ts_low;
ASSERT_OK(
db_->GetFullHistoryTsLow(handles_[0], &effective_full_history_ts_low));
ASSERT_EQ(cutoff_ts, effective_full_history_ts_low);
Close();
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
}
TEST_F(ColumnFamilyRetainUDTTest, NotAllKeysExpiredFlushToAvoidWriteStall) {
SyncPoint::GetInstance()->SetCallBack(
"DBImpl::BackgroundFlush:CheckFlushRequest:cb", [&](void* arg) {
ASSERT_NE(nullptr, arg);
auto reschedule_count = *static_cast<int*>(arg);
ASSERT_EQ(1, reschedule_count);
});
SyncPoint::GetInstance()->EnableProcessing();
Open();
std::string cutoff_ts;
std::string write_ts;
PutFixed64(&write_ts, 1);
ASSERT_OK(Put(0, "foo", write_ts, "v1"));
PutFixed64(&cutoff_ts, 1);
ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], cutoff_ts));
ASSERT_OK(db_->SetOptions(handles_[0], {{"max_write_buffer_number", "1"}}));
// Not all keys expired, but flush is continued without a re-schedule because
// of the risk of a write stall.
ASSERT_OK(Flush(0));
// After flush, `full_history_ts_low` should be automatically advanced to
// the effective cutoff timestamp: write_ts + 1
std::string effective_full_history_ts_low;
ASSERT_OK(
db_->GetFullHistoryTsLow(handles_[0], &effective_full_history_ts_low));
cutoff_ts.clear();
PutFixed64(&cutoff_ts, 2);
ASSERT_EQ(cutoff_ts, effective_full_history_ts_low);
Close();
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
}
TEST_F(ColumnFamilyRetainUDTTest, NotAllKeysExpiredFlushRescheduled) {
std::string cutoff_ts;
SyncPoint::GetInstance()->SetCallBack(
"DBImpl::AfterRetainUDTReschedule:cb", [&](void* /*arg*/) {
// Increasing full_history_ts_low so all keys expired after the initial
// FlushRequest is rescheduled
cutoff_ts.clear();
PutFixed64(&cutoff_ts, 3);
ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], cutoff_ts));
});
SyncPoint::GetInstance()->SetCallBack(
"DBImpl::BackgroundFlush:CheckFlushRequest:cb", [&](void* arg) {
ASSERT_NE(nullptr, arg);
auto reschedule_count = *static_cast<int*>(arg);
ASSERT_EQ(2, reschedule_count);
});
SyncPoint::GetInstance()->EnableProcessing();
Open();
std::string write_ts;
PutFixed64(&write_ts, 1);
ASSERT_OK(Put(0, "foo", write_ts, "v1"));
PutFixed64(&cutoff_ts, 1);
ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], cutoff_ts));
// Not all keys expired, and there is no risk of write stall. Flush is
// rescheduled. The actual flush happens after `full_history_ts_low` is
// increased to mark all keys expired.
ASSERT_OK(Flush(0));
std::string effective_full_history_ts_low;
ASSERT_OK(
db_->GetFullHistoryTsLow(handles_[0], &effective_full_history_ts_low));
// `full_history_ts_low` stays unchanged.
ASSERT_EQ(cutoff_ts, effective_full_history_ts_low);
Close();
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
}
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {

@ -3,6 +3,7 @@
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#ifndef ROCKSDB_LITE
#include <mutex>
#include <string>
@ -66,7 +67,6 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) {
const int kWriteBufferSize = 10000;
const int kLevel0Trigger = 2;
options.create_if_missing = true;
options.level_compaction_dynamic_level_bytes = false;
options.compaction_style = kCompactionStyleLevel;
// Small slowdown and stop trigger for experimental purpose.
options.level0_slowdown_writes_trigger = 20;
@ -121,9 +121,7 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) {
TEST_F(CompactFilesTest, MultipleLevel) {
Options options;
options.create_if_missing = true;
// Otherwise background compaction can happen to
// drain unnecessary level
options.level_compaction_dynamic_level_bytes = false;
options.level_compaction_dynamic_level_bytes = true;
options.num_levels = 6;
// Add listener
FlushedFileCollector* collector = new FlushedFileCollector();
@ -184,6 +182,7 @@ TEST_F(CompactFilesTest, MultipleLevel) {
for (int invalid_output_level = 0; invalid_output_level < 5;
invalid_output_level++) {
s = db->CompactFiles(CompactionOptions(), files, invalid_output_level);
std::cout << s.ToString() << std::endl;
ASSERT_TRUE(s.IsInvalidArgument());
}
@ -360,7 +359,6 @@ TEST_F(CompactFilesTest, CompactionFilterWithGetSv) {
std::shared_ptr<FilterWithGet> cf(new FilterWithGet());
Options options;
options.level_compaction_dynamic_level_bytes = false;
options.create_if_missing = true;
options.compaction_filter = cf.get();
@ -403,7 +401,6 @@ TEST_F(CompactFilesTest, SentinelCompressionType) {
CompactionStyle::kCompactionStyleNone}) {
ASSERT_OK(DestroyDB(db_name_, Options()));
Options options;
options.level_compaction_dynamic_level_bytes = false;
options.compaction_style = compaction_style;
// L0: Snappy, L1: ZSTD, L2: Snappy
options.compression_per_level = {CompressionType::kSnappyCompression,
@ -493,3 +490,13 @@ int main(int argc, char** argv) {
return RUN_ALL_TESTS();
}
#else
#include <stdio.h>
int main(int /*argc*/, char** /*argv*/) {
fprintf(stderr,
"SKIPPED as DBImpl::CompactFiles is not supported in ROCKSDB_LITE\n");
return 0;
}
#endif // !ROCKSDB_LITE

@ -13,7 +13,6 @@
#include <vector>
#include "db/column_family.h"
#include "logging/logging.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/sst_partitioner.h"
#include "test_util/sync_point.h"
@ -21,16 +20,14 @@
namespace ROCKSDB_NAMESPACE {
const uint64_t kRangeTombstoneSentinel =
PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
int sstableKeyCompare(const Comparator* uc, const Slice& a, const Slice& b) {
auto c = uc->CompareWithoutTimestamp(ExtractUserKey(a), ExtractUserKey(b));
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
const InternalKey& b) {
auto c = user_cmp->CompareWithoutTimestamp(a.user_key(), b.user_key());
if (c != 0) {
return c;
}
auto a_footer = ExtractInternalKeyFooter(a);
auto b_footer = ExtractInternalKeyFooter(b);
auto a_footer = ExtractInternalKeyFooter(a.Encode());
auto b_footer = ExtractInternalKeyFooter(b.Encode());
if (a_footer == kRangeTombstoneSentinel) {
if (b_footer != kRangeTombstoneSentinel) {
return -1;
@ -204,34 +201,6 @@ bool Compaction::IsFullCompaction(
return num_files_in_compaction == total_num_files;
}
const TablePropertiesCollection& Compaction::GetTableProperties() {
if (!input_table_properties_initialized_) {
const ReadOptions read_options(Env::IOActivity::kCompaction);
for (size_t i = 0; i < num_input_levels(); ++i) {
for (const FileMetaData* fmd : *(this->inputs(i))) {
std::shared_ptr<const TableProperties> tp;
std::string file_name =
TableFileName(immutable_options_.cf_paths, fmd->fd.GetNumber(),
fmd->fd.GetPathId());
Status s = input_version_->GetTableProperties(read_options, &tp, fmd,
&file_name);
if (s.ok()) {
table_properties_[file_name] = tp;
} else {
ROCKS_LOG_ERROR(immutable_options_.info_log,
"Unable to load table properties for file %" PRIu64
" --- %s\n",
fmd->fd.GetNumber(), s.ToString().c_str());
}
}
}
input_table_properties_initialized_ = true;
};
return table_properties_;
}
Compaction::Compaction(
VersionStorageInfo* vstorage, const ImmutableOptions& _immutable_options,
const MutableCFOptions& _mutable_cf_options,
@ -493,11 +462,6 @@ bool Compaction::IsTrivialMove() const {
return false;
}
if (compaction_reason_ == CompactionReason::kChangeTemperature) {
// Changing temperature usually requires rewriting the file.
return false;
}
// Used in universal compaction, where trivial move can be done if the
// input files are non overlapping
if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) &&
@ -514,24 +478,25 @@ bool Compaction::IsTrivialMove() const {
// assert inputs_.size() == 1
if (output_level_ + 1 < number_levels_) {
std::unique_ptr<SstPartitioner> partitioner = CreateSstPartitioner();
for (const auto& file : inputs_.front().files) {
std::vector<FileMetaData*> file_grand_parents;
input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest,
&file->largest,
&file_grand_parents);
const auto compaction_size =
file->fd.GetFileSize() + TotalFileSize(file_grand_parents);
if (compaction_size > max_compaction_bytes_) {
return false;
}
std::unique_ptr<SstPartitioner> partitioner = CreateSstPartitioner();
if (partitioner.get() != nullptr) {
if (!partitioner->CanDoTrivialMove(file->smallest.user_key(),
file->largest.user_key())) {
return false;
}
for (const auto& file : inputs_.front().files) {
std::vector<FileMetaData*> file_grand_parents;
if (output_level_ + 1 >= number_levels_) {
continue;
}
input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest,
&file->largest, &file_grand_parents);
const auto compaction_size =
file->fd.GetFileSize() + TotalFileSize(file_grand_parents);
if (compaction_size > max_compaction_bytes_) {
return false;
}
if (partitioner.get() != nullptr) {
if (!partitioner->CanDoTrivialMove(file->smallest.user_key(),
file->largest.user_key())) {
return false;
}
}
}
@ -590,49 +555,6 @@ bool Compaction::KeyNotExistsBeyondOutputLevel(
return false;
}
bool Compaction::KeyRangeNotExistsBeyondOutputLevel(
const Slice& begin_key, const Slice& end_key,
std::vector<size_t>* level_ptrs) const {
assert(input_version_ != nullptr);
assert(level_ptrs != nullptr);
assert(level_ptrs->size() == static_cast<size_t>(number_levels_));
assert(cfd_->user_comparator()->CompareWithoutTimestamp(begin_key, end_key) <
0);
if (bottommost_level_) {
return true /* does not overlap */;
} else if (output_level_ != 0 &&
cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
const Comparator* user_cmp = cfd_->user_comparator();
for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
const std::vector<FileMetaData*>& files =
input_vstorage_->LevelFiles(lvl);
for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) {
auto* f = files[level_ptrs->at(lvl)];
// Advance until the first file with begin_key <= f->largest.user_key()
if (user_cmp->CompareWithoutTimestamp(begin_key,
f->largest.user_key()) > 0) {
continue;
}
// We know that the previous file prev_f, if it exists, has
// prev_f->largest.user_key() < begin_key.
if (user_cmp->CompareWithoutTimestamp(end_key,
f->smallest.user_key()) <= 0) {
// not overlapping with this level
break;
} else {
// We have:
// - begin_key < end_key,
// - begin_key <= f->largest.user_key(), and
// - end_key > f->smallest.user_key()
return false /* overlap */;
}
}
}
return true /* does not overlap */;
}
return false /* overlaps */;
};
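The removed KeyRangeNotExistsBeyondOutputLevel above walks each deeper level's sorted file list with a per-level cursor and reports whether the user-key range [begin_key, end_key) overlaps any file there. A self-contained sketch of the same check, with plain std::string keys standing in for the user comparator and FileMetaData (all names below are illustrative):

#include <cassert>
#include <string>
#include <vector>

struct FileRange {
  std::string smallest;  // smallest user key in the file
  std::string largest;   // largest user key in the file
};

// Returns true if [begin_key, end_key) overlaps no file in any of `levels`.
// `level_ptrs` carries one cursor per level so repeated calls with
// non-decreasing begin_key can resume where the previous call stopped.
bool RangeNotInLevels(const std::string& begin_key, const std::string& end_key,
                      const std::vector<std::vector<FileRange>>& levels,
                      std::vector<size_t>* level_ptrs) {
  assert(begin_key < end_key);
  assert(level_ptrs->size() == levels.size());
  for (size_t lvl = 0; lvl < levels.size(); ++lvl) {
    const std::vector<FileRange>& files = levels[lvl];
    for (; (*level_ptrs)[lvl] < files.size(); ++(*level_ptrs)[lvl]) {
      const FileRange& f = files[(*level_ptrs)[lvl]];
      // Skip files that end before the queried range starts.
      if (begin_key > f.largest) {
        continue;
      }
      // First file with begin_key <= f.largest: it overlaps unless the whole
      // range ends before this file starts.
      if (end_key <= f.smallest) {
        break;  // no overlap at this level
      }
      return false;  // overlap
    }
  }
  return true;  // no overlap anywhere
}

int main() {
  std::vector<std::vector<FileRange>> levels = {
      {{"a", "c"}, {"f", "h"}},  // level below the output level
      {{"m", "p"}}};             // next level down
  std::vector<size_t> ptrs(levels.size(), 0);
  bool no_overlap = RangeNotInLevels("d", "e", levels, &ptrs);
  return no_overlap ? 0 : 1;  // "d".."e" falls between files: no overlap
}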
// Mark (or clear) each file that is being compacted
void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
for (size_t i = 0; i < num_input_levels(); i++) {

@ -18,6 +18,8 @@ namespace ROCKSDB_NAMESPACE {
// The file contains class Compaction, as well as some helper functions
// and data structures used by the class.
const uint64_t kRangeTombstoneSentinel =
PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
// Utility for comparing sstable boundary keys. Returns -1 if either a or b is
// null which provides the property that a==null indicates a key that is less
// than any key and b==null indicates a key that is greater than any key. Note
@ -31,19 +33,8 @@ namespace ROCKSDB_NAMESPACE {
// that key never appears in the database. We don't want adjacent sstables to
// be considered overlapping if they are separated by the range tombstone
// sentinel.
int sstableKeyCompare(const Comparator* user_cmp, const Slice&, const Slice&);
inline int sstableKeyCompare(const Comparator* user_cmp, const Slice& a,
const InternalKey& b) {
return sstableKeyCompare(user_cmp, a, b.Encode());
}
inline int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
const Slice& b) {
return sstableKeyCompare(user_cmp, a.Encode(), b);
}
inline int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
const InternalKey& b) {
return sstableKeyCompare(user_cmp, a.Encode(), b.Encode());
}
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
const InternalKey& b);
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
const InternalKey& b);
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
@ -214,18 +205,10 @@ class Compaction {
void AddInputDeletions(VersionEdit* edit);
// Returns true if the available information we have guarantees that
// the input "user_key" does not exist in any level beyond `output_level()`.
// the input "user_key" does not exist in any level beyond "output_level()".
bool KeyNotExistsBeyondOutputLevel(const Slice& user_key,
std::vector<size_t>* level_ptrs) const;
// Returns true if the user key range [begin_key, end_key) does not exist
// in any level beyond `output_level()`.
// Used for checking range tombstones, so we assume begin_key < end_key.
// begin_key and end_key should include timestamp if enabled.
bool KeyRangeNotExistsBeyondOutputLevel(
const Slice& begin_key, const Slice& end_key,
std::vector<size_t>* level_ptrs) const;
// Clear all files to indicate that they are not being compacted
// Delete this compaction from the list of running compactions.
//
@ -326,16 +309,12 @@ class Compaction {
int output_level, VersionStorageInfo* vstorage,
const std::vector<CompactionInputFiles>& inputs);
// If called before a compaction finishes, will return
// table properties of all compaction input files.
// If called after a compaction finished, will return
// table properties of all compaction input and output files.
const TablePropertiesCollection& GetTableProperties();
TablePropertiesCollection GetOutputTableProperties() const {
return output_table_properties_;
}
void SetOutputTableProperties(
const std::string& file_name,
const std::shared_ptr<const TableProperties>& tp) {
table_properties_[file_name] = tp;
void SetOutputTableProperties(TablePropertiesCollection tp) {
output_table_properties_ = std::move(tp);
}
Slice GetSmallestUserKey() const { return smallest_user_key_; }
@ -522,9 +501,8 @@ class Compaction {
// Does input compression match the output compression?
bool InputCompressionMatchesOutput() const;
bool input_table_properties_initialized_ = false;
// table properties of output files
TablePropertiesCollection table_properties_;
TablePropertiesCollection output_table_properties_;
// smallest user keys in compaction
// includes timestamp if user-defined timestamp is enabled.
@ -570,16 +548,13 @@ struct PerKeyPlacementContext {
const Slice value;
const SequenceNumber seq_num;
bool& output_to_penultimate_level;
bool output_to_penultimate_level;
PerKeyPlacementContext(int _level, Slice _key, Slice _value,
SequenceNumber _seq_num,
bool& _output_to_penultimate_level)
: level(_level),
key(_key),
value(_value),
seq_num(_seq_num),
output_to_penultimate_level(_output_to_penultimate_level) {}
SequenceNumber _seq_num)
: level(_level), key(_key), value(_value), seq_num(_seq_num) {
output_to_penultimate_level = false;
}
};
#endif /* !NDEBUG */
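The comment block above describes the key property of sstableKeyCompare: when two boundary keys share a user key, the one carrying the range-tombstone sentinel footer sorts strictly before a real entry, so a file whose largest boundary is the sentinel at key k is not treated as overlapping a neighbor whose smallest key is a real entry at k. A self-contained sketch of that ordering rule, with a boolean flag standing in for the internal-key footer (simplified types, not the RocksDB encoding):

#include <iostream>
#include <string>

// Simplified boundary key: user key plus a flag standing in for the
// (kMaxSequenceNumber, kTypeRangeDeletion) footer used as the sentinel.
struct BoundaryKey {
  std::string user_key;
  bool is_range_tombstone_sentinel = false;
};

// Mirrors the shape of sstableKeyCompare: compare user keys first, then let
// the sentinel footer break ties so a sentinel sorts before a real entry.
int CompareBoundary(const BoundaryKey& a, const BoundaryKey& b) {
  int c = a.user_key.compare(b.user_key);
  if (c != 0) return c < 0 ? -1 : 1;
  if (a.is_range_tombstone_sentinel) {
    return b.is_range_tombstone_sentinel ? 0 : -1;
  }
  return b.is_range_tombstone_sentinel ? 1 : 0;
}

int main() {
  // File A ends exactly at the sentinel for user key "k"; file B starts at a
  // real entry for "k". A.largest < B.smallest, so they do not overlap.
  BoundaryKey a_largest{"k", /*is_range_tombstone_sentinel=*/true};
  BoundaryKey b_smallest{"k", /*is_range_tombstone_sentinel=*/false};
  std::cout << CompareBoundary(a_largest, b_smallest) << std::endl;  // -1
  return 0;
}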

@ -13,7 +13,6 @@
#include "db/blob/blob_index.h"
#include "db/blob/prefetch_buffer_collection.h"
#include "db/snapshot_checker.h"
#include "db/wide/wide_column_serialization.h"
#include "logging/logging.h"
#include "port/likely.h"
#include "rocksdb/listener.h"
@ -31,8 +30,7 @@ CompactionIterator::CompactionIterator(
BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
bool enforce_single_del_contracts,
const std::atomic<bool>& manual_compaction_canceled,
bool must_count_input_entries, const Compaction* compaction,
const CompactionFilter* compaction_filter,
const Compaction* compaction, const CompactionFilter* compaction_filter,
const std::atomic<bool>* shutting_down,
const std::shared_ptr<Logger> info_log,
const std::string* full_history_ts_low,
@ -46,9 +44,8 @@ CompactionIterator::CompactionIterator(
manual_compaction_canceled,
std::unique_ptr<CompactionProxy>(
compaction ? new RealCompaction(compaction) : nullptr),
must_count_input_entries, compaction_filter, shutting_down, info_log,
full_history_ts_low, preserve_time_min_seqno,
preclude_last_level_min_seqno) {}
compaction_filter, shutting_down, info_log, full_history_ts_low,
preserve_time_min_seqno, preclude_last_level_min_seqno) {}
CompactionIterator::CompactionIterator(
InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
@ -60,14 +57,15 @@ CompactionIterator::CompactionIterator(
BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
bool enforce_single_del_contracts,
const std::atomic<bool>& manual_compaction_canceled,
std::unique_ptr<CompactionProxy> compaction, bool must_count_input_entries,
std::unique_ptr<CompactionProxy> compaction,
const CompactionFilter* compaction_filter,
const std::atomic<bool>* shutting_down,
const std::shared_ptr<Logger> info_log,
const std::string* full_history_ts_low,
const SequenceNumber preserve_time_min_seqno,
const SequenceNumber preclude_last_level_min_seqno)
: input_(input, cmp, must_count_input_entries),
: input_(input, cmp,
!compaction || compaction->DoesInputReferenceBlobFiles()),
cmp_(cmp),
merge_helper_(merge_helper),
snapshots_(snapshots),
@ -126,9 +124,6 @@ CompactionIterator::CompactionIterator(
timestamp_size_ == full_history_ts_low_->size());
#endif
input_.SetPinnedItersMgr(&pinned_iters_mgr_);
// The default `merge_until_status_` does not need to be checked since it is
// overwritten as soon as `MergeUntil()` is called
merge_until_status_.PermitUncheckedError();
TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get());
}
@ -183,20 +178,6 @@ void CompactionIterator::Next() {
ikey_.user_key = current_key_.GetUserKey();
validity_info_.SetValid(ValidContext::kMerge1);
} else {
if (merge_until_status_.IsMergeInProgress()) {
// `Status::MergeInProgress()` tells us that the previous `MergeUntil()`
// produced only merge operands. Those merge operands were accessed and
// written out using `merge_out_iter_`. Since `merge_out_iter_` is
// exhausted at this point, all merge operands have been written out.
//
// Still, there may be a base value (PUT, DELETE, SINGLEDEL, etc.) that
// needs to be written out. Normally, `CompactionIterator` would skip it
// on the basis that it has already output something in the same
// snapshot stripe. To prevent this, we reset `has_current_user_key_` to
// trick the future iteration from finding out the snapshot stripe is
// unchanged.
has_current_user_key_ = false;
}
// We consumed all pinned merge operands, release pinned iterators
pinned_iters_mgr_.ReleasePinnedData();
// MergeHelper moves the iterator to the first record after the merged
@ -223,47 +204,39 @@ void CompactionIterator::Next() {
bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
Slice* skip_until) {
if (!compaction_filter_) {
return true;
}
if (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex &&
ikey_.type != kTypeWideColumnEntity) {
// TODO: support compaction filter for wide-column entities
if (!compaction_filter_ ||
(ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex)) {
return true;
}
CompactionFilter::Decision decision =
CompactionFilter::Decision::kUndetermined;
bool error = false;
// If the user has specified a compaction filter and the sequence
// number is greater than any external snapshot, then invoke the
// filter. If the return value of the compaction filter is true,
// replace the entry with a deletion marker.
CompactionFilter::Decision filter = CompactionFilter::Decision::kUndetermined;
compaction_filter_value_.clear();
compaction_filter_skip_until_.Clear();
CompactionFilter::ValueType value_type =
ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue
: ikey_.type == kTypeBlobIndex
? CompactionFilter::ValueType::kBlobIndex
: CompactionFilter::ValueType::kWideColumnEntity;
: CompactionFilter::ValueType::kBlobIndex;
// Hack: pass internal key to BlobIndexCompactionFilter since it needs
// to get sequence number.
assert(compaction_filter_);
const Slice& filter_key =
(ikey_.type != kTypeBlobIndex ||
Slice& filter_key =
(ikey_.type == kTypeValue ||
!compaction_filter_->IsStackedBlobDbInternalCompactionFilter())
? ikey_.user_key
: key_;
compaction_filter_value_.clear();
compaction_filter_skip_until_.Clear();
std::vector<std::pair<std::string, std::string>> new_columns;
{
StopWatchNano timer(clock_, report_detailed_time_);
if (ikey_.type == kTypeBlobIndex) {
decision = compaction_filter_->FilterBlobByKey(
if (kTypeBlobIndex == ikey_.type) {
filter = compaction_filter_->FilterBlobByKey(
level_, filter_key, &compaction_filter_value_,
compaction_filter_skip_until_.rep());
if (decision == CompactionFilter::Decision::kUndetermined &&
if (CompactionFilter::Decision::kUndetermined == filter &&
!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
if (!compaction_) {
if (compaction_ == nullptr) {
status_ =
Status::Corruption("Unexpected blob index outside of compaction");
validity_info_.Invalidate();
@ -309,61 +282,33 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
value_type = CompactionFilter::ValueType::kValue;
}
}
if (decision == CompactionFilter::Decision::kUndetermined) {
const Slice* existing_val = nullptr;
const WideColumns* existing_col = nullptr;
WideColumns existing_columns;
if (ikey_.type != kTypeWideColumnEntity) {
if (!blob_value_.empty()) {
existing_val = &blob_value_;
} else {
existing_val = &value_;
}
} else {
Slice value_copy = value_;
const Status s =
WideColumnSerialization::Deserialize(value_copy, existing_columns);
if (!s.ok()) {
status_ = s;
validity_info_.Invalidate();
return false;
}
existing_col = &existing_columns;
}
decision = compaction_filter_->FilterV3(
level_, filter_key, value_type, existing_val, existing_col,
&compaction_filter_value_, &new_columns,
if (CompactionFilter::Decision::kUndetermined == filter) {
filter = compaction_filter_->FilterV2(
level_, filter_key, value_type,
blob_value_.empty() ? value_ : blob_value_, &compaction_filter_value_,
compaction_filter_skip_until_.rep());
}
iter_stats_.total_filter_time +=
env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0;
}
if (decision == CompactionFilter::Decision::kUndetermined) {
// Should not reach here, since FilterV2/FilterV3 should never return
// kUndetermined.
status_ = Status::NotSupported(
"FilterV2/FilterV3 should never return kUndetermined");
if (CompactionFilter::Decision::kUndetermined == filter) {
// Should not reach here, since FilterV2 should never return kUndetermined.
status_ =
Status::NotSupported("FilterV2() should never return kUndetermined");
validity_info_.Invalidate();
return false;
}
if (decision == CompactionFilter::Decision::kRemoveAndSkipUntil &&
if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil &&
cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <=
0) {
// Can't skip to a key smaller than the current one.
// Keep the key as per FilterV2/FilterV3 documentation.
decision = CompactionFilter::Decision::kKeep;
// Keep the key as per FilterV2 documentation.
filter = CompactionFilter::Decision::kKeep;
}
if (decision == CompactionFilter::Decision::kRemove) {
if (filter == CompactionFilter::Decision::kRemove) {
// convert the current key to a delete; key_ is pointing into
// current_key_ at this point, so updating current_key_ updates key()
ikey_.type = kTypeDeletion;
@ -371,7 +316,7 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
// no value associated with delete
value_.clear();
iter_stats_.num_record_drop_user++;
} else if (decision == CompactionFilter::Decision::kPurge) {
} else if (filter == CompactionFilter::Decision::kPurge) {
// convert the current key to a single delete; key_ is pointing into
// current_key_ at this point, so updating current_key_ updates key()
ikey_.type = kTypeSingleDeletion;
@ -379,19 +324,19 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
// no value associated with single delete
value_.clear();
iter_stats_.num_record_drop_user++;
} else if (decision == CompactionFilter::Decision::kChangeValue) {
if (ikey_.type != kTypeValue) {
} else if (filter == CompactionFilter::Decision::kChangeValue) {
if (ikey_.type == kTypeBlobIndex) {
// value transfer from blob file to inlined data
ikey_.type = kTypeValue;
current_key_.UpdateInternalKey(ikey_.sequence, kTypeValue);
current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
}
value_ = compaction_filter_value_;
} else if (decision == CompactionFilter::Decision::kRemoveAndSkipUntil) {
} else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) {
*need_skip = true;
compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber,
kValueTypeForSeek);
*skip_until = compaction_filter_skip_until_.Encode();
} else if (decision == CompactionFilter::Decision::kChangeBlobIndex) {
} else if (filter == CompactionFilter::Decision::kChangeBlobIndex) {
// Only the StackableDB-based BlobDB impl's compaction filter should return
// kChangeBlobIndex. Decision about rewriting blob and changing blob index
// in the integrated BlobDB impl is made in subsequent call to
@ -403,56 +348,23 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
validity_info_.Invalidate();
return false;
}
if (ikey_.type != kTypeBlobIndex) {
if (ikey_.type == kTypeValue) {
// value transfer from inlined data to blob file
ikey_.type = kTypeBlobIndex;
current_key_.UpdateInternalKey(ikey_.sequence, kTypeBlobIndex);
current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
}
value_ = compaction_filter_value_;
} else if (decision == CompactionFilter::Decision::kIOError) {
} else if (filter == CompactionFilter::Decision::kIOError) {
if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
status_ = Status::NotSupported(
"CompactionFilter for integrated BlobDB should not return kIOError");
validity_info_.Invalidate();
return false;
}
status_ = Status::IOError("Failed to access blob during compaction filter");
validity_info_.Invalidate();
return false;
} else if (decision == CompactionFilter::Decision::kChangeWideColumnEntity) {
WideColumns sorted_columns;
sorted_columns.reserve(new_columns.size());
for (const auto& column : new_columns) {
sorted_columns.emplace_back(column.first, column.second);
}
std::sort(sorted_columns.begin(), sorted_columns.end(),
[](const WideColumn& lhs, const WideColumn& rhs) {
return lhs.name().compare(rhs.name()) < 0;
});
{
const Status s = WideColumnSerialization::Serialize(
sorted_columns, compaction_filter_value_);
if (!s.ok()) {
status_ = s;
validity_info_.Invalidate();
return false;
}
}
if (ikey_.type != kTypeWideColumnEntity) {
ikey_.type = kTypeWideColumnEntity;
current_key_.UpdateInternalKey(ikey_.sequence, kTypeWideColumnEntity);
}
value_ = compaction_filter_value_;
error = true;
}
return true;
return !error;
}
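Stepping back, the branches above all funnel one CompactionFilter::Decision value into a mutation of the current record: drop it, turn it into a delete or single-delete, rewrite its value in place, or skip ahead to a later key. Here is a compact, self-contained sketch of that dispatch using a local Decision enum and record struct (the real enum has further members, such as the blob-index and IO-error cases, which are omitted here):

#include <iostream>
#include <optional>
#include <string>

enum class Decision { kKeep, kRemove, kPurge, kChangeValue, kRemoveAndSkipUntil };

enum class EntryType { kValue, kDeletion, kSingleDeletion };

struct Record {
  std::string user_key;
  std::string value;
  EntryType type = EntryType::kValue;
};

// Applies a filter decision to the current record. Returns the key to skip
// until when the decision asks for it, mirroring how kRemove/kPurge become
// tombstones and kChangeValue becomes an in-place rewrite.
std::optional<std::string> ApplyDecision(Decision d, Record* rec,
                                         const std::string& new_value,
                                         const std::string& skip_until) {
  switch (d) {
    case Decision::kKeep:
      break;
    case Decision::kRemove:
      rec->type = EntryType::kDeletion;  // becomes a delete tombstone
      rec->value.clear();
      break;
    case Decision::kPurge:
      rec->type = EntryType::kSingleDeletion;  // becomes a single-delete
      rec->value.clear();
      break;
    case Decision::kChangeValue:
      rec->value = new_value;  // rewrite the value in place
      break;
    case Decision::kRemoveAndSkipUntil:
      // Only honor the skip if it actually moves forward; otherwise keep the
      // key, which is the documented fallback for a too-small skip target.
      if (skip_until > rec->user_key) {
        return skip_until;
      }
      break;
  }
  return std::nullopt;
}

int main() {
  Record rec{"key1", "old", EntryType::kValue};
  ApplyDecision(Decision::kChangeValue, &rec, "new", "");
  std::cout << rec.value << std::endl;  // prints "new"
  return 0;
}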
void CompactionIterator::NextFromInput() {
@ -983,15 +895,14 @@ void CompactionIterator::NextFromInput() {
// have hit (A)
// We encapsulate the merge related state machine in a different
// object to minimize change to the existing flow.
merge_until_status_ = merge_helper_->MergeUntil(
Status s = merge_helper_->MergeUntil(
&input_, range_del_agg_, prev_snapshot, bottommost_level_,
allow_data_in_errors_, blob_fetcher_.get(), full_history_ts_low_,
prefetch_buffers_.get(), &iter_stats_);
merge_out_iter_.SeekToFirst();
if (!merge_until_status_.ok() &&
!merge_until_status_.IsMergeInProgress()) {
status_ = merge_until_status_;
if (!s.ok() && !s.IsMergeInProgress()) {
status_ = s;
return;
} else if (merge_out_iter_.Valid()) {
// NOTE: key, value, and ikey_ refer to old entries.
@ -1202,7 +1113,17 @@ void CompactionIterator::GarbageCollectBlobIfNeeded() {
void CompactionIterator::DecideOutputLevel() {
assert(compaction_->SupportsPerKeyPlacement());
#ifndef NDEBUG
// Could be overridden by unittest
PerKeyPlacementContext context(level_, ikey_.user_key, value_,
ikey_.sequence);
TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
&context);
output_to_penultimate_level_ = context.output_to_penultimate_level;
#else
output_to_penultimate_level_ = false;
#endif // NDEBUG
// if the key is newer than the cutoff sequence or within the earliest
// snapshot, it should be output to the penultimate level.
if (ikey_.sequence > preclude_last_level_min_seqno_ ||
@ -1210,17 +1131,6 @@ void CompactionIterator::DecideOutputLevel() {
output_to_penultimate_level_ = true;
}
#ifndef NDEBUG
// Could be overridden by unittest
PerKeyPlacementContext context(level_, ikey_.user_key, value_, ikey_.sequence,
output_to_penultimate_level_);
TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
&context);
if (ikey_.sequence > earliest_snapshot_) {
output_to_penultimate_level_ = true;
}
#endif // NDEBUG
if (output_to_penultimate_level_) {
// If it's decided to output to the penultimate level, but unsafe to do so,
// still output to the last level. For example, moving the data from a lower
@ -1413,7 +1323,6 @@ std::unique_ptr<BlobFetcher> CompactionIterator::CreateBlobFetcherIfNeeded(
}
ReadOptions read_options;
read_options.io_activity = Env::IOActivity::kCompaction;
read_options.fill_cache = false;
return std::unique_ptr<BlobFetcher>(new BlobFetcher(version, read_options));

@ -38,18 +38,15 @@ class SequenceIterWrapper : public InternalIterator {
bool Valid() const override { return inner_iter_->Valid(); }
Status status() const override { return inner_iter_->status(); }
void Next() override {
if (!inner_iter_->IsDeleteRangeSentinelKey()) {
num_itered_++;
}
num_itered_++;
inner_iter_->Next();
}
void Seek(const Slice& target) override {
if (!need_count_entries_) {
has_num_itered_ = false;
inner_iter_->Seek(target);
} else {
// Need to count total number of entries,
// so we do Next() rather than Seek().
// For flush cases, we need to count total number of entries, so we
// do Next() rather than Seek().
while (inner_iter_->Valid() &&
icmp_.Compare(inner_iter_->key(), target) < 0) {
Next();
@ -65,8 +62,7 @@ class SequenceIterWrapper : public InternalIterator {
void SeekForPrev(const Slice& /* target */) override { assert(false); }
void SeekToLast() override { assert(false); }
uint64_t NumItered() const { return num_itered_; }
bool HasNumItered() const { return has_num_itered_; }
uint64_t num_itered() const { return num_itered_; }
bool IsDeleteRangeSentinelKey() const override {
assert(Valid());
return inner_iter_->IsDeleteRangeSentinelKey();
@ -77,7 +73,6 @@ class SequenceIterWrapper : public InternalIterator {
InternalIterator* inner_iter_; // not owned
uint64_t num_itered_ = 0;
bool need_count_entries_;
bool has_num_itered_ = true;
};
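The wrapper above exists so a compaction can report how many input entries it scanned: when an exact total is required, Seek() degrades into repeated Next() calls so every entry passes through the counter. A self-contained sketch of that idea over a sorted std::vector<std::string> (local types only, not the InternalIterator interface):

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Wraps a sorted vector and counts every entry it steps over. When
// need_count_entries is set, Seek() walks forward with Next() instead of
// jumping, so num_itered() stays an exact total of entries scanned.
class CountingIter {
 public:
  CountingIter(const std::vector<std::string>& data, bool need_count_entries)
      : data_(data), need_count_entries_(need_count_entries) {}

  bool Valid() const { return pos_ < data_.size(); }
  const std::string& key() const { return data_[pos_]; }

  void Next() {
    ++num_itered_;
    ++pos_;
  }

  void Seek(const std::string& target) {
    if (!need_count_entries_) {
      // Fast path: after a jump the count is no longer an exact total.
      has_exact_count_ = false;
      while (Valid() && key() < target) ++pos_;
    } else {
      // Counting path: advance one entry at a time so each one is counted.
      while (Valid() && key() < target) Next();
    }
  }

  uint64_t num_itered() const { return num_itered_; }
  bool has_exact_count() const { return has_exact_count_; }

 private:
  const std::vector<std::string>& data_;
  size_t pos_ = 0;
  uint64_t num_itered_ = 0;
  bool need_count_entries_;
  bool has_exact_count_ = true;
};

int main() {
  std::vector<std::string> data = {"a", "b", "c", "d"};
  CountingIter it(data, /*need_count_entries=*/true);
  it.Seek("c");
  while (it.Valid()) it.Next();
  std::cout << it.num_itered() << std::endl;  // 4: every entry was counted
  return 0;
}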
class CompactionIterator {
@ -194,10 +189,6 @@ class CompactionIterator {
const Compaction* compaction_;
};
// @param must_count_input_entries if true, `NumInputEntryScanned()` will
// return the number of input keys scanned. If false, `NumInputEntryScanned()`
// will return this number if no Seek was called on `input`. User should call
// `HasNumInputEntryScanned()` first in this case.
CompactionIterator(
InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
@ -208,7 +199,7 @@ class CompactionIterator {
BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
bool enforce_single_del_contracts,
const std::atomic<bool>& manual_compaction_canceled,
bool must_count_input_entries, const Compaction* compaction = nullptr,
const Compaction* compaction = nullptr,
const CompactionFilter* compaction_filter = nullptr,
const std::atomic<bool>* shutting_down = nullptr,
const std::shared_ptr<Logger> info_log = nullptr,
@ -228,7 +219,6 @@ class CompactionIterator {
bool enforce_single_del_contracts,
const std::atomic<bool>& manual_compaction_canceled,
std::unique_ptr<CompactionProxy> compaction,
bool must_count_input_entries,
const CompactionFilter* compaction_filter = nullptr,
const std::atomic<bool>* shutting_down = nullptr,
const std::shared_ptr<Logger> info_log = nullptr,
@ -263,8 +253,7 @@ class CompactionIterator {
return current_user_key_;
}
const CompactionIterationStats& iter_stats() const { return iter_stats_; }
bool HasNumInputEntryScanned() const { return input_.HasNumItered(); }
uint64_t NumInputEntryScanned() const { return input_.NumItered(); }
uint64_t num_input_entry_scanned() const { return input_.num_itered(); }
// If the current key should be placed on penultimate level, only valid if
// per_key_placement is supported
bool output_to_penultimate_level() const {
@ -455,7 +444,6 @@ class CompactionIterator {
bool clear_and_output_next_key_ = false;
MergeOutputIterator merge_out_iter_;
Status merge_until_status_;
// PinnedIteratorsManager used to pin input_ Iterator blocks while reading
// merge operands and then releasing them after consuming them.
PinnedIteratorsManager pinned_iters_mgr_;

@ -293,8 +293,8 @@ class CompactionIteratorTest : public testing::TestWithParam<bool> {
nullptr /* blob_file_builder */, true /*allow_data_in_errors*/,
true /*enforce_single_del_contracts*/,
/*manual_compaction_canceled=*/kManualCompactionCanceledFalse_,
std::move(compaction), /*must_count_input_entries=*/false, filter,
&shutting_down_, /*info_log=*/nullptr, full_history_ts_low));
std::move(compaction), filter, &shutting_down_, /*info_log=*/nullptr,
full_history_ts_low));
}
void AddSnapshot(SequenceNumber snapshot,

@ -192,8 +192,8 @@ CompactionJob::CompactionJob(
assert(log_buffer_ != nullptr);
const auto* cfd = compact_->compaction->column_family_data();
ThreadStatusUtil::SetEnableTracking(db_options_.enable_thread_tracking);
ThreadStatusUtil::SetColumnFamily(cfd);
ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
db_options_.enable_thread_tracking);
ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
ReportStartedCompaction(compaction);
}
@ -204,6 +204,10 @@ CompactionJob::~CompactionJob() {
}
void CompactionJob::ReportStartedCompaction(Compaction* compaction) {
const auto* cfd = compact_->compaction->column_family_data();
ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
db_options_.enable_thread_tracking);
ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID,
job_id_);
@ -260,7 +264,7 @@ void CompactionJob::Prepare() {
StopWatch sw(db_options_.clock, stats_, SUBCOMPACTION_SETUP_TIME);
GenSubcompactionBoundaries();
}
if (boundaries_.size() >= 1) {
if (boundaries_.size() > 1) {
for (size_t i = 0; i <= boundaries_.size(); i++) {
compact_->sub_compact_states.emplace_back(
c, (i != 0) ? std::optional<Slice>(boundaries_[i - 1]) : std::nullopt,
@ -287,14 +291,12 @@ void CompactionJob::Prepare() {
c->immutable_options()->preclude_last_level_data_seconds);
if (preserve_time_duration > 0) {
const ReadOptions read_options(Env::IOActivity::kCompaction);
// setup seqno_time_mapping_
seqno_time_mapping_.SetMaxTimeDuration(preserve_time_duration);
for (const auto& each_level : *c->inputs()) {
for (const auto& fmd : each_level.files) {
std::shared_ptr<const TableProperties> tp;
Status s =
cfd->current()->GetTableProperties(read_options, &tp, fmd, nullptr);
Status s = cfd->current()->GetTableProperties(&tp, fmd, nullptr);
if (s.ok()) {
seqno_time_mapping_.Add(tp->seqno_to_time_mapping)
.PermitUncheckedError();
@ -470,7 +472,7 @@ void CompactionJob::GenSubcompactionBoundaries() {
// overlap with N-1 other ranges. Since we requested a relatively large number
// (128) of ranges from each input files, even N range overlapping would
// cause relatively small inaccuracy.
const ReadOptions read_options(Env::IOActivity::kCompaction);
auto* c = compact_->compaction;
if (c->max_subcompactions() <= 1 &&
!(c->immutable_options()->compaction_pri == kRoundRobin &&
@ -504,9 +506,7 @@ void CompactionJob::GenSubcompactionBoundaries() {
FileMetaData* f = flevel->files[i].file_metadata;
std::vector<TableReader::Anchor> my_anchors;
Status s = cfd->table_cache()->ApproximateKeyAnchors(
read_options, icomp, *f,
c->mutable_cf_options()->block_protection_bytes_per_key,
my_anchors);
ReadOptions(), icomp, *f, my_anchors);
if (!s.ok() || my_anchors.empty()) {
my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize());
}
@ -722,12 +722,11 @@ Status CompactionJob::Run() {
// use_direct_io_for_flush_and_compaction is true, we will regard this
// verification as user reads since the goal is to cache it here for
// further user reads
const ReadOptions verify_table_read_options(
Env::IOActivity::kCompaction);
ReadOptions read_options;
InternalIterator* iter = cfd->table_cache()->NewIterator(
verify_table_read_options, file_options_,
cfd->internal_comparator(), files_output[file_idx]->meta,
/*range_del_agg=*/nullptr, prefix_extractor,
read_options, file_options_, cfd->internal_comparator(),
files_output[file_idx]->meta, /*range_del_agg=*/nullptr,
prefix_extractor,
/*table_reader_ptr=*/nullptr,
cfd->internal_stats()->GetFileReadHist(
compact_->compaction->output_level()),
@ -737,9 +736,7 @@ Status CompactionJob::Run() {
*compact_->compaction->mutable_cf_options()),
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key=*/nullptr,
/*allow_unprepared_value=*/false,
compact_->compaction->mutable_cf_options()
->block_protection_bytes_per_key);
/*allow_unprepared_value=*/false);
auto s = iter->status();
if (s.ok() && paranoid_file_checks_) {
@ -796,51 +793,20 @@ Status CompactionJob::Run() {
auto fn =
TableFileName(state.compaction->immutable_options()->cf_paths,
output.meta.fd.GetNumber(), output.meta.fd.GetPathId());
compact_->compaction->SetOutputTableProperties(fn,
output.table_properties);
tp[fn] = output.table_properties;
}
}
compact_->compaction->SetOutputTableProperties(std::move(tp));
// Finish up all bookkeeping to unify the subcompaction results.
// Finish up all book-keeping to unify the subcompaction results
compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
uint64_t num_input_range_del = 0;
bool ok = UpdateCompactionStats(&num_input_range_del);
// (Sub)compactions returned ok, do sanity check on the number of input keys.
if (status.ok() && ok && compaction_job_stats_->has_num_input_records) {
size_t ts_sz = compact_->compaction->column_family_data()
->user_comparator()
->timestamp_size();
// When trim_ts_ is non-empty, CompactionIterator takes
// HistoryTrimmingIterator as input iterator and sees a trimmed view of
// input keys. So the number of keys it processed is not suitable for
// verification here.
// TODO: support verification when trim_ts_ is non-empty.
if (!(ts_sz > 0 && !trim_ts_.empty()) &&
db_options_.compaction_verify_record_count) {
assert(compaction_stats_.stats.num_input_records > 0);
// TODO: verify the number of range deletion entries.
uint64_t expected =
compaction_stats_.stats.num_input_records - num_input_range_del;
uint64_t actual = compaction_job_stats_->num_input_records;
if (expected != actual) {
std::string msg =
"Total number of input records: " + std::to_string(expected) +
", but processed " + std::to_string(actual) + " records.";
ROCKS_LOG_WARN(
db_options_.info_log, "[%s] [JOB %d] Compaction %s",
compact_->compaction->column_family_data()->GetName().c_str(),
job_context_->job_id, msg.c_str());
status = Status::Corruption(
"Compaction number of input keys does not match number of keys "
"processed.");
}
}
}
UpdateCompactionStats();
RecordCompactionIOStats();
LogFlush(db_options_.info_log);
TEST_SYNC_POINT("CompactionJob::Run():End");
compact_->status = status;
TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():EndStatusSet", &status);
return status;
}
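One of the hunks above is a consistency check: the number of point entries the compaction iterator actually consumed is compared against what the input files' metadata predicts once range deletions are subtracted, and a mismatch is reported as corruption. A self-contained sketch of that bookkeeping, with plain structs and the status reduced to an error string:

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct InputFileStats {
  uint64_t num_entries = 0;          // all entries, including range tombstones
  uint64_t num_range_deletions = 0;  // range tombstone entries only
};

// Returns an error message if the processed point-entry count does not match
// what the input metadata predicts, mirroring the expected-vs-actual check.
std::string VerifyInputRecordCount(const std::vector<InputFileStats>& inputs,
                                   uint64_t num_records_processed) {
  uint64_t total_entries = 0;
  uint64_t total_range_del = 0;
  for (const auto& f : inputs) {
    total_entries += f.num_entries;
    total_range_del += f.num_range_deletions;
  }
  const uint64_t expected = total_entries - total_range_del;
  if (expected != num_records_processed) {
    return "Total number of input records: " + std::to_string(expected) +
           ", but processed " + std::to_string(num_records_processed) +
           " records.";
  }
  return "";  // counts agree
}

int main() {
  std::vector<InputFileStats> inputs = {{100, 4}, {50, 0}};
  std::string err = VerifyInputRecordCount(inputs, 146);  // 150 - 4 == 146
  std::cout << (err.empty() ? "OK" : err) << std::endl;
  return 0;
}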
@ -1012,6 +978,7 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) {
void CompactionJob::NotifyOnSubcompactionBegin(
SubcompactionState* sub_compact) {
#ifndef ROCKSDB_LITE
Compaction* c = compact_->compaction;
if (db_options_.listeners.empty()) {
@ -1037,10 +1004,14 @@ void CompactionJob::NotifyOnSubcompactionBegin(
}
info.status.PermitUncheckedError();
#else
(void)sub_compact;
#endif // ROCKSDB_LITE
}
void CompactionJob::NotifyOnSubcompactionCompleted(
SubcompactionState* sub_compact) {
#ifndef ROCKSDB_LITE
if (db_options_.listeners.empty()) {
return;
@ -1061,11 +1032,16 @@ void CompactionJob::NotifyOnSubcompactionCompleted(
for (const auto& listener : db_options_.listeners) {
listener->OnSubcompactionCompleted(info);
}
#else
(void)sub_compact;
#endif // ROCKSDB_LITE
}
void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
assert(sub_compact);
assert(sub_compact->compaction);
#ifndef ROCKSDB_LITE
if (db_options_.compaction_service) {
CompactionServiceJobStatus comp_status =
ProcessKeyValueCompactionWithCompactionService(sub_compact);
@ -1076,6 +1052,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
// fallback to local compaction
assert(comp_status == CompactionServiceJobStatus::kUseLocal);
}
#endif // !ROCKSDB_LITE
uint64_t prev_cpu_micros = db_options_.clock->CPUMicros();
@ -1116,7 +1093,6 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
read_options.verify_checksums = true;
read_options.fill_cache = false;
read_options.rate_limiter_priority = GetRateLimiterPriority();
read_options.io_activity = Env::IOActivity::kCompaction;
// Compaction iterators shouldn't be confined to a single prefix.
// Compactions use Seek() for
// (a) concurrent compactions,
@ -1127,17 +1103,17 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
// GenSubcompactionBoundaries doesn't strip away the timestamp.
size_t ts_sz = cfd->user_comparator()->timestamp_size();
if (start.has_value()) {
read_options.iterate_lower_bound = &(*start);
read_options.iterate_lower_bound = &start.value();
if (ts_sz > 0) {
start_without_ts = StripTimestampFromUserKey(*start, ts_sz);
read_options.iterate_lower_bound = &(*start_without_ts);
start_without_ts = StripTimestampFromUserKey(start.value(), ts_sz);
read_options.iterate_lower_bound = &start_without_ts.value();
}
}
if (end.has_value()) {
read_options.iterate_upper_bound = &(*end);
read_options.iterate_upper_bound = &end.value();
if (ts_sz > 0) {
end_without_ts = StripTimestampFromUserKey(*end, ts_sz);
read_options.iterate_upper_bound = &(*end_without_ts);
end_without_ts = StripTimestampFromUserKey(end.value(), ts_sz);
read_options.iterate_upper_bound = &end_without_ts.value();
}
}
@ -1152,8 +1128,6 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
IterKey end_ikey;
Slice start_slice;
Slice end_slice;
Slice start_user_key{};
Slice end_user_key{};
static constexpr char kMaxTs[] =
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff";
@ -1169,22 +1143,21 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
}
if (start.has_value()) {
start_ikey.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek);
start_ikey.SetInternalKey(start.value(), kMaxSequenceNumber,
kValueTypeForSeek);
if (ts_sz > 0) {
start_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
&ts_slice);
}
start_slice = start_ikey.GetInternalKey();
start_user_key = start_ikey.GetUserKey();
}
if (end.has_value()) {
end_ikey.SetInternalKey(*end, kMaxSequenceNumber, kValueTypeForSeek);
end_ikey.SetInternalKey(end.value(), kMaxSequenceNumber, kValueTypeForSeek);
if (ts_sz > 0) {
end_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
&ts_slice);
}
end_slice = end_ikey.GetInternalKey();
end_user_key = end_ikey.GetUserKey();
}
std::unique_ptr<InternalIterator> clip;
@ -1283,8 +1256,6 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
/*expect_valid_internal_key=*/true, range_del_agg.get(),
blob_file_builder.get(), db_options_.allow_data_in_errors,
db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
sub_compact->compaction
->DoesInputReferenceBlobFiles() /* must_count_input_entries */,
sub_compact->compaction, compaction_filter, shutting_down_,
db_options_.info_log, full_history_ts_low, preserve_time_min_seqno_,
preclude_last_level_min_seqno_);
@ -1302,15 +1273,11 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
[this, sub_compact](CompactionOutputs& outputs) {
return this->OpenCompactionOutputFile(sub_compact, outputs);
};
const CompactionFileCloseFunc close_file_func =
[this, sub_compact, start_user_key, end_user_key](
CompactionOutputs& outputs, const Status& status,
const Slice& next_table_min_key) {
return this->FinishCompactionOutputFile(
status, sub_compact, outputs, next_table_min_key,
sub_compact->start.has_value() ? &start_user_key : nullptr,
sub_compact->end.has_value() ? &end_user_key : nullptr);
[this, sub_compact](CompactionOutputs& outputs, const Status& status,
const Slice& next_table_min_key) {
return this->FinishCompactionOutputFile(status, sub_compact, outputs,
next_table_min_key);
};
Status status;
@ -1321,8 +1288,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) {
// Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
// returns true.
assert(!end.has_value() ||
cfd->user_comparator()->Compare(c_iter->user_key(), *end) < 0);
assert(!end.has_value() || cfd->user_comparator()->Compare(
c_iter->user_key(), end.value()) < 0);
if (c_iter_stats.num_input_records % kRecordStatsEvery ==
kRecordStatsEvery - 1) {
@ -1349,25 +1316,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
if (c_iter->status().IsManualCompactionPaused()) {
break;
}
#ifndef NDEBUG
bool stop = false;
TEST_SYNC_POINT_CALLBACK("CompactionJob::ProcessKeyValueCompaction()::stop",
static_cast<void*>(&stop));
if (stop) {
break;
}
#endif // NDEBUG
}
// This number may not be accurate when CompactionIterator was created
// with `must_count_input_entries=false`.
assert(!sub_compact->compaction->DoesInputReferenceBlobFiles() ||
c_iter->HasNumInputEntryScanned());
sub_compact->compaction_job_stats.has_num_input_records =
c_iter->HasNumInputEntryScanned();
sub_compact->compaction_job_stats.num_input_records =
c_iter->NumInputEntryScanned();
sub_compact->compaction_job_stats.num_blobs_read =
c_iter_stats.num_blobs_read;
sub_compact->compaction_job_stats.total_blob_bytes_read =
@ -1517,8 +1467,7 @@ void CompactionJob::RecordDroppedKeys(
Status CompactionJob::FinishCompactionOutputFile(
const Status& input_status, SubcompactionState* sub_compact,
CompactionOutputs& outputs, const Slice& next_table_min_key,
const Slice* comp_start_user_key, const Slice* comp_end_user_key) {
CompactionOutputs& outputs, const Slice& next_table_min_key) {
AutoThreadOperationStageUpdater stage_updater(
ThreadStatus::STAGE_COMPACTION_SYNC_FILE);
assert(sub_compact != nullptr);
@ -1548,10 +1497,12 @@ Status CompactionJob::FinishCompactionOutputFile(
// output_to_penultimate_level compaction here, as it's only used to decide
// if range dels could be dropped.
if (outputs.HasRangeDel()) {
s = outputs.AddRangeDels(comp_start_user_key, comp_end_user_key,
range_del_out_stats, bottommost_level_,
cfd->internal_comparator(), earliest_snapshot,
next_table_min_key, full_history_ts_low_);
s = outputs.AddRangeDels(
sub_compact->start.has_value() ? &(sub_compact->start.value())
: nullptr,
sub_compact->end.has_value() ? &(sub_compact->end.value()) : nullptr,
range_del_out_stats, bottommost_level_, cfd->internal_comparator(),
earliest_snapshot, next_table_min_key, full_history_ts_low_);
}
RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats);
TEST_SYNC_POINT("CompactionJob::FinishCompactionOutputFile1");
@ -1663,6 +1614,7 @@ Status CompactionJob::FinishCompactionOutputFile(
TableFileCreationReason::kCompaction, status_for_listener, file_checksum,
file_checksum_func_name);
#ifndef ROCKSDB_LITE
// Report new file to SstFileManagerImpl
auto sfm =
static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
@ -1681,6 +1633,7 @@ Status CompactionJob::FinishCompactionOutputFile(
db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction);
}
}
#endif
outputs.ResetBuilder();
return s;
@ -1692,7 +1645,6 @@ Status CompactionJob::InstallCompactionResults(
db_mutex_->AssertHeld();
const ReadOptions read_options(Env::IOActivity::kCompaction);
auto* compaction = compact_->compaction;
assert(compaction);
@ -1770,8 +1722,8 @@ Status CompactionJob::InstallCompactionResults(
}
return versions_->LogAndApply(compaction->column_family_data(),
mutable_cf_options, read_options, edit,
db_mutex_, db_directory_);
mutable_cf_options, edit, db_mutex_,
db_directory_);
}
void CompactionJob::RecordCompactionIOStats() {
@ -1806,9 +1758,11 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
std::string fname = GetTableFileName(file_number);
// Fire events.
ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
#ifndef ROCKSDB_LITE
EventHelpers::NotifyTableFileCreationStarted(
cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_,
TableFileCreationReason::kCompaction);
#endif // !ROCKSDB_LITE
// Make the output file
std::unique_ptr<FSWritableFile> writable_file;
#ifndef NDEBUG
@ -1867,10 +1821,10 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
uint64_t current_time = static_cast<uint64_t>(temp_current_time);
InternalKey tmp_start, tmp_end;
if (sub_compact->start.has_value()) {
tmp_start.SetMinPossibleForUserKey(*(sub_compact->start));
tmp_start.SetMinPossibleForUserKey(sub_compact->start.value());
}
if (sub_compact->end.has_value()) {
tmp_end.SetMinPossibleForUserKey(*(sub_compact->end));
tmp_end.SetMinPossibleForUserKey(sub_compact->end.value());
}
uint64_t oldest_ancester_time =
sub_compact->compaction->MinInputFileOldestAncesterTime(
@ -1945,6 +1899,7 @@ void CompactionJob::CleanupCompaction() {
compact_ = nullptr;
}
#ifndef ROCKSDB_LITE
namespace {
void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
assert(prefix_length > 0);
@ -1953,53 +1908,25 @@ void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
}
} // namespace
bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) {
#endif // !ROCKSDB_LITE
void CompactionJob::UpdateCompactionStats() {
assert(compact_);
Compaction* compaction = compact_->compaction;
compaction_stats_.stats.num_input_files_in_non_output_levels = 0;
compaction_stats_.stats.num_input_files_in_output_level = 0;
bool has_error = false;
const ReadOptions read_options(Env::IOActivity::kCompaction);
const auto& input_table_properties = compaction->GetTableProperties();
for (int input_level = 0;
input_level < static_cast<int>(compaction->num_input_levels());
++input_level) {
size_t num_input_files = compaction->num_input_files(input_level);
uint64_t* bytes_read;
if (compaction->level(input_level) != compaction->output_level()) {
compaction_stats_.stats.num_input_files_in_non_output_levels +=
static_cast<int>(num_input_files);
bytes_read = &compaction_stats_.stats.bytes_read_non_output_levels;
UpdateCompactionInputStatsHelper(
&compaction_stats_.stats.num_input_files_in_non_output_levels,
&compaction_stats_.stats.bytes_read_non_output_levels, input_level);
} else {
compaction_stats_.stats.num_input_files_in_output_level +=
static_cast<int>(num_input_files);
bytes_read = &compaction_stats_.stats.bytes_read_output_level;
}
for (size_t i = 0; i < num_input_files; ++i) {
const FileMetaData* file_meta = compaction->input(input_level, i);
*bytes_read += file_meta->fd.GetFileSize();
uint64_t file_input_entries = file_meta->num_entries;
uint64_t file_num_range_del = file_meta->num_range_deletions;
if (file_input_entries == 0) {
uint64_t file_number = file_meta->fd.GetNumber();
// Try getting info from table property
std::string fn =
TableFileName(compaction->immutable_options()->cf_paths,
file_number, file_meta->fd.GetPathId());
const auto& tp = input_table_properties.find(fn);
if (tp != input_table_properties.end()) {
file_input_entries = tp->second->num_entries;
file_num_range_del = tp->second->num_range_deletions;
} else {
has_error = true;
}
}
compaction_stats_.stats.num_input_records += file_input_entries;
if (num_input_range_del) {
*num_input_range_del += file_num_range_del;
}
UpdateCompactionInputStatsHelper(
&compaction_stats_.stats.num_input_files_in_output_level,
&compaction_stats_.stats.bytes_read_output_level, input_level);
}
}
@ -2009,11 +1936,26 @@ bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) {
compaction_stats_.stats.num_dropped_records =
compaction_stats_.DroppedRecords();
return !has_error;
}
void CompactionJob::UpdateCompactionInputStatsHelper(int* num_files,
uint64_t* bytes_read,
int input_level) {
const Compaction* compaction = compact_->compaction;
auto num_input_files = compaction->num_input_files(input_level);
*num_files += static_cast<int>(num_input_files);
for (size_t i = 0; i < num_input_files; ++i) {
const auto* file_meta = compaction->input(input_level, i);
*bytes_read += file_meta->fd.GetFileSize();
compaction_stats_.stats.num_input_records +=
static_cast<uint64_t>(file_meta->num_entries);
}
}
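The helper above folds one input level into two counters, file count and bytes read, choosing the bucket by whether that level is the compaction's output level. A minimal sketch of the same aggregation over plain structs (illustrative types, not the InternalStats structures):

#include <cstdint>
#include <iostream>
#include <vector>

struct InputFile {
  uint64_t file_size = 0;
  uint64_t num_entries = 0;
};

struct InputLevel {
  int level = 0;
  std::vector<InputFile> files;
};

struct CompactionInputStats {
  int num_input_files_in_non_output_levels = 0;
  int num_input_files_in_output_level = 0;
  uint64_t bytes_read_non_output_levels = 0;
  uint64_t bytes_read_output_level = 0;
  uint64_t num_input_records = 0;
};

// Accumulates per-level file counts, bytes and record counts, splitting the
// buckets by whether the input level is the compaction's output level.
CompactionInputStats AggregateInputStats(const std::vector<InputLevel>& inputs,
                                         int output_level) {
  CompactionInputStats stats;
  for (const auto& lvl : inputs) {
    int* num_files = nullptr;
    uint64_t* bytes_read = nullptr;
    if (lvl.level != output_level) {
      num_files = &stats.num_input_files_in_non_output_levels;
      bytes_read = &stats.bytes_read_non_output_levels;
    } else {
      num_files = &stats.num_input_files_in_output_level;
      bytes_read = &stats.bytes_read_output_level;
    }
    *num_files += static_cast<int>(lvl.files.size());
    for (const auto& f : lvl.files) {
      *bytes_read += f.file_size;
      stats.num_input_records += f.num_entries;
    }
  }
  return stats;
}

int main() {
  std::vector<InputLevel> inputs = {{1, {{4096, 10}, {8192, 20}}},
                                    {2, {{16384, 40}}}};
  CompactionInputStats s = AggregateInputStats(inputs, /*output_level=*/2);
  std::cout << s.bytes_read_non_output_levels << " "
            << s.bytes_read_output_level << " " << s.num_input_records
            << std::endl;  // 12288 16384 70
  return 0;
}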
void CompactionJob::UpdateCompactionJobStats(
const InternalStats::CompactionStats& stats) const {
#ifndef ROCKSDB_LITE
compaction_job_stats_->elapsed_micros = stats.micros;
// input information
@ -2040,6 +1982,9 @@ void CompactionJob::UpdateCompactionJobStats(
CopyPrefix(compact_->LargestUserKey(), CompactionJobStats::kMaxPrefixLength,
&compaction_job_stats_->largest_output_key_prefix);
}
#else
(void)stats;
#endif // !ROCKSDB_LITE
}
void CompactionJob::LogCompaction() {

@ -192,21 +192,7 @@ class CompactionJob {
IOStatus io_status() const { return io_status_; }
protected:
// Update the following stats in compaction_stats_.stats
// - num_input_files_in_non_output_levels
// - num_input_files_in_output_level
// - bytes_read_non_output_levels
// - bytes_read_output_level
// - num_input_records
// - bytes_read_blob
// - num_dropped_records
//
// @param num_input_range_del if non-null, will be set to the number of range
// deletion entries in this compaction input.
//
// Returns true iff compaction_stats_.stats.num_input_records and
// num_input_range_del are calculated successfully.
bool UpdateCompactionStats(uint64_t* num_input_range_del = nullptr);
void UpdateCompactionStats();
void LogCompaction();
virtual void RecordCompactionIOStats();
void CleanupCompaction();
@ -270,9 +256,7 @@ class CompactionJob {
Status FinishCompactionOutputFile(const Status& input_status,
SubcompactionState* sub_compact,
CompactionOutputs& outputs,
const Slice& next_table_min_key,
const Slice* comp_start_user_key,
const Slice* comp_end_user_key);
const Slice& next_table_min_key);
Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options);
Status OpenCompactionOutputFile(SubcompactionState* sub_compact,
CompactionOutputs& outputs);
@ -281,6 +265,9 @@ class CompactionJob {
void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats,
CompactionJobStats* compaction_job_stats = nullptr);
void UpdateCompactionInputStatsHelper(int* num_files, uint64_t* bytes_read,
int input_level);
void NotifyOnSubcompactionBegin(SubcompactionState* sub_compact);
void NotifyOnSubcompactionCompleted(SubcompactionState* sub_compact);

@ -24,7 +24,7 @@
#include "db/write_batch_internal.h"
#include "env/mock_env.h"
#include "file/filename.h"
#include "monitoring/statistics_impl.h"
#include "monitoring/statistics.h"
#include "monitoring/thread_status_util.h"
#include "port/stack_trace.h"
#include "rocksdb/cache.h"
@ -54,11 +54,12 @@
#include "util/compression.h"
#include "util/hash.h"
#include "util/mutexlock.h"
#include "util/rate_limiter_impl.h"
#include "util/rate_limiter.h"
#include "util/string_util.h"
#include "utilities/merge_operators.h"
#if !defined(IOS_CROSS_COMPILE)
#ifndef ROCKSDB_LITE
namespace ROCKSDB_NAMESPACE {
static std::string RandomString(Random* rnd, int len, double ratio) {
@ -616,7 +617,6 @@ TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) {
// via AddExpectedStats().
auto* stats_checker = new CompactionJobStatsChecker();
Options options;
options.level_compaction_dynamic_level_bytes = false;
options.listeners.emplace_back(stats_checker);
options.create_if_missing = true;
// just enough setting to hold off auto-compaction.
@ -816,7 +816,6 @@ TEST_P(CompactionJobStatsTest, DeletionStatsTest) {
// what we expect.
auto* stats_checker = new CompactionJobDeletionStatsChecker();
Options options;
options.level_compaction_dynamic_level_bytes = false;
options.listeners.emplace_back(stats_checker);
options.create_if_missing = true;
options.level0_file_num_compaction_trigger = kTestScale + 1;
@ -960,6 +959,15 @@ int main(int argc, char** argv) {
return RUN_ALL_TESTS();
}
#else
#include <stdio.h>
int main(int /*argc*/, char** /*argv*/) {
fprintf(stderr, "SKIPPED, not supported in ROCKSDB_LITE\n");
return 0;
}
#endif // !ROCKSDB_LITE
#else

@ -3,6 +3,7 @@
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#ifndef ROCKSDB_LITE
#include "db/compaction/compaction_job.h"
@ -373,7 +374,7 @@ class CompactionJobTestBase : public testing::Test {
} else if (table_type_ == TableTypeForTest::kMockTable) {
file_size = 10;
EXPECT_OK(mock_table_factory_->CreateMockTable(
env_, GenerateFileName(file_number), contents));
env_, GenerateFileName(file_number), std::move(contents)));
} else {
assert(false);
}
@ -386,13 +387,12 @@ class CompactionJobTestBase : public testing::Test {
kUnknownFileCreationTime,
versions_->GetColumnFamilySet()->GetDefault()->NewEpochNumber(),
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2,
/*compensated_range_deletion_size=*/0, /*tail_size=*/0,
/*user_defined_timestamps_persisted=*/true);
0);
mutex_.Lock();
EXPECT_OK(versions_->LogAndApply(
versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_,
read_options_, &edit, &mutex_, nullptr));
EXPECT_OK(
versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
mutable_cf_options_, &edit, &mutex_, nullptr));
mutex_.Unlock();
}
@ -455,8 +455,7 @@ class CompactionJobTestBase : public testing::Test {
Status s = cf_options_.table_factory->NewTableReader(
read_opts,
TableReaderOptions(*cfd->ioptions(), nullptr, FileOptions(),
cfd_->internal_comparator(),
0 /* block_protection_bytes_per_key */),
cfd_->internal_comparator()),
std::move(freader), file_size, &table_reader, false);
ASSERT_OK(s);
assert(table_reader);
@ -655,12 +654,11 @@ class CompactionJobTestBase : public testing::Test {
ASSERT_TRUE(full_history_ts_low_.empty() ||
ucmp_->timestamp_size() == full_history_ts_low_.size());
const std::atomic<bool> kManualCompactionCanceledFalse{false};
JobContext job_context(1, false /* create_superversion */);
CompactionJob compaction_job(
0, &compaction, db_options_, mutable_db_options_, env_options_,
versions_.get(), &shutting_down_, &log_buffer, nullptr, nullptr,
nullptr, nullptr, &mutex_, &error_handler_, snapshots,
earliest_write_conflict_snapshot, snapshot_checker, &job_context,
earliest_write_conflict_snapshot, snapshot_checker, nullptr,
table_cache_, &event_logger, false, false, dbname_,
&compaction_job_stats_, Env::Priority::USER, nullptr /* IOTracer */,
/*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
@ -730,7 +728,6 @@ class CompactionJobTestBase : public testing::Test {
ColumnFamilyOptions cf_options_;
MutableCFOptions mutable_cf_options_;
MutableDBOptions mutable_db_options_;
const ReadOptions read_options_;
std::shared_ptr<Cache> table_cache_;
WriteController write_controller_;
WriteBufferManager write_buffer_manager_;
@ -2444,3 +2441,14 @@ int main(int argc, char** argv) {
RegisterCustomObjects(argc, argv);
return RUN_ALL_TESTS();
}
#else
#include <stdio.h>
int main(int /*argc*/, char** /*argv*/) {
fprintf(stderr,
"SKIPPED as CompactionJobStats is not supported in ROCKSDB_LITE\n");
return 0;
}
#endif // ROCKSDB_LITE

@ -43,10 +43,7 @@ Status CompactionOutputs::Finish(const Status& intput_status,
const uint64_t current_bytes = builder_->FileSize();
if (s.ok()) {
meta->fd.file_size = current_bytes;
meta->tail_size = builder_->GetTailSize();
meta->marked_for_compaction = builder_->NeedCompact();
meta->user_defined_timestamps_persisted = static_cast<bool>(
builder_->GetTableProperties().user_defined_timestamps_persisted);
}
current_output().finished = true;
stats_.bytes_written += current_bytes;
@ -79,46 +76,6 @@ IOStatus CompactionOutputs::WriterSyncClose(const Status& input_status,
return io_s;
}
bool CompactionOutputs::UpdateFilesToCutForTTLStates(
const Slice& internal_key) {
if (!files_to_cut_for_ttl_.empty()) {
const InternalKeyComparator* icmp =
&compaction_->column_family_data()->internal_comparator();
if (cur_files_to_cut_for_ttl_ != -1) {
// Previous key is inside the range of a file
if (icmp->Compare(internal_key,
files_to_cut_for_ttl_[cur_files_to_cut_for_ttl_]
->largest.Encode()) > 0) {
next_files_to_cut_for_ttl_ = cur_files_to_cut_for_ttl_ + 1;
cur_files_to_cut_for_ttl_ = -1;
return true;
}
} else {
// Look for the key position
while (next_files_to_cut_for_ttl_ <
static_cast<int>(files_to_cut_for_ttl_.size())) {
if (icmp->Compare(internal_key,
files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
->smallest.Encode()) >= 0) {
if (icmp->Compare(internal_key,
files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
->largest.Encode()) <= 0) {
// Within the current file
cur_files_to_cut_for_ttl_ = next_files_to_cut_for_ttl_;
return true;
}
// Beyond the current file
next_files_to_cut_for_ttl_++;
} else {
// Still fall into the gap
break;
}
}
}
}
return false;
}
size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(
const Slice& internal_key) {
size_t curr_key_boundary_switched_num = 0;
@ -127,6 +84,11 @@ size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(
if (grandparents.empty()) {
return curr_key_boundary_switched_num;
}
assert(!internal_key.empty());
InternalKey ikey;
ikey.DecodeFrom(internal_key);
assert(ikey.Valid());
const Comparator* ucmp = compaction_->column_family_data()->user_comparator();
// Move the grandparent_index_ to the file containing the current user_key.
@ -134,7 +96,7 @@ size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(
// index points to the last file containing the key.
while (grandparent_index_ < grandparents.size()) {
if (being_grandparent_gap_) {
if (sstableKeyCompare(ucmp, internal_key,
if (sstableKeyCompare(ucmp, ikey,
grandparents[grandparent_index_]->smallest) < 0) {
break;
}
@ -147,13 +109,13 @@ size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(
being_grandparent_gap_ = false;
} else {
int cmp_result = sstableKeyCompare(
ucmp, internal_key, grandparents[grandparent_index_]->largest);
ucmp, ikey, grandparents[grandparent_index_]->largest);
// If it's same key, make sure grandparent_index_ is pointing to the last
// one.
if (cmp_result < 0 ||
(cmp_result == 0 &&
(grandparent_index_ == grandparents.size() - 1 ||
sstableKeyCompare(ucmp, internal_key,
sstableKeyCompare(ucmp, ikey,
grandparents[grandparent_index_ + 1]->smallest) <
0))) {
break;
@ -223,39 +185,18 @@ uint64_t CompactionOutputs::GetCurrentKeyGrandparentOverlappedBytes(
bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
assert(c_iter.Valid());
// always update grandparent information like overlapped file number, size
// etc.
const Slice& internal_key = c_iter.key();
#ifndef NDEBUG
bool should_stop = false;
std::pair<bool*, const Slice> p{&should_stop, internal_key};
TEST_SYNC_POINT_CALLBACK(
"CompactionOutputs::ShouldStopBefore::manual_decision", (void*)&p);
if (should_stop) {
return true;
}
#endif // NDEBUG
const uint64_t previous_overlapped_bytes = grandparent_overlapped_bytes_;
const InternalKeyComparator* icmp =
&compaction_->column_family_data()->internal_comparator();
size_t num_grandparent_boundaries_crossed = 0;
bool should_stop_for_ttl = false;
// Always update grandparent information like overlapped file number, size
// etc., and TTL states.
// If compaction_->output_level() == 0, there is no need to update grandparent
// info, and that `grandparent` should be empty.
if (compaction_->output_level() > 0) {
num_grandparent_boundaries_crossed =
UpdateGrandparentBoundaryInfo(internal_key);
should_stop_for_ttl = UpdateFilesToCutForTTLStates(internal_key);
}
size_t num_grandparent_boundaries_crossed =
UpdateGrandparentBoundaryInfo(internal_key);
if (!HasBuilder()) {
return false;
}
if (should_stop_for_ttl) {
return true;
}
// If there's a user-defined partitioner, check that first
if (partitioner_ && partitioner_->ShouldPartition(PartitionerRequest(
last_key_for_partitioner_, c_iter.user_key(),
@ -273,6 +214,9 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
return true;
}
const InternalKeyComparator* icmp =
&compaction_->column_family_data()->internal_comparator();
// Check if it needs to split for RoundRobin
// Invalid local_output_split_key indicates that we do not need to split
if (local_output_split_key_ != nullptr && !is_split_) {
@ -346,6 +290,41 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
}
}
// check TTL file boundaries if there are any
if (!files_to_cut_for_ttl_.empty()) {
if (cur_files_to_cut_for_ttl_ != -1) {
// Previous key is inside the range of a file
if (icmp->Compare(internal_key,
files_to_cut_for_ttl_[cur_files_to_cut_for_ttl_]
->largest.Encode()) > 0) {
next_files_to_cut_for_ttl_ = cur_files_to_cut_for_ttl_ + 1;
cur_files_to_cut_for_ttl_ = -1;
return true;
}
} else {
// Look for the key position
while (next_files_to_cut_for_ttl_ <
static_cast<int>(files_to_cut_for_ttl_.size())) {
if (icmp->Compare(internal_key,
files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
->smallest.Encode()) >= 0) {
if (icmp->Compare(internal_key,
files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
->largest.Encode()) <= 0) {
// Within the current file
cur_files_to_cut_for_ttl_ = next_files_to_cut_for_ttl_;
return true;
}
// Beyond the current file
next_files_to_cut_for_ttl_++;
} else {
// Still falls into the gap
break;
}
}
}
}
return false;
}
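For intuition about the `num_grandparent_boundaries_crossed` signal used above, here is a simplified standalone model of the boundary counting done by UpdateGrandparentBoundaryInfo(), again over plain string keys; GrandparentTracker and BoundariesCrossed are hypothetical names, and the overlapped-byte accounting and same-user-key handling of the real code are omitted.
#include <cstddef>
#include <string>
#include <vector>
struct KeyRange {
  std::string smallest;
  std::string largest;
};
// Counts how many grandparent boundaries (file edge or gap edge) are crossed
// as keys are fed in ascending order. A large count for a single key hints
// that cutting the output file here limits future write amplification.
class GrandparentTracker {
 public:
  explicit GrandparentTracker(std::vector<KeyRange> grandparents)
      : grandparents_(std::move(grandparents)) {}
  size_t BoundariesCrossed(const std::string& key) {
    size_t crossed = 0;
    while (index_ < grandparents_.size()) {
      if (in_gap_) {
        // Still before grandparents_[index_]? Then no new boundary.
        if (key < grandparents_[index_].smallest) break;
        in_gap_ = false;  // entered the file: one boundary crossed
        ++crossed;
      } else {
        // Still inside grandparents_[index_]? Then no new boundary.
        if (key <= grandparents_[index_].largest) break;
        in_gap_ = true;  // left the file: one boundary crossed
        ++index_;
        ++crossed;
      }
    }
    return crossed;
  }
 private:
  std::vector<KeyRange> grandparents_;
  size_t index_ = 0;
  bool in_gap_ = true;  // keys start before the first grandparent file
};
Roughly speaking, ShouldStopBefore() then weighs a count like this, together with the accumulated overlapped bytes, against the target output file size to decide whether cutting at the current key is worthwhile.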
@ -425,182 +404,114 @@ Status CompactionOutputs::AddToOutput(
return s;
}
namespace {
void SetMaxSeqAndTs(InternalKey& internal_key, const Slice& user_key,
const size_t ts_sz) {
if (ts_sz) {
static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
if (ts_sz <= strlen(kTsMax)) {
internal_key = InternalKey(user_key, kMaxSequenceNumber,
kTypeRangeDeletion, Slice(kTsMax, ts_sz));
} else {
internal_key =
InternalKey(user_key, kMaxSequenceNumber, kTypeRangeDeletion,
std::string(ts_sz, '\xff'));
}
} else {
internal_key.Set(user_key, kMaxSequenceNumber, kTypeRangeDeletion);
}
}
} // namespace
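SetMaxSeqAndTs() above pads the timestamp column with 0xff bytes so the resulting internal key sorts after any real entry for the same user key. Here is a standalone sketch of just that padding, assuming timestamps are fixed-width byte strings compared bytewise; MaxTimestamp is a hypothetical helper name.
#include <cstring>
#include <string>
// Returns a timestamp of `ts_sz` bytes that compares >= any other timestamp
// of the same width under bytewise comparison. Small widths reuse a static
// 0xff buffer; larger widths build a string of 0xff bytes instead.
std::string MaxTimestamp(size_t ts_sz) {
  static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
  if (ts_sz <= strlen(kTsMax)) {
    return std::string(kTsMax, ts_sz);
  }
  return std::string(ts_sz, '\xff');
}
For the common 8-byte timestamp width this takes the static-buffer path; anything wider falls back to the allocated string of 0xff bytes.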
Status CompactionOutputs::AddRangeDels(
const Slice* comp_start_user_key, const Slice* comp_end_user_key,
CompactionIterationStats& range_del_out_stats, bool bottommost_level,
const InternalKeyComparator& icmp, SequenceNumber earliest_snapshot,
const Slice& next_table_min_key, const std::string& full_history_ts_low) {
// The following example does not happen since
// CompactionOutputs::ShouldStopBefore() always returns false for the first
// point key. But we should consider removing this dependency. Suppose for the
// first compaction output file,
// - next_table_min_key.user_key == comp_start_user_key
// - no point key is in the output file
// - there is a range tombstone @seqno to be added that covers
// comp_start_user_key
// Then meta.smallest will be set to comp_start_user_key@seqno
// and meta.largest will be set to comp_start_user_key@kMaxSequenceNumber
// which violates the assumption that meta.smallest should be <= meta.largest.
assert(HasRangeDel());
FileMetaData& meta = current_output().meta;
const Comparator* ucmp = icmp.user_comparator();
InternalKey lower_bound_buf, upper_bound_buf;
Slice lower_bound_guard, upper_bound_guard;
std::string smallest_user_key;
const Slice *lower_bound, *upper_bound;
// We first determine the internal key lower_bound and upper_bound for
// this output file. All and only range tombstones that overlap with
// [lower_bound, upper_bound] should be added to this file. File
// boundaries (meta.smallest/largest) should be updated accordingly when
// extended by range tombstones.
bool lower_bound_from_sub_compact = false;
bool lower_bound_from_range_tombstone = false;
size_t output_size = outputs_.size();
if (output_size == 1) {
// This is the first file in the subcompaction.
//
// When outputting a range tombstone that spans a subcompaction boundary,
// the files on either side of that boundary need to include that
// boundary's user key. Otherwise, the spanning range tombstone would lose
// coverage.
//
// To achieve this while preventing files from overlapping in internal key
// (an LSM invariant violation), we allow the earlier file to include the
// boundary user key up to `kMaxSequenceNumber,kTypeRangeDeletion`. The
// later file can begin at the boundary user key at the newest key version
// it contains. At this point that version number is unknown since we have
// not processed the range tombstones yet, so permit any version. Same story
// applies to timestamp, and a non-nullptr `comp_start_user_key` should have
// `kMaxTs` here, which similarly permits any timestamp.
if (comp_start_user_key) {
lower_bound_buf.Set(*comp_start_user_key, kMaxSequenceNumber,
kTypeRangeDeletion);
lower_bound_guard = lower_bound_buf.Encode();
lower_bound = &lower_bound_guard;
} else {
lower_bound = nullptr;
}
} else {
// For the first output table, include range tombstones before the min
// key but after the subcompaction boundary.
lower_bound = comp_start_user_key;
lower_bound_from_sub_compact = true;
} else if (range_tombstone_lower_bound_.size() > 0) {
assert(meta.smallest.size() == 0 ||
icmp.Compare(range_tombstone_lower_bound_, meta.smallest) <= 0);
lower_bound_guard = range_tombstone_lower_bound_.user_key();
lower_bound = &lower_bound_guard;
lower_bound_from_range_tombstone = true;
} else if (meta.smallest.size() > 0) {
// For subsequent output tables, only include range tombstones from min
// key onwards since the previous file was extended to contain range
// tombstones falling before min key.
if (range_tombstone_lower_bound_.size() > 0) {
assert(meta.smallest.size() == 0 ||
icmp.Compare(range_tombstone_lower_bound_, meta.smallest) < 0);
lower_bound_guard = range_tombstone_lower_bound_.Encode();
} else {
assert(meta.smallest.size() > 0);
lower_bound_guard = meta.smallest.Encode();
}
smallest_user_key = meta.smallest.user_key().ToString(false /*hex*/);
lower_bound_guard = Slice(smallest_user_key);
lower_bound = &lower_bound_guard;
}
const size_t ts_sz = ucmp->timestamp_size();
if (next_table_min_key.empty()) {
// Last file of the subcompaction.
if (comp_end_user_key) {
upper_bound_buf.Set(*comp_end_user_key, kMaxSequenceNumber,
kTypeRangeDeletion);
upper_bound_guard = upper_bound_buf.Encode();
upper_bound = &upper_bound_guard;
} else {
upper_bound = nullptr;
}
} else {
// There is another file coming whose coverage will begin at
// `next_table_min_key`. The current file needs to extend range tombstone
// coverage through its own keys (through `meta.largest`) and through user
// keys preceding `next_table_min_key`'s user key.
ParsedInternalKey next_table_min_key_parsed;
ParseInternalKey(next_table_min_key, &next_table_min_key_parsed,
false /* log_err_key */)
.PermitUncheckedError();
assert(next_table_min_key_parsed.sequence < kMaxSequenceNumber);
assert(meta.largest.size() == 0 ||
icmp.Compare(meta.largest.Encode(), next_table_min_key) < 0);
assert(!lower_bound || icmp.Compare(*lower_bound, next_table_min_key) <= 0);
if (meta.largest.size() > 0 &&
ucmp->EqualWithoutTimestamp(meta.largest.user_key(),
next_table_min_key_parsed.user_key)) {
// Caution: this assumes meta.largest.Encode() lives longer than
// upper_bound, which is only true if meta.largest is never updated.
// This just happens to be the case here since meta.largest serves
// as the upper_bound.
upper_bound_guard = meta.largest.Encode();
lower_bound = nullptr;
}
if (!next_table_min_key.empty()) {
// This may be the last file in the subcompaction in some cases, so we
// need to compare the end key of the subcompaction with the next file's
// start key. When the end key is chosen by the subcompaction, we know that
// it must be the biggest key in the output file. Therefore, it is safe to
// use the smaller key as the upper bound of the output file, to ensure
// that there is no overlap between different output files.
upper_bound_guard = ExtractUserKey(next_table_min_key);
if (comp_end_user_key != nullptr &&
ucmp->CompareWithoutTimestamp(upper_bound_guard, *comp_end_user_key) >=
0) {
upper_bound = comp_end_user_key;
} else {
SetMaxSeqAndTs(upper_bound_buf, next_table_min_key_parsed.user_key,
ts_sz);
upper_bound_guard = upper_bound_buf.Encode();
upper_bound = &upper_bound_guard;
}
upper_bound = &upper_bound_guard;
}
if (lower_bound && upper_bound &&
icmp.Compare(*lower_bound, *upper_bound) > 0) {
assert(meta.smallest.size() == 0 &&
ucmp->EqualWithoutTimestamp(ExtractUserKey(*lower_bound),
ExtractUserKey(*upper_bound)));
// This can only happen when lower_bound has the same user key as
// next_table_min_key and there is no point key in the current
// compaction output file.
return Status::OK();
} else {
// This is the last file in the subcompaction, so extend until the
// subcompaction ends.
upper_bound = comp_end_user_key;
}
bool has_overlapping_endpoints;
if (upper_bound != nullptr && meta.largest.size() > 0) {
has_overlapping_endpoints = ucmp->CompareWithoutTimestamp(
meta.largest.user_key(), *upper_bound) == 0;
} else {
has_overlapping_endpoints = false;
}
// The end key of the subcompaction must be bigger than or equal to the
// upper bound. If the end of the subcompaction is null or the upper bound
// is null, this file is the last file in the compaction, so there
// will be no overlap between this file and others.
assert(comp_end_user_key == nullptr || upper_bound == nullptr ||
ucmp->CompareWithoutTimestamp(ExtractUserKey(*upper_bound),
*comp_end_user_key) <= 0);
auto it = range_del_agg_->NewIterator(lower_bound, upper_bound);
Slice last_tombstone_start_user_key{};
bool reached_lower_bound = false;
const ReadOptions read_options(Env::IOActivity::kCompaction);
for (it->SeekToFirst(); it->Valid(); it->Next()) {
ucmp->CompareWithoutTimestamp(*upper_bound, *comp_end_user_key) <= 0);
auto it = range_del_agg_->NewIterator(lower_bound, upper_bound,
has_overlapping_endpoints);
// Position the range tombstone output iterator. There may be tombstone
// fragments that are entirely out of range, so make sure that we do not
// include those.
if (lower_bound != nullptr) {
it->Seek(*lower_bound);
} else {
it->SeekToFirst();
}
for (; it->Valid(); it->Next()) {
auto tombstone = it->Tombstone();
auto kv = tombstone.Serialize();
InternalKey tombstone_end = tombstone.SerializeEndKey();
// TODO: the underlying iterator should support clamping the bounds.
// tombstone_end.Encode() is of the form user_key@kMaxSeqno.
// If it is equal to lower_bound, there is no need to include
// such a range tombstone.
if (!reached_lower_bound && lower_bound &&
icmp.Compare(tombstone_end.Encode(), *lower_bound) <= 0) {
continue;
if (upper_bound != nullptr) {
int cmp =
ucmp->CompareWithoutTimestamp(*upper_bound, tombstone.start_key_);
if ((has_overlapping_endpoints && cmp < 0) ||
(!has_overlapping_endpoints && cmp <= 0)) {
// Tombstones starting after upper_bound only need to be included in
// the next table. If the current SST ends before upper_bound, i.e.,
// `has_overlapping_endpoints == false`, we can also skip over range
// tombstones that start exactly at upper_bound. Such range
// tombstones will be included in the next file and are not relevant
// to the point keys or endpoints of the current file.
break;
}
}
assert(!lower_bound ||
icmp.Compare(*lower_bound, tombstone_end.Encode()) <= 0);
reached_lower_bound = true;
const size_t ts_sz = ucmp->timestamp_size();
// Garbage collection for range tombstones.
// If user-defined timestamp is enabled, range tombstones are dropped if
// they are at bottommost_level, below full_history_ts_low and not visible
// in any snapshot. trim_ts_ is passed to the constructor for
// range_del_agg_, and range_del_agg_ internally drops tombstones above
// trim_ts_.
bool consider_drop =
tombstone.seq_ <= earliest_snapshot &&
if (bottommost_level && tombstone.seq_ <= earliest_snapshot &&
(ts_sz == 0 ||
(!full_history_ts_low.empty() &&
ucmp->CompareTimestamp(tombstone.ts_, full_history_ts_low) < 0));
if (consider_drop && bottommost_level) {
ucmp->CompareTimestamp(tombstone.ts_, full_history_ts_low) < 0))) {
// TODO(andrewkr): tombstones that span multiple output files are
// counted for each compaction output file, so lots of double
// counting.
@ -609,126 +520,149 @@ Status CompactionOutputs::AddRangeDels(
continue;
}
auto kv = tombstone.Serialize();
assert(lower_bound == nullptr ||
ucmp->CompareWithoutTimestamp(ExtractUserKey(*lower_bound),
kv.second) < 0);
InternalKey tombstone_start = kv.first;
if (lower_bound &&
ucmp->CompareWithoutTimestamp(tombstone_start.user_key(),
ExtractUserKey(*lower_bound)) < 0) {
// This just updates the non-timestamp portion of `tombstone_start`'s user
// key. Ideally there would be a simpler API for this.
ParsedInternalKey tombstone_start_parsed;
ParseInternalKey(tombstone_start.Encode(), &tombstone_start_parsed,
false /* log_err_key */)
.PermitUncheckedError();
// The timestamp should come from the same source as the sequence number,
// which is the tombstone in this case.
std::string ts =
tombstone_start_parsed.GetTimestamp(ucmp->timestamp_size())
.ToString();
tombstone_start_parsed.user_key = ExtractUserKey(*lower_bound);
tombstone_start.SetFrom(tombstone_start_parsed, ts);
ucmp->CompareWithoutTimestamp(*lower_bound, kv.second) < 0);
// Range tombstone is not supported by output validator yet.
builder_->Add(kv.first.Encode(), kv.second);
InternalKey tombstone_start = std::move(kv.first);
InternalKey smallest_candidate{tombstone_start};
if (lower_bound != nullptr &&
ucmp->CompareWithoutTimestamp(smallest_candidate.user_key(),
*lower_bound) <= 0) {
// Pretend the smallest key has the same user key as lower_bound
// (the max key in the previous table or subcompaction) in order for
// files to appear key-space partitioned.
if (lower_bound_from_sub_compact) {
// When lower_bound is chosen by a subcompaction
// (lower_bound_from_sub_compact), we know that subcompactions over
// smaller keys cannot contain any keys at lower_bound. We also know
// that smaller subcompactions exist, because otherwise the
// subcompaction would be unbounded on the left. As a result, we know
// that no other files on the output level will contain actual keys at
// lower_bound (an output file may have a largest key of
// lower_bound@kMaxSequenceNumber, but this only indicates a large range
// tombstone was truncated). Therefore, it is safe to use the
// tombstone's sequence number, to ensure that keys at lower_bound at
// lower levels are covered by truncated tombstones.
if (ts_sz) {
assert(tombstone.ts_.size() == ts_sz);
smallest_candidate = InternalKey(*lower_bound, tombstone.seq_,
kTypeRangeDeletion, tombstone.ts_);
} else {
smallest_candidate =
InternalKey(*lower_bound, tombstone.seq_, kTypeRangeDeletion);
}
} else if (lower_bound_from_range_tombstone) {
// When lower_bound is chosen from a range tombstone start key:
// Range tombstone keys can be truncated at file boundaries of the files
// that contain them.
//
// If this lower bound is from a range tombstone key that is not
// truncated, i.e., it was not truncated when reading from the input
// files, then its sequence number and `op_type` will be
// kMaxSequenceNumber and kTypeRangeDeletion (see
// TruncatedRangeDelIterator::start_key()). In this case, when this key
// was used as the upper bound to cut the previous compaction output
// file, the previous file's largest key could have the same value as
// this key (see the upperbound logic below). To guarantee
// non-overlapping ranges between output files, we use the range
// tombstone's actual sequence number (tombstone.seq_) for the lower
// bound of this file. If this range tombstone key is truncated, then
// the previous file's largest key will be smaller than this range
// tombstone key, so we can use it as the lower bound directly.
if (ExtractInternalKeyFooter(range_tombstone_lower_bound_.Encode()) ==
kRangeTombstoneSentinel) {
if (ts_sz) {
smallest_candidate =
InternalKey(range_tombstone_lower_bound_.user_key(),
tombstone.seq_, kTypeRangeDeletion, tombstone.ts_);
} else {
smallest_candidate =
InternalKey(range_tombstone_lower_bound_.user_key(),
tombstone.seq_, kTypeRangeDeletion);
}
} else {
assert(GetInternalKeySeqno(range_tombstone_lower_bound_.Encode()) <
kMaxSequenceNumber);
smallest_candidate = range_tombstone_lower_bound_;
}
} else {
// If lower_bound was chosen by the smallest data key in the file,
// choose lowest seqnum so this file's smallest internal key comes
// after the previous file's largest. The fake seqnum is OK because
// the read path's file-picking code only considers user key.
smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion);
}
}
InternalKey tombstone_end = tombstone.SerializeEndKey();
InternalKey largest_candidate{tombstone_end};
if (upper_bound != nullptr &&
icmp.Compare(*upper_bound, tombstone_start.Encode()) < 0) {
break;
}
if (lower_bound &&
icmp.Compare(tombstone_start.Encode(), *lower_bound) < 0) {
tombstone_start.DecodeFrom(*lower_bound);
}
if (upper_bound && icmp.Compare(*upper_bound, tombstone_end.Encode()) < 0) {
tombstone_end.DecodeFrom(*upper_bound);
ucmp->CompareWithoutTimestamp(*upper_bound,
largest_candidate.user_key()) <= 0) {
// Pretend the largest key has the same user key as upper_bound (the
// min key in the following table or subcompaction) in order for files
// to appear key-space partitioned.
//
// Choose highest seqnum so this file's largest internal key comes
// before the next file's/subcompaction's smallest. The fake seqnum is
// OK because the read path's file-picking code only considers the
// user key portion.
//
// Note Seek() also creates InternalKey with (user_key,
// kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
// kTypeRangeDeletion (0xF), so the range tombstone comes before the
// Seek() key in InternalKey's ordering. So Seek() will look in the
// next file for the user key
if (ts_sz) {
static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
if (ts_sz <= strlen(kTsMax)) {
largest_candidate =
InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion,
Slice(kTsMax, ts_sz));
} else {
largest_candidate =
InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion,
std::string(ts_sz, '\xff'));
}
} else {
largest_candidate =
InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion);
}
}
if (consider_drop && compaction_->KeyRangeNotExistsBeyondOutputLevel(
tombstone_start.user_key(),
tombstone_end.user_key(), &level_ptrs_)) {
range_del_out_stats.num_range_del_drop_obsolete++;
range_del_out_stats.num_record_drop_obsolete++;
continue;
#ifndef NDEBUG
SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber;
if (meta.smallest.size() > 0) {
smallest_ikey_seqnum = GetInternalKeySeqno(meta.smallest.Encode());
}
// Here we show that *only* range tombstones that overlap with
// [lower_bound, upper_bound] are added to the current file, and we
// sanity-check invariants that should hold:
// - [tombstone_start, tombstone_end] overlaps with [lower_bound,
// upper_bound]
// - meta.smallest <= meta.largest
// Corresponding assertions are made; the proof is broken if any of them
// fails.
// TODO: show that *all* range tombstones that overlap with
// [lower_bound, upper_bound] are added.
// TODO: show some invariant that boundaries are correctly updated.
//
// Note that `tombstone_start` is updated in the if condition above, we use
// tombstone_start to refer to its initial value, i.e.,
// it->Tombstone().first, and use tombstone_start* to refer to its value
// after the update.
//
// To show [lower_bound, upper_bound] overlaps with [tombstone_start,
// tombstone_end]:
// lower_bound <= upper_bound from the if condition right after all
// bounds are initialized. We assume each tombstone fragment has
// start_key.user_key < end_key.user_key, so
// tombstone_start < tombstone_end by
// FragmentedTombstoneIterator::Tombstone(). So these two ranges are both
// non-empty. The flag `reached_lower_bound` and the if logic before it
// ensures lower_bound <= tombstone_end. tombstone_start is only updated
// if it has a smaller user_key than lower_bound user_key, so
// tombstone_start <= tombstone_start*. The above if condition implies
// tombstone_start* <= upper_bound. So we have
// tombstone_start <= upper_bound and lower_bound <= tombstone_end
// and the two ranges overlap.
//
// To show meta.smallest <= meta.largest:
// From the implementation of UpdateBoundariesForRange(), it suffices to
// prove that when it is first called in this function, its parameters
// satisfy `start <= end`, where start = max(tombstone_start*, lower_bound)
// and end = min(tombstone_end, upper_bound). From the above proof we have
// lower_bound <= tombstone_end and lower_bound <= upper_bound. We only need
// to show that tombstone_start* <= min(tombstone_end, upper_bound).
// Note that tombstone_start*.user_key = max(tombstone_start.user_key,
// lower_bound.user_key). Assuming tombstone_end always has
// kMaxSequenceNumber and lower_bound.seqno < kMaxSequenceNumber.
// Since lower_bound <= tombstone_end and lower_bound.seqno <
// tombstone_end.seqno (in absolute number order, not internal key order),
// lower_bound.user_key < tombstone_end.user_key.
// Since lower_bound.user_key < tombstone_end.user_key and
// tombstone_start.user_key < tombstone_end.user_key, tombstone_start* <
// tombstone_end. Since tombstone_start* <= upper_bound from the above proof
// and tombstone_start* < tombstone_end, tombstone_start* <=
// min(tombstone_end, upper_bound), so the two ranges overlap.
// Range tombstone is not supported by output validator yet.
builder_->Add(kv.first.Encode(), kv.second);
assert(icmp.Compare(tombstone_start, tombstone_end) <= 0);
meta.UpdateBoundariesForRange(tombstone_start, tombstone_end,
#endif
meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate,
tombstone.seq_, icmp);
if (!bottommost_level) {
bool start_user_key_changed =
last_tombstone_start_user_key.empty() ||
ucmp->CompareWithoutTimestamp(last_tombstone_start_user_key,
it->start_key()) < 0;
last_tombstone_start_user_key = it->start_key();
if (start_user_key_changed) {
// If tombstone_start >= tombstone_end, then either no key range is
// covered, or they have the same user key. If they have the same
// user key, then the internal key range should only be within this
// level, and no keys from older levels are covered.
if (ucmp->CompareWithoutTimestamp(tombstone_start.user_key(),
tombstone_end.user_key()) < 0) {
SizeApproximationOptions approx_opts;
approx_opts.files_size_error_margin = 0.1;
auto approximate_covered_size =
compaction_->input_version()->version_set()->ApproximateSize(
approx_opts, read_options, compaction_->input_version(),
tombstone_start.Encode(), tombstone_end.Encode(),
compaction_->output_level() + 1 /* start_level */,
-1 /* end_level */, kCompaction);
meta.compensated_range_deletion_size += approximate_covered_size;
}
// Range tombstones are truncated at file boundaries
if (icmp.Compare(tombstone_start, meta.smallest) < 0) {
tombstone_start = meta.smallest;
}
if (icmp.Compare(tombstone_end, meta.largest) > 0) {
tombstone_end = meta.largest;
}
SizeApproximationOptions approx_opts;
approx_opts.files_size_error_margin = 0.1;
auto approximate_covered_size =
compaction_->input_version()->version_set()->ApproximateSize(
approx_opts, compaction_->input_version(),
tombstone_start.Encode(), tombstone_end.Encode(),
compaction_->output_level() + 1 /* start_level */,
-1 /* end_level */, kCompaction);
meta.compensated_range_deletion_size += approximate_covered_size;
}
// The smallest key in a file is used for range tombstone truncation, so
// it cannot have a seqnum of 0 (unless the smallest data key in a file
// has a seqnum of 0). Otherwise, the truncated tombstone may expose
// deleted keys at lower levels.
assert(smallest_ikey_seqnum == 0 || lower_bound_from_range_tombstone ||
ExtractInternalKeyFooter(meta.smallest.Encode()) !=
PackSequenceAndType(0, kTypeRangeDeletion));
}
return Status::OK();
}
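To summarize the bound handling in AddRangeDels() at the user-key level: a tombstone fragment is emitted into the current output only if it intersects [lower_bound, upper_bound], and the file's smallest/largest candidates are the fragment's endpoints clamped to those bounds. The sketch below captures that shape only; it deliberately drops the sequence-number, timestamp, and kTypeRangeDeletion tie-breaking the real code needs, and ClampedTombstone/ClampToOutputBounds are hypothetical names.
#include <optional>
#include <string>
// A user-key-level model: a tombstone covering [start, end) is relevant to
// an output file bounded by [lower, upper] only if the intervals intersect,
// and the recorded boundary candidates are the endpoints clamped to the
// bounds. Missing bounds mean the file is unbounded on that side.
struct ClampedTombstone {
  std::string smallest_candidate;
  std::string largest_candidate;
};
std::optional<ClampedTombstone> ClampToOutputBounds(
    const std::string& start, const std::string& end,
    const std::optional<std::string>& lower,
    const std::optional<std::string>& upper) {
  if (upper && *upper < start) {
    return std::nullopt;  // tombstone begins after this file: next file's job
  }
  if (lower && end <= *lower) {
    return std::nullopt;  // tombstone ends before this file: already covered
  }
  ClampedTombstone out;
  out.smallest_candidate = (lower && *lower > start) ? *lower : start;
  out.largest_candidate = (upper && *upper < end) ? *upper : end;
  return out;
}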
@ -785,8 +719,6 @@ CompactionOutputs::CompactionOutputs(const Compaction* compaction,
if (compaction->output_level() != 0) {
FillFilesToCutForTtl();
}
level_ptrs_ = std::vector<size_t>(compaction_->number_levels(), 0);
}
} // namespace ROCKSDB_NAMESPACE

@ -167,15 +167,9 @@ class CompactionOutputs {
current_output_file_size_ = 0;
}
// Add range deletions from the range_del_agg_ to the current output file.
// Input parameters, `range_tombstone_lower_bound_` and current output's
// metadata determine the bounds on range deletions to add. Updates output
// file metadata boundary if extended by range tombstones.
//
// Add range-dels from the aggregator to the current output file
// @param comp_start_user_key and comp_end_user_key include timestamp if
// user-defined timestamp is enabled. Their timestamp should be max timestamp.
// @param next_table_min_key internal key lower bound for the next compaction
// output.
// user-defined timestamp is enabled.
// @param full_history_ts_low used for range tombstone garbage collection.
Status AddRangeDels(const Slice* comp_start_user_key,
const Slice* comp_end_user_key,
@ -206,10 +200,10 @@ class CompactionOutputs {
// We may only split the output when the cursor is in the range. Split
if ((!end.has_value() ||
icmp->user_comparator()->Compare(
ExtractUserKey(output_split_key->Encode()), *end) < 0) &&
(!start.has_value() ||
icmp->user_comparator()->Compare(
ExtractUserKey(output_split_key->Encode()), *start) > 0)) {
ExtractUserKey(output_split_key->Encode()), end.value()) < 0) &&
(!start.has_value() || icmp->user_comparator()->Compare(
ExtractUserKey(output_split_key->Encode()),
start.value()) > 0)) {
local_output_split_key_ = output_split_key;
}
}
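The check above only honors a requested output split key when it falls strictly inside the compaction's (start, end) user-key range; otherwise one side of the split would be empty. A minimal sketch of that acceptance test with plain strings, where SplitKeyInRange is a hypothetical name:
#include <optional>
#include <string>
// Accept a split key only if it lies strictly between the optional start and
// end of the compaction's key range; an absent bound imposes no constraint.
bool SplitKeyInRange(const std::string& split_user_key,
                     const std::optional<std::string>& start,
                     const std::optional<std::string>& end) {
  const bool below_end = !end.has_value() || split_user_key < *end;
  const bool above_start = !start.has_value() || split_user_key > *start;
  return below_end && above_start;
}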
@ -227,13 +221,6 @@ class CompactionOutputs {
}
}
// Updates states related to file cutting for TTL.
// Returns a boolean value indicating whether the current
// compaction output file should be cut before `internal_key`.
//
// @param internal_key the current key to be added to output.
bool UpdateFilesToCutForTTLStates(const Slice& internal_key);
// update tracked grandparents information like grandparent index, if it's
// in the gap between 2 grandparent files, accumulated grandparent files size
// etc.
@ -356,15 +343,6 @@ class CompactionOutputs {
// The smallest key of the current output file, this is set when current
// output file's smallest key is a range tombstone start key.
InternalKey range_tombstone_lower_bound_;
// Used for calls to compaction->KeyRangeNotExistsBeyondOutputLevel() in
// CompactionOutputs::AddRangeDels().
// level_ptrs_[i] holds index of the file that was checked during the last
// call to compaction->KeyRangeNotExistsBeyondOutputLevel(). This allows
// future calls to the function to pick up where it left off, since each
// range tombstone added to output file within each subcompaction is in
// increasing key range.
std::vector<size_t> level_ptrs_;
};
// helper struct to concatenate the last level and penultimate level outputs

@ -20,7 +20,7 @@
#include "file/filename.h"
#include "logging/log_buffer.h"
#include "logging/logging.h"
#include "monitoring/statistics_impl.h"
#include "monitoring/statistics.h"
#include "test_util/sync_point.h"
#include "util/random.h"
#include "util/string_util.h"
@ -611,21 +611,23 @@ Compaction* CompactionPicker::CompactRange(
// Universal compaction with more than one level always compacts all the
// files together to the last level.
assert(vstorage->num_levels() > 1);
int max_output_level =
vstorage->MaxOutputLevel(ioptions_.allow_ingest_behind);
// DBImpl::CompactRange() set output level to be the last level
assert(output_level == max_output_level);
if (ioptions_.allow_ingest_behind) {
assert(output_level == vstorage->num_levels() - 2);
} else {
assert(output_level == vstorage->num_levels() - 1);
}
// DBImpl::RunManualCompaction will make full range for universal compaction
assert(begin == nullptr);
assert(end == nullptr);
*compaction_end = nullptr;
int start_level = 0;
for (; start_level <= max_output_level &&
for (; start_level < vstorage->num_levels() &&
vstorage->NumLevelFiles(start_level) == 0;
start_level++) {
}
if (start_level > max_output_level) {
if (start_level == vstorage->num_levels()) {
return nullptr;
}
@ -635,9 +637,9 @@ Compaction* CompactionPicker::CompactRange(
return nullptr;
}
std::vector<CompactionInputFiles> inputs(max_output_level + 1 -
std::vector<CompactionInputFiles> inputs(vstorage->num_levels() -
start_level);
for (int level = start_level; level <= max_output_level; level++) {
for (int level = start_level; level < vstorage->num_levels(); level++) {
inputs[level - start_level].level = level;
auto& files = inputs[level - start_level].files;
for (FileMetaData* f : vstorage->LevelFiles(level)) {
@ -751,10 +753,8 @@ Compaction* CompactionPicker::CompactRange(
// for BOTTOM LEVEL compaction only, use max_file_num_to_ignore to filter out
// files that are created during the current compaction.
if ((compact_range_options.bottommost_level_compaction ==
BottommostLevelCompaction::kForceOptimized ||
compact_range_options.bottommost_level_compaction ==
BottommostLevelCompaction::kIfHaveCompactionFilter) &&
if (compact_range_options.bottommost_level_compaction ==
BottommostLevelCompaction::kForceOptimized &&
max_file_num_to_ignore != std::numeric_limits<uint64_t>::max()) {
assert(input_level == output_level);
// inputs_shrunk holds a continuous subset of input files which were all
@ -877,6 +877,7 @@ Compaction* CompactionPicker::CompactRange(
return compaction;
}
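The universal-compaction branch of CompactRange() above effectively selects a contiguous run of levels: skip leading empty levels, then take everything from the first non-empty level through the maximum output level. A standalone sketch of just that selection, where UniversalCompactRangeLevels and files_per_level are hypothetical stand-ins for the VersionStorageInfo queries:
#include <optional>
#include <utility>
#include <vector>
// Returns the [start_level, max_output_level] pair to compact, or nullopt if
// every candidate level is empty. Assumes max_output_level is a valid index
// into files_per_level.
std::optional<std::pair<int, int>> UniversalCompactRangeLevels(
    const std::vector<int>& files_per_level, int max_output_level) {
  int start_level = 0;
  while (start_level <= max_output_level &&
         files_per_level[start_level] == 0) {
    ++start_level;  // skip leading empty levels
  }
  if (start_level > max_output_level) {
    return std::nullopt;  // nothing to compact
  }
  return std::make_pair(start_level, max_output_level);
}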
#ifndef ROCKSDB_LITE
namespace {
// Test whether two files have overlapping key-ranges.
bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a,
@ -1115,6 +1116,7 @@ Status CompactionPicker::SanitizeCompactionInputFiles(
return Status::OK();
}
#endif // !ROCKSDB_LITE
void CompactionPicker::RegisterCompaction(Compaction* c) {
if (c == nullptr) {

@ -93,9 +93,11 @@ class CompactionPicker {
// into a valid one by adding more files, the function will return a
// non-ok status with specific reason.
//
#ifndef ROCKSDB_LITE
Status SanitizeCompactionInputFiles(std::unordered_set<uint64_t>* input_files,
const ColumnFamilyMetaData& cf_meta,
const int output_level) const;
#endif // ROCKSDB_LITE
// Free up the files that participated in a compaction
//
@ -227,9 +229,11 @@ class CompactionPicker {
// A helper function to SanitizeCompactionInputFiles() that
// sanitizes "input_files" by adding necessary files.
#ifndef ROCKSDB_LITE
virtual Status SanitizeCompactionInputFilesForAllLevels(
std::unordered_set<uint64_t>* input_files,
const ColumnFamilyMetaData& cf_meta, const int output_level) const;
#endif // ROCKSDB_LITE
// Keeps track of all compactions that are running on Level0.
// Protected by DB mutex
@ -242,6 +246,7 @@ class CompactionPicker {
const InternalKeyComparator* const icmp_;
};
#ifndef ROCKSDB_LITE
// A dummy compaction that never triggers any automatic
// compaction.
class NullCompactionPicker : public CompactionPicker {
@ -282,6 +287,7 @@ class NullCompactionPicker : public CompactionPicker {
return false;
}
};
#endif // !ROCKSDB_LITE
// Attempts to find an intra L0 compaction conforming to the given parameters.
//

@ -8,6 +8,7 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/compaction/compaction_picker_fifo.h"
#ifndef ROCKSDB_LITE
#include <cinttypes>
#include <string>
@ -16,7 +17,6 @@
#include "db/column_family.h"
#include "logging/log_buffer.h"
#include "logging/logging.h"
#include "options/options_helper.h"
#include "util/string_util.h"
namespace ROCKSDB_NAMESPACE {
@ -285,36 +285,31 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
return c;
}
Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction(
Compaction* FIFOCompactionPicker::PickCompactionToWarm(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
LogBuffer* log_buffer) {
const std::vector<FileTemperatureAge>& ages =
mutable_cf_options.compaction_options_fifo
.file_temperature_age_thresholds;
if (ages.empty()) {
if (mutable_cf_options.compaction_options_fifo.age_for_warm == 0) {
return nullptr;
}
// Does not apply to multi-level FIFO.
if (vstorage->num_levels() > 1) {
return nullptr;
// PickCompactionToWarm is only triggered if there are no non-L0 files.
for (int level = 1; level < vstorage->num_levels(); ++level) {
if (GetTotalFilesSize(vstorage->LevelFiles(level)) > 0) {
return nullptr;
}
}
const int kLevel0 = 0;
const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
if (level_files.empty()) {
return nullptr;
}
int64_t _current_time;
auto status = ioptions_.clock->GetCurrentTime(&_current_time);
if (!status.ok()) {
ROCKS_LOG_BUFFER(
log_buffer,
"[%s] FIFO compaction: Couldn't get current time: %s. "
"Not doing compactions based on file temperature-age threshold. ",
cf_name.c_str(), status.ToString().c_str());
ROCKS_LOG_BUFFER(log_buffer,
"[%s] FIFO compaction: Couldn't get current time: %s. "
"Not doing compactions based on warm threshold. ",
cf_name.c_str(), status.ToString().c_str());
return nullptr;
}
const uint64_t current_time = static_cast<uint64_t>(_current_time);
@ -333,77 +328,56 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction(
inputs[0].level = 0;
// avoid underflow
uint64_t min_age = ages[0].age;
// kLastTemperature means target temperature is to be determined.
Temperature compaction_target_temp = Temperature::kLastTemperature;
if (current_time > min_age) {
uint64_t create_time_threshold = current_time - min_age;
if (current_time > mutable_cf_options.compaction_options_fifo.age_for_warm) {
uint64_t create_time_threshold =
current_time - mutable_cf_options.compaction_options_fifo.age_for_warm;
uint64_t compaction_size = 0;
// Ideally we would identify a file qualifying for temperature change by
// the timestamp of the youngest entry in the file. However, right
// now we don't have that information. We infer it from the oldest entry's
// timestamp of the previous file (which is just younger).
Temperature cur_target_temp;
// avoid index underflow
assert(level_files.size() >= 1);
for (size_t index = level_files.size() - 1; index >= 1; --index) {
// Try to add cur_file to compaction inputs.
FileMetaData* cur_file = level_files[index];
// prev_file is just younger than cur_file
FileMetaData* prev_file = level_files[index - 1];
if (cur_file->being_compacted) {
// Should not happen since we check for
// `level0_compactions_in_progress_` above. Here we simply don't
// schedule anything.
// Ideally we would identify a file qualifying for the warm tier by the
// timestamp of the youngest entry in the file. However, right now
// we don't have that information. We infer it from the oldest entry's
// timestamp of the next file (which is just younger).
FileMetaData* prev_file = nullptr;
for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
FileMetaData* f = *ritr;
assert(f);
if (f->being_compacted) {
// Right now this probably won't happen as we never try to schedule
// two compactions in parallel, so here we simply don't schedule
// anything.
return nullptr;
}
uint64_t oldest_ancestor_time = prev_file->TryGetOldestAncesterTime();
if (oldest_ancestor_time == kUnknownOldestAncesterTime) {
uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
if (oldest_ancester_time == kUnknownOldestAncesterTime) {
// Older files might not have enough information. It is possible to
// handle these files by looking at newer files, but maintaining the
// logic isn't worth it.
break;
}
if (oldest_ancestor_time > create_time_threshold) {
// cur_file is too fresh
if (oldest_ancester_time > create_time_threshold) {
// The previous file (which has slightly older data) doesn't qualify
// for warm tier.
break;
}
cur_target_temp = ages[0].temperature;
for (size_t i = 1; i < ages.size(); ++i) {
if (current_time >= ages[i].age &&
oldest_ancestor_time <= current_time - ages[i].age) {
cur_target_temp = ages[i].temperature;
}
}
if (cur_file->temperature == cur_target_temp) {
if (inputs[0].empty()) {
continue;
} else {
if (prev_file != nullptr) {
compaction_size += prev_file->fd.GetFileSize();
if (compaction_size > mutable_cf_options.max_compaction_bytes) {
break;
}
inputs[0].files.push_back(prev_file);
ROCKS_LOG_BUFFER(log_buffer,
"[%s] FIFO compaction: picking file %" PRIu64
" with next file's oldest time %" PRIu64 " for warm",
cf_name.c_str(), prev_file->fd.GetNumber(),
oldest_ancester_time);
}
// cur_file needs to change temperature
if (compaction_target_temp == Temperature::kLastTemperature) {
assert(inputs[0].empty());
compaction_target_temp = cur_target_temp;
} else if (cur_target_temp != compaction_target_temp) {
assert(!inputs[0].empty());
break;
}
if (inputs[0].empty() || compaction_size + cur_file->fd.GetFileSize() <=
mutable_cf_options.max_compaction_bytes) {
inputs[0].files.push_back(cur_file);
compaction_size += cur_file->fd.GetFileSize();
ROCKS_LOG_BUFFER(
log_buffer,
"[%s] FIFO compaction: picking file %" PRIu64
" with next file's oldest time %" PRIu64 " for temperature %s.",
cf_name.c_str(), cur_file->fd.GetNumber(), oldest_ancestor_time,
temperature_to_string[cur_target_temp].c_str());
}
if (compaction_size > mutable_cf_options.max_compaction_bytes) {
if (f->temperature == Temperature::kUnknown ||
f->temperature == Temperature::kHot) {
prev_file = f;
} else if (!inputs[0].files.empty()) {
// A warm file newer than files picked.
break;
} else {
assert(prev_file == nullptr);
}
}
}
@ -417,7 +391,7 @@ Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction(
std::move(inputs), 0, 0 /* output file size limit */,
0 /* max compaction bytes, not applicable */, 0 /* output path ID */,
mutable_cf_options.compression, mutable_cf_options.compression_opts,
compaction_target_temp,
Temperature::kWarm,
/* max_subcompactions */ 0, {}, /* is manual */ false, /* trim_ts */ "",
vstorage->CompactionScore(0),
/* is deletion compaction */ false, /* l0_files_might_overlap */ true,
@ -439,8 +413,8 @@ Compaction* FIFOCompactionPicker::PickCompaction(
vstorage, log_buffer);
}
if (c == nullptr) {
c = PickTemperatureChangeCompaction(
cf_name, mutable_cf_options, mutable_db_options, vstorage, log_buffer);
c = PickCompactionToWarm(cf_name, mutable_cf_options, mutable_db_options,
vstorage, log_buffer);
}
RegisterCompaction(c);
return c;
@ -469,3 +443,4 @@ Compaction* FIFOCompactionPicker::CompactRange(
}
} // namespace ROCKSDB_NAMESPACE
#endif // !ROCKSDB_LITE
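The core of the temperature-age logic above is mapping a file's inferred age onto the configured (temperature, age) thresholds: the oldest threshold a file's data exceeds determines its target temperature. A standalone sketch, assuming thresholds sorted by increasing age; Temp, TempAgeThreshold, and TargetTemperature are hypothetical names, and the compaction-size and being_compacted checks are omitted:
#include <cstdint>
#include <vector>
enum class Temp { kUnknown, kHot, kWarm, kCold };
struct TempAgeThreshold {
  Temp temperature;
  uint64_t age;  // seconds; thresholds assumed sorted by increasing age
};
// A file whose inferred age (current_time - oldest_ancestor_time) exceeds a
// threshold is assigned that threshold's temperature; later (older)
// thresholds override earlier ones, so the coldest applicable tier wins.
Temp TargetTemperature(const std::vector<TempAgeThreshold>& ages,
                       uint64_t current_time,
                       uint64_t oldest_ancestor_time) {
  Temp target = Temp::kUnknown;
  for (const TempAgeThreshold& t : ages) {
    if (current_time >= t.age &&
        oldest_ancestor_time <= current_time - t.age) {
      target = t.temperature;
    }
  }
  return target;
}
Files are then grouped into one compaction per target temperature, which is why the picker above stops extending the input set as soon as the computed target changes.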

@ -8,6 +8,7 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#ifndef ROCKSDB_LITE
#include "db/compaction/compaction_picker.h"
@ -52,9 +53,11 @@ class FIFOCompactionPicker : public CompactionPicker {
VersionStorageInfo* version,
LogBuffer* log_buffer);
Compaction* PickTemperatureChangeCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
LogBuffer* log_buffer);
Compaction* PickCompactionToWarm(const std::string& cf_name,
const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options,
VersionStorageInfo* version,
LogBuffer* log_buffer);
};
} // namespace ROCKSDB_NAMESPACE
#endif // !ROCKSDB_LITE

@ -83,7 +83,7 @@ class LevelCompactionBuilder {
Compaction* GetCompaction();
// From `start_level_`, pick files to compact to `output_level_`.
// For the specified level, pick a file that we want to compact.
// Returns false if there is no file to compact.
// If it returns true, inputs->files.size() will be exactly one for
// all compaction priorities except round-robin. For round-robin,
@ -107,9 +107,8 @@ class LevelCompactionBuilder {
bool PickIntraL0Compaction();
// Return true if TrivialMove is extended. `start_index` is the index of
// the initial file picked, which should already be in `start_level_inputs_`.
bool TryExtendNonL0TrivialMove(int start_index,
bool only_expand_right = false);
// the initial file picked, which should already be in `start_level_inputs_`.
bool TryExtendNonL0TrivialMove(int start_index);
// Picks a file from level_files to compact.
// level_files is a vector of (level, file metadata) in ascending order of
@ -356,8 +355,7 @@ void LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion() {
vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
&output_level_inputs.files);
if (output_level_inputs.empty()) {
if (TryExtendNonL0TrivialMove((int)start_index,
true /* only_expand_right */)) {
if (TryExtendNonL0TrivialMove((int)start_index)) {
return;
}
}
@ -503,16 +501,6 @@ Compaction* LevelCompactionBuilder::PickCompaction() {
}
Compaction* LevelCompactionBuilder::GetCompaction() {
// TryPickL0TrivialMove() does not apply to the case when compacting L0 to an
// empty output level. So L0 files are picked in PickFileToCompact() by
// compaction score. We may still be able to do a trivial move when this file
// does not overlap with other L0s. This happens when
// compaction_inputs_[0].size() == 1 since SetupOtherL0FilesIfNeeded() did not
// pull in more L0s.
assert(!compaction_inputs_.empty());
bool l0_files_might_overlap =
start_level_ == 0 && !is_l0_trivial_move_ &&
(compaction_inputs_.size() > 1 || compaction_inputs_[0].size() > 1);
auto c = new Compaction(
vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
std::move(compaction_inputs_), output_level_,
@ -527,7 +515,8 @@ Compaction* LevelCompactionBuilder::GetCompaction() {
Temperature::kUnknown,
/* max_subcompactions */ 0, std::move(grandparents_), is_manual_,
/* trim_ts */ "", start_level_score_, false /* deletion_compaction */,
l0_files_might_overlap, compaction_reason_);
/* l0_files_might_overlap */ start_level_ == 0 && !is_l0_trivial_move_,
compaction_reason_);
// If it's level 0 compaction, make sure we don't execute any other level 0
// compactions in parallel
@ -664,8 +653,7 @@ bool LevelCompactionBuilder::TryPickL0TrivialMove() {
return false;
}
bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index,
bool only_expand_right) {
bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index) {
if (start_level_inputs_.size() == 1 &&
(ioptions_.db_paths.empty() || ioptions_.db_paths.size() == 1) &&
(mutable_cf_options_.compression_per_level.empty())) {
@ -682,7 +670,6 @@ bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index,
size_t total_size = initial_file->fd.GetFileSize();
CompactionInputFiles output_level_inputs;
output_level_inputs.level = output_level_;
// Expand towards right
for (int i = start_index + 1;
i < static_cast<int>(level_files.size()) &&
start_level_inputs_.size() < kMaxMultiTrivialMove;
@ -715,37 +702,6 @@ bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index,
}
start_level_inputs_.files.push_back(next_file);
}
// Expand towards left
if (!only_expand_right) {
for (int i = start_index - 1;
i >= 0 && start_level_inputs_.size() < kMaxMultiTrivialMove; i--) {
FileMetaData* next_file = level_files[i];
if (next_file->being_compacted) {
break;
}
vstorage_->GetOverlappingInputs(output_level_, &(next_file->smallest),
&(initial_file->largest),
&output_level_inputs.files);
if (!output_level_inputs.empty()) {
break;
}
if (i > 0 && compaction_picker_->icmp()
->user_comparator()
->CompareWithoutTimestamp(
next_file->smallest.user_key(),
level_files[i - 1]->largest.user_key()) == 0) {
// Adding the next file would not make a clean user-key cut. Skip.
break;
}
total_size += next_file->fd.GetFileSize();
if (total_size > mutable_cf_options_.max_compaction_bytes) {
break;
}
// keep `files` sorted in increasing order by key range
start_level_inputs_.files.insert(start_level_inputs_.files.begin(),
next_file);
}
}
return start_level_inputs_.size() > 1;
}
return false;
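The expansion loop above can be pictured as growing a run of adjacent idle files around the initially picked one, stopping at any file that is being compacted, would overlap the output level, or would push the cumulative size past max_compaction_bytes. Below is a simplified sketch of the rightward half of that walk; FileInfo, overlaps_output_level, and ExtendTrivialMoveRight are hypothetical stand-ins for the real file metadata and GetOverlappingInputs() checks.
#include <cstdint>
#include <vector>
struct FileInfo {
  uint64_t size;
  bool being_compacted;
  bool overlaps_output_level;  // stand-in for the output-level overlap check
};
// Starting from files[start_index], append successive adjacent files while
// they are idle, non-overlapping with the output level, within the size
// budget, and the run stays under max_files. Returns the picked indices in
// key order; the leftward expansion is analogous.
std::vector<int> ExtendTrivialMoveRight(const std::vector<FileInfo>& files,
                                        int start_index,
                                        uint64_t max_compaction_bytes,
                                        size_t max_files) {
  std::vector<int> picked = {start_index};
  uint64_t total = files[start_index].size;
  for (int i = start_index + 1;
       i < static_cast<int>(files.size()) && picked.size() < max_files; ++i) {
    const FileInfo& f = files[i];
    if (f.being_compacted || f.overlaps_output_level) break;
    if (total + f.size > max_compaction_bytes) break;
    total += f.size;
    picked.push_back(i);
  }
  return picked;
}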
@ -829,10 +785,7 @@ bool LevelCompactionBuilder::PickFileToCompact() {
vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
&output_level_inputs.files);
if (output_level_inputs.empty()) {
if (start_level_ > 0 &&
TryExtendNonL0TrivialMove(index,
ioptions_.compaction_pri ==
kRoundRobin /* only_expand_right */)) {
if (TryExtendNonL0TrivialMove(index)) {
break;
}
} else {

@ -70,11 +70,6 @@ class CompactionPickerTestBase : public testing::Test {
mutable_cf_options_.RefreshDerivedOptions(ioptions_);
ioptions_.cf_paths.emplace_back("dummy",
std::numeric_limits<uint64_t>::max());
// When the default value of this option is true, universal compaction
// tests can encounter assertion failures since SanitizeOptions() is
// not run to set this option to false. So we do the sanitization
// here. Tests that exercise this option set it to true explicitly.
ioptions_.level_compaction_dynamic_level_bytes = false;
}
~CompactionPickerTestBase() override {}
@ -153,8 +148,7 @@ class CompactionPickerTestBase : public testing::Test {
smallest_seq, largest_seq, marked_for_compact, temperature,
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum,
kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0,
true /* user_defined_timestamps_persisted */);
kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
f->compensated_file_size =
(compensated_file_size != 0) ? compensated_file_size : file_size;
f->oldest_ancester_time = oldest_ancestor_time;
@ -488,6 +482,8 @@ TEST_F(CompactionPickerTest, LevelTriggerDynamic4) {
ASSERT_EQ(num_levels - 1, compaction->output_level());
}
// Universal and FIFO Compactions are not supported in ROCKSDB_LITE
#ifndef ROCKSDB_LITE
TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
NewVersionStorage(1, kCompactionStyleUniversal);
UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
@ -511,7 +507,7 @@ TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) {
const uint64_t kFileSize = 100000;
NewVersionStorage(3 /* num_levels */, kCompactionStyleUniversal);
NewVersionStorage(1, kCompactionStyleUniversal);
ioptions_.allow_ingest_behind = true;
ioptions_.num_levels = 3;
UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
@ -538,14 +534,6 @@ TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) {
// output level should be the one above the bottom-most
ASSERT_EQ(1, compaction->output_level());
// input should not include the reserved level
const std::vector<CompactionInputFiles>* inputs = compaction->inputs();
for (const auto& compaction_input : *inputs) {
if (!compaction_input.empty()) {
ASSERT_LT(compaction_input.level, 2);
}
}
}
// Tests if the files can be trivially moved in multi level
// universal compaction when allow_trivial_move option is set
@ -1019,28 +1007,29 @@ TEST_F(CompactionPickerTest, NeedsCompactionFIFO) {
}
}
TEST_F(CompactionPickerTest, FIFOToCold1) {
TEST_F(CompactionPickerTest, FIFOToWarm1) {
NewVersionStorage(1, kCompactionStyleFIFO);
const uint64_t kFileSize = 100000;
const uint64_t kMaxSize = kFileSize * 100000;
uint64_t kColdThreshold = 2000;
uint64_t kWarmThreshold = 2000;
fifo_options_.max_table_files_size = kMaxSize;
fifo_options_.file_temperature_age_thresholds = {
{Temperature::kCold, kColdThreshold}};
fifo_options_.age_for_warm = kWarmThreshold;
mutable_cf_options_.compaction_options_fifo = fifo_options_;
mutable_cf_options_.level0_file_num_compaction_trigger = 100;
mutable_cf_options_.level0_file_num_compaction_trigger = 2;
mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
int64_t current_time = 0;
ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
uint64_t threshold_time =
static_cast<uint64_t>(current_time) - kColdThreshold;
Add(0 /* level */, 4U /* file_number */, "260", "300", 1 * kFileSize, 0, 2500,
2600, 0, true, Temperature::kUnknown,
threshold_time - 2000 /* oldest_ancestor_time */);
// Qualifies for compaction to kCold.
static_cast<uint64_t>(current_time) - kWarmThreshold;
Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
Temperature::kUnknown, threshold_time + 100);
Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
Temperature::kUnknown, threshold_time - 2000);
Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
Temperature::kUnknown, threshold_time - 3000);
UpdateVersionStorageInfo();
@ -1050,36 +1039,33 @@ TEST_F(CompactionPickerTest, FIFOToCold1) {
cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
&log_buffer_));
ASSERT_TRUE(compaction.get() != nullptr);
ASSERT_EQ(compaction->compaction_reason(),
CompactionReason::kChangeTemperature);
ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
ASSERT_EQ(1U, compaction->num_input_files(0));
ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
}
TEST_F(CompactionPickerTest, FIFOToCold2) {
TEST_F(CompactionPickerTest, FIFOToWarm2) {
NewVersionStorage(1, kCompactionStyleFIFO);
const uint64_t kFileSize = 100000;
const uint64_t kMaxSize = kFileSize * 100000;
uint64_t kColdThreshold = 2000;
uint64_t kWarmThreshold = 2000;
fifo_options_.max_table_files_size = kMaxSize;
fifo_options_.file_temperature_age_thresholds = {
{Temperature::kCold, kColdThreshold}};
fifo_options_.age_for_warm = kWarmThreshold;
mutable_cf_options_.compaction_options_fifo = fifo_options_;
mutable_cf_options_.level0_file_num_compaction_trigger = 100;
mutable_cf_options_.level0_file_num_compaction_trigger = 2;
mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
int64_t current_time = 0;
ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
uint64_t threshold_time =
static_cast<uint64_t>(current_time) - kColdThreshold;
static_cast<uint64_t>(current_time) - kWarmThreshold;
Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
Temperature::kUnknown, threshold_time + 100);
Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
Temperature::kUnknown, threshold_time);
// The following two files qualify for compaction to kCold.
Temperature::kUnknown, threshold_time - 2000);
Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
Temperature::kUnknown, threshold_time - 3000);
Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
@ -1091,40 +1077,34 @@ TEST_F(CompactionPickerTest, FIFOToCold2) {
cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
&log_buffer_));
ASSERT_TRUE(compaction.get() != nullptr);
ASSERT_EQ(compaction->compaction_reason(),
CompactionReason::kChangeTemperature);
ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
ASSERT_EQ(2U, compaction->num_input_files(0));
ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
}
TEST_F(CompactionPickerTest, FIFOToColdMaxCompactionSize) {
TEST_F(CompactionPickerTest, FIFOToWarmMaxSize) {
NewVersionStorage(1, kCompactionStyleFIFO);
const uint64_t kFileSize = 100000;
const uint64_t kMaxSize = kFileSize * 100000;
uint64_t kColdThreshold = 2000;
uint64_t kWarmThreshold = 2000;
fifo_options_.max_table_files_size = kMaxSize;
fifo_options_.file_temperature_age_thresholds = {
{Temperature::kCold, kColdThreshold}};
fifo_options_.age_for_warm = kWarmThreshold;
mutable_cf_options_.compaction_options_fifo = fifo_options_;
mutable_cf_options_.level0_file_num_compaction_trigger = 100;
mutable_cf_options_.level0_file_num_compaction_trigger = 2;
mutable_cf_options_.max_compaction_bytes = kFileSize * 9;
FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
int64_t current_time = 0;
ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
uint64_t threshold_time =
static_cast<uint64_t>(current_time) - kColdThreshold;
static_cast<uint64_t>(current_time) - kWarmThreshold;
Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
Temperature::kUnknown, threshold_time + 100);
Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
Temperature::kUnknown, threshold_time - 2000);
// The following two files qualify for compaction to kCold.
// But only the last two should be included to respect `max_compaction_bytes`.
Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
Temperature::kUnknown, threshold_time - 3000);
Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
@ -1138,45 +1118,40 @@ TEST_F(CompactionPickerTest, FIFOToColdMaxCompactionSize) {
cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
&log_buffer_));
ASSERT_TRUE(compaction.get() != nullptr);
ASSERT_EQ(compaction->compaction_reason(),
CompactionReason::kChangeTemperature);
ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
ASSERT_EQ(2U, compaction->num_input_files(0));
ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
}
TEST_F(CompactionPickerTest, FIFOToColdWithExistingCold) {
TEST_F(CompactionPickerTest, FIFOToWarmWithExistingWarm) {
NewVersionStorage(1, kCompactionStyleFIFO);
const uint64_t kFileSize = 100000;
const uint64_t kMaxSize = kFileSize * 100000;
uint64_t kColdThreshold = 2000;
uint64_t kWarmThreshold = 2000;
fifo_options_.max_table_files_size = kMaxSize;
fifo_options_.file_temperature_age_thresholds = {
{Temperature::kCold, kColdThreshold}};
fifo_options_.age_for_warm = kWarmThreshold;
mutable_cf_options_.compaction_options_fifo = fifo_options_;
mutable_cf_options_.level0_file_num_compaction_trigger = 100;
mutable_cf_options_.level0_file_num_compaction_trigger = 2;
mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
int64_t current_time = 0;
ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
uint64_t threshold_time =
static_cast<uint64_t>(current_time) - kColdThreshold;
static_cast<uint64_t>(current_time) - kWarmThreshold;
Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
Temperature::kUnknown, threshold_time + 100);
Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
Temperature::kUnknown, threshold_time - 2000);
// The following two files qualify for compaction to kCold.
Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
Temperature::kUnknown, threshold_time - 3000);
Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
Temperature::kUnknown, threshold_time - 4000);
Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
Temperature::kCold, threshold_time - 5000);
Temperature::kWarm, threshold_time - 5000);
UpdateVersionStorageInfo();
ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
@ -1184,32 +1159,28 @@ TEST_F(CompactionPickerTest, FIFOToColdWithExistingCold) {
cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
&log_buffer_));
ASSERT_TRUE(compaction.get() != nullptr);
ASSERT_EQ(compaction->compaction_reason(),
CompactionReason::kChangeTemperature);
ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
ASSERT_EQ(2U, compaction->num_input_files(0));
ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
}
TEST_F(CompactionPickerTest, FIFOToColdWithHotBetweenCold) {
TEST_F(CompactionPickerTest, FIFOToWarmWithOngoing) {
NewVersionStorage(1, kCompactionStyleFIFO);
const uint64_t kFileSize = 100000;
const uint64_t kMaxSize = kFileSize * 100000;
uint64_t kColdThreshold = 2000;
uint64_t kWarmThreshold = 2000;
fifo_options_.max_table_files_size = kMaxSize;
fifo_options_.file_temperature_age_thresholds = {
{Temperature::kCold, kColdThreshold}};
fifo_options_.age_for_warm = kWarmThreshold;
mutable_cf_options_.compaction_options_fifo = fifo_options_;
mutable_cf_options_.level0_file_num_compaction_trigger = 100;
mutable_cf_options_.level0_file_num_compaction_trigger = 2;
mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
int64_t current_time = 0;
ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
uint64_t threshold_time =
static_cast<uint64_t>(current_time) - kColdThreshold;
static_cast<uint64_t>(current_time) - kWarmThreshold;
Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
@@ -1217,78 +1188,65 @@ TEST_F(CompactionPickerTest, FIFOToColdWithHotBetweenCold) {
Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
Temperature::kUnknown, threshold_time - 2000);
Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
Temperature::kCold, threshold_time - 3000);
// Qualifies for compaction to kCold.
Temperature::kUnknown, threshold_time - 3000);
Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
Temperature::kUnknown, threshold_time - 4000);
Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
Temperature::kCold, threshold_time - 5000);
Temperature::kWarm, threshold_time - 5000);
file_map_[2].first->being_compacted = true;
UpdateVersionStorageInfo();
ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
&log_buffer_));
ASSERT_TRUE(compaction.get() != nullptr);
ASSERT_EQ(compaction->compaction_reason(),
CompactionReason::kChangeTemperature);
ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
ASSERT_EQ(1U, compaction->num_input_files(0));
ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
// Stop if a file is being compacted
ASSERT_TRUE(compaction.get() == nullptr);
}
TEST_F(CompactionPickerTest, FIFOToColdAndWarm) {
TEST_F(CompactionPickerTest, FIFOToWarmWithHotBetweenWarms) {
NewVersionStorage(1, kCompactionStyleFIFO);
const uint64_t kFileSize = 100000;
const uint64_t kMaxSize = kFileSize * 100000;
uint64_t kWarmThreshold = 10000;
uint64_t kHotThreshold = 2000;
uint64_t kWarmThreshold = 2000;
fifo_options_.max_table_files_size = kMaxSize;
// Test that multiple threshold works.
fifo_options_.file_temperature_age_thresholds = {
{Temperature::kHot, kHotThreshold}, {Temperature::kWarm, kWarmThreshold}};
fifo_options_.age_for_warm = kWarmThreshold;
mutable_cf_options_.compaction_options_fifo = fifo_options_;
mutable_cf_options_.level0_file_num_compaction_trigger = 100;
mutable_cf_options_.level0_file_num_compaction_trigger = 2;
mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
int64_t current_time = 0;
ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
uint64_t hot_threshold_time =
static_cast<uint64_t>(current_time) - kHotThreshold;
uint64_t warm_threshold_time =
uint64_t threshold_time =
static_cast<uint64_t>(current_time) - kWarmThreshold;
Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
Temperature::kUnknown, hot_threshold_time + 100);
Temperature::kUnknown, threshold_time + 100);
Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
Temperature::kUnknown, hot_threshold_time - 200);
// Qualifies for Hot
Temperature::kUnknown, threshold_time - 2000);
Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
Temperature::kUnknown, warm_threshold_time - 100);
// Qualifies for Warm
Temperature::kWarm, threshold_time - 3000);
Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
Temperature::kUnknown, warm_threshold_time - 4000);
Temperature::kUnknown, threshold_time - 4000);
Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
Temperature::kUnknown, warm_threshold_time - 5000);
Temperature::kWarm, threshold_time - 5000);
UpdateVersionStorageInfo();
ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
&log_buffer_));
// Stop if a file is being compacted
ASSERT_TRUE(compaction.get() != nullptr);
ASSERT_EQ(compaction->compaction_reason(),
CompactionReason::kChangeTemperature);
// Assumes compaction picker picks older files first.
ASSERT_EQ(compaction->output_temperature(), Temperature::kWarm);
ASSERT_EQ(2U, compaction->num_input_files(0));
ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
ASSERT_EQ(1U, compaction->num_input_files(0));
ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
}
#endif // ROCKSDB_LITE
TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) {
NewVersionStorage(6, kCompactionStyleLevel);
ioptions_.compaction_pri = kMinOverlappingRatio;
@@ -2565,61 +2523,6 @@ TEST_F(CompactionPickerTest, L0TrivialMoveWholeL0) {
ASSERT_TRUE(compaction->IsTrivialMove());
}
TEST_F(CompactionPickerTest, NonL0TrivialMoveExtendBothDirection) {
mutable_cf_options_.max_bytes_for_level_base = 5000;
mutable_cf_options_.level0_file_num_compaction_trigger = 4;
mutable_cf_options_.max_compaction_bytes = 10000000u;
ioptions_.level_compaction_dynamic_level_bytes = false;
NewVersionStorage(6, kCompactionStyleLevel);
Add(1, 1U, "300", "350", 3000U, 0, 710, 800, 3000U);
Add(1, 2U, "600", "651", 3001U, 0, 610, 700, 3001U);
Add(1, 3U, "700", "750", 3000U, 0, 500, 550, 3000U);
Add(2, 4U, "800", "850", 4000U, 0, 150, 200, 4000U);
UpdateVersionStorageInfo();
// File #2 should be picked first, and expand both directions to include
// files #1 and #3.
std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
&log_buffer_));
ASSERT_TRUE(compaction.get() != nullptr);
ASSERT_EQ(1, compaction->num_input_levels());
ASSERT_EQ(3, compaction->num_input_files(0));
ASSERT_EQ(1, compaction->input(0, 0)->fd.GetNumber());
ASSERT_EQ(2, compaction->input(0, 1)->fd.GetNumber());
ASSERT_EQ(3, compaction->input(0, 2)->fd.GetNumber());
ASSERT_TRUE(compaction->IsTrivialMove());
}
TEST_F(CompactionPickerTest, L0TrivialMoveToEmptyLevel) {
mutable_cf_options_.max_bytes_for_level_base = 5000;
mutable_cf_options_.level0_file_num_compaction_trigger = 4;
mutable_cf_options_.max_compaction_bytes = 10000000u;
ioptions_.level_compaction_dynamic_level_bytes = false;
NewVersionStorage(6, kCompactionStyleLevel);
// File 2 will be picked first, which by itself is trivial movable.
// There was a bug before where compaction also picks file 3 and 4,
// (and then file 1 since it overlaps with the key range),
// which makes the compaction not trivial movable.
Add(0, 1U, "450", "599", 3000U, 0, 710, 800, 3000U);
Add(0, 2U, "600", "651", 3001U, 0, 610, 700, 3001U);
Add(0, 3U, "300", "350", 3000U, 0, 500, 550, 3000U);
Add(0, 4U, "500", "550", 2999U, 0, 300, 350, 2999U);
UpdateVersionStorageInfo();
std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
&log_buffer_));
ASSERT_TRUE(compaction.get() != nullptr);
ASSERT_EQ(1, compaction->num_input_levels());
ASSERT_EQ(1, compaction->num_input_files(0));
ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber());
ASSERT_TRUE(compaction->IsTrivialMove());
}
TEST_F(CompactionPickerTest, IsTrivialMoveOffSstPartitioned) {
mutable_cf_options_.max_bytes_for_level_base = 10000u;
mutable_cf_options_.max_compaction_bytes = 10001u;
@@ -2970,6 +2873,7 @@ TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) {
ASSERT_EQ(0, compaction->output_level());
}
#ifndef ROCKSDB_LITE
TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) {
const uint64_t kFileSize = 100000;
@@ -4078,6 +3982,7 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompactionPickerTest,
PerKeyPlacementCompactionPickerTest, ::testing::Bool());
#endif // ROCKSDB_LITE
} // namespace ROCKSDB_NAMESPACE
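For reference, the FIFO picker tests in this file exercise two generations of the temperature-ageing option: the removed side configures CompactionOptionsFIFO::file_temperature_age_thresholds (a list of {temperature, age} pairs), while the added side uses the older single age_for_warm cutoff. The following is a minimal configuration sketch, assuming the public field names from the RocksDB headers; the 10 GB size cap and one-day age threshold are made-up illustrative values, not taken from the diff.

#include <rocksdb/advanced_options.h>
#include <rocksdb/options.h>

rocksdb::Options MakeFifoColdTierOptions() {
  rocksdb::Options options;
  options.compaction_style = rocksdb::kCompactionStyleFIFO;

  rocksdb::CompactionOptionsFIFO fifo;
  fifo.max_table_files_size = 10ull << 30;  // illustrative 10 GB cap
  // Newer API (used by the removed tests): per-temperature age thresholds,
  // e.g. move files older than one day to cold storage.
  fifo.file_temperature_age_thresholds = {
      {rocksdb::Temperature::kCold, /*age_seconds=*/60 * 60 * 24}};
  // Older API (used by the added tests) was a single warm-age cutoff:
  // fifo.age_for_warm = 60 * 60 * 24;

  options.compaction_options_fifo = fifo;
  return options;
}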

@@ -8,6 +8,7 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/compaction/compaction_picker_universal.h"
#ifndef ROCKSDB_LITE
#include <cinttypes>
#include <limits>
@@ -19,7 +20,7 @@
#include "file/filename.h"
#include "logging/log_buffer.h"
#include "logging/logging.h"
#include "monitoring/statistics_impl.h"
#include "monitoring/statistics.h"
#include "test_util/sync_point.h"
#include "util/random.h"
#include "util/string_util.h"
@@ -133,8 +134,8 @@ class UniversalCompactionBuilder {
UniversalCompactionPicker* picker_;
LogBuffer* log_buffer_;
static std::vector<UniversalCompactionBuilder::SortedRun> CalculateSortedRuns(
const VersionStorageInfo& vstorage, int last_level);
static std::vector<SortedRun> CalculateSortedRuns(
const VersionStorageInfo& vstorage);
// Pick a path ID to place a newly generated file, with its estimated file
// size.
@@ -339,13 +340,13 @@ void UniversalCompactionBuilder::SortedRun::DumpSizeInfo(
std::vector<UniversalCompactionBuilder::SortedRun>
UniversalCompactionBuilder::CalculateSortedRuns(
const VersionStorageInfo& vstorage, int last_level) {
const VersionStorageInfo& vstorage) {
std::vector<UniversalCompactionBuilder::SortedRun> ret;
for (FileMetaData* f : vstorage.LevelFiles(0)) {
ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size,
f->being_compacted);
}
for (int level = 1; level <= last_level; level++) {
for (int level = 1; level < vstorage.num_levels(); level++) {
uint64_t total_compensated_size = 0U;
uint64_t total_size = 0U;
bool being_compacted = false;
@@ -374,9 +375,7 @@ UniversalCompactionBuilder::CalculateSortedRuns(
Compaction* UniversalCompactionBuilder::PickCompaction() {
const int kLevel0 = 0;
score_ = vstorage_->CompactionScore(kLevel0);
int max_output_level =
vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
sorted_runs_ = CalculateSortedRuns(*vstorage_, max_output_level);
sorted_runs_ = CalculateSortedRuns(*vstorage_);
if (sorted_runs_.size() == 0 ||
(vstorage_->FilesMarkedForPeriodicCompaction().empty() &&
@@ -473,8 +472,6 @@ Compaction* UniversalCompactionBuilder::PickCompaction() {
"UniversalCompactionBuilder::PickCompaction:Return", nullptr);
return nullptr;
}
assert(c->output_level() <=
vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind));
if (mutable_cf_options_.compaction_options_universal.allow_trivial_move ==
true &&
@@ -702,18 +699,22 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
int start_level = sorted_runs_[start_index].level;
int output_level;
// last level is reserved for the files ingested behind
int max_output_level =
vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
if (first_index_after == sorted_runs_.size()) {
output_level = max_output_level;
output_level = vstorage_->num_levels() - 1;
} else if (sorted_runs_[first_index_after].level == 0) {
output_level = 0;
} else {
output_level = sorted_runs_[first_index_after].level - 1;
}
std::vector<CompactionInputFiles> inputs(max_output_level + 1);
// last level is reserved for the files ingested behind
if (ioptions_.allow_ingest_behind &&
(output_level == vstorage_->num_levels() - 1)) {
assert(output_level > 1);
output_level--;
}
std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
for (size_t i = 0; i < inputs.size(); ++i) {
inputs[i].level = start_level + static_cast<int>(i);
}
@@ -1192,10 +1193,8 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
return nullptr;
}
int max_output_level =
vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
// Pick the first non-empty level after the start_level
for (output_level = start_level + 1; output_level <= max_output_level;
for (output_level = start_level + 1; output_level < vstorage_->num_levels();
output_level++) {
if (vstorage_->NumLevelFiles(output_level) != 0) {
break;
@@ -1203,9 +1202,9 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
}
// If all higher levels are empty, pick the highest level as output level
if (output_level > max_output_level) {
if (output_level == vstorage_->num_levels()) {
if (start_level == 0) {
output_level = max_output_level;
output_level = vstorage_->num_levels() - 1;
} else {
// If start level is non-zero and all higher levels are empty, this
// compaction will translate into a trivial move. Since the idea is
@@ -1214,7 +1213,11 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
return nullptr;
}
}
assert(output_level <= max_output_level);
if (ioptions_.allow_ingest_behind &&
output_level == vstorage_->num_levels() - 1) {
assert(output_level > 1);
output_level--;
}
if (output_level != 0) {
if (start_level == 0) {
@@ -1291,9 +1294,8 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
uint32_t path_id =
GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
int start_level = sorted_runs_[start_index].level;
int max_output_level =
vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
std::vector<CompactionInputFiles> inputs(max_output_level + 1);
std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
for (size_t i = 0; i < inputs.size(); ++i) {
inputs[i].level = start_level + static_cast<int>(i);
}
@@ -1330,7 +1332,13 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
int output_level;
if (end_index == sorted_runs_.size() - 1) {
output_level = max_output_level;
// output files at the last level, unless it's reserved
output_level = vstorage_->num_levels() - 1;
// last level is reserved for the files ingested behind
if (ioptions_.allow_ingest_behind) {
assert(output_level > 1);
output_level--;
}
} else {
// if it's not including all sorted_runs, it can only output to the level
// above the `end_index + 1` sorted_run.
@@ -1443,3 +1451,4 @@ uint64_t UniversalCompactionBuilder::GetMaxOverlappingBytes() const {
}
} // namespace ROCKSDB_NAMESPACE
#endif // !ROCKSDB_LITE
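For context, the output-level changes in this file all revolve around DBOptions::allow_ingest_behind, which keeps the bottommost level free of compaction output so that external files can later be ingested underneath the existing data. The sketch below shows that user-facing contract under the standard DB::Open / IngestExternalFile API; the paths and the choice of universal compaction are illustrative assumptions, not part of the diff.

#include <rocksdb/db.h>
#include <rocksdb/options.h>

rocksdb::Status IngestBehindExample() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.compaction_style = rocksdb::kCompactionStyleUniversal;
  options.allow_ingest_behind = true;  // reserves the last level

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/ingest_behind_db", &db);
  if (!s.ok()) return s;

  rocksdb::IngestExternalFileOptions ifo;
  ifo.ingest_behind = true;  // place the file in the reserved last level
  s = db->IngestExternalFile({"/tmp/prebuilt.sst"}, ifo);
  delete db;
  return s;
}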

@@ -8,6 +8,7 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#ifndef ROCKSDB_LITE
#include "db/compaction/compaction_picker.h"
@@ -28,3 +29,4 @@ class UniversalCompactionPicker : public CompactionPicker {
const VersionStorageInfo* vstorage) const override;
};
} // namespace ROCKSDB_NAMESPACE
#endif // !ROCKSDB_LITE

@@ -16,6 +16,7 @@
#include "options/options_helper.h"
#include "rocksdb/utilities/options_type.h"
#ifndef ROCKSDB_LITE
namespace ROCKSDB_NAMESPACE {
class SubcompactionState;
@@ -831,3 +832,4 @@ bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other,
#endif // NDEBUG
} // namespace ROCKSDB_NAMESPACE
#endif // !ROCKSDB_LITE

@@ -3,6 +3,7 @@
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#ifndef ROCKSDB_LITE
#include "db/db_test_util.h"
#include "port/stack_trace.h"
@@ -928,7 +929,7 @@ TEST_F(CompactionServiceTest, TablePropertiesCollector) {
}
ASSERT_OK(Flush());
}
ASSERT_OK(dbfull()->TEST_WaitForCompact());
ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
ASSERT_OK(db_->GetPropertiesOfAllTables(&fname_to_props));
@@ -952,3 +953,14 @@ int main(int argc, char** argv) {
RegisterCustomObjects(argc, argv);
return RUN_ALL_TESTS();
}
#else
#include <stdio.h>
int main(int /*argc*/, char** /*argv*/) {
fprintf(stderr,
"SKIPPED as CompactionService is not supported in ROCKSDB_LITE\n");
return 0;
}
#endif // ROCKSDB_LITE

@@ -15,9 +15,11 @@
namespace ROCKSDB_NAMESPACE {
static std::unordered_map<std::string, OptionTypeInfo>
sst_fixed_prefix_type_info = {
#ifndef ROCKSDB_LITE
{"length",
{0, OptionType::kSizeT, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
#endif // ROCKSDB_LITE
};
SstPartitionerFixedPrefixFactory::SstPartitionerFixedPrefixFactory(size_t len)
@@ -56,6 +58,7 @@ std::shared_ptr<SstPartitionerFactory> NewSstPartitionerFixedPrefixFactory(
return std::make_shared<SstPartitionerFixedPrefixFactory>(prefix_len);
}
#ifndef ROCKSDB_LITE
namespace {
static int RegisterSstPartitionerFactories(ObjectLibrary& library,
const std::string& /*arg*/) {
@@ -70,14 +73,18 @@ static int RegisterSstPartitionerFactories(ObjectLibrary& library,
return 1;
}
} // namespace
#endif // ROCKSDB_LITE
Status SstPartitionerFactory::CreateFromString(
const ConfigOptions& options, const std::string& value,
std::shared_ptr<SstPartitionerFactory>* result) {
#ifndef ROCKSDB_LITE
static std::once_flag once;
std::call_once(once, [&]() {
RegisterSstPartitionerFactories(*(ObjectLibrary::Default().get()), "");
});
return LoadSharedObject<SstPartitionerFactory>(options, value, result);
#endif // ROCKSDB_LITE
return LoadSharedObject<SstPartitionerFactory>(options, value, nullptr,
result);
}
} // namespace ROCKSDB_NAMESPACE
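The string-based factory lookup above pairs with the direct constructor exported from rocksdb/sst_partitioner.h. Below is a small sketch of both paths; the 4-byte prefix length is arbitrary, and the "id=SstPartitionerFixedPrefixFactory;length=4" option string is our guess at the registered class name combined with the "length" property shown in the diff, so treat it as an assumption rather than a documented format.

#include <memory>
#include <rocksdb/convenience.h>
#include <rocksdb/options.h>
#include <rocksdb/sst_partitioner.h>

void ConfigureFixedPrefixPartitioner(rocksdb::ColumnFamilyOptions* cf_opts) {
  // Direct construction via the public factory function.
  cf_opts->sst_partitioner_factory =
      rocksdb::NewSstPartitionerFixedPrefixFactory(/*prefix_len=*/4);

  // Equivalent configuration through the string-based API exercised by
  // SstPartitionerFactory::CreateFromString above (non-LITE builds only).
  rocksdb::ConfigOptions config_options;
  std::shared_ptr<rocksdb::SstPartitionerFactory> factory;
  rocksdb::Status s = rocksdb::SstPartitionerFactory::CreateFromString(
      config_options, "id=SstPartitionerFixedPrefixFactory;length=4",
      &factory);
  if (s.ok()) {
    cf_opts->sst_partitioner_factory = factory;
  }
}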
