Squashed 'librocksdb-sys/rocksdb/' content from commit 217f76421

git-subtree-dir: librocksdb-sys/rocksdb
git-subtree-split: 217f7642119aa60a9fa64bc64e02d7f78813d911
master
Niko PLP 11 months ago
commit 081ffa92ee
  1. 892
      .circleci/config.yml
  2. 6
      .circleci/ubsan_suppression_list.txt
  3. 5
      .clang-format
  4. 45
      .github/workflows/sanity_check.yml
  5. 99
      .gitignore
  6. 4
      .lgtm.yml
  7. 6
      .watchmanconfig
  8. 12
      AUTHORS
  9. 1597
      CMakeLists.txt
  10. 77
      CODE_OF_CONDUCT.md
  11. 17
      CONTRIBUTING.md
  12. 339
      COPYING
  13. 24
      DEFAULT_OPTIONS_HISTORY.md
  14. 16
      DUMP_FORMAT.md
  15. 2440
      HISTORY.md
  16. 220
      INSTALL.md
  17. 26
      LANGUAGE-BINDINGS.md
  18. 202
      LICENSE.Apache
  19. 29
      LICENSE.leveldb
  20. 2582
      Makefile
  21. 8
      PLUGINS.md
  22. 29
      README.md
  23. 5607
      TARGETS
  24. 165
      USERS.md
  25. 39
      Vagrantfile
  26. 228
      WINDOWS_PORT.md
  27. 6163
      buckifier/bench-slow.json
  28. 1594
      buckifier/bench.json
  29. 333
      buckifier/buckify_rocksdb.py
  30. 32
      buckifier/check_buck_targets.sh
  31. 6
      buckifier/rocks_test_runner.sh
  32. 150
      buckifier/targets_builder.py
  33. 41
      buckifier/targets_cfg.py
  34. 118
      buckifier/util.py
  35. 168
      build_tools/amalgamate.py
  36. 238
      build_tools/benchmark_log_tool.py
  37. 811
      build_tools/build_detect_platform
  38. 48
      build_tools/check-sources.sh
  39. 22
      build_tools/dependencies_platform010.sh
  40. 3
      build_tools/dockerbuild.sh
  41. 181
      build_tools/error_filter.py
  42. 55
      build_tools/fb_compile_mongo.sh
  43. 175
      build_tools/fbcode_config.sh
  44. 175
      build_tools/fbcode_config_platform010.sh
  45. 203
      build_tools/format-diff.sh
  46. 7971
      build_tools/gnu_parallel
  47. 129
      build_tools/make_package.sh
  48. 38
      build_tools/ps_with_stack
  49. 396
      build_tools/regression_build_test.sh
  50. 493
      build_tools/run_ci_db_test.ps1
  51. 45
      build_tools/setup_centos7.sh
  52. 57
      build_tools/ubuntu20_image/Dockerfile
  53. 106
      build_tools/update_dependencies.sh
  54. 23
      build_tools/version.sh
  55. 158
      cache/cache.cc
  56. 20
      cache/cache_bench.cc
  57. 1034
      cache/cache_bench_tool.cc
  58. 104
      cache/cache_entry_roles.cc
  59. 20
      cache/cache_entry_roles.h
  60. 182
      cache/cache_entry_stats.h
  61. 40
      cache/cache_helpers.cc
  62. 139
      cache/cache_helpers.h
  63. 364
      cache/cache_key.cc
  64. 143
      cache/cache_key.h
  65. 184
      cache/cache_reservation_manager.cc
  66. 317
      cache/cache_reservation_manager.h
  67. 469
      cache/cache_reservation_manager_test.cc
  68. 1061
      cache/cache_test.cc
  69. 109
      cache/charged_cache.cc
  70. 59
      cache/charged_cache.h
  71. 1557
      cache/clock_cache.cc
  72. 756
      cache/clock_cache.h
  73. 318
      cache/compressed_secondary_cache.cc
  74. 140
      cache/compressed_secondary_cache.h
  75. 1076
      cache/compressed_secondary_cache_test.cc
  76. 723
      cache/lru_cache.cc
  77. 467
      cache/lru_cache.h
  78. 2615
      cache/lru_cache_test.cc
  79. 44
      cache/secondary_cache.cc
  80. 433
      cache/secondary_cache_adapter.cc
  81. 76
      cache/secondary_cache_adapter.h
  82. 137
      cache/sharded_cache.cc
  83. 309
      cache/sharded_cache.h
  84. 375
      cache/typed_cache.h
  85. 54
      cmake/RocksDBConfig.cmake.in
  86. 7
      cmake/modules/CxxFlags.cmake
  87. 29
      cmake/modules/FindJeMalloc.cmake
  88. 29
      cmake/modules/FindNUMA.cmake
  89. 29
      cmake/modules/FindSnappy.cmake
  90. 33
      cmake/modules/FindTBB.cmake
  91. 29
      cmake/modules/Findgflags.cmake
  92. 29
      cmake/modules/Findlz4.cmake
  93. 26
      cmake/modules/Finduring.cmake
  94. 29
      cmake/modules/Findzstd.cmake
  95. 10
      cmake/modules/ReadVersion.cmake
  96. 30
      common.mk
  97. 82
      coverage/coverage_test.sh
  98. 128
      coverage/parse_gcov_output.py
  99. 121
      crash_test.mk
  100. 165
      db/arena_wrapped_db_iter.cc
  101. Some files were not shown because too many files have changed in this diff Show More

@ -0,0 +1,892 @@
version: 2.1
orbs:
win: circleci/windows@5.0.0
commands:
install-cmake-on-macos:
steps:
- run:
name: Install cmake on macos
command: |
HOMEBREW_NO_AUTO_UPDATE=1 brew install cmake
install-jdk8-on-macos:
steps:
- run:
name: Install JDK 8 on macos
command: |
brew install --cask adoptopenjdk/openjdk/adoptopenjdk8
increase-max-open-files-on-macos:
steps:
- run:
name: Increase max open files
command: |
sudo sysctl -w kern.maxfiles=1048576
sudo sysctl -w kern.maxfilesperproc=1048576
sudo launchctl limit maxfiles 1048576
pre-steps:
steps:
- checkout
- run:
name: Setup Environment Variables
command: |
echo "export GTEST_THROW_ON_FAILURE=0" >> $BASH_ENV
echo "export GTEST_OUTPUT=\"xml:/tmp/test-results/\"" >> $BASH_ENV
echo "export SKIP_FORMAT_BUCK_CHECKS=1" >> $BASH_ENV
echo "export GTEST_COLOR=1" >> $BASH_ENV
echo "export CTEST_OUTPUT_ON_FAILURE=1" >> $BASH_ENV
echo "export CTEST_TEST_TIMEOUT=300" >> $BASH_ENV
echo "export ZLIB_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/zlib" >> $BASH_ENV
echo "export BZIP2_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/bzip2" >> $BASH_ENV
echo "export SNAPPY_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/snappy" >> $BASH_ENV
echo "export LZ4_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/lz4" >> $BASH_ENV
echo "export ZSTD_DOWNLOAD_BASE=https://rocksdb-deps.s3.us-west-2.amazonaws.com/pkgs/zstd" >> $BASH_ENV
windows-build-steps:
steps:
- checkout
- run:
name: "Install thirdparty dependencies"
command: |
echo "Installing CMake..."
choco install cmake --installargs 'ADD_CMAKE_TO_PATH=System' -y
mkdir $Env:THIRDPARTY_HOME
cd $Env:THIRDPARTY_HOME
echo "Building Snappy dependency..."
curl https://github.com/google/snappy/archive/refs/tags/1.1.8.zip -O snappy-1.1.8.zip
unzip -q snappy-1.1.8.zip
cd snappy-1.1.8
mkdir build
cd build
& $Env:CMAKE_BIN -G "$Env:CMAKE_GENERATOR" ..
msbuild.exe Snappy.sln -maxCpuCount -property:Configuration=Debug -property:Platform=x64
- run:
name: "Build RocksDB"
command: |
mkdir build
cd build
& $Env:CMAKE_BIN -G "$Env:CMAKE_GENERATOR" -DCMAKE_BUILD_TYPE=Debug -DOPTDBG=1 -DPORTABLE=1 -DSNAPPY=1 -DJNI=1 ..
cd ..
echo "Building with VS version: $Env:CMAKE_GENERATOR"
msbuild.exe build/rocksdb.sln -maxCpuCount -property:Configuration=Debug -property:Platform=x64
- run:
name: "Test RocksDB"
shell: powershell.exe
command: |
build_tools\run_ci_db_test.ps1 -SuiteRun arena_test,db_basic_test,db_test,db_test2,db_merge_operand_test,bloom_test,c_test,coding_test,crc32c_test,dynamic_bloom_test,env_basic_test,env_test,hash_test,random_test -Concurrency 16
pre-steps-macos:
steps:
- pre-steps
post-steps:
steps:
- store_test_results: # store test result if there's any
path: /tmp/test-results
- store_artifacts: # store LOG for debugging if there's any
path: LOG
- run: # on fail, compress Test Logs for diagnosing the issue
name: Compress Test Logs
command: tar -cvzf t.tar.gz t
when: on_fail
- store_artifacts: # on fail, store Test Logs for diagnosing the issue
path: t.tar.gz
destination: test_logs
when: on_fail
- run: # store core dumps if there's any
command: |
mkdir -p /tmp/core_dumps
cp core.* /tmp/core_dumps
when: on_fail
- store_artifacts:
path: /tmp/core_dumps
when: on_fail
upgrade-cmake:
steps:
- run:
name: Upgrade cmake
command: |
sudo apt remove --purge cmake
sudo snap install cmake --classic
install-gflags:
steps:
- run:
name: Install gflags
command: |
sudo apt-get update -y && sudo apt-get install -y libgflags-dev
install-gflags-on-macos:
steps:
- run:
name: Install gflags on macos
command: |
HOMEBREW_NO_AUTO_UPDATE=1 brew install gflags
setup-folly:
steps:
- run:
name: Checkout folly sources
command: |
make checkout_folly
build-folly:
steps:
- run:
name: Build folly and dependencies
command: |
make build_folly
build-for-benchmarks:
steps:
- pre-steps
- run:
name: "Linux build for benchmarks"
command: #sized for the resource-class rocksdb-benchmark-sys1
make V=1 J=8 -j8 release
perform-benchmarks:
steps:
- run:
name: "Test low-variance benchmarks"
command: ./tools/benchmark_ci.py --db_dir /tmp/rocksdb-benchmark-datadir --output_dir /tmp/benchmark-results --num_keys 20000000
environment:
LD_LIBRARY_PATH: /usr/local/lib
# How long to run parts of the test(s)
DURATION_RO: 300
DURATION_RW: 500
# Keep threads within physical capacity of server (much lower than default)
NUM_THREADS: 1
MAX_BACKGROUND_JOBS: 4
# Don't run a couple of "optional" initial tests
CI_TESTS_ONLY: "true"
# Reduce configured size of levels to ensure more levels in the leveled compaction LSM tree
WRITE_BUFFER_SIZE_MB: 16
TARGET_FILE_SIZE_BASE_MB: 16
MAX_BYTES_FOR_LEVEL_BASE_MB: 64
# The benchmark host has 32GB memory
# The following values are tailored to work with that
# Note, tests may not exercise the targeted issues if the memory is increased on new test hosts.
COMPRESSION_TYPE: "none"
CACHE_INDEX_AND_FILTER_BLOCKS: 1
MIN_LEVEL_TO_COMPRESS: 3
CACHE_SIZE_MB: 10240
MB_WRITE_PER_SEC: 2
post-benchmarks:
steps:
- store_artifacts: # store the benchmark output
path: /tmp/benchmark-results
destination: test_logs
- run:
name: Send benchmark report to visualisation
command: |
set +e
set +o pipefail
./build_tools/benchmark_log_tool.py --tsvfile /tmp/benchmark-results/report.tsv --esdocument https://search-rocksdb-bench-k2izhptfeap2hjfxteolsgsynm.us-west-2.es.amazonaws.com/bench_test3_rix/_doc
true
executors:
linux-docker:
docker:
# The image configuration is build_tools/ubuntu20_image/Dockerfile
# To update and build the image:
# $ cd build_tools/ubuntu20_image
# $ docker build -t zjay437/rocksdb:0.5 .
# $ docker push zjay437/rocksdb:0.5
# `zjay437` is the account name for zjay@meta.com which readwrite token is shared internally. To login:
# $ docker login --username zjay437
# Or please feel free to change it to your docker hub account for hosting the image, meta employee should already have the account and able to login with SSO.
# To avoid impacting the existing CI runs, please bump the version every time creating a new image
# to run the CI image environment locally:
# $ docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -it zjay437/rocksdb:0.5 bash
# option `--cap-add=SYS_PTRACE --security-opt seccomp=unconfined` is used to enable gdb to attach an existing process
- image: zjay437/rocksdb:0.6
jobs:
build-macos:
macos:
xcode: 12.5.1
resource_class: large
environment:
ROCKSDB_DISABLE_JEMALLOC: 1 # jemalloc cause env_test hang, disable it for now
steps:
- increase-max-open-files-on-macos
- install-gflags-on-macos
- pre-steps-macos
- run: ulimit -S -n `ulimit -H -n` && OPT=-DCIRCLECI make V=1 J=32 -j32 all
- post-steps
build-macos-cmake:
macos:
xcode: 12.5.1
resource_class: large
parameters:
run_even_tests:
description: run even or odd tests, used to split tests to 2 groups
type: boolean
default: true
steps:
- increase-max-open-files-on-macos
- install-cmake-on-macos
- install-gflags-on-macos
- pre-steps-macos
- run:
name: "cmake generate project file"
command: ulimit -S -n `ulimit -H -n` && mkdir build && cd build && cmake -DWITH_GFLAGS=1 ..
- run:
name: "Build tests"
command: cd build && make V=1 -j32
- when:
condition: << parameters.run_even_tests >>
steps:
- run:
name: "Run even tests"
command: ulimit -S -n `ulimit -H -n` && cd build && ctest -j32 -I 0,,2
- when:
condition:
not: << parameters.run_even_tests >>
steps:
- run:
name: "Run odd tests"
command: ulimit -S -n `ulimit -H -n` && cd build && ctest -j32 -I 1,,2
- post-steps
build-linux:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- run: make V=1 J=32 -j32 check
- post-steps
build-linux-encrypted_env-no_compression:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- run: ENCRYPTED_ENV=1 ROCKSDB_DISABLE_SNAPPY=1 ROCKSDB_DISABLE_ZLIB=1 ROCKSDB_DISABLE_BZIP=1 ROCKSDB_DISABLE_LZ4=1 ROCKSDB_DISABLE_ZSTD=1 make V=1 J=32 -j32 check
- run: |
./sst_dump --help | grep -E -q 'Supported compression types: kNoCompression$' # Verify no compiled in compression
- post-steps
build-linux-static_lib-alt_namespace-status_checked:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- run: ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 ROCKSDB_MODIFY_NPHASH=1 LIB_MODE=static OPT="-DROCKSDB_NAMESPACE=alternative_rocksdb_ns" make V=1 -j24 check
- post-steps
build-linux-release:
executor: linux-docker
resource_class: 2xlarge
steps:
- checkout # check out the code in the project directory
- run: make V=1 -j32 LIB_MODE=shared release
- run: ls librocksdb.so # ensure shared lib built
- run: ./db_stress --version # ensure with gflags
- run: make clean
- run: make V=1 -j32 release
- run: ls librocksdb.a # ensure static lib built
- run: ./db_stress --version # ensure with gflags
- run: make clean
- run: apt-get remove -y libgflags-dev
- run: make V=1 -j32 LIB_MODE=shared release
- run: ls librocksdb.so # ensure shared lib built
- run: if ./db_stress --version; then false; else true; fi # ensure without gflags
- run: make clean
- run: make V=1 -j32 release
- run: ls librocksdb.a # ensure static lib built
- run: if ./db_stress --version; then false; else true; fi # ensure without gflags
- post-steps
build-linux-release-rtti:
executor: linux-docker
resource_class: xlarge
steps:
- checkout # check out the code in the project directory
- run: USE_RTTI=1 DEBUG_LEVEL=0 make V=1 -j16 static_lib tools db_bench
- run: ./db_stress --version # ensure with gflags
- run: make clean
- run: apt-get remove -y libgflags-dev
- run: USE_RTTI=1 DEBUG_LEVEL=0 make V=1 -j16 static_lib tools db_bench
- run: if ./db_stress --version; then false; else true; fi # ensure without gflags
build-linux-clang-no_test_run:
executor: linux-docker
resource_class: xlarge
steps:
- checkout # check out the code in the project directory
- run: CC=clang CXX=clang++ USE_CLANG=1 PORTABLE=1 make V=1 -j16 all
- post-steps
build-linux-clang10-asan:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- run: COMPILE_WITH_ASAN=1 CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 check # aligned new doesn't work for reason we haven't figured out
- post-steps
build-linux-clang10-mini-tsan:
executor: linux-docker
resource_class: 2xlarge+
steps:
- pre-steps
- run: COMPILE_WITH_TSAN=1 CC=clang-13 CXX=clang++-13 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 check
- post-steps
build-linux-clang10-ubsan:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- run: COMPILE_WITH_UBSAN=1 OPT="-fsanitize-blacklist=.circleci/ubsan_suppression_list.txt" CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 USE_CLANG=1 make V=1 -j32 ubsan_check # aligned new doesn't work for reason we haven't figured out
- post-steps
build-linux-valgrind:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- run: PORTABLE=1 make V=1 -j32 valgrind_test
- post-steps
build-linux-clang10-clang-analyze:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- run: CC=clang-10 CXX=clang++-10 ROCKSDB_DISABLE_ALIGNED_NEW=1 CLANG_ANALYZER="/usr/bin/clang++-10" CLANG_SCAN_BUILD=scan-build-10 USE_CLANG=1 make V=1 -j32 analyze # aligned new doesn't work for reason we haven't figured out. For unknown, reason passing "clang++-10" as CLANG_ANALYZER doesn't work, and we need a full path.
- post-steps
- run:
name: "compress test report"
command: tar -cvzf scan_build_report.tar.gz scan_build_report
when: on_fail
- store_artifacts:
path: scan_build_report.tar.gz
destination: scan_build_report
when: on_fail
build-linux-runner:
machine: true
resource_class: facebook/rocksdb-benchmark-sys1
steps:
- pre-steps
- run:
name: "Checked Linux build (Runner)"
command: make V=1 J=8 -j8 check
environment:
LD_LIBRARY_PATH: /usr/local/lib
- post-steps
build-linux-cmake-with-folly:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- setup-folly
- build-folly
- run: (mkdir build && cd build && cmake -DUSE_FOLLY=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make V=1 -j20 && ctest -j20)
- post-steps
build-linux-cmake-with-folly-lite-no-test:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- setup-folly
- run: (mkdir build && cd build && cmake -DUSE_FOLLY_LITE=1 -DWITH_GFLAGS=1 .. && make V=1 -j20)
- post-steps
build-linux-cmake-with-benchmark:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- run: mkdir build && cd build && cmake -DWITH_GFLAGS=1 -DWITH_BENCHMARK=1 .. && make V=1 -j20 && ctest -j20
- post-steps
build-linux-unity-and-headers:
docker: # executor type
- image: gcc:latest
environment:
EXTRA_CXXFLAGS: -mno-avx512f # Warnings-as-error in avx512fintrin.h, would be used on newer hardware
resource_class: large
steps:
- checkout # check out the code in the project directory
- run: apt-get update -y && apt-get install -y libgflags-dev
- run:
name: "Unity build"
command: make V=1 -j8 unity_test
no_output_timeout: 20m
- run: make V=1 -j8 -k check-headers # could be moved to a different build
- post-steps
build-linux-gcc-7-with-folly:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- setup-folly
- build-folly
- run: USE_FOLLY=1 LIB_MODE=static CC=gcc-7 CXX=g++-7 V=1 make -j32 check # TODO: LIB_MODE only to work around unresolved linker failures
- post-steps
build-linux-gcc-7-with-folly-lite-no-test:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- setup-folly
- run: USE_FOLLY_LITE=1 CC=gcc-7 CXX=g++-7 V=1 make -j32 all
- post-steps
build-linux-gcc-8-no_test_run:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- run: CC=gcc-8 CXX=g++-8 V=1 make -j32 all
- post-steps
build-linux-cmake-with-folly-coroutines:
executor: linux-docker
resource_class: 2xlarge
environment:
CC: gcc-10
CXX: g++-10
steps:
- pre-steps
- setup-folly
- build-folly
- run: (mkdir build && cd build && cmake -DUSE_COROUTINES=1 -DWITH_GFLAGS=1 -DROCKSDB_BUILD_SHARED=0 .. && make V=1 -j20 && ctest -j20)
- post-steps
build-linux-gcc-10-cxx20-no_test_run:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- run: CC=gcc-10 CXX=g++-10 V=1 ROCKSDB_CXX_STANDARD=c++20 make -j32 all
- post-steps
build-linux-gcc-11-no_test_run:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- run: LIB_MODE=static CC=gcc-11 CXX=g++-11 V=1 make -j32 all microbench # TODO: LIB_MODE only to work around unresolved linker failures
- post-steps
build-linux-clang-13-no_test_run:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 make -j32 all microbench
- post-steps
# Ensure ASAN+UBSAN with folly, and full testsuite with clang 13
build-linux-clang-13-asan-ubsan-with-folly:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- setup-folly
- build-folly
- run: CC=clang-13 CXX=clang++-13 LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check # TODO: LIB_MODE only to work around unresolved linker failures
- post-steps
# This job is only to make sure the microbench tests are able to run, the benchmark result is not meaningful as the CI host is changing.
build-linux-run-microbench:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- run: DEBUG_LEVEL=0 make -j32 run_microbench
- post-steps
build-linux-mini-crashtest:
executor: linux-docker
resource_class: large
steps:
- pre-steps
- run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=960 --max_key=2500000 --use_io_uring=0' blackbox_crash_test_with_atomic_flush
- post-steps
build-linux-crashtest-tiered-storage-bb:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- run:
name: "run crashtest"
command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS='--duration=10800 --use_io_uring=0' blackbox_crash_test_with_tiered_storage
no_output_timeout: 100m
- post-steps
build-linux-crashtest-tiered-storage-wb:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- run:
name: "run crashtest"
command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS='--duration=10800 --use_io_uring=0' whitebox_crash_test_with_tiered_storage
no_output_timeout: 100m
- post-steps
build-windows-vs2022:
executor:
name: win/server-2022
size: 2xlarge
environment:
THIRDPARTY_HOME: C:/Users/circleci/thirdparty
CMAKE_HOME: C:/Program Files/CMake
CMAKE_BIN: C:/Program Files/CMake/bin/cmake.exe
SNAPPY_HOME: C:/Users/circleci/thirdparty/snappy-1.1.8
SNAPPY_INCLUDE: C:/Users/circleci/thirdparty/snappy-1.1.8;C:/Users/circleci/thirdparty/snappy-1.1.8/build
SNAPPY_LIB_DEBUG: C:/Users/circleci/thirdparty/snappy-1.1.8/build/Debug/snappy.lib
CMAKE_GENERATOR: Visual Studio 17 2022
steps:
- windows-build-steps
build-windows-vs2019:
executor:
name: win/server-2019
size: 2xlarge
environment:
THIRDPARTY_HOME: C:/Users/circleci/thirdparty
CMAKE_HOME: C:/Program Files/CMake
CMAKE_BIN: C:/Program Files/CMake/bin/cmake.exe
SNAPPY_HOME: C:/Users/circleci/thirdparty/snappy-1.1.8
SNAPPY_INCLUDE: C:/Users/circleci/thirdparty/snappy-1.1.8;C:/Users/circleci/thirdparty/snappy-1.1.8/build
SNAPPY_LIB_DEBUG: C:/Users/circleci/thirdparty/snappy-1.1.8/build/Debug/snappy.lib
CMAKE_GENERATOR: Visual Studio 16 2019
steps:
- windows-build-steps
build-linux-java:
executor: linux-docker
resource_class: large
steps:
- pre-steps
- run:
name: "Set Java Environment"
command: |
echo "JAVA_HOME=${JAVA_HOME}"
echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV
which java && java -version
which javac && javac -version
- run:
name: "Test RocksDBJava"
command: make V=1 J=8 -j8 jtest
- post-steps
build-linux-java-static:
executor: linux-docker
resource_class: large
steps:
- pre-steps
- run:
name: "Set Java Environment"
command: |
echo "JAVA_HOME=${JAVA_HOME}"
echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV
which java && java -version
which javac && javac -version
- run:
name: "Build RocksDBJava Static Library"
command: make V=1 J=8 -j8 rocksdbjavastatic
- post-steps
build-macos-java:
macos:
xcode: 12.5.1
resource_class: large
environment:
JAVA_HOME: /Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home
ROCKSDB_DISABLE_JEMALLOC: 1 # jemalloc causes java 8 crash
steps:
- increase-max-open-files-on-macos
- install-gflags-on-macos
- install-jdk8-on-macos
- pre-steps-macos
- run:
name: "Set Java Environment"
command: |
echo "JAVA_HOME=${JAVA_HOME}"
echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV
which java && java -version
which javac && javac -version
- run:
name: "Test RocksDBJava"
command: make V=1 J=16 -j16 jtest
no_output_timeout: 20m
- post-steps
build-macos-java-static:
macos:
xcode: 12.5.1
resource_class: large
environment:
JAVA_HOME: /Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home
steps:
- increase-max-open-files-on-macos
- install-gflags-on-macos
- install-cmake-on-macos
- install-jdk8-on-macos
- pre-steps-macos
- run:
name: "Set Java Environment"
command: |
echo "JAVA_HOME=${JAVA_HOME}"
echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV
which java && java -version
which javac && javac -version
- run:
name: "Build RocksDBJava x86 and ARM Static Libraries"
command: make V=1 J=16 -j16 rocksdbjavastaticosx
no_output_timeout: 20m
- post-steps
build-macos-java-static-universal:
macos:
xcode: 12.5.1
resource_class: large
environment:
JAVA_HOME: /Library/Java/JavaVirtualMachines/adoptopenjdk-8.jdk/Contents/Home
steps:
- increase-max-open-files-on-macos
- install-gflags-on-macos
- install-cmake-on-macos
- install-jdk8-on-macos
- pre-steps-macos
- run:
name: "Set Java Environment"
command: |
echo "JAVA_HOME=${JAVA_HOME}"
echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV
which java && java -version
which javac && javac -version
- run:
name: "Build RocksDBJava Universal Binary Static Library"
command: make V=1 J=16 -j16 rocksdbjavastaticosx_ub
no_output_timeout: 20m
- post-steps
build-examples:
executor: linux-docker
resource_class: large
steps:
- pre-steps
- run:
name: "Build examples"
command: |
make V=1 -j4 static_lib && cd examples && make V=1 -j4
- post-steps
build-cmake-mingw:
executor: linux-docker
resource_class: large
steps:
- pre-steps
- run: update-alternatives --set x86_64-w64-mingw32-g++ /usr/bin/x86_64-w64-mingw32-g++-posix
- run:
name: "Build cmake-mingw"
command: |
export PATH=$JAVA_HOME/bin:$PATH
echo "JAVA_HOME=${JAVA_HOME}"
which java && java -version
which javac && javac -version
mkdir build && cd build && cmake -DJNI=1 -DWITH_GFLAGS=OFF .. -DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc -DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++ -DCMAKE_SYSTEM_NAME=Windows && make -j4 rocksdb rocksdbjni
- post-steps
build-linux-non-shm:
executor: linux-docker
resource_class: 2xlarge
environment:
TEST_TMPDIR: /tmp/rocksdb_test_tmp
steps:
- pre-steps
- run: make V=1 -j32 check
- post-steps
build-linux-arm-test-full:
machine:
image: ubuntu-2004:202111-02
resource_class: arm.large
steps:
- pre-steps
- install-gflags
- run: make V=1 J=4 -j4 check
- post-steps
build-linux-arm:
machine:
image: ubuntu-2004:202111-02
resource_class: arm.large
steps:
- pre-steps
- install-gflags
- run: ROCKSDBTESTS_PLATFORM_DEPENDENT=only make V=1 J=4 -j4 all_but_some_tests check_some
- post-steps
build-linux-arm-cmake-no_test_run:
machine:
image: ubuntu-2004:202111-02
resource_class: arm.large
environment:
JAVA_HOME: /usr/lib/jvm/java-8-openjdk-arm64
steps:
- pre-steps
- install-gflags
- run:
name: "Set Java Environment"
command: |
echo "JAVA_HOME=${JAVA_HOME}"
echo 'export PATH=$JAVA_HOME/bin:$PATH' >> $BASH_ENV
which java && java -version
which javac && javac -version
- run:
name: "Build with cmake"
command: |
mkdir build
cd build
cmake -DCMAKE_BUILD_TYPE=Release -DWITH_TESTS=0 -DWITH_GFLAGS=1 -DWITH_BENCHMARK_TOOLS=0 -DWITH_TOOLS=0 -DWITH_CORE_TOOLS=1 ..
make -j4
- run:
name: "Build Java with cmake"
command: |
rm -rf build
mkdir build
cd build
cmake -DJNI=1 -DCMAKE_BUILD_TYPE=Release -DWITH_GFLAGS=1 ..
make -j4 rocksdb rocksdbjni
- post-steps
build-format-compatible:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- run:
name: "test"
command: |
export TEST_TMPDIR=/dev/shm/rocksdb
rm -rf /dev/shm/rocksdb
mkdir /dev/shm/rocksdb
tools/check_format_compatible.sh
- post-steps
build-fuzzers:
executor: linux-docker
resource_class: large
steps:
- pre-steps
- run:
name: "Build rocksdb lib"
command: CC=clang-13 CXX=clang++-13 USE_CLANG=1 make -j4 static_lib
- run:
name: "Build fuzzers"
command: cd fuzz && make sst_file_writer_fuzzer db_fuzzer db_map_fuzzer
- post-steps
benchmark-linux: #use a private Circle CI runner (resource_class) to run the job
machine: true
resource_class: facebook/rocksdb-benchmark-sys1
steps:
- build-for-benchmarks
- perform-benchmarks
- post-benchmarks
workflows:
version: 2
jobs-linux-run-tests:
jobs:
- build-linux
- build-linux-cmake-with-folly
- build-linux-cmake-with-folly-lite-no-test
- build-linux-gcc-7-with-folly
- build-linux-gcc-7-with-folly-lite-no-test
- build-linux-cmake-with-folly-coroutines
- build-linux-cmake-with-benchmark
- build-linux-encrypted_env-no_compression
jobs-linux-run-tests-san:
jobs:
- build-linux-clang10-asan
- build-linux-clang10-ubsan
- build-linux-clang10-mini-tsan
- build-linux-static_lib-alt_namespace-status_checked
jobs-linux-no-test-run:
jobs:
- build-linux-release
- build-linux-release-rtti
- build-examples
- build-fuzzers
- build-linux-clang-no_test_run
- build-linux-clang-13-no_test_run
- build-linux-gcc-8-no_test_run
- build-linux-gcc-10-cxx20-no_test_run
- build-linux-gcc-11-no_test_run
- build-linux-arm-cmake-no_test_run
jobs-linux-other-checks:
jobs:
- build-linux-clang10-clang-analyze
- build-linux-unity-and-headers
- build-linux-mini-crashtest
jobs-windows:
jobs:
- build-windows-vs2022
- build-windows-vs2019
- build-cmake-mingw
jobs-java:
jobs:
- build-linux-java
- build-linux-java-static
- build-macos-java
- build-macos-java-static
- build-macos-java-static-universal
jobs-macos:
jobs:
- build-macos
- build-macos-cmake:
run_even_tests: true
- build-macos-cmake:
run_even_tests: false
jobs-linux-arm:
jobs:
- build-linux-arm
build-fuzzers:
jobs:
- build-fuzzers
benchmark-linux:
triggers:
- schedule:
cron: "0 * * * *"
filters:
branches:
only:
- main
jobs:
- benchmark-linux
nightly:
triggers:
- schedule:
cron: "0 9 * * *"
filters:
branches:
only:
- main
jobs:
- build-format-compatible
- build-linux-arm-test-full
- build-linux-run-microbench
- build-linux-non-shm
- build-linux-clang-13-asan-ubsan-with-folly
- build-linux-valgrind

@ -0,0 +1,6 @@
# Supress UBSAN warnings related to stl_tree.h, e.g.
# UndefinedBehaviorSanitizer: undefined-behavior /usr/bin/../lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/bits/stl_tree.h:1505:43 in
# /usr/bin/../lib/gcc/x86_64-linux-gnu/5.4.0/../../../../include/c++/5.4.0/bits/stl_tree.h:1505:43:
# runtime error: upcast of address 0x000001fa8820 with insufficient space for an object of type
# 'std::_Rb_tree_node<std::pair<const std::__cxx11::basic_string<char>, rocksdb::(anonymous namespace)::LockHoldingInfo> >'
src:*bits/stl_tree.h

@ -0,0 +1,5 @@
# Complete list of style options can be found at:
# http://clang.llvm.org/docs/ClangFormatStyleOptions.html
---
BasedOnStyle: Google
...

@ -0,0 +1,45 @@
name: Check buck targets and code format
on: [push, pull_request]
permissions:
contents: read
jobs:
check:
name: Check TARGETS file and code format
runs-on: ubuntu-latest
steps:
- name: Checkout feature branch
uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Fetch from upstream
run: |
git remote add upstream https://github.com/facebook/rocksdb.git && git fetch upstream
- name: Where am I
run: |
echo git status && git status
echo "git remote -v" && git remote -v
echo git branch && git branch
- name: Setup Python
uses: actions/setup-python@v1
- name: Install Dependencies
run: python -m pip install --upgrade pip
- name: Install argparse
run: pip install argparse
- name: Download clang-format-diff.py
run: wget https://raw.githubusercontent.com/llvm/llvm-project/release/12.x/clang/tools/clang-format/clang-format-diff.py
- name: Check format
run: VERBOSE_CHECK=1 make check-format
- name: Compare buckify output
run: make check-buck-targets
- name: Simple source code checks
run: make check-sources

99
.gitignore vendored

@ -0,0 +1,99 @@
make_config.mk
rocksdb.pc
*.a
*.arc
*.d
*.dylib*
*.gcda
*.gcno
*.o
*.o.tmp
*.so
*.so.*
*_test
*_bench
*_stress
*.out
*.class
*.jar
*.*jnilib*
*.d-e
*.o-*
*.swp
*~
*.vcxproj
*.vcxproj.filters
*.sln
*.cmake
.watchmanconfig
CMakeCache.txt
CMakeFiles/
build/
ldb
manifest_dump
sst_dump
blob_dump
block_cache_trace_analyzer
tools/block_cache_analyzer/*.pyc
column_aware_encoding_exp
util/build_version.cc
build_tools/VALGRIND_LOGS/
coverage/COVERAGE_REPORT
.gdbhistory
.gdb_history
package/
unity.a
tags
etags
rocksdb_dump
rocksdb_undump
db_test2
trace_analyzer
block_cache_trace_analyzer
io_tracer_parser
.DS_Store
.vs
.vscode
.clangd
java/out
java/target
java/test-libs
java/*.log
java/include/org_rocksdb_*.h
.idea/
*.iml
rocksdb.cc
rocksdb.h
unity.cc
java/crossbuild/.vagrant
.vagrant/
java/**/*.asc
java/javadoc
scan_build_report/
t
LOG
db_logs/
tp2/
fbcode/
fbcode
buckifier/*.pyc
buckifier/__pycache__
compile_commands.json
clang-format-diff.py
.py3/
fuzz/proto/gen/
fuzz/crash-*
cmake-build-*
third-party/folly/
.cache
*.sublime-*

@ -0,0 +1,4 @@
extraction:
cpp:
index:
build_command: make static_lib

@ -0,0 +1,6 @@
{
"content_hash_warming": true,
"content_hash_max_items": 333333,
"hint_num_files_per_dir": 8,
"fsevents_latency": 0.05
}

@ -0,0 +1,12 @@
Facebook Inc.
Facebook Engineering Team
Google Inc.
# Initial version authors:
Jeffrey Dean <jeff@google.com>
Sanjay Ghemawat <sanjay@google.com>
# Partial list of contributors:
Kevin Regan <kevin.d.regan@gmail.com>
Johan Bilien <jobi@litl.com>
Matthew Von-Maszewski <https://github.com/matthewvon> (Basho Technologies)

File diff suppressed because it is too large Load Diff

@ -0,0 +1,77 @@
# Code of Conduct
## Our Pledge
In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to make participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, sex characteristics, gender identity and expression,
level of experience, education, socio-economic status, nationality, personal
appearance, race, religion, or sexual identity and orientation.
## Our Standards
Examples of behavior that contributes to creating a positive environment
include:
* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members
Examples of unacceptable behavior by participants include:
* The use of sexualized language or imagery and unwelcome sexual attention or
advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Our Responsibilities
Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.
Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.
## Scope
This Code of Conduct applies within all project spaces, and it also applies when
an individual is representing the project or its community in public spaces.
Examples of representing a project or community include using an official
project e-mail address, posting via an official social media account, or acting
as an appointed representative at an online or offline event. Representation of
a project may be further defined and clarified by project maintainers.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at <opensource-conduct@fb.com>. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.
Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
[homepage]: https://www.contributor-covenant.org
For answers to common questions about this code of conduct, see
https://www.contributor-covenant.org/faq

@ -0,0 +1,17 @@
# Contributing to RocksDB
## Code of Conduct
The code of conduct is described in [`CODE_OF_CONDUCT.md`](CODE_OF_CONDUCT.md)
## Contributor License Agreement ("CLA")
In order to accept your pull request, we need you to submit a CLA. You
only need to do this once, so if you've done this for another Facebook
open source project, you're good to go. If you are submitting a pull
request for the first time, just let us know that you have completed
the CLA and we can cross-check with your GitHub username.
Complete your CLA here: <https://code.facebook.com/cla>
If you prefer to sign a paper copy, we can send you a PDF. Send us an
e-mail or create a new github issue to request the CLA in PDF format.

@ -0,0 +1,339 @@
GNU GENERAL PUBLIC LICENSE
Version 2, June 1991
Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
License is intended to guarantee your freedom to share and change free
software--to make sure the software is free for all its users. This
General Public License applies to most of the Free Software
Foundation's software and to any other program whose authors commit to
using it. (Some other Free Software Foundation software is covered by
the GNU Lesser General Public License instead.) You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
this service if you wish), that you receive source code or can get it
if you want it, that you can change the software or use pieces of it
in new free programs; and that you know you can do these things.
To protect your rights, we need to make restrictions that forbid
anyone to deny you these rights or to ask you to surrender the rights.
These restrictions translate to certain responsibilities for you if you
distribute copies of the software, or if you modify it.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must give the recipients all the rights that
you have. You must make sure that they, too, receive or can get the
source code. And you must show them these terms so they know their
rights.
We protect your rights with two steps: (1) copyright the software, and
(2) offer you this license which gives you legal permission to copy,
distribute and/or modify the software.
Also, for each author's protection and ours, we want to make certain
that everyone understands that there is no warranty for this free
software. If the software is modified by someone else and passed on, we
want its recipients to know that what they have is not the original, so
that any problems introduced by others will not reflect on the original
authors' reputations.
Finally, any free program is threatened constantly by software
patents. We wish to avoid the danger that redistributors of a free
program will individually obtain patent licenses, in effect making the
program proprietary. To prevent this, we have made it clear that any
patent must be licensed for everyone's free use or not licensed at all.
The precise terms and conditions for copying, distribution and
modification follow.
GNU GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License applies to any program or other work which contains
a notice placed by the copyright holder saying it may be distributed
under the terms of this General Public License. The "Program", below,
refers to any such program or work, and a "work based on the Program"
means either the Program or any derivative work under copyright law:
that is to say, a work containing the Program or a portion of it,
either verbatim or with modifications and/or translated into another
language. (Hereinafter, translation is included without limitation in
the term "modification".) Each licensee is addressed as "you".
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of
running the Program is not restricted, and the output from the Program
is covered only if its contents constitute a work based on the
Program (independent of having been made by running the Program).
Whether that is true depends on what the Program does.
1. You may copy and distribute verbatim copies of the Program's
source code as you receive it, in any medium, provided that you
conspicuously and appropriately publish on each copy an appropriate
copyright notice and disclaimer of warranty; keep intact all the
notices that refer to this License and to the absence of any warranty;
and give any other recipients of the Program a copy of this License
along with the Program.
You may charge a fee for the physical act of transferring a copy, and
you may at your option offer warranty protection in exchange for a fee.
2. You may modify your copy or copies of the Program or any portion
of it, thus forming a work based on the Program, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:
a) You must cause the modified files to carry prominent notices
stating that you changed the files and the date of any change.
b) You must cause any work that you distribute or publish, that in
whole or in part contains or is derived from the Program or any
part thereof, to be licensed as a whole at no charge to all third
parties under the terms of this License.
c) If the modified program normally reads commands interactively
when run, you must cause it, when started running for such
interactive use in the most ordinary way, to print or display an
announcement including an appropriate copyright notice and a
notice that there is no warranty (or else, saying that you provide
a warranty) and that users may redistribute the program under
these conditions, and telling the user how to view a copy of this
License. (Exception: if the Program itself is interactive but
does not normally print such an announcement, your work based on
the Program is not required to print an announcement.)
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Program,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works. But when you
distribute the same sections as part of a whole which is a work based
on the Program, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Program.
In addition, mere aggregation of another work not based on the Program
with the Program (or with a work based on the Program) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.
3. You may copy and distribute the Program (or a work based on it,
under Section 2) in object code or executable form under the terms of
Sections 1 and 2 above provided that you also do one of the following:
a) Accompany it with the complete corresponding machine-readable
source code, which must be distributed under the terms of Sections
1 and 2 above on a medium customarily used for software interchange; or,
b) Accompany it with a written offer, valid for at least three
years, to give any third party, for a charge no more than your
cost of physically performing source distribution, a complete
machine-readable copy of the corresponding source code, to be
distributed under the terms of Sections 1 and 2 above on a medium
customarily used for software interchange; or,
c) Accompany it with the information you received as to the offer
to distribute corresponding source code. (This alternative is
allowed only for noncommercial distribution and only if you
received the program in object code or executable form with such
an offer, in accord with Subsection b above.)
The source code for a work means the preferred form of the work for
making modifications to it. For an executable work, complete source
code means all the source code for all modules it contains, plus any
associated interface definition files, plus the scripts used to
control compilation and installation of the executable. However, as a
special exception, the source code distributed need not include
anything that is normally distributed (in either source or binary
form) with the major components (compiler, kernel, and so on) of the
operating system on which the executable runs, unless that component
itself accompanies the executable.
If distribution of executable or object code is made by offering
access to copy from a designated place, then offering equivalent
access to copy the source code from the same place counts as
distribution of the source code, even though third parties are not
compelled to copy the source along with the object code.
4. You may not copy, modify, sublicense, or distribute the Program
except as expressly provided under this License. Any attempt
otherwise to copy, modify, sublicense or distribute the Program is
void, and will automatically terminate your rights under this License.
However, parties who have received copies, or rights, from you under
this License will not have their licenses terminated so long as such
parties remain in full compliance.
5. You are not required to accept this License, since you have not
signed it. However, nothing else grants you permission to modify or
distribute the Program or its derivative works. These actions are
prohibited by law if you do not accept this License. Therefore, by
modifying or distributing the Program (or any work based on the
Program), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Program or works based on it.
6. Each time you redistribute the Program (or any work based on the
Program), the recipient automatically receives a license from the
original licensor to copy, distribute or modify the Program subject to
these terms and conditions. You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties to
this License.
7. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Program at all. For example, if a patent
license would not permit royalty-free redistribution of the Program by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Program.
If any portion of this section is held invalid or unenforceable under
any particular circumstance, the balance of the section is intended to
apply and the section as a whole is intended to apply in other
circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system, which is
implemented by public license practices. Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
8. If the distribution and/or use of the Program is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Program under this License
may add an explicit geographical distribution limitation excluding
those countries, so that distribution is permitted only in or among
countries not thus excluded. In such case, this License incorporates
the limitation as if written in the body of this License.
9. The Free Software Foundation may publish revised and/or new versions
of the General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the Program
specifies a version number of this License which applies to it and "any
later version", you have the option of following the terms and conditions
either of that version or of any later version published by the Free
Software Foundation. If the Program does not specify a version number of
this License, you may choose any version ever published by the Free Software
Foundation.
10. If you wish to incorporate parts of the Program into other free
programs whose distribution conditions are different, write to the author
to ask for permission. For software which is copyrighted by the Free
Software Foundation, write to the Free Software Foundation; we sometimes
make exceptions for this. Our decision will be guided by the two goals
of preserving the free status of all derivatives of our free software and
of promoting the sharing and reuse of software generally.
NO WARRANTY
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
REPAIR OR CORRECTION.
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGES.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
Also add information on how to contact you by electronic and paper mail.
If the program is interactive, make it output a short notice like this
when it starts in an interactive mode:
Gnomovision version 69, Copyright (C) year name of author
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, the commands you use may
be called something other than `show w' and `show c'; they could even be
mouse-clicks or menu items--whatever suits your program.
You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the program, if
necessary. Here is a sample; alter the names:
Yoyodyne, Inc., hereby disclaims all copyright interest in the program
`Gnomovision' (which makes passes at compilers) written by James Hacker.
<signature of Ty Coon>, 1 April 1989
Ty Coon, President of Vice
This General Public License does not permit incorporating your program into
proprietary programs. If your program is a subroutine library, you may
consider it more useful to permit linking proprietary applications with the
library. If this is what you want to do, use the GNU Lesser General
Public License instead of this License.

@ -0,0 +1,24 @@
# RocksDB default options change log (NO LONGER MAINTAINED)
## Unreleased
* delayed_write_rate takes the rate given by rate_limiter if not specified.
## 5.2
* Change the default of delayed slowdown value to 16MB/s and further increase the L0 stop condition to 36 files.
## 5.0 (11/17/2016)
* Options::allow_concurrent_memtable_write and Options::enable_write_thread_adaptive_yield are now true by default
* Options.level0_stop_writes_trigger default value changes from 24 to 32.
## 4.8.0 (5/2/2016)
* options.max_open_files changes from 5000 to -1. It improves performance, but users need to set file descriptor limit to be large enough and watch memory usage for index and bloom filters.
* options.base_background_compactions changes from max_background_compactions to 1. When users set higher max_background_compactions but the write throughput is not high, the writes are less spiky to disks.
* options.wal_recovery_mode changes from kTolerateCorruptedTailRecords to kPointInTimeRecovery. Avoid some false positive when file system or hardware reorder the writes for file data and metadata.
## 4.7.0 (4/8/2016)
* options.write_buffer_size changes from 4MB to 64MB.
* options.target_file_size_base changes from 2MB to 64MB.
* options.max_bytes_for_level_base changes from 10MB to 256MB.
* options.soft_pending_compaction_bytes_limit changes from 0 (disabled) to 64GB.
* options.hard_pending_compaction_bytes_limit changes from 0 (disabled) to 256GB.
* table_cache_numshardbits changes from 4 to 6.
* max_file_opening_threads changes from 1 to 16.

@ -0,0 +1,16 @@
## RocksDB dump format
The version 1 RocksDB dump format is fairly simple:
1) The dump starts with the magic 8 byte identifier "ROCKDUMP"
2) The magic is followed by an 8 byte big-endian version which is 0x00000001.
3) Next are arbitrarily sized chunks of bytes prepended by 4 byte little endian number indicating how large each chunk is.
4) The first chunk is special and is a json string indicating some things about the creation of this dump. It contains the following keys:
* database-path: The path of the database this dump was created from.
* hostname: The hostname of the machine where the dump was created.
* creation-time: Unix seconds since epoc when this dump was created.
5) Following the info dump the slices paired into are key/value pairs.

File diff suppressed because it is too large Load Diff

@ -0,0 +1,220 @@
## Compilation
**Important**: If you plan to run RocksDB in production, don't compile using default
`make` or `make all`. That will compile RocksDB in debug mode, which is much slower
than release mode.
RocksDB's library should be able to compile without any dependency installed,
although we recommend installing some compression libraries (see below).
We do depend on newer gcc/clang with C++17 support (GCC >= 7, Clang >= 5).
There are few options when compiling RocksDB:
* [recommended] `make static_lib` will compile librocksdb.a, RocksDB static library. Compiles static library in release mode.
* `make shared_lib` will compile librocksdb.so, RocksDB shared library. Compiles shared library in release mode.
* `make check` will compile and run all the unit tests. `make check` will compile RocksDB in debug mode.
* `make all` will compile our static library, and all our tools and unit tests. Our tools
depend on gflags. You will need to have gflags installed to run `make all`. This will compile RocksDB in debug mode. Don't
use binaries compiled by `make all` in production.
* By default the binary we produce is optimized for the CPU you're compiling on
(`-march=native` or the equivalent). To build a binary compatible with the most
general architecture supported by your CPU and compiler, set `PORTABLE=1` for
the build, but performance will suffer as many operations benefit from newer
and wider instructions. In addition to `PORTABLE=0` (default) and `PORTABLE=1`,
it can be set to an architecture name recognized by your compiler. For example,
on 64-bit x86, a reasonable compromise is `PORTABLE=haswell` which supports
many or most of the available optimizations while still being compatible with
most processors made since roughly 2013.
## Dependencies
* You can link RocksDB with following compression libraries:
- [zlib](http://www.zlib.net/) - a library for data compression.
- [bzip2](http://www.bzip.org/) - a library for data compression.
- [lz4](https://github.com/lz4/lz4) - a library for extremely fast data compression.
- [snappy](http://google.github.io/snappy/) - a library for fast
data compression.
- [zstandard](http://www.zstd.net) - Fast real-time compression
algorithm.
* All our tools depend on:
- [gflags](https://gflags.github.io/gflags/) - a library that handles
command line flags processing. You can compile rocksdb library even
if you don't have gflags installed.
* `make check` will also check code formatting, which requires [clang-format](https://clang.llvm.org/docs/ClangFormat.html)
* If you wish to build the RocksJava static target, then cmake is required for building Snappy.
* If you wish to run microbench (e.g, `make microbench`, `make ribbon_bench` or `cmake -DWITH_BENCHMARK=1`), Google benchmark >= 1.6.0 is needed.
* You can do the following to install Google benchmark. These commands are copied from `./build_tools/ubuntu20_image/Dockerfile`:
`$ git clone --depth 1 --branch v1.7.0 https://github.com/google/benchmark.git ~/benchmark`
`$ cd ~/benchmark && mkdir build && cd build && cmake .. -GNinja -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_GTEST_TESTS=0 && ninja && ninja install`
## Supported platforms
* **Linux - Ubuntu**
* Upgrade your gcc to version at least 7 to get C++17 support.
* Install gflags. First, try: `sudo apt-get install libgflags-dev`
If this doesn't work and you're using Ubuntu, here's a nice tutorial:
(http://askubuntu.com/questions/312173/installing-gflags-12-04)
* Install snappy. This is usually as easy as:
`sudo apt-get install libsnappy-dev`.
* Install zlib. Try: `sudo apt-get install zlib1g-dev`.
* Install bzip2: `sudo apt-get install libbz2-dev`.
* Install lz4: `sudo apt-get install liblz4-dev`.
* Install zstandard: `sudo apt-get install libzstd-dev`.
* **Linux - CentOS / RHEL**
* Upgrade your gcc to version at least 7 to get C++17 support
* Install gflags:
git clone https://github.com/gflags/gflags.git
cd gflags
git checkout v2.0
./configure && make && sudo make install
**Notice**: Once installed, please add the include path for gflags to your `CPATH` environment variable and the
lib path to `LIBRARY_PATH`. If installed with default settings, the include path will be `/usr/local/include`
and the lib path will be `/usr/local/lib`.
* Install snappy:
sudo yum install snappy snappy-devel
* Install zlib:
sudo yum install zlib zlib-devel
* Install bzip2:
sudo yum install bzip2 bzip2-devel
* Install lz4:
sudo yum install lz4-devel
* Install ASAN (optional for debugging):
sudo yum install libasan
* Install zstandard:
* With [EPEL](https://fedoraproject.org/wiki/EPEL):
sudo yum install libzstd-devel
* With CentOS 8:
sudo dnf install libzstd-devel
* From source:
wget https://github.com/facebook/zstd/archive/v1.1.3.tar.gz
mv v1.1.3.tar.gz zstd-1.1.3.tar.gz
tar zxvf zstd-1.1.3.tar.gz
cd zstd-1.1.3
make && sudo make install
* **OS X**:
* Install latest C++ compiler that supports C++ 17:
* Update XCode: run `xcode-select --install` (or install it from XCode App's settting).
* Install via [homebrew](http://brew.sh/).
* If you're first time developer in MacOS, you still need to run: `xcode-select --install` in your command line.
* run `brew tap homebrew/versions; brew install gcc7 --use-llvm` to install gcc 7 (or higher).
* run `brew install rocksdb`
* **FreeBSD** (11.01):
* You can either install RocksDB from the Ports system using `cd /usr/ports/databases/rocksdb && make install`, or you can follow the details below to install dependencies and compile from source code:
* Install the dependencies for RocksDB:
export BATCH=YES
cd /usr/ports/devel/gmake && make install
cd /usr/ports/devel/gflags && make install
cd /usr/ports/archivers/snappy && make install
cd /usr/ports/archivers/bzip2 && make install
cd /usr/ports/archivers/liblz4 && make install
cd /usr/ports/archivesrs/zstd && make install
cd /usr/ports/devel/git && make install
* Install the dependencies for RocksJava (optional):
export BATCH=yes
cd /usr/ports/java/openjdk7 && make install
* Build RocksDB from source:
cd ~
git clone https://github.com/facebook/rocksdb.git
cd rocksdb
gmake static_lib
* Build RocksJava from source (optional):
cd rocksdb
export JAVA_HOME=/usr/local/openjdk7
gmake rocksdbjava
* **OpenBSD** (6.3/-current):
* As RocksDB is not available in the ports yet you have to build it on your own:
* Install the dependencies for RocksDB:
pkg_add gmake gflags snappy bzip2 lz4 zstd git jdk bash findutils gnuwatch
* Build RocksDB from source:
cd ~
git clone https://github.com/facebook/rocksdb.git
cd rocksdb
gmake static_lib
* Build RocksJava from source (optional):
cd rocksdb
export JAVA_HOME=/usr/local/jdk-1.8.0
export PATH=$PATH:/usr/local/jdk-1.8.0/bin
gmake rocksdbjava
* **iOS**:
* Run: `TARGET_OS=IOS make static_lib`. When building the project which uses rocksdb iOS library, make sure to define an important pre-processing macros: `IOS_CROSS_COMPILE`.
* **Windows** (Visual Studio 2017 to up):
* Read and follow the instructions at CMakeLists.txt
* Or install via [vcpkg](https://github.com/microsoft/vcpkg)
* run `vcpkg install rocksdb:x64-windows`
* **AIX 6.1**
* Install AIX Toolbox rpms with gcc
* Use these environment variables:
export PORTABLE=1
export CC=gcc
export AR="ar -X64"
export EXTRA_ARFLAGS=-X64
export EXTRA_CFLAGS=-maix64
export EXTRA_CXXFLAGS=-maix64
export PLATFORM_LDFLAGS="-static-libstdc++ -static-libgcc"
export LIBPATH=/opt/freeware/lib
export JAVA_HOME=/usr/java8_64
export PATH=/opt/freeware/bin:$PATH
* **Solaris Sparc**
* Install GCC 7 and higher.
* Use these environment variables:
export CC=gcc
export EXTRA_CFLAGS=-m64
export EXTRA_CXXFLAGS=-m64
export EXTRA_LDFLAGS=-m64
export PORTABLE=1
export PLATFORM_LDFLAGS="-static-libstdc++ -static-libgcc"

@ -0,0 +1,26 @@
This is the list of all known third-party language bindings for RocksDB. If something is missing, please open a pull request to add it.
* Java - https://github.com/facebook/rocksdb/tree/main/java
* Python
* http://python-rocksdb.readthedocs.io/en/latest/
* http://pyrocksdb.readthedocs.org/en/latest/ (unmaintained)
* Perl - https://metacpan.org/pod/RocksDB
* Node.js - https://npmjs.org/package/rocksdb
* Go
* https://github.com/linxGnu/grocksdb
* https://github.com/tecbot/gorocksdb (unmaintained)
* Ruby - http://rubygems.org/gems/rocksdb-ruby
* Haskell - https://hackage.haskell.org/package/rocksdb-haskell
* PHP - https://github.com/Photonios/rocksdb-php
* C#
* https://github.com/warrenfalk/rocksdb-sharp
* https://github.com/curiosity-ai/rocksdb-sharp
* Rust
* https://github.com/pingcap/rust-rocksdb (used in production fork of https://github.com/spacejam/rust-rocksdb)
* https://github.com/spacejam/rust-rocksdb
* https://github.com/bh1xuw/rust-rocks
* D programming language - https://github.com/b1naryth1ef/rocksdb
* Erlang - https://gitlab.com/barrel-db/erlang-rocksdb
* Elixir - https://github.com/urbint/rox
* Nim - https://github.com/status-im/nim-rocksdb
* Swift and Objective-C (iOS/OSX) - https://github.com/iabudiab/ObjectiveRocks

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

@ -0,0 +1,29 @@
This contains code that is from LevelDB, and that code is under the following license:
Copyright (c) 2011 The LevelDB Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

2582
Makefile

File diff suppressed because it is too large Load Diff

@ -0,0 +1,8 @@
This is the list of all known third-party plugins for RocksDB. If something is missing, please open a pull request to add it.
* [Dedupfs](https://github.com/ajkr/dedupfs): an example for plugin developers to reference
* [HDFS](https://github.com/riversand963/rocksdb-hdfs-env): an Env used for interacting with HDFS. Migrated from main RocksDB repo
* [ZenFS](https://github.com/westerndigitalcorporation/zenfs): a file system for zoned block devices
* [RADOS](https://github.com/riversand963/rocksdb-rados-env): an Env used for interacting with RADOS. Migrated from RocksDB main repo.
* [PMEM](https://github.com/pmem/pmem-rocksdb-plugin): a collection of plugins to enable Persistent Memory on RocksDB.
* [IPPCP](https://github.com/intel/ippcp-plugin-rocksdb): a plugin to enable encryption on RocksDB based on Intel optimized open source IPP-Crypto library.

@ -0,0 +1,29 @@
## RocksDB: A Persistent Key-Value Store for Flash and RAM Storage
[![CircleCI Status](https://circleci.com/gh/facebook/rocksdb.svg?style=svg)](https://circleci.com/gh/facebook/rocksdb)
RocksDB is developed and maintained by Facebook Database Engineering Team.
It is built on earlier work on [LevelDB](https://github.com/google/leveldb) by Sanjay Ghemawat (sanjay@google.com)
and Jeff Dean (jeff@google.com)
This code is a library that forms the core building block for a fast
key-value server, especially suited for storing data on flash drives.
It has a Log-Structured-Merge-Database (LSM) design with flexible tradeoffs
between Write-Amplification-Factor (WAF), Read-Amplification-Factor (RAF)
and Space-Amplification-Factor (SAF). It has multi-threaded compactions,
making it especially suitable for storing multiple terabytes of data in a
single database.
Start with example usage here: https://github.com/facebook/rocksdb/tree/main/examples
See the [github wiki](https://github.com/facebook/rocksdb/wiki) for more explanation.
The public interface is in `include/`. Callers should not include or
rely on the details of any other header files in this package. Those
internal APIs may be changed without warning.
Questions and discussions are welcome on the [RocksDB Developers Public](https://www.facebook.com/groups/rocksdb.dev/) Facebook group and [email list](https://groups.google.com/g/rocksdb) on Google Groups.
## License
RocksDB is dual-licensed under both the GPLv2 (found in the COPYING file in the root directory) and Apache 2.0 License (found in the LICENSE.Apache file in the root directory). You may select, at your option, one of the above-listed licenses.

5607
TARGETS

File diff suppressed because it is too large Load Diff

@ -0,0 +1,165 @@
This document lists users of RocksDB and their use cases. If you are using RocksDB, please open a pull request and add yourself to the list.
## Facebook
At Facebook, we use RocksDB as storage engines in multiple data management services and a backend for many different stateful services, including:
1. MyRocks -- https://github.com/MySQLOnRocksDB/mysql-5.6
2. MongoRocks -- https://github.com/mongodb-partners/mongo-rocks
3. ZippyDB -- Facebook's distributed key-value store with Paxos-style replication, built on top of RocksDB.[1] https://www.youtube.com/watch?v=DfiN7pG0D0khtt
4. Laser -- Laser is a high query throughput, low (millisecond) latency, key-value storage service built on top of RocksDB.[1]
4. Dragon -- a distributed graph query engine. https://code.facebook.com/posts/1737605303120405/dragon-a-distributed-graph-query-engine/
5. Stylus -- a low-level stream processing framework writtenin C++.[1]
6. LogDevice -- a distributed data store for logs [2]
[1] https://research.facebook.com/publications/realtime-data-processing-at-facebook/
[2] https://code.facebook.com/posts/357056558062811/logdevice-a-distributed-data-store-for-logs/
## Bilibili
[Bilibili](bilibili.com) [uses](https://www.alluxio.io/blog/when-ai-meets-alluxio-at-bilibili-building-an-efficient-ai-platform-for-data-preprocessing-and-model-training/) Alluxio to speed up its ML training workloads, and Alluxio uses RocksDB to store its filesystem metadata, so Bilibili uses RocksDB.
Bilibili's [real-time platform](https://www.alibabacloud.com/blog/architecture-and-practices-of-bilibilis-real-time-platform_596676) uses Flink, and uses RocksDB as Flink's state store.
## TikTok
TikTok, or its parent company ByteDance, uses RocksDB as the storage engine for some storage systems, such as its distributed graph database [ByteGraph](https://vldb.org/pvldb/vol15/p3306-li.pdf).
Also, TikTok uses [Alluxio](alluxio.io) to [speed up Presto queries](https://www.alluxio.io/resources/videos/improving-presto-performance-with-alluxio-at-tiktok/), and Alluxio stores the files' metadata in RocksDB.
## FoundationDB
[FoundationDB](https://www.foundationdb.org/) [uses](https://github.com/apple/foundationdb/blob/377f1f692da6ab2fe5bdac57035651db3e5fb66d/fdbserver/KeyValueStoreRocksDB.actor.cpp) RocksDB to implement a [key-value store interface](https://github.com/apple/foundationdb/blob/377f1f692da6ab2fe5bdac57035651db3e5fb66d/fdbserver/KeyValueStoreRocksDB.actor.cpp#L1127) in its server backend.
## Apple
Apple [uses](https://opensource.apple.com/projects/foundationdb/) FoundationDB, so it also uses RocksDB.
## Snowflake
Snowflake [uses](https://www.snowflake.com/blog/how-foundationdb-powers-snowflake-metadata-forward/) FoundationDB, so it also uses RocksDB.
## Microsoft
The Bing search engine from Microsoft uses RocksDB as the storage engine for its web data platform: https://blogs.bing.com/Engineering-Blog/october-2021/RocksDB-in-Microsoft-Bing
## LinkedIn
Two different use cases at Linkedin are using RocksDB as a storage engine:
1. LinkedIn's follow feed for storing user's activities. Check out the blog post: https://engineering.linkedin.com/blog/2016/03/followfeed--linkedin-s-feed-made-faster-and-smarter
2. Apache Samza, open source framework for stream processing
Learn more about those use cases in a Tech Talk by Ankit Gupta and Naveen Somasundaram: http://www.youtube.com/watch?v=plqVp_OnSzg
## Yahoo
Yahoo is using RocksDB as a storage engine for their biggest distributed data store Sherpa. Learn more about it here: http://yahooeng.tumblr.com/post/120730204806/sherpa-scales-new-heights
## Tencent
[PaxosStore](https://github.com/Tencent/paxosstore) is a distributed database supporting WeChat. It uses RocksDB as its storage engine.
## Baidu
[Apache Doris](http://doris.apache.org/master/en/) is a MPP analytical database engine released by Baidu. It [uses RocksDB](http://doris.apache.org/master/en/administrator-guide/operation/tablet-meta-tool.html) to manage its tablet's metadata.
## CockroachDB
CockroachDB is an open-source geo-replicated transactional database. They are using RocksDB as their storage engine. Check out their github: https://github.com/cockroachdb/cockroach
## DNANexus
DNANexus is using RocksDB to speed up processing of genomics data.
You can learn more from this great blog post by Mike Lin: http://devblog.dnanexus.com/faster-bam-sorting-with-samtools-and-rocksdb/
## Iron.io
Iron.io is using RocksDB as a storage engine for their distributed queueing system.
Learn more from Tech Talk by Reed Allman: http://www.youtube.com/watch?v=HTjt6oj-RL4
## Tango Me
Tango is using RocksDB as a graph storage to store all users' connection data and other social activity data.
## Turn
Turn is using RocksDB as a storage layer for their key/value store, serving at peak 2.4MM QPS out of different datacenters.
Check out our RocksDB Protobuf merge operator at: https://github.com/vladb38/rocksdb_protobuf
## Santander UK/Cloudera Profession Services
Check out their blog post: http://blog.cloudera.com/blog/2015/08/inside-santanders-near-real-time-data-ingest-architecture/
## Airbnb
Airbnb is using RocksDB as a storage engine for their personalized search service. You can learn more about it here: https://www.youtube.com/watch?v=ASQ6XMtogMs
## Alluxio
[Alluxio](https://www.alluxio.io) uses RocksDB to serve and scale file system metadata to beyond 1 Billion files. The detailed design and implementation is described in this engineering blog:
https://www.alluxio.io/blog/scalable-metadata-service-in-alluxio-storing-billions-of-files/
## Pinterest
Pinterest's Object Retrieval System uses RocksDB for storage: https://www.youtube.com/watch?v=MtFEVEs_2Vo
## Smyte
[Smyte](https://www.smyte.com/) uses RocksDB as the storage layer for their core key-value storage, high-performance counters and time-windowed HyperLogLog services.
## Rakuten Marketing
[Rakuten Marketing](https://marketing.rakuten.com/) uses RocksDB as the disk cache layer for the real-time bidding service in their Performance DSP.
## VWO, Wingify
[VWO's](https://vwo.com/) Smart Code checker and URL helper uses RocksDB to store all the URLs where VWO's Smart Code is installed.
## quasardb
[quasardb](https://www.quasardb.net) is a high-performance, distributed, transactional key-value database that integrates well with in-memory analytics engines such as Apache Spark.
quasardb uses a heavily tuned RocksDB as its persistence layer.
## Netflix
[Netflix](http://techblog.netflix.com/2016/05/application-data-caching-using-ssds.html) Netflix uses RocksDB on AWS EC2 instances with local SSD drives to cache application data.
## TiKV
[TiKV](https://github.com/pingcap/tikv) is a GEO-replicated, high-performance, distributed, transactional key-value database. TiKV is powered by Rust and Raft. TiKV uses RocksDB as its persistence layer.
## TiDB
[TiDB](https://github.com/pingcap/tidb) uses the TiKV distributed key-value database, so it uses RocksDB.
## PingCAP
[PingCAP](https://www.pingcap.com/) is the company behind TiDB, its cloud database service uses RocksDB.
## Apache Spark
[Spark Structured Streaming](https://docs.databricks.com/structured-streaming/rocksdb-state-store.html) uses RocksDB as the local state store.
## Databricks
[Databricks](https://www.databricks.com/) [replaces AWS RDS with TiDB](https://www.pingcap.com/case-study/how-databricks-tackles-the-scalability-limit-with-a-mysql-alternative/) for scalability, so it uses RocksDB.
## Apache Flink
[Apache Flink](https://flink.apache.org/news/2016/03/08/release-1.0.0.html) uses RocksDB to store state locally on a machine.
## Dgraph
[Dgraph](https://github.com/dgraph-io/dgraph) is an open-source, scalable, distributed, low latency, high throughput Graph database .They use RocksDB to store state locally on a machine.
## Uber
[Uber](http://eng.uber.com/cherami/) uses RocksDB as a durable and scalable task queue.
## 360 Pika
[360](http://www.360.cn/) [Pika](https://github.com/Qihoo360/pika) is a nosql compatible with redis. With the huge amount of data stored, redis may suffer for a capacity bottleneck, and pika was born for solving it. It has widely been used in many companies.
## LzLabs
LzLabs is using RocksDB as a storage engine in their multi-database distributed framework to store application configuration and user data.
## ProfaneDB
[ProfaneDB](https://profanedb.gitlab.io/) is a database for Protocol Buffers, and uses RocksDB for storage. It is accessible via gRPC, and the schema is defined using directly `.proto` files.
## IOTA Foundation
[IOTA Foundation](https://www.iota.org/) is using RocksDB in the [IOTA Reference Implementation (IRI)](https://github.com/iotaledger/iri) to store the local state of the Tangle. The Tangle is the first open-source distributed ledger powering the future of the Internet of Things.
## Avrio Project
[Avrio Project](http://avrio-project.github.io/avrio.network/) is using RocksDB in [Avrio ](https://github.com/avrio-project/avrio) to store blocks, account balances and data and other blockchain-releated data. Avrio is a multiblockchain decentralized cryptocurrency empowering monetary transactions.
## Crux
[Crux](https://github.com/juxt/crux) is a document database that uses RocksDB for local [EAV](https://en.wikipedia.org/wiki/Entity%E2%80%93attribute%E2%80%93value_model) index storage to enable point-in-time bitemporal Datalog queries. The "unbundled" architecture uses Kafka to provide horizontal scalability.
## Nebula Graph
[Nebula Graph](https://github.com/vesoft-inc/nebula) is a distributed, scalable, lightning-fast, open source graph database capable of hosting super large scale graphs with dozens of billions of vertices (nodes) and trillions of edges, with milliseconds of latency.
## YugabyteDB
[YugabyteDB](https://www.yugabyte.com/) is an open source, high performance, distributed SQL database that uses RocksDB as its storage layer. For more information, please see https://github.com/yugabyte/yugabyte-db/.
## ArangoDB
[ArangoDB](https://www.arangodb.com/) is a native multi-model database with flexible data models for documents, graphs, and key-values, for building high performance applications using a convenient SQL-like query language or JavaScript extensions. It uses RocksDB as its storage engine.
## Milvus
[Milvus](https://milvus.io/) is an open source vector database for unstructured data. It uses RocksDB not only as one of the supported kv storage engines, but also as a message queue.
## Kafka
[Kafka](https://kafka.apache.org/) is an open-source distributed event streaming platform, it uses RocksDB to store state in Kafka Streams: https://www.confluent.io/blog/how-to-tune-rocksdb-kafka-streams-state-stores-performance/.
## Solana Labs
[Solana](https://github.com/solana-labs/solana) is a fast, secure, scalable, and decentralized blockchain. It uses RocksDB as the underlying storage for its ledger store.
## Others
More databases using RocksDB can be found at [dbdb.io](https://dbdb.io/browse?embeds=rocksdb).

39
Vagrantfile vendored

@ -0,0 +1,39 @@
# Vagrant file
Vagrant.configure("2") do |config|
config.vm.provider "virtualbox" do |v|
v.memory = 4096
v.cpus = 2
end
config.vm.define "ubuntu14" do |box|
box.vm.box = "ubuntu/trusty64"
end
config.vm.define "centos65" do |box|
box.vm.box = "chef/centos-6.5"
end
config.vm.define "centos7" do |box|
box.vm.box = "centos/7"
box.vm.provision "shell", path: "build_tools/setup_centos7.sh"
end
config.vm.define "FreeBSD10" do |box|
box.vm.guest = :freebsd
box.vm.box = "robin/freebsd-10"
# FreeBSD does not support 'mount_virtualbox_shared_folder', use NFS
box.vm.synced_folder ".", "/vagrant", :nfs => true, id: "vagrant-root"
box.vm.network "private_network", ip: "10.0.1.10"
# build everything after creating VM, skip using --no-provision
box.vm.provision "shell", inline: <<-SCRIPT
pkg install -y gmake clang35
export CXX=/usr/local/bin/clang++35
cd /vagrant
gmake clean
gmake all OPT=-g
SCRIPT
end
end

@ -0,0 +1,228 @@
# Microsoft Contribution Notes
## Contributors
* Alexander Zinoviev https://github.com/zinoale
* Dmitri Smirnov https://github.com/yuslepukhin
* Praveen Rao https://github.com/PraveenSinghRao
* Sherlock Huang https://github.com/SherlockNoMad
## Introduction
RocksDB is a well proven open source key-value persistent store, optimized for fast storage. It provides scalability with number of CPUs and storage IOPS, to support IO-bound, in-memory and write-once workloads, most importantly, to be flexible to allow for innovation.
As Microsoft Bing team we have been continuously pushing hard to improve the scalability, efficiency of platform and eventually benefit Bing end-user satisfaction. We would like to explore the opportunity to embrace open source, RocksDB here, to use, enhance and customize for our usage, and also contribute back to the RocksDB community. Herein, we are pleased to offer this RocksDB port for Windows platform.
These notes describe some decisions and changes we had to make with regards to porting RocksDB on Windows. We hope this will help both reviewers and users of the Windows port.
We are open for comments and improvements.
## OS specifics
All of the porting, testing and benchmarking was done on Windows Server 2012 R2 Datacenter 64-bit but to the best of our knowledge there is not a specific API we used during porting that is unsupported on other Windows OS after Vista.
## Porting goals
We strive to achieve the following goals:
* make use of the existing porting interface of RocksDB
* make minimum [WY2]modifications within platform independent code.
* make all unit test pass both in debug and release builds.
* Note: latest introduction of SyncPoint seems to disable running db_test in Release.
* make performance on par with published benchmarks accounting for HW differences
* we would like to keep the port code inline with the main branch with no forking
## Build system
We have chosen CMake as a widely accepted build system to build the Windows port. It is very fast and convenient.
At the same time it generates Visual Studio projects that are both usable from a command line and IDE.
The top-level CMakeLists.txt file contains description of all targets and build rules. It also provides brief instructions on how to build the software for Windows. One more build related file is thirdparty.inc that also resides on the top level. This file must be edited to point to actual third party libraries location.
We think that it would be beneficial to merge the existing make-based build system and the new cmake-based build system into a single one to use on all platforms.
All building and testing was done for 64-bit. We have not conducted any testing for 32-bit and early reports indicate that it will not run on 32-bit.
## C++ and STL notes
We had to make some minimum changes within the portable files that either account for OS differences or the shortcomings of C++11 support in the current version of the MS compiler. Most or all of them are expected to be fixed in the upcoming compiler releases.
We plan to use this port for our business purposes here at Bing and this provided business justification for this port. This also means, we do not have at present to choose the compiler version at will.
* Certain headers that are not present and not necessary on Windows were simply `#ifndef OS_WIN` in a few places (`unistd.h`)
* All posix specific headers were replaced to port/port.h which worked well
* Replaced `dirent.h` for `port/port_dirent.h` (very few places) with the implementation of the relevant interfaces within `rocksdb::port` namespace
* Replaced `sys/time.h` to `port/sys_time.h` (few places) implemented equivalents within `rocksdb::port`
* `printf %z` specification is not supported on Windows. To imitate existing standards we came up with a string macro `ROCKSDB_PRIszt` which expands to `zu` on posix systems and to `Iu` on windows.
* in class member initialization were moved to a __ctors in some cases
* `constexpr` is not supported. We had to replace `std::numeric_limits<>::max/min()` to its C macros for constants. Sometimes we had to make class members `static const` and place a definition within a .cc file.
* `constexpr` for functions was replaced to a template specialization (1 place)
* Union members that have non-trivial constructors were replaced to `char[]` in one place along with bug fixes (spatial experimental feature)
* Zero-sized arrays are deemed a non-standard extension which we converted to 1 size array and that should work well for the purposes of these classes.
* `std::chrono` lacks nanoseconds support (fixed in the upcoming release of the STL) and we had to use `QueryPerfCounter()` within env_win.cc
* Function local statics initialization is still not safe. Used `std::once` to mitigate within WinEnv.
## Windows Environments notes
We endeavored to make it functionally on par with posix_env. This means we replicated the functionality of the thread pool and other things as precise as possible, including:
* Replicate posix logic using std:thread primitives.
* Implement all posix_env disk access functionality.
* Set `use_os_buffer=false` to disable OS disk buffering for WinWritableFile and WinRandomAccessFile.
* Replace `pread/pwrite` with `WriteFile/ReadFile` with `OVERLAPPED` structure.
* Use `SetFileInformationByHandle` to compensate absence of `fallocate`.
### In detail
Even though Windows provides its own efficient thread-pool implementation we chose to replicate posix logic using `std::thread` primitives. This allows anyone to quickly detect any changes within the posix source code and replicate them within windows env. This has proven to work very well. At the same time for anyone who wishes to replace the built-in thread-pool can do so using RocksDB stackable environments.
For disk access we implemented all of the functionality present within the posix_env which includes memory mapped files, random access, rate-limiter support etc.
The `use_os_buffer` flag on Posix platforms currently denotes disabling read-ahead log via `fadvise` mechanism. Windows does not have `fadvise` system call. What is more, it implements disk cache in a way that differs from Linux greatly. It's not an uncommon practice on Windows to perform un-buffered disk access to gain control of the memory consumption. We think that in our use case this may also be a good configuration option at the expense of disk throughput. To compensate one may increase the configured in-memory cache size instead. Thus we have chosen `use_os_buffer=false` to disable OS disk buffering for `WinWritableFile` and `WinRandomAccessFile`. The OS imposes restrictions on the alignment of the disk offsets, buffers used and the amount of data that is read/written when accessing files in un-buffered mode. When the option is true, the classes behave in a standard way. This allows to perform writes and reads in cases when un-buffered access does not make sense such as WAL and MANIFEST.
We have replaced `pread/pwrite` with `WriteFile/ReadFile` with `OVERLAPPED` structure so we can atomically seek to the position of the disk operation but still perform the operation synchronously. Thus we able to emulate that functionality of `pread/pwrite` reasonably well. The only difference is that the file pointer is not returned to its original position but that hardly matters given the random nature of access.
We used `SetFileInformationByHandle` both to truncate files after writing a full final page to disk and to pre-allocate disk space for faster I/O thus compensating for the absence of `fallocate` although some differences remain. For example, the pre-allocated space is not filled with zeros like on Linux, however, on a positive note, the end of file position is also not modified after pre-allocation.
RocksDB renames, copies and deletes files at will even though they may be opened with another handle at the same time. We had to relax and allow nearly all the concurrent access permissions possible.
## Thread-Local Storage
Thread-Local storage plays a significant role for RocksDB performance. Rather than creating a separate implementation we chose to create inline wrappers that forward `pthread_specific` calls to Windows `Tls` interfaces within `rocksdb::port` namespace. This leaves the existing meat of the logic in tact and unchanged and just as maintainable.
To mitigate the lack of thread local storage cleanup on thread-exit we added a limited amount of windows specific code within the same thread_local.cc file that injects a cleanup callback into a `"__tls"` structure within `".CRT$XLB"` data segment. This approach guarantees that the callback is invoked regardless of whether RocksDB used within an executable, standalone DLL or within another DLL.
## Jemalloc usage
When RocksDB is used with Jemalloc the latter needs to be initialized before any of the C++ globals or statics. To accomplish that we injected an initialization routine into `".CRT$XCT"` that is automatically invoked by the runtime before initializing static objects. je-uninit is queued to `atexit()`.
The jemalloc redirecting `new/delete` global operators are used by the linker providing certain conditions are met. See build section in these notes.
## Stack Trace and Unhandled Exception Handler
We decided not to implement these two features because the hosting program as a rule has these two things in it.
We experienced no inconveniences debugging issues in the debugger or analyzing process dumps if need be and thus we did not
see this as a priority.
## Performance results
### Setup
All of the benchmarks are run on the same set of machines. Here are the details of the test setup:
* 2 Intel(R) Xeon(R) E5 2450 0 @ 2.10 GHz (total 16 cores)
* 2 XK0480GDQPH SSD Device, total 894GB free disk
* Machine has 128 GB of RAM
* Operating System: Windows Server 2012 R2 Datacenter
* 100 Million keys; each key is of size 10 bytes, each value is of size 800 bytes
* total database size is ~76GB
* The performance result is based on RocksDB 3.11.
* The parameters used, unless specified, were exactly the same as published in the GitHub Wiki page.
### RocksDB on flash storage
#### Test 1. Bulk Load of keys in Random Order
Version 3.11
* Total Run Time: 17.6 min
* Fillrandom: 5.480 micros/op 182465 ops/sec; 142.0 MB/s
* Compact: 486056544.000 micros/op 0 ops/sec
Version 3.10
* Total Run Time: 16.2 min
* Fillrandom: 5.018 micros/op 199269 ops/sec; 155.1 MB/s
* Compact: 441313173.000 micros/op 0 ops/sec;
#### Test 2. Bulk Load of keys in Sequential Order
Version 3.11
* Fillseq: 4.944 micros/op 202k ops/sec; 157.4 MB/s
Version 3.10
* Fillseq: 4.105 micros/op 243.6k ops/sec; 189.6 MB/s
#### Test 3. Random Write
Version 3.11
* Unbuffered I/O enabled
* Overwrite: 52.661 micros/op 18.9k ops/sec; 14.8 MB/s
Version 3.10
* Unbuffered I/O enabled
* Overwrite: 52.661 micros/op 18.9k ops/sec;
#### Test 4. Random Read
Version 3.11
* Unbuffered I/O enabled
* Readrandom: 15.716 micros/op 63.6k ops/sec; 49.5 MB/s
Version 3.10
* Unbuffered I/O enabled
* Readrandom: 15.548 micros/op 64.3k ops/sec;
#### Test 5. Multi-threaded read and single-threaded write
Version 3.11
* Unbuffered I/O enabled
* Readwhilewriting: 25.128 micros/op 39.7k ops/sec;
Version 3.10
* Unbuffered I/O enabled
* Readwhilewriting: 24.854 micros/op 40.2k ops/sec;
### RocksDB In Memory
#### Test 1. Point Lookup
Version 3.11
80K writes/sec
* Write Rate Achieved: 40.5k write/sec;
* Readwhilewriting: 0.314 micros/op 3187455 ops/sec; 364.8 MB/s (715454999 of 715454999 found)
Version 3.10
* Write Rate Achieved: 50.6k write/sec
* Readwhilewriting: 0.316 micros/op 3162028 ops/sec; (719576999 of 719576999 found)
*10K writes/sec*
Version 3.11
* Write Rate Achieved: 5.8k/s write/sec
* Readwhilewriting: 0.246 micros/op 4062669 ops/sec; 464.9 MB/s (915481999 of 915481999 found)
Version 3.10
* Write Rate Achieved: 5.8k/s write/sec
* Readwhilewriting: 0.244 micros/op 4106253 ops/sec; (927986999 of 927986999 found)
#### Test 2. Prefix Range Query
Version 3.11
80K writes/sec
* Write Rate Achieved: 46.3k/s write/sec
* Readwhilewriting: 0.362 micros/op 2765052 ops/sec; 316.4 MB/s (611549999 of 611549999 found)
Version 3.10
* Write Rate Achieved: 45.8k/s write/sec
* Readwhilewriting: 0.317 micros/op 3154941 ops/sec; (708158999 of 708158999 found)
Version 3.11
10K writes/sec
* Write Rate Achieved: 5.78k write/sec
* Readwhilewriting: 0.269 micros/op 3716692 ops/sec; 425.3 MB/s (837401999 of 837401999 found)
Version 3.10
* Write Rate Achieved: 5.7k write/sec
* Readwhilewriting: 0.261 micros/op 3830152 ops/sec; (863482999 of 863482999 found)
We think that there is still big room to improve the performance, which will be an ongoing effort for us.

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,333 @@
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from __future__ import absolute_import, division, print_function, unicode_literals
try:
from builtins import str
except ImportError:
from __builtin__ import str
import fnmatch
import json
import os
import sys
from targets_builder import TARGETSBuilder
from util import ColorString
# This script generates TARGETS file for Buck.
# Buck is a build tool specifying dependencies among different build targets.
# User can pass extra dependencies as a JSON object via command line, and this
# script can include these dependencies in the generate TARGETS file.
# Usage:
# $python3 buckifier/buckify_rocksdb.py
# (This generates a TARGET file without user-specified dependency for unit
# tests.)
# $python3 buckifier/buckify_rocksdb.py \
# '{"fake": {
# "extra_deps": [":test_dep", "//fakes/module:mock1"],
# "extra_compiler_flags": ["-DFOO_BAR", "-Os"]
# }
# }'
# (Generated TARGETS file has test_dep and mock1 as dependencies for RocksDB
# unit tests, and will use the extra_compiler_flags to compile the unit test
# source.)
# tests to export as libraries for inclusion in other projects
_EXPORTED_TEST_LIBS = ["env_basic_test"]
# Parse src.mk files as a Dictionary of
# VAR_NAME => list of files
def parse_src_mk(repo_path):
src_mk = repo_path + "/src.mk"
src_files = {}
for line in open(src_mk):
line = line.strip()
if len(line) == 0 or line[0] == "#":
continue
if "=" in line:
current_src = line.split("=")[0].strip()
src_files[current_src] = []
elif ".c" in line:
src_path = line.split("\\")[0].strip()
src_files[current_src].append(src_path)
return src_files
# get all .cc / .c files
def get_cc_files(repo_path):
cc_files = []
for root, _dirnames, filenames in os.walk(
repo_path
): # noqa: B007 T25377293 Grandfathered in
root = root[(len(repo_path) + 1) :]
if "java" in root:
# Skip java
continue
for filename in fnmatch.filter(filenames, "*.cc"):
cc_files.append(os.path.join(root, filename))
for filename in fnmatch.filter(filenames, "*.c"):
cc_files.append(os.path.join(root, filename))
return cc_files
# Get non_parallel tests from Makefile
def get_non_parallel_tests(repo_path):
Makefile = repo_path + "/Makefile"
s = set({})
found_non_parallel_tests = False
for line in open(Makefile):
line = line.strip()
if line.startswith("NON_PARALLEL_TEST ="):
found_non_parallel_tests = True
elif found_non_parallel_tests:
if line.endswith("\\"):
# remove the trailing \
line = line[:-1]
line = line.strip()
s.add(line)
else:
# we consumed all the non_parallel tests
break
return s
# Parse extra dependencies passed by user from command line
def get_dependencies():
deps_map = {"": {"extra_deps": [], "extra_compiler_flags": []}}
if len(sys.argv) < 2:
return deps_map
def encode_dict(data):
rv = {}
for k, v in data.items():
if isinstance(v, dict):
v = encode_dict(v)
rv[k] = v
return rv
extra_deps = json.loads(sys.argv[1], object_hook=encode_dict)
for target_alias, deps in extra_deps.items():
deps_map[target_alias] = deps
return deps_map
# Prepare TARGETS file for buck
def generate_targets(repo_path, deps_map):
print(ColorString.info("Generating TARGETS"))
# parsed src.mk file
src_mk = parse_src_mk(repo_path)
# get all .cc files
cc_files = get_cc_files(repo_path)
# get non_parallel tests from Makefile
non_parallel_tests = get_non_parallel_tests(repo_path)
if src_mk is None or cc_files is None or non_parallel_tests is None:
return False
extra_argv = ""
if len(sys.argv) >= 2:
# Heuristically quote and canonicalize whitespace for inclusion
# in how the file was generated.
extra_argv = " '{0}'".format(" ".join(sys.argv[1].split()))
TARGETS = TARGETSBuilder("%s/TARGETS" % repo_path, extra_argv)
# rocksdb_lib
TARGETS.add_library(
"rocksdb_lib",
src_mk["LIB_SOURCES"] +
# always add range_tree, it's only excluded on ppc64, which we don't use internally
src_mk["RANGE_TREE_SOURCES"] + src_mk["TOOL_LIB_SOURCES"],
deps=[
"//folly/container:f14_hash",
"//folly/experimental/coro:blocking_wait",
"//folly/experimental/coro:collect",
"//folly/experimental/coro:coroutine",
"//folly/experimental/coro:task",
"//folly/synchronization:distributed_mutex",
],
)
# rocksdb_whole_archive_lib
TARGETS.add_library(
"rocksdb_whole_archive_lib",
[],
deps=[
":rocksdb_lib",
],
headers=None,
extra_external_deps="",
link_whole=True,
)
# rocksdb_test_lib
TARGETS.add_library(
"rocksdb_test_lib",
src_mk.get("MOCK_LIB_SOURCES", [])
+ src_mk.get("TEST_LIB_SOURCES", [])
+ src_mk.get("EXP_LIB_SOURCES", [])
+ src_mk.get("ANALYZER_LIB_SOURCES", []),
[":rocksdb_lib"],
extra_test_libs=True,
)
# rocksdb_tools_lib
TARGETS.add_library(
"rocksdb_tools_lib",
src_mk.get("BENCH_LIB_SOURCES", [])
+ src_mk.get("ANALYZER_LIB_SOURCES", [])
+ ["test_util/testutil.cc"],
[":rocksdb_lib"],
)
# rocksdb_cache_bench_tools_lib
TARGETS.add_library(
"rocksdb_cache_bench_tools_lib",
src_mk.get("CACHE_BENCH_LIB_SOURCES", []),
[":rocksdb_lib"],
)
# rocksdb_stress_lib
TARGETS.add_rocksdb_library(
"rocksdb_stress_lib",
src_mk.get("ANALYZER_LIB_SOURCES", [])
+ src_mk.get("STRESS_LIB_SOURCES", [])
+ ["test_util/testutil.cc"],
)
# db_stress binary
TARGETS.add_binary(
"db_stress", ["db_stress_tool/db_stress.cc"], [":rocksdb_stress_lib"]
)
# bench binaries
for src in src_mk.get("MICROBENCH_SOURCES", []):
name = src.rsplit("/", 1)[1].split(".")[0] if "/" in src else src.split(".")[0]
TARGETS.add_binary(name, [src], [], extra_bench_libs=True)
print("Extra dependencies:\n{0}".format(json.dumps(deps_map)))
# Dictionary test executable name -> relative source file path
test_source_map = {}
# c_test.c is added through TARGETS.add_c_test(). If there
# are more than one .c test file, we need to extend
# TARGETS.add_c_test() to include other C tests too.
for test_src in src_mk.get("TEST_MAIN_SOURCES_C", []):
if test_src != "db/c_test.c":
print("Don't know how to deal with " + test_src)
return False
TARGETS.add_c_test()
try:
with open(f"{repo_path}/buckifier/bench.json") as json_file:
fast_fancy_bench_config_list = json.load(json_file)
for config_dict in fast_fancy_bench_config_list:
clean_benchmarks = {}
benchmarks = config_dict["benchmarks"]
for binary, benchmark_dict in benchmarks.items():
clean_benchmarks[binary] = {}
for benchmark, overloaded_metric_list in benchmark_dict.items():
clean_benchmarks[binary][benchmark] = []
for metric in overloaded_metric_list:
if not isinstance(metric, dict):
clean_benchmarks[binary][benchmark].append(metric)
TARGETS.add_fancy_bench_config(
config_dict["name"],
clean_benchmarks,
False,
config_dict["expected_runtime_one_iter"],
config_dict["sl_iterations"],
config_dict["regression_threshold"],
)
with open(f"{repo_path}/buckifier/bench-slow.json") as json_file:
slow_fancy_bench_config_list = json.load(json_file)
for config_dict in slow_fancy_bench_config_list:
clean_benchmarks = {}
benchmarks = config_dict["benchmarks"]
for binary, benchmark_dict in benchmarks.items():
clean_benchmarks[binary] = {}
for benchmark, overloaded_metric_list in benchmark_dict.items():
clean_benchmarks[binary][benchmark] = []
for metric in overloaded_metric_list:
if not isinstance(metric, dict):
clean_benchmarks[binary][benchmark].append(metric)
for config_dict in slow_fancy_bench_config_list:
TARGETS.add_fancy_bench_config(
config_dict["name"] + "_slow",
clean_benchmarks,
True,
config_dict["expected_runtime_one_iter"],
config_dict["sl_iterations"],
config_dict["regression_threshold"],
)
# it is better servicelab experiments break
# than rocksdb github ci
except Exception:
pass
TARGETS.add_test_header()
for test_src in src_mk.get("TEST_MAIN_SOURCES", []):
test = test_src.split(".c")[0].strip().split("/")[-1].strip()
test_source_map[test] = test_src
print("" + test + " " + test_src)
for target_alias, deps in deps_map.items():
for test, test_src in sorted(test_source_map.items()):
if len(test) == 0:
print(ColorString.warning("Failed to get test name for %s" % test_src))
continue
test_target_name = test if not target_alias else test + "_" + target_alias
if test in _EXPORTED_TEST_LIBS:
test_library = "%s_lib" % test_target_name
TARGETS.add_library(
test_library,
[test_src],
deps=[":rocksdb_test_lib"],
extra_test_libs=True,
)
TARGETS.register_test(
test_target_name,
test_src,
deps=json.dumps(deps["extra_deps"] + [":" + test_library]),
extra_compiler_flags=json.dumps(deps["extra_compiler_flags"]),
)
else:
TARGETS.register_test(
test_target_name,
test_src,
deps=json.dumps(deps["extra_deps"] + [":rocksdb_test_lib"]),
extra_compiler_flags=json.dumps(deps["extra_compiler_flags"]),
)
print(ColorString.info("Generated TARGETS Summary:"))
print(ColorString.info("- %d libs" % TARGETS.total_lib))
print(ColorString.info("- %d binarys" % TARGETS.total_bin))
print(ColorString.info("- %d tests" % TARGETS.total_test))
return True
def get_rocksdb_path():
# rocksdb = {script_dir}/..
script_dir = os.path.dirname(sys.argv[0])
script_dir = os.path.abspath(script_dir)
rocksdb_path = os.path.abspath(os.path.join(script_dir, "../"))
return rocksdb_path
def exit_with_error(msg):
print(ColorString.error(msg))
sys.exit(1)
def main():
deps_map = get_dependencies()
# Generate TARGETS file for buck
ok = generate_targets(get_rocksdb_path(), deps_map)
if not ok:
exit_with_error("Failed to generate TARGETS files")
if __name__ == "__main__":
main()

@ -0,0 +1,32 @@
#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# If clang_format_diff.py command is not specfied, we assume we are able to
# access directly without any path.
TGT_DIFF=`git diff TARGETS | head -n 1`
if [ ! -z "$TGT_DIFF" ]
then
echo "TARGETS file has uncommitted changes. Skip this check."
exit 0
fi
echo Backup original TARGETS file.
cp TARGETS TARGETS.bkp
${PYTHON:-python3} buckifier/buckify_rocksdb.py
TGT_DIFF=`git diff TARGETS | head -n 1`
if [ -z "$TGT_DIFF" ]
then
mv TARGETS.bkp TARGETS
exit 0
else
echo "Please run '${PYTHON:-python3} buckifier/buckify_rocksdb.py' to update TARGETS file."
echo "Do not manually update TARGETS file."
${PYTHON:-python3} --version
mv TARGETS.bkp TARGETS
exit 1
fi

@ -0,0 +1,6 @@
#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# Create a tmp directory for the test to use
TEST_DIR=$(mktemp -d /dev/shm/fbcode_rocksdb_XXXXXXX)
# shellcheck disable=SC2068
TEST_TMPDIR="$TEST_DIR" $@ && rm -rf "$TEST_DIR"

@ -0,0 +1,150 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from __future__ import absolute_import, division, print_function, unicode_literals
try:
from builtins import object, str
except ImportError:
from __builtin__ import object, str
import pprint
import targets_cfg
def pretty_list(lst, indent=8):
if lst is None or len(lst) == 0:
return ""
if len(lst) == 1:
return '"%s"' % lst[0]
separator = '",\n%s"' % (" " * indent)
res = separator.join(sorted(lst))
res = "\n" + (" " * indent) + '"' + res + '",\n' + (" " * (indent - 4))
return res
class TARGETSBuilder(object):
def __init__(self, path, extra_argv):
self.path = path
header = targets_cfg.rocksdb_target_header_template.format(
extra_argv=extra_argv
)
with open(path, "wb") as targets_file:
targets_file.write(header.encode("utf-8"))
self.total_lib = 0
self.total_bin = 0
self.total_test = 0
self.tests_cfg = ""
def add_library(
self,
name,
srcs,
deps=None,
headers=None,
extra_external_deps="",
link_whole=False,
external_dependencies=None,
extra_test_libs=False,
):
if headers is not None:
headers = "[" + pretty_list(headers) + "]"
with open(self.path, "ab") as targets_file:
targets_file.write(
targets_cfg.library_template.format(
name=name,
srcs=pretty_list(srcs),
headers=headers,
deps=pretty_list(deps),
extra_external_deps=extra_external_deps,
link_whole=link_whole,
external_dependencies=pretty_list(external_dependencies),
extra_test_libs=extra_test_libs,
).encode("utf-8")
)
self.total_lib = self.total_lib + 1
def add_rocksdb_library(self, name, srcs, headers=None, external_dependencies=None):
if headers is not None:
headers = "[" + pretty_list(headers) + "]"
with open(self.path, "ab") as targets_file:
targets_file.write(
targets_cfg.rocksdb_library_template.format(
name=name,
srcs=pretty_list(srcs),
headers=headers,
external_dependencies=pretty_list(external_dependencies),
).encode("utf-8")
)
self.total_lib = self.total_lib + 1
def add_binary(
self,
name,
srcs,
deps=None,
extra_preprocessor_flags=None,
extra_bench_libs=False,
):
with open(self.path, "ab") as targets_file:
targets_file.write(
targets_cfg.binary_template.format(
name=name,
srcs=pretty_list(srcs),
deps=pretty_list(deps),
extra_preprocessor_flags=pretty_list(extra_preprocessor_flags),
extra_bench_libs=extra_bench_libs,
).encode("utf-8")
)
self.total_bin = self.total_bin + 1
def add_c_test(self):
with open(self.path, "ab") as targets_file:
targets_file.write(
b"""
add_c_test_wrapper()
"""
)
def add_test_header(self):
with open(self.path, "ab") as targets_file:
targets_file.write(
b"""
# Generate a test rule for each entry in ROCKS_TESTS
# Do not build the tests in opt mode, since SyncPoint and other test code
# will not be included.
"""
)
def add_fancy_bench_config(
self,
name,
bench_config,
slow,
expected_runtime,
sl_iterations,
regression_threshold,
):
with open(self.path, "ab") as targets_file:
targets_file.write(
targets_cfg.fancy_bench_template.format(
name=name,
bench_config=pprint.pformat(bench_config),
slow=slow,
expected_runtime=expected_runtime,
sl_iterations=sl_iterations,
regression_threshold=regression_threshold,
).encode("utf-8")
)
def register_test(self, test_name, src, deps, extra_compiler_flags):
with open(self.path, "ab") as targets_file:
targets_file.write(
targets_cfg.unittests_template.format(
test_name=test_name,
test_cc=str(src),
deps=deps,
extra_compiler_flags=extra_compiler_flags,
).encode("utf-8")
)
self.total_test = self.total_test + 1

@ -0,0 +1,41 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from __future__ import absolute_import, division, print_function, unicode_literals
rocksdb_target_header_template = """# This file \100generated by:
#$ python3 buckifier/buckify_rocksdb.py{extra_argv}
# --> DO NOT EDIT MANUALLY <--
# This file is a Facebook-specific integration for buck builds, so can
# only be validated by Facebook employees.
#
# @noautodeps @nocodemods
load("//rocks/buckifier:defs.bzl", "cpp_library_wrapper","rocks_cpp_library_wrapper","cpp_binary_wrapper","cpp_unittest_wrapper","fancy_bench_wrapper","add_c_test_wrapper")
"""
library_template = """
cpp_library_wrapper(name="{name}", srcs=[{srcs}], deps=[{deps}], headers={headers}, link_whole={link_whole}, extra_test_libs={extra_test_libs})
"""
rocksdb_library_template = """
rocks_cpp_library_wrapper(name="{name}", srcs=[{srcs}], headers={headers})
"""
binary_template = """
cpp_binary_wrapper(name="{name}", srcs=[{srcs}], deps=[{deps}], extra_preprocessor_flags=[{extra_preprocessor_flags}], extra_bench_libs={extra_bench_libs})
"""
unittests_template = """
cpp_unittest_wrapper(name="{test_name}",
srcs=["{test_cc}"],
deps={deps},
extra_compiler_flags={extra_compiler_flags})
"""
fancy_bench_template = """
fancy_bench_wrapper(suite_name="{name}", binary_to_bench_to_metric_list_map={bench_config}, slow={slow}, expected_runtime={expected_runtime}, sl_iterations={sl_iterations}, regression_threshold={regression_threshold})
"""

@ -0,0 +1,118 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
"""
This module keeps commonly used components.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
try:
from builtins import object
except ImportError:
from __builtin__ import object
import os
import subprocess
import sys
import time
class ColorString(object):
"""Generate colorful strings on terminal"""
HEADER = "\033[95m"
BLUE = "\033[94m"
GREEN = "\033[92m"
WARNING = "\033[93m"
FAIL = "\033[91m"
ENDC = "\033[0m"
@staticmethod
def _make_color_str(text, color):
# In Python2, default encoding for unicode string is ASCII
if sys.version_info.major <= 2:
return "".join([color, text.encode("utf-8"), ColorString.ENDC])
# From Python3, default encoding for unicode string is UTF-8
return "".join([color, text, ColorString.ENDC])
@staticmethod
def ok(text):
if ColorString.is_disabled:
return text
return ColorString._make_color_str(text, ColorString.GREEN)
@staticmethod
def info(text):
if ColorString.is_disabled:
return text
return ColorString._make_color_str(text, ColorString.BLUE)
@staticmethod
def header(text):
if ColorString.is_disabled:
return text
return ColorString._make_color_str(text, ColorString.HEADER)
@staticmethod
def error(text):
if ColorString.is_disabled:
return text
return ColorString._make_color_str(text, ColorString.FAIL)
@staticmethod
def warning(text):
if ColorString.is_disabled:
return text
return ColorString._make_color_str(text, ColorString.WARNING)
is_disabled = False
def run_shell_command(shell_cmd, cmd_dir=None):
"""Run a single shell command.
@returns a tuple of shell command return code, stdout, stderr"""
if cmd_dir is not None and not os.path.exists(cmd_dir):
run_shell_command("mkdir -p %s" % cmd_dir)
start = time.time()
print("\t>>> Running: " + shell_cmd)
p = subprocess.Popen( # noqa
shell_cmd,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
cwd=cmd_dir,
)
stdout, stderr = p.communicate()
end = time.time()
# Report time if we spent more than 5 minutes executing a command
execution_time = end - start
if execution_time > (60 * 5):
mins = execution_time / 60
secs = execution_time % 60
print("\t>time spent: %d minutes %d seconds" % (mins, secs))
return p.returncode, stdout, stderr
def run_shell_commands(shell_cmds, cmd_dir=None, verbose=False):
"""Execute a sequence of shell commands, which is equivalent to
running `cmd1 && cmd2 && cmd3`
@returns boolean indication if all commands succeeds.
"""
if cmd_dir:
print("\t=== Set current working directory => %s" % cmd_dir)
for shell_cmd in shell_cmds:
ret_code, stdout, stderr = run_shell_command(shell_cmd, cmd_dir)
if stdout:
if verbose or ret_code != 0:
print(ColorString.info("stdout: \n"), stdout)
if stderr:
# contents in stderr is not necessarily to be error messages.
if verbose or ret_code != 0:
print(ColorString.error("stderr: \n"), stderr)
if ret_code != 0:
return False
return True

@ -0,0 +1,168 @@
#!/usr/bin/python
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# amalgamate.py creates an amalgamation from a unity build.
# It can be run with either Python 2 or 3.
# An amalgamation consists of a header that includes the contents of all public
# headers and a source file that includes the contents of all source files and
# private headers.
#
# This script works by starting with the unity build file and recursively expanding
# #include directives. If the #include is found in a public include directory,
# that header is expanded into the amalgamation header.
#
# A particular header is only expanded once, so this script will
# break if there are multiple inclusions of the same header that are expected to
# expand differently. Similarly, this type of code causes issues:
#
# #ifdef FOO
# #include "bar.h"
# // code here
# #else
# #include "bar.h" // oops, doesn't get expanded
# // different code here
# #endif
#
# The solution is to move the include out of the #ifdef.
from __future__ import print_function
import argparse
import re
import sys
from os import path
include_re = re.compile('^[ \t]*#include[ \t]+"(.*)"[ \t]*$')
included = set()
excluded = set()
def find_header(name, abs_path, include_paths):
samedir = path.join(path.dirname(abs_path), name)
if path.exists(samedir):
return samedir
for include_path in include_paths:
include_path = path.join(include_path, name)
if path.exists(include_path):
return include_path
return None
def expand_include(
include_path,
f,
abs_path,
source_out,
header_out,
include_paths,
public_include_paths,
):
if include_path in included:
return False
included.add(include_path)
with open(include_path) as f:
print('#line 1 "{}"'.format(include_path), file=source_out)
process_file(
f, include_path, source_out, header_out, include_paths, public_include_paths
)
return True
def process_file(
f, abs_path, source_out, header_out, include_paths, public_include_paths
):
for (line, text) in enumerate(f):
m = include_re.match(text)
if m:
filename = m.groups()[0]
# first check private headers
include_path = find_header(filename, abs_path, include_paths)
if include_path:
if include_path in excluded:
source_out.write(text)
expanded = False
else:
expanded = expand_include(
include_path,
f,
abs_path,
source_out,
header_out,
include_paths,
public_include_paths,
)
else:
# now try public headers
include_path = find_header(filename, abs_path, public_include_paths)
if include_path:
# found public header
expanded = False
if include_path in excluded:
source_out.write(text)
else:
expand_include(
include_path,
f,
abs_path,
header_out,
None,
public_include_paths,
[],
)
else:
sys.exit(
"unable to find {}, included in {} on line {}".format(
filename, abs_path, line
)
)
if expanded:
print('#line {} "{}"'.format(line + 1, abs_path), file=source_out)
elif text != "#pragma once\n":
source_out.write(text)
def main():
parser = argparse.ArgumentParser(
description="Transform a unity build into an amalgamation"
)
parser.add_argument("source", help="source file")
parser.add_argument(
"-I",
action="append",
dest="include_paths",
help="include paths for private headers",
)
parser.add_argument(
"-i",
action="append",
dest="public_include_paths",
help="include paths for public headers",
)
parser.add_argument(
"-x", action="append", dest="excluded", help="excluded header files"
)
parser.add_argument("-o", dest="source_out", help="output C++ file", required=True)
parser.add_argument(
"-H", dest="header_out", help="output C++ header file", required=True
)
args = parser.parse_args()
include_paths = list(map(path.abspath, args.include_paths or []))
public_include_paths = list(map(path.abspath, args.public_include_paths or []))
excluded.update(map(path.abspath, args.excluded or []))
filename = args.source
abs_path = path.abspath(filename)
with open(filename) as f, open(args.source_out, "w") as source_out, open(
args.header_out, "w"
) as header_out:
print('#line 1 "{}"'.format(filename), file=source_out)
print('#include "{}"'.format(header_out.name), file=source_out)
process_file(
f, abs_path, source_out, header_out, include_paths, public_include_paths
)
if __name__ == "__main__":
main()

@ -0,0 +1,238 @@
#!/usr/bin/env python3
# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
# This source code is licensed under both the GPLv2 (found in the
# COPYING file in the root directory) and Apache 2.0 License
# (found in the LICENSE.Apache file in the root directory).
"""Access the results of benchmark runs
Send these results on to OpenSearch graphing service
"""
import argparse
import itertools
import logging
import os
import re
import sys
import requests
from dateutil import parser
logging.basicConfig(level=logging.DEBUG)
class Configuration:
opensearch_user = os.environ["ES_USER"]
opensearch_pass = os.environ["ES_PASS"]
class BenchmarkResultException(Exception):
def __init__(self, message, content):
super().__init__(self, message)
self.content = content
class BenchmarkUtils:
expected_keys = [
"ops_sec",
"mb_sec",
"lsm_sz",
"blob_sz",
"c_wgb",
"w_amp",
"c_mbps",
"c_wsecs",
"c_csecs",
"b_rgb",
"b_wgb",
"usec_op",
"p50",
"p99",
"p99.9",
"p99.99",
"pmax",
"uptime",
"stall%",
"Nstall",
"u_cpu",
"s_cpu",
"rss",
"test",
"date",
"version",
"job_id",
]
def sanity_check(row):
if "test" not in row:
logging.debug(f"not 'test' in row: {row}")
return False
if row["test"] == "":
logging.debug(f"row['test'] == '': {row}")
return False
if "date" not in row:
logging.debug(f"not 'date' in row: {row}")
return False
if "ops_sec" not in row:
logging.debug(f"not 'ops_sec' in row: {row}")
return False
try:
_ = int(row["ops_sec"])
except (ValueError, TypeError):
logging.debug(f"int(row['ops_sec']): {row}")
return False
try:
(_, _) = parser.parse(row["date"], fuzzy_with_tokens=True)
except (parser.ParserError):
logging.error(
f"parser.parse((row['date']): not a valid format for date in row: {row}"
)
return False
return True
def conform_opensearch(row):
(dt, _) = parser.parse(row["date"], fuzzy_with_tokens=True)
# create a test_date field, which was previously what was expected
# repair the date field, which has what can be a WRONG ISO FORMAT, (no leading 0 on single-digit day-of-month)
# e.g. 2022-07-1T00:14:55 should be 2022-07-01T00:14:55
row["test_date"] = dt.isoformat()
row["date"] = dt.isoformat()
return {key.replace(".", "_"): value for key, value in row.items()}
class ResultParser:
def __init__(self, field="(\w|[+-:.%])+", intrafield="(\s)+", separator="\t"):
self.field = re.compile(field)
self.intra = re.compile(intrafield)
self.sep = re.compile(separator)
def ignore(self, l_in: str):
if len(l_in) == 0:
return True
if l_in[0:1] == "#":
return True
return False
def line(self, line_in: str):
"""Parse a line into items
Being clever about separators
"""
line = line_in
row = []
while line != "":
match_item = self.field.match(line)
if match_item:
item = match_item.group(0)
row.append(item)
line = line[len(item) :]
else:
match_intra = self.intra.match(line)
if match_intra:
intra = match_intra.group(0)
# Count the separators
# If there are >1 then generate extra blank fields
# White space with no true separators fakes up a single separator
tabbed = self.sep.split(intra)
sep_count = len(tabbed) - 1
if sep_count == 0:
sep_count = 1
for _ in range(sep_count - 1):
row.append("")
line = line[len(intra) :]
else:
raise BenchmarkResultException(
"Invalid TSV line", f"{line_in} at {line}"
)
return row
def parse(self, lines):
"""Parse something that iterates lines"""
rows = [self.line(line) for line in lines if not self.ignore(line)]
header = rows[0]
width = len(header)
records = [
{k: v for (k, v) in itertools.zip_longest(header, row[:width])}
for row in rows[1:]
]
return records
def load_report_from_tsv(filename: str):
file = open(filename, "r")
contents = file.readlines()
file.close()
parser = ResultParser()
report = parser.parse(contents)
logging.debug(f"Loaded TSV Report: {report}")
return report
def push_report_to_opensearch(report, esdocument):
sanitized = [
BenchmarkUtils.conform_opensearch(row)
for row in report
if BenchmarkUtils.sanity_check(row)
]
logging.debug(
f"upload {len(sanitized)} sane of {len(report)} benchmarks to opensearch"
)
for single_benchmark in sanitized:
logging.debug(f"upload benchmark: {single_benchmark}")
response = requests.post(
esdocument,
json=single_benchmark,
auth=(os.environ["ES_USER"], os.environ["ES_PASS"]),
)
logging.debug(
f"Sent to OpenSearch, status: {response.status_code}, result: {response.text}"
)
response.raise_for_status()
def push_report_to_null(report):
for row in report:
if BenchmarkUtils.sanity_check(row):
logging.debug(f"row {row}")
conformed = BenchmarkUtils.conform_opensearch(row)
logging.debug(f"conformed row {conformed}")
def main():
"""Tool for fetching, parsing and uploading benchmark results to OpenSearch / ElasticSearch
This tool will
(1) Open a local tsv benchmark report file
(2) Upload to OpenSearch document, via https/JSON
"""
parser = argparse.ArgumentParser(description="CircleCI benchmark scraper.")
# --tsvfile is the name of the file to read results from
# --esdocument is the ElasticSearch document to push these results into
#
parser.add_argument(
"--tsvfile",
default="build_tools/circle_api_scraper_input.txt",
help="File from which to read tsv report",
)
parser.add_argument(
"--esdocument",
help="ElasticSearch/OpenSearch document URL to upload report into",
)
parser.add_argument(
"--upload", choices=["opensearch", "none"], default="opensearch"
)
args = parser.parse_args()
logging.debug(f"Arguments: {args}")
reports = load_report_from_tsv(args.tsvfile)
if args.upload == "opensearch":
push_report_to_opensearch(reports, args.esdocument)
else:
push_report_to_null(reports)
if __name__ == "__main__":
sys.exit(main())

@ -0,0 +1,811 @@
#!/usr/bin/env bash
#
# Detects OS we're compiling on and outputs a file specified by the first
# argument, which in turn gets read while processing Makefile.
#
# The output will set the following variables:
# CC C Compiler path
# CXX C++ Compiler path
# PLATFORM_LDFLAGS Linker flags
# JAVA_LDFLAGS Linker flags for RocksDBJava
# JAVA_STATIC_LDFLAGS Linker flags for RocksDBJava static build
# JAVAC_ARGS Arguments for javac
# PLATFORM_SHARED_EXT Extension for shared libraries
# PLATFORM_SHARED_LDFLAGS Flags for building shared library
# PLATFORM_SHARED_CFLAGS Flags for compiling objects for shared library
# PLATFORM_CCFLAGS C compiler flags
# PLATFORM_CXXFLAGS C++ compiler flags. Will contain:
# PLATFORM_SHARED_VERSIONED Set to 'true' if platform supports versioned
# shared libraries, empty otherwise.
# FIND Command for the find utility
# WATCH Command for the watch utility
#
# The PLATFORM_CCFLAGS and PLATFORM_CXXFLAGS might include the following:
#
# -DROCKSDB_PLATFORM_POSIX if posix-platform based
# -DSNAPPY if the Snappy library is present
# -DLZ4 if the LZ4 library is present
# -DZSTD if the ZSTD library is present
# -DNUMA if the NUMA library is present
# -DTBB if the TBB library is present
# -DMEMKIND if the memkind library is present
#
# Using gflags in rocksdb:
# Our project depends on gflags, which requires users to take some extra steps
# before they can compile the whole repository:
# 1. Install gflags. You may download it from here:
# https://gflags.github.io/gflags/ (Mac users can `brew install gflags`)
# 2. Once installed, add the include path for gflags to your CPATH env var and
# the lib path to LIBRARY_PATH. If installed with default settings, the lib
# will be /usr/local/lib and the include path will be /usr/local/include
OUTPUT=$1
if test -z "$OUTPUT"; then
echo "usage: $0 <output-filename>" >&2
exit 1
fi
# we depend on C++17, but should be compatible with newer standards
if [ "$ROCKSDB_CXX_STANDARD" ]; then
PLATFORM_CXXFLAGS="-std=$ROCKSDB_CXX_STANDARD"
else
PLATFORM_CXXFLAGS="-std=c++17"
fi
# we currently depend on POSIX platform
COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX"
# Default to fbcode gcc on internal fb machines
if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then
FBCODE_BUILD="true"
# If we're compiling with TSAN or shared lib, we need pic build
PIC_BUILD=$COMPILE_WITH_TSAN
if [ "$LIB_MODE" == "shared" ]; then
PIC_BUILD=1
fi
source "$PWD/build_tools/fbcode_config_platform010.sh"
fi
# Delete existing output, if it exists
rm -f "$OUTPUT"
touch "$OUTPUT"
if test -z "$CC"; then
if [ -x "$(command -v cc)" ]; then
CC=cc
elif [ -x "$(command -v clang)" ]; then
CC=clang
else
CC=cc
fi
fi
if test -z "$CXX"; then
if [ -x "$(command -v g++)" ]; then
CXX=g++
elif [ -x "$(command -v clang++)" ]; then
CXX=clang++
else
CXX=g++
fi
fi
if test -z "$AR"; then
if [ -x "$(command -v gcc-ar)" ]; then
AR=gcc-ar
elif [ -x "$(command -v llvm-ar)" ]; then
AR=llvm-ar
else
AR=ar
fi
fi
# Detect OS
if test -z "$TARGET_OS"; then
TARGET_OS=`uname -s`
fi
if test -z "$TARGET_ARCHITECTURE"; then
TARGET_ARCHITECTURE=`uname -m`
fi
if test -z "$CLANG_SCAN_BUILD"; then
CLANG_SCAN_BUILD=scan-build
fi
if test -z "$CLANG_ANALYZER"; then
CLANG_ANALYZER=$(command -v clang++ 2> /dev/null)
fi
if test -z "$FIND"; then
FIND=find
fi
if test -z "$WATCH"; then
WATCH=watch
fi
COMMON_FLAGS="$COMMON_FLAGS ${CFLAGS}"
CROSS_COMPILE=
PLATFORM_CCFLAGS=
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS"
PLATFORM_SHARED_EXT="so"
PLATFORM_SHARED_LDFLAGS="-Wl,--no-as-needed -shared -Wl,-soname -Wl,"
PLATFORM_SHARED_CFLAGS="-fPIC"
PLATFORM_SHARED_VERSIONED=true
# generic port files (working on all platform by #ifdef) go directly in /port
GENERIC_PORT_FILES=`cd "$ROCKSDB_ROOT"; find port -name '*.cc' | tr "\n" " "`
# On GCC, we pick libc's memcmp over GCC's memcmp via -fno-builtin-memcmp
case "$TARGET_OS" in
Darwin)
PLATFORM=OS_MACOSX
COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX"
PLATFORM_SHARED_EXT=dylib
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
# PORT_FILES=port/darwin/darwin_specific.cc
;;
IOS)
PLATFORM=IOS
COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE "
PLATFORM_SHARED_EXT=dylib
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
CROSS_COMPILE=true
PLATFORM_SHARED_VERSIONED=
;;
Linux)
PLATFORM=OS_LINUX
COMMON_FLAGS="$COMMON_FLAGS -DOS_LINUX"
if [ -z "$USE_CLANG" ]; then
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp"
else
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic"
fi
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt -ldl"
if test -z "$ROCKSDB_USE_IO_URING"; then
ROCKSDB_USE_IO_URING=1
fi
if test "$ROCKSDB_USE_IO_URING" -ne 0; then
# check for liburing
$CXX $PLATFORM_CXXFLAGS -x c++ - -luring -o test.o 2>/dev/null <<EOF
#include <liburing.h>
int main() {
struct io_uring ring;
io_uring_queue_init(1, &ring, 0);
return 0;
}
EOF
if [ "$?" = 0 ]; then
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -luring"
COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_IOURING_PRESENT"
fi
fi
# PORT_FILES=port/linux/linux_specific.cc
;;
SunOS)
PLATFORM=OS_SOLARIS
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_SOLARIS -m64"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt -static-libstdc++ -static-libgcc -m64"
# PORT_FILES=port/sunos/sunos_specific.cc
;;
AIX)
PLATFORM=OS_AIX
CC=gcc
COMMON_FLAGS="$COMMON_FLAGS -maix64 -pthread -fno-builtin-memcmp -D_REENTRANT -DOS_AIX -D__STDC_FORMAT_MACROS"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -pthread -lpthread -lrt -maix64 -static-libstdc++ -static-libgcc"
# PORT_FILES=port/aix/aix_specific.cc
;;
FreeBSD)
PLATFORM=OS_FREEBSD
CXX=clang++
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_FREEBSD"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
# PORT_FILES=port/freebsd/freebsd_specific.cc
;;
GNU/kFreeBSD)
PLATFORM=OS_GNU_KFREEBSD
COMMON_FLAGS="$COMMON_FLAGS -DOS_GNU_KFREEBSD"
if [ -z "$USE_CLANG" ]; then
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp"
else
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic"
fi
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
# PORT_FILES=port/gnu_kfreebsd/gnu_kfreebsd_specific.cc
;;
NetBSD)
PLATFORM=OS_NETBSD
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_NETBSD"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lgcc_s"
# PORT_FILES=port/netbsd/netbsd_specific.cc
;;
OpenBSD)
PLATFORM=OS_OPENBSD
CXX=clang++
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_OPENBSD"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -pthread"
# PORT_FILES=port/openbsd/openbsd_specific.cc
FIND=gfind
WATCH=gnuwatch
;;
DragonFly)
PLATFORM=OS_DRAGONFLYBSD
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_DRAGONFLYBSD"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread"
# PORT_FILES=port/dragonfly/dragonfly_specific.cc
;;
Cygwin)
PLATFORM=CYGWIN
PLATFORM_SHARED_CFLAGS=""
PLATFORM_CXXFLAGS="-std=gnu++11"
COMMON_FLAGS="$COMMON_FLAGS -DCYGWIN"
if [ -z "$USE_CLANG" ]; then
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp"
else
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -latomic"
fi
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lpthread -lrt"
# PORT_FILES=port/linux/linux_specific.cc
;;
OS_ANDROID_CROSSCOMPILE)
PLATFORM=OS_ANDROID
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp -D_REENTRANT -DOS_ANDROID -DROCKSDB_PLATFORM_POSIX"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS " # All pthread features are in the Android C library
# PORT_FILES=port/android/android.cc
CROSS_COMPILE=true
;;
*)
echo "Unknown platform!" >&2
exit 1
esac
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}"
JAVA_LDFLAGS="$PLATFORM_LDFLAGS"
JAVA_STATIC_LDFLAGS="$PLATFORM_LDFLAGS"
JAVAC_ARGS="-source 8"
if [ "$CROSS_COMPILE" = "true" -o "$FBCODE_BUILD" = "true" ]; then
# Cross-compiling; do not try any compilation tests.
# Also don't need any compilation tests if compiling on fbcode
if [ "$FBCODE_BUILD" = "true" ]; then
# Enable backtrace on fbcode since the necessary libraries are present
COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_BACKTRACE"
FOLLY_DIR="third-party/folly"
fi
true
else
if ! test $ROCKSDB_DISABLE_FALLOCATE; then
# Test whether fallocate is available
$CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null <<EOF
#include <fcntl.h>
#include <linux/falloc.h>
int main() {
int fd = open("/dev/null", 0);
fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1024);
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_FALLOCATE_PRESENT"
fi
fi
if ! test $ROCKSDB_DISABLE_SNAPPY; then
# Test whether Snappy library is installed
# http://code.google.com/p/snappy/
$CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null <<EOF
#include <snappy.h>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DSNAPPY"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lsnappy"
JAVA_LDFLAGS="$JAVA_LDFLAGS -lsnappy"
fi
fi
if ! test $ROCKSDB_DISABLE_GFLAGS; then
# Test whether gflags library is installed
# http://gflags.github.io/gflags/
# check if the namespace is gflags
if $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null << EOF
#include <gflags/gflags.h>
using namespace GFLAGS_NAMESPACE;
int main() {}
EOF
then
COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
# check if namespace is gflags
elif $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null << EOF
#include <gflags/gflags.h>
using namespace gflags;
int main() {}
EOF
then
COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1 -DGFLAGS_NAMESPACE=gflags"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
# check if namespace is google
elif $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null << EOF
#include <gflags/gflags.h>
using namespace google;
int main() {}
EOF
then
COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1 -DGFLAGS_NAMESPACE=google"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
fi
fi
if ! test $ROCKSDB_DISABLE_ZLIB; then
# Test whether zlib library is installed
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o test.o 2>/dev/null <<EOF
#include <zlib.h>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DZLIB"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lz"
JAVA_LDFLAGS="$JAVA_LDFLAGS -lz"
fi
fi
if ! test $ROCKSDB_DISABLE_BZIP; then
# Test whether bzip library is installed
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o test.o 2>/dev/null <<EOF
#include <bzlib.h>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DBZIP2"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lbz2"
JAVA_LDFLAGS="$JAVA_LDFLAGS -lbz2"
fi
fi
if ! test $ROCKSDB_DISABLE_LZ4; then
# Test whether lz4 library is installed
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o test.o 2>/dev/null <<EOF
#include <lz4.h>
#include <lz4hc.h>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DLZ4"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -llz4"
JAVA_LDFLAGS="$JAVA_LDFLAGS -llz4"
fi
fi
if ! test $ROCKSDB_DISABLE_ZSTD; then
# Test whether zstd library is installed
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <zstd.h>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DZSTD"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lzstd"
JAVA_LDFLAGS="$JAVA_LDFLAGS -lzstd"
fi
fi
if ! test $ROCKSDB_DISABLE_NUMA; then
# Test whether numa is available
$CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o -lnuma 2>/dev/null <<EOF
#include <numa.h>
#include <numaif.h>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DNUMA"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lnuma"
JAVA_LDFLAGS="$JAVA_LDFLAGS -lnuma"
fi
fi
if ! test $ROCKSDB_DISABLE_TBB; then
# Test whether tbb is available
$CXX $PLATFORM_CXXFLAGS $LDFLAGS -x c++ - -o test.o -ltbb 2>/dev/null <<EOF
#include <tbb/tbb.h>
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DTBB"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltbb"
JAVA_LDFLAGS="$JAVA_LDFLAGS -ltbb"
fi
fi
if ! test $ROCKSDB_DISABLE_JEMALLOC; then
# Test whether jemalloc is available
if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS $LDFLAGS -x c++ - -o test.o -ljemalloc \
2>/dev/null; then
# This will enable some preprocessor identifiers in the Makefile
JEMALLOC=1
# JEMALLOC can be enabled either using the flag (like here) or by
# providing direct link to the jemalloc library
WITH_JEMALLOC_FLAG=1
# check for JEMALLOC installed with HomeBrew
if [ "$PLATFORM" == "OS_MACOSX" ]; then
if [ "$TARGET_ARCHITECTURE" = "arm64" ]; then
# on M1 Macs, homebrew installs here instead of /usr/local
JEMALLOC_PREFIX="/opt/homebrew"
else
JEMALLOC_PREFIX="/usr/local"
fi
if hash brew 2>/dev/null && brew ls --versions jemalloc > /dev/null; then
JEMALLOC_VER=$(brew ls --versions jemalloc | tail -n 1 | cut -f 2 -d ' ')
JEMALLOC_INCLUDE="-I${JEMALLOC_PREFIX}/Cellar/jemalloc/${JEMALLOC_VER}/include"
JEMALLOC_LIB="${JEMALLOC_PREFIX}/Cellar/jemalloc/${JEMALLOC_VER}/lib/libjemalloc_pic.a"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -L${JEMALLOC_PREFIX}/lib $JEMALLOC_LIB"
JAVA_LDFLAGS="$JAVA_LDFLAGS -L${JEMALLOC_PREFIX}/lib $JEMALLOC_LIB"
JAVA_STATIC_LDFLAGS="$JAVA_STATIC_LDFLAGS -L${JEMALLOC_PREFIX}/lib $JEMALLOC_LIB"
fi
fi
fi
fi
if ! test $JEMALLOC && ! test $ROCKSDB_DISABLE_TCMALLOC; then
# jemalloc is not available. Let's try tcmalloc
if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o \
-ltcmalloc 2>/dev/null; then
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -ltcmalloc"
JAVA_LDFLAGS="$JAVA_LDFLAGS -ltcmalloc"
fi
fi
if ! test $ROCKSDB_DISABLE_MALLOC_USABLE_SIZE; then
# Test whether malloc_usable_size is available
$CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null <<EOF
#include <malloc.h>
int main() {
size_t res = malloc_usable_size(0);
(void)res;
return 0;
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_MALLOC_USABLE_SIZE"
fi
fi
if ! test $ROCKSDB_DISABLE_MEMKIND; then
# Test whether memkind library is installed
$CXX $PLATFORM_CXXFLAGS $LDFLAGS -x c++ - -o test.o -lmemkind 2>/dev/null <<EOF
#include <memkind.h>
int main() {
memkind_malloc(MEMKIND_DAX_KMEM, 1024);
return 0;
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DMEMKIND"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lmemkind"
JAVA_LDFLAGS="$JAVA_LDFLAGS -lmemkind"
fi
fi
if ! test $ROCKSDB_DISABLE_PTHREAD_MUTEX_ADAPTIVE_NP; then
# Test whether PTHREAD_MUTEX_ADAPTIVE_NP mutex type is available
$CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null <<EOF
#include <pthread.h>
int main() {
int x = PTHREAD_MUTEX_ADAPTIVE_NP;
(void)x;
return 0;
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_PTHREAD_ADAPTIVE_MUTEX"
fi
fi
if ! test $ROCKSDB_DISABLE_BACKTRACE; then
# Test whether backtrace is available
$CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null <<EOF
#include <execinfo.h>
int main() {
void* frames[1];
backtrace_symbols(frames, backtrace(frames, 1));
return 0;
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_BACKTRACE"
else
# Test whether execinfo library is installed
$CXX $PLATFORM_CXXFLAGS -lexecinfo -x c++ - -o test.o 2>/dev/null <<EOF
#include <execinfo.h>
int main() {
void* frames[1];
backtrace_symbols(frames, backtrace(frames, 1));
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_BACKTRACE"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lexecinfo"
JAVA_LDFLAGS="$JAVA_LDFLAGS -lexecinfo"
fi
fi
fi
if ! test $ROCKSDB_DISABLE_PG; then
# Test if -pg is supported
$CXX $PLATFORM_CXXFLAGS -pg -x c++ - -o test.o 2>/dev/null <<EOF
int main() {
return 0;
}
EOF
if [ "$?" = 0 ]; then
PROFILING_FLAGS=-pg
fi
fi
if ! test $ROCKSDB_DISABLE_SYNC_FILE_RANGE; then
# Test whether sync_file_range is supported for compatibility with an old glibc
$CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null <<EOF
#include <fcntl.h>
int main() {
int fd = open("/dev/null", 0);
sync_file_range(fd, 0, 1024, SYNC_FILE_RANGE_WRITE);
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_RANGESYNC_PRESENT"
fi
fi
if ! test $ROCKSDB_DISABLE_SCHED_GETCPU; then
# Test whether sched_getcpu is supported
$CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null <<EOF
#include <sched.h>
int main() {
int cpuid = sched_getcpu();
(void)cpuid;
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_SCHED_GETCPU_PRESENT"
fi
fi
if ! test $ROCKSDB_DISABLE_AUXV_GETAUXVAL; then
# Test whether getauxval is supported
$CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o 2>/dev/null <<EOF
#include <sys/auxv.h>
int main() {
uint64_t auxv = getauxval(AT_HWCAP);
(void)auxv;
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DROCKSDB_AUXV_GETAUXVAL_PRESENT"
fi
fi
if ! test $ROCKSDB_DISABLE_ALIGNED_NEW; then
# Test whether c++17 aligned-new is supported
$CXX $PLATFORM_CXXFLAGS -faligned-new -x c++ - -o test.o 2>/dev/null <<EOF
struct alignas(1024) t {int a;};
int main() {}
EOF
if [ "$?" = 0 ]; then
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS -faligned-new -DHAVE_ALIGNED_NEW"
fi
fi
if ! test $ROCKSDB_DISABLE_BENCHMARK; then
# Test whether google benchmark is available
$CXX $PLATFORM_CXXFLAGS -x c++ - -o /dev/null -lbenchmark -lpthread 2>/dev/null <<EOF
#include <benchmark/benchmark.h>
int main() {}
EOF
if [ "$?" = 0 ]; then
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lbenchmark"
fi
fi
if test $USE_FOLLY; then
# Test whether libfolly library is installed
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <folly/synchronization/DistributedMutex.h>
int main() {}
EOF
if [ "$?" != 0 ]; then
FOLLY_DIR="./third-party/folly"
fi
fi
fi
# TODO(tec): Fix -Wshorten-64-to-32 errors on FreeBSD and enable the warning.
# -Wshorten-64-to-32 breaks compilation on FreeBSD aarch64 and i386
if ! { [ "$TARGET_OS" = FreeBSD ] && [ "$TARGET_ARCHITECTURE" = arm64 -o "$TARGET_ARCHITECTURE" = i386 ]; }; then
# Test whether -Wshorten-64-to-32 is available
$CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o -Wshorten-64-to-32 2>/dev/null <<EOF
int main() {}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -Wshorten-64-to-32"
fi
fi
if [ "$PORTABLE" == "" ] || [ "$PORTABLE" == 0 ]; then
if test -n "`echo $TARGET_ARCHITECTURE | grep ^ppc64`"; then
# Tune for this POWER processor, treating '+' models as base models
POWER=`LD_SHOW_AUXV=1 /bin/true | grep AT_PLATFORM | grep -E -o power[0-9]+`
COMMON_FLAGS="$COMMON_FLAGS -mcpu=$POWER -mtune=$POWER "
elif test -n "`echo $TARGET_ARCHITECTURE | grep -e^arm -e^aarch64`"; then
# TODO: Handle this with approprite options.
COMMON_FLAGS="$COMMON_FLAGS"
elif test -n "`echo $TARGET_ARCHITECTURE | grep ^aarch64`"; then
COMMON_FLAGS="$COMMON_FLAGS"
elif test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then
if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS -x c++ \
-march=native - -o /dev/null 2>/dev/null; then
COMMON_FLAGS="$COMMON_FLAGS -march=native "
else
COMMON_FLAGS="$COMMON_FLAGS -march=z196 "
fi
COMMON_FLAGS="$COMMON_FLAGS"
elif test -n "`echo $TARGET_ARCHITECTURE | grep ^riscv64`"; then
RISC_ISA=$(cat /proc/cpuinfo | grep isa | head -1 | cut --delimiter=: -f 2 | cut -b 2-)
COMMON_FLAGS="$COMMON_FLAGS -march=${RISC_ISA}"
elif [ "$TARGET_OS" == "IOS" ]; then
COMMON_FLAGS="$COMMON_FLAGS"
else
COMMON_FLAGS="$COMMON_FLAGS -march=native "
fi
else
# PORTABLE specified
if [ "$PORTABLE" == 1 ]; then
if test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then
COMMON_FLAGS="$COMMON_FLAGS -march=z196 "
elif test -n "`echo $TARGET_ARCHITECTURE | grep ^riscv64`"; then
RISC_ISA=$(cat /proc/cpuinfo | grep isa | head -1 | cut --delimiter=: -f 2 | cut -b 2-)
COMMON_FLAGS="$COMMON_FLAGS -march=${RISC_ISA}"
elif test "$USE_SSE"; then
# USE_SSE is DEPRECATED
# This is a rough approximation of the old USE_SSE behavior
COMMON_FLAGS="$COMMON_FLAGS -march=haswell"
fi
# Other than those cases, not setting -march= here.
else
# Assume PORTABLE is a minimum assumed cpu type, e.g. PORTABLE=haswell
COMMON_FLAGS="$COMMON_FLAGS -march=${PORTABLE}"
fi
if [[ "${PLATFORM}" == "OS_MACOSX" ]]; then
# For portability compile for macOS 10.14 or newer
COMMON_FLAGS="$COMMON_FLAGS -mmacosx-version-min=10.14"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -mmacosx-version-min=10.14"
# -mmacosx-version-min must come first here.
PLATFORM_SHARED_LDFLAGS="-mmacosx-version-min=10.14 $PLATFORM_SHARED_LDFLAGS"
PLATFORM_CMAKE_FLAGS="-DCMAKE_OSX_DEPLOYMENT_TARGET=10.14"
JAVA_STATIC_DEPS_COMMON_FLAGS="-mmacosx-version-min=10.14"
JAVA_STATIC_DEPS_LDFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS"
JAVA_STATIC_DEPS_CCFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS"
JAVA_STATIC_DEPS_CXXFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS"
fi
fi
if test -n "`echo $TARGET_ARCHITECTURE | grep ^ppc64`"; then
# check for GNU libc on ppc64
$CXX -x c++ - -o /dev/null 2>/dev/null <<EOF
#include <stdio.h>
#include <stdlib.h>
#include <gnu/libc-version.h>
int main(int argc, char *argv[]) {
printf("GNU libc version: %s\n", gnu_get_libc_version());
return 0;
}
EOF
if [ "$?" != 0 ]; then
PPC_LIBC_IS_GNU=0
fi
fi
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o test.o 2>/dev/null <<EOF
#include <cstdint>
int main() {
uint64_t a = 0xffffFFFFffffFFFF;
__uint128_t b = __uint128_t(a) * a;
a = static_cast<uint64_t>(b >> 64);
(void)a;
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DHAVE_UINT128_EXTENSION"
fi
if [ "$FBCODE_BUILD" != "true" -a "$PLATFORM" = OS_LINUX ]; then
$CXX $COMMON_FLAGS $PLATFORM_SHARED_CFLAGS -x c++ -c - -o test_dl.o 2>/dev/null <<EOF
void dummy_func() {}
EOF
if [ "$?" = 0 ]; then
$CXX $COMMON_FLAGS $PLATFORM_SHARED_LDFLAGS test_dl.o -o test.o 2>/dev/null
if [ "$?" = 0 ]; then
EXEC_LDFLAGS+="-ldl"
rm -f test_dl.o
fi
fi
fi
# check for F_FULLFSYNC
$CXX $PLATFORM_CXXFALGS -x c++ - -o test.o 2>/dev/null <<EOF
#include <fcntl.h>
int main() {
fcntl(0, F_FULLFSYNC);
return 0;
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS -DHAVE_FULLFSYNC"
fi
rm -f test.o test_dl.o
# Get the path for the folly installation dir
if [ "$USE_FOLLY" ]; then
if [ "$FOLLY_DIR" ]; then
FOLLY_PATH=`cd $FOLLY_DIR && $PYTHON build/fbcode_builder/getdeps.py show-inst-dir folly`
fi
fi
PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS"
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS"
VALGRIND_VER="$VALGRIND_VER"
ROCKSDB_MAJOR=`build_tools/version.sh major`
ROCKSDB_MINOR=`build_tools/version.sh minor`
ROCKSDB_PATCH=`build_tools/version.sh patch`
echo "CC=$CC" >> "$OUTPUT"
echo "CXX=$CXX" >> "$OUTPUT"
echo "AR=$AR" >> "$OUTPUT"
echo "PLATFORM=$PLATFORM" >> "$OUTPUT"
echo "PLATFORM_LDFLAGS=$PLATFORM_LDFLAGS" >> "$OUTPUT"
echo "PLATFORM_CMAKE_FLAGS=$PLATFORM_CMAKE_FLAGS" >> "$OUTPUT"
echo "JAVA_LDFLAGS=$JAVA_LDFLAGS" >> "$OUTPUT"
echo "JAVA_STATIC_LDFLAGS=$JAVA_STATIC_LDFLAGS" >> "$OUTPUT"
echo "JAVA_STATIC_DEPS_CCFLAGS=$JAVA_STATIC_DEPS_CCFLAGS" >> "$OUTPUT"
echo "JAVA_STATIC_DEPS_CXXFLAGS=$JAVA_STATIC_DEPS_CXXFLAGS" >> "$OUTPUT"
echo "JAVA_STATIC_DEPS_LDFLAGS=$JAVA_STATIC_DEPS_LDFLAGS" >> "$OUTPUT"
echo "JAVAC_ARGS=$JAVAC_ARGS" >> "$OUTPUT"
echo "VALGRIND_VER=$VALGRIND_VER" >> "$OUTPUT"
echo "PLATFORM_CCFLAGS=$PLATFORM_CCFLAGS" >> "$OUTPUT"
echo "PLATFORM_CXXFLAGS=$PLATFORM_CXXFLAGS" >> "$OUTPUT"
echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> "$OUTPUT"
echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> "$OUTPUT"
echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> "$OUTPUT"
echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> "$OUTPUT"
echo "EXEC_LDFLAGS=$EXEC_LDFLAGS" >> "$OUTPUT"
echo "JEMALLOC_INCLUDE=$JEMALLOC_INCLUDE" >> "$OUTPUT"
echo "JEMALLOC_LIB=$JEMALLOC_LIB" >> "$OUTPUT"
echo "ROCKSDB_MAJOR=$ROCKSDB_MAJOR" >> "$OUTPUT"
echo "ROCKSDB_MINOR=$ROCKSDB_MINOR" >> "$OUTPUT"
echo "ROCKSDB_PATCH=$ROCKSDB_PATCH" >> "$OUTPUT"
echo "CLANG_SCAN_BUILD=$CLANG_SCAN_BUILD" >> "$OUTPUT"
echo "CLANG_ANALYZER=$CLANG_ANALYZER" >> "$OUTPUT"
echo "PROFILING_FLAGS=$PROFILING_FLAGS" >> "$OUTPUT"
echo "FIND=$FIND" >> "$OUTPUT"
echo "WATCH=$WATCH" >> "$OUTPUT"
echo "FOLLY_PATH=$FOLLY_PATH" >> "$OUTPUT"
# This will enable some related identifiers for the preprocessor
if test -n "$JEMALLOC"; then
echo "JEMALLOC=1" >> "$OUTPUT"
fi
# Indicates that jemalloc should be enabled using -ljemalloc flag
# The alternative is to porvide a direct link to the library via JEMALLOC_LIB
# and JEMALLOC_INCLUDE
if test -n "$WITH_JEMALLOC_FLAG"; then
echo "WITH_JEMALLOC_FLAG=$WITH_JEMALLOC_FLAG" >> "$OUTPUT"
fi
echo "LUA_PATH=$LUA_PATH" >> "$OUTPUT"
if test -n "$USE_FOLLY"; then
echo "USE_FOLLY=$USE_FOLLY" >> "$OUTPUT"
fi
if test -n "$PPC_LIBC_IS_GNU"; then
echo "PPC_LIBC_IS_GNU=$PPC_LIBC_IS_GNU" >> "$OUTPUT"
fi

@ -0,0 +1,48 @@
#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#
# Check for some simple mistakes that should prevent commit or push
BAD=""
git grep -n 'namespace rocksdb' -- '*.[ch]*'
if [ "$?" != "1" ]; then
echo "^^^^^ Do not hardcode namespace rocksdb. Use ROCKSDB_NAMESPACE"
BAD=1
fi
git grep -n -i 'nocommit' -- ':!build_tools/check-sources.sh'
if [ "$?" != "1" ]; then
echo "^^^^^ Code was not intended to be committed"
BAD=1
fi
git grep -n 'include <rocksdb/' -- ':!build_tools/check-sources.sh'
if [ "$?" != "1" ]; then
echo '^^^^^ Use double-quotes as in #include "rocksdb/something.h"'
BAD=1
fi
git grep -n 'include "include/rocksdb/' -- ':!build_tools/check-sources.sh'
if [ "$?" != "1" ]; then
echo '^^^^^ Use #include "rocksdb/something.h" instead of #include "include/rocksdb/something.h"'
BAD=1
fi
git grep -n 'using namespace' -- ':!build_tools' ':!docs' \
':!third-party/folly/folly/lang/Align.h' \
':!third-party/gtest-1.8.1/fused-src/gtest/gtest.h'
if [ "$?" != "1" ]; then
echo '^^^^ Do not use "using namespace"'
BAD=1
fi
git grep -n -P "[\x80-\xFF]" -- ':!docs' ':!*.md'
if [ "$?" != "1" ]; then
echo '^^^^ Use only ASCII characters in source files'
BAD=1
fi
if [ "$BAD" ]; then
exit 1
fi

@ -0,0 +1,22 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# The file is generated using update_dependencies.sh.
GCC_BASE=/mnt/gvfs/third-party2/gcc/e40bde78650fa91b8405a857e3f10bf336633fb0/11.x/centos7-native/886b5eb
CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/2043340983c032915adbb6f78903dc855b65aee8/12/platform010/9520e0f
LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/c00dcc6a3e4125c7e8b248e9a79c14b78ac9e0ca/11.x/platform010/5684a5a
GLIBC_BASE=/mnt/gvfs/third-party2/glibc/0b9c8e4b060eda62f3bc1c6127bbe1256697569b/2.34/platform010/f259413
SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/bc9647f7912b131315827d65cb6189c21f381d05/1.1.3/platform010/76ebdda
ZLIB_BASE=/mnt/gvfs/third-party2/zlib/a6f5f3f1d063d2d00cd02fc12f0f05fc3ab3a994/1.2.11/platform010/76ebdda
BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/09703139cfc376bd8a82642385a0e97726b28287/1.0.6/platform010/76ebdda
LZ4_BASE=/mnt/gvfs/third-party2/lz4/60220d6a5bf7722b9cc239a1368c596619b12060/1.9.1/platform010/76ebdda
ZSTD_BASE=/mnt/gvfs/third-party2/zstd/50eace8143eaaea9473deae1f3283e0049e05633/1.4.x/platform010/64091f4
GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/5d27e5919771603da06000a027b12f799e58a4f7/2.2.0/platform010/76ebdda
JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/b62912d333ef33f9760efa6219dbe3fe6abb3b0e/master/platform010/f57cc4a
NUMA_BASE=/mnt/gvfs/third-party2/numa/6b412770957aa3c8a87e5e0dcd8cc2f45f393bc0/2.0.11/platform010/76ebdda
LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/52f69816e936e147664ad717eb71a1a0e9dc973a/1.4/platform010/5074a48
TBB_BASE=/mnt/gvfs/third-party2/tbb/c9cc192099fa84c0dcd0ffeedd44a373ad6e4925/2018_U5/platform010/76ebdda
LIBURING_BASE=/mnt/gvfs/third-party2/liburing/a98e2d137007e3ebf7f33bd6f99c2c56bdaf8488/20210212/platform010/76ebdda
BENCHMARK_BASE=/mnt/gvfs/third-party2/benchmark/780c7a0f9cf0967961e69ad08e61cddd85d61821/trunk/platform010/76ebdda
KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/02d9f76aaaba580611cf75e741753c800c7fdc12/fb/platform010/da39a3e
BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/938dc3f064ef3a48c0446f5b11d788d50b3eb5ee/2.37/centos7-native/da39a3e
VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/429a6b3203eb415f1599bd15183659153129188e/3.15.0/platform010/76ebdda
LUA_BASE=/mnt/gvfs/third-party2/lua/363787fa5cac2a8aa20638909210443278fa138e/5.3.4/platform010/9079c97

@ -0,0 +1,3 @@
#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
docker run -v $PWD:/rocks -w /rocks buildpack-deps make

@ -0,0 +1,181 @@
# Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
# This source code is licensed under both the GPLv2 (found in the
# COPYING file in the root directory) and Apache 2.0 License
# (found in the LICENSE.Apache file in the root directory).
"""Filter for error messages in test output:
- Receives merged stdout/stderr from test on stdin
- Finds patterns of known error messages for test name (first argument)
- Prints those error messages to stdout
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import re
import sys
class ErrorParserBase(object):
def parse_error(self, line):
"""Parses a line of test output. If it contains an error, returns a
formatted message describing the error; otherwise, returns None.
Subclasses must override this method.
"""
raise NotImplementedError
class GTestErrorParser(ErrorParserBase):
"""A parser that remembers the last test that began running so it can print
that test's name upon detecting failure.
"""
_GTEST_NAME_PATTERN = re.compile(r"\[ RUN \] (\S+)$")
# format: '<filename or "unknown file">:<line #>: Failure'
_GTEST_FAIL_PATTERN = re.compile(r"(unknown file|\S+:\d+): Failure$")
def __init__(self):
self._last_gtest_name = "Unknown test"
def parse_error(self, line):
gtest_name_match = self._GTEST_NAME_PATTERN.match(line)
if gtest_name_match:
self._last_gtest_name = gtest_name_match.group(1)
return None
gtest_fail_match = self._GTEST_FAIL_PATTERN.match(line)
if gtest_fail_match:
return "%s failed: %s" % (self._last_gtest_name, gtest_fail_match.group(1))
return None
class MatchErrorParser(ErrorParserBase):
"""A simple parser that returns the whole line if it matches the pattern."""
def __init__(self, pattern):
self._pattern = re.compile(pattern)
def parse_error(self, line):
if self._pattern.match(line):
return line
return None
class CompilerErrorParser(MatchErrorParser):
def __init__(self):
# format (compile error):
# '<filename>:<line #>:<column #>: error: <error msg>'
# format (link error):
# '<filename>:<line #>: error: <error msg>'
# The below regex catches both
super(CompilerErrorParser, self).__init__(r"\S+:\d+: error:")
class ScanBuildErrorParser(MatchErrorParser):
def __init__(self):
super(ScanBuildErrorParser, self).__init__(r"scan-build: \d+ bugs found.$")
class DbCrashErrorParser(MatchErrorParser):
def __init__(self):
super(DbCrashErrorParser, self).__init__(r"\*\*\*.*\^$|TEST FAILED.")
class WriteStressErrorParser(MatchErrorParser):
def __init__(self):
super(WriteStressErrorParser, self).__init__(
r"ERROR: write_stress died with exitcode=\d+"
)
class AsanErrorParser(MatchErrorParser):
def __init__(self):
super(AsanErrorParser, self).__init__(r"==\d+==ERROR: AddressSanitizer:")
class UbsanErrorParser(MatchErrorParser):
def __init__(self):
# format: '<filename>:<line #>:<column #>: runtime error: <error msg>'
super(UbsanErrorParser, self).__init__(r"\S+:\d+:\d+: runtime error:")
class ValgrindErrorParser(MatchErrorParser):
def __init__(self):
# just grab the summary, valgrind doesn't clearly distinguish errors
# from other log messages.
super(ValgrindErrorParser, self).__init__(r"==\d+== ERROR SUMMARY:")
class CompatErrorParser(MatchErrorParser):
def __init__(self):
super(CompatErrorParser, self).__init__(r"==== .*[Ee]rror.* ====$")
class TsanErrorParser(MatchErrorParser):
def __init__(self):
super(TsanErrorParser, self).__init__(r"WARNING: ThreadSanitizer:")
_TEST_NAME_TO_PARSERS = {
"punit": [CompilerErrorParser, GTestErrorParser],
"unit": [CompilerErrorParser, GTestErrorParser],
"release": [CompilerErrorParser, GTestErrorParser],
"unit_481": [CompilerErrorParser, GTestErrorParser],
"release_481": [CompilerErrorParser, GTestErrorParser],
"clang_unit": [CompilerErrorParser, GTestErrorParser],
"clang_release": [CompilerErrorParser, GTestErrorParser],
"clang_analyze": [CompilerErrorParser, ScanBuildErrorParser],
"code_cov": [CompilerErrorParser, GTestErrorParser],
"unity": [CompilerErrorParser, GTestErrorParser],
"lite": [CompilerErrorParser],
"lite_test": [CompilerErrorParser, GTestErrorParser],
"stress_crash": [CompilerErrorParser, DbCrashErrorParser],
"stress_crash_with_atomic_flush": [CompilerErrorParser, DbCrashErrorParser],
"stress_crash_with_txn": [CompilerErrorParser, DbCrashErrorParser],
"write_stress": [CompilerErrorParser, WriteStressErrorParser],
"asan": [CompilerErrorParser, GTestErrorParser, AsanErrorParser],
"asan_crash": [CompilerErrorParser, AsanErrorParser, DbCrashErrorParser],
"asan_crash_with_atomic_flush": [
CompilerErrorParser,
AsanErrorParser,
DbCrashErrorParser,
],
"asan_crash_with_txn": [CompilerErrorParser, AsanErrorParser, DbCrashErrorParser],
"ubsan": [CompilerErrorParser, GTestErrorParser, UbsanErrorParser],
"ubsan_crash": [CompilerErrorParser, UbsanErrorParser, DbCrashErrorParser],
"ubsan_crash_with_atomic_flush": [
CompilerErrorParser,
UbsanErrorParser,
DbCrashErrorParser,
],
"ubsan_crash_with_txn": [CompilerErrorParser, UbsanErrorParser, DbCrashErrorParser],
"valgrind": [CompilerErrorParser, GTestErrorParser, ValgrindErrorParser],
"tsan": [CompilerErrorParser, GTestErrorParser, TsanErrorParser],
"format_compatible": [CompilerErrorParser, CompatErrorParser],
"run_format_compatible": [CompilerErrorParser, CompatErrorParser],
"no_compression": [CompilerErrorParser, GTestErrorParser],
"run_no_compression": [CompilerErrorParser, GTestErrorParser],
"regression": [CompilerErrorParser],
"run_regression": [CompilerErrorParser],
}
def main():
if len(sys.argv) != 2:
return "Usage: %s <test name>" % sys.argv[0]
test_name = sys.argv[1]
if test_name not in _TEST_NAME_TO_PARSERS:
return "Unknown test name: %s" % test_name
error_parsers = []
for parser_cls in _TEST_NAME_TO_PARSERS[test_name]:
error_parsers.append(parser_cls())
for line in sys.stdin:
line = line.strip()
for error_parser in error_parsers:
error_msg = error_parser.parse_error(line)
if error_msg is not None:
print(error_msg)
if __name__ == "__main__":
sys.exit(main())

@ -0,0 +1,55 @@
#!/bin/sh
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# fail early
set -e
if test -z $ROCKSDB_PATH; then
ROCKSDB_PATH=~/rocksdb
fi
source $ROCKSDB_PATH/build_tools/fbcode_config4.8.1.sh
EXTRA_LDFLAGS=""
if test -z $ALLOC; then
# default
ALLOC=tcmalloc
elif [[ $ALLOC == "jemalloc" ]]; then
ALLOC=system
EXTRA_LDFLAGS+=" -Wl,--whole-archive $JEMALLOC_LIB -Wl,--no-whole-archive"
fi
# we need to force mongo to use static library, not shared
STATIC_LIB_DEP_DIR='build/static_library_dependencies'
test -d $STATIC_LIB_DEP_DIR || mkdir $STATIC_LIB_DEP_DIR
test -h $STATIC_LIB_DEP_DIR/`basename $SNAPPY_LIBS` || ln -s $SNAPPY_LIBS $STATIC_LIB_DEP_DIR
test -h $STATIC_LIB_DEP_DIR/`basename $LZ4_LIBS` || ln -s $LZ4_LIBS $STATIC_LIB_DEP_DIR
EXTRA_LDFLAGS+=" -L $STATIC_LIB_DEP_DIR"
set -x
EXTRA_CMD=""
if ! test -e version.json; then
# this is Mongo 3.0
EXTRA_CMD="--rocksdb \
--variant-dir=linux2/norm
--cxx=${CXX} \
--cc=${CC} \
--use-system-zlib" # add this line back to normal code path
# when https://jira.mongodb.org/browse/SERVER-19123 is resolved
fi
scons \
LINKFLAGS="$EXTRA_LDFLAGS $EXEC_LDFLAGS $PLATFORM_LDFLAGS" \
CCFLAGS="$CXXFLAGS -L $STATIC_LIB_DEP_DIR" \
LIBS="lz4 gcc stdc++" \
LIBPATH="$ROCKSDB_PATH" \
CPPPATH="$ROCKSDB_PATH/include" \
-j32 \
--allocator=$ALLOC \
--nostrip \
--opt=on \
--disable-minimum-compiler-version-enforcement \
--use-system-snappy \
--disable-warnings-as-errors \
$EXTRA_CMD $*

@ -0,0 +1,175 @@
#!/bin/sh
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#
# Set environment variables so that we can compile rocksdb using
# fbcode settings. It uses the latest g++ and clang compilers and also
# uses jemalloc
# Environment variables that change the behavior of this script:
# PIC_BUILD -- if true, it will only take pic versions of libraries from fbcode. libraries that don't have pic variant will not be included
BASEDIR=`dirname $BASH_SOURCE`
source "$BASEDIR/dependencies.sh"
CFLAGS=""
# libgcc
LIBGCC_INCLUDE="$LIBGCC_BASE/include"
LIBGCC_LIBS=" -L $LIBGCC_BASE/lib"
# glibc
GLIBC_INCLUDE="$GLIBC_BASE/include"
GLIBC_LIBS=" -L $GLIBC_BASE/lib"
if ! test $ROCKSDB_DISABLE_SNAPPY; then
# snappy
SNAPPY_INCLUDE=" -I $SNAPPY_BASE/include/"
if test -z $PIC_BUILD; then
SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy.a"
else
SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy_pic.a"
fi
CFLAGS+=" -DSNAPPY"
fi
if test -z $PIC_BUILD; then
if ! test $ROCKSDB_DISABLE_ZLIB; then
# location of zlib headers and libraries
ZLIB_INCLUDE=" -I $ZLIB_BASE/include/"
ZLIB_LIBS=" $ZLIB_BASE/lib/libz.a"
CFLAGS+=" -DZLIB"
fi
if ! test $ROCKSDB_DISABLE_BZIP; then
# location of bzip headers and libraries
BZIP_INCLUDE=" -I $BZIP2_BASE/include/"
BZIP_LIBS=" $BZIP2_BASE/lib/libbz2.a"
CFLAGS+=" -DBZIP2"
fi
if ! test $ROCKSDB_DISABLE_LZ4; then
LZ4_INCLUDE=" -I $LZ4_BASE/include/"
LZ4_LIBS=" $LZ4_BASE/lib/liblz4.a"
CFLAGS+=" -DLZ4"
fi
fi
if ! test $ROCKSDB_DISABLE_ZSTD; then
ZSTD_INCLUDE=" -I $ZSTD_BASE/include/"
if test -z $PIC_BUILD; then
ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd.a"
else
ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd_pic.a"
fi
CFLAGS+=" -DZSTD -DZSTD_STATIC_LINKING_ONLY"
fi
# location of gflags headers and libraries
GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/"
if test -z $PIC_BUILD; then
GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags.a"
else
GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags_pic.a"
fi
CFLAGS+=" -DGFLAGS=gflags"
# location of jemalloc
JEMALLOC_INCLUDE=" -I $JEMALLOC_BASE/include/"
JEMALLOC_LIB=" $JEMALLOC_BASE/lib/libjemalloc.a"
if test -z $PIC_BUILD; then
# location of numa
NUMA_INCLUDE=" -I $NUMA_BASE/include/"
NUMA_LIB=" $NUMA_BASE/lib/libnuma.a"
CFLAGS+=" -DNUMA"
# location of libunwind
LIBUNWIND="$LIBUNWIND_BASE/lib/libunwind.a"
fi
# location of TBB
TBB_INCLUDE=" -isystem $TBB_BASE/include/"
if test -z $PIC_BUILD; then
TBB_LIBS="$TBB_BASE/lib/libtbb.a"
else
TBB_LIBS="$TBB_BASE/lib/libtbb_pic.a"
fi
CFLAGS+=" -DTBB"
test "$USE_SSE" || USE_SSE=1
export USE_SSE
test "$PORTABLE" || PORTABLE=1
export PORTABLE
BINUTILS="$BINUTILS_BASE/bin"
AR="$BINUTILS/ar"
DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE $TBB_INCLUDE"
STDLIBS="-L $GCC_BASE/lib64"
CLANG_BIN="$CLANG_BASE/bin"
CLANG_LIB="$CLANG_BASE/lib"
CLANG_SRC="$CLANG_BASE/../../src"
CLANG_ANALYZER="$CLANG_BIN/clang++"
CLANG_SCAN_BUILD="$CLANG_SRC/llvm/tools/clang/tools/scan-build/bin/scan-build"
if [ -z "$USE_CLANG" ]; then
# gcc
CC="$GCC_BASE/bin/gcc"
CXX="$GCC_BASE/bin/g++"
AR="$GCC_BASE/bin/gcc-ar"
CFLAGS+=" -B$BINUTILS/gold"
CFLAGS+=" -isystem $GLIBC_INCLUDE"
CFLAGS+=" -isystem $LIBGCC_INCLUDE"
JEMALLOC=1
else
# clang
CLANG_INCLUDE="$CLANG_LIB/clang/stable/include"
CC="$CLANG_BIN/clang"
CXX="$CLANG_BIN/clang++"
AR="$CLANG_BIN/llvm-ar"
KERNEL_HEADERS_INCLUDE="$KERNEL_HEADERS_BASE/include"
CFLAGS+=" -B$BINUTILS/gold -nostdinc -nostdlib"
CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/5.x "
CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/5.x/x86_64-facebook-linux "
CFLAGS+=" -isystem $GLIBC_INCLUDE"
CFLAGS+=" -isystem $LIBGCC_INCLUDE"
CFLAGS+=" -isystem $CLANG_INCLUDE"
CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux "
CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE "
CFLAGS+=" -Wno-expansion-to-defined "
CXXFLAGS="-nostdinc++"
fi
CFLAGS+=" $DEPS_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT"
CXXFLAGS+=" $CFLAGS"
EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS"
EXEC_LDFLAGS+=" -B$BINUTILS/gold"
EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/gcc-5-glibc-2.23/lib/ld.so"
EXEC_LDFLAGS+=" $LIBUNWIND"
EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/gcc-5-glibc-2.23/lib"
# required by libtbb
EXEC_LDFLAGS+=" -ldl"
PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++"
EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $TBB_LIBS"
VALGRIND_VER="$VALGRIND_BASE/bin/"
LUA_PATH="$LUA_BASE"
if test -z $PIC_BUILD; then
LUA_LIB=" $LUA_PATH/lib/liblua.a"
else
LUA_LIB=" $LUA_PATH/lib/liblua_pic.a"
fi
export CC CXX AR CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB

@ -0,0 +1,175 @@
#!/bin/sh
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#
# Set environment variables so that we can compile rocksdb using
# fbcode settings. It uses the latest g++ and clang compilers and also
# uses jemalloc
# Environment variables that change the behavior of this script:
# PIC_BUILD -- if true, it will only take pic versions of libraries from fbcode. libraries that don't have pic variant will not be included
BASEDIR=`dirname $BASH_SOURCE`
source "$BASEDIR/dependencies_platform010.sh"
# Disallow using libraries from default locations as they might not be compatible with platform010 libraries.
CFLAGS=" --sysroot=/DOES/NOT/EXIST"
# libgcc
LIBGCC_INCLUDE="$LIBGCC_BASE/include/c++/trunk"
LIBGCC_LIBS=" -L $LIBGCC_BASE/lib -B$LIBGCC_BASE/lib/gcc/x86_64-facebook-linux/trunk/"
# glibc
GLIBC_INCLUDE="$GLIBC_BASE/include"
GLIBC_LIBS=" -L $GLIBC_BASE/lib"
GLIBC_LIBS+=" -B$GLIBC_BASE/lib"
if test -z $PIC_BUILD; then
MAYBE_PIC=
else
MAYBE_PIC=_pic
fi
if ! test $ROCKSDB_DISABLE_SNAPPY; then
# snappy
SNAPPY_INCLUDE=" -I $SNAPPY_BASE/include/"
SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy${MAYBE_PIC}.a"
CFLAGS+=" -DSNAPPY"
fi
if ! test $ROCKSDB_DISABLE_ZLIB; then
# location of zlib headers and libraries
ZLIB_INCLUDE=" -I $ZLIB_BASE/include/"
ZLIB_LIBS=" $ZLIB_BASE/lib/libz${MAYBE_PIC}.a"
CFLAGS+=" -DZLIB"
fi
if ! test $ROCKSDB_DISABLE_BZIP; then
# location of bzip headers and libraries
BZIP_INCLUDE=" -I $BZIP2_BASE/include/"
BZIP_LIBS=" $BZIP2_BASE/lib/libbz2${MAYBE_PIC}.a"
CFLAGS+=" -DBZIP2"
fi
if ! test $ROCKSDB_DISABLE_LZ4; then
LZ4_INCLUDE=" -I $LZ4_BASE/include/"
LZ4_LIBS=" $LZ4_BASE/lib/liblz4${MAYBE_PIC}.a"
CFLAGS+=" -DLZ4"
fi
if ! test $ROCKSDB_DISABLE_ZSTD; then
ZSTD_INCLUDE=" -I $ZSTD_BASE/include/"
ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd${MAYBE_PIC}.a"
CFLAGS+=" -DZSTD"
fi
# location of gflags headers and libraries
GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/"
GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags${MAYBE_PIC}.a"
CFLAGS+=" -DGFLAGS=gflags"
BENCHMARK_INCLUDE=" -I $BENCHMARK_BASE/include/"
BENCHMARK_LIBS=" $BENCHMARK_BASE/lib/libbenchmark${MAYBE_PIC}.a"
# location of jemalloc
JEMALLOC_INCLUDE=" -I $JEMALLOC_BASE/include/"
JEMALLOC_LIB=" $JEMALLOC_BASE/lib/libjemalloc${MAYBE_PIC}.a"
# location of numa
NUMA_INCLUDE=" -I $NUMA_BASE/include/"
NUMA_LIB=" $NUMA_BASE/lib/libnuma${MAYBE_PIC}.a"
CFLAGS+=" -DNUMA"
# location of libunwind
LIBUNWIND="$LIBUNWIND_BASE/lib/libunwind${MAYBE_PIC}.a"
# location of TBB
TBB_INCLUDE=" -isystem $TBB_BASE/include/"
TBB_LIBS="$TBB_BASE/lib/libtbb${MAYBE_PIC}.a"
CFLAGS+=" -DTBB"
# location of LIBURING
LIBURING_INCLUDE=" -isystem $LIBURING_BASE/include/"
LIBURING_LIBS="$LIBURING_BASE/lib/liburing${MAYBE_PIC}.a"
CFLAGS+=" -DLIBURING"
test "$USE_SSE" || USE_SSE=1
export USE_SSE
test "$PORTABLE" || PORTABLE=1
export PORTABLE
BINUTILS="$BINUTILS_BASE/bin"
AR="$BINUTILS/ar"
AS="$BINUTILS/as"
DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE $TBB_INCLUDE $LIBURING_INCLUDE $BENCHMARK_INCLUDE"
STDLIBS="-L $GCC_BASE/lib64"
CLANG_BIN="$CLANG_BASE/bin"
CLANG_LIB="$CLANG_BASE/lib"
CLANG_SRC="$CLANG_BASE/../../src"
CLANG_ANALYZER="$CLANG_BIN/clang++"
CLANG_SCAN_BUILD="$CLANG_SRC/llvm/clang/tools/scan-build/bin/scan-build"
if [ -z "$USE_CLANG" ]; then
# gcc
CC="$GCC_BASE/bin/gcc"
CXX="$GCC_BASE/bin/g++"
AR="$GCC_BASE/bin/gcc-ar"
CFLAGS+=" -B$BINUTILS -nostdinc -nostdlib"
CFLAGS+=" -I$GCC_BASE/include"
CFLAGS+=" -isystem $GCC_BASE/lib/gcc/x86_64-redhat-linux-gnu/11.2.1/include"
CFLAGS+=" -isystem $GCC_BASE/lib/gcc/x86_64-redhat-linux-gnu/11.2.1/install-tools/include"
CFLAGS+=" -isystem $GCC_BASE/lib/gcc/x86_64-redhat-linux-gnu/11.2.1/include-fixed/"
CFLAGS+=" -isystem $LIBGCC_INCLUDE"
CFLAGS+=" -isystem $GLIBC_INCLUDE"
CFLAGS+=" -I$GLIBC_INCLUDE"
CFLAGS+=" -I$LIBGCC_BASE/include"
CFLAGS+=" -I$LIBGCC_BASE/include/c++/11.x/"
CFLAGS+=" -I$LIBGCC_BASE/include/c++/11.x/x86_64-facebook-linux/"
CFLAGS+=" -I$LIBGCC_BASE/include/c++/11.x/backward"
CFLAGS+=" -isystem $GLIBC_INCLUDE -I$GLIBC_INCLUDE"
JEMALLOC=1
else
# clang
CLANG_INCLUDE="$CLANG_LIB/clang/stable/include"
CC="$CLANG_BIN/clang"
CXX="$CLANG_BIN/clang++"
AR="$CLANG_BIN/llvm-ar"
CFLAGS+=" -B$BINUTILS -nostdinc -nostdlib"
CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/trunk "
CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/trunk/x86_64-facebook-linux "
CFLAGS+=" -isystem $GLIBC_INCLUDE"
CFLAGS+=" -isystem $LIBGCC_INCLUDE"
CFLAGS+=" -isystem $CLANG_INCLUDE"
CFLAGS+=" -Wno-expansion-to-defined "
CXXFLAGS="-nostdinc++"
fi
KERNEL_HEADERS_INCLUDE="$KERNEL_HEADERS_BASE/include"
CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux "
CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE "
CFLAGS+=" $DEPS_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DROCKSDB_IOURING_PRESENT"
CXXFLAGS+=" $CFLAGS"
EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS $LIBURING_LIBS $BENCHMARK_LIBS"
EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/platform010/lib/ld.so"
EXEC_LDFLAGS+=" $LIBUNWIND"
EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/platform010/lib"
EXEC_LDFLAGS+=" -Wl,-rpath=$GCC_BASE/lib64"
# required by libtbb
EXEC_LDFLAGS+=" -ldl"
PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++"
PLATFORM_LDFLAGS+=" -B$BINUTILS"
EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $TBB_LIBS $LIBURING_LIBS $BENCHMARK_LIBS"
VALGRIND_VER="$VALGRIND_BASE/bin/"
export CC CXX AR AS CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB

@ -0,0 +1,203 @@
#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# If clang_format_diff.py command is not specfied, we assume we are able to
# access directly without any path.
print_usage () {
echo "Usage:"
echo "format-diff.sh [OPTIONS]"
echo "-c: check only."
echo "-h: print this message."
}
while getopts ':ch' OPTION; do
case "$OPTION" in
c)
CHECK_ONLY=1
;;
h)
print_usage
exit 1
;;
?)
print_usage
exit 1
;;
esac
done
REPO_ROOT="$(git rev-parse --show-toplevel)"
if [ "$CLANG_FORMAT_DIFF" ]; then
echo "Note: CLANG_FORMAT_DIFF='$CLANG_FORMAT_DIFF'"
# Dry run to confirm dependencies like argparse
if $CLANG_FORMAT_DIFF --help >/dev/null < /dev/null; then
true #Good
else
exit 128
fi
else
# First try directly executing the possibilities
if clang-format-diff --help &> /dev/null < /dev/null; then
CLANG_FORMAT_DIFF=clang-format-diff
elif clang-format-diff.py --help &> /dev/null < /dev/null; then
CLANG_FORMAT_DIFF=clang-format-diff.py
elif $REPO_ROOT/clang-format-diff.py --help &> /dev/null < /dev/null; then
CLANG_FORMAT_DIFF=$REPO_ROOT/clang-format-diff.py
else
# This probably means we need to directly invoke the interpreter.
# But first find clang-format-diff.py
if [ -f "$REPO_ROOT/clang-format-diff.py" ]; then
CFD_PATH="$REPO_ROOT/clang-format-diff.py"
elif which clang-format-diff.py &> /dev/null; then
CFD_PATH="$(which clang-format-diff.py)"
else
echo "You didn't have clang-format-diff.py and/or clang-format available in your computer!"
echo "You can download clang-format-diff.py by running: "
echo " curl --location https://raw.githubusercontent.com/llvm/llvm-project/main/clang/tools/clang-format/clang-format-diff.py -o ${REPO_ROOT}/clang-format-diff.py"
echo "You should make sure the downloaded script is not compromised."
echo "You can download clang-format by running:"
echo " brew install clang-format"
echo " Or"
echo " apt install clang-format"
echo " This might work too:"
echo " yum install git-clang-format"
echo "Then make sure clang-format is available and executable from \$PATH:"
echo " clang-format --version"
exit 128
fi
# Check argparse pre-req on interpreter, or it will fail
if echo import argparse | ${PYTHON:-python3}; then
true # Good
else
echo "To run clang-format-diff.py, we'll need the library "argparse" to be"
echo "installed. You can try either of the follow ways to install it:"
echo " 1. Manually download argparse: https://pypi.python.org/pypi/argparse"
echo " 2. easy_install argparse (if you have easy_install)"
echo " 3. pip install argparse (if you have pip)"
exit 129
fi
# Unfortunately, some machines have a Python2 clang-format-diff.py
# installed but only a Python3 interpreter installed. Unfortunately,
# automatic 2to3 migration is insufficient, so suggest downloading latest.
if grep -q "print '" "$CFD_PATH" && \
${PYTHON:-python3} --version | grep -q 'ython 3'; then
echo "You have clang-format-diff.py for Python 2 but are using a Python 3"
echo "interpreter (${PYTHON:-python3})."
echo "You can download clang-format-diff.py for Python 3 by running: "
echo " curl --location https://raw.githubusercontent.com/llvm/llvm-project/main/clang/tools/clang-format/clang-format-diff.py -o ${REPO_ROOT}/clang-format-diff.py"
echo "You should make sure the downloaded script is not compromised."
exit 130
fi
CLANG_FORMAT_DIFF="${PYTHON:-python3} $CFD_PATH"
# This had better work after all those checks
if $CLANG_FORMAT_DIFF --help >/dev/null < /dev/null; then
true #Good
else
exit 128
fi
fi
fi
# TODO(kailiu) following work is not complete since we still need to figure
# out how to add the modified files done pre-commit hook to git's commit index.
#
# Check if this script has already been added to pre-commit hook.
# Will suggest user to add this script to pre-commit hook if their pre-commit
# is empty.
# PRE_COMMIT_SCRIPT_PATH="`git rev-parse --show-toplevel`/.git/hooks/pre-commit"
# if ! ls $PRE_COMMIT_SCRIPT_PATH &> /dev/null
# then
# echo "Would you like to add this script to pre-commit hook, which will do "
# echo -n "the format check for all the affected lines before you check in (y/n):"
# read add_to_hook
# if [ "$add_to_hook" == "y" ]
# then
# ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH
# fi
# fi
set -e
uncommitted_code=`git diff HEAD`
# If there's no uncommitted changes, we assume user are doing post-commit
# format check, in which case we'll try to check the modified lines vs. the
# facebook/rocksdb.git main branch. Otherwise, we'll check format of the
# uncommitted code only.
if [ -z "$uncommitted_code" ]
then
# Attempt to get name of facebook/rocksdb.git remote.
[ "$FORMAT_REMOTE" ] || FORMAT_REMOTE="$(LC_ALL=POSIX LANG=POSIX git remote -v | grep 'facebook/rocksdb.git' | head -n 1 | cut -f 1)"
# Fall back on 'origin' if that fails
[ "$FORMAT_REMOTE" ] || FORMAT_REMOTE=origin
# Use main branch from that remote
[ "$FORMAT_UPSTREAM" ] || FORMAT_UPSTREAM="$FORMAT_REMOTE/$(LC_ALL=POSIX LANG=POSIX git remote show $FORMAT_REMOTE | sed -n '/HEAD branch/s/.*: //p')"
# Get the common ancestor with that remote branch. Everything after that
# common ancestor would be considered the contents of a pull request, so
# should be relevant for formatting fixes.
FORMAT_UPSTREAM_MERGE_BASE="$(git merge-base "$FORMAT_UPSTREAM" HEAD)"
# Get the differences
diffs=$(git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" | $CLANG_FORMAT_DIFF -p 1)
echo "Checking format of changes not yet in $FORMAT_UPSTREAM..."
else
# Check the format of uncommitted lines,
diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1)
echo "Checking format of uncommitted changes..."
fi
if [ -z "$diffs" ]
then
echo "Nothing needs to be reformatted!"
exit 0
elif [ $CHECK_ONLY ]
then
echo "Your change has unformatted code. Please run make format!"
if [ $VERBOSE_CHECK ]; then
clang-format --version
echo "$diffs"
fi
exit 1
fi
# Highlight the insertion/deletion from the clang-format-diff.py's output
COLOR_END="\033[0m"
COLOR_RED="\033[0;31m"
COLOR_GREEN="\033[0;32m"
echo -e "Detect lines that doesn't follow the format rules:\r"
# Add the color to the diff. lines added will be green; lines removed will be red.
echo "$diffs" |
sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" |
sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/"
echo -e "Would you like to fix the format automatically (y/n): \c"
# Make sure under any mode, we can read user input.
exec < /dev/tty
read to_fix
if [ "$to_fix" != "y" ]
then
exit 1
fi
# Do in-place format adjustment.
if [ -z "$uncommitted_code" ]
then
git diff -U0 "$FORMAT_UPSTREAM_MERGE_BASE" | $CLANG_FORMAT_DIFF -i -p 1
else
git diff -U0 HEAD | $CLANG_FORMAT_DIFF -i -p 1
fi
echo "Files reformatted!"
# Amend to last commit if user do the post-commit format check
if [ -z "$uncommitted_code" ]; then
echo -e "Would you like to amend the changes to last commit (`git log HEAD --oneline | head -1`)? (y/n): \c"
read to_amend
if [ "$to_amend" == "y" ]
then
git commit -a --amend --reuse-message HEAD
echo "Amended to last commit"
fi
fi

File diff suppressed because it is too large Load Diff

@ -0,0 +1,129 @@
# shellcheck disable=SC1113
#/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
set -e
function log() {
echo "[+] $1"
}
function fatal() {
echo "[!] $1"
exit 1
}
function platform() {
local __resultvar=$1
if [[ -f "/etc/yum.conf" ]]; then
eval $__resultvar="centos"
elif [[ -f "/etc/dpkg/dpkg.cfg" ]]; then
eval $__resultvar="ubuntu"
else
fatal "Unknwon operating system"
fi
}
platform OS
function package() {
if [[ $OS = "ubuntu" ]]; then
if dpkg --get-selections | grep --quiet $1; then
log "$1 is already installed. skipping."
else
# shellcheck disable=SC2068
apt-get install $@ -y
fi
elif [[ $OS = "centos" ]]; then
if rpm -qa | grep --quiet $1; then
log "$1 is already installed. skipping."
else
# shellcheck disable=SC2068
yum install $@ -y
fi
fi
}
function detect_fpm_output() {
if [[ $OS = "ubuntu" ]]; then
export FPM_OUTPUT=deb
elif [[ $OS = "centos" ]]; then
export FPM_OUTPUT=rpm
fi
}
detect_fpm_output
function gem_install() {
if gem list | grep --quiet $1; then
log "$1 is already installed. skipping."
else
# shellcheck disable=SC2068
gem install $@
fi
}
function main() {
if [[ $# -ne 1 ]]; then
fatal "Usage: $0 <rocksdb_version>"
else
log "using rocksdb version: $1"
fi
if [[ -d /vagrant ]]; then
if [[ $OS = "ubuntu" ]]; then
package g++-4.8
export CXX=g++-4.8
# the deb would depend on libgflags2, but the static lib is the only thing
# installed by make install
package libgflags-dev
package ruby-all-dev
elif [[ $OS = "centos" ]]; then
pushd /etc/yum.repos.d
if [[ ! -f /etc/yum.repos.d/devtools-1.1.repo ]]; then
wget http://people.centos.org/tru/devtools-1.1/devtools-1.1.repo
fi
package devtoolset-1.1-gcc --enablerepo=testing-1.1-devtools-6
package devtoolset-1.1-gcc-c++ --enablerepo=testing-1.1-devtools-6
export CC=/opt/centos/devtoolset-1.1/root/usr/bin/gcc
export CPP=/opt/centos/devtoolset-1.1/root/usr/bin/cpp
export CXX=/opt/centos/devtoolset-1.1/root/usr/bin/c++
export PATH=$PATH:/opt/centos/devtoolset-1.1/root/usr/bin
popd
if ! rpm -qa | grep --quiet gflags; then
rpm -i https://github.com/schuhschuh/gflags/releases/download/v2.1.0/gflags-devel-2.1.0-1.amd64.rpm
fi
package ruby
package ruby-devel
package rubygems
package rpm-build
fi
fi
gem_install fpm
make static_lib
LIBDIR=/usr/lib
if [[ $FPM_OUTPUT = "rpm" ]]; then
LIBDIR=$(rpm --eval '%_libdir')
fi
rm -rf package
make install DESTDIR=package PREFIX=/usr LIBDIR=$LIBDIR
fpm \
-s dir \
-t $FPM_OUTPUT \
-C package \
-n rocksdb \
-v $1 \
--url http://rocksdb.org/ \
-m rocksdb@fb.com \
--license BSD \
--vendor Facebook \
--description "RocksDB is an embeddable persistent key-value store for fast storage." \
usr
}
# shellcheck disable=SC2068
main $@

@ -0,0 +1,38 @@
#!/usr/bin/env perl
use strict;
open(my $ps, "-|", "ps -wwf");
my $cols_known = 0;
my $cmd_col = 0;
my $pid_col = 0;
while (<$ps>) {
print;
my @cols = split(/\s+/);
if (!$cols_known && /CMD/) {
# Parse relevant ps column headers
for (my $i = 0; $i <= $#cols; $i++) {
if ($cols[$i] eq "CMD") {
$cmd_col = $i;
}
if ($cols[$i] eq "PID") {
$pid_col = $i;
}
}
$cols_known = 1;
} else {
my $pid = $cols[$pid_col];
my $cmd = $cols[$cmd_col];
# Match numeric PID and relative path command
# -> The intention is only to dump stack traces for hangs in code under
# test, which means we probably just built it and are executing by
# relative path (e.g. ./my_test or foo/bar_test) rather then by absolute
# path (e.g. /usr/bin/time) or PATH search (e.g. grep).
if ($pid =~ /^[0-9]+$/ && $cmd =~ /^[^\/ ]+[\/]/) {
print "Dumping stacks for $pid...\n";
system("pstack $pid || gdb -batch -p $pid -ex 'thread apply all bt'");
}
}
}
close $ps;

@ -0,0 +1,396 @@
#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
set -e
NUM=10000000
if [ $# -eq 1 ];then
DATA_DIR=$1
elif [ $# -eq 2 ];then
DATA_DIR=$1
STAT_FILE=$2
fi
# On the production build servers, set data and stat
# files/directories not in /tmp or else the tempdir cleaning
# scripts will make you very unhappy.
DATA_DIR=${DATA_DIR:-$(mktemp -t -d rocksdb_XXXX)}
STAT_FILE=${STAT_FILE:-$(mktemp -t -u rocksdb_test_stats_XXXX)}
function cleanup {
rm -rf $DATA_DIR
rm -f $STAT_FILE.*
}
trap cleanup EXIT
make release
# measure fillseq + fill up the DB for overwrite benchmark
./db_bench \
--benchmarks=fillseq \
--db=$DATA_DIR \
--use_existing_db=0 \
--bloom_bits=10 \
--num=$NUM \
--writes=$NUM \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_wal=1 \
--sync=0 > ${STAT_FILE}.fillseq
# measure overwrite performance
./db_bench \
--benchmarks=overwrite \
--db=$DATA_DIR \
--use_existing_db=1 \
--bloom_bits=10 \
--num=$NUM \
--writes=$((NUM / 10)) \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_wal=1 \
--sync=0 \
--threads=8 > ${STAT_FILE}.overwrite
# fill up the db for readrandom benchmark (1GB total size)
./db_bench \
--benchmarks=fillseq \
--db=$DATA_DIR \
--use_existing_db=0 \
--bloom_bits=10 \
--num=$NUM \
--writes=$NUM \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_wal=1 \
--sync=0 \
--threads=1 > /dev/null
# measure readrandom with 6GB block cache
./db_bench \
--benchmarks=readrandom \
--db=$DATA_DIR \
--use_existing_db=1 \
--bloom_bits=10 \
--num=$NUM \
--reads=$((NUM / 5)) \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_wal=1 \
--sync=0 \
--threads=16 > ${STAT_FILE}.readrandom
# measure readrandom with 6GB block cache and tailing iterator
./db_bench \
--benchmarks=readrandom \
--db=$DATA_DIR \
--use_existing_db=1 \
--bloom_bits=10 \
--num=$NUM \
--reads=$((NUM / 5)) \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--open_files=55000 \
--use_tailing_iterator=1 \
--statistics=1 \
--histogram=1 \
--disable_wal=1 \
--sync=0 \
--threads=16 > ${STAT_FILE}.readrandomtailing
# measure readrandom with 100MB block cache
./db_bench \
--benchmarks=readrandom \
--db=$DATA_DIR \
--use_existing_db=1 \
--bloom_bits=10 \
--num=$NUM \
--reads=$((NUM / 5)) \
--cache_size=104857600 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_wal=1 \
--sync=0 \
--threads=16 > ${STAT_FILE}.readrandomsmallblockcache
# measure readrandom with 8k data in memtable
./db_bench \
--benchmarks=overwrite,readrandom \
--db=$DATA_DIR \
--use_existing_db=1 \
--bloom_bits=10 \
--num=$NUM \
--reads=$((NUM / 5)) \
--writes=512 \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--write_buffer_size=1000000000 \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_wal=1 \
--sync=0 \
--threads=16 > ${STAT_FILE}.readrandom_mem_sst
# fill up the db for readrandom benchmark with filluniquerandom (1GB total size)
./db_bench \
--benchmarks=filluniquerandom \
--db=$DATA_DIR \
--use_existing_db=0 \
--bloom_bits=10 \
--num=$((NUM / 4)) \
--writes=$((NUM / 4)) \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_wal=1 \
--sync=0 \
--threads=1 > /dev/null
# dummy test just to compact the data
./db_bench \
--benchmarks=readrandom \
--db=$DATA_DIR \
--use_existing_db=1 \
--bloom_bits=10 \
--num=$((NUM / 1000)) \
--reads=$((NUM / 1000)) \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_wal=1 \
--sync=0 \
--threads=16 > /dev/null
# measure readrandom after load with filluniquerandom with 6GB block cache
./db_bench \
--benchmarks=readrandom \
--db=$DATA_DIR \
--use_existing_db=1 \
--bloom_bits=10 \
--num=$((NUM / 4)) \
--reads=$((NUM / 4)) \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--open_files=55000 \
--disable_auto_compactions=1 \
--statistics=1 \
--histogram=1 \
--disable_wal=1 \
--sync=0 \
--threads=16 > ${STAT_FILE}.readrandom_filluniquerandom
# measure readwhilewriting after load with filluniquerandom with 6GB block cache
./db_bench \
--benchmarks=readwhilewriting \
--db=$DATA_DIR \
--use_existing_db=1 \
--bloom_bits=10 \
--num=$((NUM / 4)) \
--reads=$((NUM / 4)) \
--benchmark_write_rate_limit=$(( 110 * 1024 )) \
--write_buffer_size=100000000 \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_wal=1 \
--sync=0 \
--threads=16 > ${STAT_FILE}.readwhilewriting
# measure memtable performance -- none of the data gets flushed to disk
./db_bench \
--benchmarks=fillrandom,readrandom, \
--db=$DATA_DIR \
--use_existing_db=0 \
--num=$((NUM / 10)) \
--reads=$NUM \
--cache_size=6442450944 \
--cache_numshardbits=6 \
--table_cache_numshardbits=4 \
--write_buffer_size=1000000000 \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_wal=1 \
--sync=0 \
--value_size=10 \
--threads=16 > ${STAT_FILE}.memtablefillreadrandom
common_in_mem_args="--db=/dev/shm/rocksdb \
--num_levels=6 \
--key_size=20 \
--prefix_size=12 \
--keys_per_prefix=10 \
--value_size=100 \
--compression_type=none \
--compression_ratio=1 \
--write_buffer_size=134217728 \
--max_write_buffer_number=4 \
--level0_file_num_compaction_trigger=8 \
--level0_slowdown_writes_trigger=16 \
--level0_stop_writes_trigger=24 \
--target_file_size_base=134217728 \
--max_bytes_for_level_base=1073741824 \
--disable_wal=0 \
--wal_dir=/dev/shm/rocksdb \
--sync=0 \
--verify_checksum=1 \
--delete_obsolete_files_period_micros=314572800 \
--use_plain_table=1 \
--open_files=-1 \
--mmap_read=1 \
--mmap_write=0 \
--bloom_bits=10 \
--bloom_locality=1 \
--perf_level=0"
# prepare a in-memory DB with 50M keys, total DB size is ~6G
./db_bench \
$common_in_mem_args \
--statistics=0 \
--max_background_compactions=16 \
--max_background_flushes=16 \
--benchmarks=filluniquerandom \
--use_existing_db=0 \
--num=52428800 \
--threads=1 > /dev/null
# Readwhilewriting
./db_bench \
$common_in_mem_args \
--statistics=1 \
--max_background_compactions=4 \
--max_background_flushes=0 \
--benchmarks=readwhilewriting\
--use_existing_db=1 \
--duration=600 \
--threads=32 \
--benchmark_write_rate_limit=9502720 > ${STAT_FILE}.readwhilewriting_in_ram
# Seekrandomwhilewriting
./db_bench \
$common_in_mem_args \
--statistics=1 \
--max_background_compactions=4 \
--max_background_flushes=0 \
--benchmarks=seekrandomwhilewriting \
--use_existing_db=1 \
--use_tailing_iterator=1 \
--duration=600 \
--threads=32 \
--benchmark_write_rate_limit=9502720 > ${STAT_FILE}.seekwhilewriting_in_ram
# measure fillseq with bunch of column families
./db_bench \
--benchmarks=fillseq \
--num_column_families=500 \
--write_buffer_size=1048576 \
--db=$DATA_DIR \
--use_existing_db=0 \
--num=$NUM \
--writes=$NUM \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_wal=1 \
--sync=0 > ${STAT_FILE}.fillseq_lots_column_families
# measure overwrite performance with bunch of column families
./db_bench \
--benchmarks=overwrite \
--num_column_families=500 \
--write_buffer_size=1048576 \
--db=$DATA_DIR \
--use_existing_db=1 \
--num=$NUM \
--writes=$((NUM / 10)) \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_wal=1 \
--sync=0 \
--threads=8 > ${STAT_FILE}.overwrite_lots_column_families
# send data to ods
function send_to_ods {
key="$1"
value="$2"
if [ -z $JENKINS_HOME ]; then
# running on devbox, just print out the values
echo $1 $2
return
fi
if [ -z "$value" ];then
echo >&2 "ERROR: Key $key doesn't have a value."
return
fi
curl --silent "https://www.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=$key&value=$value" \
--connect-timeout 60
}
function send_benchmark_to_ods {
bench="$1"
bench_key="$2"
file="$3"
QPS=$(grep $bench $file | awk '{print $5}')
P50_MICROS=$(grep $bench $file -A 6 | grep "Percentiles" | awk '{print $3}' )
P75_MICROS=$(grep $bench $file -A 6 | grep "Percentiles" | awk '{print $5}' )
P99_MICROS=$(grep $bench $file -A 6 | grep "Percentiles" | awk '{print $7}' )
send_to_ods rocksdb.build.$bench_key.qps $QPS
send_to_ods rocksdb.build.$bench_key.p50_micros $P50_MICROS
send_to_ods rocksdb.build.$bench_key.p75_micros $P75_MICROS
send_to_ods rocksdb.build.$bench_key.p99_micros $P99_MICROS
}
send_benchmark_to_ods overwrite overwrite $STAT_FILE.overwrite
send_benchmark_to_ods fillseq fillseq $STAT_FILE.fillseq
send_benchmark_to_ods readrandom readrandom $STAT_FILE.readrandom
send_benchmark_to_ods readrandom readrandom_tailing $STAT_FILE.readrandomtailing
send_benchmark_to_ods readrandom readrandom_smallblockcache $STAT_FILE.readrandomsmallblockcache
send_benchmark_to_ods readrandom readrandom_memtable_sst $STAT_FILE.readrandom_mem_sst
send_benchmark_to_ods readrandom readrandom_fillunique_random $STAT_FILE.readrandom_filluniquerandom
send_benchmark_to_ods fillrandom memtablefillrandom $STAT_FILE.memtablefillreadrandom
send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadrandom
send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting
send_benchmark_to_ods readwhilewriting readwhilewriting_in_ram ${STAT_FILE}.readwhilewriting_in_ram
send_benchmark_to_ods seekrandomwhilewriting seekwhilewriting_in_ram ${STAT_FILE}.seekwhilewriting_in_ram
send_benchmark_to_ods fillseq fillseq_lots_column_families ${STAT_FILE}.fillseq_lots_column_families
send_benchmark_to_ods overwrite overwrite_lots_column_families ${STAT_FILE}.overwrite_lots_column_families

@ -0,0 +1,493 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# This script enables you running RocksDB tests by running
# All the tests concurrently and utilizing all the cores
Param(
[switch]$EnableJE = $false, # Look for and use test executable, append _je to listed exclusions
[switch]$RunAll = $false, # Will attempt discover all *_test[_je].exe binaries and run all
# of them as Google suites. I.e. It will run test cases concurrently
# except those mentioned as $Run, those will run as individual test cases
# And any execlued with $ExcludeExes or $ExcludeCases
# It will also not run any individual test cases
# excluded but $ExcludeCasese
[switch]$RunAllExe = $false, # Look for and use test exdcutables, append _je to exclusions automatically
# It will attempt to run them in parallel w/o breaking them up on individual
# test cases. Those listed with $ExcludeExes will be excluded
[string]$SuiteRun = "", # Split test suites in test cases and run in parallel, not compatible with $RunAll
[string]$Run = "", # Run specified executables in parallel but do not split to test cases
[string]$ExcludeCases = "", # Exclude test cases, expects a comma separated list, no spaces
# Takes effect when $RunAll or $SuiteRun is specified. Must have full
# Test cases name including a group and a parameter if any
[string]$ExcludeExes = "", # Exclude exes from consideration, expects a comma separated list,
# no spaces. Takes effect only when $RunAll is specified
[string]$WorkFolder = "", # Direct tests to use that folder. SSD or Ram drive are better options.
# Number of async tasks that would run concurrently. Recommend a number below 64.
# However, CPU utlization really depends on the storage media. Recommend ram based disk.
# a value of 1 will run everything serially
[int]$Concurrency = 8,
[int]$Limit = -1 # -1 means do not limit for test purposes
)
# Folders and commands must be fullpath to run assuming
# the current folder is at the root of the git enlistment
$StartDate = (Get-Date)
$StartDate
$DebugPreference = "Continue"
# These tests are not google test suites and we should guard
# Against running them as suites
$RunOnly = New-Object System.Collections.Generic.HashSet[string]
$RunOnly.Add("c_test") | Out-Null
$RunOnly.Add("compact_on_deletion_collector_test") | Out-Null
$RunOnly.Add("merge_test") | Out-Null
$RunOnly.Add("stringappend_test") | Out-Null # Apparently incorrectly written
$RunOnly.Add("backup_engine_test") | Out-Null # Disabled
$RunOnly.Add("timer_queue_test") | Out-Null # Not a gtest
if($RunAll -and $SuiteRun -ne "") {
Write-Error "$RunAll and $SuiteRun are not compatible"
exit 1
}
if($RunAllExe -and $Run -ne "") {
Write-Error "$RunAllExe and $Run are not compatible"
exit 1
}
# If running under Appveyor assume that root
[string]$Appveyor = $Env:APPVEYOR_BUILD_FOLDER
if($Appveyor -ne "") {
$RootFolder = $Appveyor
} else {
$RootFolder = $PSScriptRoot -replace '\\build_tools', ''
}
$LogFolder = -Join($RootFolder, "\db_logs\")
$BinariesFolder = -Join($RootFolder, "\build\Debug\")
if($WorkFolder -eq "") {
# If TEST_TMPDIR is set use it
[string]$var = $Env:TEST_TMPDIR
if($var -eq "") {
$WorkFolder = -Join($RootFolder, "\db_tests\")
$Env:TEST_TMPDIR = $WorkFolder
} else {
$WorkFolder = $var
}
} else {
# Override from a command line
$Env:TEST_TMPDIR = $WorkFolder
}
Write-Output "Root: $RootFolder, WorkFolder: $WorkFolder"
Write-Output "BinariesFolder: $BinariesFolder, LogFolder: $LogFolder"
# Create test directories in the current folder
md -Path $WorkFolder -ErrorAction Ignore | Out-Null
md -Path $LogFolder -ErrorAction Ignore | Out-Null
$ExcludeCasesSet = New-Object System.Collections.Generic.HashSet[string]
if($ExcludeCases -ne "") {
Write-Host "ExcludeCases: $ExcludeCases"
$l = $ExcludeCases -split ' '
ForEach($t in $l) {
$ExcludeCasesSet.Add($t) | Out-Null
}
}
$ExcludeExesSet = New-Object System.Collections.Generic.HashSet[string]
if($ExcludeExes -ne "") {
Write-Host "ExcludeExe: $ExcludeExes"
$l = $ExcludeExes -split ' '
ForEach($t in $l) {
$ExcludeExesSet.Add($t) | Out-Null
}
}
# Extract the names of its tests by running db_test with --gtest_list_tests.
# This filter removes the "#"-introduced comments, and expands to
# fully-qualified names by changing input like this:
#
# DBTest.
# Empty
# WriteEmptyBatch
# MultiThreaded/MultiThreadedDBTest.
# MultiThreaded/0 # GetParam() = 0
# MultiThreaded/1 # GetParam() = 1
# RibbonTypeParamTest/0. # TypeParam = struct DefaultTypesAndSettings
# CompactnessAndBacktrackAndFpRate
# Extremes
# FindOccupancyForSuccessRate
#
# into this:
#
# DBTest.Empty
# DBTest.WriteEmptyBatch
# MultiThreaded/MultiThreadedDBTest.MultiThreaded/0
# MultiThreaded/MultiThreadedDBTest.MultiThreaded/1
# RibbonTypeParamTest/0.CompactnessAndBacktrackAndFpRate
# RibbonTypeParamTest/0.Extremes
# RibbonTypeParamTest/0.FindOccupancyForSuccessRate
#
# Output into the parameter in a form TestName -> Log File Name
function ExtractTestCases([string]$GTestExe, $HashTable) {
$Tests = @()
# Run db_test to get a list of tests and store it into $a array
&$GTestExe --gtest_list_tests | tee -Variable Tests | Out-Null
# Current group
$Group=""
ForEach( $l in $Tests) {
# remove trailing comment if any
$l = $l -replace '\s+\#.*',''
# Leading whitespace is fine
$l = $l -replace '^\s+',''
# Trailing dot is a test group but no whitespace
if ($l -match "\.$" -and $l -notmatch "\s+") {
$Group = $l
} else {
# Otherwise it is a test name, remove leading space
$test = $l
# create a log name
$test = "$Group$test"
if($ExcludeCasesSet.Contains($test)) {
Write-Warning "$test case is excluded"
continue
}
$test_log = $test -replace '[\./]','_'
$test_log += ".log"
$log_path = -join ($LogFolder, $test_log)
# Add to a hashtable
$HashTable.Add($test, $log_path);
}
}
}
# The function removes trailing .exe siffix if any,
# creates a name for the log file
# Then adds the test name if it was not excluded into
# a HashTable in a form of test_name -> log_path
function MakeAndAdd([string]$token, $HashTable) {
$test_name = $token -replace '.exe$', ''
$log_name = -join ($test_name, ".log")
$log_path = -join ($LogFolder, $log_name)
$HashTable.Add($test_name, $log_path)
}
# This function takes a list of Suites to run
# Lists all the test cases in each of the suite
# and populates HashOfHashes
# Ordered by suite(exe) @{ Exe = @{ TestCase = LogName }}
function ProcessSuites($ListOfSuites, $HashOfHashes) {
$suite_list = $ListOfSuites
# Problem: if you run --gtest_list_tests on
# a non Google Test executable then it will start executing
# and we will get nowhere
ForEach($suite in $suite_list) {
if($RunOnly.Contains($suite)) {
Write-Warning "$suite is excluded from running as Google test suite"
continue
}
if($EnableJE) {
$suite += "_je"
}
$Cases = [ordered]@{}
$Cases.Clear()
$suite_exe = -Join ($BinariesFolder, $suite)
ExtractTestCases -GTestExe $suite_exe -HashTable $Cases
if($Cases.Count -gt 0) {
$HashOfHashes.Add($suite, $Cases);
}
}
# Make logs and run
if($CasesToRun.Count -lt 1) {
Write-Error "Failed to extract tests from $SuiteRun"
exit 1
}
}
# This will contain all test executables to run
# Hash table that contains all non suite
# Test executable to run
$TestExes = [ordered]@{}
# Check for test exe that are not
# Google Test Suites
# Since this is explicitely mentioned it is not subject
# for exclusions
if($Run -ne "") {
$test_list = $Run -split ' '
ForEach($t in $test_list) {
if($EnableJE) {
$t += "_je"
}
MakeAndAdd -token $t -HashTable $TestExes
}
if($TestExes.Count -lt 1) {
Write-Error "Failed to extract tests from $Run"
exit 1
}
} elseif($RunAllExe) {
# Discover all the test binaries
if($EnableJE) {
$pattern = "*_test_je.exe"
} else {
$pattern = "*_test.exe"
}
$search_path = -join ($BinariesFolder, $pattern)
Write-Host "Binaries Search Path: $search_path"
$DiscoveredExe = @()
dir -Path $search_path | ForEach-Object {
$DiscoveredExe += ($_.Name)
}
# Remove exclusions
ForEach($e in $DiscoveredExe) {
$e = $e -replace '.exe$', ''
$bare_name = $e -replace '_je$', ''
if($ExcludeExesSet.Contains($bare_name)) {
Write-Warning "Test $e is excluded"
continue
}
MakeAndAdd -token $e -HashTable $TestExes
}
if($TestExes.Count -lt 1) {
Write-Error "Failed to discover test executables"
exit 1
}
}
# Ordered by exe @{ Exe = @{ TestCase = LogName }}
$CasesToRun = [ordered]@{}
if($SuiteRun -ne "") {
$suite_list = $SuiteRun -split ' '
ProcessSuites -ListOfSuites $suite_list -HashOfHashes $CasesToRun
} elseif ($RunAll) {
# Discover all the test binaries
if($EnableJE) {
$pattern = "*_test_je.exe"
} else {
$pattern = "*_test.exe"
}
$search_path = -join ($BinariesFolder, $pattern)
Write-Host "Binaries Search Path: $search_path"
$ListOfExe = @()
dir -Path $search_path | ForEach-Object {
$ListOfExe += ($_.Name)
}
# Exclude those in RunOnly from running as suites
$ListOfSuites = @()
ForEach($e in $ListOfExe) {
$e = $e -replace '.exe$', ''
$bare_name = $e -replace '_je$', ''
if($ExcludeExesSet.Contains($bare_name)) {
Write-Warning "Test $e is excluded"
continue
}
if($RunOnly.Contains($bare_name)) {
MakeAndAdd -token $e -HashTable $TestExes
} else {
$ListOfSuites += $bare_name
}
}
ProcessSuites -ListOfSuites $ListOfSuites -HashOfHashes $CasesToRun
}
# Invoke a test with a filter and redirect all output
$InvokeTestCase = {
param($exe, $test, $log);
&$exe --gtest_filter=$test > $log 2>&1
}
# Invoke all tests and redirect output
$InvokeTestAsync = {
param($exe, $log)
&$exe > $log 2>&1
}
# Hash that contains tests to rerun if any failed
# Those tests will be rerun sequentially
# $Rerun = [ordered]@{}
# Test limiting factor here
[int]$count = 0
# Overall status
[bool]$script:success = $true;
function RunJobs($Suites, $TestCmds, [int]$ConcurrencyVal)
{
# Array to wait for any of the running jobs
$jobs = @()
# Hash JobToLog
$JobToLog = @{}
# Wait for all to finish and get the results
while(($JobToLog.Count -gt 0) -or
($TestCmds.Count -gt 0) -or
($Suites.Count -gt 0)) {
# Make sure we have maximum concurrent jobs running if anything
# and the $Limit either not set or allows to proceed
while(($JobToLog.Count -lt $ConcurrencyVal) -and
((($TestCmds.Count -gt 0) -or ($Suites.Count -gt 0)) -and
(($Limit -lt 0) -or ($count -lt $Limit)))) {
# We always favore suites to run if available
[string]$exe_name = ""
[string]$log_path = ""
$Cases = @{}
if($Suites.Count -gt 0) {
# Will the first one
ForEach($e in $Suites.Keys) {
$exe_name = $e
$Cases = $Suites[$e]
break
}
[string]$test_case = ""
[string]$log_path = ""
ForEach($c in $Cases.Keys) {
$test_case = $c
$log_path = $Cases[$c]
break
}
Write-Host "Starting $exe_name::$test_case"
[string]$Exe = -Join ($BinariesFolder, $exe_name)
$job = Start-Job -Name "$exe_name::$test_case" -ArgumentList @($Exe,$test_case,$log_path) -ScriptBlock $InvokeTestCase
$JobToLog.Add($job, $log_path)
$Cases.Remove($test_case)
if($Cases.Count -lt 1) {
$Suites.Remove($exe_name)
}
} elseif ($TestCmds.Count -gt 0) {
ForEach($e in $TestCmds.Keys) {
$exe_name = $e
$log_path = $TestCmds[$e]
break
}
Write-Host "Starting $exe_name"
[string]$Exe = -Join ($BinariesFolder, $exe_name)
$job = Start-Job -Name $exe_name -ScriptBlock $InvokeTestAsync -ArgumentList @($Exe,$log_path)
$JobToLog.Add($job, $log_path)
$TestCmds.Remove($exe_name)
} else {
Write-Error "In the job loop but nothing to run"
exit 1
}
++$count
} # End of Job starting loop
if($JobToLog.Count -lt 1) {
break
}
$jobs = @()
foreach($k in $JobToLog.Keys) { $jobs += $k }
$completed = Wait-Job -Job $jobs -Any
$log = $JobToLog[$completed]
$JobToLog.Remove($completed)
$message = -join @($completed.Name, " State: ", ($completed.State))
$log_content = @(Get-Content $log)
if($completed.State -ne "Completed") {
$script:success = $false
Write-Warning $message
$log_content | Write-Warning
} else {
# Scan the log. If we find PASSED and no occurrence of FAILED
# then it is a success
[bool]$pass_found = $false
ForEach($l in $log_content) {
if(($l -match "^\[\s+FAILED") -or
($l -match "Assertion failed:")) {
$pass_found = $false
break
}
if(($l -match "^\[\s+PASSED") -or
($l -match " : PASSED$") -or
($l -match "^PASS$") -or # Special c_test case
($l -match "Passed all tests!") ) {
$pass_found = $true
}
}
if(!$pass_found) {
$script:success = $false;
Write-Warning $message
$log_content | Write-Warning
} else {
Write-Host $message
}
}
# Remove cached job info from the system
# Should be no output
Receive-Job -Job $completed | Out-Null
}
}
RunJobs -Suites $CasesToRun -TestCmds $TestExes -ConcurrencyVal $Concurrency
$EndDate = (Get-Date)
New-TimeSpan -Start $StartDate -End $EndDate |
ForEach-Object {
"Elapsed time: {0:g}" -f $_
}
if(!$script:success) {
# This does not succeed killing off jobs quick
# So we simply exit
# Remove-Job -Job $jobs -Force
# indicate failure using this exit code
exit 1
}
exit 0

@ -0,0 +1,45 @@
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
set -ex
ROCKSDB_VERSION="6.7.3"
ZSTD_VERSION="1.4.4"
echo "This script configures CentOS with everything needed to build and run RocksDB"
yum update -y && yum install epel-release -y
yum install -y \
wget \
gcc-c++ \
snappy snappy-devel \
zlib zlib-devel \
bzip2 bzip2-devel \
lz4-devel \
libasan \
gflags
mkdir -pv /usr/local/rocksdb-${ROCKSDB_VERSION}
ln -sfT /usr/local/rocksdb-${ROCKSDB_VERSION} /usr/local/rocksdb
wget -qO /tmp/zstd-${ZSTD_VERSION}.tar.gz https://github.com/facebook/zstd/archive/v${ZSTD_VERSION}.tar.gz
wget -qO /tmp/rocksdb-${ROCKSDB_VERSION}.tar.gz https://github.com/facebook/rocksdb/archive/v${ROCKSDB_VERSION}.tar.gz
cd /tmp
tar xzvf zstd-${ZSTD_VERSION}.tar.gz
tar xzvf rocksdb-${ROCKSDB_VERSION}.tar.gz -C /usr/local/
echo "Installing ZSTD..."
pushd zstd-${ZSTD_VERSION}
make && make install
popd
echo "Compiling RocksDB..."
cd /usr/local/rocksdb
chown -R vagrant:vagrant /usr/local/rocksdb/
sudo -u vagrant make static_lib
cd examples/
sudo -u vagrant LD_LIBRARY_PATH=/usr/local/lib/ make all
sudo -u vagrant LD_LIBRARY_PATH=/usr/local/lib/ ./c_simple_example

@ -0,0 +1,57 @@
# from official ubuntu 20.04
FROM ubuntu:20.04
# update system
RUN apt-get update && apt-get upgrade -y
# install basic tools
RUN apt-get install -y vim wget curl
# install tzdata noninteractive
RUN DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC apt-get -y install tzdata
# install git and default compilers
RUN apt-get install -y git gcc g++ clang clang-tools
# install basic package
RUN apt-get install -y lsb-release software-properties-common gnupg
# install gflags, tbb
RUN apt-get install -y libgflags-dev libtbb-dev
# install compression libs
RUN apt-get install -y libsnappy-dev zlib1g-dev libbz2-dev liblz4-dev libzstd-dev
# install cmake
RUN apt-get install -y cmake
RUN apt-get install -y libssl-dev
# install clang-13
WORKDIR /root
RUN wget https://apt.llvm.org/llvm.sh
RUN chmod +x llvm.sh
RUN ./llvm.sh 13 all
# install gcc-7, 8, 10, 11, default is 9
RUN apt-get install -y gcc-7 g++-7
RUN apt-get install -y gcc-8 g++-8
RUN apt-get install -y gcc-10 g++-10
RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test
RUN apt-get install -y gcc-11 g++-11
# install apt-get install -y valgrind
RUN apt-get install -y valgrind
# install folly depencencies
RUN apt-get install -y libgoogle-glog-dev
# install openjdk 8
RUN apt-get install -y openjdk-8-jdk
ENV JAVA_HOME /usr/lib/jvm/java-1.8.0-openjdk-amd64
# install mingw
RUN apt-get install -y mingw-w64
# install gtest-parallel package
RUN git clone --single-branch --branch master --depth 1 https://github.com/google/gtest-parallel.git ~/gtest-parallel
ENV PATH $PATH:/root/gtest-parallel
# install libprotobuf for fuzzers test
RUN apt-get install -y ninja-build binutils liblzma-dev libz-dev pkg-config autoconf libtool
RUN git clone --branch v1.0 https://github.com/google/libprotobuf-mutator.git ~/libprotobuf-mutator && cd ~/libprotobuf-mutator && git checkout ffd86a32874e5c08a143019aad1aaf0907294c9f && mkdir build && cd build && cmake .. -GNinja -DCMAKE_C_COMPILER=clang-13 -DCMAKE_CXX_COMPILER=clang++-13 -DCMAKE_BUILD_TYPE=Release -DLIB_PROTO_MUTATOR_DOWNLOAD_PROTOBUF=ON && ninja && ninja install
ENV PKG_CONFIG_PATH /usr/local/OFF/:/root/libprotobuf-mutator/build/external.protobuf/lib/pkgconfig/
ENV PROTOC_BIN /root/libprotobuf-mutator/build/external.protobuf/bin/protoc
# install the latest google benchmark
RUN git clone --depth 1 --branch v1.7.0 https://github.com/google/benchmark.git ~/benchmark
RUN cd ~/benchmark && mkdir build && cd build && cmake .. -GNinja -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_GTEST_TESTS=0 && ninja && ninja install
# clean up
RUN rm -rf /var/lib/apt/lists/*
RUN rm -rf /root/benchmark

@ -0,0 +1,106 @@
#!/bin/sh
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#
# Update dependencies.sh file with the latest avaliable versions
BASEDIR=$(dirname $0)
OUTPUT=""
function log_header()
{
echo "# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved." >> "$OUTPUT"
echo "# The file is generated using update_dependencies.sh." >> "$OUTPUT"
}
function log_variable()
{
echo "$1=${!1}" >> "$OUTPUT"
}
TP2_LATEST="/data/users/$USER/fbsource/fbcode/third-party2/"
## $1 => lib name
## $2 => lib version (if not provided, will try to pick latest)
## $3 => platform (if not provided, will try to pick latest gcc)
##
## get_lib_base will set a variable named ${LIB_NAME}_BASE to the lib location
function get_lib_base()
{
local lib_name=$1
local lib_version=$2
local lib_platform=$3
local result="$TP2_LATEST/$lib_name/"
# Lib Version
if [ -z "$lib_version" ] || [ "$lib_version" = "LATEST" ]; then
# version is not provided, use latest
result=`ls -dr1v $result/*/ | head -n1`
else
result="$result/$lib_version/"
fi
# Lib Platform
if [ -z "$lib_platform" ]; then
# platform is not provided, use latest gcc
result=`ls -dr1v $result/gcc-*[^fb]/ | head -n1`
else
echo $lib_platform
result="$result/$lib_platform/"
fi
result=`ls -1d $result/*/ | head -n1`
echo Finding link $result
# lib_name => LIB_NAME_BASE
local __res_var=${lib_name^^}"_BASE"
__res_var=`echo $__res_var | tr - _`
# LIB_NAME_BASE=$result
eval $__res_var=`readlink -f $result`
log_variable $__res_var
}
###########################################################
# platform010 dependencies #
###########################################################
OUTPUT="$BASEDIR/dependencies_platform010.sh"
rm -f "$OUTPUT"
touch "$OUTPUT"
echo "Writing dependencies to $OUTPUT"
# Compilers locations
GCC_BASE=`readlink -f $TP2_LATEST/gcc/11.x/centos7-native/*/`
CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/12/platform010/*/`
log_header
log_variable GCC_BASE
log_variable CLANG_BASE
# Libraries locations
get_lib_base libgcc 11.x platform010
get_lib_base glibc 2.34 platform010
get_lib_base snappy LATEST platform010
get_lib_base zlib LATEST platform010
get_lib_base bzip2 LATEST platform010
get_lib_base lz4 LATEST platform010
get_lib_base zstd LATEST platform010
get_lib_base gflags LATEST platform010
get_lib_base jemalloc LATEST platform010
get_lib_base numa LATEST platform010
get_lib_base libunwind LATEST platform010
get_lib_base tbb 2018_U5 platform010
get_lib_base liburing LATEST platform010
get_lib_base benchmark LATEST platform010
get_lib_base kernel-headers fb platform010
get_lib_base binutils LATEST centos7-native
get_lib_base valgrind LATEST platform010
get_lib_base lua 5.3.4 platform010
git diff $OUTPUT

@ -0,0 +1,23 @@
#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
if [ "$#" = "0" ]; then
echo "Usage: $0 major|minor|patch|full"
exit 1
fi
if [ "$1" = "major" ]; then
cat include/rocksdb/version.h | grep MAJOR | head -n1 | awk '{print $3}'
fi
if [ "$1" = "minor" ]; then
cat include/rocksdb/version.h | grep MINOR | head -n1 | awk '{print $3}'
fi
if [ "$1" = "patch" ]; then
cat include/rocksdb/version.h | grep PATCH | head -n1 | awk '{print $3}'
fi
if [ "$1" = "full" ]; then
awk '/#define ROCKSDB/ { env[$2] = $3 }
END { printf "%s.%s.%s\n", env["ROCKSDB_MAJOR"],
env["ROCKSDB_MINOR"],
env["ROCKSDB_PATCH"] }' \
include/rocksdb/version.h
fi

158
cache/cache.cc vendored

@ -0,0 +1,158 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "rocksdb/cache.h"
#include "cache/lru_cache.h"
#include "rocksdb/secondary_cache.h"
#include "rocksdb/utilities/customizable_util.h"
#include "rocksdb/utilities/options_type.h"
#include "util/string_util.h"
namespace ROCKSDB_NAMESPACE {
const Cache::CacheItemHelper kNoopCacheItemHelper{};
static std::unordered_map<std::string, OptionTypeInfo>
lru_cache_options_type_info = {
{"capacity",
{offsetof(struct LRUCacheOptions, capacity), OptionType::kSizeT,
OptionVerificationType::kNormal, OptionTypeFlags::kMutable}},
{"num_shard_bits",
{offsetof(struct LRUCacheOptions, num_shard_bits), OptionType::kInt,
OptionVerificationType::kNormal, OptionTypeFlags::kMutable}},
{"strict_capacity_limit",
{offsetof(struct LRUCacheOptions, strict_capacity_limit),
OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
{"high_pri_pool_ratio",
{offsetof(struct LRUCacheOptions, high_pri_pool_ratio),
OptionType::kDouble, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
{"low_pri_pool_ratio",
{offsetof(struct LRUCacheOptions, low_pri_pool_ratio),
OptionType::kDouble, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
};
static std::unordered_map<std::string, OptionTypeInfo>
comp_sec_cache_options_type_info = {
{"capacity",
{offsetof(struct CompressedSecondaryCacheOptions, capacity),
OptionType::kSizeT, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
{"num_shard_bits",
{offsetof(struct CompressedSecondaryCacheOptions, num_shard_bits),
OptionType::kInt, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
{"compression_type",
{offsetof(struct CompressedSecondaryCacheOptions, compression_type),
OptionType::kCompressionType, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
{"compress_format_version",
{offsetof(struct CompressedSecondaryCacheOptions,
compress_format_version),
OptionType::kUInt32T, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
{"enable_custom_split_merge",
{offsetof(struct CompressedSecondaryCacheOptions,
enable_custom_split_merge),
OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
};
Status SecondaryCache::CreateFromString(
const ConfigOptions& config_options, const std::string& value,
std::shared_ptr<SecondaryCache>* result) {
if (value.find("compressed_secondary_cache://") == 0) {
std::string args = value;
args.erase(0, std::strlen("compressed_secondary_cache://"));
Status status;
std::shared_ptr<SecondaryCache> sec_cache;
CompressedSecondaryCacheOptions sec_cache_opts;
status = OptionTypeInfo::ParseStruct(config_options, "",
&comp_sec_cache_options_type_info, "",
args, &sec_cache_opts);
if (status.ok()) {
sec_cache = NewCompressedSecondaryCache(sec_cache_opts);
}
if (status.ok()) {
result->swap(sec_cache);
}
return status;
} else {
return LoadSharedObject<SecondaryCache>(config_options, value, result);
}
}
Status Cache::CreateFromString(const ConfigOptions& config_options,
const std::string& value,
std::shared_ptr<Cache>* result) {
Status status;
std::shared_ptr<Cache> cache;
if (value.find('=') == std::string::npos) {
cache = NewLRUCache(ParseSizeT(value));
} else {
LRUCacheOptions cache_opts;
status = OptionTypeInfo::ParseStruct(config_options, "",
&lru_cache_options_type_info, "",
value, &cache_opts);
if (status.ok()) {
cache = NewLRUCache(cache_opts);
}
}
if (status.ok()) {
result->swap(cache);
}
return status;
}
bool Cache::AsyncLookupHandle::IsReady() {
return pending_handle == nullptr || pending_handle->IsReady();
}
bool Cache::AsyncLookupHandle::IsPending() { return pending_handle != nullptr; }
Cache::Handle* Cache::AsyncLookupHandle::Result() {
assert(!IsPending());
return result_handle;
}
void Cache::StartAsyncLookup(AsyncLookupHandle& async_handle) {
async_handle.found_dummy_entry = false; // in case re-used
assert(!async_handle.IsPending());
async_handle.result_handle =
Lookup(async_handle.key, async_handle.helper, async_handle.create_context,
async_handle.priority, async_handle.stats);
}
Cache::Handle* Cache::Wait(AsyncLookupHandle& async_handle) {
WaitAll(&async_handle, 1);
return async_handle.Result();
}
void Cache::WaitAll(AsyncLookupHandle* async_handles, size_t count) {
for (size_t i = 0; i < count; ++i) {
if (async_handles[i].IsPending()) {
// If a pending handle gets here, it should be marked at "to be handled
// by a caller" by that caller erasing the pending_cache on it.
assert(async_handles[i].pending_cache == nullptr);
}
}
}
void Cache::SetEvictionCallback(EvictionCallback&& fn) {
// Overwriting non-empty with non-empty could indicate a bug
assert(!eviction_callback_ || !fn);
eviction_callback_ = std::move(fn);
}
} // namespace ROCKSDB_NAMESPACE

@ -0,0 +1,20 @@
// Copyright (c) 2013-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef GFLAGS
#include <cstdio>
int main() {
fprintf(stderr, "Please install gflags to run rocksdb tools\n");
return 1;
}
#else
#include "rocksdb/cache_bench_tool.h"
int main(int argc, char** argv) {
return ROCKSDB_NAMESPACE::cache_bench_tool(argc, argv);
}
#endif // GFLAGS

File diff suppressed because it is too large Load Diff

@ -0,0 +1,104 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "cache/cache_entry_roles.h"
#include <mutex>
#include "port/lang.h"
namespace ROCKSDB_NAMESPACE {
std::array<std::string, kNumCacheEntryRoles> kCacheEntryRoleToCamelString{{
"DataBlock",
"FilterBlock",
"FilterMetaBlock",
"DeprecatedFilterBlock",
"IndexBlock",
"OtherBlock",
"WriteBuffer",
"CompressionDictionaryBuildingBuffer",
"FilterConstruction",
"BlockBasedTableReader",
"FileMetadata",
"BlobValue",
"BlobCache",
"Misc",
}};
std::array<std::string, kNumCacheEntryRoles> kCacheEntryRoleToHyphenString{{
"data-block",
"filter-block",
"filter-meta-block",
"deprecated-filter-block",
"index-block",
"other-block",
"write-buffer",
"compression-dictionary-building-buffer",
"filter-construction",
"block-based-table-reader",
"file-metadata",
"blob-value",
"blob-cache",
"misc",
}};
const std::string& GetCacheEntryRoleName(CacheEntryRole role) {
return kCacheEntryRoleToHyphenString[static_cast<size_t>(role)];
}
const std::string& BlockCacheEntryStatsMapKeys::CacheId() {
static const std::string kCacheId = "id";
return kCacheId;
}
const std::string& BlockCacheEntryStatsMapKeys::CacheCapacityBytes() {
static const std::string kCacheCapacityBytes = "capacity";
return kCacheCapacityBytes;
}
const std::string&
BlockCacheEntryStatsMapKeys::LastCollectionDurationSeconds() {
static const std::string kLastCollectionDurationSeconds =
"secs_for_last_collection";
return kLastCollectionDurationSeconds;
}
const std::string& BlockCacheEntryStatsMapKeys::LastCollectionAgeSeconds() {
static const std::string kLastCollectionAgeSeconds =
"secs_since_last_collection";
return kLastCollectionAgeSeconds;
}
namespace {
std::string GetPrefixedCacheEntryRoleName(const std::string& prefix,
CacheEntryRole role) {
const std::string& role_name = GetCacheEntryRoleName(role);
std::string prefixed_role_name;
prefixed_role_name.reserve(prefix.size() + role_name.size());
prefixed_role_name.append(prefix);
prefixed_role_name.append(role_name);
return prefixed_role_name;
}
} // namespace
std::string BlockCacheEntryStatsMapKeys::EntryCount(CacheEntryRole role) {
const static std::string kPrefix = "count.";
return GetPrefixedCacheEntryRoleName(kPrefix, role);
}
std::string BlockCacheEntryStatsMapKeys::UsedBytes(CacheEntryRole role) {
const static std::string kPrefix = "bytes.";
return GetPrefixedCacheEntryRoleName(kPrefix, role);
}
std::string BlockCacheEntryStatsMapKeys::UsedPercent(CacheEntryRole role) {
const static std::string kPrefix = "percent.";
return GetPrefixedCacheEntryRoleName(kPrefix, role);
}
} // namespace ROCKSDB_NAMESPACE

@ -0,0 +1,20 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include <array>
#include <cstdint>
#include "rocksdb/cache.h"
namespace ROCKSDB_NAMESPACE {
extern std::array<std::string, kNumCacheEntryRoles>
kCacheEntryRoleToCamelString;
extern std::array<std::string, kNumCacheEntryRoles>
kCacheEntryRoleToHyphenString;
} // namespace ROCKSDB_NAMESPACE

@ -0,0 +1,182 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include <array>
#include <cstdint>
#include <memory>
#include <mutex>
#include "cache/cache_key.h"
#include "cache/typed_cache.h"
#include "port/lang.h"
#include "rocksdb/cache.h"
#include "rocksdb/status.h"
#include "rocksdb/system_clock.h"
#include "test_util/sync_point.h"
#include "util/coding_lean.h"
namespace ROCKSDB_NAMESPACE {
// A generic helper object for gathering stats about cache entries by
// iterating over them with ApplyToAllEntries. This class essentially
// solves the problem of slowing down a Cache with too many stats
// collectors that could be sharing stat results, such as from multiple
// column families or multiple DBs sharing a Cache. We employ a few
// mitigations:
// * Only one collector for a particular kind of Stats is alive
// for each Cache. This is guaranteed using the Cache itself to hold
// the collector.
// * A mutex ensures only one thread is gathering stats for this
// collector.
// * The most recent gathered stats are saved and simply copied to
// satisfy requests within a time window (default: 3 minutes) of
// completion of the most recent stat gathering.
//
// Template parameter Stats must be copyable and trivially constructable,
// as well as...
// concept Stats {
// // Notification before applying callback to all entries
// void BeginCollection(Cache*, SystemClock*, uint64_t start_time_micros);
// // Get the callback to apply to all entries. `callback`
// // type must be compatible with Cache::ApplyToAllEntries
// callback GetEntryCallback();
// // Notification after applying callback to all entries
// void EndCollection(Cache*, SystemClock*, uint64_t end_time_micros);
// // Notification that a collection was skipped because of
// // sufficiently recent saved results.
// void SkippedCollection();
// }
template <class Stats>
class CacheEntryStatsCollector {
public:
// Gather and save stats if saved stats are too old. (Use GetStats() to
// read saved stats.)
//
// Maximum allowed age for a "hit" on saved results is determined by the
// two interval parameters. Both set to 0 forces a re-scan. For example
// with min_interval_seconds=300 and min_interval_factor=100, if the last
// scan took 10s, we would only rescan ("miss") if the age in seconds of
// the saved results is > max(300, 100*10).
// Justification: scans can vary wildly in duration, e.g. from 0.02 sec
// to as much as 20 seconds, so we want to be able to cap the absolute
// and relative frequency of scans.
void CollectStats(int min_interval_seconds, int min_interval_factor) {
// Waits for any pending reader or writer (collector)
std::lock_guard<std::mutex> lock(working_mutex_);
uint64_t max_age_micros =
static_cast<uint64_t>(std::max(min_interval_seconds, 0)) * 1000000U;
if (last_end_time_micros_ > last_start_time_micros_ &&
min_interval_factor > 0) {
max_age_micros = std::max(
max_age_micros, min_interval_factor * (last_end_time_micros_ -
last_start_time_micros_));
}
uint64_t start_time_micros = clock_->NowMicros();
if ((start_time_micros - last_end_time_micros_) > max_age_micros) {
last_start_time_micros_ = start_time_micros;
working_stats_.BeginCollection(cache_, clock_, start_time_micros);
cache_->ApplyToAllEntries(working_stats_.GetEntryCallback(), {});
TEST_SYNC_POINT_CALLBACK(
"CacheEntryStatsCollector::GetStats:AfterApplyToAllEntries", nullptr);
uint64_t end_time_micros = clock_->NowMicros();
last_end_time_micros_ = end_time_micros;
working_stats_.EndCollection(cache_, clock_, end_time_micros);
} else {
working_stats_.SkippedCollection();
}
// Save so that we don't need to wait for an outstanding collection in
// order to make of copy of the last saved stats
std::lock_guard<std::mutex> lock2(saved_mutex_);
saved_stats_ = working_stats_;
}
// Gets saved stats, regardless of age
void GetStats(Stats *stats) {
std::lock_guard<std::mutex> lock(saved_mutex_);
*stats = saved_stats_;
}
Cache *GetCache() const { return cache_; }
// Gets or creates a shared instance of CacheEntryStatsCollector in the
// cache itself, and saves into `ptr`. This shared_ptr will hold the
// entry in cache until all refs are destroyed.
static Status GetShared(Cache *raw_cache, SystemClock *clock,
std::shared_ptr<CacheEntryStatsCollector> *ptr) {
assert(raw_cache);
BasicTypedCacheInterface<CacheEntryStatsCollector, CacheEntryRole::kMisc>
cache{raw_cache};
const Slice &cache_key = GetCacheKey();
auto h = cache.Lookup(cache_key);
if (h == nullptr) {
// Not yet in cache, but Cache doesn't provide a built-in way to
// avoid racing insert. So we double-check under a shared mutex,
// inspired by TableCache.
STATIC_AVOID_DESTRUCTION(std::mutex, static_mutex);
std::lock_guard<std::mutex> lock(static_mutex);
h = cache.Lookup(cache_key);
if (h == nullptr) {
auto new_ptr = new CacheEntryStatsCollector(cache.get(), clock);
// TODO: non-zero charge causes some tests that count block cache
// usage to go flaky. Fix the problem somehow so we can use an
// accurate charge.
size_t charge = 0;
Status s =
cache.Insert(cache_key, new_ptr, charge, &h, Cache::Priority::HIGH);
if (!s.ok()) {
assert(h == nullptr);
delete new_ptr;
return s;
}
}
}
// If we reach here, shared entry is in cache with handle `h`.
assert(cache.get()->GetCacheItemHelper(h) == cache.GetBasicHelper());
// Build an aliasing shared_ptr that keeps `ptr` in cache while there
// are references.
*ptr = cache.SharedGuard(h);
return Status::OK();
}
private:
explicit CacheEntryStatsCollector(Cache *cache, SystemClock *clock)
: saved_stats_(),
working_stats_(),
last_start_time_micros_(0),
last_end_time_micros_(/*pessimistic*/ 10000000),
cache_(cache),
clock_(clock) {}
static const Slice &GetCacheKey() {
// For each template instantiation
static CacheKey ckey = CacheKey::CreateUniqueForProcessLifetime();
static Slice ckey_slice = ckey.AsSlice();
return ckey_slice;
}
std::mutex saved_mutex_;
Stats saved_stats_;
std::mutex working_mutex_;
Stats working_stats_;
uint64_t last_start_time_micros_;
uint64_t last_end_time_micros_;
Cache *const cache_;
SystemClock *const clock_;
};
} // namespace ROCKSDB_NAMESPACE

@ -0,0 +1,40 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "cache/cache_helpers.h"
namespace ROCKSDB_NAMESPACE {
void ReleaseCacheHandleCleanup(void* arg1, void* arg2) {
Cache* const cache = static_cast<Cache*>(arg1);
assert(cache);
Cache::Handle* const cache_handle = static_cast<Cache::Handle*>(arg2);
assert(cache_handle);
cache->Release(cache_handle);
}
Status WarmInCache(Cache* cache, const Slice& key, const Slice& saved,
Cache::CreateContext* create_context,
const Cache::CacheItemHelper* helper,
Cache::Priority priority, size_t* out_charge) {
assert(helper);
assert(helper->create_cb);
Cache::ObjectPtr value;
size_t charge;
Status st = helper->create_cb(saved, create_context,
cache->memory_allocator(), &value, &charge);
if (st.ok()) {
st =
cache->Insert(key, value, helper, charge, /*handle*/ nullptr, priority);
if (out_charge) {
*out_charge = charge;
}
}
return st;
}
} // namespace ROCKSDB_NAMESPACE

@ -0,0 +1,139 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include <cassert>
#include "rocksdb/advanced_cache.h"
#include "rocksdb/rocksdb_namespace.h"
namespace ROCKSDB_NAMESPACE {
// Returns the cached value given a cache handle.
template <typename T>
T* GetFromCacheHandle(Cache* cache, Cache::Handle* handle) {
assert(cache);
assert(handle);
return static_cast<T*>(cache->Value(handle));
}
// Turns a T* into a Slice so it can be used as a key with Cache.
template <typename T>
Slice GetSliceForKey(const T* t) {
return Slice(reinterpret_cast<const char*>(t), sizeof(T));
}
void ReleaseCacheHandleCleanup(void* arg1, void* arg2);
// Generic resource management object for cache handles that releases the handle
// when destroyed. Has unique ownership of the handle, so copying it is not
// allowed, while moving it transfers ownership.
template <typename T>
class CacheHandleGuard {
public:
CacheHandleGuard() = default;
CacheHandleGuard(Cache* cache, Cache::Handle* handle)
: cache_(cache),
handle_(handle),
value_(GetFromCacheHandle<T>(cache, handle)) {
assert(cache_ && handle_ && value_);
}
CacheHandleGuard(const CacheHandleGuard&) = delete;
CacheHandleGuard& operator=(const CacheHandleGuard&) = delete;
CacheHandleGuard(CacheHandleGuard&& rhs) noexcept
: cache_(rhs.cache_), handle_(rhs.handle_), value_(rhs.value_) {
assert((!cache_ && !handle_ && !value_) || (cache_ && handle_ && value_));
rhs.ResetFields();
}
CacheHandleGuard& operator=(CacheHandleGuard&& rhs) noexcept {
if (this == &rhs) {
return *this;
}
ReleaseHandle();
cache_ = rhs.cache_;
handle_ = rhs.handle_;
value_ = rhs.value_;
assert((!cache_ && !handle_ && !value_) || (cache_ && handle_ && value_));
rhs.ResetFields();
return *this;
}
~CacheHandleGuard() { ReleaseHandle(); }
bool IsEmpty() const { return !handle_; }
Cache* GetCache() const { return cache_; }
Cache::Handle* GetCacheHandle() const { return handle_; }
T* GetValue() const { return value_; }
void TransferTo(Cleanable* cleanable) {
if (cleanable) {
if (handle_ != nullptr) {
assert(cache_);
cleanable->RegisterCleanup(&ReleaseCacheHandleCleanup, cache_, handle_);
}
}
ResetFields();
}
void Reset() {
ReleaseHandle();
ResetFields();
}
private:
void ReleaseHandle() {
if (IsEmpty()) {
return;
}
assert(cache_);
cache_->Release(handle_);
}
void ResetFields() {
cache_ = nullptr;
handle_ = nullptr;
value_ = nullptr;
}
private:
Cache* cache_ = nullptr;
Cache::Handle* handle_ = nullptr;
T* value_ = nullptr;
};
// Build an aliasing shared_ptr that keeps `handle` in cache while there
// are references, but the pointer is to the value for that cache entry,
// which must be of type T. This is copyable, unlike CacheHandleGuard, but
// does not provide access to caching details.
template <typename T>
std::shared_ptr<T> MakeSharedCacheHandleGuard(Cache* cache,
Cache::Handle* handle) {
auto wrapper = std::make_shared<CacheHandleGuard<T>>(cache, handle);
return std::shared_ptr<T>(wrapper, GetFromCacheHandle<T>(cache, handle));
}
// Given the persistable data (saved) for a block cache entry, parse that
// into a cache entry object and insert it into the given cache. The charge
// of the new entry can be returned to the caller through `out_charge`.
Status WarmInCache(Cache* cache, const Slice& key, const Slice& saved,
Cache::CreateContext* create_context,
const Cache::CacheItemHelper* helper,
Cache::Priority priority = Cache::Priority::LOW,
size_t* out_charge = nullptr);
} // namespace ROCKSDB_NAMESPACE

364
cache/cache_key.cc vendored

@ -0,0 +1,364 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "cache/cache_key.h"
#include <algorithm>
#include <atomic>
#include "rocksdb/advanced_cache.h"
#include "table/unique_id_impl.h"
#include "util/hash.h"
#include "util/math.h"
namespace ROCKSDB_NAMESPACE {
// Value space plan for CacheKey:
//
// file_num_etc64_ | offset_etc64_ | Only generated by
// ---------------+---------------+------------------------------------------
// 0 | 0 | Reserved for "empty" CacheKey()
// 0 | > 0, < 1<<63 | CreateUniqueForCacheLifetime
// 0 | >= 1<<63 | CreateUniqueForProcessLifetime
// > 0 | any | OffsetableCacheKey.WithOffset
CacheKey CacheKey::CreateUniqueForCacheLifetime(Cache *cache) {
// +1 so that we can reserve all zeros for "unset" cache key
uint64_t id = cache->NewId() + 1;
// Ensure we don't collide with CreateUniqueForProcessLifetime
assert((id >> 63) == 0U);
return CacheKey(0, id);
}
CacheKey CacheKey::CreateUniqueForProcessLifetime() {
// To avoid colliding with CreateUniqueForCacheLifetime, assuming
// Cache::NewId counts up from zero, here we count down from UINT64_MAX.
// If this ever becomes a point of contention, we could sub-divide the
// space and use CoreLocalArray.
static std::atomic<uint64_t> counter{UINT64_MAX};
uint64_t id = counter.fetch_sub(1, std::memory_order_relaxed);
// Ensure we don't collide with CreateUniqueForCacheLifetime
assert((id >> 63) == 1U);
return CacheKey(0, id);
}
// How we generate CacheKeys and base OffsetableCacheKey, assuming that
// db_session_ids are generated from a base_session_id and
// session_id_counter (by SemiStructuredUniqueIdGen+EncodeSessionId
// in DBImpl::GenerateDbSessionId):
//
// Conceptual inputs:
// db_id (unstructured, from GenerateRawUniqueId or equiv)
// * could be shared between cloned DBs but rare
// * could be constant, if session id suffices
// base_session_id (unstructured, from GenerateRawUniqueId)
// session_id_counter (structured)
// * usually much smaller than 2**24
// orig_file_number (structured)
// * usually smaller than 2**24
// offset_in_file (structured, might skip lots of values)
// * usually smaller than 2**32
//
// Overall approach (see https://github.com/pdillinger/unique_id for
// background):
//
// First, we have three "structured" values, up to 64 bits each, that we
// need to fit, without losses, into 128 bits. In practice, the values will
// be small enough that they should fit. For example, applications generating
// large SST files (large offsets) will naturally produce fewer files (small
// file numbers). But we don't know ahead of time what bounds the values will
// have.
//
// Second, we have unstructured inputs that enable distinct RocksDB processes
// to pick a random point in space, likely very different from others. Xoring
// the structured with the unstructured give us a cache key that is
// structurally distinct between related keys (e.g. same file or same RocksDB
// process) and distinct with high probability between unrelated keys.
//
// The problem of packing three structured values into the space for two is
// complicated by the fact that we want to derive cache keys from SST unique
// IDs, which have already combined structured and unstructured inputs in a
// practically inseparable way. And we want a base cache key that works
// with an offset of any size. So basically, we need to encode these three
// structured values, each up to 64 bits, into 128 bits without knowing any
// of their sizes. The DownwardInvolution() function gives us a mechanism to
// accomplish this. (See its properties in math.h.) Specifically, for inputs
// a, b, and c:
// lower64 = DownwardInvolution(a) ^ ReverseBits(b);
// upper64 = c ^ ReverseBits(a);
// The 128-bit output is unique assuming there exist some i, j, and k
// where a < 2**i, b < 2**j, c < 2**k, i <= 64, j <= 64, k <= 64, and
// i + j + k <= 128. In other words, as long as there exist some bounds
// that would allow us to pack the bits of a, b, and c into the output
// if we know the bound, we can generate unique outputs without knowing
// those bounds. To validate this claim, the inversion function (given
// the bounds) has been implemented in CacheKeyDecoder in
// db_block_cache_test.cc.
//
// With that in mind, the outputs in terms of the conceptual inputs look
// like this, using bitwise-xor of the constituent pieces, low bits on left:
//
// |------------------------- file_num_etc64 -------------------------|
// | +++++++++ base_session_id (lower 64 bits, involution) +++++++++ |
// |-----------------------------------------------------------------|
// | session_id_counter (involution) ..... | |
// |-----------------------------------------------------------------|
// | hash of: ++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
// | * base_session_id (upper ~39 bits) |
// | * db_id (~122 bits entropy) |
// |-----------------------------------------------------------------|
// | | ..... orig_file_number (reversed) |
// |-----------------------------------------------------------------|
//
//
// |------------------------- offset_etc64 --------------------------|
// | ++++++++++ base_session_id (lower 64 bits, reversed) ++++++++++ |
// |-----------------------------------------------------------------|
// | | ..... session_id_counter (reversed) |
// |-----------------------------------------------------------------|
// | offset_in_file ............... | |
// |-----------------------------------------------------------------|
//
// Some oddities or inconveniences of this layout are due to deriving
// the "base" cache key (without offset) from the SST unique ID (see
// GetSstInternalUniqueId). Specifically,
// * Lower 64 of base_session_id occurs in both output words (ok but
// weird)
// * The inclusion of db_id is bad for the conditions under which we
// can guarantee uniqueness, but could be useful in some cases with
// few small files per process, to make up for db session id only having
// ~103 bits of entropy.
//
// In fact, if DB ids were not involved, we would be guaranteed unique
// cache keys for files generated in a single process until total bits for
// biggest session_id_counter, orig_file_number, and offset_in_file
// reach 128 bits.
//
// With the DB id limitation, we only have nice guaranteed unique cache
// keys for files generated in a single process until biggest
// session_id_counter and offset_in_file reach combined 64 bits. This
// is quite good in practice because we can have millions of DB Opens
// with terabyte size SST files, or billions of DB Opens with gigabyte
// size SST files.
//
// One of the considerations in the translation between existing SST unique
// IDs and base cache keys is supporting better SST unique IDs in a future
// format_version. If we use a process-wide file counter instead of
// session counter and file numbers, we only need to combine two 64-bit values
// instead of three. But we don't want to track unique ID versions in the
// manifest, so we want to keep the same translation layer between SST unique
// IDs and base cache keys, even with updated SST unique IDs. If the new
// unique IDs put the file counter where the orig_file_number was, and
// use no structured field where session_id_counter was, then our translation
// layer works fine for two structured fields as well as three (for
// compatibility). The small computation for the translation (one
// DownwardInvolution(), two ReverseBits(), both ~log(64) instructions deep)
// is negligible for computing as part of SST file reader open.
//
// More on how https://github.com/pdillinger/unique_id applies here:
// Every bit of output always includes "unstructured" uniqueness bits and
// often combines with "structured" uniqueness bits. The "unstructured" bits
// change infrequently: only when we cannot guarantee our state tracking for
// "structured" uniqueness hasn't been cloned. Using a static
// SemiStructuredUniqueIdGen for db_session_ids, this means we only get an
// "all new" session id when a new process uses RocksDB. (Between processes,
// we don't know if a DB or other persistent storage has been cloned. We
// assume that if VM hot cloning is used, subsequently generated SST files
// do not interact.) Within a process, only the session_lower of the
// db_session_id changes incrementally ("structured" uniqueness).
//
// This basically means that our offsets, counters and file numbers allow us
// to do somewhat "better than random" (birthday paradox) while in the
// degenerate case of completely new session for each tiny file, we still
// have strong uniqueness properties from the birthday paradox, with ~103
// bit session IDs or up to 128 bits entropy with different DB IDs sharing a
// cache.
//
// More collision probability analysis:
// Suppose a RocksDB host generates (generously) 2 GB/s (10TB data, 17 DWPD)
// with average process/session lifetime of (pessimistically) 4 minutes.
// In 180 days (generous allowable data lifespan), we generate 31 million GB
// of data, or 2^55 bytes, and 2^16 "all new" session IDs.
//
// First, suppose this is in a single DB (lifetime 180 days):
// 128 bits cache key size
// - 55 <- ideal size for byte offsets + file numbers
// - 2 <- bits for offsets and file numbers not exactly powers of two
// + 2 <- bits saved not using byte offsets in BlockBasedTable::GetCacheKey
// ----
// 73 <- bits remaining for distinguishing session IDs
// The probability of a collision in 73 bits of session ID data is less than
// 1 in 2**(73 - (2 * 16)), or roughly 1 in a trillion. And this assumes all
// data from the last 180 days is in cache for potential collision, and that
// cache keys under each session id exhaustively cover the remaining 57 bits
// while in reality they'll only cover a small fraction of it.
//
// Although data could be transferred between hosts, each host has its own
// cache and we are already assuming a high rate of "all new" session ids.
// So this doesn't really change the collision calculation. Across a fleet
// of 1 million, each with <1 in a trillion collision possibility,
// fleetwide collision probability is <1 in a million.
//
// Now suppose we have many DBs per host, say 2**10, with same host-wide write
// rate and process/session lifetime. File numbers will be ~10 bits smaller
// and we will have 2**10 times as many session IDs because of simultaneous
// lifetimes. So now collision chance is less than 1 in 2**(83 - (2 * 26)),
// or roughly 1 in a billion.
//
// Suppose instead we generated random or hashed cache keys for each
// (compressed) block. For 1KB compressed block size, that is 2^45 cache keys
// in 180 days. Collision probability is more easily estimated at roughly
// 1 in 2**(128 - (2 * 45)) or roughly 1 in a trillion (assuming all
// data from the last 180 days is in cache, but NOT the other assumption
// for the 1 in a trillion estimate above).
//
//
// Collision probability estimation through simulation:
// A tool ./cache_bench -stress_cache_key broadly simulates host-wide cache
// activity over many months, by making some pessimistic simplifying
// assumptions. See class StressCacheKey in cache_bench_tool.cc for details.
// Here is some sample output with
// `./cache_bench -stress_cache_key -sck_keep_bits=43`:
//
// Total cache or DBs size: 32TiB Writing 925.926 MiB/s or 76.2939TiB/day
// Multiply by 1.15292e+18 to correct for simulation losses (but still
// assume whole file cached)
//
// These come from default settings of 2.5M files per day of 32 MB each, and
// `-sck_keep_bits=43` means that to represent a single file, we are only
// keeping 43 bits of the 128-bit (base) cache key. With file size of 2**25
// contiguous keys (pessimistic), our simulation is about 2\*\*(128-43-25) or
// about 1 billion billion times more prone to collision than reality.
//
// More default assumptions, relatively pessimistic:
// * 100 DBs in same process (doesn't matter much)
// * Re-open DB in same process (new session ID related to old session ID) on
// average every 100 files generated
// * Restart process (all new session IDs unrelated to old) 24 times per day
//
// After enough data, we get a result at the end (-sck_keep_bits=43):
//
// (keep 43 bits) 18 collisions after 2 x 90 days, est 10 days between
// (1.15292e+19 corrected)
//
// If we believe the (pessimistic) simulation and the mathematical
// extrapolation, we would need to run a billion machines all for 11 billion
// days to expect a cache key collision. To help verify that our extrapolation
// ("corrected") is robust, we can make our simulation more precise by
// increasing the "keep" bits, which takes more running time to get enough
// collision data:
//
// (keep 44 bits) 16 collisions after 5 x 90 days, est 28.125 days between
// (1.6213e+19 corrected)
// (keep 45 bits) 15 collisions after 7 x 90 days, est 42 days between
// (1.21057e+19 corrected)
// (keep 46 bits) 15 collisions after 17 x 90 days, est 102 days between
// (1.46997e+19 corrected)
// (keep 47 bits) 15 collisions after 49 x 90 days, est 294 days between
// (2.11849e+19 corrected)
//
// The extrapolated prediction seems to be within noise (sampling error).
//
// With the `-sck_randomize` option, we can see that typical workloads like
// above have lower collision probability than "random" cache keys (note:
// offsets still non-randomized) by a modest amount (roughly 2-3x less
// collision prone than random), which should make us reasonably comfortable
// even in "degenerate" cases (e.g. repeatedly launch a process to generate
// one file with SstFileWriter):
//
// (rand 43 bits) 22 collisions after 1 x 90 days, est 4.09091 days between
// (4.7165e+18 corrected)
//
// We can see that with more frequent process restarts,
// -sck_restarts_per_day=5000, which means more all-new session IDs, we get
// closer to the "random" cache key performance:
//
// 15 collisions after 1 x 90 days, est 6 days between (6.91753e+18 corrected)
//
// And with less frequent process restarts and re-opens,
// -sck_restarts_per_day=1 -sck_reopen_nfiles=1000, we get lower collision
// probability:
//
// 18 collisions after 8 x 90 days, est 40 days between (4.61169e+19 corrected)
//
// Other tests have been run to validate other conditions behave as expected,
// never behaving "worse than random" unless we start chopping off structured
// data.
//
// Conclusion: Even in extreme cases, rapidly burning through "all new" IDs
// that only arise when a new process is started, the chance of any cache key
// collisions in a giant fleet of machines is negligible. Especially when
// processes live for hours or days, the chance of a cache key collision is
// likely more plausibly due to bad hardware than to bad luck in random
// session ID data. Software defects are surely more likely to cause corruption
// than both of those.
//
// TODO: Nevertheless / regardless, an efficient way to detect (and thus
// quantify) block cache corruptions, including collisions, should be added.
OffsetableCacheKey::OffsetableCacheKey(const std::string &db_id,
const std::string &db_session_id,
uint64_t file_number) {
UniqueId64x2 internal_id;
Status s = GetSstInternalUniqueId(db_id, db_session_id, file_number,
&internal_id, /*force=*/true);
assert(s.ok());
*this = FromInternalUniqueId(&internal_id);
}
OffsetableCacheKey OffsetableCacheKey::FromInternalUniqueId(UniqueIdPtr id) {
uint64_t session_lower = id.ptr[0];
uint64_t file_num_etc = id.ptr[1];
#ifndef NDEBUG
bool is_empty = session_lower == 0 && file_num_etc == 0;
#endif
// Although DBImpl guarantees (in recent versions) that session_lower is not
// zero, that's not entirely sufficient to guarantee that file_num_etc64_ is
// not zero (so that the 0 case can be used by CacheKey::CreateUnique*)
// However, if we are given an "empty" id as input, then we should produce
// "empty" as output.
// As a consequence, this function is only bijective assuming
// id[0] == 0 only if id[1] == 0.
if (session_lower == 0U) {
session_lower = file_num_etc;
}
// See comments above for how DownwardInvolution and ReverseBits
// make this function invertible under various assumptions.
OffsetableCacheKey rv;
rv.file_num_etc64_ =
DownwardInvolution(session_lower) ^ ReverseBits(file_num_etc);
rv.offset_etc64_ = ReverseBits(session_lower);
// Because of these transformations and needing to allow arbitrary
// offset (thus, second 64 bits of cache key might be 0), we need to
// make some correction to ensure the first 64 bits is not 0.
// Fortunately, the transformation ensures the second 64 bits is not 0
// for non-empty base key, so we can swap in the case one is 0 without
// breaking bijectivity (assuming condition above).
assert(is_empty || rv.offset_etc64_ > 0);
if (rv.file_num_etc64_ == 0) {
std::swap(rv.file_num_etc64_, rv.offset_etc64_);
}
assert(is_empty || rv.file_num_etc64_ > 0);
return rv;
}
// Inverse of FromInternalUniqueId (assuming file_num_etc64 == 0 only if
// offset_etc64 == 0)
UniqueId64x2 OffsetableCacheKey::ToInternalUniqueId() {
uint64_t a = file_num_etc64_;
uint64_t b = offset_etc64_;
if (b == 0) {
std::swap(a, b);
}
UniqueId64x2 rv;
rv[0] = ReverseBits(b);
rv[1] = ReverseBits(a ^ DownwardInvolution(rv[0]));
return rv;
}
} // namespace ROCKSDB_NAMESPACE

143
cache/cache_key.h vendored

@ -0,0 +1,143 @@
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include <cstdint>
#include "rocksdb/rocksdb_namespace.h"
#include "rocksdb/slice.h"
#include "table/unique_id_impl.h"
namespace ROCKSDB_NAMESPACE {
class Cache;
// A standard holder for fixed-size block cache keys (and for related caches).
// They are created through one of these, each using its own range of values:
// * CacheKey::CreateUniqueForCacheLifetime
// * CacheKey::CreateUniqueForProcessLifetime
// * Default ctor ("empty" cache key)
// * OffsetableCacheKey->WithOffset
//
// The first two use atomic counters to guarantee uniqueness over the given
// lifetime and the last uses a form of universally unique identifier for
// uniqueness with very high probabilty (and guaranteed for files generated
// during a single process lifetime).
//
// CacheKeys are currently used by calling AsSlice() to pass as a key to
// Cache. For performance, the keys are endianness-dependent (though otherwise
// portable). (Persistable cache entries are not intended to cross platforms.)
class CacheKey {
public:
// For convenience, constructs an "empty" cache key that is never returned
// by other means.
inline CacheKey() : file_num_etc64_(), offset_etc64_() {}
inline bool IsEmpty() const {
return (file_num_etc64_ == 0) & (offset_etc64_ == 0);
}
// Use this cache key as a Slice (byte order is endianness-dependent)
inline Slice AsSlice() const {
static_assert(sizeof(*this) == 16, "Standardized on 16-byte cache key");
assert(!IsEmpty());
return Slice(reinterpret_cast<const char *>(this), sizeof(*this));
}
// Create a CacheKey that is unique among others associated with this Cache
// instance. Depends on Cache::NewId. This is useful for block cache
// "reservations".
static CacheKey CreateUniqueForCacheLifetime(Cache *cache);
// Create a CacheKey that is unique among others for the lifetime of this
// process. This is useful for saving in a static data member so that
// different DB instances can agree on a cache key for shared entities,
// such as for CacheEntryStatsCollector.
static CacheKey CreateUniqueForProcessLifetime();
protected:
friend class OffsetableCacheKey;
CacheKey(uint64_t file_num_etc64, uint64_t offset_etc64)
: file_num_etc64_(file_num_etc64), offset_etc64_(offset_etc64) {}
uint64_t file_num_etc64_;
uint64_t offset_etc64_;
};
constexpr uint8_t kCacheKeySize = static_cast<uint8_t>(sizeof(CacheKey));
// A file-specific generator of cache keys, sometimes referred to as the
// "base" cache key for a file because all the cache keys for various offsets
// within the file are computed using simple arithmetic. The basis for the
// general approach is dicussed here: https://github.com/pdillinger/unique_id
// Heavily related to GetUniqueIdFromTableProperties.
//
// If the db_id, db_session_id, and file_number come from the file's table
// properties, then the keys will be stable across DB::Open/Close, backup/
// restore, import/export, etc.
//
// This class "is a" CacheKey only privately so that it is not misused as
// a ready-to-use CacheKey.
class OffsetableCacheKey : private CacheKey {
public:
// For convenience, constructs an "empty" cache key that should not be used.
inline OffsetableCacheKey() : CacheKey() {}
// Constructs an OffsetableCacheKey with the given information about a file.
// This constructor never generates an "empty" base key.
OffsetableCacheKey(const std::string &db_id, const std::string &db_session_id,
uint64_t file_number);
// Creates an OffsetableCacheKey from an SST unique ID, so that cache keys
// can be derived from DB manifest data before reading the file from
// storage--so that every part of the file can potentially go in a persistent
// cache.
//
// Calling GetSstInternalUniqueId() on a db_id, db_session_id, and
// file_number and passing the result to this function produces the same
// base cache key as feeding those inputs directly to the constructor.
//
// This is a bijective transformation assuming either id is empty or
// lower 64 bits is non-zero:
// * Empty (all zeros) input -> empty (all zeros) output
// * Lower 64 input is non-zero -> lower 64 output (file_num_etc64_) is
// non-zero
static OffsetableCacheKey FromInternalUniqueId(UniqueIdPtr id);
// This is the inverse transformation to the above, assuming either empty
// or lower 64 bits (file_num_etc64_) is non-zero. Perhaps only useful for
// testing.
UniqueId64x2 ToInternalUniqueId();
inline bool IsEmpty() const {
bool result = file_num_etc64_ == 0;
assert(!(offset_etc64_ > 0 && result));
return result;
}
// Construct a CacheKey for an offset within a file. An offset is not
// necessarily a byte offset if a smaller unique identifier of keyable
// offsets is used.
//
// This class was designed to make this hot code extremely fast.
inline CacheKey WithOffset(uint64_t offset) const {
assert(!IsEmpty());
return CacheKey(file_num_etc64_, offset_etc64_ ^ offset);
}
// The "common prefix" is a shared prefix for all the returned CacheKeys.
// It is specific to the file but the same for all offsets within the file.
static constexpr size_t kCommonPrefixSize = 8;
inline Slice CommonPrefixSlice() const {
static_assert(sizeof(file_num_etc64_) == kCommonPrefixSize,
"8 byte common prefix expected");
assert(!IsEmpty());
assert(&this->file_num_etc64_ == static_cast<const void *>(this));
return Slice(reinterpret_cast<const char *>(this), kCommonPrefixSize);
}
};
} // namespace ROCKSDB_NAMESPACE

@ -0,0 +1,184 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "cache/cache_reservation_manager.h"
#include <cassert>
#include <cstddef>
#include <cstring>
#include <memory>
#include "rocksdb/cache.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "table/block_based/reader_common.h"
#include "util/coding.h"
namespace ROCKSDB_NAMESPACE {
template <CacheEntryRole R>
CacheReservationManagerImpl<R>::CacheReservationHandle::CacheReservationHandle(
std::size_t incremental_memory_used,
std::shared_ptr<CacheReservationManagerImpl> cache_res_mgr)
: incremental_memory_used_(incremental_memory_used) {
assert(cache_res_mgr);
cache_res_mgr_ = cache_res_mgr;
}
template <CacheEntryRole R>
CacheReservationManagerImpl<
R>::CacheReservationHandle::~CacheReservationHandle() {
Status s = cache_res_mgr_->ReleaseCacheReservation(incremental_memory_used_);
s.PermitUncheckedError();
}
template <CacheEntryRole R>
CacheReservationManagerImpl<R>::CacheReservationManagerImpl(
std::shared_ptr<Cache> cache, bool delayed_decrease)
: cache_(cache),
delayed_decrease_(delayed_decrease),
cache_allocated_size_(0),
memory_used_(0) {
assert(cache != nullptr);
}
template <CacheEntryRole R>
CacheReservationManagerImpl<R>::~CacheReservationManagerImpl() {
for (auto* handle : dummy_handles_) {
cache_.ReleaseAndEraseIfLastRef(handle);
}
}
template <CacheEntryRole R>
Status CacheReservationManagerImpl<R>::UpdateCacheReservation(
std::size_t new_mem_used) {
memory_used_ = new_mem_used;
std::size_t cur_cache_allocated_size =
cache_allocated_size_.load(std::memory_order_relaxed);
if (new_mem_used == cur_cache_allocated_size) {
return Status::OK();
} else if (new_mem_used > cur_cache_allocated_size) {
Status s = IncreaseCacheReservation(new_mem_used);
return s;
} else {
// In delayed decrease mode, we don't decrease cache reservation
// untill the memory usage is less than 3/4 of what we reserve
// in the cache.
// We do this because
// (1) Dummy entry insertion is expensive in block cache
// (2) Delayed releasing previously inserted dummy entries can save such
// expensive dummy entry insertion on memory increase in the near future,
// which is likely to happen when the memory usage is greater than or equal
// to 3/4 of what we reserve
if (delayed_decrease_ && new_mem_used >= cur_cache_allocated_size / 4 * 3) {
return Status::OK();
} else {
Status s = DecreaseCacheReservation(new_mem_used);
return s;
}
}
}
template <CacheEntryRole R>
Status CacheReservationManagerImpl<R>::MakeCacheReservation(
std::size_t incremental_memory_used,
std::unique_ptr<CacheReservationManager::CacheReservationHandle>* handle) {
assert(handle);
Status s =
UpdateCacheReservation(GetTotalMemoryUsed() + incremental_memory_used);
(*handle).reset(new CacheReservationManagerImpl::CacheReservationHandle(
incremental_memory_used,
std::enable_shared_from_this<
CacheReservationManagerImpl<R>>::shared_from_this()));
return s;
}
template <CacheEntryRole R>
Status CacheReservationManagerImpl<R>::ReleaseCacheReservation(
std::size_t incremental_memory_used) {
assert(GetTotalMemoryUsed() >= incremental_memory_used);
std::size_t updated_total_mem_used =
GetTotalMemoryUsed() - incremental_memory_used;
Status s = UpdateCacheReservation(updated_total_mem_used);
return s;
}
template <CacheEntryRole R>
Status CacheReservationManagerImpl<R>::IncreaseCacheReservation(
std::size_t new_mem_used) {
Status return_status = Status::OK();
while (new_mem_used > cache_allocated_size_.load(std::memory_order_relaxed)) {
Cache::Handle* handle = nullptr;
return_status = cache_.Insert(GetNextCacheKey(), kSizeDummyEntry, &handle);
if (return_status != Status::OK()) {
return return_status;
}
dummy_handles_.push_back(handle);
cache_allocated_size_ += kSizeDummyEntry;
}
return return_status;
}
template <CacheEntryRole R>
Status CacheReservationManagerImpl<R>::DecreaseCacheReservation(
std::size_t new_mem_used) {
Status return_status = Status::OK();
// Decrease to the smallest multiple of kSizeDummyEntry that is greater than
// or equal to new_mem_used We do addition instead of new_mem_used <=
// cache_allocated_size_.load(std::memory_order_relaxed) - kSizeDummyEntry to
// avoid underflow of size_t when cache_allocated_size_ = 0
while (new_mem_used + kSizeDummyEntry <=
cache_allocated_size_.load(std::memory_order_relaxed)) {
assert(!dummy_handles_.empty());
auto* handle = dummy_handles_.back();
cache_.ReleaseAndEraseIfLastRef(handle);
dummy_handles_.pop_back();
cache_allocated_size_ -= kSizeDummyEntry;
}
return return_status;
}
template <CacheEntryRole R>
std::size_t CacheReservationManagerImpl<R>::GetTotalReservedCacheSize() {
return cache_allocated_size_.load(std::memory_order_relaxed);
}
template <CacheEntryRole R>
std::size_t CacheReservationManagerImpl<R>::GetTotalMemoryUsed() {
return memory_used_;
}
template <CacheEntryRole R>
Slice CacheReservationManagerImpl<R>::GetNextCacheKey() {
// Calling this function will have the side-effect of changing the
// underlying cache_key_ that is shared among other keys generated from this
// fucntion. Therefore please make sure the previous keys are saved/copied
// before calling this function.
cache_key_ = CacheKey::CreateUniqueForCacheLifetime(cache_.get());
return cache_key_.AsSlice();
}
template <CacheEntryRole R>
const Cache::CacheItemHelper*
CacheReservationManagerImpl<R>::TEST_GetCacheItemHelperForRole() {
return CacheInterface::GetHelper();
}
template class CacheReservationManagerImpl<
CacheEntryRole::kBlockBasedTableReader>;
template class CacheReservationManagerImpl<
CacheEntryRole::kCompressionDictionaryBuildingBuffer>;
template class CacheReservationManagerImpl<CacheEntryRole::kFilterConstruction>;
template class CacheReservationManagerImpl<CacheEntryRole::kMisc>;
template class CacheReservationManagerImpl<CacheEntryRole::kWriteBuffer>;
template class CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>;
template class CacheReservationManagerImpl<CacheEntryRole::kBlobCache>;
} // namespace ROCKSDB_NAMESPACE

@ -0,0 +1,317 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <mutex>
#include <vector>
#include "cache/cache_entry_roles.h"
#include "cache/cache_key.h"
#include "cache/typed_cache.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "util/coding.h"
namespace ROCKSDB_NAMESPACE {
// CacheReservationManager is an interface for reserving cache space for the
// memory used
class CacheReservationManager {
public:
// CacheReservationHandle is for managing the lifetime of a cache reservation
// for an incremental amount of memory used (i.e, incremental_memory_used)
class CacheReservationHandle {
public:
virtual ~CacheReservationHandle() {}
};
virtual ~CacheReservationManager() {}
virtual Status UpdateCacheReservation(std::size_t new_memory_used) = 0;
// TODO(hx235): replace the usage of
// `UpdateCacheReservation(memory_used_delta, increase)` with
// `UpdateCacheReservation(new_memory_used)` so that we only have one
// `UpdateCacheReservation` function
virtual Status UpdateCacheReservation(std::size_t memory_used_delta,
bool increase) = 0;
virtual Status MakeCacheReservation(
std::size_t incremental_memory_used,
std::unique_ptr<CacheReservationManager::CacheReservationHandle>
*handle) = 0;
virtual std::size_t GetTotalReservedCacheSize() = 0;
virtual std::size_t GetTotalMemoryUsed() = 0;
};
// CacheReservationManagerImpl implements interface CacheReservationManager
// for reserving cache space for the memory used by inserting/releasing dummy
// entries in the cache.
//
// This class is NOT thread-safe, except that GetTotalReservedCacheSize()
// can be called without external synchronization.
template <CacheEntryRole R>
class CacheReservationManagerImpl
: public CacheReservationManager,
public std::enable_shared_from_this<CacheReservationManagerImpl<R>> {
public:
class CacheReservationHandle
: public CacheReservationManager::CacheReservationHandle {
public:
CacheReservationHandle(
std::size_t incremental_memory_used,
std::shared_ptr<CacheReservationManagerImpl> cache_res_mgr);
~CacheReservationHandle() override;
private:
std::size_t incremental_memory_used_;
std::shared_ptr<CacheReservationManagerImpl> cache_res_mgr_;
};
// Construct a CacheReservationManagerImpl
// @param cache The cache where dummy entries are inserted and released for
// reserving cache space
// @param delayed_decrease If set true, then dummy entries won't be released
// immediately when memory usage decreases.
// Instead, it will be released when the memory usage
// decreases to 3/4 of what we have reserved so far.
// This is for saving some future dummy entry
// insertion when memory usage increases are likely to
// happen in the near future.
//
// REQUIRED: cache is not nullptr
explicit CacheReservationManagerImpl(std::shared_ptr<Cache> cache,
bool delayed_decrease = false);
// no copy constructor, copy assignment, move constructor, move assignment
CacheReservationManagerImpl(const CacheReservationManagerImpl &) = delete;
CacheReservationManagerImpl &operator=(const CacheReservationManagerImpl &) =
delete;
CacheReservationManagerImpl(CacheReservationManagerImpl &&) = delete;
CacheReservationManagerImpl &operator=(CacheReservationManagerImpl &&) =
delete;
~CacheReservationManagerImpl() override;
// One of the two ways of reserving/releasing cache space,
// see MakeCacheReservation() for the other.
//
// Use ONLY one of these two ways to prevent unexpected behavior.
//
// Insert and release dummy entries in the cache to
// match the size of total dummy entries with the least multiple of
// kSizeDummyEntry greater than or equal to new_mem_used
//
// Insert dummy entries if new_memory_used > cache_allocated_size_;
//
// Release dummy entries if new_memory_used < cache_allocated_size_
// (and new_memory_used < cache_allocated_size_ * 3/4
// when delayed_decrease is set true);
//
// Keey dummy entries the same if (1) new_memory_used == cache_allocated_size_
// or (2) new_memory_used is in the interval of
// [cache_allocated_size_ * 3/4, cache_allocated_size) when delayed_decrease
// is set true.
//
// @param new_memory_used The number of bytes used by new memory
// The most recent new_memoy_used passed in will be returned
// in GetTotalMemoryUsed() even when the call return non-ok status.
//
// Since the class is NOT thread-safe, external synchronization on the
// order of calling UpdateCacheReservation() is needed if you want
// GetTotalMemoryUsed() indeed returns the latest memory used.
//
// @return On inserting dummy entries, it returns Status::OK() if all dummy
// entry insertions succeed.
// Otherwise, it returns the first non-ok status;
// On releasing dummy entries, it always returns Status::OK().
// On keeping dummy entries the same, it always returns Status::OK().
Status UpdateCacheReservation(std::size_t new_memory_used) override;
Status UpdateCacheReservation(std::size_t /* memory_used_delta */,
bool /* increase */) override {
return Status::NotSupported();
}
// One of the two ways of reserving cache space and releasing is done through
// destruction of CacheReservationHandle.
// See UpdateCacheReservation() for the other way.
//
// Use ONLY one of these two ways to prevent unexpected behavior.
//
// Insert dummy entries in the cache for the incremental memory usage
// to match the size of total dummy entries with the least multiple of
// kSizeDummyEntry greater than or equal to the total memory used.
//
// A CacheReservationHandle is returned as an output parameter.
// The reserved dummy entries are automatically released on the destruction of
// this handle, which achieves better RAII per cache reservation.
//
// WARNING: Deallocate all the handles of the CacheReservationManager object
// before deallocating the object to prevent unexpected behavior.
//
// @param incremental_memory_used The number of bytes increased in memory
// usage.
//
// Calling GetTotalMemoryUsed() afterward will return the total memory
// increased by this number, even when calling MakeCacheReservation()
// returns non-ok status.
//
// Since the class is NOT thread-safe, external synchronization in
// calling MakeCacheReservation() is needed if you want
// GetTotalMemoryUsed() indeed returns the latest memory used.
//
// @param handle An pointer to std::unique_ptr<CacheReservationHandle> that
// manages the lifetime of the cache reservation represented by the
// handle.
//
// @return It returns Status::OK() if all dummy
// entry insertions succeed.
// Otherwise, it returns the first non-ok status;
//
// REQUIRES: handle != nullptr
Status MakeCacheReservation(
std::size_t incremental_memory_used,
std::unique_ptr<CacheReservationManager::CacheReservationHandle> *handle)
override;
// Return the size of the cache (which is a multiple of kSizeDummyEntry)
// successfully reserved by calling UpdateCacheReservation().
//
// When UpdateCacheReservation() returns non-ok status,
// calling GetTotalReservedCacheSize() after that might return a slightly
// smaller number than the actual reserved cache size due to
// the returned number will always be a multiple of kSizeDummyEntry
// and cache full might happen in the middle of inserting a dummy entry.
std::size_t GetTotalReservedCacheSize() override;
// Return the latest total memory used indicated by the most recent call of
// UpdateCacheReservation(std::size_t new_memory_used);
std::size_t GetTotalMemoryUsed() override;
static constexpr std::size_t GetDummyEntrySize() { return kSizeDummyEntry; }
// For testing only - it is to help ensure the CacheItemHelperForRole<R>
// accessed from CacheReservationManagerImpl and the one accessed from the
// test are from the same translation units
static const Cache::CacheItemHelper *TEST_GetCacheItemHelperForRole();
private:
static constexpr std::size_t kSizeDummyEntry = 256 * 1024;
Slice GetNextCacheKey();
Status ReleaseCacheReservation(std::size_t incremental_memory_used);
Status IncreaseCacheReservation(std::size_t new_mem_used);
Status DecreaseCacheReservation(std::size_t new_mem_used);
using CacheInterface = PlaceholderSharedCacheInterface<R>;
CacheInterface cache_;
bool delayed_decrease_;
std::atomic<std::size_t> cache_allocated_size_;
std::size_t memory_used_;
std::vector<Cache::Handle *> dummy_handles_;
CacheKey cache_key_;
};
class ConcurrentCacheReservationManager
: public CacheReservationManager,
public std::enable_shared_from_this<ConcurrentCacheReservationManager> {
public:
class CacheReservationHandle
: public CacheReservationManager::CacheReservationHandle {
public:
CacheReservationHandle(
std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr,
std::unique_ptr<CacheReservationManager::CacheReservationHandle>
cache_res_handle) {
assert(cache_res_mgr && cache_res_handle);
cache_res_mgr_ = cache_res_mgr;
cache_res_handle_ = std::move(cache_res_handle);
}
~CacheReservationHandle() override {
std::lock_guard<std::mutex> lock(cache_res_mgr_->cache_res_mgr_mu_);
cache_res_handle_.reset();
}
private:
std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr_;
std::unique_ptr<CacheReservationManager::CacheReservationHandle>
cache_res_handle_;
};
explicit ConcurrentCacheReservationManager(
std::shared_ptr<CacheReservationManager> cache_res_mgr) {
cache_res_mgr_ = std::move(cache_res_mgr);
}
ConcurrentCacheReservationManager(const ConcurrentCacheReservationManager &) =
delete;
ConcurrentCacheReservationManager &operator=(
const ConcurrentCacheReservationManager &) = delete;
ConcurrentCacheReservationManager(ConcurrentCacheReservationManager &&) =
delete;
ConcurrentCacheReservationManager &operator=(
ConcurrentCacheReservationManager &&) = delete;
~ConcurrentCacheReservationManager() override {}
inline Status UpdateCacheReservation(std::size_t new_memory_used) override {
std::lock_guard<std::mutex> lock(cache_res_mgr_mu_);
return cache_res_mgr_->UpdateCacheReservation(new_memory_used);
}
inline Status UpdateCacheReservation(std::size_t memory_used_delta,
bool increase) override {
std::lock_guard<std::mutex> lock(cache_res_mgr_mu_);
std::size_t total_mem_used = cache_res_mgr_->GetTotalMemoryUsed();
Status s;
if (!increase) {
assert(total_mem_used >= memory_used_delta);
s = cache_res_mgr_->UpdateCacheReservation(total_mem_used -
memory_used_delta);
} else {
s = cache_res_mgr_->UpdateCacheReservation(total_mem_used +
memory_used_delta);
}
return s;
}
inline Status MakeCacheReservation(
std::size_t incremental_memory_used,
std::unique_ptr<CacheReservationManager::CacheReservationHandle> *handle)
override {
std::unique_ptr<CacheReservationManager::CacheReservationHandle>
wrapped_handle;
Status s;
{
std::lock_guard<std::mutex> lock(cache_res_mgr_mu_);
s = cache_res_mgr_->MakeCacheReservation(incremental_memory_used,
&wrapped_handle);
}
(*handle).reset(
new ConcurrentCacheReservationManager::CacheReservationHandle(
std::enable_shared_from_this<
ConcurrentCacheReservationManager>::shared_from_this(),
std::move(wrapped_handle)));
return s;
}
inline std::size_t GetTotalReservedCacheSize() override {
return cache_res_mgr_->GetTotalReservedCacheSize();
}
inline std::size_t GetTotalMemoryUsed() override {
std::lock_guard<std::mutex> lock(cache_res_mgr_mu_);
return cache_res_mgr_->GetTotalMemoryUsed();
}
private:
std::mutex cache_res_mgr_mu_;
std::shared_ptr<CacheReservationManager> cache_res_mgr_;
};
} // namespace ROCKSDB_NAMESPACE

@ -0,0 +1,469 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "cache/cache_reservation_manager.h"
#include <cstddef>
#include <cstring>
#include <memory>
#include "cache/cache_entry_roles.h"
#include "rocksdb/cache.h"
#include "rocksdb/slice.h"
#include "test_util/testharness.h"
#include "util/coding.h"
namespace ROCKSDB_NAMESPACE {
class CacheReservationManagerTest : public ::testing::Test {
protected:
static constexpr std::size_t kSizeDummyEntry =
CacheReservationManagerImpl<CacheEntryRole::kMisc>::GetDummyEntrySize();
static constexpr std::size_t kCacheCapacity = 4096 * kSizeDummyEntry;
static constexpr int kNumShardBits = 0; // 2^0 shard
static constexpr std::size_t kMetaDataChargeOverhead = 10000;
std::shared_ptr<Cache> cache = NewLRUCache(kCacheCapacity, kNumShardBits);
std::shared_ptr<CacheReservationManager> test_cache_rev_mng;
CacheReservationManagerTest() {
test_cache_rev_mng =
std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
cache);
}
};
TEST_F(CacheReservationManagerTest, GenerateCacheKey) {
std::size_t new_mem_used = 1 * kSizeDummyEntry;
Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
ASSERT_EQ(s, Status::OK());
ASSERT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry);
ASSERT_LT(cache->GetPinnedUsage(),
1 * kSizeDummyEntry + kMetaDataChargeOverhead);
// Next unique Cache key
CacheKey ckey = CacheKey::CreateUniqueForCacheLifetime(cache.get());
// Get to the underlying values
uint64_t* ckey_data = reinterpret_cast<uint64_t*>(&ckey);
// Back it up to the one used by CRM (using CacheKey implementation details)
ckey_data[1]--;
// Specific key (subject to implementation details)
EXPECT_EQ(ckey_data[0], 0);
EXPECT_EQ(ckey_data[1], 2);
Cache::Handle* handle = cache->Lookup(ckey.AsSlice());
EXPECT_NE(handle, nullptr)
<< "Failed to generate the cache key for the dummy entry correctly";
// Clean up the returned handle from Lookup() to prevent memory leak
cache->Release(handle);
}
TEST_F(CacheReservationManagerTest, KeepCacheReservationTheSame) {
std::size_t new_mem_used = 1 * kSizeDummyEntry;
Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
ASSERT_EQ(s, Status::OK());
ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
1 * kSizeDummyEntry);
ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used);
std::size_t initial_pinned_usage = cache->GetPinnedUsage();
ASSERT_GE(initial_pinned_usage, 1 * kSizeDummyEntry);
ASSERT_LT(initial_pinned_usage,
1 * kSizeDummyEntry + kMetaDataChargeOverhead);
s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
EXPECT_EQ(s, Status::OK())
<< "Failed to keep cache reservation the same when new_mem_used equals "
"to current cache reservation";
EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
1 * kSizeDummyEntry)
<< "Failed to bookkeep correctly when new_mem_used equals to current "
"cache reservation";
EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
<< "Failed to bookkeep the used memory correctly when new_mem_used "
"equals to current cache reservation";
EXPECT_EQ(cache->GetPinnedUsage(), initial_pinned_usage)
<< "Failed to keep underlying dummy entries the same when new_mem_used "
"equals to current cache reservation";
}
TEST_F(CacheReservationManagerTest,
IncreaseCacheReservationByMultiplesOfDummyEntrySize) {
std::size_t new_mem_used = 2 * kSizeDummyEntry;
Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
EXPECT_EQ(s, Status::OK())
<< "Failed to increase cache reservation correctly";
EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
2 * kSizeDummyEntry)
<< "Failed to bookkeep cache reservation increase correctly";
EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
<< "Failed to bookkeep the used memory correctly";
EXPECT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry)
<< "Failed to increase underlying dummy entries in cache correctly";
EXPECT_LT(cache->GetPinnedUsage(),
2 * kSizeDummyEntry + kMetaDataChargeOverhead)
<< "Failed to increase underlying dummy entries in cache correctly";
}
TEST_F(CacheReservationManagerTest,
IncreaseCacheReservationNotByMultiplesOfDummyEntrySize) {
std::size_t new_mem_used = 2 * kSizeDummyEntry + kSizeDummyEntry / 2;
Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
EXPECT_EQ(s, Status::OK())
<< "Failed to increase cache reservation correctly";
EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
3 * kSizeDummyEntry)
<< "Failed to bookkeep cache reservation increase correctly";
EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
<< "Failed to bookkeep the used memory correctly";
EXPECT_GE(cache->GetPinnedUsage(), 3 * kSizeDummyEntry)
<< "Failed to increase underlying dummy entries in cache correctly";
EXPECT_LT(cache->GetPinnedUsage(),
3 * kSizeDummyEntry + kMetaDataChargeOverhead)
<< "Failed to increase underlying dummy entries in cache correctly";
}
TEST(CacheReservationManagerIncreaseReservcationOnFullCacheTest,
IncreaseCacheReservationOnFullCache) {
;
constexpr std::size_t kSizeDummyEntry =
CacheReservationManagerImpl<CacheEntryRole::kMisc>::GetDummyEntrySize();
constexpr std::size_t kSmallCacheCapacity = 4 * kSizeDummyEntry;
constexpr std::size_t kBigCacheCapacity = 4096 * kSizeDummyEntry;
constexpr std::size_t kMetaDataChargeOverhead = 10000;
LRUCacheOptions lo;
lo.capacity = kSmallCacheCapacity;
lo.num_shard_bits = 0; // 2^0 shard
lo.strict_capacity_limit = true;
std::shared_ptr<Cache> cache = NewLRUCache(lo);
std::shared_ptr<CacheReservationManager> test_cache_rev_mng =
std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
cache);
std::size_t new_mem_used = kSmallCacheCapacity + 1;
Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
EXPECT_EQ(s, Status::MemoryLimit())
<< "Failed to return status to indicate failure of dummy entry insertion "
"during cache reservation on full cache";
EXPECT_GE(test_cache_rev_mng->GetTotalReservedCacheSize(),
1 * kSizeDummyEntry)
<< "Failed to bookkeep correctly before cache resevation failure happens "
"due to full cache";
EXPECT_LE(test_cache_rev_mng->GetTotalReservedCacheSize(),
kSmallCacheCapacity)
<< "Failed to bookkeep correctly (i.e, bookkeep only successful dummy "
"entry insertions) when encountering cache resevation failure due to "
"full cache";
EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
<< "Failed to bookkeep the used memory correctly";
EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry)
<< "Failed to insert underlying dummy entries correctly when "
"encountering cache resevation failure due to full cache";
EXPECT_LE(cache->GetPinnedUsage(), kSmallCacheCapacity)
<< "Failed to insert underlying dummy entries correctly when "
"encountering cache resevation failure due to full cache";
new_mem_used = kSmallCacheCapacity / 2; // 2 dummy entries
s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
EXPECT_EQ(s, Status::OK())
<< "Failed to decrease cache reservation after encountering cache "
"reservation failure due to full cache";
EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
2 * kSizeDummyEntry)
<< "Failed to bookkeep cache reservation decrease correctly after "
"encountering cache reservation due to full cache";
EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
<< "Failed to bookkeep the used memory correctly";
EXPECT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry)
<< "Failed to release underlying dummy entries correctly on cache "
"reservation decrease after encountering cache resevation failure due "
"to full cache";
EXPECT_LT(cache->GetPinnedUsage(),
2 * kSizeDummyEntry + kMetaDataChargeOverhead)
<< "Failed to release underlying dummy entries correctly on cache "
"reservation decrease after encountering cache resevation failure due "
"to full cache";
// Create cache full again for subsequent tests
new_mem_used = kSmallCacheCapacity + 1;
s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
EXPECT_EQ(s, Status::MemoryLimit())
<< "Failed to return status to indicate failure of dummy entry insertion "
"during cache reservation on full cache";
EXPECT_GE(test_cache_rev_mng->GetTotalReservedCacheSize(),
1 * kSizeDummyEntry)
<< "Failed to bookkeep correctly before cache resevation failure happens "
"due to full cache";
EXPECT_LE(test_cache_rev_mng->GetTotalReservedCacheSize(),
kSmallCacheCapacity)
<< "Failed to bookkeep correctly (i.e, bookkeep only successful dummy "
"entry insertions) when encountering cache resevation failure due to "
"full cache";
EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
<< "Failed to bookkeep the used memory correctly";
EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry)
<< "Failed to insert underlying dummy entries correctly when "
"encountering cache resevation failure due to full cache";
EXPECT_LE(cache->GetPinnedUsage(), kSmallCacheCapacity)
<< "Failed to insert underlying dummy entries correctly when "
"encountering cache resevation failure due to full cache";
// Increase cache capacity so the previously failed insertion can fully
// succeed
cache->SetCapacity(kBigCacheCapacity);
new_mem_used = kSmallCacheCapacity + 1;
s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
EXPECT_EQ(s, Status::OK())
<< "Failed to increase cache reservation after increasing cache capacity "
"and mitigating cache full error";
EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
5 * kSizeDummyEntry)
<< "Failed to bookkeep cache reservation increase correctly after "
"increasing cache capacity and mitigating cache full error";
EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
<< "Failed to bookkeep the used memory correctly";
EXPECT_GE(cache->GetPinnedUsage(), 5 * kSizeDummyEntry)
<< "Failed to insert underlying dummy entries correctly after increasing "
"cache capacity and mitigating cache full error";
EXPECT_LT(cache->GetPinnedUsage(),
5 * kSizeDummyEntry + kMetaDataChargeOverhead)
<< "Failed to insert underlying dummy entries correctly after increasing "
"cache capacity and mitigating cache full error";
}
TEST_F(CacheReservationManagerTest,
DecreaseCacheReservationByMultiplesOfDummyEntrySize) {
std::size_t new_mem_used = 2 * kSizeDummyEntry;
Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
ASSERT_EQ(s, Status::OK());
ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
2 * kSizeDummyEntry);
ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used);
ASSERT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry);
ASSERT_LT(cache->GetPinnedUsage(),
2 * kSizeDummyEntry + kMetaDataChargeOverhead);
new_mem_used = 1 * kSizeDummyEntry;
s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
EXPECT_EQ(s, Status::OK())
<< "Failed to decrease cache reservation correctly";
EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
1 * kSizeDummyEntry)
<< "Failed to bookkeep cache reservation decrease correctly";
EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
<< "Failed to bookkeep the used memory correctly";
EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry)
<< "Failed to decrease underlying dummy entries in cache correctly";
EXPECT_LT(cache->GetPinnedUsage(),
1 * kSizeDummyEntry + kMetaDataChargeOverhead)
<< "Failed to decrease underlying dummy entries in cache correctly";
}
TEST_F(CacheReservationManagerTest,
DecreaseCacheReservationNotByMultiplesOfDummyEntrySize) {
std::size_t new_mem_used = 2 * kSizeDummyEntry;
Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
ASSERT_EQ(s, Status::OK());
ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
2 * kSizeDummyEntry);
ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used);
ASSERT_GE(cache->GetPinnedUsage(), 2 * kSizeDummyEntry);
ASSERT_LT(cache->GetPinnedUsage(),
2 * kSizeDummyEntry + kMetaDataChargeOverhead);
new_mem_used = kSizeDummyEntry / 2;
s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
EXPECT_EQ(s, Status::OK())
<< "Failed to decrease cache reservation correctly";
EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
1 * kSizeDummyEntry)
<< "Failed to bookkeep cache reservation decrease correctly";
EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
<< "Failed to bookkeep the used memory correctly";
EXPECT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry)
<< "Failed to decrease underlying dummy entries in cache correctly";
EXPECT_LT(cache->GetPinnedUsage(),
1 * kSizeDummyEntry + kMetaDataChargeOverhead)
<< "Failed to decrease underlying dummy entries in cache correctly";
}
TEST(CacheReservationManagerWithDelayedDecreaseTest,
DecreaseCacheReservationWithDelayedDecrease) {
constexpr std::size_t kSizeDummyEntry =
CacheReservationManagerImpl<CacheEntryRole::kMisc>::GetDummyEntrySize();
constexpr std::size_t kCacheCapacity = 4096 * kSizeDummyEntry;
constexpr std::size_t kMetaDataChargeOverhead = 10000;
LRUCacheOptions lo;
lo.capacity = kCacheCapacity;
lo.num_shard_bits = 0;
std::shared_ptr<Cache> cache = NewLRUCache(lo);
std::shared_ptr<CacheReservationManager> test_cache_rev_mng =
std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
cache, true /* delayed_decrease */);
std::size_t new_mem_used = 8 * kSizeDummyEntry;
Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
ASSERT_EQ(s, Status::OK());
ASSERT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
8 * kSizeDummyEntry);
ASSERT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used);
std::size_t initial_pinned_usage = cache->GetPinnedUsage();
ASSERT_GE(initial_pinned_usage, 8 * kSizeDummyEntry);
ASSERT_LT(initial_pinned_usage,
8 * kSizeDummyEntry + kMetaDataChargeOverhead);
new_mem_used = 6 * kSizeDummyEntry;
s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
EXPECT_EQ(s, Status::OK()) << "Failed to delay decreasing cache reservation";
EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
8 * kSizeDummyEntry)
<< "Failed to bookkeep correctly when delaying cache reservation "
"decrease";
EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
<< "Failed to bookkeep the used memory correctly";
EXPECT_EQ(cache->GetPinnedUsage(), initial_pinned_usage)
<< "Failed to delay decreasing underlying dummy entries in cache";
new_mem_used = 7 * kSizeDummyEntry;
s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
EXPECT_EQ(s, Status::OK()) << "Failed to delay decreasing cache reservation";
EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
8 * kSizeDummyEntry)
<< "Failed to bookkeep correctly when delaying cache reservation "
"decrease";
EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
<< "Failed to bookkeep the used memory correctly";
EXPECT_EQ(cache->GetPinnedUsage(), initial_pinned_usage)
<< "Failed to delay decreasing underlying dummy entries in cache";
new_mem_used = 6 * kSizeDummyEntry - 1;
s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
EXPECT_EQ(s, Status::OK())
<< "Failed to decrease cache reservation correctly when new_mem_used < "
"GetTotalReservedCacheSize() * 3 / 4 on delayed decrease mode";
EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(),
6 * kSizeDummyEntry)
<< "Failed to bookkeep correctly when new_mem_used < "
"GetTotalReservedCacheSize() * 3 / 4 on delayed decrease mode";
EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), new_mem_used)
<< "Failed to bookkeep the used memory correctly";
EXPECT_GE(cache->GetPinnedUsage(), 6 * kSizeDummyEntry)
<< "Failed to decrease underlying dummy entries in cache when "
"new_mem_used < GetTotalReservedCacheSize() * 3 / 4 on delayed "
"decrease mode";
EXPECT_LT(cache->GetPinnedUsage(),
6 * kSizeDummyEntry + kMetaDataChargeOverhead)
<< "Failed to decrease underlying dummy entries in cache when "
"new_mem_used < GetTotalReservedCacheSize() * 3 / 4 on delayed "
"decrease mode";
}
TEST(CacheReservationManagerDestructorTest,
ReleaseRemainingDummyEntriesOnDestruction) {
constexpr std::size_t kSizeDummyEntry =
CacheReservationManagerImpl<CacheEntryRole::kMisc>::GetDummyEntrySize();
constexpr std::size_t kCacheCapacity = 4096 * kSizeDummyEntry;
constexpr std::size_t kMetaDataChargeOverhead = 10000;
LRUCacheOptions lo;
lo.capacity = kCacheCapacity;
lo.num_shard_bits = 0;
std::shared_ptr<Cache> cache = NewLRUCache(lo);
{
std::shared_ptr<CacheReservationManager> test_cache_rev_mng =
std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
cache);
std::size_t new_mem_used = 1 * kSizeDummyEntry;
Status s = test_cache_rev_mng->UpdateCacheReservation(new_mem_used);
ASSERT_EQ(s, Status::OK());
ASSERT_GE(cache->GetPinnedUsage(), 1 * kSizeDummyEntry);
ASSERT_LT(cache->GetPinnedUsage(),
1 * kSizeDummyEntry + kMetaDataChargeOverhead);
}
EXPECT_EQ(cache->GetPinnedUsage(), 0 * kSizeDummyEntry)
<< "Failed to release remaining underlying dummy entries in cache in "
"CacheReservationManager's destructor";
}
TEST(CacheReservationHandleTest, HandleTest) {
constexpr std::size_t kOneGigabyte = 1024 * 1024 * 1024;
constexpr std::size_t kSizeDummyEntry = 256 * 1024;
constexpr std::size_t kMetaDataChargeOverhead = 10000;
LRUCacheOptions lo;
lo.capacity = kOneGigabyte;
lo.num_shard_bits = 0;
std::shared_ptr<Cache> cache = NewLRUCache(lo);
std::shared_ptr<CacheReservationManager> test_cache_rev_mng(
std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
cache));
std::size_t mem_used = 0;
const std::size_t incremental_mem_used_handle_1 = 1 * kSizeDummyEntry;
const std::size_t incremental_mem_used_handle_2 = 2 * kSizeDummyEntry;
std::unique_ptr<CacheReservationManager::CacheReservationHandle> handle_1,
handle_2;
// To test consecutive CacheReservationManager::MakeCacheReservation works
// correctly in terms of returning the handle as well as updating cache
// reservation and the latest total memory used
Status s = test_cache_rev_mng->MakeCacheReservation(
incremental_mem_used_handle_1, &handle_1);
mem_used = mem_used + incremental_mem_used_handle_1;
ASSERT_EQ(s, Status::OK());
EXPECT_TRUE(handle_1 != nullptr);
EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), mem_used);
EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), mem_used);
EXPECT_GE(cache->GetPinnedUsage(), mem_used);
EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead);
s = test_cache_rev_mng->MakeCacheReservation(incremental_mem_used_handle_2,
&handle_2);
mem_used = mem_used + incremental_mem_used_handle_2;
ASSERT_EQ(s, Status::OK());
EXPECT_TRUE(handle_2 != nullptr);
EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), mem_used);
EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), mem_used);
EXPECT_GE(cache->GetPinnedUsage(), mem_used);
EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead);
// To test
// CacheReservationManager::CacheReservationHandle::~CacheReservationHandle()
// works correctly in releasing the cache reserved for the handle
handle_1.reset();
EXPECT_TRUE(handle_1 == nullptr);
mem_used = mem_used - incremental_mem_used_handle_1;
EXPECT_EQ(test_cache_rev_mng->GetTotalReservedCacheSize(), mem_used);
EXPECT_EQ(test_cache_rev_mng->GetTotalMemoryUsed(), mem_used);
EXPECT_GE(cache->GetPinnedUsage(), mem_used);
EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead);
// To test the actual CacheReservationManager object won't be deallocated
// as long as there remain handles pointing to it.
// We strongly recommend deallocating CacheReservationManager object only
// after all its handles are deallocated to keep things easy to reasonate
test_cache_rev_mng.reset();
EXPECT_GE(cache->GetPinnedUsage(), mem_used);
EXPECT_LT(cache->GetPinnedUsage(), mem_used + kMetaDataChargeOverhead);
handle_2.reset();
// The CacheReservationManager object is now deallocated since all the handles
// and its original pointer is gone
mem_used = mem_used - incremental_mem_used_handle_2;
EXPECT_EQ(mem_used, 0);
EXPECT_EQ(cache->GetPinnedUsage(), mem_used);
}
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {
ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

1061
cache/cache_test.cc vendored

File diff suppressed because it is too large Load Diff

@ -0,0 +1,109 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "cache/charged_cache.h"
#include "cache/cache_reservation_manager.h"
namespace ROCKSDB_NAMESPACE {
ChargedCache::ChargedCache(std::shared_ptr<Cache> cache,
std::shared_ptr<Cache> block_cache)
: CacheWrapper(cache),
cache_res_mgr_(std::make_shared<ConcurrentCacheReservationManager>(
std::make_shared<
CacheReservationManagerImpl<CacheEntryRole::kBlobCache>>(
block_cache))) {}
Status ChargedCache::Insert(const Slice& key, ObjectPtr obj,
const CacheItemHelper* helper, size_t charge,
Handle** handle, Priority priority) {
Status s = target_->Insert(key, obj, helper, charge, handle, priority);
if (s.ok()) {
// Insert may cause the cache entry eviction if the cache is full. So we
// directly call the reservation manager to update the total memory used
// in the cache.
assert(cache_res_mgr_);
cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
.PermitUncheckedError();
}
return s;
}
Cache::Handle* ChargedCache::Lookup(const Slice& key,
const CacheItemHelper* helper,
CreateContext* create_context,
Priority priority, Statistics* stats) {
auto handle = target_->Lookup(key, helper, create_context, priority, stats);
// Lookup may promote the KV pair from the secondary cache to the primary
// cache. So we directly call the reservation manager to update the total
// memory used in the cache.
if (helper && helper->create_cb) {
assert(cache_res_mgr_);
cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
.PermitUncheckedError();
}
return handle;
}
void ChargedCache::WaitAll(AsyncLookupHandle* async_handles, size_t count) {
target_->WaitAll(async_handles, count);
// In case of any promotions. Although some could finish by return of
// StartAsyncLookup, Wait/WaitAll will generally be used, so simpler to
// update here.
assert(cache_res_mgr_);
cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
.PermitUncheckedError();
}
bool ChargedCache::Release(Cache::Handle* handle, bool useful,
bool erase_if_last_ref) {
size_t memory_used_delta = target_->GetUsage(handle);
bool erased = target_->Release(handle, useful, erase_if_last_ref);
if (erased) {
assert(cache_res_mgr_);
cache_res_mgr_
->UpdateCacheReservation(memory_used_delta, /* increase */ false)
.PermitUncheckedError();
}
return erased;
}
bool ChargedCache::Release(Cache::Handle* handle, bool erase_if_last_ref) {
size_t memory_used_delta = target_->GetUsage(handle);
bool erased = target_->Release(handle, erase_if_last_ref);
if (erased) {
assert(cache_res_mgr_);
cache_res_mgr_
->UpdateCacheReservation(memory_used_delta, /* increase */ false)
.PermitUncheckedError();
}
return erased;
}
void ChargedCache::Erase(const Slice& key) {
target_->Erase(key);
assert(cache_res_mgr_);
cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
.PermitUncheckedError();
}
void ChargedCache::EraseUnRefEntries() {
target_->EraseUnRefEntries();
assert(cache_res_mgr_);
cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
.PermitUncheckedError();
}
void ChargedCache::SetCapacity(size_t capacity) {
target_->SetCapacity(capacity);
// SetCapacity can result in evictions when the cache capacity is decreased,
// so we would want to update the cache reservation here as well.
assert(cache_res_mgr_);
cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
.PermitUncheckedError();
}
} // namespace ROCKSDB_NAMESPACE

@ -0,0 +1,59 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include <string>
#include "port/port.h"
#include "rocksdb/advanced_cache.h"
namespace ROCKSDB_NAMESPACE {
class ConcurrentCacheReservationManager;
// A cache interface which wraps around another cache and takes care of
// reserving space in block cache towards a single global memory limit, and
// forwards all the calls to the underlying cache.
class ChargedCache : public CacheWrapper {
public:
ChargedCache(std::shared_ptr<Cache> cache,
std::shared_ptr<Cache> block_cache);
Status Insert(const Slice& key, ObjectPtr obj, const CacheItemHelper* helper,
size_t charge, Handle** handle = nullptr,
Priority priority = Priority::LOW) override;
Cache::Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
CreateContext* create_context,
Priority priority = Priority::LOW,
Statistics* stats = nullptr) override;
void WaitAll(AsyncLookupHandle* async_handles, size_t count) override;
bool Release(Cache::Handle* handle, bool useful,
bool erase_if_last_ref = false) override;
bool Release(Cache::Handle* handle, bool erase_if_last_ref = false) override;
void Erase(const Slice& key) override;
void EraseUnRefEntries() override;
static const char* kClassName() { return "ChargedCache"; }
const char* Name() const override { return kClassName(); }
void SetCapacity(size_t capacity) override;
inline Cache* GetCache() const { return target_.get(); }
inline ConcurrentCacheReservationManager* TEST_GetCacheReservationManager()
const {
return cache_res_mgr_.get();
}
private:
std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr_;
};
} // namespace ROCKSDB_NAMESPACE

1557
cache/clock_cache.cc vendored

File diff suppressed because it is too large Load Diff

756
cache/clock_cache.h vendored

@ -0,0 +1,756 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include "cache/cache_key.h"
#include "cache/sharded_cache.h"
#include "port/lang.h"
#include "port/malloc.h"
#include "port/port.h"
#include "rocksdb/cache.h"
#include "rocksdb/secondary_cache.h"
#include "util/autovector.h"
#include "util/math.h"
namespace ROCKSDB_NAMESPACE {
namespace clock_cache {
// Forward declaration of friend class.
class ClockCacheTest;
// HyperClockCache is an alternative to LRUCache specifically tailored for
// use as BlockBasedTableOptions::block_cache
//
// Benefits
// --------
// * Fully lock free (no waits or spins) for efficiency under high concurrency
// * Optimized for hot path reads. For concurrency control, most Lookup() and
// essentially all Release() are a single atomic add operation.
// * Eviction on insertion is fully parallel and lock-free.
// * Uses a generalized + aging variant of CLOCK eviction that might outperform
// LRU in some cases. (For background, see
// https://en.wikipedia.org/wiki/Page_replacement_algorithm)
//
// Costs
// -----
// * Hash table is not resizable (for lock-free efficiency) so capacity is not
// dynamically changeable. Rely on an estimated average value (block) size for
// space+time efficiency. (See estimated_entry_charge option details.)
// * Insert usually does not (but might) overwrite a previous entry associated
// with a cache key. This is OK for RocksDB uses of Cache.
// * Only supports keys of exactly 16 bytes, which is what RocksDB uses for
// block cache (not row cache or table cache).
// * SecondaryCache is not supported.
// * Cache priorities are less aggressively enforced. Unlike LRUCache, enough
// transient LOW or BOTTOM priority items can evict HIGH priority entries that
// are not referenced recently (or often) enough.
// * If pinned entries leave little or nothing eligible for eviction,
// performance can degrade substantially, because of clock eviction eating
// CPU looking for evictable entries and because Release does not
// pro-actively delete unreferenced entries when the cache is over-full.
// Specifically, this makes this implementation more susceptible to the
// following combination:
// * num_shard_bits is high (e.g. 6)
// * capacity small (e.g. some MBs)
// * some large individual entries (e.g. non-partitioned filters)
// where individual entries occupy a large portion of their shard capacity.
// This should be mostly mitigated by the implementation picking a lower
// number of cache shards than LRUCache for a given capacity (when
// num_shard_bits is not overridden; see calls to GetDefaultCacheShardBits()).
// * With strict_capacity_limit=false, respecting the capacity limit is not as
// aggressive as LRUCache. The limit might be transiently exceeded by a very
// small number of entries even when not strictly necessary, and slower to
// recover after pinning forces limit to be substantially exceeded. (Even with
// strict_capacity_limit=true, RocksDB will nevertheless transiently allocate
// memory before discovering it is over the block cache capacity, so this
// should not be a detectable regression in respecting memory limits, except
// on exceptionally small caches.)
// * In some cases, erased or duplicated entries might not be freed
// immediately. They will eventually be freed by eviction from further Inserts.
// * Internal metadata can overflow if the number of simultaneous references
// to a cache handle reaches many millions.
//
// High-level eviction algorithm
// -----------------------------
// A score (or "countdown") is maintained for each entry, initially determined
// by priority. The score is incremented on each Lookup, up to a max of 3,
// though is easily returned to previous state if useful=false with Release.
// During CLOCK-style eviction iteration, entries with score > 0 are
// decremented if currently unreferenced and entries with score == 0 are
// evicted if currently unreferenced. Note that scoring might not be perfect
// because entries can be referenced transiently within the cache even when
// there are no outside references to the entry.
//
// Cache sharding like LRUCache is used to reduce contention on usage+eviction
// state, though here the performance improvement from more shards is small,
// and (as noted above) potentially detrimental if shard capacity is too close
// to largest entry size. Here cache sharding mostly only affects cache update
// (Insert / Erase) performance, not read performance.
//
// Read efficiency (hot path)
// --------------------------
// Mostly to minimize the cost of accessing metadata blocks with
// cache_index_and_filter_blocks=true, we focus on optimizing Lookup and
// Release. In terms of concurrency, at a minimum, these operations have
// to do reference counting (and Lookup has to compare full keys in a safe
// way). Can we fold in all the other metadata tracking *for free* with
// Lookup and Release doing a simple atomic fetch_add/fetch_sub? (Assume
// for the moment that Lookup succeeds on the first probe.)
//
// We have a clever way of encoding an entry's reference count and countdown
// clock so that Lookup and Release are each usually a single atomic addition.
// In a single metadata word we have both an "acquire" count, incremented by
// Lookup, and a "release" count, incremented by Release. If useful=false,
// Release can instead decrement the acquire count. Thus the current ref
// count is (acquires - releases), and the countdown clock is min(3, acquires).
// Note that only unreferenced entries (acquires == releases) are eligible
// for CLOCK manipulation and eviction. We tolerate use of more expensive
// compare_exchange operations for cache writes (insertions and erasures).
//
// In a cache receiving many reads and little or no writes, it is possible
// for the acquire and release counters to overflow. Assuming the *current*
// refcount never reaches to many millions, we only have to correct for
// overflow in both counters in Release, not in Lookup. The overflow check
// should be only 1-2 CPU cycles per Release because it is a predictable
// branch on a simple condition on data already in registers.
//
// Slot states
// -----------
// We encode a state indicator into the same metadata word with the
// acquire and release counters. This allows bigger state transitions to
// be atomic. States:
//
// * Empty - slot is not in use and unowned. All other metadata and data is
// in an undefined state.
// * Construction - slot is exclusively owned by one thread, the thread
// successfully entering this state, for populating or freeing data.
// * Shareable (group) - slot holds an entry with counted references for
// pinning and reading, including
// * Visible - slot holds an entry that can be returned by Lookup
// * Invisible - slot holds an entry that is not visible to Lookup
// (erased by user) but can be read by existing references, and ref count
// changed by Ref and Release.
//
// A special case is "standalone" entries, which are heap-allocated handles
// not in the table. They are always Invisible and freed on zero refs.
//
// State transitions:
// Empty -> Construction (in Insert): The encoding of state enables Insert to
// perform an optimistic atomic bitwise-or to take ownership if a slot is
// empty, or otherwise make no state change.
//
// Construction -> Visible (in Insert): This can be a simple assignment to the
// metadata word because the current thread has exclusive ownership and other
// metadata is meaningless.
//
// Visible -> Invisible (in Erase): This can be a bitwise-and while holding
// a shared reference, which is safe because the change is idempotent (in case
// of parallel Erase). By the way, we never go Invisible->Visible.
//
// Shareable -> Construction (in Evict part of Insert, in Erase, and in
// Release if Invisible): This is for starting to freeing/deleting an
// unreferenced entry. We have to use compare_exchange to ensure we only make
// this transition when there are zero refs.
//
// Construction -> Empty (in same places): This is for completing free/delete
// of an entry. A "release" atomic store suffices, as we have exclusive
// ownership of the slot but have to ensure none of the data member reads are
// re-ordered after committing the state transition.
//
// Insert
// ------
// If Insert were to guarantee replacing an existing entry for a key, there
// would be complications for concurrency and efficiency. First, consider how
// many probes to get to an entry. To ensure Lookup never waits and
// availability of a key is uninterrupted, we would need to use a different
// slot for a new entry for the same key. This means it is most likely in a
// later probing position than the old version, which should soon be removed.
// (Also, an entry is too big to replace atomically, even if no current refs.)
//
// However, overwrite capability is not really needed by RocksDB. Also, we
// know from our "redundant" stats that overwrites are very rare for the block
// cache, so we should not spend much to make them effective.
//
// So instead we Insert as soon as we find an empty slot in the probing
// sequence without seeing an existing (visible) entry for the same key. This
// way we only insert if we can improve the probing performance, and we don't
// need to probe beyond our insert position, assuming we are willing to let
// the previous entry for the same key die of old age (eventual eviction from
// not being used). We can reach a similar state with concurrent insertions,
// where one will pass over the other while it is "under construction."
// This temporary duplication is acceptable for RocksDB block cache because
// we know redundant insertion is rare.
//
// Another problem to solve is what to return to the caller when we find an
// existing entry whose probing position we cannot improve on, or when the
// table occupancy limit has been reached. If strict_capacity_limit=false,
// we must never fail Insert, and if a Handle* is provided, we have to return
// a usable Cache handle on success. The solution to this (typically rare)
// problem is "standalone" handles, which are usable by the caller but not
// actually available for Lookup in the Cache. Standalone handles are allocated
// independently on the heap and specially marked so that they are freed on
// the heap when their last reference is released.
//
// Usage on capacity
// -----------------
// Insert takes different approaches to usage tracking depending on
// strict_capacity_limit setting. If true, we enforce a kind of strong
// consistency where compare-exchange is used to ensure the usage number never
// exceeds its limit, and provide threads with an authoritative signal on how
// much "usage" they have taken ownership of. With strict_capacity_limit=false,
// we use a kind of "eventual consistency" where all threads Inserting to the
// same cache shard might race on reserving the same space, but the
// over-commitment will be worked out in later insertions. It is kind of a
// dance because we don't want threads racing each other too much on paying
// down the over-commitment (with eviction) either.
//
// Eviction
// --------
// A key part of Insert is evicting some entries currently unreferenced to
// make room for new entries. The high-level eviction algorithm is described
// above, but the details are also interesting. A key part is parallelizing
// eviction with a single CLOCK pointer. This works by each thread working on
// eviction pre-emptively incrementing the CLOCK pointer, and then CLOCK-
// updating or evicting the incremented-over slot(s). To reduce contention at
// the cost of possibly evicting too much, each thread increments the clock
// pointer by 4, so commits to updating at least 4 slots per batch. As
// described above, a CLOCK update will decrement the "countdown" of
// unreferenced entries, or evict unreferenced entries with zero countdown.
// Referenced entries are not updated, because we (presumably) don't want
// long-referenced entries to age while referenced. Note however that we
// cannot distinguish transiently referenced entries from cache user
// references, so some CLOCK updates might be somewhat arbitrarily skipped.
// This is OK as long as it is rare enough that eviction order is still
// pretty good.
//
// There is no synchronization on the completion of the CLOCK updates, so it
// is theoretically possible for another thread to cycle back around and have
// two threads racing on CLOCK updates to the same slot. Thus, we cannot rely
// on any implied exclusivity to make the updates or eviction more efficient.
// These updates use an opportunistic compare-exchange (no loop), where a
// racing thread might cause the update to be skipped without retry, but in
// such case the update is likely not needed because the most likely update
// to an entry is that it has become referenced. (TODO: test efficiency of
// avoiding compare-exchange loop)
//
// Release
// -------
// In the common case, Release is a simple atomic increment of the release
// counter. There is a simple overflow check that only does another atomic
// update in extremely rare cases, so costs almost nothing.
//
// If the Release specifies "not useful", we can instead decrement the
// acquire counter, which returns to the same CLOCK state as before Lookup
// or Ref.
//
// Adding a check for over-full cache on every release to zero-refs would
// likely be somewhat expensive, increasing read contention on cache shard
// metadata. Instead we are less aggressive about deleting entries right
// away in those cases.
//
// However Release tries to immediately delete entries reaching zero refs
// if (a) erase_if_last_ref is set by the caller, or (b) the entry is already
// marked invisible. Both of these are checks on values already in CPU
// registers so do not increase cross-CPU contention when not applicable.
// When applicable, they use a compare-exchange loop to take exclusive
// ownership of the slot for freeing the entry. These are rare cases
// that should not usually affect performance.
//
// Erase
// -----
// Searches for an entry like Lookup but moves it to Invisible state if found.
// This state transition is with bit operations so is idempotent and safely
// done while only holding a shared "read" reference. Like Release, it makes
// a best effort to immediately release an Invisible entry that reaches zero
// refs, but there are some corner cases where it will only be freed by the
// clock eviction process.
// ----------------------------------------------------------------------- //
// The load factor p is a real number in (0, 1) such that at all
// times at most a fraction p of all slots, without counting tombstones,
// are occupied by elements. This means that the probability that a random
// probe hits an occupied slot is at most p, and thus at most 1/p probes
// are required on average. For example, p = 70% implies that between 1 and 2
// probes are needed on average (bear in mind that this reasoning doesn't
// consider the effects of clustering over time, which should be negligible
// with double hashing).
// Because the size of the hash table is always rounded up to the next
// power of 2, p is really an upper bound on the actual load factor---the
// actual load factor is anywhere between p/2 and p. This is a bit wasteful,
// but bear in mind that slots only hold metadata, not actual values.
// Since space cost is dominated by the values (the LSM blocks),
// overprovisioning the table with metadata only increases the total cache space
// usage by a tiny fraction.
constexpr double kLoadFactor = 0.7;
// The user can exceed kLoadFactor if the sizes of the inserted values don't
// match estimated_value_size, or in some rare cases with
// strict_capacity_limit == false. To avoid degenerate performance, we set a
// strict upper bound on the load factor.
constexpr double kStrictLoadFactor = 0.84;
struct ClockHandleBasicData {
Cache::ObjectPtr value = nullptr;
const Cache::CacheItemHelper* helper = nullptr;
// A lossless, reversible hash of the fixed-size (16 byte) cache key. This
// eliminates the need to store a hash separately.
UniqueId64x2 hashed_key = kNullUniqueId64x2;
size_t total_charge = 0;
inline size_t GetTotalCharge() const { return total_charge; }
// Calls deleter (if non-null) on cache key and value
void FreeData(MemoryAllocator* allocator) const;
// Required by concept HandleImpl
const UniqueId64x2& GetHash() const { return hashed_key; }
};
struct ClockHandle : public ClockHandleBasicData {
// Constants for handling the atomic `meta` word, which tracks most of the
// state of the handle. The meta word looks like this:
// low bits high bits
// -----------------------------------------------------------------------
// | acquire counter | release counter | state marker |
// -----------------------------------------------------------------------
// For reading or updating counters in meta word.
static constexpr uint8_t kCounterNumBits = 30;
static constexpr uint64_t kCounterMask = (uint64_t{1} << kCounterNumBits) - 1;
static constexpr uint8_t kAcquireCounterShift = 0;
static constexpr uint64_t kAcquireIncrement = uint64_t{1}
<< kAcquireCounterShift;
static constexpr uint8_t kReleaseCounterShift = kCounterNumBits;
static constexpr uint64_t kReleaseIncrement = uint64_t{1}
<< kReleaseCounterShift;
// For reading or updating the state marker in meta word
static constexpr uint8_t kStateShift = 2U * kCounterNumBits;
// Bits contribution to state marker.
// Occupied means any state other than empty
static constexpr uint8_t kStateOccupiedBit = 0b100;
// Shareable means the entry is reference counted (visible or invisible)
// (only set if also occupied)
static constexpr uint8_t kStateShareableBit = 0b010;
// Visible is only set if also shareable
static constexpr uint8_t kStateVisibleBit = 0b001;
// Complete state markers (not shifted into full word)
static constexpr uint8_t kStateEmpty = 0b000;
static constexpr uint8_t kStateConstruction = kStateOccupiedBit;
static constexpr uint8_t kStateInvisible =
kStateOccupiedBit | kStateShareableBit;
static constexpr uint8_t kStateVisible =
kStateOccupiedBit | kStateShareableBit | kStateVisibleBit;
// Constants for initializing the countdown clock. (Countdown clock is only
// in effect with zero refs, acquire counter == release counter, and in that
// case the countdown clock == both of those counters.)
static constexpr uint8_t kHighCountdown = 3;
static constexpr uint8_t kLowCountdown = 2;
static constexpr uint8_t kBottomCountdown = 1;
// During clock update, treat any countdown clock value greater than this
// value the same as this value.
static constexpr uint8_t kMaxCountdown = kHighCountdown;
// TODO: make these coundown values tuning parameters for eviction?
// See above. Mutable for read reference counting.
mutable std::atomic<uint64_t> meta{};
// Whether this is a "deteched" handle that is independently allocated
// with `new` (so must be deleted with `delete`).
// TODO: ideally this would be packed into some other data field, such
// as upper bits of total_charge, but that incurs a measurable performance
// regression.
bool standalone = false;
inline bool IsStandalone() const { return standalone; }
inline void SetStandalone() { standalone = true; }
}; // struct ClockHandle
class BaseClockTable {
public:
BaseClockTable(CacheMetadataChargePolicy metadata_charge_policy,
MemoryAllocator* allocator,
const Cache::EvictionCallback* eviction_callback,
const uint32_t* hash_seed)
: metadata_charge_policy_(metadata_charge_policy),
allocator_(allocator),
eviction_callback_(*eviction_callback),
hash_seed_(*hash_seed) {}
template <class Table>
typename Table::HandleImpl* CreateStandalone(ClockHandleBasicData& proto,
size_t capacity,
bool strict_capacity_limit,
bool allow_uncharged);
template <class Table>
Status Insert(const ClockHandleBasicData& proto,
typename Table::HandleImpl** handle, Cache::Priority priority,
size_t capacity, bool strict_capacity_limit);
void Ref(ClockHandle& handle);
size_t GetOccupancy() const {
return occupancy_.load(std::memory_order_relaxed);
}
size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); }
size_t GetStandaloneUsage() const {
return standalone_usage_.load(std::memory_order_relaxed);
}
uint32_t GetHashSeed() const { return hash_seed_; }
struct EvictionData {
size_t freed_charge = 0;
size_t freed_count = 0;
};
void TrackAndReleaseEvictedEntry(ClockHandle* h, EvictionData* data);
#ifndef NDEBUG
// Acquire N references
void TEST_RefN(ClockHandle& handle, size_t n);
// Helper for TEST_ReleaseN
void TEST_ReleaseNMinus1(ClockHandle* handle, size_t n);
#endif
private: // fns
// Creates a "standalone" handle for returning from an Insert operation that
// cannot be completed by actually inserting into the table.
// Updates `standalone_usage_` but not `usage_` nor `occupancy_`.
template <class HandleImpl>
HandleImpl* StandaloneInsert(const ClockHandleBasicData& proto);
// Helper for updating `usage_` for new entry with given `total_charge`
// and evicting if needed under strict_capacity_limit=true rules. This
// means the operation might fail with Status::MemoryLimit. If
// `need_evict_for_occupancy`, then eviction of at least one entry is
// required, and the operation should fail if not possible.
// NOTE: Otherwise, occupancy_ is not managed in this function
template <class Table>
Status ChargeUsageMaybeEvictStrict(size_t total_charge, size_t capacity,
bool need_evict_for_occupancy,
typename Table::InsertState& state);
// Helper for updating `usage_` for new entry with given `total_charge`
// and evicting if needed under strict_capacity_limit=false rules. This
// means that updating `usage_` always succeeds even if forced to exceed
// capacity. If `need_evict_for_occupancy`, then eviction of at least one
// entry is required, and the operation should return false if such eviction
// is not possible. `usage_` is not updated in that case. Otherwise, returns
// true, indicating success.
// NOTE: occupancy_ is not managed in this function
template <class Table>
bool ChargeUsageMaybeEvictNonStrict(size_t total_charge, size_t capacity,
bool need_evict_for_occupancy,
typename Table::InsertState& state);
protected: // data
// We partition the following members into different cache lines
// to avoid false sharing among Lookup, Release, Erase and Insert
// operations in ClockCacheShard.
// Clock algorithm sweep pointer.
std::atomic<uint64_t> clock_pointer_{};
ALIGN_AS(CACHE_LINE_SIZE)
// Number of elements in the table.
std::atomic<size_t> occupancy_{};
// Memory usage by entries tracked by the cache (including standalone)
std::atomic<size_t> usage_{};
// Part of usage by standalone entries (not in table)
std::atomic<size_t> standalone_usage_{};
ALIGN_AS(CACHE_LINE_SIZE)
const CacheMetadataChargePolicy metadata_charge_policy_;
// From Cache, for deleter
MemoryAllocator* const allocator_;
// A reference to Cache::eviction_callback_
const Cache::EvictionCallback& eviction_callback_;
// A reference to ShardedCacheBase::hash_seed_
const uint32_t& hash_seed_;
};
class HyperClockTable : public BaseClockTable {
public:
// Target size to be exactly a common cache line size (see static_assert in
// clock_cache.cc)
struct ALIGN_AS(64U) HandleImpl : public ClockHandle {
// The number of elements that hash to this slot or a lower one, but wind
// up in this slot or a higher one.
std::atomic<uint32_t> displacements{};
}; // struct HandleImpl
struct Opts {
size_t estimated_value_size;
};
HyperClockTable(size_t capacity, bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy,
MemoryAllocator* allocator,
const Cache::EvictionCallback* eviction_callback,
const uint32_t* hash_seed, const Opts& opts);
~HyperClockTable();
// For BaseClockTable::Insert
struct InsertState {};
void StartInsert(InsertState& state);
// Returns true iff there is room for the proposed number of entries.
bool GrowIfNeeded(size_t new_occupancy, InsertState& state);
HandleImpl* DoInsert(const ClockHandleBasicData& proto,
uint64_t initial_countdown, bool take_ref,
InsertState& state);
// Runs the clock eviction algorithm trying to reclaim at least
// requested_charge. Returns how much is evicted, which could be less
// if it appears impossible to evict the requested amount without blocking.
void Evict(size_t requested_charge, InsertState& state, EvictionData* data);
HandleImpl* Lookup(const UniqueId64x2& hashed_key);
bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref);
void Erase(const UniqueId64x2& hashed_key);
void EraseUnRefEntries();
size_t GetTableSize() const { return size_t{1} << length_bits_; }
size_t GetOccupancyLimit() const { return occupancy_limit_; }
const HandleImpl* HandlePtr(size_t idx) const { return &array_[idx]; }
#ifndef NDEBUG
size_t& TEST_MutableOccupancyLimit() const {
return const_cast<size_t&>(occupancy_limit_);
}
// Release N references
void TEST_ReleaseN(HandleImpl* handle, size_t n);
#endif
private: // functions
// Returns x mod 2^{length_bits_}.
inline size_t ModTableSize(uint64_t x) {
return BitwiseAnd(x, length_bits_mask_);
}
// Returns the first slot in the probe sequence with a handle e such that
// match_fn(e) is true. At every step, the function first tests whether
// match_fn(e) holds. If this is false, it evaluates abort_fn(e) to decide
// whether the search should be aborted, and if so, FindSlot immediately
// returns nullptr. For every handle e that is not a match and not aborted,
// FindSlot runs update_fn(e, is_last) where is_last is set to true iff that
// slot will be the last probed because the next would cycle back to the first
// slot probed. This function uses templates instead of std::function to
// minimize the risk of heap-allocated closures being created.
template <typename MatchFn, typename AbortFn, typename UpdateFn>
inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key,
const MatchFn& match_fn, const AbortFn& abort_fn,
const UpdateFn& update_fn);
// Re-decrement all displacements in probe path starting from beginning
// until (not including) the given handle
inline void Rollback(const UniqueId64x2& hashed_key, const HandleImpl* h);
// Subtracts `total_charge` from `usage_` and 1 from `occupancy_`.
// Ideally this comes after releasing the entry itself so that we
// actually have the available occupancy/usage that is claimed.
// However, that means total_charge has to be saved from the handle
// before releasing it so that it can be provided to this function.
inline void ReclaimEntryUsage(size_t total_charge);
MemoryAllocator* GetAllocator() const { return allocator_; }
// Returns the number of bits used to hash an element in the hash
// table.
static int CalcHashBits(size_t capacity, size_t estimated_value_size,
CacheMetadataChargePolicy metadata_charge_policy);
private: // data
// Number of hash bits used for table index.
// The size of the table is 1 << length_bits_.
const int length_bits_;
// For faster computation of ModTableSize.
const size_t length_bits_mask_;
// Maximum number of elements the user can store in the table.
const size_t occupancy_limit_;
// Array of slots comprising the hash table.
const std::unique_ptr<HandleImpl[]> array_;
}; // class HyperClockTable
// A single shard of sharded cache.
template <class Table>
class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
public:
ClockCacheShard(size_t capacity, bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy,
MemoryAllocator* allocator,
const Cache::EvictionCallback* eviction_callback,
const uint32_t* hash_seed, const typename Table::Opts& opts);
// For CacheShard concept
using HandleImpl = typename Table::HandleImpl;
// Hash is lossless hash of 128-bit key
using HashVal = UniqueId64x2;
using HashCref = const HashVal&;
static inline uint32_t HashPieceForSharding(HashCref hash) {
return Upper32of64(hash[0]);
}
static inline HashVal ComputeHash(const Slice& key, uint32_t seed) {
assert(key.size() == kCacheKeySize);
HashVal in;
HashVal out;
// NOTE: endian dependence
// TODO: use GetUnaligned?
std::memcpy(&in, key.data(), kCacheKeySize);
BijectiveHash2x64(in[1], in[0] ^ seed, &out[1], &out[0]);
return out;
}
// For reconstructing key from hashed_key. Requires the caller to provide
// backing storage for the Slice in `unhashed`
static inline Slice ReverseHash(const UniqueId64x2& hashed,
UniqueId64x2* unhashed, uint32_t seed) {
BijectiveUnhash2x64(hashed[1], hashed[0], &(*unhashed)[1], &(*unhashed)[0]);
(*unhashed)[0] ^= seed;
// NOTE: endian dependence
return Slice(reinterpret_cast<const char*>(unhashed), kCacheKeySize);
}
// Although capacity is dynamically changeable, the number of table slots is
// not, so growing capacity substantially could lead to hitting occupancy
// limit.
void SetCapacity(size_t capacity);
void SetStrictCapacityLimit(bool strict_capacity_limit);
Status Insert(const Slice& key, const UniqueId64x2& hashed_key,
Cache::ObjectPtr value, const Cache::CacheItemHelper* helper,
size_t charge, HandleImpl** handle, Cache::Priority priority);
HandleImpl* CreateStandalone(const Slice& key, const UniqueId64x2& hashed_key,
Cache::ObjectPtr obj,
const Cache::CacheItemHelper* helper,
size_t charge, bool allow_uncharged);
HandleImpl* Lookup(const Slice& key, const UniqueId64x2& hashed_key);
bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref);
bool Release(HandleImpl* handle, bool erase_if_last_ref = false);
bool Ref(HandleImpl* handle);
void Erase(const Slice& key, const UniqueId64x2& hashed_key);
size_t GetCapacity() const;
size_t GetUsage() const;
size_t GetStandaloneUsage() const;
size_t GetPinnedUsage() const;
size_t GetOccupancyCount() const;
size_t GetOccupancyLimit() const;
size_t GetTableAddressCount() const;
void ApplyToSomeEntries(
const std::function<void(const Slice& key, Cache::ObjectPtr obj,
size_t charge,
const Cache::CacheItemHelper* helper)>& callback,
size_t average_entries_per_lock, size_t* state);
void EraseUnRefEntries();
std::string GetPrintableOptions() const { return std::string{}; }
HandleImpl* Lookup(const Slice& key, const UniqueId64x2& hashed_key,
const Cache::CacheItemHelper* /*helper*/,
Cache::CreateContext* /*create_context*/,
Cache::Priority /*priority*/, Statistics* /*stats*/) {
return Lookup(key, hashed_key);
}
#ifndef NDEBUG
size_t& TEST_MutableOccupancyLimit() const {
return table_.TEST_MutableOccupancyLimit();
}
// Acquire/release N references
void TEST_RefN(HandleImpl* handle, size_t n);
void TEST_ReleaseN(HandleImpl* handle, size_t n);
#endif
private: // data
Table table_;
// Maximum total charge of all elements stored in the table.
std::atomic<size_t> capacity_;
// Whether to reject insertion if cache reaches its full capacity.
std::atomic<bool> strict_capacity_limit_;
}; // class ClockCacheShard
class HyperClockCache
#ifdef NDEBUG
final
#endif
: public ShardedCache<ClockCacheShard<HyperClockTable>> {
public:
using Shard = ClockCacheShard<HyperClockTable>;
explicit HyperClockCache(const HyperClockCacheOptions& opts);
const char* Name() const override { return "HyperClockCache"; }
Cache::ObjectPtr Value(Handle* handle) override;
size_t GetCharge(Handle* handle) const override;
const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override;
void ReportProblems(
const std::shared_ptr<Logger>& /*info_log*/) const override;
}; // class HyperClockCache
} // namespace clock_cache
} // namespace ROCKSDB_NAMESPACE

@ -0,0 +1,318 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "cache/compressed_secondary_cache.h"
#include <algorithm>
#include <cstdint>
#include <memory>
#include "memory/memory_allocator_impl.h"
#include "monitoring/perf_context_imp.h"
#include "util/compression.h"
#include "util/string_util.h"
namespace ROCKSDB_NAMESPACE {
CompressedSecondaryCache::CompressedSecondaryCache(
const CompressedSecondaryCacheOptions& opts)
: cache_(opts.LRUCacheOptions::MakeSharedCache()),
cache_options_(opts),
cache_res_mgr_(std::make_shared<ConcurrentCacheReservationManager>(
std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
cache_))) {}
CompressedSecondaryCache::~CompressedSecondaryCache() {
assert(cache_res_mgr_->GetTotalReservedCacheSize() == 0);
}
std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
const Slice& key, const Cache::CacheItemHelper* helper,
Cache::CreateContext* create_context, bool /*wait*/, bool advise_erase,
bool& kept_in_sec_cache) {
assert(helper);
std::unique_ptr<SecondaryCacheResultHandle> handle;
kept_in_sec_cache = false;
Cache::Handle* lru_handle = cache_->Lookup(key);
if (lru_handle == nullptr) {
return nullptr;
}
void* handle_value = cache_->Value(lru_handle);
if (handle_value == nullptr) {
cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
return nullptr;
}
CacheAllocationPtr* ptr{nullptr};
CacheAllocationPtr merged_value;
size_t handle_value_charge{0};
if (cache_options_.enable_custom_split_merge) {
CacheValueChunk* value_chunk_ptr =
reinterpret_cast<CacheValueChunk*>(handle_value);
merged_value = MergeChunksIntoValue(value_chunk_ptr, handle_value_charge);
ptr = &merged_value;
} else {
ptr = reinterpret_cast<CacheAllocationPtr*>(handle_value);
handle_value_charge = cache_->GetCharge(lru_handle);
}
MemoryAllocator* allocator = cache_options_.memory_allocator.get();
Status s;
Cache::ObjectPtr value{nullptr};
size_t charge{0};
if (cache_options_.compression_type == kNoCompression ||
cache_options_.do_not_compress_roles.Contains(helper->role)) {
s = helper->create_cb(Slice(ptr->get(), handle_value_charge),
create_context, allocator, &value, &charge);
} else {
UncompressionContext uncompression_context(cache_options_.compression_type);
UncompressionInfo uncompression_info(uncompression_context,
UncompressionDict::GetEmptyDict(),
cache_options_.compression_type);
size_t uncompressed_size{0};
CacheAllocationPtr uncompressed = UncompressData(
uncompression_info, (char*)ptr->get(), handle_value_charge,
&uncompressed_size, cache_options_.compress_format_version, allocator);
if (!uncompressed) {
cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
return nullptr;
}
s = helper->create_cb(Slice(uncompressed.get(), uncompressed_size),
create_context, allocator, &value, &charge);
}
if (!s.ok()) {
cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
return nullptr;
}
if (advise_erase) {
cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
// Insert a dummy handle.
cache_
->Insert(key, /*obj=*/nullptr,
GetHelper(cache_options_.enable_custom_split_merge),
/*charge=*/0)
.PermitUncheckedError();
} else {
kept_in_sec_cache = true;
cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
}
handle.reset(new CompressedSecondaryCacheResultHandle(value, charge));
return handle;
}
Status CompressedSecondaryCache::Insert(const Slice& key,
Cache::ObjectPtr value,
const Cache::CacheItemHelper* helper) {
if (value == nullptr) {
return Status::InvalidArgument();
}
Cache::Handle* lru_handle = cache_->Lookup(key);
auto internal_helper = GetHelper(cache_options_.enable_custom_split_merge);
if (lru_handle == nullptr) {
PERF_COUNTER_ADD(compressed_sec_cache_insert_dummy_count, 1);
// Insert a dummy handle if the handle is evicted for the first time.
return cache_->Insert(key, /*obj=*/nullptr, internal_helper,
/*charge=*/0);
} else {
cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
}
size_t size = (*helper->size_cb)(value);
CacheAllocationPtr ptr =
AllocateBlock(size, cache_options_.memory_allocator.get());
Status s = (*helper->saveto_cb)(value, 0, size, ptr.get());
if (!s.ok()) {
return s;
}
Slice val(ptr.get(), size);
std::string compressed_val;
if (cache_options_.compression_type != kNoCompression &&
!cache_options_.do_not_compress_roles.Contains(helper->role)) {
PERF_COUNTER_ADD(compressed_sec_cache_uncompressed_bytes, size);
CompressionOptions compression_opts;
CompressionContext compression_context(cache_options_.compression_type);
uint64_t sample_for_compression{0};
CompressionInfo compression_info(
compression_opts, compression_context, CompressionDict::GetEmptyDict(),
cache_options_.compression_type, sample_for_compression);
bool success =
CompressData(val, compression_info,
cache_options_.compress_format_version, &compressed_val);
if (!success) {
return Status::Corruption("Error compressing value.");
}
val = Slice(compressed_val);
size = compressed_val.size();
PERF_COUNTER_ADD(compressed_sec_cache_compressed_bytes, size);
if (!cache_options_.enable_custom_split_merge) {
ptr = AllocateBlock(size, cache_options_.memory_allocator.get());
memcpy(ptr.get(), compressed_val.data(), size);
}
}
PERF_COUNTER_ADD(compressed_sec_cache_insert_real_count, 1);
if (cache_options_.enable_custom_split_merge) {
size_t charge{0};
CacheValueChunk* value_chunks_head =
SplitValueIntoChunks(val, cache_options_.compression_type, charge);
return cache_->Insert(key, value_chunks_head, internal_helper, charge);
} else {
CacheAllocationPtr* buf = new CacheAllocationPtr(std::move(ptr));
return cache_->Insert(key, buf, internal_helper, size);
}
}
void CompressedSecondaryCache::Erase(const Slice& key) { cache_->Erase(key); }
Status CompressedSecondaryCache::SetCapacity(size_t capacity) {
MutexLock l(&capacity_mutex_);
cache_options_.capacity = capacity;
cache_->SetCapacity(capacity);
return Status::OK();
}
Status CompressedSecondaryCache::GetCapacity(size_t& capacity) {
MutexLock l(&capacity_mutex_);
capacity = cache_options_.capacity;
return Status::OK();
}
std::string CompressedSecondaryCache::GetPrintableOptions() const {
std::string ret;
ret.reserve(20000);
const int kBufferSize{200};
char buffer[kBufferSize];
ret.append(cache_->GetPrintableOptions());
snprintf(buffer, kBufferSize, " compression_type : %s\n",
CompressionTypeToString(cache_options_.compression_type).c_str());
ret.append(buffer);
snprintf(buffer, kBufferSize, " compress_format_version : %d\n",
cache_options_.compress_format_version);
ret.append(buffer);
return ret;
}
CompressedSecondaryCache::CacheValueChunk*
CompressedSecondaryCache::SplitValueIntoChunks(const Slice& value,
CompressionType compression_type,
size_t& charge) {
assert(!value.empty());
const char* src_ptr = value.data();
size_t src_size{value.size()};
CacheValueChunk dummy_head = CacheValueChunk();
CacheValueChunk* current_chunk = &dummy_head;
// Do not split when value size is large or there is no compression.
size_t predicted_chunk_size{0};
size_t actual_chunk_size{0};
size_t tmp_size{0};
while (src_size > 0) {
predicted_chunk_size = sizeof(CacheValueChunk) - 1 + src_size;
auto upper =
std::upper_bound(malloc_bin_sizes_.begin(), malloc_bin_sizes_.end(),
predicted_chunk_size);
// Do not split when value size is too small, too large, close to a bin
// size, or there is no compression.
if (upper == malloc_bin_sizes_.begin() ||
upper == malloc_bin_sizes_.end() ||
*upper - predicted_chunk_size < malloc_bin_sizes_.front() ||
compression_type == kNoCompression) {
tmp_size = predicted_chunk_size;
} else {
tmp_size = *(--upper);
}
CacheValueChunk* new_chunk =
reinterpret_cast<CacheValueChunk*>(new char[tmp_size]);
current_chunk->next = new_chunk;
current_chunk = current_chunk->next;
actual_chunk_size = tmp_size - sizeof(CacheValueChunk) + 1;
memcpy(current_chunk->data, src_ptr, actual_chunk_size);
current_chunk->size = actual_chunk_size;
src_ptr += actual_chunk_size;
src_size -= actual_chunk_size;
charge += tmp_size;
}
current_chunk->next = nullptr;
return dummy_head.next;
}
CacheAllocationPtr CompressedSecondaryCache::MergeChunksIntoValue(
const void* chunks_head, size_t& charge) {
const CacheValueChunk* head =
reinterpret_cast<const CacheValueChunk*>(chunks_head);
const CacheValueChunk* current_chunk = head;
charge = 0;
while (current_chunk != nullptr) {
charge += current_chunk->size;
current_chunk = current_chunk->next;
}
CacheAllocationPtr ptr =
AllocateBlock(charge, cache_options_.memory_allocator.get());
current_chunk = head;
size_t pos{0};
while (current_chunk != nullptr) {
memcpy(ptr.get() + pos, current_chunk->data, current_chunk->size);
pos += current_chunk->size;
current_chunk = current_chunk->next;
}
return ptr;
}
const Cache::CacheItemHelper* CompressedSecondaryCache::GetHelper(
bool enable_custom_split_merge) const {
if (enable_custom_split_merge) {
static const Cache::CacheItemHelper kHelper{
CacheEntryRole::kMisc,
[](Cache::ObjectPtr obj, MemoryAllocator* /*alloc*/) {
CacheValueChunk* chunks_head = static_cast<CacheValueChunk*>(obj);
while (chunks_head != nullptr) {
CacheValueChunk* tmp_chunk = chunks_head;
chunks_head = chunks_head->next;
tmp_chunk->Free();
obj = nullptr;
};
}};
return &kHelper;
} else {
static const Cache::CacheItemHelper kHelper{
CacheEntryRole::kMisc,
[](Cache::ObjectPtr obj, MemoryAllocator* /*alloc*/) {
delete static_cast<CacheAllocationPtr*>(obj);
obj = nullptr;
}};
return &kHelper;
}
}
std::shared_ptr<SecondaryCache>
CompressedSecondaryCacheOptions::MakeSharedSecondaryCache() const {
return std::make_shared<CompressedSecondaryCache>(*this);
}
Status CompressedSecondaryCache::Deflate(size_t decrease) {
return cache_res_mgr_->UpdateCacheReservation(decrease, /*increase=*/true);
}
Status CompressedSecondaryCache::Inflate(size_t increase) {
return cache_res_mgr_->UpdateCacheReservation(increase, /*increase=*/false);
}
} // namespace ROCKSDB_NAMESPACE

@ -0,0 +1,140 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include <array>
#include <cstddef>
#include <memory>
#include "cache/cache_reservation_manager.h"
#include "cache/lru_cache.h"
#include "memory/memory_allocator_impl.h"
#include "rocksdb/secondary_cache.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "util/compression.h"
#include "util/mutexlock.h"
namespace ROCKSDB_NAMESPACE {
class CompressedSecondaryCacheResultHandle : public SecondaryCacheResultHandle {
public:
CompressedSecondaryCacheResultHandle(Cache::ObjectPtr value, size_t size)
: value_(value), size_(size) {}
~CompressedSecondaryCacheResultHandle() override = default;
CompressedSecondaryCacheResultHandle(
const CompressedSecondaryCacheResultHandle&) = delete;
CompressedSecondaryCacheResultHandle& operator=(
const CompressedSecondaryCacheResultHandle&) = delete;
bool IsReady() override { return true; }
void Wait() override {}
Cache::ObjectPtr Value() override { return value_; }
size_t Size() override { return size_; }
private:
Cache::ObjectPtr value_;
size_t size_;
};
// The CompressedSecondaryCache is a concrete implementation of
// rocksdb::SecondaryCache.
//
// When a block is found from CompressedSecondaryCache::Lookup, we check whether
// there is a dummy block with the same key in the primary cache.
// 1. If the dummy block exits, we erase the block from
// CompressedSecondaryCache and insert it into the primary cache.
// 2. If not, we just insert a dummy block into the primary cache
// (charging the actual size of the block) and don not erase the block from
// CompressedSecondaryCache. A standalone handle is returned to the caller.
//
// When a block is evicted from the primary cache, we check whether
// there is a dummy block with the same key in CompressedSecondaryCache.
// 1. If the dummy block exits, the block is inserted into
// CompressedSecondaryCache.
// 2. If not, we just insert a dummy block (size 0) in CompressedSecondaryCache.
//
// Users can also cast a pointer to CompressedSecondaryCache and call methods on
// it directly, especially custom methods that may be added
// in the future. For example -
// std::unique_ptr<rocksdb::SecondaryCache> cache =
// NewCompressedSecondaryCache(opts);
// static_cast<CompressedSecondaryCache*>(cache.get())->Erase(key);
class CompressedSecondaryCache : public SecondaryCache {
public:
explicit CompressedSecondaryCache(
const CompressedSecondaryCacheOptions& opts);
~CompressedSecondaryCache() override;
const char* Name() const override { return "CompressedSecondaryCache"; }
Status Insert(const Slice& key, Cache::ObjectPtr value,
const Cache::CacheItemHelper* helper) override;
std::unique_ptr<SecondaryCacheResultHandle> Lookup(
const Slice& key, const Cache::CacheItemHelper* helper,
Cache::CreateContext* create_context, bool /*wait*/, bool advise_erase,
bool& kept_in_sec_cache) override;
bool SupportForceErase() const override { return true; }
void Erase(const Slice& key) override;
void WaitAll(std::vector<SecondaryCacheResultHandle*> /*handles*/) override {}
Status SetCapacity(size_t capacity) override;
Status GetCapacity(size_t& capacity) override;
Status Deflate(size_t decrease) override;
Status Inflate(size_t increase) override;
std::string GetPrintableOptions() const override;
size_t TEST_GetUsage() { return cache_->GetUsage(); }
private:
friend class CompressedSecondaryCacheTestBase;
static constexpr std::array<uint16_t, 8> malloc_bin_sizes_{
128, 256, 512, 1024, 2048, 4096, 8192, 16384};
struct CacheValueChunk {
// TODO try "CacheAllocationPtr next;".
CacheValueChunk* next;
size_t size;
// Beginning of the chunk data (MUST BE THE LAST FIELD IN THIS STRUCT!)
char data[1];
void Free() { delete[] reinterpret_cast<char*>(this); }
};
// Split value into chunks to better fit into jemalloc bins. The chunks
// are stored in CacheValueChunk and extra charge is needed for each chunk,
// so the cache charge is recalculated here.
CacheValueChunk* SplitValueIntoChunks(const Slice& value,
CompressionType compression_type,
size_t& charge);
// After merging chunks, the extra charge for each chunk is removed, so
// the charge is recalculated.
CacheAllocationPtr MergeChunksIntoValue(const void* chunks_head,
size_t& charge);
// TODO: clean up to use cleaner interfaces in typed_cache.h
const Cache::CacheItemHelper* GetHelper(bool enable_custom_split_merge) const;
std::shared_ptr<Cache> cache_;
CompressedSecondaryCacheOptions cache_options_;
mutable port::Mutex capacity_mutex_;
std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr_;
};
} // namespace ROCKSDB_NAMESPACE

File diff suppressed because it is too large Load Diff

723
cache/lru_cache.cc vendored

@ -0,0 +1,723 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "cache/lru_cache.h"
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include "cache/secondary_cache_adapter.h"
#include "monitoring/perf_context_imp.h"
#include "monitoring/statistics_impl.h"
#include "port/lang.h"
#include "util/distributed_mutex.h"
namespace ROCKSDB_NAMESPACE {
namespace lru_cache {
LRUHandleTable::LRUHandleTable(int max_upper_hash_bits,
MemoryAllocator* allocator)
: length_bits_(/* historical starting size*/ 4),
list_(new LRUHandle* [size_t{1} << length_bits_] {}),
elems_(0),
max_length_bits_(max_upper_hash_bits),
allocator_(allocator) {}
LRUHandleTable::~LRUHandleTable() {
auto alloc = allocator_;
ApplyToEntriesRange(
[alloc](LRUHandle* h) {
if (!h->HasRefs()) {
h->Free(alloc);
}
},
0, size_t{1} << length_bits_);
}
LRUHandle* LRUHandleTable::Lookup(const Slice& key, uint32_t hash) {
return *FindPointer(key, hash);
}
LRUHandle* LRUHandleTable::Insert(LRUHandle* h) {
LRUHandle** ptr = FindPointer(h->key(), h->hash);
LRUHandle* old = *ptr;
h->next_hash = (old == nullptr ? nullptr : old->next_hash);
*ptr = h;
if (old == nullptr) {
++elems_;
if ((elems_ >> length_bits_) > 0) { // elems_ >= length
// Since each cache entry is fairly large, we aim for a small
// average linked list length (<= 1).
Resize();
}
}
return old;
}
LRUHandle* LRUHandleTable::Remove(const Slice& key, uint32_t hash) {
LRUHandle** ptr = FindPointer(key, hash);
LRUHandle* result = *ptr;
if (result != nullptr) {
*ptr = result->next_hash;
--elems_;
}
return result;
}
LRUHandle** LRUHandleTable::FindPointer(const Slice& key, uint32_t hash) {
LRUHandle** ptr = &list_[hash >> (32 - length_bits_)];
while (*ptr != nullptr && ((*ptr)->hash != hash || key != (*ptr)->key())) {
ptr = &(*ptr)->next_hash;
}
return ptr;
}
void LRUHandleTable::Resize() {
if (length_bits_ >= max_length_bits_) {
// Due to reaching limit of hash information, if we made the table bigger,
// we would allocate more addresses but only the same number would be used.
return;
}
if (length_bits_ >= 31) {
// Avoid undefined behavior shifting uint32_t by 32.
return;
}
uint32_t old_length = uint32_t{1} << length_bits_;
int new_length_bits = length_bits_ + 1;
std::unique_ptr<LRUHandle* []> new_list {
new LRUHandle* [size_t{1} << new_length_bits] {}
};
[[maybe_unused]] uint32_t count = 0;
for (uint32_t i = 0; i < old_length; i++) {
LRUHandle* h = list_[i];
while (h != nullptr) {
LRUHandle* next = h->next_hash;
uint32_t hash = h->hash;
LRUHandle** ptr = &new_list[hash >> (32 - new_length_bits)];
h->next_hash = *ptr;
*ptr = h;
h = next;
count++;
}
}
assert(elems_ == count);
list_ = std::move(new_list);
length_bits_ = new_length_bits;
}
LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit,
double high_pri_pool_ratio,
double low_pri_pool_ratio, bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy,
int max_upper_hash_bits,
MemoryAllocator* allocator,
const Cache::EvictionCallback* eviction_callback)
: CacheShardBase(metadata_charge_policy),
capacity_(0),
high_pri_pool_usage_(0),
low_pri_pool_usage_(0),
strict_capacity_limit_(strict_capacity_limit),
high_pri_pool_ratio_(high_pri_pool_ratio),
high_pri_pool_capacity_(0),
low_pri_pool_ratio_(low_pri_pool_ratio),
low_pri_pool_capacity_(0),
table_(max_upper_hash_bits, allocator),
usage_(0),
lru_usage_(0),
mutex_(use_adaptive_mutex),
eviction_callback_(*eviction_callback) {
// Make empty circular linked list.
lru_.next = &lru_;
lru_.prev = &lru_;
lru_low_pri_ = &lru_;
lru_bottom_pri_ = &lru_;
SetCapacity(capacity);
}
void LRUCacheShard::EraseUnRefEntries() {
autovector<LRUHandle*> last_reference_list;
{
DMutexLock l(mutex_);
while (lru_.next != &lru_) {
LRUHandle* old = lru_.next;
// LRU list contains only elements which can be evicted.
assert(old->InCache() && !old->HasRefs());
LRU_Remove(old);
table_.Remove(old->key(), old->hash);
old->SetInCache(false);
assert(usage_ >= old->total_charge);
usage_ -= old->total_charge;
last_reference_list.push_back(old);
}
}
for (auto entry : last_reference_list) {
entry->Free(table_.GetAllocator());
}
}
void LRUCacheShard::ApplyToSomeEntries(
const std::function<void(const Slice& key, Cache::ObjectPtr value,
size_t charge,
const Cache::CacheItemHelper* helper)>& callback,
size_t average_entries_per_lock, size_t* state) {
// The state is essentially going to be the starting hash, which works
// nicely even if we resize between calls because we use upper-most
// hash bits for table indexes.
DMutexLock l(mutex_);
int length_bits = table_.GetLengthBits();
size_t length = size_t{1} << length_bits;
assert(average_entries_per_lock > 0);
// Assuming we are called with same average_entries_per_lock repeatedly,
// this simplifies some logic (index_end will not overflow).
assert(average_entries_per_lock < length || *state == 0);
size_t index_begin = *state >> (sizeof(size_t) * 8u - length_bits);
size_t index_end = index_begin + average_entries_per_lock;
if (index_end >= length) {
// Going to end
index_end = length;
*state = SIZE_MAX;
} else {
*state = index_end << (sizeof(size_t) * 8u - length_bits);
}
table_.ApplyToEntriesRange(
[callback,
metadata_charge_policy = metadata_charge_policy_](LRUHandle* h) {
callback(h->key(), h->value, h->GetCharge(metadata_charge_policy),
h->helper);
},
index_begin, index_end);
}
void LRUCacheShard::TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri,
LRUHandle** lru_bottom_pri) {
DMutexLock l(mutex_);
*lru = &lru_;
*lru_low_pri = lru_low_pri_;
*lru_bottom_pri = lru_bottom_pri_;
}
size_t LRUCacheShard::TEST_GetLRUSize() {
DMutexLock l(mutex_);
LRUHandle* lru_handle = lru_.next;
size_t lru_size = 0;
while (lru_handle != &lru_) {
lru_size++;
lru_handle = lru_handle->next;
}
return lru_size;
}
double LRUCacheShard::GetHighPriPoolRatio() {
DMutexLock l(mutex_);
return high_pri_pool_ratio_;
}
double LRUCacheShard::GetLowPriPoolRatio() {
DMutexLock l(mutex_);
return low_pri_pool_ratio_;
}
void LRUCacheShard::LRU_Remove(LRUHandle* e) {
assert(e->next != nullptr);
assert(e->prev != nullptr);
if (lru_low_pri_ == e) {
lru_low_pri_ = e->prev;
}
if (lru_bottom_pri_ == e) {
lru_bottom_pri_ = e->prev;
}
e->next->prev = e->prev;
e->prev->next = e->next;
e->prev = e->next = nullptr;
assert(lru_usage_ >= e->total_charge);
lru_usage_ -= e->total_charge;
assert(!e->InHighPriPool() || !e->InLowPriPool());
if (e->InHighPriPool()) {
assert(high_pri_pool_usage_ >= e->total_charge);
high_pri_pool_usage_ -= e->total_charge;
} else if (e->InLowPriPool()) {
assert(low_pri_pool_usage_ >= e->total_charge);
low_pri_pool_usage_ -= e->total_charge;
}
}
void LRUCacheShard::LRU_Insert(LRUHandle* e) {
assert(e->next == nullptr);
assert(e->prev == nullptr);
if (high_pri_pool_ratio_ > 0 && (e->IsHighPri() || e->HasHit())) {
// Inset "e" to head of LRU list.
e->next = &lru_;
e->prev = lru_.prev;
e->prev->next = e;
e->next->prev = e;
e->SetInHighPriPool(true);
e->SetInLowPriPool(false);
high_pri_pool_usage_ += e->total_charge;
MaintainPoolSize();
} else if (low_pri_pool_ratio_ > 0 &&
(e->IsHighPri() || e->IsLowPri() || e->HasHit())) {
// Insert "e" to the head of low-pri pool.
e->next = lru_low_pri_->next;
e->prev = lru_low_pri_;
e->prev->next = e;
e->next->prev = e;
e->SetInHighPriPool(false);
e->SetInLowPriPool(true);
low_pri_pool_usage_ += e->total_charge;
MaintainPoolSize();
lru_low_pri_ = e;
} else {
// Insert "e" to the head of bottom-pri pool.
e->next = lru_bottom_pri_->next;
e->prev = lru_bottom_pri_;
e->prev->next = e;
e->next->prev = e;
e->SetInHighPriPool(false);
e->SetInLowPriPool(false);
// if the low-pri pool is empty, lru_low_pri_ also needs to be updated.
if (lru_bottom_pri_ == lru_low_pri_) {
lru_low_pri_ = e;
}
lru_bottom_pri_ = e;
}
lru_usage_ += e->total_charge;
}
void LRUCacheShard::MaintainPoolSize() {
while (high_pri_pool_usage_ > high_pri_pool_capacity_) {
// Overflow last entry in high-pri pool to low-pri pool.
lru_low_pri_ = lru_low_pri_->next;
assert(lru_low_pri_ != &lru_);
lru_low_pri_->SetInHighPriPool(false);
lru_low_pri_->SetInLowPriPool(true);
assert(high_pri_pool_usage_ >= lru_low_pri_->total_charge);
high_pri_pool_usage_ -= lru_low_pri_->total_charge;
low_pri_pool_usage_ += lru_low_pri_->total_charge;
}
while (low_pri_pool_usage_ > low_pri_pool_capacity_) {
// Overflow last entry in low-pri pool to bottom-pri pool.
lru_bottom_pri_ = lru_bottom_pri_->next;
assert(lru_bottom_pri_ != &lru_);
lru_bottom_pri_->SetInHighPriPool(false);
lru_bottom_pri_->SetInLowPriPool(false);
assert(low_pri_pool_usage_ >= lru_bottom_pri_->total_charge);
low_pri_pool_usage_ -= lru_bottom_pri_->total_charge;
}
}
void LRUCacheShard::EvictFromLRU(size_t charge,
autovector<LRUHandle*>* deleted) {
while ((usage_ + charge) > capacity_ && lru_.next != &lru_) {
LRUHandle* old = lru_.next;
// LRU list contains only elements which can be evicted.
assert(old->InCache() && !old->HasRefs());
LRU_Remove(old);
table_.Remove(old->key(), old->hash);
old->SetInCache(false);
assert(usage_ >= old->total_charge);
usage_ -= old->total_charge;
deleted->push_back(old);
}
}
void LRUCacheShard::NotifyEvicted(
const autovector<LRUHandle*>& evicted_handles) {
MemoryAllocator* alloc = table_.GetAllocator();
for (LRUHandle* entry : evicted_handles) {
if (eviction_callback_ &&
eviction_callback_(entry->key(),
reinterpret_cast<Cache::Handle*>(entry))) {
// Callback took ownership of obj; just free handle
free(entry);
} else {
// Free the entries here outside of mutex for performance reasons.
entry->Free(alloc);
}
}
}
void LRUCacheShard::SetCapacity(size_t capacity) {
autovector<LRUHandle*> last_reference_list;
{
DMutexLock l(mutex_);
capacity_ = capacity;
high_pri_pool_capacity_ = capacity_ * high_pri_pool_ratio_;
low_pri_pool_capacity_ = capacity_ * low_pri_pool_ratio_;
EvictFromLRU(0, &last_reference_list);
}
NotifyEvicted(last_reference_list);
}
void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
DMutexLock l(mutex_);
strict_capacity_limit_ = strict_capacity_limit;
}
Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle) {
Status s = Status::OK();
autovector<LRUHandle*> last_reference_list;
{
DMutexLock l(mutex_);
// Free the space following strict LRU policy until enough space
// is freed or the lru list is empty.
EvictFromLRU(e->total_charge, &last_reference_list);
if ((usage_ + e->total_charge) > capacity_ &&
(strict_capacity_limit_ || handle == nullptr)) {
e->SetInCache(false);
if (handle == nullptr) {
// Don't insert the entry but still return ok, as if the entry inserted
// into cache and get evicted immediately.
last_reference_list.push_back(e);
} else {
free(e);
e = nullptr;
*handle = nullptr;
s = Status::MemoryLimit("Insert failed due to LRU cache being full.");
}
} else {
// Insert into the cache. Note that the cache might get larger than its
// capacity if not enough space was freed up.
LRUHandle* old = table_.Insert(e);
usage_ += e->total_charge;
if (old != nullptr) {
s = Status::OkOverwritten();
assert(old->InCache());
old->SetInCache(false);
if (!old->HasRefs()) {
// old is on LRU because it's in cache and its reference count is 0.
LRU_Remove(old);
assert(usage_ >= old->total_charge);
usage_ -= old->total_charge;
last_reference_list.push_back(old);
}
}
if (handle == nullptr) {
LRU_Insert(e);
} else {
// If caller already holds a ref, no need to take one here.
if (!e->HasRefs()) {
e->Ref();
}
*handle = e;
}
}
}
NotifyEvicted(last_reference_list);
return s;
}
LRUHandle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash,
const Cache::CacheItemHelper* /*helper*/,
Cache::CreateContext* /*create_context*/,
Cache::Priority /*priority*/,
Statistics* /*stats*/) {
DMutexLock l(mutex_);
LRUHandle* e = table_.Lookup(key, hash);
if (e != nullptr) {
assert(e->InCache());
if (!e->HasRefs()) {
// The entry is in LRU since it's in hash and has no external
// references.
LRU_Remove(e);
}
e->Ref();
e->SetHit();
}
return e;
}
bool LRUCacheShard::Ref(LRUHandle* e) {
DMutexLock l(mutex_);
// To create another reference - entry must be already externally referenced.
assert(e->HasRefs());
e->Ref();
return true;
}
void LRUCacheShard::SetHighPriorityPoolRatio(double high_pri_pool_ratio) {
DMutexLock l(mutex_);
high_pri_pool_ratio_ = high_pri_pool_ratio;
high_pri_pool_capacity_ = capacity_ * high_pri_pool_ratio_;
MaintainPoolSize();
}
void LRUCacheShard::SetLowPriorityPoolRatio(double low_pri_pool_ratio) {
DMutexLock l(mutex_);
low_pri_pool_ratio_ = low_pri_pool_ratio;
low_pri_pool_capacity_ = capacity_ * low_pri_pool_ratio_;
MaintainPoolSize();
}
bool LRUCacheShard::Release(LRUHandle* e, bool /*useful*/,
bool erase_if_last_ref) {
if (e == nullptr) {
return false;
}
bool must_free;
bool was_in_cache;
{
DMutexLock l(mutex_);
must_free = e->Unref();
was_in_cache = e->InCache();
if (must_free && was_in_cache) {
// The item is still in cache, and nobody else holds a reference to it.
if (usage_ > capacity_ || erase_if_last_ref) {
// The LRU list must be empty since the cache is full.
assert(lru_.next == &lru_ || erase_if_last_ref);
// Take this opportunity and remove the item.
table_.Remove(e->key(), e->hash);
e->SetInCache(false);
} else {
// Put the item back on the LRU list, and don't free it.
LRU_Insert(e);
must_free = false;
}
}
// If about to be freed, then decrement the cache usage.
if (must_free) {
assert(usage_ >= e->total_charge);
usage_ -= e->total_charge;
}
}
// Free the entry here outside of mutex for performance reasons.
if (must_free) {
// Only call eviction callback if we're sure no one requested erasure
// FIXME: disabled because of test churn
if (false && was_in_cache && !erase_if_last_ref && eviction_callback_ &&
eviction_callback_(e->key(), reinterpret_cast<Cache::Handle*>(e))) {
// Callback took ownership of obj; just free handle
free(e);
} else {
e->Free(table_.GetAllocator());
}
}
return must_free;
}
LRUHandle* LRUCacheShard::CreateHandle(const Slice& key, uint32_t hash,
Cache::ObjectPtr value,
const Cache::CacheItemHelper* helper,
size_t charge) {
assert(helper);
// value == nullptr is reserved for indicating failure in SecondaryCache
assert(!(helper->IsSecondaryCacheCompatible() && value == nullptr));
// Allocate the memory here outside of the mutex.
// If the cache is full, we'll have to release it.
// It shouldn't happen very often though.
LRUHandle* e =
static_cast<LRUHandle*>(malloc(sizeof(LRUHandle) - 1 + key.size()));
e->value = value;
e->m_flags = 0;
e->im_flags = 0;
e->helper = helper;
e->key_length = key.size();
e->hash = hash;
e->refs = 0;
e->next = e->prev = nullptr;
memcpy(e->key_data, key.data(), key.size());
e->CalcTotalCharge(charge, metadata_charge_policy_);
return e;
}
Status LRUCacheShard::Insert(const Slice& key, uint32_t hash,
Cache::ObjectPtr value,
const Cache::CacheItemHelper* helper,
size_t charge, LRUHandle** handle,
Cache::Priority priority) {
LRUHandle* e = CreateHandle(key, hash, value, helper, charge);
e->SetPriority(priority);
e->SetInCache(true);
return InsertItem(e, handle);
}
LRUHandle* LRUCacheShard::CreateStandalone(const Slice& key, uint32_t hash,
Cache::ObjectPtr value,
const Cache::CacheItemHelper* helper,
size_t charge,
bool allow_uncharged) {
LRUHandle* e = CreateHandle(key, hash, value, helper, charge);
e->SetIsStandalone(true);
e->Ref();
autovector<LRUHandle*> last_reference_list;
{
DMutexLock l(mutex_);
EvictFromLRU(e->total_charge, &last_reference_list);
if (strict_capacity_limit_ && (usage_ + e->total_charge) > capacity_) {
if (allow_uncharged) {
e->total_charge = 0;
} else {
free(e);
e = nullptr;
}
} else {
usage_ += e->total_charge;
}
}
NotifyEvicted(last_reference_list);
return e;
}
void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {
LRUHandle* e;
bool last_reference = false;
{
DMutexLock l(mutex_);
e = table_.Remove(key, hash);
if (e != nullptr) {
assert(e->InCache());
e->SetInCache(false);
if (!e->HasRefs()) {
// The entry is in LRU since it's in hash and has no external references
LRU_Remove(e);
assert(usage_ >= e->total_charge);
usage_ -= e->total_charge;
last_reference = true;
}
}
}
// Free the entry here outside of mutex for performance reasons.
// last_reference will only be true if e != nullptr.
if (last_reference) {
e->Free(table_.GetAllocator());
}
}
size_t LRUCacheShard::GetUsage() const {
DMutexLock l(mutex_);
return usage_;
}
size_t LRUCacheShard::GetPinnedUsage() const {
DMutexLock l(mutex_);
assert(usage_ >= lru_usage_);
return usage_ - lru_usage_;
}
size_t LRUCacheShard::GetOccupancyCount() const {
DMutexLock l(mutex_);
return table_.GetOccupancyCount();
}
size_t LRUCacheShard::GetTableAddressCount() const {
DMutexLock l(mutex_);
return size_t{1} << table_.GetLengthBits();
}
void LRUCacheShard::AppendPrintableOptions(std::string& str) const {
const int kBufferSize = 200;
char buffer[kBufferSize];
{
DMutexLock l(mutex_);
snprintf(buffer, kBufferSize, " high_pri_pool_ratio: %.3lf\n",
high_pri_pool_ratio_);
snprintf(buffer + strlen(buffer), kBufferSize - strlen(buffer),
" low_pri_pool_ratio: %.3lf\n", low_pri_pool_ratio_);
}
str.append(buffer);
}
LRUCache::LRUCache(const LRUCacheOptions& opts) : ShardedCache(opts) {
size_t per_shard = GetPerShardCapacity();
MemoryAllocator* alloc = memory_allocator();
InitShards([&](LRUCacheShard* cs) {
new (cs) LRUCacheShard(per_shard, opts.strict_capacity_limit,
opts.high_pri_pool_ratio, opts.low_pri_pool_ratio,
opts.use_adaptive_mutex, opts.metadata_charge_policy,
/* max_upper_hash_bits */ 32 - opts.num_shard_bits,
alloc, &eviction_callback_);
});
}
Cache::ObjectPtr LRUCache::Value(Handle* handle) {
auto h = reinterpret_cast<const LRUHandle*>(handle);
return h->value;
}
size_t LRUCache::GetCharge(Handle* handle) const {
return reinterpret_cast<const LRUHandle*>(handle)->GetCharge(
GetShard(0).metadata_charge_policy_);
}
const Cache::CacheItemHelper* LRUCache::GetCacheItemHelper(
Handle* handle) const {
auto h = reinterpret_cast<const LRUHandle*>(handle);
return h->helper;
}
size_t LRUCache::TEST_GetLRUSize() {
return SumOverShards([](LRUCacheShard& cs) { return cs.TEST_GetLRUSize(); });
}
double LRUCache::GetHighPriPoolRatio() {
return GetShard(0).GetHighPriPoolRatio();
}
} // namespace lru_cache
std::shared_ptr<Cache> LRUCacheOptions::MakeSharedCache() const {
if (num_shard_bits >= 20) {
return nullptr; // The cache cannot be sharded into too many fine pieces.
}
if (high_pri_pool_ratio < 0.0 || high_pri_pool_ratio > 1.0) {
// Invalid high_pri_pool_ratio
return nullptr;
}
if (low_pri_pool_ratio < 0.0 || low_pri_pool_ratio > 1.0) {
// Invalid low_pri_pool_ratio
return nullptr;
}
if (low_pri_pool_ratio + high_pri_pool_ratio > 1.0) {
// Invalid high_pri_pool_ratio and low_pri_pool_ratio combination
return nullptr;
}
// For sanitized options
LRUCacheOptions opts = *this;
if (opts.num_shard_bits < 0) {
opts.num_shard_bits = GetDefaultCacheShardBits(capacity);
}
std::shared_ptr<Cache> cache = std::make_shared<LRUCache>(opts);
if (secondary_cache) {
cache = std::make_shared<CacheWithSecondaryAdapter>(cache, secondary_cache);
}
return cache;
}
std::shared_ptr<RowCache> LRUCacheOptions::MakeSharedRowCache() const {
if (secondary_cache) {
// Not allowed for a RowCache
return nullptr;
}
// Works while RowCache is an alias for Cache
return MakeSharedCache();
}
} // namespace ROCKSDB_NAMESPACE

467
cache/lru_cache.h vendored

@ -0,0 +1,467 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <memory>
#include <string>
#include "cache/sharded_cache.h"
#include "port/lang.h"
#include "port/likely.h"
#include "port/malloc.h"
#include "port/port.h"
#include "util/autovector.h"
#include "util/distributed_mutex.h"
namespace ROCKSDB_NAMESPACE {
namespace lru_cache {
// LRU cache implementation. This class is not thread-safe.
// An entry is a variable length heap-allocated structure.
// Entries are referenced by cache and/or by any external entity.
// The cache keeps all its entries in a hash table. Some elements
// are also stored on LRU list.
//
// LRUHandle can be in these states:
// 1. Referenced externally AND in hash table.
// In that case the entry is *not* in the LRU list
// (refs >= 1 && in_cache == true)
// 2. Not referenced externally AND in hash table.
// In that case the entry is in the LRU list and can be freed.
// (refs == 0 && in_cache == true)
// 3. Referenced externally AND not in hash table.
// In that case the entry is not in the LRU list and not in hash table.
// The entry must be freed if refs becomes 0 in this state.
// (refs >= 1 && in_cache == false)
// If you call LRUCacheShard::Release enough times on an entry in state 1, it
// will go into state 2. To move from state 1 to state 3, either call
// LRUCacheShard::Erase or LRUCacheShard::Insert with the same key (but
// possibly different value). To move from state 2 to state 1, use
// LRUCacheShard::Lookup.
// While refs > 0, public properties like value and deleter must not change.
struct LRUHandle {
Cache::ObjectPtr value;
const Cache::CacheItemHelper* helper;
LRUHandle* next_hash;
LRUHandle* next;
LRUHandle* prev;
size_t total_charge; // TODO(opt): Only allow uint32_t?
size_t key_length;
// The hash of key(). Used for fast sharding and comparisons.
uint32_t hash;
// The number of external refs to this entry. The cache itself is not counted.
uint32_t refs;
// Mutable flags - access controlled by mutex
// The m_ and M_ prefixes (and im_ and IM_ later) are to hopefully avoid
// checking an M_ flag on im_flags or an IM_ flag on m_flags.
uint8_t m_flags;
enum MFlags : uint8_t {
// Whether this entry is referenced by the hash table.
M_IN_CACHE = (1 << 0),
// Whether this entry has had any lookups (hits).
M_HAS_HIT = (1 << 1),
// Whether this entry is in high-pri pool.
M_IN_HIGH_PRI_POOL = (1 << 2),
// Whether this entry is in low-pri pool.
M_IN_LOW_PRI_POOL = (1 << 3),
};
// "Immutable" flags - only set in single-threaded context and then
// can be accessed without mutex
uint8_t im_flags;
enum ImFlags : uint8_t {
// Whether this entry is high priority entry.
IM_IS_HIGH_PRI = (1 << 0),
// Whether this entry is low priority entry.
IM_IS_LOW_PRI = (1 << 1),
// Marks result handles that should not be inserted into cache
IM_IS_STANDALONE = (1 << 2),
};
// Beginning of the key (MUST BE THE LAST FIELD IN THIS STRUCT!)
char key_data[1];
Slice key() const { return Slice(key_data, key_length); }
// For HandleImpl concept
uint32_t GetHash() const { return hash; }
// Increase the reference count by 1.
void Ref() { refs++; }
// Just reduce the reference count by 1. Return true if it was last reference.
bool Unref() {
assert(refs > 0);
refs--;
return refs == 0;
}
// Return true if there are external refs, false otherwise.
bool HasRefs() const { return refs > 0; }
bool InCache() const { return m_flags & M_IN_CACHE; }
bool IsHighPri() const { return im_flags & IM_IS_HIGH_PRI; }
bool InHighPriPool() const { return m_flags & M_IN_HIGH_PRI_POOL; }
bool IsLowPri() const { return im_flags & IM_IS_LOW_PRI; }
bool InLowPriPool() const { return m_flags & M_IN_LOW_PRI_POOL; }
bool HasHit() const { return m_flags & M_HAS_HIT; }
bool IsStandalone() const { return im_flags & IM_IS_STANDALONE; }
void SetInCache(bool in_cache) {
if (in_cache) {
m_flags |= M_IN_CACHE;
} else {
m_flags &= ~M_IN_CACHE;
}
}
void SetPriority(Cache::Priority priority) {
if (priority == Cache::Priority::HIGH) {
im_flags |= IM_IS_HIGH_PRI;
im_flags &= ~IM_IS_LOW_PRI;
} else if (priority == Cache::Priority::LOW) {
im_flags &= ~IM_IS_HIGH_PRI;
im_flags |= IM_IS_LOW_PRI;
} else {
im_flags &= ~IM_IS_HIGH_PRI;
im_flags &= ~IM_IS_LOW_PRI;
}
}
void SetInHighPriPool(bool in_high_pri_pool) {
if (in_high_pri_pool) {
m_flags |= M_IN_HIGH_PRI_POOL;
} else {
m_flags &= ~M_IN_HIGH_PRI_POOL;
}
}
void SetInLowPriPool(bool in_low_pri_pool) {
if (in_low_pri_pool) {
m_flags |= M_IN_LOW_PRI_POOL;
} else {
m_flags &= ~M_IN_LOW_PRI_POOL;
}
}
void SetHit() { m_flags |= M_HAS_HIT; }
void SetIsStandalone(bool is_standalone) {
if (is_standalone) {
im_flags |= IM_IS_STANDALONE;
} else {
im_flags &= ~IM_IS_STANDALONE;
}
}
void Free(MemoryAllocator* allocator) {
assert(refs == 0);
assert(helper);
if (helper->del_cb) {
helper->del_cb(value, allocator);
}
free(this);
}
inline size_t CalcuMetaCharge(
CacheMetadataChargePolicy metadata_charge_policy) const {
if (metadata_charge_policy != kFullChargeCacheMetadata) {
return 0;
} else {
#ifdef ROCKSDB_MALLOC_USABLE_SIZE
return malloc_usable_size(
const_cast<void*>(static_cast<const void*>(this)));
#else
// This is the size that is used when a new handle is created.
return sizeof(LRUHandle) - 1 + key_length;
#endif
}
}
// Calculate the memory usage by metadata.
inline void CalcTotalCharge(
size_t charge, CacheMetadataChargePolicy metadata_charge_policy) {
total_charge = charge + CalcuMetaCharge(metadata_charge_policy);
}
inline size_t GetCharge(
CacheMetadataChargePolicy metadata_charge_policy) const {
size_t meta_charge = CalcuMetaCharge(metadata_charge_policy);
assert(total_charge >= meta_charge);
return total_charge - meta_charge;
}
};
// We provide our own simple hash table since it removes a whole bunch
// of porting hacks and is also faster than some of the built-in hash
// table implementations in some of the compiler/runtime combinations
// we have tested. E.g., readrandom speeds up by ~5% over the g++
// 4.4.3's builtin hashtable.
class LRUHandleTable {
public:
explicit LRUHandleTable(int max_upper_hash_bits, MemoryAllocator* allocator);
~LRUHandleTable();
LRUHandle* Lookup(const Slice& key, uint32_t hash);
LRUHandle* Insert(LRUHandle* h);
LRUHandle* Remove(const Slice& key, uint32_t hash);
template <typename T>
void ApplyToEntriesRange(T func, size_t index_begin, size_t index_end) {
for (size_t i = index_begin; i < index_end; i++) {
LRUHandle* h = list_[i];
while (h != nullptr) {
auto n = h->next_hash;
assert(h->InCache());
func(h);
h = n;
}
}
}
int GetLengthBits() const { return length_bits_; }
size_t GetOccupancyCount() const { return elems_; }
MemoryAllocator* GetAllocator() const { return allocator_; }
private:
// Return a pointer to slot that points to a cache entry that
// matches key/hash. If there is no such cache entry, return a
// pointer to the trailing slot in the corresponding linked list.
LRUHandle** FindPointer(const Slice& key, uint32_t hash);
void Resize();
// Number of hash bits (upper because lower bits used for sharding)
// used for table index. Length == 1 << length_bits_
int length_bits_;
// The table consists of an array of buckets where each bucket is
// a linked list of cache entries that hash into the bucket.
std::unique_ptr<LRUHandle*[]> list_;
// Number of elements currently in the table.
uint32_t elems_;
// Set from max_upper_hash_bits (see constructor).
const int max_length_bits_;
// From Cache, needed for delete
MemoryAllocator* const allocator_;
};
// A single shard of sharded cache.
class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
public:
// NOTE: the eviction_callback ptr is saved, as is it assumed to be kept
// alive in Cache.
LRUCacheShard(size_t capacity, bool strict_capacity_limit,
double high_pri_pool_ratio, double low_pri_pool_ratio,
bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy,
int max_upper_hash_bits, MemoryAllocator* allocator,
const Cache::EvictionCallback* eviction_callback);
public: // Type definitions expected as parameter to ShardedCache
using HandleImpl = LRUHandle;
using HashVal = uint32_t;
using HashCref = uint32_t;
public: // Function definitions expected as parameter to ShardedCache
static inline HashVal ComputeHash(const Slice& key, uint32_t seed) {
return Lower32of64(GetSliceNPHash64(key, seed));
}
// Separate from constructor so caller can easily make an array of LRUCache
// if current usage is more than new capacity, the function will attempt to
// free the needed space.
void SetCapacity(size_t capacity);
// Set the flag to reject insertion if cache if full.
void SetStrictCapacityLimit(bool strict_capacity_limit);
// Set percentage of capacity reserved for high-pri cache entries.
void SetHighPriorityPoolRatio(double high_pri_pool_ratio);
// Set percentage of capacity reserved for low-pri cache entries.
void SetLowPriorityPoolRatio(double low_pri_pool_ratio);
// Like Cache methods, but with an extra "hash" parameter.
Status Insert(const Slice& key, uint32_t hash, Cache::ObjectPtr value,
const Cache::CacheItemHelper* helper, size_t charge,
LRUHandle** handle, Cache::Priority priority);
LRUHandle* CreateStandalone(const Slice& key, uint32_t hash,
Cache::ObjectPtr obj,
const Cache::CacheItemHelper* helper,
size_t charge, bool allow_uncharged);
LRUHandle* Lookup(const Slice& key, uint32_t hash,
const Cache::CacheItemHelper* helper,
Cache::CreateContext* create_context,
Cache::Priority priority, Statistics* stats);
bool Release(LRUHandle* handle, bool useful, bool erase_if_last_ref);
bool Ref(LRUHandle* handle);
void Erase(const Slice& key, uint32_t hash);
// Although in some platforms the update of size_t is atomic, to make sure
// GetUsage() and GetPinnedUsage() work correctly under any platform, we'll
// protect them with mutex_.
size_t GetUsage() const;
size_t GetPinnedUsage() const;
size_t GetOccupancyCount() const;
size_t GetTableAddressCount() const;
void ApplyToSomeEntries(
const std::function<void(const Slice& key, Cache::ObjectPtr value,
size_t charge,
const Cache::CacheItemHelper* helper)>& callback,
size_t average_entries_per_lock, size_t* state);
void EraseUnRefEntries();
public: // other function definitions
void TEST_GetLRUList(LRUHandle** lru, LRUHandle** lru_low_pri,
LRUHandle** lru_bottom_pri);
// Retrieves number of elements in LRU, for unit test purpose only.
// Not threadsafe.
size_t TEST_GetLRUSize();
// Retrieves high pri pool ratio
double GetHighPriPoolRatio();
// Retrieves low pri pool ratio
double GetLowPriPoolRatio();
void AppendPrintableOptions(std::string& /*str*/) const;
private:
friend class LRUCache;
// Insert an item into the hash table and, if handle is null, insert into
// the LRU list. Older items are evicted as necessary. Frees `item` on
// non-OK status.
Status InsertItem(LRUHandle* item, LRUHandle** handle);
void LRU_Remove(LRUHandle* e);
void LRU_Insert(LRUHandle* e);
// Overflow the last entry in high-pri pool to low-pri pool until size of
// high-pri pool is no larger than the size specify by high_pri_pool_pct.
void MaintainPoolSize();
// Free some space following strict LRU policy until enough space
// to hold (usage_ + charge) is freed or the lru list is empty
// This function is not thread safe - it needs to be executed while
// holding the mutex_.
void EvictFromLRU(size_t charge, autovector<LRUHandle*>* deleted);
void NotifyEvicted(const autovector<LRUHandle*>& evicted_handles);
LRUHandle* CreateHandle(const Slice& key, uint32_t hash,
Cache::ObjectPtr value,
const Cache::CacheItemHelper* helper, size_t charge);
// Initialized before use.
size_t capacity_;
// Memory size for entries in high-pri pool.
size_t high_pri_pool_usage_;
// Memory size for entries in low-pri pool.
size_t low_pri_pool_usage_;
// Whether to reject insertion if cache reaches its full capacity.
bool strict_capacity_limit_;
// Ratio of capacity reserved for high priority cache entries.
double high_pri_pool_ratio_;
// High-pri pool size, equals to capacity * high_pri_pool_ratio.
// Remember the value to avoid recomputing each time.
double high_pri_pool_capacity_;
// Ratio of capacity reserved for low priority cache entries.
double low_pri_pool_ratio_;
// Low-pri pool size, equals to capacity * low_pri_pool_ratio.
// Remember the value to avoid recomputing each time.
double low_pri_pool_capacity_;
// Dummy head of LRU list.
// lru.prev is newest entry, lru.next is oldest entry.
// LRU contains items which can be evicted, ie reference only by cache
LRUHandle lru_;
// Pointer to head of low-pri pool in LRU list.
LRUHandle* lru_low_pri_;
// Pointer to head of bottom-pri pool in LRU list.
LRUHandle* lru_bottom_pri_;
// ------------^^^^^^^^^^^^^-----------
// Not frequently modified data members
// ------------------------------------
//
// We separate data members that are updated frequently from the ones that
// are not frequently updated so that they don't share the same cache line
// which will lead into false cache sharing
//
// ------------------------------------
// Frequently modified data members
// ------------vvvvvvvvvvvvv-----------
LRUHandleTable table_;
// Memory size for entries residing in the cache.
size_t usage_;
// Memory size for entries residing only in the LRU list.
size_t lru_usage_;
// mutex_ protects the following state.
// We don't count mutex_ as the cache's internal state so semantically we
// don't mind mutex_ invoking the non-const actions.
mutable DMutex mutex_;
// A reference to Cache::eviction_callback_
const Cache::EvictionCallback& eviction_callback_;
};
class LRUCache
#ifdef NDEBUG
final
#endif
: public ShardedCache<LRUCacheShard> {
public:
explicit LRUCache(const LRUCacheOptions& opts);
const char* Name() const override { return "LRUCache"; }
ObjectPtr Value(Handle* handle) override;
size_t GetCharge(Handle* handle) const override;
const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override;
// Retrieves number of elements in LRU, for unit test purpose only.
size_t TEST_GetLRUSize();
// Retrieves high pri pool ratio.
double GetHighPriPoolRatio();
};
} // namespace lru_cache
using LRUCache = lru_cache::LRUCache;
using LRUHandle = lru_cache::LRUHandle;
using LRUCacheShard = lru_cache::LRUCacheShard;
} // namespace ROCKSDB_NAMESPACE

File diff suppressed because it is too large Load Diff

@ -0,0 +1,44 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "rocksdb/secondary_cache.h"
#include "cache/cache_entry_roles.h"
namespace ROCKSDB_NAMESPACE {
namespace {
void NoopDelete(Cache::ObjectPtr, MemoryAllocator*) {}
size_t SliceSize(Cache::ObjectPtr obj) {
return static_cast<Slice*>(obj)->size();
}
Status SliceSaveTo(Cache::ObjectPtr from_obj, size_t from_offset, size_t length,
char* out) {
const Slice& slice = *static_cast<Slice*>(from_obj);
std::memcpy(out, slice.data() + from_offset, length);
return Status::OK();
}
Status FailCreate(const Slice&, Cache::CreateContext*, MemoryAllocator*,
Cache::ObjectPtr*, size_t*) {
return Status::NotSupported("Only for dumping data into SecondaryCache");
}
} // namespace
Status SecondaryCache::InsertSaved(const Slice& key, const Slice& saved) {
static Cache::CacheItemHelper helper_no_secondary{CacheEntryRole::kMisc,
&NoopDelete};
static Cache::CacheItemHelper helper{
CacheEntryRole::kMisc, &NoopDelete, &SliceSize,
&SliceSaveTo, &FailCreate, &helper_no_secondary};
// NOTE: depends on Insert() being synchronous, not keeping pointer `&saved`
return Insert(key, const_cast<Slice*>(&saved), &helper);
}
} // namespace ROCKSDB_NAMESPACE

@ -0,0 +1,433 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "cache/secondary_cache_adapter.h"
#include "monitoring/perf_context_imp.h"
#include "util/cast_util.h"
namespace ROCKSDB_NAMESPACE {
namespace {
// A distinct pointer value for marking "dummy" cache entries
struct Dummy {
char val[7] = "kDummy";
};
const Dummy kDummy{};
Cache::ObjectPtr const kDummyObj = const_cast<Dummy*>(&kDummy);
} // namespace
// When CacheWithSecondaryAdapter is constructed with the distribute_cache_res
// parameter set to true, it manages the entire memory budget across the
// primary and secondary cache. The secondary cache is assumed to be in
// memory, such as the CompressedSecondaryCache. When a placeholder entry
// is inserted by a CacheReservationManager instance to reserve memory,
// the CacheWithSecondaryAdapter ensures that the reservation is distributed
// proportionally across the primary/secondary caches.
//
// The primary block cache is initially sized to the sum of the primary cache
// budget + teh secondary cache budget, as follows -
// |--------- Primary Cache Configured Capacity -----------|
// |---Secondary Cache Budget----|----Primary Cache Budget-----|
//
// A ConcurrentCacheReservationManager member in the CacheWithSecondaryAdapter,
// pri_cache_res_,
// is used to help with tracking the distribution of memory reservations.
// Initially, it accounts for the entire secondary cache budget as a
// reservation against the primary cache. This shrinks the usable capacity of
// the primary cache to the budget that the user originally desired.
//
// |--Reservation for Sec Cache--|-Pri Cache Usable Capacity---|
//
// When a reservation placeholder is inserted into the adapter, it is inserted
// directly into the primary cache. This means the entire charge of the
// placeholder is counted against the primary cache. To compensate and count
// a portion of it against the secondary cache, the secondary cache Deflate()
// method is called to shrink it. Since the Deflate() causes the secondary
// actual usage to shrink, it is refelcted here by releasing an equal amount
// from the pri_cache_res_ reservation. The Deflate() in the secondary cache
// can be, but is not required to be, implemented using its own cache
// reservation manager.
//
// For example, if the pri/sec ratio is 70/30, and the combined capacity is
// 100MB, the intermediate and final state after inserting a reservation
// placeholder for 10MB would be as follows -
//
// |-Reservation for Sec Cache-|-Pri Cache Usable Capacity-|---R---|
// 1. After inserting the placeholder in primary
// |------- 30MB -------------|------- 60MB -------------|-10MB--|
// 2. After deflating the secondary and adjusting the reservation for
// secondary against the primary
// |------- 27MB -------------|------- 63MB -------------|-10MB--|
//
// Likewise, when the user inserted placeholder is released, the secondary
// cache Inflate() method is called to grow it, and the pri_cache_res_
// reservation is increased by an equal amount.
//
// Another way of implementing this would have been to simply split the user
// reservation into primary and seconary components. However, this would
// require allocating a structure to track the associated secondary cache
// reservation, which adds some complexity and overhead.
//
CacheWithSecondaryAdapter::CacheWithSecondaryAdapter(
std::shared_ptr<Cache> target,
std::shared_ptr<SecondaryCache> secondary_cache, bool distribute_cache_res)
: CacheWrapper(std::move(target)),
secondary_cache_(std::move(secondary_cache)),
distribute_cache_res_(distribute_cache_res) {
target_->SetEvictionCallback([this](const Slice& key, Handle* handle) {
return EvictionHandler(key, handle);
});
if (distribute_cache_res_) {
size_t sec_capacity = 0;
pri_cache_res_ = std::make_shared<ConcurrentCacheReservationManager>(
std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
target_));
Status s = secondary_cache_->GetCapacity(sec_capacity);
assert(s.ok());
// Initially, the primary cache is sized to uncompressed cache budget plsu
// compressed secondary cache budget. The secondary cache budget is then
// taken away from the primary cache through cache reservations. Later,
// when a placeholder entry is inserted by the caller, its inserted
// into the primary cache and the portion that should be assigned to the
// secondary cache is freed from the reservation.
s = pri_cache_res_->UpdateCacheReservation(sec_capacity);
assert(s.ok());
sec_cache_res_ratio_ = (double)sec_capacity / target_->GetCapacity();
}
}
CacheWithSecondaryAdapter::~CacheWithSecondaryAdapter() {
// `*this` will be destroyed before `*target_`, so we have to prevent
// use after free
target_->SetEvictionCallback({});
#ifndef NDEBUG
if (distribute_cache_res_) {
size_t sec_capacity = 0;
Status s = secondary_cache_->GetCapacity(sec_capacity);
assert(s.ok());
assert(pri_cache_res_->GetTotalReservedCacheSize() == sec_capacity);
}
#endif // NDEBUG
}
bool CacheWithSecondaryAdapter::EvictionHandler(const Slice& key,
Handle* handle) {
auto helper = GetCacheItemHelper(handle);
if (helper->IsSecondaryCacheCompatible()) {
auto obj = target_->Value(handle);
// Ignore dummy entry
if (obj != kDummyObj) {
// Spill into secondary cache.
secondary_cache_->Insert(key, obj, helper).PermitUncheckedError();
}
}
// Never takes ownership of obj
return false;
}
bool CacheWithSecondaryAdapter::ProcessDummyResult(Cache::Handle** handle,
bool erase) {
if (*handle && target_->Value(*handle) == kDummyObj) {
target_->Release(*handle, erase);
*handle = nullptr;
return true;
} else {
return false;
}
}
void CacheWithSecondaryAdapter::CleanupCacheObject(
ObjectPtr obj, const CacheItemHelper* helper) {
if (helper->del_cb) {
helper->del_cb(obj, memory_allocator());
}
}
Cache::Handle* CacheWithSecondaryAdapter::Promote(
std::unique_ptr<SecondaryCacheResultHandle>&& secondary_handle,
const Slice& key, const CacheItemHelper* helper, Priority priority,
Statistics* stats, bool found_dummy_entry, bool kept_in_sec_cache) {
assert(secondary_handle->IsReady());
ObjectPtr obj = secondary_handle->Value();
if (!obj) {
// Nothing found.
return nullptr;
}
// Found something.
switch (helper->role) {
case CacheEntryRole::kFilterBlock:
RecordTick(stats, SECONDARY_CACHE_FILTER_HITS);
break;
case CacheEntryRole::kIndexBlock:
RecordTick(stats, SECONDARY_CACHE_INDEX_HITS);
break;
case CacheEntryRole::kDataBlock:
RecordTick(stats, SECONDARY_CACHE_DATA_HITS);
break;
default:
break;
}
PERF_COUNTER_ADD(secondary_cache_hit_count, 1);
RecordTick(stats, SECONDARY_CACHE_HITS);
// Note: SecondaryCache::Size() is really charge (from the CreateCallback)
size_t charge = secondary_handle->Size();
Handle* result = nullptr;
// Insert into primary cache, possibly as a standalone+dummy entries.
if (secondary_cache_->SupportForceErase() && !found_dummy_entry) {
// Create standalone and insert dummy
// Allow standalone to be created even if cache is full, to avoid
// reading the entry from storage.
result =
CreateStandalone(key, obj, helper, charge, /*allow_uncharged*/ true);
assert(result);
PERF_COUNTER_ADD(block_cache_standalone_handle_count, 1);
// Insert dummy to record recent use
// TODO: try to avoid case where inserting this dummy could overwrite a
// regular entry
Status s = Insert(key, kDummyObj, &kNoopCacheItemHelper, /*charge=*/0,
/*handle=*/nullptr, priority);
s.PermitUncheckedError();
// Nothing to do or clean up on dummy insertion failure
} else {
// Insert regular entry into primary cache.
// Don't allow it to spill into secondary cache again if it was kept there.
Status s = Insert(
key, obj, kept_in_sec_cache ? helper->without_secondary_compat : helper,
charge, &result, priority);
if (s.ok()) {
assert(result);
PERF_COUNTER_ADD(block_cache_real_handle_count, 1);
} else {
// Create standalone result instead, even if cache is full, to avoid
// reading the entry from storage.
result =
CreateStandalone(key, obj, helper, charge, /*allow_uncharged*/ true);
assert(result);
PERF_COUNTER_ADD(block_cache_standalone_handle_count, 1);
}
}
return result;
}
Status CacheWithSecondaryAdapter::Insert(const Slice& key, ObjectPtr value,
const CacheItemHelper* helper,
size_t charge, Handle** handle,
Priority priority) {
Status s = target_->Insert(key, value, helper, charge, handle, priority);
if (s.ok() && value == nullptr && distribute_cache_res_) {
size_t sec_charge = static_cast<size_t>(charge * (sec_cache_res_ratio_));
s = secondary_cache_->Deflate(sec_charge);
assert(s.ok());
s = pri_cache_res_->UpdateCacheReservation(sec_charge, /*increase=*/false);
assert(s.ok());
}
return s;
}
Cache::Handle* CacheWithSecondaryAdapter::Lookup(const Slice& key,
const CacheItemHelper* helper,
CreateContext* create_context,
Priority priority,
Statistics* stats) {
// NOTE: we could just StartAsyncLookup() and Wait(), but this should be a bit
// more efficient
Handle* result =
target_->Lookup(key, helper, create_context, priority, stats);
bool secondary_compatible = helper && helper->IsSecondaryCacheCompatible();
bool found_dummy_entry =
ProcessDummyResult(&result, /*erase=*/secondary_compatible);
if (!result && secondary_compatible) {
// Try our secondary cache
bool kept_in_sec_cache = false;
std::unique_ptr<SecondaryCacheResultHandle> secondary_handle =
secondary_cache_->Lookup(key, helper, create_context, /*wait*/ true,
found_dummy_entry, /*out*/ kept_in_sec_cache);
if (secondary_handle) {
result = Promote(std::move(secondary_handle), key, helper, priority,
stats, found_dummy_entry, kept_in_sec_cache);
}
}
return result;
}
bool CacheWithSecondaryAdapter::Release(Handle* handle,
bool erase_if_last_ref) {
if (erase_if_last_ref) {
ObjectPtr v = target_->Value(handle);
if (v == nullptr && distribute_cache_res_) {
size_t charge = target_->GetCharge(handle);
size_t sec_charge = static_cast<size_t>(charge * (sec_cache_res_ratio_));
Status s = secondary_cache_->Inflate(sec_charge);
assert(s.ok());
s = pri_cache_res_->UpdateCacheReservation(sec_charge, /*increase=*/true);
assert(s.ok());
}
}
return target_->Release(handle, erase_if_last_ref);
}
Cache::ObjectPtr CacheWithSecondaryAdapter::Value(Handle* handle) {
ObjectPtr v = target_->Value(handle);
// TODO with stacked secondaries: might fail in EvictionHandler
assert(v != kDummyObj);
return v;
}
void CacheWithSecondaryAdapter::StartAsyncLookupOnMySecondary(
AsyncLookupHandle& async_handle) {
assert(!async_handle.IsPending());
assert(async_handle.result_handle == nullptr);
std::unique_ptr<SecondaryCacheResultHandle> secondary_handle =
secondary_cache_->Lookup(async_handle.key, async_handle.helper,
async_handle.create_context, /*wait*/ false,
async_handle.found_dummy_entry,
/*out*/ async_handle.kept_in_sec_cache);
if (secondary_handle) {
// TODO with stacked secondaries: Check & process if already ready?
async_handle.pending_handle = secondary_handle.release();
async_handle.pending_cache = secondary_cache_.get();
}
}
void CacheWithSecondaryAdapter::StartAsyncLookup(
AsyncLookupHandle& async_handle) {
target_->StartAsyncLookup(async_handle);
if (!async_handle.IsPending()) {
bool secondary_compatible =
async_handle.helper &&
async_handle.helper->IsSecondaryCacheCompatible();
async_handle.found_dummy_entry |= ProcessDummyResult(
&async_handle.result_handle, /*erase=*/secondary_compatible);
if (async_handle.Result() == nullptr && secondary_compatible) {
// Not found and not pending on another secondary cache
StartAsyncLookupOnMySecondary(async_handle);
}
}
}
void CacheWithSecondaryAdapter::WaitAll(AsyncLookupHandle* async_handles,
size_t count) {
if (count == 0) {
// Nothing to do
return;
}
// Requests that are pending on *my* secondary cache, at the start of this
// function
std::vector<AsyncLookupHandle*> my_pending;
// Requests that are pending on an "inner" secondary cache (managed somewhere
// under target_), as of the start of this function
std::vector<AsyncLookupHandle*> inner_pending;
// Initial accounting of pending handles, excluding those already handled
// by "outer" secondary caches. (See cur->pending_cache = nullptr.)
for (size_t i = 0; i < count; ++i) {
AsyncLookupHandle* cur = async_handles + i;
if (cur->pending_cache) {
assert(cur->IsPending());
assert(cur->helper);
assert(cur->helper->IsSecondaryCacheCompatible());
if (cur->pending_cache == secondary_cache_.get()) {
my_pending.push_back(cur);
// Mark as "to be handled by this caller"
cur->pending_cache = nullptr;
} else {
// Remember as potentially needing a lookup in my secondary
inner_pending.push_back(cur);
}
}
}
// Wait on inner-most cache lookups first
// TODO with stacked secondaries: because we are not using proper
// async/await constructs here yet, there is a false synchronization point
// here where all the results at one level are needed before initiating
// any lookups at the next level. Probably not a big deal, but worth noting.
if (!inner_pending.empty()) {
target_->WaitAll(async_handles, count);
}
// For those that failed to find something, convert to lookup in my
// secondary cache.
for (AsyncLookupHandle* cur : inner_pending) {
if (cur->Result() == nullptr) {
// Not found, try my secondary
StartAsyncLookupOnMySecondary(*cur);
if (cur->IsPending()) {
assert(cur->pending_cache == secondary_cache_.get());
my_pending.push_back(cur);
// Mark as "to be handled by this caller"
cur->pending_cache = nullptr;
}
}
}
// Wait on all lookups on my secondary cache
{
std::vector<SecondaryCacheResultHandle*> my_secondary_handles;
for (AsyncLookupHandle* cur : my_pending) {
my_secondary_handles.push_back(cur->pending_handle);
}
secondary_cache_->WaitAll(std::move(my_secondary_handles));
}
// Process results
for (AsyncLookupHandle* cur : my_pending) {
std::unique_ptr<SecondaryCacheResultHandle> secondary_handle(
cur->pending_handle);
cur->pending_handle = nullptr;
cur->result_handle = Promote(
std::move(secondary_handle), cur->key, cur->helper, cur->priority,
cur->stats, cur->found_dummy_entry, cur->kept_in_sec_cache);
assert(cur->pending_cache == nullptr);
}
}
std::string CacheWithSecondaryAdapter::GetPrintableOptions() const {
std::string str = target_->GetPrintableOptions();
str.append(" secondary_cache:\n");
str.append(secondary_cache_->GetPrintableOptions());
return str;
}
const char* CacheWithSecondaryAdapter::Name() const {
// To the user, at least for now, configure the underlying cache with
// a secondary cache. So we pretend to be that cache
return target_->Name();
}
std::shared_ptr<Cache> NewTieredVolatileCache(
TieredVolatileCacheOptions& opts) {
if (!opts.cache_opts) {
return nullptr;
}
std::shared_ptr<Cache> cache;
if (opts.cache_type == PrimaryCacheType::kCacheTypeLRU) {
LRUCacheOptions cache_opts =
*(static_cast_with_check<LRUCacheOptions, ShardedCacheOptions>(
opts.cache_opts));
cache_opts.capacity += opts.comp_cache_opts.capacity;
cache = cache_opts.MakeSharedCache();
} else if (opts.cache_type == PrimaryCacheType::kCacheTypeHCC) {
HyperClockCacheOptions cache_opts =
*(static_cast_with_check<HyperClockCacheOptions, ShardedCacheOptions>(
opts.cache_opts));
cache = cache_opts.MakeSharedCache();
} else {
return nullptr;
}
std::shared_ptr<SecondaryCache> sec_cache;
sec_cache = NewCompressedSecondaryCache(opts.comp_cache_opts);
return std::make_shared<CacheWithSecondaryAdapter>(cache, sec_cache, true);
}
} // namespace ROCKSDB_NAMESPACE

@ -0,0 +1,76 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include "cache/cache_reservation_manager.h"
#include "rocksdb/secondary_cache.h"
namespace ROCKSDB_NAMESPACE {
class CacheWithSecondaryAdapter : public CacheWrapper {
public:
explicit CacheWithSecondaryAdapter(
std::shared_ptr<Cache> target,
std::shared_ptr<SecondaryCache> secondary_cache,
bool distribute_cache_res = false);
~CacheWithSecondaryAdapter() override;
Status Insert(const Slice& key, ObjectPtr value,
const CacheItemHelper* helper, size_t charge,
Handle** handle = nullptr,
Priority priority = Priority::LOW) override;
Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
CreateContext* create_context,
Priority priority = Priority::LOW,
Statistics* stats = nullptr) override;
using Cache::Release;
bool Release(Handle* handle, bool erase_if_last_ref = false) override;
ObjectPtr Value(Handle* handle) override;
void StartAsyncLookup(AsyncLookupHandle& async_handle) override;
void WaitAll(AsyncLookupHandle* async_handles, size_t count) override;
std::string GetPrintableOptions() const override;
const char* Name() const override;
Cache* TEST_GetCache() { return target_.get(); }
SecondaryCache* TEST_GetSecondaryCache() { return secondary_cache_.get(); }
private:
bool EvictionHandler(const Slice& key, Handle* handle);
void StartAsyncLookupOnMySecondary(AsyncLookupHandle& async_handle);
Handle* Promote(
std::unique_ptr<SecondaryCacheResultHandle>&& secondary_handle,
const Slice& key, const CacheItemHelper* helper, Priority priority,
Statistics* stats, bool found_dummy_entry, bool kept_in_sec_cache);
bool ProcessDummyResult(Cache::Handle** handle, bool erase);
void CleanupCacheObject(ObjectPtr obj, const CacheItemHelper* helper);
std::shared_ptr<SecondaryCache> secondary_cache_;
// Whether to proportionally distribute cache memory reservations, i.e
// placeholder entries with null value and a non-zero charge, across
// the primary and secondary caches.
bool distribute_cache_res_;
// A cache reservation manager to keep track of secondary cache memory
// usage by reserving equivalent capacity against the primary cache
std::shared_ptr<ConcurrentCacheReservationManager> pri_cache_res_;
// Fraction of a cache memory reservation to be assigned to the secondary
// cache
double sec_cache_res_ratio_;
};
} // namespace ROCKSDB_NAMESPACE

@ -0,0 +1,137 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "cache/sharded_cache.h"
#include <algorithm>
#include <cstdint>
#include <memory>
#include "env/unique_id_gen.h"
#include "rocksdb/env.h"
#include "util/hash.h"
#include "util/math.h"
#include "util/mutexlock.h"
namespace ROCKSDB_NAMESPACE {
namespace {
// The generated seeds must fit in 31 bits so that
// ShardedCacheOptions::hash_seed can be set to it explicitly, for
// diagnostic/debugging purposes.
constexpr uint32_t kSeedMask = 0x7fffffff;
uint32_t DetermineSeed(int32_t hash_seed_option) {
if (hash_seed_option >= 0) {
// User-specified exact seed
return static_cast<uint32_t>(hash_seed_option);
}
static SemiStructuredUniqueIdGen gen;
if (hash_seed_option == ShardedCacheOptions::kHostHashSeed) {
std::string hostname;
Status s = Env::Default()->GetHostNameString(&hostname);
if (s.ok()) {
return GetSliceHash(hostname) & kSeedMask;
} else {
// Fall back on something stable within the process.
return BitwiseAnd(gen.GetBaseUpper(), kSeedMask);
}
} else {
// for kQuasiRandomHashSeed and fallback
uint32_t val = gen.GenerateNext<uint32_t>() & kSeedMask;
// Perform some 31-bit bijective transformations so that we get
// quasirandom, not just incrementing. (An incrementing seed from a
// random starting point would be fine, but hard to describe in a name.)
// See https://en.wikipedia.org/wiki/Quasirandom and using a murmur-like
// transformation here for our bijection in the lower 31 bits.
// See https://en.wikipedia.org/wiki/MurmurHash
val *= /*31-bit prime*/ 1150630961;
val ^= (val & kSeedMask) >> 17;
val *= /*31-bit prime*/ 1320603883;
return val & kSeedMask;
}
}
} // namespace
ShardedCacheBase::ShardedCacheBase(const ShardedCacheOptions& opts)
: Cache(opts.memory_allocator),
last_id_(1),
shard_mask_((uint32_t{1} << opts.num_shard_bits) - 1),
hash_seed_(DetermineSeed(opts.hash_seed)),
strict_capacity_limit_(opts.strict_capacity_limit),
capacity_(opts.capacity) {}
size_t ShardedCacheBase::ComputePerShardCapacity(size_t capacity) const {
uint32_t num_shards = GetNumShards();
return (capacity + (num_shards - 1)) / num_shards;
}
size_t ShardedCacheBase::GetPerShardCapacity() const {
return ComputePerShardCapacity(GetCapacity());
}
uint64_t ShardedCacheBase::NewId() {
return last_id_.fetch_add(1, std::memory_order_relaxed);
}
size_t ShardedCacheBase::GetCapacity() const {
MutexLock l(&config_mutex_);
return capacity_;
}
bool ShardedCacheBase::HasStrictCapacityLimit() const {
MutexLock l(&config_mutex_);
return strict_capacity_limit_;
}
size_t ShardedCacheBase::GetUsage(Handle* handle) const {
return GetCharge(handle);
}
std::string ShardedCacheBase::GetPrintableOptions() const {
std::string ret;
ret.reserve(20000);
const int kBufferSize = 200;
char buffer[kBufferSize];
{
MutexLock l(&config_mutex_);
snprintf(buffer, kBufferSize, " capacity : %" ROCKSDB_PRIszt "\n",
capacity_);
ret.append(buffer);
snprintf(buffer, kBufferSize, " num_shard_bits : %d\n",
GetNumShardBits());
ret.append(buffer);
snprintf(buffer, kBufferSize, " strict_capacity_limit : %d\n",
strict_capacity_limit_);
ret.append(buffer);
}
snprintf(buffer, kBufferSize, " memory_allocator : %s\n",
memory_allocator() ? memory_allocator()->Name() : "None");
ret.append(buffer);
AppendPrintableOptions(ret);
return ret;
}
int GetDefaultCacheShardBits(size_t capacity, size_t min_shard_size) {
int num_shard_bits = 0;
size_t num_shards = capacity / min_shard_size;
while (num_shards >>= 1) {
if (++num_shard_bits >= 6) {
// No more than 6.
return num_shard_bits;
}
}
return num_shard_bits;
}
int ShardedCacheBase::GetNumShardBits() const {
return BitsSetToOne(shard_mask_);
}
uint32_t ShardedCacheBase::GetNumShards() const { return shard_mask_ + 1; }
} // namespace ROCKSDB_NAMESPACE

@ -0,0 +1,309 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <atomic>
#include <cstdint>
#include <string>
#include "port/lang.h"
#include "port/port.h"
#include "rocksdb/advanced_cache.h"
#include "util/hash.h"
#include "util/mutexlock.h"
namespace ROCKSDB_NAMESPACE {
// Optional base class for classes implementing the CacheShard concept
class CacheShardBase {
public:
explicit CacheShardBase(CacheMetadataChargePolicy metadata_charge_policy)
: metadata_charge_policy_(metadata_charge_policy) {}
using DeleterFn = Cache::DeleterFn;
// Expected by concept CacheShard (TODO with C++20 support)
// Some Defaults
std::string GetPrintableOptions() const { return ""; }
using HashVal = uint64_t;
using HashCref = uint64_t;
static inline HashVal ComputeHash(const Slice& key, uint32_t seed) {
return GetSliceNPHash64(key, seed);
}
static inline uint32_t HashPieceForSharding(HashCref hash) {
return Lower32of64(hash);
}
void AppendPrintableOptions(std::string& /*str*/) const {}
// Must be provided for concept CacheShard (TODO with C++20 support)
/*
struct HandleImpl { // for concept HandleImpl
HashVal hash;
HashCref GetHash() const;
...
};
Status Insert(const Slice& key, HashCref hash, Cache::ObjectPtr value,
const Cache::CacheItemHelper* helper, size_t charge,
HandleImpl** handle, Cache::Priority priority,
bool standalone) = 0;
Handle* CreateStandalone(const Slice& key, HashCref hash, ObjectPtr obj,
const CacheItemHelper* helper,
size_t charge, bool allow_uncharged) = 0;
HandleImpl* Lookup(const Slice& key, HashCref hash,
const Cache::CacheItemHelper* helper,
Cache::CreateContext* create_context,
Cache::Priority priority,
Statistics* stats) = 0;
bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref) = 0;
bool Ref(HandleImpl* handle) = 0;
void Erase(const Slice& key, HashCref hash) = 0;
void SetCapacity(size_t capacity) = 0;
void SetStrictCapacityLimit(bool strict_capacity_limit) = 0;
size_t GetUsage() const = 0;
size_t GetPinnedUsage() const = 0;
size_t GetOccupancyCount() const = 0;
size_t GetTableAddressCount() const = 0;
// Handles iterating over roughly `average_entries_per_lock` entries, using
// `state` to somehow record where it last ended up. Caller initially uses
// *state == 0 and implementation sets *state = SIZE_MAX to indicate
// completion.
void ApplyToSomeEntries(
const std::function<void(const Slice& key, ObjectPtr value,
size_t charge,
const Cache::CacheItemHelper* helper)>& callback,
size_t average_entries_per_lock, size_t* state) = 0;
void EraseUnRefEntries() = 0;
*/
protected:
const CacheMetadataChargePolicy metadata_charge_policy_;
};
// Portions of ShardedCache that do not depend on the template parameter
class ShardedCacheBase : public Cache {
public:
explicit ShardedCacheBase(const ShardedCacheOptions& opts);
virtual ~ShardedCacheBase() = default;
int GetNumShardBits() const;
uint32_t GetNumShards() const;
uint64_t NewId() override;
bool HasStrictCapacityLimit() const override;
size_t GetCapacity() const override;
using Cache::GetUsage;
size_t GetUsage(Handle* handle) const override;
std::string GetPrintableOptions() const override;
uint32_t GetHashSeed() const override { return hash_seed_; }
protected: // fns
virtual void AppendPrintableOptions(std::string& str) const = 0;
size_t GetPerShardCapacity() const;
size_t ComputePerShardCapacity(size_t capacity) const;
protected: // data
std::atomic<uint64_t> last_id_; // For NewId
const uint32_t shard_mask_;
const uint32_t hash_seed_;
// Dynamic configuration parameters, guarded by config_mutex_
bool strict_capacity_limit_;
size_t capacity_;
mutable port::Mutex config_mutex_;
};
// Generic cache interface that shards cache by hash of keys. 2^num_shard_bits
// shards will be created, with capacity split evenly to each of the shards.
// Keys are typically sharded by the lowest num_shard_bits bits of hash value
// so that the upper bits of the hash value can keep a stable ordering of
// table entries even as the table grows (using more upper hash bits).
// See CacheShardBase above for what is expected of the CacheShard parameter.
template <class CacheShard>
class ShardedCache : public ShardedCacheBase {
public:
using HashVal = typename CacheShard::HashVal;
using HashCref = typename CacheShard::HashCref;
using HandleImpl = typename CacheShard::HandleImpl;
explicit ShardedCache(const ShardedCacheOptions& opts)
: ShardedCacheBase(opts),
shards_(reinterpret_cast<CacheShard*>(port::cacheline_aligned_alloc(
sizeof(CacheShard) * GetNumShards()))),
destroy_shards_in_dtor_(false) {}
virtual ~ShardedCache() {
if (destroy_shards_in_dtor_) {
ForEachShard([](CacheShard* cs) { cs->~CacheShard(); });
}
port::cacheline_aligned_free(shards_);
}
CacheShard& GetShard(HashCref hash) {
return shards_[CacheShard::HashPieceForSharding(hash) & shard_mask_];
}
const CacheShard& GetShard(HashCref hash) const {
return shards_[CacheShard::HashPieceForSharding(hash) & shard_mask_];
}
void SetCapacity(size_t capacity) override {
MutexLock l(&config_mutex_);
capacity_ = capacity;
auto per_shard = ComputePerShardCapacity(capacity);
ForEachShard([=](CacheShard* cs) { cs->SetCapacity(per_shard); });
}
void SetStrictCapacityLimit(bool s_c_l) override {
MutexLock l(&config_mutex_);
strict_capacity_limit_ = s_c_l;
ForEachShard(
[s_c_l](CacheShard* cs) { cs->SetStrictCapacityLimit(s_c_l); });
}
Status Insert(const Slice& key, ObjectPtr obj, const CacheItemHelper* helper,
size_t charge, Handle** handle = nullptr,
Priority priority = Priority::LOW) override {
assert(helper);
HashVal hash = CacheShard::ComputeHash(key, hash_seed_);
auto h_out = reinterpret_cast<HandleImpl**>(handle);
return GetShard(hash).Insert(key, hash, obj, helper, charge, h_out,
priority);
}
Handle* CreateStandalone(const Slice& key, ObjectPtr obj,
const CacheItemHelper* helper, size_t charge,
bool allow_uncharged) override {
assert(helper);
HashVal hash = CacheShard::ComputeHash(key, hash_seed_);
HandleImpl* result = GetShard(hash).CreateStandalone(
key, hash, obj, helper, charge, allow_uncharged);
return reinterpret_cast<Handle*>(result);
}
Handle* Lookup(const Slice& key, const CacheItemHelper* helper = nullptr,
CreateContext* create_context = nullptr,
Priority priority = Priority::LOW,
Statistics* stats = nullptr) override {
HashVal hash = CacheShard::ComputeHash(key, hash_seed_);
HandleImpl* result = GetShard(hash).Lookup(key, hash, helper,
create_context, priority, stats);
return reinterpret_cast<Handle*>(result);
}
void Erase(const Slice& key) override {
HashVal hash = CacheShard::ComputeHash(key, hash_seed_);
GetShard(hash).Erase(key, hash);
}
bool Release(Handle* handle, bool useful,
bool erase_if_last_ref = false) override {
auto h = reinterpret_cast<HandleImpl*>(handle);
return GetShard(h->GetHash()).Release(h, useful, erase_if_last_ref);
}
bool Ref(Handle* handle) override {
auto h = reinterpret_cast<HandleImpl*>(handle);
return GetShard(h->GetHash()).Ref(h);
}
bool Release(Handle* handle, bool erase_if_last_ref = false) override {
return Release(handle, true /*useful*/, erase_if_last_ref);
}
using ShardedCacheBase::GetUsage;
size_t GetUsage() const override {
return SumOverShards2(&CacheShard::GetUsage);
}
size_t GetPinnedUsage() const override {
return SumOverShards2(&CacheShard::GetPinnedUsage);
}
size_t GetOccupancyCount() const override {
return SumOverShards2(&CacheShard::GetOccupancyCount);
}
size_t GetTableAddressCount() const override {
return SumOverShards2(&CacheShard::GetTableAddressCount);
}
void ApplyToAllEntries(
const std::function<void(const Slice& key, ObjectPtr value, size_t charge,
const CacheItemHelper* helper)>& callback,
const ApplyToAllEntriesOptions& opts) override {
uint32_t num_shards = GetNumShards();
// Iterate over part of each shard, rotating between shards, to
// minimize impact on latency of concurrent operations.
std::unique_ptr<size_t[]> states(new size_t[num_shards]{});
size_t aepl = opts.average_entries_per_lock;
aepl = std::min(aepl, size_t{1});
bool remaining_work;
do {
remaining_work = false;
for (uint32_t i = 0; i < num_shards; i++) {
if (states[i] != SIZE_MAX) {
shards_[i].ApplyToSomeEntries(callback, aepl, &states[i]);
remaining_work |= states[i] != SIZE_MAX;
}
}
} while (remaining_work);
}
virtual void EraseUnRefEntries() override {
ForEachShard([](CacheShard* cs) { cs->EraseUnRefEntries(); });
}
void DisownData() override {
// Leak data only if that won't generate an ASAN/valgrind warning.
if (!kMustFreeHeapAllocations) {
destroy_shards_in_dtor_ = false;
}
}
protected:
inline void ForEachShard(const std::function<void(CacheShard*)>& fn) {
uint32_t num_shards = GetNumShards();
for (uint32_t i = 0; i < num_shards; i++) {
fn(shards_ + i);
}
}
inline size_t SumOverShards(
const std::function<size_t(CacheShard&)>& fn) const {
uint32_t num_shards = GetNumShards();
size_t result = 0;
for (uint32_t i = 0; i < num_shards; i++) {
result += fn(shards_[i]);
}
return result;
}
inline size_t SumOverShards2(size_t (CacheShard::*fn)() const) const {
return SumOverShards([fn](CacheShard& cs) { return (cs.*fn)(); });
}
// Must be called exactly once by derived class constructor
void InitShards(const std::function<void(CacheShard*)>& placement_new) {
ForEachShard(placement_new);
destroy_shards_in_dtor_ = true;
}
void AppendPrintableOptions(std::string& str) const override {
shards_[0].AppendPrintableOptions(str);
}
private:
CacheShard* const shards_;
bool destroy_shards_in_dtor_;
};
// 512KB is traditional minimum shard size.
int GetDefaultCacheShardBits(size_t capacity,
size_t min_shard_size = 512U * 1024U);
} // namespace ROCKSDB_NAMESPACE

375
cache/typed_cache.h vendored

@ -0,0 +1,375 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
// APIs for accessing Cache in a type-safe and convenient way. Cache is kept
// at a low, thin level of abstraction so that different implementations can
// be plugged in, but these wrappers provide clean, convenient access to the
// most common operations.
//
// A number of template classes are needed for sharing common structure. The
// key classes are these:
//
// * PlaceholderCacheInterface - Used for making cache reservations, with
// entries that have a charge but no value.
// * BasicTypedCacheInterface<TValue> - Used for primary cache storage of
// objects of type TValue.
// * FullTypedCacheHelper<TValue, TCreateContext> - Used for secondary cache
// compatible storage of objects of type TValue.
// * For each of these, there's a "Shared" version
// (e.g. FullTypedSharedCacheInterface) that holds a shared_ptr to the Cache,
// rather than assuming external ownership by holding only a raw `Cache*`.
#pragma once
#include <algorithm>
#include <cstdint>
#include <memory>
#include <type_traits>
#include "cache/cache_helpers.h"
#include "rocksdb/advanced_cache.h"
#include "rocksdb/advanced_options.h"
namespace ROCKSDB_NAMESPACE {
// For future consideration:
// * Pass in value to Insert with std::unique_ptr& to simplify ownership
// transfer logic in callers
// * Make key type a template parameter (e.g. useful for table cache)
// * Closer integration with CacheHandleGuard (opt-in, so not always
// paying the extra overhead)
#define CACHE_TYPE_DEFS() \
using Priority = Cache::Priority; \
using Handle = Cache::Handle; \
using ObjectPtr = Cache::ObjectPtr; \
using CreateContext = Cache::CreateContext; \
using CacheItemHelper = Cache::CacheItemHelper /* caller ; */
template <typename CachePtr>
class BaseCacheInterface {
public:
CACHE_TYPE_DEFS();
/*implicit*/ BaseCacheInterface(CachePtr cache) : cache_(std::move(cache)) {}
inline void Release(Handle* handle) { cache_->Release(handle); }
inline void ReleaseAndEraseIfLastRef(Handle* handle) {
cache_->Release(handle, /*erase_if_last_ref*/ true);
}
inline void RegisterReleaseAsCleanup(Handle* handle, Cleanable& cleanable) {
cleanable.RegisterCleanup(&ReleaseCacheHandleCleanup, get(), handle);
}
inline Cache* get() const { return &*cache_; }
explicit inline operator bool() const noexcept { return cache_ != nullptr; }
protected:
CachePtr cache_;
};
// PlaceholderCacheInterface - Used for making cache reservations, with
// entries that have a charge but no value. CacheEntryRole is required as
// a template parameter.
template <CacheEntryRole kRole, typename CachePtr = Cache*>
class PlaceholderCacheInterface : public BaseCacheInterface<CachePtr> {
public:
CACHE_TYPE_DEFS();
using BaseCacheInterface<CachePtr>::BaseCacheInterface;
inline Status Insert(const Slice& key, size_t charge, Handle** handle) {
return this->cache_->Insert(key, /*value=*/nullptr, GetHelper(), charge,
handle);
}
static const Cache::CacheItemHelper* GetHelper() {
static const Cache::CacheItemHelper kHelper{kRole};
return &kHelper;
}
};
template <CacheEntryRole kRole>
using PlaceholderSharedCacheInterface =
PlaceholderCacheInterface<kRole, std::shared_ptr<Cache>>;
template <class TValue>
class BasicTypedCacheHelperFns {
public:
CACHE_TYPE_DEFS();
// E.g. char* for char[]
using TValuePtr = std::remove_extent_t<TValue>*;
protected:
inline static ObjectPtr UpCastValue(TValuePtr value) { return value; }
inline static TValuePtr DownCastValue(ObjectPtr value) {
return static_cast<TValuePtr>(value);
}
static void Delete(ObjectPtr value, MemoryAllocator* allocator) {
// FIXME: Currently, no callers actually allocate the ObjectPtr objects
// using the custom allocator, just subobjects that keep a reference to
// the allocator themselves (with CacheAllocationPtr).
if (/*DISABLED*/ false && allocator) {
if constexpr (std::is_destructible_v<TValue>) {
DownCastValue(value)->~TValue();
}
allocator->Deallocate(value);
} else {
// Like delete but properly handles TValue=char[] etc.
std::default_delete<TValue>{}(DownCastValue(value));
}
}
};
// In its own class to try to minimize the number of distinct CacheItemHelper
// instances (e.g. don't vary by CachePtr)
template <class TValue, CacheEntryRole kRole>
class BasicTypedCacheHelper : public BasicTypedCacheHelperFns<TValue> {
public:
static const Cache::CacheItemHelper* GetBasicHelper() {
static const Cache::CacheItemHelper kHelper{kRole,
&BasicTypedCacheHelper::Delete};
return &kHelper;
}
};
// BasicTypedCacheInterface - Used for primary cache storage of objects of
// type TValue, which can be cleaned up with std::default_delete<TValue>. The
// role is provided by TValue::kCacheEntryRole or given in an optional
// template parameter.
template <class TValue, CacheEntryRole kRole = TValue::kCacheEntryRole,
typename CachePtr = Cache*>
class BasicTypedCacheInterface : public BaseCacheInterface<CachePtr>,
public BasicTypedCacheHelper<TValue, kRole> {
public:
CACHE_TYPE_DEFS();
using typename BasicTypedCacheHelperFns<TValue>::TValuePtr;
struct TypedHandle : public Handle {};
using BasicTypedCacheHelper<TValue, kRole>::GetBasicHelper;
// ctor
using BaseCacheInterface<CachePtr>::BaseCacheInterface;
struct TypedAsyncLookupHandle : public Cache::AsyncLookupHandle {
TypedHandle* Result() {
return reinterpret_cast<TypedHandle*>(Cache::AsyncLookupHandle::Result());
}
};
inline Status Insert(const Slice& key, TValuePtr value, size_t charge,
TypedHandle** handle = nullptr,
Priority priority = Priority::LOW) {
auto untyped_handle = reinterpret_cast<Handle**>(handle);
return this->cache_->Insert(
key, BasicTypedCacheHelperFns<TValue>::UpCastValue(value),
GetBasicHelper(), charge, untyped_handle, priority);
}
inline TypedHandle* Lookup(const Slice& key, Statistics* stats = nullptr) {
return reinterpret_cast<TypedHandle*>(
this->cache_->BasicLookup(key, stats));
}
inline void StartAsyncLookup(TypedAsyncLookupHandle& async_handle) {
assert(async_handle.helper == nullptr);
this->cache_->StartAsyncLookup(async_handle);
}
inline CacheHandleGuard<TValue> Guard(TypedHandle* handle) {
if (handle) {
return CacheHandleGuard<TValue>(&*this->cache_, handle);
} else {
return {};
}
}
inline std::shared_ptr<TValue> SharedGuard(TypedHandle* handle) {
if (handle) {
return MakeSharedCacheHandleGuard<TValue>(&*this->cache_, handle);
} else {
return {};
}
}
inline TValuePtr Value(TypedHandle* handle) {
return BasicTypedCacheHelperFns<TValue>::DownCastValue(
this->cache_->Value(handle));
}
};
// BasicTypedSharedCacheInterface - Like BasicTypedCacheInterface but with a
// shared_ptr<Cache> for keeping Cache alive.
template <class TValue, CacheEntryRole kRole = TValue::kCacheEntryRole>
using BasicTypedSharedCacheInterface =
BasicTypedCacheInterface<TValue, kRole, std::shared_ptr<Cache>>;
// TValue must implement ContentSlice() and ~TValue
// TCreateContext must implement Create(std::unique_ptr<TValue>*, ...)
template <class TValue, class TCreateContext>
class FullTypedCacheHelperFns : public BasicTypedCacheHelperFns<TValue> {
public:
CACHE_TYPE_DEFS();
protected:
using typename BasicTypedCacheHelperFns<TValue>::TValuePtr;
using BasicTypedCacheHelperFns<TValue>::DownCastValue;
using BasicTypedCacheHelperFns<TValue>::UpCastValue;
static size_t Size(ObjectPtr v) {
TValuePtr value = DownCastValue(v);
auto slice = value->ContentSlice();
return slice.size();
}
static Status SaveTo(ObjectPtr v, size_t from_offset, size_t length,
char* out) {
TValuePtr value = DownCastValue(v);
auto slice = value->ContentSlice();
assert(from_offset < slice.size());
assert(from_offset + length <= slice.size());
std::copy_n(slice.data() + from_offset, length, out);
return Status::OK();
}
static Status Create(const Slice& data, CreateContext* context,
MemoryAllocator* allocator, ObjectPtr* out_obj,
size_t* out_charge) {
std::unique_ptr<TValue> value = nullptr;
if constexpr (sizeof(TCreateContext) > 0) {
TCreateContext* tcontext = static_cast<TCreateContext*>(context);
tcontext->Create(&value, out_charge, data, allocator);
} else {
TCreateContext::Create(&value, out_charge, data, allocator);
}
*out_obj = UpCastValue(value.release());
return Status::OK();
}
};
// In its own class to try to minimize the number of distinct CacheItemHelper
// instances (e.g. don't vary by CachePtr)
template <class TValue, class TCreateContext, CacheEntryRole kRole>
class FullTypedCacheHelper
: public FullTypedCacheHelperFns<TValue, TCreateContext> {
public:
static const Cache::CacheItemHelper* GetFullHelper() {
static const Cache::CacheItemHelper kHelper{
kRole,
&FullTypedCacheHelper::Delete,
&FullTypedCacheHelper::Size,
&FullTypedCacheHelper::SaveTo,
&FullTypedCacheHelper::Create,
BasicTypedCacheHelper<TValue, kRole>::GetBasicHelper()};
return &kHelper;
}
};
// FullTypedCacheHelper - Used for secondary cache compatible storage of
// objects of type TValue. In addition to BasicTypedCacheInterface constraints,
// we require TValue::ContentSlice() to return persistable data. This
// simplifies usage for the normal case of simple secondary cache compatibility
// (can give you a Slice to the data already in memory). In addition to
// TCreateContext performing the role of Cache::CreateContext, it is also
// expected to provide a function Create(std::unique_ptr<TValue>* value,
// size_t* out_charge, const Slice& data, MemoryAllocator* allocator) for
// creating new TValue.
template <class TValue, class TCreateContext,
CacheEntryRole kRole = TValue::kCacheEntryRole,
typename CachePtr = Cache*>
class FullTypedCacheInterface
: public BasicTypedCacheInterface<TValue, kRole, CachePtr>,
public FullTypedCacheHelper<TValue, TCreateContext, kRole> {
public:
CACHE_TYPE_DEFS();
using typename BasicTypedCacheInterface<TValue, kRole, CachePtr>::TypedHandle;
using typename BasicTypedCacheInterface<TValue, kRole,
CachePtr>::TypedAsyncLookupHandle;
using typename BasicTypedCacheHelperFns<TValue>::TValuePtr;
using BasicTypedCacheHelper<TValue, kRole>::GetBasicHelper;
using FullTypedCacheHelper<TValue, TCreateContext, kRole>::GetFullHelper;
using BasicTypedCacheHelperFns<TValue>::UpCastValue;
using BasicTypedCacheHelperFns<TValue>::DownCastValue;
// ctor
using BasicTypedCacheInterface<TValue, kRole,
CachePtr>::BasicTypedCacheInterface;
// Insert with SecondaryCache compatibility (subject to CacheTier).
// (Basic Insert() also inherited.)
inline Status InsertFull(
const Slice& key, TValuePtr value, size_t charge,
TypedHandle** handle = nullptr, Priority priority = Priority::LOW,
CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier) {
auto untyped_handle = reinterpret_cast<Handle**>(handle);
auto helper = lowest_used_cache_tier == CacheTier::kNonVolatileBlockTier
? GetFullHelper()
: GetBasicHelper();
return this->cache_->Insert(key, UpCastValue(value), helper, charge,
untyped_handle, priority);
}
// Like SecondaryCache::InsertSaved, with SecondaryCache compatibility
// (subject to CacheTier).
inline Status InsertSaved(
const Slice& key, const Slice& data, TCreateContext* create_context,
Priority priority = Priority::LOW,
CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier,
size_t* out_charge = nullptr) {
ObjectPtr value;
size_t charge;
Status st = GetFullHelper()->create_cb(data, create_context,
this->cache_->memory_allocator(),
&value, &charge);
if (out_charge) {
*out_charge = charge;
}
if (st.ok()) {
st = InsertFull(key, DownCastValue(value), charge, nullptr /*handle*/,
priority, lowest_used_cache_tier);
} else {
GetFullHelper()->del_cb(value, this->cache_->memory_allocator());
}
return st;
}
// Lookup with SecondaryCache support (subject to CacheTier).
// (Basic Lookup() also inherited.)
inline TypedHandle* LookupFull(
const Slice& key, TCreateContext* create_context = nullptr,
Priority priority = Priority::LOW, Statistics* stats = nullptr,
CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier) {
if (lowest_used_cache_tier == CacheTier::kNonVolatileBlockTier) {
return reinterpret_cast<TypedHandle*>(this->cache_->Lookup(
key, GetFullHelper(), create_context, priority, stats));
} else {
return BasicTypedCacheInterface<TValue, kRole, CachePtr>::Lookup(key,
stats);
}
}
inline void StartAsyncLookupFull(
TypedAsyncLookupHandle& async_handle,
CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier) {
if (lowest_used_cache_tier == CacheTier::kNonVolatileBlockTier) {
async_handle.helper = GetFullHelper();
this->cache_->StartAsyncLookup(async_handle);
} else {
BasicTypedCacheInterface<TValue, kRole, CachePtr>::StartAsyncLookup(
async_handle);
}
}
};
// FullTypedSharedCacheInterface - Like FullTypedCacheInterface but with a
// shared_ptr<Cache> for keeping Cache alive.
template <class TValue, class TCreateContext,
CacheEntryRole kRole = TValue::kCacheEntryRole>
using FullTypedSharedCacheInterface =
FullTypedCacheInterface<TValue, TCreateContext, kRole,
std::shared_ptr<Cache>>;
#undef CACHE_TYPE_DEFS
} // namespace ROCKSDB_NAMESPACE

@ -0,0 +1,54 @@
@PACKAGE_INIT@
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/modules")
include(CMakeFindDependencyMacro)
set(GFLAGS_USE_TARGET_NAMESPACE @GFLAGS_USE_TARGET_NAMESPACE@)
if(@WITH_JEMALLOC@)
find_dependency(JeMalloc)
endif()
if(@WITH_GFLAGS@)
find_dependency(gflags CONFIG)
if(NOT gflags_FOUND)
find_dependency(gflags)
endif()
endif()
if(@WITH_SNAPPY@)
find_dependency(Snappy CONFIG)
if(NOT Snappy_FOUND)
find_dependency(Snappy)
endif()
endif()
if(@WITH_ZLIB@)
find_dependency(ZLIB)
endif()
if(@WITH_BZ2@)
find_dependency(BZip2)
endif()
if(@WITH_LZ4@)
find_dependency(lz4)
endif()
if(@WITH_ZSTD@)
find_dependency(zstd)
endif()
if(@WITH_NUMA@)
find_dependency(NUMA)
endif()
if(@WITH_TBB@)
find_dependency(TBB)
endif()
find_dependency(Threads)
include("${CMAKE_CURRENT_LIST_DIR}/RocksDBTargets.cmake")
check_required_components(RocksDB)

@ -0,0 +1,7 @@
macro(get_cxx_std_flags FLAGS_VARIABLE)
if( CMAKE_CXX_STANDARD_REQUIRED )
set(${FLAGS_VARIABLE} ${CMAKE_CXX${CMAKE_CXX_STANDARD}_STANDARD_COMPILE_OPTION})
else()
set(${FLAGS_VARIABLE} ${CMAKE_CXX${CMAKE_CXX_STANDARD}_EXTENSION_COMPILE_OPTION})
endif()
endmacro()

@ -0,0 +1,29 @@
# - Find JeMalloc library
# Find the native JeMalloc includes and library
#
# JeMalloc_INCLUDE_DIRS - where to find jemalloc.h, etc.
# JeMalloc_LIBRARIES - List of libraries when using jemalloc.
# JeMalloc_FOUND - True if jemalloc found.
find_path(JeMalloc_INCLUDE_DIRS
NAMES jemalloc/jemalloc.h
HINTS ${JEMALLOC_ROOT_DIR}/include)
find_library(JeMalloc_LIBRARIES
NAMES jemalloc
HINTS ${JEMALLOC_ROOT_DIR}/lib)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(JeMalloc DEFAULT_MSG JeMalloc_LIBRARIES JeMalloc_INCLUDE_DIRS)
mark_as_advanced(
JeMalloc_LIBRARIES
JeMalloc_INCLUDE_DIRS)
if(JeMalloc_FOUND AND NOT (TARGET JeMalloc::JeMalloc))
add_library (JeMalloc::JeMalloc UNKNOWN IMPORTED)
set_target_properties(JeMalloc::JeMalloc
PROPERTIES
IMPORTED_LOCATION ${JeMalloc_LIBRARIES}
INTERFACE_INCLUDE_DIRECTORIES ${JeMalloc_INCLUDE_DIRS})
endif()

@ -0,0 +1,29 @@
# - Find NUMA
# Find the NUMA library and includes
#
# NUMA_INCLUDE_DIRS - where to find numa.h, etc.
# NUMA_LIBRARIES - List of libraries when using NUMA.
# NUMA_FOUND - True if NUMA found.
find_path(NUMA_INCLUDE_DIRS
NAMES numa.h numaif.h
HINTS ${NUMA_ROOT_DIR}/include)
find_library(NUMA_LIBRARIES
NAMES numa
HINTS ${NUMA_ROOT_DIR}/lib)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NUMA DEFAULT_MSG NUMA_LIBRARIES NUMA_INCLUDE_DIRS)
mark_as_advanced(
NUMA_LIBRARIES
NUMA_INCLUDE_DIRS)
if(NUMA_FOUND AND NOT (TARGET NUMA::NUMA))
add_library (NUMA::NUMA UNKNOWN IMPORTED)
set_target_properties(NUMA::NUMA
PROPERTIES
IMPORTED_LOCATION ${NUMA_LIBRARIES}
INTERFACE_INCLUDE_DIRECTORIES ${NUMA_INCLUDE_DIRS})
endif()

@ -0,0 +1,29 @@
# - Find Snappy
# Find the snappy compression library and includes
#
# Snappy_INCLUDE_DIRS - where to find snappy.h, etc.
# Snappy_LIBRARIES - List of libraries when using snappy.
# Snappy_FOUND - True if snappy found.
find_path(Snappy_INCLUDE_DIRS
NAMES snappy.h
HINTS ${snappy_ROOT_DIR}/include)
find_library(Snappy_LIBRARIES
NAMES snappy
HINTS ${snappy_ROOT_DIR}/lib)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(Snappy DEFAULT_MSG Snappy_LIBRARIES Snappy_INCLUDE_DIRS)
mark_as_advanced(
Snappy_LIBRARIES
Snappy_INCLUDE_DIRS)
if(Snappy_FOUND AND NOT (TARGET Snappy::snappy))
add_library (Snappy::snappy UNKNOWN IMPORTED)
set_target_properties(Snappy::snappy
PROPERTIES
IMPORTED_LOCATION ${Snappy_LIBRARIES}
INTERFACE_INCLUDE_DIRECTORIES ${Snappy_INCLUDE_DIRS})
endif()

@ -0,0 +1,33 @@
# - Find TBB
# Find the Thread Building Blocks library and includes
#
# TBB_INCLUDE_DIRS - where to find tbb.h, etc.
# TBB_LIBRARIES - List of libraries when using TBB.
# TBB_FOUND - True if TBB found.
if(NOT DEFINED TBB_ROOT_DIR)
set(TBB_ROOT_DIR "$ENV{TBBROOT}")
endif()
find_path(TBB_INCLUDE_DIRS
NAMES tbb/tbb.h
HINTS ${TBB_ROOT_DIR}/include)
find_library(TBB_LIBRARIES
NAMES tbb
HINTS ${TBB_ROOT_DIR}/lib ENV LIBRARY_PATH)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(TBB DEFAULT_MSG TBB_LIBRARIES TBB_INCLUDE_DIRS)
mark_as_advanced(
TBB_LIBRARIES
TBB_INCLUDE_DIRS)
if(TBB_FOUND AND NOT (TARGET TBB::TBB))
add_library (TBB::TBB UNKNOWN IMPORTED)
set_target_properties(TBB::TBB
PROPERTIES
IMPORTED_LOCATION ${TBB_LIBRARIES}
INTERFACE_INCLUDE_DIRECTORIES ${TBB_INCLUDE_DIRS})
endif()

@ -0,0 +1,29 @@
# - Find gflags library
# Find the gflags includes and library
#
# GFLAGS_INCLUDE_DIR - where to find gflags.h.
# GFLAGS_LIBRARIES - List of libraries when using gflags.
# gflags_FOUND - True if gflags found.
find_path(GFLAGS_INCLUDE_DIR
NAMES gflags/gflags.h)
find_library(GFLAGS_LIBRARIES
NAMES gflags)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(gflags
DEFAULT_MSG GFLAGS_LIBRARIES GFLAGS_INCLUDE_DIR)
mark_as_advanced(
GFLAGS_LIBRARIES
GFLAGS_INCLUDE_DIR)
if(gflags_FOUND AND NOT (TARGET gflags::gflags))
add_library(gflags::gflags UNKNOWN IMPORTED)
set_target_properties(gflags::gflags
PROPERTIES
IMPORTED_LOCATION ${GFLAGS_LIBRARIES}
INTERFACE_INCLUDE_DIRECTORIES ${GFLAGS_INCLUDE_DIR}
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX")
endif()

@ -0,0 +1,29 @@
# - Find Lz4
# Find the lz4 compression library and includes
#
# lz4_INCLUDE_DIRS - where to find lz4.h, etc.
# lz4_LIBRARIES - List of libraries when using lz4.
# lz4_FOUND - True if lz4 found.
find_path(lz4_INCLUDE_DIRS
NAMES lz4.h
HINTS ${lz4_ROOT_DIR}/include)
find_library(lz4_LIBRARIES
NAMES lz4
HINTS ${lz4_ROOT_DIR}/lib)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(lz4 DEFAULT_MSG lz4_LIBRARIES lz4_INCLUDE_DIRS)
mark_as_advanced(
lz4_LIBRARIES
lz4_INCLUDE_DIRS)
if(lz4_FOUND AND NOT (TARGET lz4::lz4))
add_library(lz4::lz4 UNKNOWN IMPORTED)
set_target_properties(lz4::lz4
PROPERTIES
IMPORTED_LOCATION ${lz4_LIBRARIES}
INTERFACE_INCLUDE_DIRECTORIES ${lz4_INCLUDE_DIRS})
endif()

@ -0,0 +1,26 @@
# - Find liburing
#
# uring_INCLUDE_DIR - Where to find liburing.h
# uring_LIBRARIES - List of libraries when using uring.
# uring_FOUND - True if uring found.
find_path(uring_INCLUDE_DIR
NAMES liburing.h)
find_library(uring_LIBRARIES
NAMES liburing.a liburing)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(uring
DEFAULT_MSG uring_LIBRARIES uring_INCLUDE_DIR)
mark_as_advanced(
uring_INCLUDE_DIR
uring_LIBRARIES)
if(uring_FOUND AND NOT TARGET uring::uring)
add_library(uring::uring UNKNOWN IMPORTED)
set_target_properties(uring::uring PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${uring_INCLUDE_DIR}"
IMPORTED_LINK_INTERFACE_LANGUAGES "C"
IMPORTED_LOCATION "${uring_LIBRARIES}")
endif()

@ -0,0 +1,29 @@
# - Find zstd
# Find the zstd compression library and includes
#
# zstd_INCLUDE_DIRS - where to find zstd.h, etc.
# zstd_LIBRARIES - List of libraries when using zstd.
# zstd_FOUND - True if zstd found.
find_path(zstd_INCLUDE_DIRS
NAMES zstd.h
HINTS ${zstd_ROOT_DIR}/include)
find_library(zstd_LIBRARIES
NAMES zstd
HINTS ${zstd_ROOT_DIR}/lib)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(zstd DEFAULT_MSG zstd_LIBRARIES zstd_INCLUDE_DIRS)
mark_as_advanced(
zstd_LIBRARIES
zstd_INCLUDE_DIRS)
if(zstd_FOUND AND NOT (TARGET zstd::zstd))
add_library (zstd::zstd UNKNOWN IMPORTED)
set_target_properties(zstd::zstd
PROPERTIES
IMPORTED_LOCATION ${zstd_LIBRARIES}
INTERFACE_INCLUDE_DIRECTORIES ${zstd_INCLUDE_DIRS})
endif()

@ -0,0 +1,10 @@
# Read rocksdb version from version.h header file.
function(get_rocksdb_version version_var)
file(READ "${CMAKE_CURRENT_SOURCE_DIR}/include/rocksdb/version.h" version_header_file)
foreach(component MAJOR MINOR PATCH)
string(REGEX MATCH "#define ROCKSDB_${component} ([0-9]+)" _ ${version_header_file})
set(ROCKSDB_VERSION_${component} ${CMAKE_MATCH_1})
endforeach()
set(${version_var} "${ROCKSDB_VERSION_MAJOR}.${ROCKSDB_VERSION_MINOR}.${ROCKSDB_VERSION_PATCH}" PARENT_SCOPE)
endfunction()

@ -0,0 +1,30 @@
ifndef PYTHON
# Default to python3. Some distros like CentOS 8 do not have `python`.
ifeq ($(origin PYTHON), undefined)
PYTHON := $(shell which python3 || which python || echo python3)
endif
export PYTHON
endif
# To setup tmp directory, first recognize some old variables for setting
# test tmp directory or base tmp directory. TEST_TMPDIR is usually read
# by RocksDB tools though Env/FileSystem::GetTestDirectory.
ifeq ($(TEST_TMPDIR),)
TEST_TMPDIR := $(TMPD)
endif
ifeq ($(TEST_TMPDIR),)
ifeq ($(BASE_TMPDIR),)
BASE_TMPDIR :=$(TMPDIR)
endif
ifeq ($(BASE_TMPDIR),)
BASE_TMPDIR :=/tmp
endif
# Use /dev/shm if it has the sticky bit set (otherwise, /tmp or other
# base dir), and create a randomly-named rocksdb.XXXX directory therein.
TEST_TMPDIR := $(shell f=/dev/shm; test -k $$f || f=$(BASE_TMPDIR); \
perl -le 'use File::Temp "tempdir";' \
-e 'print tempdir("'$$f'/rocksdb.XXXX", CLEANUP => 0)')
endif
export TEST_TMPDIR

@ -0,0 +1,82 @@
#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
# Exit on error.
set -e
if [ -n "$USE_CLANG" ]; then
echo "Error: Coverage test is supported only for gcc."
exit 1
fi
ROOT=".."
# Fetch right version of gcov
if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
source $ROOT/build_tools/fbcode_config_platform010.sh
GCOV=$GCC_BASE/bin/gcov
else
GCOV=$(which gcov)
fi
echo -e "Using $GCOV"
COVERAGE_DIR="$PWD/COVERAGE_REPORT"
mkdir -p $COVERAGE_DIR
# Find all gcno files to generate the coverage report
PYTHON=${1:-`which python3`}
echo -e "Using $PYTHON"
GCNO_FILES=`find $ROOT -name "*.gcno"`
$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
# Parse the raw gcov report to more human readable form.
$PYTHON $ROOT/coverage/parse_gcov_output.py |
# Write the output to both stdout and report file.
tee $COVERAGE_DIR/coverage_report_all.txt &&
echo -e "Generated coverage report for all files: $COVERAGE_DIR/coverage_report_all.txt\n"
# TODO: we also need to get the files of the latest commits.
# Get the most recently committed files.
LATEST_FILES=`
git show --pretty="format:" --name-only HEAD |
grep -v "^$" |
paste -s -d,`
RECENT_REPORT=$COVERAGE_DIR/coverage_report_recent.txt
echo -e "Recently updated files: $LATEST_FILES\n" > $RECENT_REPORT
$GCOV --preserve-paths --relative-only --no-output $GCNO_FILES 2>/dev/null |
$PYTHON $ROOT/coverage/parse_gcov_output.py -interested-files $LATEST_FILES |
tee -a $RECENT_REPORT &&
echo -e "Generated coverage report for recently updated files: $RECENT_REPORT\n"
# Unless otherwise specified, we'll not generate html report by default
if [ -z "$HTML" ]; then
exit 0
fi
# Generate the html report. If we cannot find lcov in this machine, we'll simply
# skip this step.
echo "Generating the html coverage report..."
LCOV=$(which lcov || true 2>/dev/null)
if [ -z $LCOV ]
then
echo "Skip: Cannot find lcov to generate the html report."
exit 0
fi
LCOV_VERSION=$(lcov -v | grep 1.1 || true)
if [ $LCOV_VERSION ]
then
echo "Not supported lcov version. Expect lcov 1.1."
exit 0
fi
(cd $ROOT; lcov --no-external \
--capture \
--directory $PWD \
--gcov-tool $GCOV \
--output-file $COVERAGE_DIR/coverage.info)
genhtml $COVERAGE_DIR/coverage.info -o $COVERAGE_DIR
echo "HTML Coverage report is generated in $COVERAGE_DIR"

@ -0,0 +1,128 @@
#!/usr/bin/env python
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
from __future__ import print_function
import optparse
import re
import sys
# the gcov report follows certain pattern. Each file will have two lines
# of report, from which we can extract the file name, total lines and coverage
# percentage.
def parse_gcov_report(gcov_input):
per_file_coverage = {}
total_coverage = None
for line in sys.stdin:
line = line.strip()
# --First line of the coverage report (with file name in it)?
match_obj = re.match("^File '(.*)'$", line)
if match_obj:
# fetch the file name from the first line of the report.
current_file = match_obj.group(1)
continue
# -- Second line of the file report (with coverage percentage)
match_obj = re.match("^Lines executed:(.*)% of (.*)", line)
if match_obj:
coverage = float(match_obj.group(1))
lines = int(match_obj.group(2))
if current_file is not None:
per_file_coverage[current_file] = (coverage, lines)
current_file = None
else:
# If current_file is not set, we reach the last line of report,
# which contains the summarized coverage percentage.
total_coverage = (coverage, lines)
continue
# If the line's pattern doesn't fall into the above categories. We
# can simply ignore them since they're either empty line or doesn't
# find executable lines of the given file.
current_file = None
return per_file_coverage, total_coverage
def get_option_parser():
usage = (
"Parse the gcov output and generate more human-readable code "
+ "coverage report."
)
parser = optparse.OptionParser(usage)
parser.add_option(
"--interested-files",
"-i",
dest="filenames",
help="Comma separated files names. if specified, we will display "
+ "the coverage report only for interested source files. "
+ "Otherwise we will display the coverage report for all "
+ "source files.",
)
return parser
def display_file_coverage(per_file_coverage, total_coverage):
# To print out auto-adjustable column, we need to know the longest
# length of file names.
max_file_name_length = max(len(fname) for fname in per_file_coverage.keys())
# -- Print header
# size of separator is determined by 3 column sizes:
# file name, coverage percentage and lines.
header_template = "%" + str(max_file_name_length) + "s\t%s\t%s"
separator = "-" * (max_file_name_length + 10 + 20)
print(
header_template % ("Filename", "Coverage", "Lines")
) # noqa: E999 T25377293 Grandfathered in
print(separator)
# -- Print body
# template for printing coverage report for each file.
record_template = "%" + str(max_file_name_length) + "s\t%5.2f%%\t%10d"
for fname, coverage_info in per_file_coverage.items():
coverage, lines = coverage_info
print(record_template % (fname, coverage, lines))
# -- Print footer
if total_coverage:
print(separator)
print(record_template % ("Total", total_coverage[0], total_coverage[1]))
def report_coverage():
parser = get_option_parser()
(options, args) = parser.parse_args()
interested_files = set()
if options.filenames is not None:
interested_files = {f.strip() for f in options.filenames.split(",")}
# To make things simple, right now we only read gcov report from the input
per_file_coverage, total_coverage = parse_gcov_report(sys.stdin)
# Check if we need to display coverage info for interested files.
if len(interested_files):
per_file_coverage = dict(
(fname, per_file_coverage[fname])
for fname in interested_files
if fname in per_file_coverage
)
# If we only interested in several files, it makes no sense to report
# the total_coverage
total_coverage = None
if not len(per_file_coverage):
print("Cannot find coverage info for the given files.", file=sys.stderr)
return
display_file_coverage(per_file_coverage, total_coverage)
if __name__ == "__main__":
report_coverage()

@ -0,0 +1,121 @@
# This file is used by Meta-internal infrastructure as well as by Makefile
# When included from Makefile, there are rules to build DB_STRESS_CMD. When
# used directly with `make -f crashtest.mk ...` there will be no rules to
# build DB_STRESS_CMD so it must exist prior.
DB_STRESS_CMD?=./db_stress
include common.mk
CRASHTEST_MAKE=$(MAKE) -f crash_test.mk
CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) --cleanup_cmd='$(DB_CLEANUP_CMD)'
.PHONY: crash_test crash_test_with_atomic_flush crash_test_with_txn \
crash_test_with_best_efforts_recovery crash_test_with_ts \
blackbox_crash_test blackbox_crash_test_with_atomic_flush \
blackbox_crash_test_with_txn blackbox_crash_test_with_ts \
blackbox_crash_test_with_best_efforts_recovery \
whitebox_crash_test whitebox_crash_test_with_atomic_flush \
whitebox_crash_test_with_txn whitebox_crash_test_with_ts \
blackbox_crash_test_with_multiops_wc_txn \
blackbox_crash_test_with_multiops_wp_txn \
crash_test_with_tiered_storage blackbox_crash_test_with_tiered_storage \
whitebox_crash_test_with_tiered_storage \
whitebox_crash_test_with_optimistic_txn \
blackbox_crash_test_with_optimistic_txn \
crash_test: $(DB_STRESS_CMD)
# Do not parallelize
$(CRASHTEST_MAKE) whitebox_crash_test
$(CRASHTEST_MAKE) blackbox_crash_test
crash_test_with_atomic_flush: $(DB_STRESS_CMD)
# Do not parallelize
$(CRASHTEST_MAKE) whitebox_crash_test_with_atomic_flush
$(CRASHTEST_MAKE) blackbox_crash_test_with_atomic_flush
crash_test_with_txn: $(DB_STRESS_CMD)
# Do not parallelize
$(CRASHTEST_MAKE) whitebox_crash_test_with_txn
$(CRASHTEST_MAKE) blackbox_crash_test_with_txn
crash_test_with_optimistic_txn: $(DB_STRESS_CMD)
# Do not parallelize
$(CRASHTEST_MAKE) whitebox_crash_test_with_optimistic_txn
$(CRASHTEST_MAKE) blackbox_crash_test_with_optimistic_txn
crash_test_with_best_efforts_recovery: blackbox_crash_test_with_best_efforts_recovery
crash_test_with_ts: $(DB_STRESS_CMD)
# Do not parallelize
$(CRASHTEST_MAKE) whitebox_crash_test_with_ts
$(CRASHTEST_MAKE) blackbox_crash_test_with_ts
crash_test_with_tiered_storage: $(DB_STRESS_CMD)
# Do not parallelize
$(CRASHTEST_MAKE) whitebox_crash_test_with_tiered_storage
$(CRASHTEST_MAKE) blackbox_crash_test_with_tiered_storage
crash_test_with_multiops_wc_txn: $(DB_STRESS_CMD)
$(CRASHTEST_MAKE) blackbox_crash_test_with_multiops_wc_txn
crash_test_with_multiops_wp_txn: $(DB_STRESS_CMD)
$(CRASHTEST_MAKE) blackbox_crash_test_with_multiops_wp_txn
blackbox_crash_test: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --simple blackbox $(CRASH_TEST_EXT_ARGS)
$(CRASHTEST_PY) blackbox $(CRASH_TEST_EXT_ARGS)
blackbox_crash_test_with_atomic_flush: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --cf_consistency blackbox $(CRASH_TEST_EXT_ARGS)
blackbox_crash_test_with_txn: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --txn blackbox $(CRASH_TEST_EXT_ARGS)
blackbox_crash_test_with_best_efforts_recovery: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --test_best_efforts_recovery blackbox $(CRASH_TEST_EXT_ARGS)
blackbox_crash_test_with_ts: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --enable_ts blackbox $(CRASH_TEST_EXT_ARGS)
blackbox_crash_test_with_multiops_wc_txn: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --test_multiops_txn --write_policy write_committed blackbox $(CRASH_TEST_EXT_ARGS)
blackbox_crash_test_with_multiops_wp_txn: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --test_multiops_txn --write_policy write_prepared blackbox $(CRASH_TEST_EXT_ARGS)
blackbox_crash_test_with_tiered_storage: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --test_tiered_storage blackbox $(CRASH_TEST_EXT_ARGS)
blackbox_crash_test_with_optimistic_txn: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --optimistic_txn blackbox $(CRASH_TEST_EXT_ARGS)
ifeq ($(CRASH_TEST_KILL_ODD),)
CRASH_TEST_KILL_ODD=888887
endif
whitebox_crash_test: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --simple whitebox --random_kill_odd \
$(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
$(CRASHTEST_PY) whitebox --random_kill_odd \
$(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
whitebox_crash_test_with_atomic_flush: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --cf_consistency whitebox --random_kill_odd \
$(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
whitebox_crash_test_with_txn: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --txn whitebox --random_kill_odd \
$(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
whitebox_crash_test_with_ts: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --enable_ts whitebox --random_kill_odd \
$(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
whitebox_crash_test_with_tiered_storage: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --test_tiered_storage whitebox --random_kill_odd \
$(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
whitebox_crash_test_with_optimistic_txn: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --optimistic_txn whitebox --random_kill_odd \
$(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)

@ -0,0 +1,165 @@
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/arena_wrapped_db_iter.h"
#include "memory/arena.h"
#include "rocksdb/env.h"
#include "rocksdb/iterator.h"
#include "rocksdb/options.h"
#include "table/internal_iterator.h"
#include "table/iterator_wrapper.h"
#include "util/user_comparator_wrapper.h"
namespace ROCKSDB_NAMESPACE {
Status ArenaWrappedDBIter::GetProperty(std::string prop_name,
std::string* prop) {
if (prop_name == "rocksdb.iterator.super-version-number") {
// First try to pass the value returned from inner iterator.
if (!db_iter_->GetProperty(prop_name, prop).ok()) {
*prop = std::to_string(sv_number_);
}
return Status::OK();
}
return db_iter_->GetProperty(prop_name, prop);
}
void ArenaWrappedDBIter::Init(
Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
const MutableCFOptions& mutable_cf_options, const Version* version,
const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration,
uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl,
ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) {
auto mem = arena_.AllocateAligned(sizeof(DBIter));
db_iter_ =
new (mem) DBIter(env, read_options, ioptions, mutable_cf_options,
ioptions.user_comparator, /* iter */ nullptr, version,
sequence, true, max_sequential_skip_in_iteration,
read_callback, db_impl, cfd, expose_blob_index);
sv_number_ = version_number;
read_options_ = read_options;
allow_refresh_ = allow_refresh;
memtable_range_tombstone_iter_ = nullptr;
if (!CheckFSFeatureSupport(env->GetFileSystem().get(),
FSSupportedOps::kAsyncIO)) {
read_options_.async_io = false;
}
}
Status ArenaWrappedDBIter::Refresh() {
if (cfd_ == nullptr || db_impl_ == nullptr || !allow_refresh_) {
return Status::NotSupported("Creating renew iterator is not allowed.");
}
assert(db_iter_ != nullptr);
// TODO(yiwu): For last_seq_same_as_publish_seq_==false, this is not the
// correct behavior. Will be corrected automatically when we take a snapshot
// here for the case of WritePreparedTxnDB.
uint64_t cur_sv_number = cfd_->GetSuperVersionNumber();
TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:1");
TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:2");
auto reinit_internal_iter = [&]() {
Env* env = db_iter_->env();
db_iter_->~DBIter();
arena_.~Arena();
new (&arena_) Arena();
SuperVersion* sv = cfd_->GetReferencedSuperVersion(db_impl_);
SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber();
if (read_callback_) {
read_callback_->Refresh(latest_seq);
}
Init(env, read_options_, *(cfd_->ioptions()), sv->mutable_cf_options,
sv->current, latest_seq,
sv->mutable_cf_options.max_sequential_skip_in_iterations,
cur_sv_number, read_callback_, db_impl_, cfd_, expose_blob_index_,
allow_refresh_);
InternalIterator* internal_iter = db_impl_->NewInternalIterator(
read_options_, cfd_, sv, &arena_, latest_seq,
/* allow_unprepared_value */ true, /* db_iter */ this);
SetIterUnderDBIter(internal_iter);
};
while (true) {
if (sv_number_ != cur_sv_number) {
reinit_internal_iter();
break;
} else {
SequenceNumber latest_seq = db_impl_->GetLatestSequenceNumber();
// Refresh range-tombstones in MemTable
if (!read_options_.ignore_range_deletions) {
SuperVersion* sv = cfd_->GetThreadLocalSuperVersion(db_impl_);
TEST_SYNC_POINT_CALLBACK("ArenaWrappedDBIter::Refresh:SV", nullptr);
auto t = sv->mem->NewRangeTombstoneIterator(
read_options_, latest_seq, false /* immutable_memtable */);
if (!t || t->empty()) {
// If memtable_range_tombstone_iter_ points to a non-empty tombstone
// iterator, then it means sv->mem is not the memtable that
// memtable_range_tombstone_iter_ points to, so SV must have changed
// after the sv_number_ != cur_sv_number check above. We will fall
// back to re-init the InternalIterator, and the tombstone iterator
// will be freed during db_iter destruction there.
if (memtable_range_tombstone_iter_) {
assert(!*memtable_range_tombstone_iter_ ||
sv_number_ != cfd_->GetSuperVersionNumber());
}
delete t;
} else { // current mutable memtable has range tombstones
if (!memtable_range_tombstone_iter_) {
delete t;
db_impl_->ReturnAndCleanupSuperVersion(cfd_, sv);
// The memtable under DBIter did not have range tombstone before
// refresh.
reinit_internal_iter();
break;
} else {
delete *memtable_range_tombstone_iter_;
*memtable_range_tombstone_iter_ = new TruncatedRangeDelIterator(
std::unique_ptr<FragmentedRangeTombstoneIterator>(t),
&cfd_->internal_comparator(), nullptr, nullptr);
}
}
db_impl_->ReturnAndCleanupSuperVersion(cfd_, sv);
}
// Refresh latest sequence number
db_iter_->set_sequence(latest_seq);
db_iter_->set_valid(false);
// Check again if the latest super version number is changed
uint64_t latest_sv_number = cfd_->GetSuperVersionNumber();
if (latest_sv_number != cur_sv_number) {
// If the super version number is changed after refreshing,
// fallback to Re-Init the InternalIterator
cur_sv_number = latest_sv_number;
continue;
}
break;
}
}
return Status::OK();
}
ArenaWrappedDBIter* NewArenaWrappedDbIterator(
Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
const MutableCFOptions& mutable_cf_options, const Version* version,
const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl,
ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) {
ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
iter->Init(env, read_options, ioptions, mutable_cf_options, version, sequence,
max_sequential_skip_in_iterations, version_number, read_callback,
db_impl, cfd, expose_blob_index, allow_refresh);
if (db_impl != nullptr && cfd != nullptr && allow_refresh) {
iter->StoreRefreshInfo(db_impl, cfd, read_callback, expose_blob_index);
}
return iter;
}
} // namespace ROCKSDB_NAMESPACE

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save