Revert "trying with memcpy in openssl"

This reverts commit 62d62a9d12.
Revert "randomize only the IV part of prefix"
2585 changed files with 398796 additions and 22728 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -152,15 +152,15 @@ commands:
    steps:
      - run:
          name: "Test low-variance benchmarks"
-          command: ./tools/benchmark_ci.py --db_dir /tmp/rocksdb-benchmark-datadir --output_dir /tmp/benchmark-results --num_keys 10000000
+          command: ./tools/benchmark_ci.py --db_dir /tmp/rocksdb-benchmark-datadir --output_dir /tmp/benchmark-results --num_keys 20000000
          environment:
            LD_LIBRARY_PATH: /usr/local/lib
            # How long to run parts of the test(s)
-            DURATION_RO: 400
-            DURATION_RW: 700
+            DURATION_RO: 300
+            DURATION_RW: 500
            # Keep threads within physical capacity of server (much lower than default)
            NUM_THREADS: 1
-            MAX_BACKGROUND_JOBS: 3
+            MAX_BACKGROUND_JOBS: 4
            # Don't run a couple of "optional" initial tests
            CI_TESTS_ONLY: "true"
            # Reduce configured size of levels to ensure more levels in the leveled compaction LSM tree
@ -170,7 +170,11 @@ commands:
            # The benchmark host has 32GB memory
            # The following values are tailored to work with that
            # Note, tests may not exercise the targeted issues if the memory is increased on new test hosts.
-
+            COMPRESSION_TYPE: "none"
+            CACHE_INDEX_AND_FILTER_BLOCKS: 1
+            MIN_LEVEL_TO_COMPRESS: 3
+            CACHE_SIZE_MB: 10240
+            MB_WRITE_PER_SEC: 2

  post-benchmarks:
    steps:
@ -269,12 +273,12 @@ jobs:
          ./sst_dump --help | grep -E -q 'Supported compression types: kNoCompression$' # Verify no compiled in compression
      - post-steps

-  build-linux-shared_lib-alt_namespace-status_checked:
+  build-linux-static_lib-alt_namespace-status_checked:
    executor: linux-docker
    resource_class: 2xlarge
    steps:
      - pre-steps
-      - run: ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 ROCKSDB_MODIFY_NPHASH=1 LIB_MODE=shared OPT="-DROCKSDB_NAMESPACE=alternative_rocksdb_ns" make V=1 -j32 check
+      - run: ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 ROCKSDB_MODIFY_NPHASH=1 LIB_MODE=static OPT="-DROCKSDB_NAMESPACE=alternative_rocksdb_ns" make V=1 -j24 check
      - post-steps

  build-linux-release:
@ -282,11 +286,21 @@ jobs:
    resource_class: 2xlarge
    steps:
      - checkout # check out the code in the project directory
+      - run: make V=1 -j32 LIB_MODE=shared release
+      - run: ls librocksdb.so # ensure shared lib built
+      - run: ./db_stress --version # ensure with gflags
+      - run: make clean
      - run: make V=1 -j32 release
+      - run: ls librocksdb.a # ensure static lib built
      - run: ./db_stress --version # ensure with gflags
      - run: make clean
      - run: apt-get remove -y libgflags-dev
+      - run: make V=1 -j32 LIB_MODE=shared release
+      - run: ls librocksdb.so # ensure shared lib built
+      - run: if ./db_stress --version; then false; else true; fi # ensure without gflags
+      - run: make clean
      - run: make V=1 -j32 release
+      - run: ls librocksdb.a # ensure static lib built
      - run: if ./db_stress --version; then false; else true; fi # ensure without gflags
      - post-steps

@ -302,27 +316,6 @@ jobs:
      - run: USE_RTTI=1 DEBUG_LEVEL=0 make V=1 -j16 static_lib tools db_bench
      - run: if ./db_stress --version; then false; else true; fi # ensure without gflags

-  build-linux-lite:
-    executor: linux-docker
-    resource_class: large
-    steps:
-      - pre-steps
-      - run: LITE=1 make V=1 J=8 -j8 check
-      - post-steps
-
-  build-linux-lite-release:
-    executor: linux-docker
-    resource_class: large
-    steps:
-      - checkout # check out the code in the project directory
-      - run: LITE=1 make V=1 -j8 release
-      - run: ./db_stress --version # ensure with gflags
-      - run: make clean
-      - run: apt-get remove -y libgflags-dev
-      - run: LITE=1 make V=1 -j8 release
-      - run: if ./db_stress --version; then false; else true; fi # ensure without gflags
-      - post-steps
-
  build-linux-clang-no_test_run:
    executor: linux-docker
    resource_class: xlarge
@ -427,7 +420,10 @@ jobs:
    steps:
      - checkout # check out the code in the project directory
      - run: apt-get update -y && apt-get install -y libgflags-dev
-      - run: make V=1 -j8 unity_test
+      - run:
+          name: "Unity build"
+          command: make V=1 -j8 unity_test
+          no_output_timeout: 20m
      - run: make V=1 -j8 -k check-headers # could be moved to a different build
      - post-steps

@ -438,7 +434,7 @@ jobs:
      - pre-steps
      - setup-folly
      - build-folly
-      - run: USE_FOLLY=1 CC=gcc-7 CXX=g++-7 V=1 make -j32 check
+      - run: USE_FOLLY=1 LIB_MODE=static CC=gcc-7 CXX=g++-7 V=1 make -j32 check # TODO: LIB_MODE only to work around unresolved linker failures
      - post-steps

  build-linux-gcc-7-with-folly-lite-no-test:
@ -484,7 +480,7 @@ jobs:
    resource_class: 2xlarge
    steps:
      - pre-steps
-      - run: CC=gcc-11 CXX=g++-11 V=1 make -j32 all microbench
+      - run: LIB_MODE=static CC=gcc-11 CXX=g++-11 V=1 make -j32 all microbench # TODO: LIB_MODE only to work around unresolved linker failures
      - post-steps

  build-linux-clang-13-no_test_run:
@ -503,7 +499,7 @@ jobs:
      - pre-steps
      - setup-folly
      - build-folly
-      - run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check
+      - run: CC=clang-13 CXX=clang++-13 LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check # TODO: LIB_MODE only to work around unresolved linker failures
      - post-steps

  # This job is only to make sure the microbench tests are able to run, the benchmark result is not meaningful as the CI host is changing.
@ -520,7 +516,7 @@ jobs:
    resource_class: large
    steps:
      - pre-steps
-      - run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=960 --max_key=2500000' blackbox_crash_test_with_atomic_flush
+      - run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=960 --max_key=2500000 --use_io_uring=0' blackbox_crash_test_with_atomic_flush
      - post-steps

  build-linux-crashtest-tiered-storage-bb:
@ -530,7 +526,7 @@ jobs:
      - pre-steps
      - run:
          name: "run crashtest"
-          command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS=--duration=10800 blackbox_crash_test_with_tiered_storage
+          command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS='--duration=10800 --use_io_uring=0' blackbox_crash_test_with_tiered_storage
          no_output_timeout: 100m
      - post-steps

@ -541,7 +537,7 @@ jobs:
      - pre-steps
      - run:
          name: "run crashtest"
-          command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS=--duration=10800 whitebox_crash_test_with_tiered_storage
+          command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS='--duration=10800 --use_io_uring=0' whitebox_crash_test_with_tiered_storage
          no_output_timeout: 100m
      - post-steps

@ -821,18 +817,16 @@ workflows:
      - build-linux-cmake-with-folly-coroutines
      - build-linux-cmake-with-benchmark
      - build-linux-encrypted_env-no_compression
-      - build-linux-lite
  jobs-linux-run-tests-san:
    jobs:
      - build-linux-clang10-asan
      - build-linux-clang10-ubsan
      - build-linux-clang10-mini-tsan
-      - build-linux-shared_lib-alt_namespace-status_checked
+      - build-linux-static_lib-alt_namespace-status_checked
  jobs-linux-no-test-run:
    jobs:
      - build-linux-release
      - build-linux-release-rtti
-      - build-linux-lite-release
      - build-examples
      - build-fuzzers
      - build-linux-clang-no_test_run
--- a/.github/workflows/sanity_check.yml
+++ b/.github/workflows/sanity_check.yml
@ -33,9 +33,7 @@ jobs:
      run: pip install argparse

    - name: Download clang-format-diff.py
-      uses: wei/wget@v1
-      with:
-        args: https://raw.githubusercontent.com/llvm/llvm-project/release/12.x/clang/tools/clang-format/clang-format-diff.py
+      run: wget https://raw.githubusercontent.com/llvm/llvm-project/release/12.x/clang/tools/clang-format/clang-format-diff.py

    - name: Check format
      run: VERBOSE_CHECK=1 make check-format
--- a/.gitignore
+++ b/.gitignore
@ -95,3 +95,5 @@ fuzz/crash-*

 cmake-build-*
 third-party/folly/
+.cache
+*.sublime-*
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -245,41 +245,40 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
  endif(HAS_S390X_MARCH_NATIVE)
 endif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")

-option(PORTABLE "build a portable binary" OFF)
-option(FORCE_SSE42 "force building with SSE4.2, even when PORTABLE=ON" OFF)
-option(FORCE_AVX "force building with AVX, even when PORTABLE=ON" OFF)
-option(FORCE_AVX2 "force building with AVX2, even when PORTABLE=ON" OFF)
-if(PORTABLE)
-  add_definitions(-DROCKSDB_PORTABLE)
-
-  # MSVC does not need a separate compiler flag to enable SSE4.2; if nmmintrin.h
-  # is available, it is available by default.
-  if(FORCE_SSE42 AND NOT MSVC)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mpclmul")
-  endif()
-  if(MSVC)
-    if(FORCE_AVX)
-      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
-    endif()
-    # MSVC automatically enables BMI / lzcnt with AVX2.
-    if(FORCE_AVX2)
-      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
-    endif()
-  else()
-    if(FORCE_AVX)
-      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx")
-    endif()
-    if(FORCE_AVX2)
-      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -mbmi -mlzcnt")
-    endif()
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64")
+  CHECK_C_COMPILER_FLAG("-march=loongarch64" HAS_LOONGARCH64)
+  if(HAS_LOONGARCH64)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=loongarch64 -mtune=loongarch64")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=loongarch64 -mtune=loongarch64")
+  endif(HAS_LOONGARCH64)
+endif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64")
+
+set(PORTABLE 0 CACHE STRING "Minimum CPU arch to support, or 0 = current CPU, 1 = baseline CPU")
+if(PORTABLE STREQUAL 1)
+  # Usually nothing to do; compiler default is typically the most general
+  if(NOT MSVC)
    if(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x")
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z196")
    endif()
+    if(CMAKE_SYSTEM_PROCESSOR MATCHES "^loongarch64")
+      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=loongarch64")
+    endif()
+  endif()
+elseif(PORTABLE MATCHES [^0]+)
+  # Name of a CPU arch spec or feature set to require
+  if(MSVC)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:${PORTABLE}")
+  else()
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${PORTABLE}")
  endif()
 else()
  if(MSVC)
+    # NOTE: No auto-detection of current CPU, but instead assume some useful
+    # level of optimization is supported
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
  else()
+    # Require instruction set from current CPU (with some legacy or opt-out
+    # exceptions)
    if(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x" AND NOT HAS_S390X_MARCH_NATIVE)
      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z196")
    elseif(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64" AND NOT HAS_ARMV8_CRC)
@ -294,25 +293,6 @@ if(NOT MSVC)
  set(CMAKE_REQUIRED_FLAGS "-msse4.2 -mpclmul")
 endif()

-CHECK_CXX_SOURCE_COMPILES("
-#include <cstdint>
-#include <nmmintrin.h>
-#include <wmmintrin.h>
-int main() {
-  volatile uint32_t x = _mm_crc32_u32(0, 0);
-  const auto a = _mm_set_epi64x(0, 0);
-  const auto b = _mm_set_epi64x(0, 0);
-  const auto c = _mm_clmulepi64_si128(a, b, 0x00);
-  auto d = _mm_cvtsi128_si64(c);
-}
-" HAVE_SSE42)
-if(HAVE_SSE42)
-  add_definitions(-DHAVE_SSE42)
-  add_definitions(-DHAVE_PCLMUL)
-elseif(FORCE_SSE42)
-  message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled")
-endif()
-
 # Check if -latomic is required or not
 if (NOT MSVC)
  set(CMAKE_REQUIRED_FLAGS "--std=c++17")
@ -485,12 +465,6 @@ if(CMAKE_COMPILER_IS_GNUCXX)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-builtin-memcmp")
 endif()

-option(ROCKSDB_LITE "Build RocksDBLite version" OFF)
-if(ROCKSDB_LITE)
-  add_definitions(-DROCKSDB_LITE)
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -Os")
-endif()
-
 if(CMAKE_SYSTEM_NAME MATCHES "Cygwin")
  add_definitions(-fno-builtin-memcmp -DCYGWIN)
 elseif(CMAKE_SYSTEM_NAME MATCHES "Darwin")
@ -573,7 +547,7 @@ if(HAVE_SCHED_GETCPU)
  add_definitions(-DROCKSDB_SCHED_GETCPU_PRESENT)
 endif()

-check_cxx_symbol_exists(getauxval auvx.h HAVE_AUXV_GETAUXVAL)
+check_cxx_symbol_exists(getauxval "sys/auxv.h" HAVE_AUXV_GETAUXVAL)
 if(HAVE_AUXV_GETAUXVAL)
  add_definitions(-DROCKSDB_AUXV_GETAUXVAL_PRESENT)
 endif()
@ -649,12 +623,14 @@ set(SOURCES
        cache/cache.cc
        cache/cache_entry_roles.cc
        cache/cache_key.cc
+        cache/cache_helpers.cc
        cache/cache_reservation_manager.cc
        cache/charged_cache.cc
        cache/clock_cache.cc
        cache/compressed_secondary_cache.cc
        cache/lru_cache.cc
        cache/secondary_cache.cc
+        cache/secondary_cache_adapter.cc
        cache/sharded_cache.cc
        db/arena_wrapped_db_iter.cc
        db/blob/blob_contents.cc
@ -741,6 +717,7 @@ set(SOURCES
        db/write_batch.cc
        db/write_batch_base.cc
        db/write_controller.cc
+        db/write_stall_stats.cc
        db/write_thread.cc
        env/composite_env.cc
        env/env.cc
@ -806,6 +783,7 @@ set(SOURCES
        table/block_based/block_based_table_iterator.cc
        table/block_based/block_based_table_reader.cc
        table/block_based/block_builder.cc
+        table/block_based/block_cache.cc
        table/block_based/block_prefetcher.cc
        table/block_based/block_prefix_index.cc
        table/block_based/data_block_hash_index.cc
@ -873,6 +851,7 @@ set(SOURCES
        util/compression_context_cache.cc
        util/concurrent_task_limiter_impl.cc
        util/crc32c.cc
+        util/data_structure.cc
        util/dynamic_bloom.cc
        util/hash.cc
        util/murmurhash.cc
@ -886,6 +865,8 @@ set(SOURCES
        util/string_util.cc
        util/thread_local.cc
        util/threadpool_imp.cc
+        util/udt_util.cc
+        util/write_batch_util.cc
        util/xxhash.cc
        utilities/agg_merge/agg_merge.cc
        utilities/backup/backup_engine.cc
@ -1000,12 +981,6 @@ if ( ROCKSDB_PLUGINS )
  endforeach()
 endif()

-if(HAVE_SSE42 AND NOT MSVC)
-  set_source_files_properties(
-    util/crc32c.cc
-    PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul")
-endif()
-
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64")
  list(APPEND SOURCES
    util/crc32c_ppc.c
@ -1271,6 +1246,7 @@ if(WITH_TESTS OR WITH_BENCHMARK_TOOLS)
  add_subdirectory(third-party/gtest-1.8.1/fused-src/gtest)
  add_library(testharness STATIC
  test_util/mock_time_env.cc
+  test_util/secondary_cache_test_util.cc
  test_util/testharness.cc)
  target_link_libraries(testharness gtest)
 endif()
@ -1316,6 +1292,7 @@ if(WITH_TESTS)
        db/db_bloom_filter_test.cc
        db/db_compaction_filter_test.cc
        db/db_compaction_test.cc
+        db/db_clip_test.cc
        db/db_dynamic_level_test.cc
        db/db_encryption_test.cc
        db/db_flush_test.cc
@ -1446,6 +1423,7 @@ if(WITH_TESTS)
        util/timer_test.cc
        util/thread_list_test.cc
        util/thread_local_test.cc
+        util/udt_util_test.cc
        util/work_queue_test.cc
        utilities/agg_merge/agg_merge_test.cc
        utilities/backup/backup_engine_test.cc
@ -1614,3 +1592,6 @@ option(WITH_BENCHMARK "build benchmark tests" OFF)
 if(WITH_BENCHMARK)
  add_subdirectory(${PROJECT_SOURCE_DIR}/microbench/)
 endif()
+
+target_include_directories(${PROJECT_NAME} PUBLIC
+  $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>)
--- a/HISTORY.md
+++ b/HISTORY.md
@ -1,9 +1,164 @@
 # Rocksdb Change Log
-## Unreleased
+> NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
+
+## 8.5.0 (07/21/2023)
+### Public API Changes
+* Removed recently added APIs `GeneralCache` and `MakeSharedGeneralCache()` as our plan changed to stop exposing a general-purpose cache interface. The old forms of these APIs, `Cache` and `NewLRUCache()`, are still available, although general-purpose caching support will be dropped eventually.
+
+### Behavior Changes
+* Option `periodic_compaction_seconds` no longer supports FIFO compaction: setting it has no effect on FIFO compactions. FIFO compaction users should only set option `ttl` instead.
+* Move prefetching responsibility to page cache for compaction read for non directIO use case
+
+### Performance Improvements
+* In case of direct_io, if the buffer passed by callee is already aligned, RandomAccessFileRead::Read will avoid reallocating a new buffer, reducing memcpy, and will use the already passed aligned buffer.
+* Small efficiency improvement to HyperClockCache by reducing chance of compiler-generated heap allocations
+
+### Bug Fixes
+* Fix use_after_free bug in async_io MultiReads when the underlying FS has enabled kFSBuffer. kFSBuffer is when the underlying FS passes its own buffer instead of using the RocksDB scratch in FSReadRequest. Right now it's an experimental feature.
+
+## 8.4.0 (06/26/2023)
+### New Features
+* Add FSReadRequest::fs_scratch which is a data buffer allocated and provided by underlying FileSystem to RocksDB during reads, when FS wants to provide its own buffer with data instead of using RocksDB provided FSReadRequest::scratch. This can help in cpu optimization by avoiding copy from file system's buffer to RocksDB buffer. More details on how to use/enable it in file_system.h. Right now it's supported only for MultiReads(async + sync) with non direct io.
+* Start logging non-zero user-defined timestamp sizes in WAL to signal user key format in subsequent records and use it during recovery. This change will break recovery from WAL files written by early versions that contain user-defined timestamps. The workaround is to ensure there are no WAL files to recover (i.e. by flushing before close) before upgrade.
+* Added new property "rocksdb.obsolete-sst-files-size-property" that reports the size of SST files that have become obsolete but have not yet been deleted or scheduled for deletion
+* Start to record the value of the flag `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` in the Manifest and table properties for a SST file when it is created. And use the recorded flag when creating a table reader for the SST file. This flag is only explicitly recorded if it's false.
+* Add a new option OptimisticTransactionDBOptions::shared_lock_buckets that enables sharing mutexes for validating transactions between DB instances, for better balancing memory efficiency and validation contention across DB instances. Different column families and DBs also now use different hash seeds in this validation, so that the same set of key names will not contend across DBs or column families.
+* Add a new ticker `rocksdb.files.marked.trash.deleted` to track the number of trash files deleted by background thread from the trash queue.
+* Add an API NewTieredVolatileCache() in include/rocksdb/cache.h to allocate an instance of a block cache with a primary block cache tier and a compressed secondary cache tier. A cache of this type distributes memory reservations against the block cache, such as WriteBufferManager, table reader memory etc., proportionally across both the primary and compressed secondary cache.
+* Add `WaitForCompact()` to wait for all flush and compactions jobs to finish. Jobs to wait include the unscheduled (queued, but not scheduled yet).
+* Add `WriteBatch::Release()` that releases the batch's serialized data to the caller.
+
+### Public API Changes
+* Add C API `rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio`.
+* Change the FileSystem::use_async_io() API to SupportedOps API in order to extend it to various operations supported by underlying FileSystem. Right now it contains FSSupportedOps::kAsyncIO and FSSupportedOps::kFSBuffer. More details about FSSupportedOps in file_system.h.
+* Add new tickers: `rocksdb.error.handler.bg.error.count`, `rocksdb.error.handler.bg.io.error.count`, `rocksdb.error.handler.bg.retryable.io.error.count` to replace the misspelled ones: `rocksdb.error.handler.bg.errro.count`, `rocksdb.error.handler.bg.io.errro.count`, `rocksdb.error.handler.bg.retryable.io.errro.count` ('error' instead of 'errro'). Users should switch to use the new tickers before 9.0 release as the misspelled old tickers will be completely removed then.
+* Overload the API CreateColumnFamilyWithImport() to support creating ColumnFamily by importing multiple ColumnFamilies. It requires that CFs should not overlap in user key range.
+
+### Behavior Changes
+* Change the default value for option `level_compaction_dynamic_level_bytes` to true. This affects users who use leveled compaction and do not set this option explicitly. These users may see additional background compactions following DB open. These compactions help to shape the LSM according to `level_compaction_dynamic_level_bytes` such that the size of each level Ln is approximately size of Ln-1 * `max_bytes_for_level_multiplier`. Turning on this option has other benefits too: see more detail in wiki: https://github.com/facebook/rocksdb/wiki/Leveled-Compaction#option-level_compaction_dynamic_level_bytes-and-levels-target-size and in option comment in advanced_options.h (#11525).
+* For Leveled Compaction users, `CompactRange()` will now always try to compact to the last non-empty level. (#11468)
+* For Leveled Compaction users, `CompactRange()` with `bottommost_level_compaction = BottommostLevelCompaction::kIfHaveCompactionFilter` will behave similar to `kForceOptimized` in that it will skip files created during this manual compaction when compacting files in the bottommost level. (#11468)
+* RocksDB will try to drop range tombstones during non-bottommost compaction when it is safe to do so. (#11459)
+* When a DB is opened with `allow_ingest_behind=true` (currently only Universal compaction is supported), files in the last level, i.e. the ingested files, will not be included in any compaction. (#11489)
+* Statistics `rocksdb.sst.read.micros` scope is expanded to all SST reads except for file ingestion and column family import (some compaction reads were previously excluded).
+
+### Bug Fixes
+* Reduced cases of illegally using Env::Default() during static destruction by never destroying the internal PosixEnv itself (except for builds checking for memory leaks). (#11538)
+* Fix extra prefetching during seek in async_io when BlockBasedTableOptions.num_file_reads_for_auto_readahead is 1, leading to more reads than required.
+* Fix a bug where compactions that are qualified to be run as 2 subcompactions were only run as one subcompaction.
+* Fix a use-after-move bug in block.cc.
+
+## 8.3.0 (05/19/2023)
+### New Features
+* Introduced a new option `block_protection_bytes_per_key`, which can be used to enable per key-value integrity protection for in-memory blocks in block cache (#11287).
+* Added `JemallocAllocatorOptions::num_arenas`. Setting `num_arenas > 1` may mitigate mutex contention in the allocator, particularly in scenarios where block allocations commonly bypass jemalloc tcache.
+* Improve the operational safety of publishing a DB or SST files to many hosts by using different block cache hash seeds on different hosts. The exact behavior is controlled by new option `ShardedCacheOptions::hash_seed`, which also documents the solved problem in more detail.
+* Introduced a new option `CompactionOptionsFIFO::file_temperature_age_thresholds` that allows FIFO compaction to compact files to different temperatures based on key age (#11428).
+* Added a new ticker stat to count how many times RocksDB detected a corruption while verifying a block checksum: `BLOCK_CHECKSUM_MISMATCH_COUNT`.
+* New statistics `rocksdb.file.read.db.open.micros` that measures read time of block-based SST tables or blob files during db open.
+* New statistics tickers for various iterator seek behaviors and relevant filtering, as \*`_LEVEL_SEEK_`\*. (#11460)
+
+### Public API Changes
+* EXPERIMENTAL: Add new API `DB::ClipColumnFamily` to clip the key in CF to a certain range. It will physically delete all keys outside the range including tombstones.
+* Add `MakeSharedCache()` construction functions to various cache Options objects, and deprecated the `NewWhateverCache()` functions with long parameter lists.
+* Changed the meaning of various Bloom filter stats (prefix vs. whole key), with iterator-related filtering only being tracked in the new \*`_LEVEL_SEEK_`\* stats. (#11460)
+
+### Behavior changes
+* For x86, CPU features are no longer detected at runtime nor in build scripts, but in source code using common preprocessor defines. This will likely unlock some small performance improvements on some newer hardware, but could hurt performance of the kCRC32c checksum, which is no longer the default, on some "portable" builds. See PR #11419 for details.
+
+### Bug Fixes
+* Delete an empty WAL file on DB open if the log number is less than the min log number to keep
+* Delete temp OPTIONS file on DB open if there is a failure to write it out or rename it
+
+### Performance Improvements
+* Improved the I/O efficiency of prefetching SST metadata by recording more information in the DB manifest. Opening files written with previous versions will still rely on heuristics for how much to prefetch (#11406).
+
+## 8.2.0 (04/24/2023)
+### Public API Changes
+* `SstFileWriter::DeleteRange()` now returns `Status::InvalidArgument` if the range's end key comes before its start key according to the user comparator. Previously the behavior was undefined.
+* Add `multi_get_for_update` to C API.
+* Remove unnecessary constructor for CompressionOptions.
+
+### Behavior changes
+* Changed default block cache size from an 8MB to 32MB LRUCache, which increases the default number of cache shards from 16 to 64. This change is intended to minimize cache mutex contention under stress conditions. See https://github.com/facebook/rocksdb/wiki/Block-Cache for more information.
+* For level compaction with `level_compaction_dynamic_level_bytes=true`, RocksDB now trivially moves levels down to fill LSM starting from bottommost level during DB open. See more in comments for option `level_compaction_dynamic_level_bytes` (#11321).
+* User-provided `ReadOptions` take effect for more reads of non-`CacheEntryRole::kDataBlock` blocks.
+* For level compaction with `level_compaction_dynamic_level_bytes=true`, RocksDB now drains unnecessary levels through background compaction automatically (#11340). This together with #11321 makes it automatic to migrate other compaction settings to level compaction with `level_compaction_dynamic_level_bytes=true`. In addition, a live DB that becomes smaller will now have unnecessary levels drained which can help to reduce read and space amp.
+* If `CompactRange()` is called with `CompactRangeOptions::bottommost_level_compaction=kForce*` to compact from L0 to L1, RocksDB now will try to do trivial move from L0 to L1 and then do an intra L1 compaction, instead of a L0 to L1 compaction with trivial move disabled (#11375).
+
+### Bug Fixes
+* In the DB::VerifyFileChecksums API, ensure that file system reads of SST files are equal to the readahead_size in ReadOptions, if specified. Previously, each read was 2x the readahead_size.
+* In block cache tracing, fixed some cases of bad hit/miss information (and more) with MultiGet.
+
+### New Features
+* Add experimental `PerfContext` counters `iter_{next|prev|seek}_count` for db iterator, each counting the times of corresponding API being called.
+* Allow runtime changes to whether `WriteBufferManager` allows stall or not by calling `SetAllowStall()`
+* Added statistics tickers BYTES_COMPRESSED_FROM, BYTES_COMPRESSED_TO, BYTES_COMPRESSION_BYPASSED, BYTES_COMPRESSION_REJECTED, NUMBER_BLOCK_COMPRESSION_BYPASSED, and NUMBER_BLOCK_COMPRESSION_REJECTED. Disabled/deprecated histograms BYTES_COMPRESSED and BYTES_DECOMPRESSED, and ticker NUMBER_BLOCK_NOT_COMPRESSED. The new tickers offer more insight into compression ratios, rejected vs. disabled compression, etc. (#11388)
+* New statistics `rocksdb.file.read.{flush|compaction}.micros` that measure read time of block-based SST tables or blob files during flush or compaction.
+
+## 8.1.0 (03/18/2023)
+### Behavior changes
+* Compaction output file cutting logic now considers range tombstone start keys. For example, SST partitioner now may receive ParitionRequest for range tombstone start keys.
+* If the async_io ReadOption is specified for MultiGet or NewIterator on a platform that doesn't support IO uring, the option is ignored and synchronous IO is used.
+
+### Bug Fixes
+* Fixed an issue for backward iteration when user defined timestamp is enabled in combination with BlobDB.
+* Fixed a couple of cases where a Merge operand encountered during iteration wasn't reflected in the `internal_merge_count` PerfContext counter.
+* Fixed a bug in CreateColumnFamilyWithImport()/ExportColumnFamily() which did not support range tombstones (#11252).
+* Fixed a bug where an excluded column family from an atomic flush contains unflushed data that should've been included in this atomic flush (i.e, data of seqno less than the max seqno of this atomic flush), leading to potential data loss in this excluded column family when `WriteOptions::disableWAL == true` (#11148).
+
+### New Features
+* Add statistics rocksdb.secondary.cache.filter.hits, rocksdb.secondary.cache.index.hits, and rocksdb.secondary.cache.data.hits
+* Added a new PerfContext counter `internal_merge_point_lookup_count` which tracks the number of Merge operands applied while serving point lookup queries.
+* Add new statistics rocksdb.table.open.prefetch.tail.read.bytes, rocksdb.table.open.prefetch.tail.{miss|hit}
+* Add support for SecondaryCache with HyperClockCache (`HyperClockCacheOptions` inherits `secondary_cache` option from `ShardedCacheOptions`)
+* Add new db properties `rocksdb.cf-write-stall-stats`, `rocksdb.db-write-stall-stats` and APIs to examine them in a structured way. In particular, users of `GetMapProperty()` with property `kCFWriteStallStats`/`kDBWriteStallStats` can now use the functions in `WriteStallStatsMapKeys` to find stats in the map.
+
+### Public API Changes
+* Changed various functions and features in `Cache` that are mostly relevant to custom implementations or wrappers. Especially, asynchronous lookup functionality is moved from `Lookup()` to a new `StartAsyncLookup()` function.
+
+## 8.0.0 (02/19/2023)
+### Behavior changes
+* `ReadOptions::verify_checksums=false` disables checksum verification for more reads of non-`CacheEntryRole::kDataBlock` blocks.
+* In case of scan with async_io enabled, if posix doesn't support IOUring, Status::NotSupported error will be returned to the users. Initially that error was swallowed and reads were switched to synchronous reads.
+
+### Bug Fixes
+* Fixed a data race on `ColumnFamilyData::flush_reason` caused by concurrent flushes.
+* Fixed an issue in `Get` and `MultiGet` when user-defined timestamps is enabled in combination with BlobDB.
+* Fixed some atypical behaviors for `LockWAL()` such as allowing concurrent/recursive use and not expecting `UnlockWAL()` after non-OK result. See API comments.
+* Fixed a feature interaction bug where for blobs `GetEntity` would expose the blob reference instead of the blob value.
+* Fixed `DisableManualCompaction()` and `CompactRangeOptions::canceled` to cancel compactions even when they are waiting on conflicting compactions to finish
+* Fixed a bug in which a successful `GetMergeOperands()` could transiently return `Status::MergeInProgress()`
+* Return the correct error (Status::NotSupported()) to MultiGet caller when ReadOptions::async_io flag is true and IO uring is not enabled. Previously, Status::Corruption() was being returned when the actual failure was lack of async IO support.
+* Fixed a bug in DB open/recovery from a compressed WAL that was caused due to incorrect handling of certain record fragments with the same offset within a WAL block.
+
+### Feature Removal
+* Remove RocksDB Lite.
+* The feature block_cache_compressed is removed. Statistics related to it are removed too.
+* Remove deprecated Env::LoadEnv(). Use Env::CreateFromString() instead.
+* Remove deprecated FileSystem::Load(). Use FileSystem::CreateFromString() instead.
+* Removed the deprecated version of these utility functions and the corresponding Java bindings: `LoadOptionsFromFile`, `LoadLatestOptions`, `CheckOptionsCompatibility`.
+* Remove the FactoryFunc from the LoadObject method from the Customizable helper methods.
+
+### Public API Changes
+* Moved rarely-needed Cache class definition to new advanced_cache.h, and added a CacheWrapper class to advanced_cache.h. Minor changes to SimCache API definitions.
+* Completely removed the following deprecated/obsolete statistics: the tickers `BLOCK_CACHE_INDEX_BYTES_EVICT`, `BLOCK_CACHE_FILTER_BYTES_EVICT`, `BLOOM_FILTER_MICROS`, `NO_FILE_CLOSES`, `STALL_L0_SLOWDOWN_MICROS`, `STALL_MEMTABLE_COMPACTION_MICROS`, `STALL_L0_NUM_FILES_MICROS`, `RATE_LIMIT_DELAY_MILLIS`, `NO_ITERATORS`, `NUMBER_FILTERED_DELETES`, `WRITE_TIMEDOUT`, `BLOB_DB_GC_NUM_KEYS_OVERWRITTEN`, `BLOB_DB_GC_NUM_KEYS_EXPIRED`, `BLOB_DB_GC_BYTES_OVERWRITTEN`, `BLOB_DB_GC_BYTES_EXPIRED`, `BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT` as well as the histograms `STALL_L0_SLOWDOWN_COUNT`, `STALL_MEMTABLE_COMPACTION_COUNT`, `STALL_L0_NUM_FILES_COUNT`, `HARD_RATE_LIMIT_DELAY_COUNT`, `SOFT_RATE_LIMIT_DELAY_COUNT`, `BLOB_DB_GC_MICROS`, and `NUM_DATA_BLOCKS_READ_PER_LEVEL`. Note that as a result, the C++ enum values of the still supported statistics have changed. Developers are advised to not rely on the actual numeric values.
+* Deprecated IngestExternalFileOptions::write_global_seqno and change default to false. This option only needs to be set to true to generate a DB compatible with RocksDB versions before 5.16.0.
+* Remove deprecated APIs `GetColumnFamilyOptionsFrom{Map|String}(const ColumnFamilyOptions&, ..)`, `GetDBOptionsFrom{Map|String}(const DBOptions&, ..)`, `GetBlockBasedTableOptionsFrom{Map|String}(const BlockBasedTableOptions& table_options, ..)` and `GetPlainTableOptionsFrom{Map|String}(const PlainTableOptions& table_options,..)`.
+* Added a subcode of `Status::Corruption`, `Status::SubCode::kMergeOperatorFailed`, for users to identify corruption failures originating in the merge operator, as opposed to RocksDB's internally identified data corruptions
+
+### Build Changes
+* The `make` build now builds a shared library by default instead of a static library. Use `LIB_MODE=static` to override.
+
+### New Features
+* Compaction filters are now supported for wide-column entities by means of the `FilterV3` API. See the comment of the API for more details.
+* Added `do_not_compress_roles` to `CompressedSecondaryCacheOptions` to disable compression on certain kinds of block. Filter blocks are now not compressed by CompressedSecondaryCache by default.
+* Added a new `MultiGetEntity` API that enables batched wide-column point lookups. See the API comments for more details.
+
+## 7.10.0 (01/23/2023)
 ### Behavior changes
 * Make best-efforts recovery verify SST unique ID before Version construction (#10962)
 * Introduce `epoch_number` and sort L0 files by `epoch_number` instead of `largest_seqno`. `epoch_number` represents the order of a file being flushed or ingested/imported. Compaction output file will be assigned with the minimum `epoch_number` among input files'. For L0, larger `epoch_number` indicates newer L0 file.
-* Compaction output file cutting logic now considers range tombstone start keys. For example, SST partitioner now may receive ParitionRequest for range tombstone start keys.

 ### Bug Fixes
 * Fixed a regression in iterator where range tombstones after `iterate_upper_bound` is processed.
@ -16,13 +171,20 @@
 * Fixed a heap use after free bug in async scan prefetching when the scan thread and another thread try to read and load the same seek block into cache.
 * Fixed a heap use after free in async scan prefetching if dictionary compression is enabled, in which case sync read of the compression dictionary gets mixed with async prefetching
 * Fixed a data race bug of `CompactRange()` under `change_level=true` acts on overlapping range with an ongoing file ingestion for level compaction. This will either result in overlapping file ranges corruption at a certain level caught by `force_consistency_checks=true` or protentially two same keys both with seqno 0 in two different levels (i.e, new data ends up in lower/older level). The latter will be caught by assertion in debug build but go silently and result in read returning wrong result in release build. This fix is general so it also replaced previous fixes to a similar problem for `CompactFiles()` (#4665), general `CompactRange()` and auto compaction (commit 5c64fb6 and 87dfc1d).
+* Fixed a bug in compaction output cutting where small output files were produced because TTL file cutting states were not being updated (#11075).

 ### New Features
 * When an SstPartitionerFactory is configured, CompactRange() now automatically selects for compaction any files overlapping a partition boundary that is in the compaction range, even if no actual entries are in the requested compaction range. With this feature, manual compaction can be used to (re-)establish SST partition points when SstPartitioner changes, without a full compaction.
-
-### New Features
 * Add BackupEngine feature to exclude files from backup that are known to be backed up elsewhere, using `CreateBackupOptions::exclude_files_callback`. To restore the DB, the excluded files must be provided in alternative backup directories using `RestoreOptions::alternate_dirs`.

+### Public API Changes
+* Substantial changes have been made to the Cache class to support internal development goals. Direct use of Cache class members is discouraged and further breaking modifications are expected in the future. SecondaryCache has some related changes and implementations will need to be updated. (Unlike Cache, SecondaryCache is still intended to support user implementations, and disruptive changes will be avoided.) (#10975)
+* Add `MergeOperationOutput::op_failure_scope` for merge operator users to control the blast radius of merge operator failures. Existing merge operator users do not need to make any change to preserve the old behavior
+
+### Performance Improvements
+* Updated xxHash source code, which should improve kXXH3 checksum speed, at least on ARM (#11098).
+* Improved CPU efficiency of DB reads, from block cache access improvements (#10975).
+
 ## 7.9.0 (11/21/2022)
 ### Performance Improvements
 * Fixed an iterator performance regression for delete range users when scanning through a consecutive sequence of range tombstones (#10877).
--- a/INSTALL.md
+++ b/INSTALL.md
@ -20,12 +20,15 @@ There are few options when compiling RocksDB:
 depend on gflags. You will need to have gflags installed to run `make all`. This will compile RocksDB in debug mode. Don't
 use binaries compiled by `make all` in production.

-* By default the binary we produce is optimized for the platform you're compiling on
-(`-march=native` or the equivalent). SSE4.2 will thus be enabled automatically if your
-CPU supports it. To print a warning if your CPU does not support SSE4.2, build with
-`USE_SSE=1 make static_lib` or, if using CMake, `cmake -DFORCE_SSE42=ON`. If you want
-to build a portable binary, add `PORTABLE=1` before your make commands, like this:
-`PORTABLE=1 make static_lib`.
+* By default the binary we produce is optimized for the CPU you're compiling on
+(`-march=native` or the equivalent). To build a binary compatible with the most
+general architecture supported by your CPU and compiler, set `PORTABLE=1` for
+the build, but performance will suffer as many operations benefit from newer
+and wider instructions. In addition to `PORTABLE=0` (default) and `PORTABLE=1`,
+it can be set to an architecture name recognized by your compiler. For example,
+on 64-bit x86, a reasonable compromise is `PORTABLE=haswell` which supports
+many or most of the available optimizations while still being compatible with
+most processors made since roughly 2013.

 ## Dependencies

@ -48,6 +51,11 @@ to build a portable binary, add `PORTABLE=1` before your make commands, like thi
 * If you wish to build the RocksJava static target, then cmake is required for building Snappy.

 * If you wish to run microbench (e.g, `make microbench`, `make ribbon_bench` or `cmake -DWITH_BENCHMARK=1`), Google benchmark >= 1.6.0 is needed.
+* You can do the following to install Google benchmark. These commands are copied from `./build_tools/ubuntu20_image/Dockerfile`:
+
+`$ git clone --depth 1 --branch v1.7.0 https://github.com/google/benchmark.git ~/benchmark`
+
+`$ cd ~/benchmark && mkdir build && cd build && cmake .. -GNinja -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_GTEST_TESTS=0 && ninja && ninja install`

 ## Supported platforms

@ -178,7 +186,7 @@ to build a portable binary, add `PORTABLE=1` before your make commands, like thi
        gmake rocksdbjava

 * **iOS**:
-  * Run: `TARGET_OS=IOS make static_lib`. When building the project which uses rocksdb iOS library, make sure to define two important pre-processing macros: `ROCKSDB_LITE` and `IOS_CROSS_COMPILE`.
+  * Run: `TARGET_OS=IOS make static_lib`. When building the project which uses rocksdb iOS library, make sure to define an important pre-processing macro: `IOS_CROSS_COMPILE`.

 * **Windows** (Visual Studio 2017 to up):
  * Read and follow the instructions at CMakeLists.txt
--- a/124
+++ b/124
@ -44,13 +44,6 @@ quoted_perl_command = $(subst ','\'',$(perl_command))
 # Set the default DEBUG_LEVEL to 1
 DEBUG_LEVEL?=1

-# LIB_MODE says whether or not to use/build "shared" or "static" libraries.
-# Mode "static" means to link against static libraries (.a)
-# Mode "shared" means to link against shared libraries (.so, .sl, .dylib, etc)
-#
-# Set the default LIB_MODE to static
-LIB_MODE?=static
-
 # OBJ_DIR is where the object files reside.  Default to the current directory
 OBJ_DIR?=.

@ -81,29 +74,42 @@ else ifneq ($(filter jtest rocksdbjava%, $(MAKECMDGOALS)),)
 	endif
 endif

-$(info $$DEBUG_LEVEL is ${DEBUG_LEVEL})
-
-# Lite build flag.
-LITE ?= 0
-ifeq ($(LITE), 0)
-ifneq ($(filter -DROCKSDB_LITE,$(OPT)),)
-  # Be backward compatible and support older format where OPT=-DROCKSDB_LITE is
-  # specified instead of LITE=1 on the command line.
-  LITE=1
-endif
-else ifeq ($(LITE), 1)
-ifeq ($(filter -DROCKSDB_LITE,$(OPT)),)
-	OPT += -DROCKSDB_LITE
-endif
+# LIB_MODE says whether or not to use/build "shared" or "static" libraries.
+# Mode "static" means to link against static libraries (.a)
+# Mode "shared" means to link against shared libraries (.so, .sl, .dylib, etc)
+#
+ifeq ($(DEBUG_LEVEL), 0)
+# For optimized, set the default LIB_MODE to static for code size/efficiency
+	LIB_MODE?=static
+else
+# For debug, set the default LIB_MODE to shared for efficient `make check` etc.
+	LIB_MODE?=shared
 endif

+$(info $$DEBUG_LEVEL is $(DEBUG_LEVEL), $$LIB_MODE is $(LIB_MODE))
+
+# Detect what platform we're building on.
+# Export some common variables that might have been passed as Make variables
+# instead of environment variables.
+dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; \
+                  export CXXFLAGS="$(EXTRA_CXXFLAGS)"; \
+                  export LDFLAGS="$(EXTRA_LDFLAGS)"; \
+                  export COMPILE_WITH_ASAN="$(COMPILE_WITH_ASAN)"; \
+                  export COMPILE_WITH_TSAN="$(COMPILE_WITH_TSAN)"; \
+                  export COMPILE_WITH_UBSAN="$(COMPILE_WITH_UBSAN)"; \
+                  export PORTABLE="$(PORTABLE)"; \
+                  export ROCKSDB_NO_FBCODE="$(ROCKSDB_NO_FBCODE)"; \
+                  export USE_CLANG="$(USE_CLANG)"; \
+                  export LIB_MODE="$(LIB_MODE)"; \
+		  export ROCKSDB_CXX_STANDARD="$(ROCKSDB_CXX_STANDARD)"; \
+		  export USE_FOLLY="$(USE_FOLLY)"; \
+                  "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk"))
+# this file is generated by the previous line to set build flags and sources
+include make_config.mk
+
 # Figure out optimize level.
 ifneq ($(DEBUG_LEVEL), 2)
-ifeq ($(LITE), 0)
 	OPTIMIZE_LEVEL ?= -O2
-else
-	OPTIMIZE_LEVEL ?= -Os
-endif
 endif
 # `OPTIMIZE_LEVEL` is empty when the user does not set it and `DEBUG_LEVEL=2`.
 # In that case, the compiler default (`-O0` for gcc and clang) will be used.
@ -236,25 +242,6 @@ am__v_AR_1 =
 AM_LINK = $(AM_V_CCLD)$(CXX) -L. $(patsubst lib%.a, -l%, $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^)) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
 AM_SHARE = $(AM_V_CCLD) $(CXX) $(PLATFORM_SHARED_LDFLAGS)$@ -L. $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^) $(EXEC_LDFLAGS) $(LDFLAGS) -o $@

-# Detect what platform we're building on.
-# Export some common variables that might have been passed as Make variables
-# instead of environment variables.
-dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; \
-                  export CXXFLAGS="$(EXTRA_CXXFLAGS)"; \
-                  export LDFLAGS="$(EXTRA_LDFLAGS)"; \
-                  export COMPILE_WITH_ASAN="$(COMPILE_WITH_ASAN)"; \
-                  export COMPILE_WITH_TSAN="$(COMPILE_WITH_TSAN)"; \
-                  export COMPILE_WITH_UBSAN="$(COMPILE_WITH_UBSAN)"; \
-                  export PORTABLE="$(PORTABLE)"; \
-                  export ROCKSDB_NO_FBCODE="$(ROCKSDB_NO_FBCODE)"; \
-                  export USE_CLANG="$(USE_CLANG)"; \
-                  export LIB_MODE="$(LIB_MODE)"; \
-		  export ROCKSDB_CXX_STANDARD="$(ROCKSDB_CXX_STANDARD)"; \
-		  export USE_FOLLY="$(USE_FOLLY)"; \
-                  "$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk"))
-# this file is generated by the previous line to set build flags and sources
-include make_config.mk
-
 ROCKSDB_PLUGIN_MKS = $(foreach plugin, $(ROCKSDB_PLUGINS), plugin/$(plugin)/*.mk)
 include $(ROCKSDB_PLUGIN_MKS)
 ROCKSDB_PLUGIN_PROTO =ROCKSDB_NAMESPACE::ObjectLibrary\&, const std::string\&
@ -337,13 +324,6 @@ endif
 ifeq ($(PLATFORM), OS_SOLARIS)
 	PLATFORM_CXXFLAGS += -D _GLIBCXX_USE_C99
 endif
-ifneq ($(filter -DROCKSDB_LITE,$(OPT)),)
-	# found
-	CFLAGS += -fno-exceptions
-	CXXFLAGS += -fno-exceptions
-	# LUA is not supported under ROCKSDB_LITE
-	LUA_PATH =
-endif

 ifeq ($(LIB_MODE),shared)
 # So that binaries are executable from build location, in addition to install location
@ -357,8 +337,8 @@ ifneq ($(MACHINE), arm64)
 # linking with jemalloc (as it won't be arm64-compatible) and remove some other options
 # set during platform detection
 DISABLE_JEMALLOC=1
-PLATFORM_CFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CFLAGS))
-PLATFORM_CXXFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CXXFLAGS))
+PLATFORM_CCFLAGS := $(filter-out -march=native, $(PLATFORM_CCFLAGS))
+PLATFORM_CXXFLAGS := $(filter-out -march=native, $(PLATFORM_CXXFLAGS))
 endif
 endif
 endif
@ -559,7 +539,7 @@ endif

 ifdef USE_CLANG
 	# Used by some teams in Facebook
-	WARNING_FLAGS += -Wshift-sign-overflow
+	WARNING_FLAGS += -Wshift-sign-overflow -Wambiguous-reversed-operator
 endif

 ifeq ($(PLATFORM), OS_OPENBSD)
@ -1082,13 +1062,11 @@ check: all
 	rm -rf $(TEST_TMPDIR)
 ifneq ($(PLATFORM), OS_AIX)
 	$(PYTHON) tools/check_all_python.py
-ifeq ($(filter -DROCKSDB_LITE,$(OPT)),)
 ifndef ASSERT_STATUS_CHECKED # not yet working with these tests
 	$(PYTHON) tools/ldb_test.py
 	sh tools/rocksdb_dump_test.sh
 endif
 endif
-endif
 ifndef SKIP_FORMAT_BUCK_CHECKS
 	$(MAKE) check-format
 	$(MAKE) check-buck-targets
@ -1244,9 +1222,9 @@ clean: clean-ext-libraries-all clean-rocks clean-rocksjava
 clean-not-downloaded: clean-ext-libraries-bin clean-rocks clean-not-downloaded-rocksjava

 clean-rocks:
-	echo shared=$(ALL_SHARED_LIBS)
-	echo static=$(ALL_STATIC_LIBS)
-	rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(PARALLEL_TEST) $(ALL_STATIC_LIBS) $(ALL_SHARED_LIBS) $(MICROBENCHS)
+# Not practical to exactly match all versions/variants in naming (e.g. debug or not)
+	rm -f ${LIBNAME}*.so* ${LIBNAME}*.a
+	rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(PARALLEL_TEST) $(MICROBENCHS)
 	rm -rf $(CLEAN_FILES) ios-x86 ios-arm scan_build_report
 	$(FIND) . -name "*.[oda]" -exec rm -f {} \;
 	$(FIND) . -type f \( -name "*.gcda" -o -name "*.gcno" \) -exec rm -f {} \;
@ -1439,6 +1417,9 @@ thread_local_test: $(OBJ_DIR)/util/thread_local_test.o $(TEST_LIBRARY) $(LIBRARY
 work_queue_test: $(OBJ_DIR)/util/work_queue_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)

+udt_util_test: $(OBJ_DIR)/util/udt_util_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
 corruption_test: $(OBJ_DIR)/db/corruption_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)

@ -1502,6 +1483,9 @@ db_compaction_filter_test: $(OBJ_DIR)/db/db_compaction_filter_test.o $(TEST_LIBR
 db_compaction_test: $(OBJ_DIR)/db/db_compaction_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)

+db_clip_test: $(OBJ_DIR)/db/db_clip_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
 db_dynamic_level_test: $(OBJ_DIR)/db/db_dynamic_level_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)

@ -2070,7 +2054,7 @@ JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux
 ifeq ($(PLATFORM), OS_SOLARIS)
 	ARCH := $(shell isainfo -b)
 else ifeq ($(PLATFORM), OS_OPENBSD)
-	ifneq (,$(filter amd64 ppc64 ppc64le s390x arm64 aarch64 sparc64, $(MACHINE)))
+	ifneq (,$(filter amd64 ppc64 ppc64le s390x arm64 aarch64 sparc64 loongarch64, $(MACHINE)))
 		ARCH := 64
 	else
 		ARCH := 32
@ -2091,7 +2075,7 @@ ifneq ($(origin JNI_LIBC), undefined)
 endif

 ifeq (,$(ROCKSDBJNILIB))
-ifneq (,$(filter ppc% s390x arm64 aarch64 sparc64, $(MACHINE)))
+ifneq (,$(filter ppc% s390x arm64 aarch64 sparc64 loongarch64, $(MACHINE)))
 	ROCKSDBJNILIB = librocksdbjni-linux-$(MACHINE)$(JNI_LIBC_POSTFIX).so
 else
 	ROCKSDBJNILIB = librocksdbjni-linux$(ARCH)$(JNI_LIBC_POSTFIX).so
@ -2457,6 +2441,8 @@ checkout_folly:
 	@# NOTE: this hack is required for gcc in some cases
 	perl -pi -e 's/(__has_include.<experimental.memory_resource>.)/__cpp_rtti && $$1/' third-party/folly/folly/memory/MemoryResource.h

+CXX_M_FLAGS = $(filter -m%, $(CXXFLAGS))
+
 build_folly:
 	FOLLY_INST_PATH=`cd third-party/folly; $(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir`; \
 	if [ "$$FOLLY_INST_PATH" ]; then \
@ -2467,8 +2453,8 @@ build_folly:
 	fi
 	# Restore the original version of Invoke.h with boost dependency
 	cd third-party/folly && ${GIT_COMMAND} checkout folly/functional/Invoke.h
-	cd third-party/folly && MAYBE_AVX2=`echo $(CXXFLAGS) | grep -o -- -DHAVE_AVX2 | sed 's/-DHAVE_AVX2/-mavx2/g' || true` && \
-		CXXFLAGS=" $$MAYBE_AVX2 -DHAVE_CXX11_ATOMIC " $(PYTHON) build/fbcode_builder/getdeps.py build --no-tests
+	cd third-party/folly && \
+		CXXFLAGS=" $(CXX_M_FLAGS) -DHAVE_CXX11_ATOMIC " $(PYTHON) build/fbcode_builder/getdeps.py build --no-tests

 # ---------------------------------------------------------------------------
 #   Build size testing
@ -2489,18 +2475,6 @@ build_size:
 	$(REPORT_BUILD_STATISTIC) rocksdb.build_size.shared_lib $$(stat --printf="%s" `readlink -f librocksdb.so`)
 	strip `readlink -f librocksdb.so`
 	$(REPORT_BUILD_STATISTIC) rocksdb.build_size.shared_lib_stripped $$(stat --printf="%s" `readlink -f librocksdb.so`)
-	# === lite build, static ===
-	$(MAKE) clean
-	$(MAKE) LITE=1 static_lib
-	$(REPORT_BUILD_STATISTIC) rocksdb.build_size.static_lib_lite $$(stat --printf="%s" librocksdb.a)
-	strip librocksdb.a
-	$(REPORT_BUILD_STATISTIC) rocksdb.build_size.static_lib_lite_stripped $$(stat --printf="%s" librocksdb.a)
-	# === lite build, shared ===
-	$(MAKE) clean
-	$(MAKE) LITE=1 shared_lib
-	$(REPORT_BUILD_STATISTIC) rocksdb.build_size.shared_lib_lite $$(stat --printf="%s" `readlink -f librocksdb.so`)
-	strip `readlink -f librocksdb.so`
-	$(REPORT_BUILD_STATISTIC) rocksdb.build_size.shared_lib_lite_stripped $$(stat --printf="%s" `readlink -f librocksdb.so`)

 # ---------------------------------------------------------------------------
 #  	Platform-specific compilation
--- a/PLUGINS.md
+++ b/PLUGINS.md
@ -5,3 +5,4 @@ This is the list of all known third-party plugins for RocksDB. If something is m
 * [ZenFS](https://github.com/westerndigitalcorporation/zenfs): a file system for zoned block devices
 * [RADOS](https://github.com/riversand963/rocksdb-rados-env): an Env used for interacting with RADOS. Migrated from RocksDB main repo.
 * [PMEM](https://github.com/pmem/pmem-rocksdb-plugin): a collection of plugins to enable Persistent Memory on RocksDB.
+* [IPPCP](https://github.com/intel/ippcp-plugin-rocksdb): a plugin to enable encryption on RocksDB based on Intel optimized open source IPP-Crypto library.
--- a/README.md
+++ b/README.md
@ -1,8 +1,6 @@
 ## RocksDB: A Persistent Key-Value Store for Flash and RAM Storage

 [![CircleCI Status](https://circleci.com/gh/facebook/rocksdb.svg?style=svg)](https://circleci.com/gh/facebook/rocksdb)
-[![Appveyor Build status](https://ci.appveyor.com/api/projects/status/fbgfu0so3afcno78/branch/main?svg=true)](https://ci.appveyor.com/project/Facebook/rocksdb/branch/main)
-[![PPC64le Build Status](http://140-211-168-68-openstack.osuosl.org:8080/buildStatus/icon?job=rocksdb&style=plastic)](http://140-211-168-68-openstack.osuosl.org:8080/job/rocksdb)

 RocksDB is developed and maintained by Facebook Database Engineering Team.
 It is built on earlier work on [LevelDB](https://github.com/google/leveldb) by Sanjay Ghemawat (sanjay@google.com)
--- a/ROCKSDB_LITE.md
+++ b/ROCKSDB_LITE.md
@ -1,21 +0,0 @@
-# RocksDBLite
-
-RocksDBLite is a project focused on mobile use cases, which don't need a lot of fancy things we've built for server workloads and they are very sensitive to binary size. For that reason, we added a compile flag ROCKSDB_LITE that comments out a lot of the nonessential code and keeps the binary lean.
-
-Some examples of the features disabled by ROCKSDB_LITE:
-* compiled-in support for LDB tool
-* No backup engine
-* No support for replication (which we provide in form of TransactionalIterator)
-* No advanced monitoring tools
-* No special-purpose memtables that are highly optimized for specific use cases
-* No Transactions
-
-When adding a new big feature to RocksDB, please add ROCKSDB_LITE compile guard if:
-* Nobody from mobile really needs your feature,
-* Your feature is adding a lot of weight to the binary.
-
-Don't add ROCKSDB_LITE compile guard if:
-* It would introduce a lot of code complexity. Compile guards make code harder to read. It's a trade-off.
-* Your feature is not adding a lot of weight.
-
-If unsure, ask. :)
--- a/362
+++ b/362
@ -11,6 +11,7 @@ load("//rocks/buckifier:defs.bzl", "cpp_library_wrapper","rocks_cpp_library_wrap
 cpp_library_wrapper(name="rocksdb_lib", srcs=[
        "cache/cache.cc",
        "cache/cache_entry_roles.cc",
+        "cache/cache_helpers.cc",
        "cache/cache_key.cc",
        "cache/cache_reservation_manager.cc",
        "cache/charged_cache.cc",
@ -18,6 +19,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
        "cache/compressed_secondary_cache.cc",
        "cache/lru_cache.cc",
        "cache/secondary_cache.cc",
+        "cache/secondary_cache_adapter.cc",
        "cache/sharded_cache.cc",
        "db/arena_wrapped_db_iter.cc",
        "db/blob/blob_contents.cc",
@ -104,6 +106,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
        "db/write_batch.cc",
        "db/write_batch_base.cc",
        "db/write_controller.cc",
+        "db/write_stall_stats.cc",
        "db/write_thread.cc",
        "env/composite_env.cc",
        "env/env.cc",
@ -180,6 +183,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
        "table/block_based/block_based_table_iterator.cc",
        "table/block_based/block_based_table_reader.cc",
        "table/block_based/block_builder.cc",
+        "table/block_based/block_cache.cc",
        "table/block_based/block_prefetcher.cc",
        "table/block_based/block_prefix_index.cc",
        "table/block_based/data_block_footer.cc",
@ -246,6 +250,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
        "util/concurrent_task_limiter_impl.cc",
        "util/crc32c.cc",
        "util/crc32c_arm64.cc",
+        "util/data_structure.cc",
        "util/dynamic_bloom.cc",
        "util/file_checksum_helper.cc",
        "util/hash.cc",
@ -259,6 +264,8 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
        "util/string_util.cc",
        "util/thread_local.cc",
        "util/threadpool_imp.cc",
+        "util/udt_util.cc",
+        "util/write_batch_util.cc",
        "util/xxhash.cc",
        "utilities/agg_merge/agg_merge.cc",
        "utilities/backup/backup_engine.cc",
@ -349,352 +356,14 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
        "//folly/synchronization:distributed_mutex",
    ], headers=None, link_whole=False, extra_test_libs=False)

-cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[
-        "cache/cache.cc",
-        "cache/cache_entry_roles.cc",
-        "cache/cache_key.cc",
-        "cache/cache_reservation_manager.cc",
-        "cache/charged_cache.cc",
-        "cache/clock_cache.cc",
-        "cache/compressed_secondary_cache.cc",
-        "cache/lru_cache.cc",
-        "cache/secondary_cache.cc",
-        "cache/sharded_cache.cc",
-        "db/arena_wrapped_db_iter.cc",
-        "db/blob/blob_contents.cc",
-        "db/blob/blob_fetcher.cc",
-        "db/blob/blob_file_addition.cc",
-        "db/blob/blob_file_builder.cc",
-        "db/blob/blob_file_cache.cc",
-        "db/blob/blob_file_garbage.cc",
-        "db/blob/blob_file_meta.cc",
-        "db/blob/blob_file_reader.cc",
-        "db/blob/blob_garbage_meter.cc",
-        "db/blob/blob_log_format.cc",
-        "db/blob/blob_log_sequential_reader.cc",
-        "db/blob/blob_log_writer.cc",
-        "db/blob/blob_source.cc",
-        "db/blob/prefetch_buffer_collection.cc",
-        "db/builder.cc",
-        "db/c.cc",
-        "db/column_family.cc",
-        "db/compaction/compaction.cc",
-        "db/compaction/compaction_iterator.cc",
-        "db/compaction/compaction_job.cc",
-        "db/compaction/compaction_outputs.cc",
-        "db/compaction/compaction_picker.cc",
-        "db/compaction/compaction_picker_fifo.cc",
-        "db/compaction/compaction_picker_level.cc",
-        "db/compaction/compaction_picker_universal.cc",
-        "db/compaction/compaction_service_job.cc",
-        "db/compaction/compaction_state.cc",
-        "db/compaction/sst_partitioner.cc",
-        "db/compaction/subcompaction_state.cc",
-        "db/convenience.cc",
-        "db/db_filesnapshot.cc",
-        "db/db_impl/compacted_db_impl.cc",
-        "db/db_impl/db_impl.cc",
-        "db/db_impl/db_impl_compaction_flush.cc",
-        "db/db_impl/db_impl_debug.cc",
-        "db/db_impl/db_impl_experimental.cc",
-        "db/db_impl/db_impl_files.cc",
-        "db/db_impl/db_impl_open.cc",
-        "db/db_impl/db_impl_readonly.cc",
-        "db/db_impl/db_impl_secondary.cc",
-        "db/db_impl/db_impl_write.cc",
-        "db/db_info_dumper.cc",
-        "db/db_iter.cc",
-        "db/dbformat.cc",
-        "db/error_handler.cc",
-        "db/event_helpers.cc",
-        "db/experimental.cc",
-        "db/external_sst_file_ingestion_job.cc",
-        "db/file_indexer.cc",
-        "db/flush_job.cc",
-        "db/flush_scheduler.cc",
-        "db/forward_iterator.cc",
-        "db/import_column_family_job.cc",
-        "db/internal_stats.cc",
-        "db/log_reader.cc",
-        "db/log_writer.cc",
-        "db/logs_with_prep_tracker.cc",
-        "db/malloc_stats.cc",
-        "db/memtable.cc",
-        "db/memtable_list.cc",
-        "db/merge_helper.cc",
-        "db/merge_operator.cc",
-        "db/output_validator.cc",
-        "db/periodic_task_scheduler.cc",
-        "db/range_del_aggregator.cc",
-        "db/range_tombstone_fragmenter.cc",
-        "db/repair.cc",
-        "db/seqno_to_time_mapping.cc",
-        "db/snapshot_impl.cc",
-        "db/table_cache.cc",
-        "db/table_properties_collector.cc",
-        "db/transaction_log_impl.cc",
-        "db/trim_history_scheduler.cc",
-        "db/version_builder.cc",
-        "db/version_edit.cc",
-        "db/version_edit_handler.cc",
-        "db/version_set.cc",
-        "db/wal_edit.cc",
-        "db/wal_manager.cc",
-        "db/wide/wide_column_serialization.cc",
-        "db/wide/wide_columns.cc",
-        "db/write_batch.cc",
-        "db/write_batch_base.cc",
-        "db/write_controller.cc",
-        "db/write_thread.cc",
-        "env/composite_env.cc",
-        "env/env.cc",
-        "env/env_chroot.cc",
-        "env/env_encryption.cc",
-        "env/env_posix.cc",
-        "env/file_system.cc",
-        "env/file_system_tracer.cc",
-        "env/fs_posix.cc",
-        "env/fs_remap.cc",
-        "env/io_posix.cc",
-        "env/mock_env.cc",
-        "env/unique_id_gen.cc",
-        "file/delete_scheduler.cc",
-        "file/file_prefetch_buffer.cc",
-        "file/file_util.cc",
-        "file/filename.cc",
-        "file/line_file_reader.cc",
-        "file/random_access_file_reader.cc",
-        "file/read_write_util.cc",
-        "file/readahead_raf.cc",
-        "file/sequence_file_reader.cc",
-        "file/sst_file_manager_impl.cc",
-        "file/writable_file_writer.cc",
-        "logging/auto_roll_logger.cc",
-        "logging/event_logger.cc",
-        "logging/log_buffer.cc",
-        "memory/arena.cc",
-        "memory/concurrent_arena.cc",
-        "memory/jemalloc_nodump_allocator.cc",
-        "memory/memkind_kmem_allocator.cc",
-        "memory/memory_allocator.cc",
-        "memtable/alloc_tracker.cc",
-        "memtable/hash_linklist_rep.cc",
-        "memtable/hash_skiplist_rep.cc",
-        "memtable/skiplistrep.cc",
-        "memtable/vectorrep.cc",
-        "memtable/write_buffer_manager.cc",
-        "monitoring/histogram.cc",
-        "monitoring/histogram_windowing.cc",
-        "monitoring/in_memory_stats_history.cc",
-        "monitoring/instrumented_mutex.cc",
-        "monitoring/iostats_context.cc",
-        "monitoring/perf_context.cc",
-        "monitoring/perf_level.cc",
-        "monitoring/persistent_stats_history.cc",
-        "monitoring/statistics.cc",
-        "monitoring/thread_status_impl.cc",
-        "monitoring/thread_status_updater.cc",
-        "monitoring/thread_status_updater_debug.cc",
-        "monitoring/thread_status_util.cc",
-        "monitoring/thread_status_util_debug.cc",
-        "options/cf_options.cc",
-        "options/configurable.cc",
-        "options/customizable.cc",
-        "options/db_options.cc",
-        "options/options.cc",
-        "options/options_helper.cc",
-        "options/options_parser.cc",
-        "port/mmap.cc",
-        "port/port_posix.cc",
-        "port/stack_trace.cc",
-        "port/win/env_default.cc",
-        "port/win/env_win.cc",
-        "port/win/io_win.cc",
-        "port/win/port_win.cc",
-        "port/win/win_logger.cc",
-        "port/win/win_thread.cc",
-        "table/adaptive/adaptive_table_factory.cc",
-        "table/block_based/binary_search_index_reader.cc",
-        "table/block_based/block.cc",
-        "table/block_based/block_based_table_builder.cc",
-        "table/block_based/block_based_table_factory.cc",
-        "table/block_based/block_based_table_iterator.cc",
-        "table/block_based/block_based_table_reader.cc",
-        "table/block_based/block_builder.cc",
-        "table/block_based/block_prefetcher.cc",
-        "table/block_based/block_prefix_index.cc",
-        "table/block_based/data_block_footer.cc",
-        "table/block_based/data_block_hash_index.cc",
-        "table/block_based/filter_block_reader_common.cc",
-        "table/block_based/filter_policy.cc",
-        "table/block_based/flush_block_policy.cc",
-        "table/block_based/full_filter_block.cc",
-        "table/block_based/hash_index_reader.cc",
-        "table/block_based/index_builder.cc",
-        "table/block_based/index_reader_common.cc",
-        "table/block_based/parsed_full_filter_block.cc",
-        "table/block_based/partitioned_filter_block.cc",
-        "table/block_based/partitioned_index_iterator.cc",
-        "table/block_based/partitioned_index_reader.cc",
-        "table/block_based/reader_common.cc",
-        "table/block_based/uncompression_dict_reader.cc",
-        "table/block_fetcher.cc",
-        "table/compaction_merging_iterator.cc",
-        "table/cuckoo/cuckoo_table_builder.cc",
-        "table/cuckoo/cuckoo_table_factory.cc",
-        "table/cuckoo/cuckoo_table_reader.cc",
-        "table/format.cc",
-        "table/get_context.cc",
-        "table/iterator.cc",
-        "table/merging_iterator.cc",
-        "table/meta_blocks.cc",
-        "table/persistent_cache_helper.cc",
-        "table/plain/plain_table_bloom.cc",
-        "table/plain/plain_table_builder.cc",
-        "table/plain/plain_table_factory.cc",
-        "table/plain/plain_table_index.cc",
-        "table/plain/plain_table_key_coding.cc",
-        "table/plain/plain_table_reader.cc",
-        "table/sst_file_dumper.cc",
-        "table/sst_file_reader.cc",
-        "table/sst_file_writer.cc",
-        "table/table_factory.cc",
-        "table/table_properties.cc",
-        "table/two_level_iterator.cc",
-        "table/unique_id.cc",
-        "test_util/sync_point.cc",
-        "test_util/sync_point_impl.cc",
-        "test_util/transaction_test_util.cc",
-        "tools/dump/db_dump_tool.cc",
-        "tools/io_tracer_parser_tool.cc",
-        "tools/ldb_cmd.cc",
-        "tools/ldb_tool.cc",
-        "tools/sst_dump_tool.cc",
-        "trace_replay/block_cache_tracer.cc",
-        "trace_replay/io_tracer.cc",
-        "trace_replay/trace_record.cc",
-        "trace_replay/trace_record_handler.cc",
-        "trace_replay/trace_record_result.cc",
-        "trace_replay/trace_replay.cc",
-        "util/async_file_reader.cc",
-        "util/build_version.cc",
-        "util/cleanable.cc",
-        "util/coding.cc",
-        "util/compaction_job_stats_impl.cc",
-        "util/comparator.cc",
-        "util/compression.cc",
-        "util/compression_context_cache.cc",
-        "util/concurrent_task_limiter_impl.cc",
-        "util/crc32c.cc",
-        "util/crc32c_arm64.cc",
-        "util/dynamic_bloom.cc",
-        "util/file_checksum_helper.cc",
-        "util/hash.cc",
-        "util/murmurhash.cc",
-        "util/random.cc",
-        "util/rate_limiter.cc",
-        "util/ribbon_config.cc",
-        "util/slice.cc",
-        "util/status.cc",
-        "util/stderr_logger.cc",
-        "util/string_util.cc",
-        "util/thread_local.cc",
-        "util/threadpool_imp.cc",
-        "util/xxhash.cc",
-        "utilities/agg_merge/agg_merge.cc",
-        "utilities/backup/backup_engine.cc",
-        "utilities/blob_db/blob_compaction_filter.cc",
-        "utilities/blob_db/blob_db.cc",
-        "utilities/blob_db/blob_db_impl.cc",
-        "utilities/blob_db/blob_db_impl_filesnapshot.cc",
-        "utilities/blob_db/blob_dump_tool.cc",
-        "utilities/blob_db/blob_file.cc",
-        "utilities/cache_dump_load.cc",
-        "utilities/cache_dump_load_impl.cc",
-        "utilities/cassandra/cassandra_compaction_filter.cc",
-        "utilities/cassandra/format.cc",
-        "utilities/cassandra/merge_operator.cc",
-        "utilities/checkpoint/checkpoint_impl.cc",
-        "utilities/compaction_filters.cc",
-        "utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc",
-        "utilities/convenience/info_log_finder.cc",
-        "utilities/counted_fs.cc",
-        "utilities/debug.cc",
-        "utilities/env_mirror.cc",
-        "utilities/env_timed.cc",
-        "utilities/fault_injection_env.cc",
-        "utilities/fault_injection_fs.cc",
-        "utilities/fault_injection_secondary_cache.cc",
-        "utilities/leveldb_options/leveldb_options.cc",
-        "utilities/memory/memory_util.cc",
-        "utilities/merge_operators.cc",
-        "utilities/merge_operators/bytesxor.cc",
-        "utilities/merge_operators/max.cc",
-        "utilities/merge_operators/put.cc",
-        "utilities/merge_operators/sortlist.cc",
-        "utilities/merge_operators/string_append/stringappend.cc",
-        "utilities/merge_operators/string_append/stringappend2.cc",
-        "utilities/merge_operators/uint64add.cc",
-        "utilities/object_registry.cc",
-        "utilities/option_change_migration/option_change_migration.cc",
-        "utilities/options/options_util.cc",
-        "utilities/persistent_cache/block_cache_tier.cc",
-        "utilities/persistent_cache/block_cache_tier_file.cc",
-        "utilities/persistent_cache/block_cache_tier_metadata.cc",
-        "utilities/persistent_cache/persistent_cache_tier.cc",
-        "utilities/persistent_cache/volatile_tier_impl.cc",
-        "utilities/simulator_cache/cache_simulator.cc",
-        "utilities/simulator_cache/sim_cache.cc",
-        "utilities/table_properties_collectors/compact_on_deletion_collector.cc",
-        "utilities/trace/file_trace_reader_writer.cc",
-        "utilities/trace/replayer_impl.cc",
-        "utilities/transactions/lock/lock_manager.cc",
-        "utilities/transactions/lock/point/point_lock_manager.cc",
-        "utilities/transactions/lock/point/point_lock_tracker.cc",
-        "utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc",
-        "utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc",
-        "utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc",
-        "utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc",
-        "utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc",
-        "utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc",
-        "utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc",
-        "utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc",
-        "utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc",
-        "utilities/transactions/lock/range/range_tree/lib/standalone_port.cc",
-        "utilities/transactions/lock/range/range_tree/lib/util/dbt.cc",
-        "utilities/transactions/lock/range/range_tree/lib/util/memarena.cc",
-        "utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc",
-        "utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc",
-        "utilities/transactions/optimistic_transaction.cc",
-        "utilities/transactions/optimistic_transaction_db_impl.cc",
-        "utilities/transactions/pessimistic_transaction.cc",
-        "utilities/transactions/pessimistic_transaction_db.cc",
-        "utilities/transactions/snapshot_checker.cc",
-        "utilities/transactions/transaction_base.cc",
-        "utilities/transactions/transaction_db_mutex_impl.cc",
-        "utilities/transactions/transaction_util.cc",
-        "utilities/transactions/write_prepared_txn.cc",
-        "utilities/transactions/write_prepared_txn_db.cc",
-        "utilities/transactions/write_unprepared_txn.cc",
-        "utilities/transactions/write_unprepared_txn_db.cc",
-        "utilities/ttl/db_ttl_impl.cc",
-        "utilities/wal_filter.cc",
-        "utilities/write_batch_with_index/write_batch_with_index.cc",
-        "utilities/write_batch_with_index/write_batch_with_index_internal.cc",
-    ], deps=[
-        "//folly/container:f14_hash",
-        "//folly/experimental/coro:blocking_wait",
-        "//folly/experimental/coro:collect",
-        "//folly/experimental/coro:coroutine",
-        "//folly/experimental/coro:task",
-        "//folly/synchronization:distributed_mutex",
-    ], headers=None, link_whole=True, extra_test_libs=False)
+cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[], deps=[":rocksdb_lib"], headers=None, link_whole=True, extra_test_libs=False)

 cpp_library_wrapper(name="rocksdb_test_lib", srcs=[
        "db/db_test_util.cc",
        "db/db_with_timestamp_test_util.cc",
        "table/mock_table.cc",
        "test_util/mock_time_env.cc",
+        "test_util/secondary_cache_test_util.cc",
        "test_util/testharness.cc",
        "test_util/testutil.cc",
        "tools/block_cache_analyzer/block_cache_trace_analyzer.cc",
@ -725,6 +394,7 @@ rocks_cpp_library_wrapper(name="rocksdb_stress_lib", srcs=[
        "db_stress_tool/db_stress_test_base.cc",
        "db_stress_tool/db_stress_tool.cc",
        "db_stress_tool/expected_state.cc",
+        "db_stress_tool/expected_value.cc",
        "db_stress_tool/multi_ops_txns_stress.cc",
        "db_stress_tool/no_batched_ops_stress.cc",
        "test_util/testutil.cc",
@ -5082,6 +4752,12 @@ cpp_unittest_wrapper(name="db_bloom_filter_test",
            extra_compiler_flags=[])


+cpp_unittest_wrapper(name="db_clip_test",
+            srcs=["db/db_clip_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
 cpp_unittest_wrapper(name="db_compaction_filter_test",
            srcs=["db/db_compaction_filter_test.cc"],
            deps=[":rocksdb_test_lib"],
@ -5834,6 +5510,12 @@ cpp_unittest_wrapper(name="ttl_test",
            extra_compiler_flags=[])


+cpp_unittest_wrapper(name="udt_util_test",
+            srcs=["util/udt_util_test.cc"],
+            deps=[":rocksdb_test_lib"],
+            extra_compiler_flags=[])
+
+
 cpp_unittest_wrapper(name="util_merge_operators_test",
            srcs=["utilities/util_merge_operators_test.cc"],
            deps=[":rocksdb_test_lib"],
--- a/USERS.md
+++ b/USERS.md
@ -15,6 +15,28 @@ At Facebook, we use RocksDB as storage engines in multiple data management servi

 [2] https://code.facebook.com/posts/357056558062811/logdevice-a-distributed-data-store-for-logs/

+## Bilibili
+[Bilibili](https://www.bilibili.com) [uses](https://www.alluxio.io/blog/when-ai-meets-alluxio-at-bilibili-building-an-efficient-ai-platform-for-data-preprocessing-and-model-training/) Alluxio to speed up its ML training workloads, and Alluxio uses RocksDB to store its filesystem metadata, so Bilibili uses RocksDB.
+
+Bilibili's [real-time platform](https://www.alibabacloud.com/blog/architecture-and-practices-of-bilibilis-real-time-platform_596676) uses Flink, and uses RocksDB as Flink's state store.
+
+## TikTok
+TikTok, or its parent company ByteDance, uses RocksDB as the storage engine for some storage systems, such as its distributed graph database [ByteGraph](https://vldb.org/pvldb/vol15/p3306-li.pdf).
+
+Also, TikTok uses [Alluxio](https://www.alluxio.io) to [speed up Presto queries](https://www.alluxio.io/resources/videos/improving-presto-performance-with-alluxio-at-tiktok/), and Alluxio stores the files' metadata in RocksDB.
+
+## FoundationDB
+[FoundationDB](https://www.foundationdb.org/) [uses](https://github.com/apple/foundationdb/blob/377f1f692da6ab2fe5bdac57035651db3e5fb66d/fdbserver/KeyValueStoreRocksDB.actor.cpp) RocksDB to implement a [key-value store interface](https://github.com/apple/foundationdb/blob/377f1f692da6ab2fe5bdac57035651db3e5fb66d/fdbserver/KeyValueStoreRocksDB.actor.cpp#L1127) in its server backend.
+
+## Apple
+Apple [uses](https://opensource.apple.com/projects/foundationdb/) FoundationDB, so it also uses RocksDB.
+
+## Snowflake
+Snowflake [uses](https://www.snowflake.com/blog/how-foundationdb-powers-snowflake-metadata-forward/) FoundationDB, so it also uses RocksDB.
+
+## Microsoft
+The Bing search engine from Microsoft uses RocksDB as the storage engine for its web data platform: https://blogs.bing.com/Engineering-Blog/october-2021/RocksDB-in-Microsoft-Bing
+
 ## LinkedIn
 Two different use cases at Linkedin are using RocksDB as a storage engine:

@ -26,6 +48,9 @@ Learn more about those use cases in a Tech Talk by Ankit Gupta and Naveen Somasu
 ## Yahoo
 Yahoo is using RocksDB as a storage engine for their biggest distributed data store Sherpa. Learn more about it here: http://yahooeng.tumblr.com/post/120730204806/sherpa-scales-new-heights

+## Tencent
+[PaxosStore](https://github.com/Tencent/paxosstore) is a distributed database supporting WeChat. It uses RocksDB as its storage engine.
+
 ## Baidu
 [Apache Doris](http://doris.apache.org/master/en/) is a MPP analytical database engine released by Baidu. It [uses RocksDB](http://doris.apache.org/master/en/administrator-guide/operation/tablet-meta-tool.html) to manage its tablet's metadata.

@ -79,9 +104,18 @@ quasardb uses a heavily tuned RocksDB as its persistence layer.
 ## TiKV
 [TiKV](https://github.com/pingcap/tikv) is a GEO-replicated, high-performance, distributed, transactional key-value database. TiKV is powered by Rust and Raft. TiKV uses RocksDB as its persistence layer.

+## TiDB
+[TiDB](https://github.com/pingcap/tidb) uses the TiKV distributed key-value database, so it uses RocksDB.
+
+## PingCAP
+[PingCAP](https://www.pingcap.com/) is the company behind TiDB; its cloud database service uses RocksDB.
+
 ## Apache Spark
 [Spark Structured Streaming](https://docs.databricks.com/structured-streaming/rocksdb-state-store.html) uses RocksDB as the local state store.

+## Databricks
+[Databricks](https://www.databricks.com/) [replaces AWS RDS with TiDB](https://www.pingcap.com/case-study/how-databricks-tackles-the-scalability-limit-with-a-mysql-alternative/) for scalability, so it uses RocksDB.
+
 ## Apache Flink
 [Apache Flink](https://flink.apache.org/news/2016/03/08/release-1.0.0.html) uses RocksDB to store state locally on a machine.

--- a/buckifier/buckify_rocksdb.py
+++ b/buckifier/buckify_rocksdb.py
@ -26,7 +26,7 @@ from util import ColorString
 # $python3 buckifier/buckify_rocksdb.py \
 #        '{"fake": {
 #                      "extra_deps": [":test_dep", "//fakes/module:mock1"],
-#                      "extra_compiler_flags": ["-DROCKSDB_LITE", "-Os"]
+#                      "extra_compiler_flags": ["-DFOO_BAR", "-Os"]
 #                  }
 #         }'
 # (Generated TARGETS file has test_dep and mock1 as dependencies for RocksDB
@ -154,16 +154,9 @@ def generate_targets(repo_path, deps_map):
    # rocksdb_whole_archive_lib
    TARGETS.add_library(
        "rocksdb_whole_archive_lib",
-        src_mk["LIB_SOURCES"] +
-        # always add range_tree, it's only excluded on ppc64, which we don't use internally
-        src_mk["RANGE_TREE_SOURCES"] + src_mk["TOOL_LIB_SOURCES"],
+        [],
        deps=[
-            "//folly/container:f14_hash",
-            "//folly/experimental/coro:blocking_wait",
-            "//folly/experimental/coro:collect",
-            "//folly/experimental/coro:coroutine",
-            "//folly/experimental/coro:task",
-            "//folly/synchronization:distributed_mutex",
+            ":rocksdb_lib",
        ],
        headers=None,
        extra_external_deps="",
--- a/build_tools/build_detect_platform
+++ b/build_tools/build_detect_platform
@ -63,13 +63,7 @@ if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then
    if [ "$LIB_MODE" == "shared" ]; then
      PIC_BUILD=1
    fi
-    if [ -n "$ROCKSDB_FBCODE_BUILD_WITH_PLATFORM010" ]; then
-      source "$PWD/build_tools/fbcode_config_platform010.sh"
-    elif [ -n "$ROCKSDB_FBCODE_BUILD_WITH_PLATFORM009" ]; then
-      source "$PWD/build_tools/fbcode_config_platform009.sh"
-    else
-      source "$PWD/build_tools/fbcode_config_platform009.sh"
-    fi
+    source "$PWD/build_tools/fbcode_config_platform010.sh"
 fi

 # Delete existing output, if it exists
@ -154,7 +148,7 @@ case "$TARGET_OS" in
        ;;
    IOS)
        PLATFORM=IOS
-        COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE -DROCKSDB_LITE"
+        COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE "
        PLATFORM_SHARED_EXT=dylib
        PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
        CROSS_COMPILE=true
@ -425,7 +419,7 @@ EOF

    if ! test $ROCKSDB_DISABLE_JEMALLOC; then
        # Test whether jemalloc is available
-        if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o -ljemalloc \
+        if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS $LDFLAGS -x c++ - -o test.o -ljemalloc \
          2>/dev/null; then
            # This will enable some preprocessor identifiers in the Makefile
            JEMALLOC=1
@ -434,12 +428,19 @@ EOF
            WITH_JEMALLOC_FLAG=1
            # check for JEMALLOC installed with HomeBrew
            if [ "$PLATFORM" == "OS_MACOSX" ]; then
+                if [ "$TARGET_ARCHITECTURE" = "arm64" ]; then
+                    # on M1 Macs, homebrew installs here instead of /usr/local
+                    JEMALLOC_PREFIX="/opt/homebrew"
+                else
+                    JEMALLOC_PREFIX="/usr/local"
+                fi
                if hash brew 2>/dev/null && brew ls --versions jemalloc > /dev/null; then
                    JEMALLOC_VER=$(brew ls --versions jemalloc | tail -n 1 | cut -f 2 -d ' ')
-                    JEMALLOC_INCLUDE="-I/usr/local/Cellar/jemalloc/${JEMALLOC_VER}/include"
-                    JEMALLOC_LIB="/usr/local/Cellar/jemalloc/${JEMALLOC_VER}/lib/libjemalloc_pic.a"
-                    PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS $JEMALLOC_LIB"
-                    JAVA_STATIC_LDFLAGS="$JAVA_STATIC_LDFLAGS $JEMALLOC_LIB"
+                    JEMALLOC_INCLUDE="-I${JEMALLOC_PREFIX}/Cellar/jemalloc/${JEMALLOC_VER}/include"
+                    JEMALLOC_LIB="${JEMALLOC_PREFIX}/Cellar/jemalloc/${JEMALLOC_VER}/lib/libjemalloc_pic.a"
+                    PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -L${JEMALLOC_PREFIX}/lib $JEMALLOC_LIB"
+                    JAVA_LDFLAGS="$JAVA_LDFLAGS -L${JEMALLOC_PREFIX}/lib $JEMALLOC_LIB"
+                    JAVA_STATIC_LDFLAGS="$JAVA_STATIC_LDFLAGS -L${JEMALLOC_PREFIX}/lib $JEMALLOC_LIB"
                fi
            fi
        fi
@ -627,7 +628,7 @@ EOF
  fi
 fi

-if test "0$PORTABLE" -eq 0; then
+if [ "$PORTABLE" == "" ] || [ "$PORTABLE" == 0 ]; then
  if test -n "`echo $TARGET_ARCHITECTURE | grep ^ppc64`"; then
    # Tune for this POWER processor, treating '+' models as base models
    POWER=`LD_SHOW_AUXV=1 /bin/true | grep AT_PLATFORM | grep -E -o power[0-9]+`
@ -650,37 +651,36 @@ if test "0$PORTABLE" -eq 0; then
    COMMON_FLAGS="$COMMON_FLAGS -march=${RISC_ISA}"
  elif [ "$TARGET_OS" == "IOS" ]; then
    COMMON_FLAGS="$COMMON_FLAGS"
-  elif [ "$TARGET_OS" == "AIX" ] || [ "$TARGET_OS" == "SunOS" ]; then
-    # TODO: Not sure why we don't use -march=native on these OSes
-    if test "$USE_SSE"; then
-      TRY_SSE_ETC="1"
-    fi
  else
    COMMON_FLAGS="$COMMON_FLAGS -march=native "
  fi
 else
-  # PORTABLE=1
-  if test "$USE_SSE"; then
-    TRY_SSE_ETC="1"
-  fi
-
-  if test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then
-    COMMON_FLAGS="$COMMON_FLAGS -march=z196 "
-  fi
-
-  if test -n "`echo $TARGET_ARCHITECTURE | grep ^riscv64`"; then
-    RISC_ISA=$(cat /proc/cpuinfo | grep isa | head -1 | cut --delimiter=: -f 2 | cut -b 2-)
-    COMMON_FLAGS="$COMMON_FLAGS -march=${RISC_ISA}"
+  # PORTABLE specified
+  if [ "$PORTABLE" == 1 ]; then
+    if test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then
+      COMMON_FLAGS="$COMMON_FLAGS -march=z196 "
+    elif test -n "`echo $TARGET_ARCHITECTURE | grep ^riscv64`"; then
+      RISC_ISA=$(cat /proc/cpuinfo | grep isa | head -1 | cut --delimiter=: -f 2 | cut -b 2-)
+      COMMON_FLAGS="$COMMON_FLAGS -march=${RISC_ISA}"
+    elif test "$USE_SSE"; then
+      # USE_SSE is DEPRECATED
+      # This is a rough approximation of the old USE_SSE behavior
+      COMMON_FLAGS="$COMMON_FLAGS -march=haswell"
+    fi
+    # Other than those cases, not setting -march= here.
+  else
+    # Assume PORTABLE is a minimum assumed cpu type, e.g. PORTABLE=haswell
+    COMMON_FLAGS="$COMMON_FLAGS -march=${PORTABLE}"
  fi

  if [[ "${PLATFORM}" == "OS_MACOSX" ]]; then
-    # For portability compile for macOS 10.13 (2017) or newer
-    COMMON_FLAGS="$COMMON_FLAGS -mmacosx-version-min=10.13"
-    PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -mmacosx-version-min=10.13"
+    # For portability compile for macOS 10.14 or newer
+    COMMON_FLAGS="$COMMON_FLAGS -mmacosx-version-min=10.14"
+    PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -mmacosx-version-min=10.14"
    # -mmacosx-version-min must come first here.
-    PLATFORM_SHARED_LDFLAGS="-mmacosx-version-min=10.13 $PLATFORM_SHARED_LDFLAGS"
-    PLATFORM_CMAKE_FLAGS="-DCMAKE_OSX_DEPLOYMENT_TARGET=10.13"
-    JAVA_STATIC_DEPS_COMMON_FLAGS="-mmacosx-version-min=10.13"
+    PLATFORM_SHARED_LDFLAGS="-mmacosx-version-min=10.14 $PLATFORM_SHARED_LDFLAGS"
+    PLATFORM_CMAKE_FLAGS="-DCMAKE_OSX_DEPLOYMENT_TARGET=10.14"
+    JAVA_STATIC_DEPS_COMMON_FLAGS="-mmacosx-version-min=10.14"
    JAVA_STATIC_DEPS_LDFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS"
    JAVA_STATIC_DEPS_CCFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS"
    JAVA_STATIC_DEPS_CXXFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS"
@ -704,101 +704,6 @@ EOF
  fi
 fi

-if test "$TRY_SSE_ETC"; then
-  # The USE_SSE flag now means "attempt to compile with widely-available
-  # Intel architecture extensions utilized by specific optimizations in the
-  # source code." It's a qualifier on PORTABLE=1 that means "mostly portable."
-  # It doesn't even really check that your current CPU is compatible.
-  #
-  # SSE4.2 available since nehalem, ca. 2008-2010
-  # Includes POPCNT for BitsSetToOne, BitParity
-  TRY_SSE42="-msse4.2"
-  # PCLMUL available since westmere, ca. 2010-2011
-  TRY_PCLMUL="-mpclmul"
-  # AVX2 available since haswell, ca. 2013-2015
-  TRY_AVX2="-mavx2"
-  # BMI available since haswell, ca. 2013-2015
-  # Primarily for TZCNT for CountTrailingZeroBits
-  TRY_BMI="-mbmi"
-  # LZCNT available since haswell, ca. 2013-2015
-  # For FloorLog2
-  TRY_LZCNT="-mlzcnt"
-fi
-
-$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_SSE42 -x c++ - -o test.o 2>/dev/null <<EOF
-  #include <cstdint>
-  #include <nmmintrin.h>
-  int main() {
-    volatile uint32_t x = _mm_crc32_u32(0, 0);
-    (void)x;
-  }
-EOF
-if [ "$?" = 0 ]; then
-  COMMON_FLAGS="$COMMON_FLAGS $TRY_SSE42 -DHAVE_SSE42"
-elif test "$USE_SSE"; then
-  echo "warning: USE_SSE specified but compiler could not use SSE intrinsics, disabling" >&2
-fi
-
-$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_PCLMUL -x c++ - -o test.o 2>/dev/null <<EOF
-  #include <cstdint>
-  #include <wmmintrin.h>
-  int main() {
-    const auto a = _mm_set_epi64x(0, 0);
-    const auto b = _mm_set_epi64x(0, 0);
-    const auto c = _mm_clmulepi64_si128(a, b, 0x00);
-    auto d = _mm_cvtsi128_si64(c);
-    (void)d;
-  }
-EOF
-if [ "$?" = 0 ]; then
-  COMMON_FLAGS="$COMMON_FLAGS $TRY_PCLMUL -DHAVE_PCLMUL"
-elif test "$USE_SSE"; then
-  echo "warning: USE_SSE specified but compiler could not use PCLMUL intrinsics, disabling" >&2
-fi
-
-$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_AVX2 -x c++ - -o test.o 2>/dev/null <<EOF
-  #include <cstdint>
-  #include <immintrin.h>
-  int main() {
-    const auto a = _mm256_setr_epi32(0, 1, 2, 3, 4, 7, 6, 5);
-    const auto b = _mm256_permutevar8x32_epi32(a, a);
-    (void)b;
-  }
-EOF
-if [ "$?" = 0 ]; then
-  COMMON_FLAGS="$COMMON_FLAGS $TRY_AVX2 -DHAVE_AVX2"
-elif test "$USE_SSE"; then
-  echo "warning: USE_SSE specified but compiler could not use AVX2 intrinsics, disabling" >&2
-fi
-
-$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_BMI -x c++ - -o test.o 2>/dev/null <<EOF
-  #include <cstdint>
-  #include <immintrin.h>
-  int main(int argc, char *argv[]) {
-    (void)argv;
-    return (int)_tzcnt_u64((uint64_t)argc);
-  }
-EOF
-if [ "$?" = 0 ]; then
-  COMMON_FLAGS="$COMMON_FLAGS $TRY_BMI -DHAVE_BMI"
-elif test "$USE_SSE"; then
-  echo "warning: USE_SSE specified but compiler could not use BMI intrinsics, disabling" >&2
-fi
-
-$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_LZCNT -x c++ - -o test.o 2>/dev/null <<EOF
-  #include <cstdint>
-  #include <immintrin.h>
-  int main(int argc, char *argv[]) {
-    (void)argv;
-    return (int)_lzcnt_u64((uint64_t)argc);
-  }
-EOF
-if [ "$?" = 0 ]; then
-  COMMON_FLAGS="$COMMON_FLAGS $TRY_LZCNT -DHAVE_LZCNT"
-elif test "$USE_SSE"; then
-  echo "warning: USE_SSE specified but compiler could not use LZCNT intrinsics, disabling" >&2
-fi
-
 $CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o test.o 2>/dev/null <<EOF
  #include <cstdint>
  int main() {
--- a/build_tools/dependencies_platform009.sh
+++ b/build_tools/dependencies_platform009.sh
@ -1,22 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
-GCC_BASE=/mnt/gvfs/third-party2/gcc/1795efe5f06778c15a92c8f9a2aba5dc496d9d4d/9.x/centos7-native/3bed279
-CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/7318eaac22659b6ff2fe43918e4b69fd0772a8a7/9.0.0/platform009/651ee30
-LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/4959b39cfbe5965a37c861c4c327fa7c5c759b87/9.x/platform009/9202ce7
-GLIBC_BASE=/mnt/gvfs/third-party2/glibc/45ce3375cdc77ecb2520bbf8f0ecddd3f98efd7a/2.30/platform009/f259413
-SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/be4de3205e029101b18aa8103daa696c2bef3b19/1.1.3/platform009/7f3b187
-ZLIB_BASE=/mnt/gvfs/third-party2/zlib/3c160ac5c67e257501e24c6c1d00ad5e01d73db6/1.2.8/platform009/7f3b187
-BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/73a237ac5bc0a5f5d67b39b8d253cfebaab88684/1.0.6/platform009/7f3b187
-LZ4_BASE=/mnt/gvfs/third-party2/lz4/6ca38d3c390be2774d61a300f151464bbd632d62/1.9.1/platform009/7f3b187
-ZSTD_BASE=/mnt/gvfs/third-party2/zstd/64c58a207d2495e83abc57a500a956df09b79a7c/1.4.x/platform009/ba86d1f
-GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/824d0a8a5abb5b121afd1b35fc3896407ea50092/2.2.0/platform009/7f3b187
-JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/b62912d333ef33f9760efa6219dbe3fe6abb3b0e/master/platform009/c305944
-NUMA_BASE=/mnt/gvfs/third-party2/numa/0af65f71e23a67bf65dc91b11f95caa39325c432/2.0.11/platform009/7f3b187
-LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/02486dac347645d31dce116f44e1de3177315be2/1.4/platform009/5191652
-TBB_BASE=/mnt/gvfs/third-party2/tbb/2e0ec671e550bfca347300bf3f789d9c0fff24ad/2018_U5/platform009/7f3b187
-LIBURING_BASE=/mnt/gvfs/third-party2/liburing/70dbd9cfee63a25611417d09433a86d7711b3990/20200729/platform009/7f3b187
-KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/32b8a2407b634df3f8f948ba373fc4acc6a18296/fb/platform009/da39a3e
-BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/08634589372fa5f237bfd374e8c644a8364e78c1/2.32/platform009/ba86d1f/
-VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/6ae525939ad02e5e676855082fbbc7828dbafeac/3.15.0/platform009/7f3b187
-LUA_BASE=/mnt/gvfs/third-party2/lua/162efd9561a3d21f6869f4814011e9cf1b3ff4dc/5.3.4/platform009/a6271c4
-BENCHMARK_BASE=/mnt/gvfs/third-party2/benchmark/30bf49ad6414325e17f3425b0edcb64239427ae3/1.6.1/platform009/7f3b187
-GLOG_BASE=/mnt/gvfs/third-party2/glog/32d751bd5673375b438158717ab6a57c1cc57e3d/0.3.2_fb/platform009/10a364d
--- a/build_tools/fbcode_config.sh
+++ b/build_tools/fbcode_config.sh
@ -147,7 +147,7 @@ else
 fi

 CFLAGS+=" $DEPS_INCLUDE"
-CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DHAVE_SSE42"
+CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT"
 CXXFLAGS+=" $CFLAGS"

 EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS"
--- a/build_tools/fbcode_config_platform009.sh
+++ b/build_tools/fbcode_config_platform009.sh
@ -1,170 +0,0 @@
-#!/bin/sh
-# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
-#
-# Set environment variables so that we can compile rocksdb using
-# fbcode settings.  It uses the latest g++ and clang compilers and also
-# uses jemalloc
-# Environment variables that change the behavior of this script:
-# PIC_BUILD -- if true, it will only take pic versions of libraries from fbcode. libraries that don't have pic variant will not be included
-
-
-BASEDIR=`dirname $BASH_SOURCE`
-source "$BASEDIR/dependencies_platform009.sh"
-
-CFLAGS=""
-
-# libgcc
-LIBGCC_INCLUDE="$LIBGCC_BASE/include/c++/9.3.0 -I $LIBGCC_BASE/include/c++/9.3.0/backward"
-LIBGCC_LIBS=" -L $LIBGCC_BASE/lib"
-
-# glibc
-GLIBC_INCLUDE="$GLIBC_BASE/include"
-GLIBC_LIBS=" -L $GLIBC_BASE/lib"
-
-if test -z $PIC_BUILD; then
-  MAYBE_PIC=
-else
-  MAYBE_PIC=_pic
-fi
-
-if ! test $ROCKSDB_DISABLE_SNAPPY; then
-  # snappy
-  SNAPPY_INCLUDE=" -I $SNAPPY_BASE/include/"
-  SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy${MAYBE_PIC}.a"
-  CFLAGS+=" -DSNAPPY"
-fi
-
-if ! test $ROCKSDB_DISABLE_ZLIB; then
-  # location of zlib headers and libraries
-  ZLIB_INCLUDE=" -I $ZLIB_BASE/include/"
-  ZLIB_LIBS=" $ZLIB_BASE/lib/libz${MAYBE_PIC}.a"
-  CFLAGS+=" -DZLIB"
-fi
-
-if ! test $ROCKSDB_DISABLE_BZIP; then
-  # location of bzip headers and libraries
-  BZIP_INCLUDE=" -I $BZIP2_BASE/include/"
-  BZIP_LIBS=" $BZIP2_BASE/lib/libbz2${MAYBE_PIC}.a"
-  CFLAGS+=" -DBZIP2"
-fi
-
-if ! test $ROCKSDB_DISABLE_LZ4; then
-  LZ4_INCLUDE=" -I $LZ4_BASE/include/"
-  LZ4_LIBS=" $LZ4_BASE/lib/liblz4${MAYBE_PIC}.a"
-  CFLAGS+=" -DLZ4"
-fi
-
-if ! test $ROCKSDB_DISABLE_ZSTD; then
-  ZSTD_INCLUDE=" -I $ZSTD_BASE/include/"
-  ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd${MAYBE_PIC}.a"
-  CFLAGS+=" -DZSTD"
-fi
-
-# location of gflags headers and libraries
-GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/"
-GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags${MAYBE_PIC}.a"
-CFLAGS+=" -DGFLAGS=gflags"
-
-BENCHMARK_INCLUDE=" -I $BENCHMARK_BASE/include/"
-BENCHMARK_LIBS=" $BENCHMARK_BASE/lib/libbenchmark${MAYBE_PIC}.a"
-
-GLOG_INCLUDE=" -I $GLOG_BASE/include/"
-GLOG_LIBS=" $GLOG_BASE/lib/libglog${MAYBE_PIC}.a"
-
-# location of jemalloc
-JEMALLOC_INCLUDE=" -I $JEMALLOC_BASE/include/"
-JEMALLOC_LIB=" $JEMALLOC_BASE/lib/libjemalloc${MAYBE_PIC}.a"
-
-# location of numa
-NUMA_INCLUDE=" -I $NUMA_BASE/include/"
-NUMA_LIB=" $NUMA_BASE/lib/libnuma${MAYBE_PIC}.a"
-CFLAGS+=" -DNUMA"
-
-# location of libunwind
-LIBUNWIND="$LIBUNWIND_BASE/lib/libunwind${MAYBE_PIC}.a"
-
-# location of TBB
-TBB_INCLUDE=" -isystem $TBB_BASE/include/"
-TBB_LIBS="$TBB_BASE/lib/libtbb${MAYBE_PIC}.a"
-CFLAGS+=" -DTBB"
-
-# location of LIBURING
-LIBURING_INCLUDE=" -isystem $LIBURING_BASE/include/"
-LIBURING_LIBS="$LIBURING_BASE/lib/liburing${MAYBE_PIC}.a"
-CFLAGS+=" -DLIBURING"
-
-test "$USE_SSE" || USE_SSE=1
-export USE_SSE
-test "$PORTABLE" || PORTABLE=1
-export PORTABLE
-
-BINUTILS="$BINUTILS_BASE/bin"
-AR="$BINUTILS/ar"
-AS="$BINUTILS/as"
-
-DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE $TBB_INCLUDE $LIBURING_INCLUDE $BENCHMARK_INCLUDE $GLOG_INCLUDE"
-
-STDLIBS="-L $GCC_BASE/lib64"
-
-CLANG_BIN="$CLANG_BASE/bin"
-CLANG_LIB="$CLANG_BASE/lib"
-CLANG_SRC="$CLANG_BASE/../../src"
-
-CLANG_ANALYZER="$CLANG_BIN/clang++"
-CLANG_SCAN_BUILD="$CLANG_SRC/llvm/clang/tools/scan-build/bin/scan-build"
-
-if [ -z "$USE_CLANG" ]; then
-  # gcc
-  CC="$GCC_BASE/bin/gcc"
-  CXX="$GCC_BASE/bin/g++"
-  AR="$GCC_BASE/bin/gcc-ar"
-
-  CFLAGS+=" -B$BINUTILS"
-  CFLAGS+=" -isystem $LIBGCC_INCLUDE"
-  CFLAGS+=" -isystem $GLIBC_INCLUDE"
-  JEMALLOC=1
-else
-  # clang
-  CLANG_INCLUDE="$CLANG_LIB/clang/stable/include"
-  CC="$CLANG_BIN/clang"
-  CXX="$CLANG_BIN/clang++"
-  AR="$CLANG_BIN/llvm-ar"
-
-  KERNEL_HEADERS_INCLUDE="$KERNEL_HEADERS_BASE/include"
-
-  CFLAGS+=" -B$BINUTILS -nostdinc -nostdlib"
-  CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/9.x "
-  CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/9.x/x86_64-facebook-linux "
-  CFLAGS+=" -isystem $GLIBC_INCLUDE"
-  CFLAGS+=" -isystem $LIBGCC_INCLUDE"
-  CFLAGS+=" -isystem $CLANG_INCLUDE"
-  CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux "
-  CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE "
-  CFLAGS+=" -Wno-expansion-to-defined "
-  CXXFLAGS="-nostdinc++"
-fi
-
-CFLAGS+=" $DEPS_INCLUDE"
-CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DHAVE_SSE42 -DROCKSDB_IOURING_PRESENT"
-CXXFLAGS+=" $CFLAGS"
-
-EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS $LIBURING_LIBS $BENCHMARK_LIBS"
-EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/platform009/lib/ld.so"
-EXEC_LDFLAGS+=" $LIBUNWIND"
-EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/platform009/lib"
-EXEC_LDFLAGS+=" -Wl,-rpath=$GCC_BASE/lib64"
-# required by libtbb
-EXEC_LDFLAGS+=" -ldl"
-
-PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++"
-PLATFORM_LDFLAGS+=" -B$BINUTILS"
-
-EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $TBB_LIBS $LIBURING_LIBS $BENCHMARK_LIBS"
-
-VALGRIND_VER="$VALGRIND_BASE/bin/"
-
-# lua not supported because it's on track for deprecation, I think
-LUA_PATH=
-LUA_LIB=
-
-export CC CXX AR AS CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB
--- a/build_tools/fbcode_config_platform010.sh
+++ b/build_tools/fbcode_config_platform010.sh
@ -154,7 +154,7 @@ CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux "
 CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE "

 CFLAGS+=" $DEPS_INCLUDE"
-CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DHAVE_SSE42 -DROCKSDB_IOURING_PRESENT"
+CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DROCKSDB_IOURING_PRESENT"
 CXXFLAGS+=" $CFLAGS"

 EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS $LIBURING_LIBS $BENCHMARK_LIBS"
--- a/build_tools/regression_build_test.sh
+++ b/build_tools/regression_build_test.sh
@ -360,7 +360,7 @@ function send_to_ods {
    echo >&2 "ERROR: Key $key doesn't have a value."
    return
  fi
-  curl --silent "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=$key&value=$value" \
+  curl --silent "https://www.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=$key&value=$value" \
    --connect-timeout 60
 }

--- a/build_tools/update_dependencies.sh
+++ b/build_tools/update_dependencies.sh
@ -104,46 +104,3 @@ get_lib_base valgrind   LATEST platform010
 get_lib_base lua        5.3.4  platform010

 git diff $OUTPUT
-
-
-###########################################################
-#                platform009 dependencies                 #
-###########################################################
-
-OUTPUT="$BASEDIR/dependencies_platform009.sh"
-
-rm -f "$OUTPUT"
-touch "$OUTPUT"
-
-echo "Writing dependencies to $OUTPUT"
-
-# Compilers locations
-GCC_BASE=`readlink -f $TP2_LATEST/gcc/9.x/centos7-native/*/`
-CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/9.0.0/platform009/*/`
-
-log_header
-log_variable GCC_BASE
-log_variable CLANG_BASE
-
-# Libraries locations
-get_lib_base libgcc     9.x     platform009
-get_lib_base glibc      2.30    platform009
-get_lib_base snappy     LATEST  platform009
-get_lib_base zlib       LATEST  platform009
-get_lib_base bzip2      LATEST  platform009
-get_lib_base lz4        LATEST  platform009
-get_lib_base zstd       LATEST  platform009
-get_lib_base gflags     LATEST  platform009
-get_lib_base jemalloc   LATEST  platform009
-get_lib_base numa       LATEST  platform009
-get_lib_base libunwind  LATEST  platform009
-get_lib_base tbb        2018_U5 platform009
-get_lib_base liburing   LATEST  platform009
-get_lib_base benchmark  LATEST  platform009
-
-get_lib_base kernel-headers fb platform009
-get_lib_base binutils   LATEST centos7-native
-get_lib_base valgrind   LATEST platform009
-get_lib_base lua        5.3.4  platform009
-
-git diff $OUTPUT
--- a/cache/cache.cc
+++ b/cache/cache.cc
@ -16,7 +16,8 @@
 #include "util/string_util.h"

 namespace ROCKSDB_NAMESPACE {
-#ifndef ROCKSDB_LITE
+const Cache::CacheItemHelper kNoopCacheItemHelper{};
+
 static std::unordered_map<std::string, OptionTypeInfo>
    lru_cache_options_type_info = {
        {"capacity",
@ -64,7 +65,6 @@ static std::unordered_map<std::string, OptionTypeInfo>
          OptionType::kBoolean, OptionVerificationType::kNormal,
          OptionTypeFlags::kMutable}},
 };
-#endif  // ROCKSDB_LITE

 Status SecondaryCache::CreateFromString(
    const ConfigOptions& config_options, const std::string& value,
@ -75,7 +75,6 @@ Status SecondaryCache::CreateFromString(
    Status status;
    std::shared_ptr<SecondaryCache> sec_cache;

-#ifndef ROCKSDB_LITE
    CompressedSecondaryCacheOptions sec_cache_opts;
    status = OptionTypeInfo::ParseStruct(config_options, "",
                                         &comp_sec_cache_options_type_info, "",
@ -84,19 +83,13 @@ Status SecondaryCache::CreateFromString(
      sec_cache = NewCompressedSecondaryCache(sec_cache_opts);
    }

-#else
-    (void)config_options;
-    status = Status::NotSupported(
-        "Cannot load compressed secondary cache in LITE mode ", args);
-#endif  //! ROCKSDB_LITE

    if (status.ok()) {
      result->swap(sec_cache);
    }
    return status;
  } else {
-    return LoadSharedObject<SecondaryCache>(config_options, value, nullptr,
-                                            result);
+    return LoadSharedObject<SecondaryCache>(config_options, value, result);
  }
 }

@ -108,7 +101,6 @@ Status Cache::CreateFromString(const ConfigOptions& config_options,
  if (value.find('=') == std::string::npos) {
    cache = NewLRUCache(ParseSizeT(value));
  } else {
-#ifndef ROCKSDB_LITE
    LRUCacheOptions cache_opts;
    status = OptionTypeInfo::ParseStruct(config_options, "",
                                         &lru_cache_options_type_info, "",
@ -116,14 +108,51 @@ Status Cache::CreateFromString(const ConfigOptions& config_options,
    if (status.ok()) {
      cache = NewLRUCache(cache_opts);
    }
-#else
-    (void)config_options;
-    status = Status::NotSupported("Cannot load cache in LITE mode ", value);
-#endif  //! ROCKSDB_LITE
  }
  if (status.ok()) {
    result->swap(cache);
  }
  return status;
 }
+
+// True when there is no pending handle, or when the pending handle itself
+// reports that it is ready.
+bool Cache::AsyncLookupHandle::IsReady() {
+  if (pending_handle == nullptr) {
+    return true;
+  }
+  return pending_handle->IsReady();
+}
+
+// True while a pending handle is attached to this lookup.
+bool Cache::AsyncLookupHandle::IsPending() {
+  const bool has_pending = (pending_handle != nullptr);
+  return has_pending;
+}
+
+// Returns the stored result handle. Must not be called while the lookup is
+// still pending (checked by assertion).
+Cache::Handle* Cache::AsyncLookupHandle::Result() {
+  const bool still_pending = IsPending();
+  assert(!still_pending);
+  (void)still_pending;  // unused when assertions are compiled out
+  return result_handle;
+}
+
+// Performs the lookup described by `async_handle` through a plain Lookup()
+// call and stores the outcome in `async_handle.result_handle`. The handle
+// must not be pending on entry.
+void Cache::StartAsyncLookup(AsyncLookupHandle& async_handle) {
+  AsyncLookupHandle& req = async_handle;
+  // Clear any state left over from a previous use of the same handle.
+  req.found_dummy_entry = false;
+  assert(!req.IsPending());
+  Handle* const found = Lookup(req.key, req.helper, req.create_context,
+                               req.priority, req.stats);
+  req.result_handle = found;
+}
+
+// Single-handle convenience wrapper: runs WaitAll() on just `async_handle`
+// and then returns its result.
+Cache::Handle* Cache::Wait(AsyncLookupHandle& async_handle) {
+  AsyncLookupHandle* const only = &async_handle;
+  WaitAll(only, /*count*/ 1);
+  return only->Result();
+}
+
+// This implementation performs no waiting itself. With assertions enabled it
+// only checks that every still-pending handle among the `count` entries of
+// `async_handles` has a null `pending_cache`.
+void Cache::WaitAll(AsyncLookupHandle* async_handles, size_t count) {
+  for (size_t i = 0; i < count; ++i) {
+    if (async_handles[i].IsPending()) {
+      // If a pending handle gets here, it should be marked as "to be handled
+      // by a caller" by that caller erasing the pending_cache on it.
+      assert(async_handles[i].pending_cache == nullptr);
+    }
+  }
+}
+
+// Installs `fn` as the eviction callback, taking ownership of it.
+void Cache::SetEvictionCallback(EvictionCallback&& fn) {
+  // Replacing an already-set callback with another non-empty one could
+  // indicate a bug, so at most one of the two may be non-empty.
+  assert(!(eviction_callback_ && fn));
+  eviction_callback_ = std::move(fn);
+}
+
 }  // namespace ROCKSDB_NAMESPACE
--- a/cache/cache_bench_tool.cc
+++ b/cache/cache_bench_tool.cc
@ -16,7 +16,7 @@
 #include "db/db_impl/db_impl.h"
 #include "monitoring/histogram.h"
 #include "port/port.h"
-#include "rocksdb/cache.h"
+#include "rocksdb/advanced_cache.h"
 #include "rocksdb/convenience.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
@ -50,7 +50,7 @@ DEFINE_double(resident_ratio, 0.25,
 DEFINE_uint64(ops_per_thread, 2000000U, "Number of operations per thread.");
 DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added.");

-DEFINE_uint32(skew, 5, "Degree of skew in key selection");
+DEFINE_uint32(skew, 5, "Degree of skew in key selection. 0 = no skew");
 DEFINE_bool(populate_cache, true, "Populate cache before operations");

 DEFINE_uint32(lookup_insert_percent, 87,
@ -71,17 +71,23 @@ DEFINE_uint32(

 DEFINE_uint32(gather_stats_entries_per_lock, 256,
              "For Cache::ApplyToAllEntries");
-DEFINE_bool(skewed, false, "If true, skew the key access distribution");

 DEFINE_bool(lean, false,
            "If true, no additional computation is performed besides cache "
            "operations.");

-#ifndef ROCKSDB_LITE
+// Note: adjacent string literals are concatenated, so the separating space
+// after "e.g." must be written explicitly.
+DEFINE_bool(early_exit, false,
+            "Exit before deallocating most memory. Good for malloc stats, e.g."
+            " MALLOC_CONF=\"stats_print:true\"");
+
+DEFINE_bool(histograms, true,
+            "Whether to track and print histogram statistics.");
+
+DEFINE_uint32(seed, 0, "Hashing/random seed to use. 0 = choose at random");
+
 DEFINE_string(secondary_cache_uri, "",
              "Full URI for creating a custom secondary cache object");
 static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
-#endif  // ROCKSDB_LITE

 DEFINE_string(cache_type, "lru_cache", "Type of block cache.");

@ -147,9 +153,6 @@ class SharedState {
 public:
  explicit SharedState(CacheBench* cache_bench)
      : cv_(&mu_),
-        num_initialized_(0),
-        start_(false),
-        num_done_(0),
        cache_bench_(cache_bench) {}

  ~SharedState() {}
@ -172,15 +175,27 @@ class SharedState {

  bool Started() const { return start_; }

+  // Adds one batch of lookup outcomes to the shared totals while holding
+  // mu_. `hits` counts toward both the hit total and the lookup total;
+  // `misses` counts toward the lookup total only.
+  void AddLookupStats(uint64_t hits, uint64_t misses) {
+    const uint64_t batch_lookups = hits + misses;
+    MutexLock guard(&mu_);
+    lookup_hits_ += hits;
+    lookup_count_ += batch_lookups;
+  }
+
+  // Fraction of recorded lookups that were hits (hits / total lookups),
+  // computed in floating point. Reads the counters without taking mu_.
+  double GetLookupHitRatio() const {
+    const double hits = static_cast<double>(lookup_hits_);
+    const double total = static_cast<double>(lookup_count_);
+    return hits / total;
+  }
+
 private:
  port::Mutex mu_;
  port::CondVar cv_;

-  uint64_t num_initialized_;
-  bool start_;
-  uint64_t num_done_;
-
  CacheBench* cache_bench_;
+
+  uint64_t num_initialized_ = 0;
+  bool start_ = false;
+  uint64_t num_done_ = 0;
+  uint64_t lookup_count_ = 0;
+  uint64_t lookup_hits_ = 0;
 };

 // Per-thread state for concurrent executions of the same benchmark.
@ -192,27 +207,19 @@ struct ThreadState {
  uint64_t duration_us = 0;

  ThreadState(uint32_t index, SharedState* _shared)
-      : tid(index), rnd(1000 + index), shared(_shared) {}
+      : tid(index), rnd(FLAGS_seed + 1 + index), shared(_shared) {}
 };

 struct KeyGen {
  char key_data[27];

-  Slice GetRand(Random64& rnd, uint64_t max_key, int max_log) {
-    uint64_t key = 0;
-    if (!FLAGS_skewed) {
-      uint64_t raw = rnd.Next();
-      // Skew according to setting
-      for (uint32_t i = 0; i < FLAGS_skew; ++i) {
-        raw = std::min(raw, rnd.Next());
-      }
-      key = FastRange64(raw, max_key);
-    } else {
-      key = rnd.Skewed(max_log);
-      if (key > max_key) {
-        key -= max_key;
-      }
+  Slice GetRand(Random64& rnd, uint64_t max_key, uint32_t skew) {
+    uint64_t raw = rnd.Next();
+    // Skew according to setting
+    for (uint32_t i = 0; i < skew; ++i) {
+      raw = std::min(raw, rnd.Next());
    }
+    uint64_t key = FastRange64(raw, max_key);
    // Variable size and alignment
    size_t off = key % 8;
    key_data[0] = char{42};
@ -226,7 +233,7 @@ struct KeyGen {
  }
 };

-char* createValue(Random64& rnd) {
+Cache::ObjectPtr createValue(Random64& rnd) {
  char* rv = new char[FLAGS_value_bytes];
  // Fill with some filler data, and take some CPU time
  for (uint32_t i = 0; i < FLAGS_value_bytes; i += 8) {
@ -236,28 +243,36 @@ char* createValue(Random64& rnd) {
 }

 // Callbacks for secondary cache
-size_t SizeFn(void* /*obj*/) { return FLAGS_value_bytes; }
+size_t SizeFn(Cache::ObjectPtr /*obj*/) { return FLAGS_value_bytes; }

-Status SaveToFn(void* obj, size_t /*offset*/, size_t size, void* out) {
-  memcpy(out, obj, size);
+Status SaveToFn(Cache::ObjectPtr from_obj, size_t /*from_offset*/,
+                size_t length, char* out) {
+  memcpy(out, from_obj, length);
  return Status::OK();
 }

-// Different deleters to simulate using deleter to gather
-// stats on the code origin and kind of cache entries.
-void deleter1(const Slice& /*key*/, void* value) {
-  delete[] static_cast<char*>(value);
-}
-void deleter2(const Slice& /*key*/, void* value) {
-  delete[] static_cast<char*>(value);
-}
-void deleter3(const Slice& /*key*/, void* value) {
+// "Create" callback for the cache item helpers below: rebuilds a cache object
+// from previously saved bytes. Allocates a new char[] of data.size() bytes,
+// copies `data` into it, stores the buffer in *out_obj and its size in
+// *out_charge, and returns OK. The context and allocator arguments are
+// unused. The buffer is allocated with new[], matching the delete[] in
+// DeleteFn.
+Status CreateFn(const Slice& data, Cache::CreateContext* /*context*/,
+                MemoryAllocator* /*allocator*/, Cache::ObjectPtr* out_obj,
+                size_t* out_charge) {
+  const size_t len = data.size();
+  char* const buf = new char[len];
+  memcpy(buf, data.data(), len);
+  *out_obj = buf;
+  *out_charge = len;
+  return Status::OK();
+}
+
+void DeleteFn(Cache::ObjectPtr value, MemoryAllocator* /*alloc*/) {
  delete[] static_cast<char*>(value);
 }

-Cache::CacheItemHelper helper1(SizeFn, SaveToFn, deleter1);
-Cache::CacheItemHelper helper2(SizeFn, SaveToFn, deleter2);
-Cache::CacheItemHelper helper3(SizeFn, SaveToFn, deleter3);
+// One helper pair per entry role (data, index and filter block). Each full
+// helper adds the SizeFn/SaveToFn/CreateFn callbacks and points at its `_wos`
+// counterpart, which carries only the role and DeleteFn.
+// NOTE(review): `_wos` presumably stands for "without secondary" -- confirm.
+Cache::CacheItemHelper helper1_wos(CacheEntryRole::kDataBlock, DeleteFn);
+Cache::CacheItemHelper helper1(CacheEntryRole::kDataBlock, DeleteFn, SizeFn,
+                               SaveToFn, CreateFn, &helper1_wos);
+Cache::CacheItemHelper helper2_wos(CacheEntryRole::kIndexBlock, DeleteFn);
+Cache::CacheItemHelper helper2(CacheEntryRole::kIndexBlock, DeleteFn, SizeFn,
+                               SaveToFn, CreateFn, &helper2_wos);
+Cache::CacheItemHelper helper3_wos(CacheEntryRole::kFilterBlock, DeleteFn);
+Cache::CacheItemHelper helper3(CacheEntryRole::kFilterBlock, DeleteFn, SizeFn,
+                               SaveToFn, CreateFn, &helper3_wos);
 }  // namespace

 class CacheBench {
@ -275,32 +290,25 @@ class CacheBench {
        lookup_threshold_(insert_threshold_ +
                          kHundredthUint64 * FLAGS_lookup_percent),
        erase_threshold_(lookup_threshold_ +
-                         kHundredthUint64 * FLAGS_erase_percent),
-        skewed_(FLAGS_skewed) {
+                         kHundredthUint64 * FLAGS_erase_percent) {
    if (erase_threshold_ != 100U * kHundredthUint64) {
      fprintf(stderr, "Percentages must add to 100.\n");
      exit(1);
    }

-    max_log_ = 0;
-    if (skewed_) {
-      uint64_t max_key = max_key_;
-      while (max_key >>= 1) max_log_++;
-      if (max_key > (static_cast<uint64_t>(1) << max_log_)) max_log_++;
-    }
-
    if (FLAGS_cache_type == "clock_cache") {
      fprintf(stderr, "Old clock cache implementation has been removed.\n");
      exit(1);
    } else if (FLAGS_cache_type == "hyper_clock_cache") {
-      cache_ = HyperClockCacheOptions(FLAGS_cache_size, FLAGS_value_bytes,
-                                      FLAGS_num_shard_bits)
-                   .MakeSharedCache();
+      HyperClockCacheOptions opts(FLAGS_cache_size, FLAGS_value_bytes,
+                                  FLAGS_num_shard_bits);
+      opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX);
+      cache_ = opts.MakeSharedCache();
    } else if (FLAGS_cache_type == "lru_cache") {
      LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits,
                           false /* strict_capacity_limit */,
                           0.5 /* high_pri_pool_ratio */);
-#ifndef ROCKSDB_LITE
+      opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX);
      if (!FLAGS_secondary_cache_uri.empty()) {
        Status s = SecondaryCache::CreateFromString(
            ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache);
@ -313,7 +321,6 @@ class CacheBench {
        }
        opts.secondary_cache = secondary_cache;
      }
-#endif  // ROCKSDB_LITE

      cache_ = NewLRUCache(opts);
    } else {
@ -325,13 +332,50 @@ class CacheBench {
  ~CacheBench() {}

  void PopulateCache() {
-    Random64 rnd(1);
+    Random64 rnd(FLAGS_seed);
    KeyGen keygen;
-    for (uint64_t i = 0; i < 2 * FLAGS_cache_size; i += FLAGS_value_bytes) {
-      Status s = cache_->Insert(keygen.GetRand(rnd, max_key_, max_log_),
-                                createValue(rnd), &helper1, FLAGS_value_bytes);
+    size_t max_occ = 0;
+    size_t inserts_since_max_occ_increase = 0;
+    size_t keys_since_last_not_found = 0;
+
+    // Avoid redundant insertions by checking Lookup before Insert.
+    // Loop until insertions consistently fail to increase max occupancy or
+    // it becomes difficult to find keys not already inserted.
+    while (inserts_since_max_occ_increase < 100 &&
+           keys_since_last_not_found < 100) {
+      Slice key = keygen.GetRand(rnd, max_key_, FLAGS_skew);
+
+      Cache::Handle* handle = cache_->Lookup(key);
+      if (handle != nullptr) {
+        cache_->Release(handle);
+        ++keys_since_last_not_found;
+        continue;
+      }
+      keys_since_last_not_found = 0;
+
+      Status s =
+          cache_->Insert(key, createValue(rnd), &helper1, FLAGS_value_bytes);
      assert(s.ok());
+
+      handle = cache_->Lookup(key);
+      if (!handle) {
+        fprintf(stderr, "Failed to lookup key just inserted.\n");
+        assert(false);
+        exit(42);
+      } else {
+        cache_->Release(handle);
+      }
+
+      size_t occ = cache_->GetOccupancyCount();
+      if (occ > max_occ) {
+        max_occ = occ;
+        inserts_since_max_occ_increase = 0;
+      } else {
+        ++inserts_since_max_occ_increase;
+      }
    }
+    printf("Population complete (%zu entries, %g average charge)\n", max_occ,
+           1.0 * FLAGS_cache_size / max_occ);
  }

  bool Run() {
@ -390,18 +434,21 @@ class CacheBench {
                                        FLAGS_ops_per_thread / elapsed_secs);
    printf("Thread ops/sec = %u\n", ops_per_sec);

-    printf("\nOperation latency (ns):\n");
-    HistogramImpl combined;
-    for (uint32_t i = 0; i < FLAGS_threads; i++) {
-      combined.Merge(threads[i]->latency_ns_hist);
-    }
-    printf("%s", combined.ToString().c_str());
+    printf("Lookup hit ratio: %g\n", shared.GetLookupHitRatio());

-    if (FLAGS_gather_stats) {
-      printf("\nGather stats latency (us):\n");
-      printf("%s", stats_hist.ToString().c_str());
-    }
+    if (FLAGS_histograms) {
+      printf("\nOperation latency (ns):\n");
+      HistogramImpl combined;
+      for (uint32_t i = 0; i < FLAGS_threads; i++) {
+        combined.Merge(threads[i]->latency_ns_hist);
+      }
+      printf("%s", combined.ToString().c_str());

+      if (FLAGS_gather_stats) {
+        printf("\nGather stats latency (us):\n");
+        printf("%s", stats_hist.ToString().c_str());
+      }
+    }
    printf("\n%s", stats_report.c_str());

    return true;
@ -415,8 +462,6 @@ class CacheBench {
  const uint64_t insert_threshold_;
  const uint64_t lookup_threshold_;
  const uint64_t erase_threshold_;
-  const bool skewed_;
-  int max_log_;

  // A benchmark version of gathering stats on an active block cache by
  // iterating over it. The primary purpose is to measure the impact of
@ -436,7 +481,7 @@ class CacheBench {
    uint64_t total_entry_count = 0;
    uint64_t table_occupancy = 0;
    uint64_t table_size = 0;
-    std::set<Cache::DeleterFn> deleters;
+    std::set<const Cache::CacheItemHelper*> helpers;
    StopWatchNano timer(clock);

    for (;;) {
@ -461,7 +506,7 @@ class CacheBench {
                 << BytesToHumanString(static_cast<uint64_t>(
                        1.0 * total_charge / total_entry_count))
                 << "\n"
-                 << "Unique deleters: " << deleters.size() << "\n";
+                 << "Unique helpers: " << helpers.size() << "\n";
            *stats_report = ostr.str();
            return;
          }
@ -477,22 +522,26 @@ class CacheBench {
      total_key_size = 0;
      total_charge = 0;
      total_entry_count = 0;
-      deleters.clear();
-      auto fn = [&](const Slice& key, void* /*value*/, size_t charge,
-                    Cache::DeleterFn deleter) {
+      helpers.clear();
+      auto fn = [&](const Slice& key, Cache::ObjectPtr /*value*/, size_t charge,
+                    const Cache::CacheItemHelper* helper) {
        total_key_size += key.size();
        total_charge += charge;
        ++total_entry_count;
-        // Something slightly more expensive as in (future) stats by category
-        deleters.insert(deleter);
+        // Something slightly more expensive as in stats by category
+        helpers.insert(helper);
      };
-      timer.Start();
+      if (FLAGS_histograms) {
+        timer.Start();
+      }
      Cache::ApplyToAllEntriesOptions opts;
      opts.average_entries_per_lock = FLAGS_gather_stats_entries_per_lock;
      shared->GetCacheBench()->cache_->ApplyToAllEntries(fn, opts);
      table_occupancy = shared->GetCacheBench()->cache_->GetOccupancyCount();
      table_size = shared->GetCacheBench()->cache_->GetTableAddressCount();
-      stats_hist->Add(timer.ElapsedNanos() / 1000);
+      if (FLAGS_histograms) {
+        stats_hist->Add(timer.ElapsedNanos() / 1000);
+      }
    }
  }

@ -523,6 +572,8 @@ class CacheBench {
  void OperateCache(ThreadState* thread) {
    // To use looked-up values
    uint64_t result = 0;
+    uint64_t lookup_misses = 0;
+    uint64_t lookup_hits = 0;
    // To hold handles for a non-trivial amount of time
    Cache::Handle* handle = nullptr;
    KeyGen gen;
@ -531,18 +582,12 @@ class CacheBench {
    StopWatchNano timer(clock);

    for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
-      Slice key = gen.GetRand(thread->rnd, max_key_, max_log_);
+      Slice key = gen.GetRand(thread->rnd, max_key_, FLAGS_skew);
      uint64_t random_op = thread->rnd.Next();
-      Cache::CreateCallback create_cb = [](const void* buf, size_t size,
-                                           void** out_obj,
-                                           size_t* charge) -> Status {
-        *out_obj = reinterpret_cast<void*>(new char[size]);
-        memcpy(*out_obj, buf, size);
-        *charge = size;
-        return Status::OK();
-      };

-      timer.Start();
+      if (FLAGS_histograms) {
+        timer.Start();
+      }

      if (random_op < lookup_insert_threshold_) {
        if (handle) {
@ -550,15 +595,17 @@ class CacheBench {
          handle = nullptr;
        }
        // do lookup
-        handle = cache_->Lookup(key, &helper2, create_cb, Cache::Priority::LOW,
-                                true);
+        handle = cache_->Lookup(key, &helper2, /*context*/ nullptr,
+                                Cache::Priority::LOW);
        if (handle) {
+          ++lookup_hits;
          if (!FLAGS_lean) {
            // do something with the data
            result += NPHash64(static_cast<char*>(cache_->Value(handle)),
                               FLAGS_value_bytes);
          }
        } else {
+          ++lookup_misses;
          // do insert
          Status s = cache_->Insert(key, createValue(thread->rnd), &helper2,
                                    FLAGS_value_bytes, &handle);
@ -579,14 +626,17 @@ class CacheBench {
          handle = nullptr;
        }
        // do lookup
-        handle = cache_->Lookup(key, &helper2, create_cb, Cache::Priority::LOW,
-                                true);
+        handle = cache_->Lookup(key, &helper2, /*context*/ nullptr,
+                                Cache::Priority::LOW);
        if (handle) {
+          ++lookup_hits;
          if (!FLAGS_lean) {
            // do something with the data
            result += NPHash64(static_cast<char*>(cache_->Value(handle)),
                               FLAGS_value_bytes);
          }
+        } else {
+          ++lookup_misses;
        }
      } else if (random_op < erase_threshold_) {
        // do erase
@ -595,7 +645,14 @@ class CacheBench {
        // Should be extremely unlikely (noop)
        assert(random_op >= kHundredthUint64 * 100U);
      }
-      thread->latency_ns_hist.Add(timer.ElapsedNanos());
+      if (FLAGS_histograms) {
+        thread->latency_ns_hist.Add(timer.ElapsedNanos());
+      }
+    }
+    // Report this thread's lookup totals exactly once, after the loop.
+    // lookup_hits/lookup_misses are cumulative, so adding them on every
+    // iteration would over-count the shared totals and would also take the
+    // shared mutex on every benchmark operation.
+    thread->shared->AddLookupStats(lookup_hits, lookup_misses);
+    if (FLAGS_early_exit) {
+      MutexLock l(thread->shared->GetMutex());
+      exit(0);
    }
    if (handle) {
      cache_->Release(handle);
@ -617,6 +674,7 @@ class CacheBench {
 #ifndef NDEBUG
    printf("WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
 #endif
+    printf("----------------------------\n");
    printf("RocksDB version     : %d.%d\n", kMajorVersion, kMinorVersion);
    printf("DMutex impl name    : %s\n", DMutex::kName());
    printf("Number of threads   : %u\n", FLAGS_threads);
@ -956,11 +1014,14 @@ int cache_bench_tool(int argc, char** argv) {
    exit(1);
  }

+  if (FLAGS_seed == 0) {
+    FLAGS_seed = static_cast<uint32_t>(port::GetProcessID());
+    printf("Using seed = %" PRIu32 "\n", FLAGS_seed);
+  }
+
  ROCKSDB_NAMESPACE::CacheBench bench;
  if (FLAGS_populate_cache) {
    bench.PopulateCache();
-    printf("Population complete\n");
-    printf("----------------------------\n");
  }
  if (bench.Run()) {
    return 0;
--- a/cache/cache_entry_roles.cc
+++ b/cache/cache_entry_roles.cc
@ -101,34 +101,4 @@ std::string BlockCacheEntryStatsMapKeys::UsedPercent(CacheEntryRole role) {
  return GetPrefixedCacheEntryRoleName(kPrefix, role);
 }

-namespace {
-
-struct Registry {
-  std::mutex mutex;
-  UnorderedMap<Cache::DeleterFn, CacheEntryRole> role_map;
-  void Register(Cache::DeleterFn fn, CacheEntryRole role) {
-    std::lock_guard<std::mutex> lock(mutex);
-    role_map[fn] = role;
-  }
-  UnorderedMap<Cache::DeleterFn, CacheEntryRole> Copy() {
-    std::lock_guard<std::mutex> lock(mutex);
-    return role_map;
-  }
-};
-
-Registry& GetRegistry() {
-  STATIC_AVOID_DESTRUCTION(Registry, registry);
-  return registry;
-}
-
-}  // namespace
-
-void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role) {
-  GetRegistry().Register(fn, role);
-}
-
-UnorderedMap<Cache::DeleterFn, CacheEntryRole> CopyCacheDeleterRoleMap() {
-  return GetRegistry().Copy();
-}
-
 }  // namespace ROCKSDB_NAMESPACE
--- a/cache/cache_entry_roles.h
+++ b/cache/cache_entry_roles.h
@ -7,11 +7,8 @@

 #include <array>
 #include <cstdint>
-#include <memory>
-#include <type_traits>

 #include "rocksdb/cache.h"
-#include "util/hash_containers.h"

 namespace ROCKSDB_NAMESPACE {

@ -20,84 +17,4 @@ extern std::array<std::string, kNumCacheEntryRoles>
 extern std::array<std::string, kNumCacheEntryRoles>
    kCacheEntryRoleToHyphenString;

-// To associate cache entries with their role, we use a hack on the
-// existing Cache interface. Because the deleter of an entry can authenticate
-// the code origin of an entry, we can elaborate the choice of deleter to
-// also encode role information, without inferring false role information
-// from entries not choosing to encode a role.
-//
-// The rest of this file is for handling mappings between deleters and
-// roles.
-
-// To infer a role from a deleter, the deleter must be registered. This
-// can be done "manually" with this function. This function is thread-safe,
-// and the registration mappings go into private but static storage. (Note
-// that DeleterFn is a function pointer, not std::function. Registrations
-// should not be too many.)
-void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role);
-
-// Gets a copy of the registered deleter -> role mappings. This is the only
-// function for reading the mappings made with RegisterCacheDeleterRole.
-// Why only this interface for reading?
-// * This function has to be thread safe, which could incur substantial
-// overhead. We should not pay this overhead for every deleter look-up.
-// * This is suitable for preparing for batch operations, like with
-// CacheEntryStatsCollector.
-// * The number of mappings should be sufficiently small (dozens).
-UnorderedMap<Cache::DeleterFn, CacheEntryRole> CopyCacheDeleterRoleMap();
-
-// ************************************************************** //
-// An automatic registration infrastructure. This enables code
-// to simply ask for a deleter associated with a particular type
-// and role, and registration is automatic. In a sense, this is
-// a small dependency injection infrastructure, because linking
-// in new deleter instantiations is essentially sufficient for
-// making stats collection (using CopyCacheDeleterRoleMap) aware
-// of them.
-
-namespace cache_entry_roles_detail {
-
-template <typename T, CacheEntryRole R>
-struct RegisteredDeleter {
-  RegisteredDeleter() { RegisterCacheDeleterRole(Delete, R); }
-
-  // These have global linkage to help ensure compiler optimizations do not
-  // break uniqueness for each <T,R>
-  static void Delete(const Slice& /* key */, void* value) {
-    // Supports T == Something[], unlike delete operator
-    std::default_delete<T>()(
-        static_cast<typename std::remove_extent<T>::type*>(value));
-  }
-};
-
-template <CacheEntryRole R>
-struct RegisteredNoopDeleter {
-  RegisteredNoopDeleter() { RegisterCacheDeleterRole(Delete, R); }
-
-  static void Delete(const Slice& /* key */, void* /* value */) {
-    // Here was `assert(value == nullptr);` but we can also put pointers
-    // to static data in Cache, for testing at least.
-  }
-};
-
-}  // namespace cache_entry_roles_detail
-
-// Get an automatically registered deleter for value type T and role R.
-// Based on C++ semantics, registration is invoked exactly once in a
-// thread-safe way on first call to this function, for each <T, R>.
-template <typename T, CacheEntryRole R>
-Cache::DeleterFn GetCacheEntryDeleterForRole() {
-  static cache_entry_roles_detail::RegisteredDeleter<T, R> reg;
-  return reg.Delete;
-}
-
-// Get an automatically registered no-op deleter (value should be nullptr)
-// and associated with role R. This is used for Cache "reservation" entries
-// such as for WriteBufferManager.
-template <CacheEntryRole R>
-Cache::DeleterFn GetNoopDeleterForRole() {
-  static cache_entry_roles_detail::RegisteredNoopDeleter<R> reg;
-  return reg.Delete;
-}
-
 }  // namespace ROCKSDB_NAMESPACE
--- a/cache/cache_entry_stats.h
+++ b/cache/cache_entry_stats.h
@ -10,8 +10,8 @@
 #include <memory>
 #include <mutex>

-#include "cache/cache_helpers.h"
 #include "cache/cache_key.h"
+#include "cache/typed_cache.h"
 #include "port/lang.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/status.h"
@ -111,11 +111,14 @@ class CacheEntryStatsCollector {
  // Gets or creates a shared instance of CacheEntryStatsCollector in the
  // cache itself, and saves into `ptr`. This shared_ptr will hold the
  // entry in cache until all refs are destroyed.
-  static Status GetShared(Cache *cache, SystemClock *clock,
+  static Status GetShared(Cache *raw_cache, SystemClock *clock,
                          std::shared_ptr<CacheEntryStatsCollector> *ptr) {
-    const Slice &cache_key = GetCacheKey();
+    assert(raw_cache);
+    BasicTypedCacheInterface<CacheEntryStatsCollector, CacheEntryRole::kMisc>
+        cache{raw_cache};

-    Cache::Handle *h = cache->Lookup(cache_key);
+    const Slice &cache_key = GetCacheKey();
+    auto h = cache.Lookup(cache_key);
    if (h == nullptr) {
      // Not yet in cache, but Cache doesn't provide a built-in way to
      // avoid racing insert. So we double-check under a shared mutex,
@ -123,15 +126,15 @@ class CacheEntryStatsCollector {
      STATIC_AVOID_DESTRUCTION(std::mutex, static_mutex);
      std::lock_guard<std::mutex> lock(static_mutex);

-      h = cache->Lookup(cache_key);
+      h = cache.Lookup(cache_key);
      if (h == nullptr) {
-        auto new_ptr = new CacheEntryStatsCollector(cache, clock);
+        auto new_ptr = new CacheEntryStatsCollector(cache.get(), clock);
        // TODO: non-zero charge causes some tests that count block cache
        // usage to go flaky. Fix the problem somehow so we can use an
        // accurate charge.
        size_t charge = 0;
-        Status s = cache->Insert(cache_key, new_ptr, charge, Deleter, &h,
-                                 Cache::Priority::HIGH);
+        Status s =
+            cache.Insert(cache_key, new_ptr, charge, &h, Cache::Priority::HIGH);
        if (!s.ok()) {
          assert(h == nullptr);
          delete new_ptr;
@ -140,11 +143,11 @@ class CacheEntryStatsCollector {
      }
    }
    // If we reach here, shared entry is in cache with handle `h`.
-    assert(cache->GetDeleter(h) == Deleter);
+    assert(cache.get()->GetCacheItemHelper(h) == cache.GetBasicHelper());

    // Build an aliasing shared_ptr that keeps `ptr` in cache while there
    // are references.
-    *ptr = MakeSharedCacheHandleGuard<CacheEntryStatsCollector>(cache, h);
+    *ptr = cache.SharedGuard(h);
    return Status::OK();
  }

@ -157,10 +160,6 @@ class CacheEntryStatsCollector {
        cache_(cache),
        clock_(clock) {}

-  static void Deleter(const Slice &, void *value) {
-    delete static_cast<CacheEntryStatsCollector *>(value);
-  }
-
  static const Slice &GetCacheKey() {
    // For each template instantiation
    static CacheKey ckey = CacheKey::CreateUniqueForProcessLifetime();
--- a/cache/cache_helpers.cc
+++ b/cache/cache_helpers.cc
@ -0,0 +1,40 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "cache/cache_helpers.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+void ReleaseCacheHandleCleanup(void* arg1, void* arg2) {
+  Cache* const cache = static_cast<Cache*>(arg1);
+  assert(cache);
+
+  Cache::Handle* const cache_handle = static_cast<Cache::Handle*>(arg2);
+  assert(cache_handle);
+
+  cache->Release(cache_handle);
+}
+
+Status WarmInCache(Cache* cache, const Slice& key, const Slice& saved,
+                   Cache::CreateContext* create_context,
+                   const Cache::CacheItemHelper* helper,
+                   Cache::Priority priority, size_t* out_charge) {
+  assert(helper);
+  assert(helper->create_cb);
+  Cache::ObjectPtr value;
+  size_t charge;
+  Status st = helper->create_cb(saved, create_context,
+                                cache->memory_allocator(), &value, &charge);
+  if (st.ok()) {
+    st =
+        cache->Insert(key, value, helper, charge, /*handle*/ nullptr, priority);
+    if (out_charge) {
+      *out_charge = charge;
+    }
+  }
+  return st;
+}
+
+}  // namespace ROCKSDB_NAMESPACE
--- a/cache/cache_helpers.h
+++ b/cache/cache_helpers.h
@ -7,7 +7,7 @@

 #include <cassert>

-#include "rocksdb/cache.h"
+#include "rocksdb/advanced_cache.h"
 #include "rocksdb/rocksdb_namespace.h"

 namespace ROCKSDB_NAMESPACE {
@ -17,22 +17,17 @@ template <typename T>
 T* GetFromCacheHandle(Cache* cache, Cache::Handle* handle) {
  assert(cache);
  assert(handle);
-
  return static_cast<T*>(cache->Value(handle));
 }

-// Simple generic deleter for Cache (to be used with Cache::Insert).
-template <typename T>
-void DeleteCacheEntry(const Slice& /* key */, void* value) {
-  delete static_cast<T*>(value);
-}
-
 // Turns a T* into a Slice so it can be used as a key with Cache.
 template <typename T>
-Slice GetSlice(const T* t) {
+Slice GetSliceForKey(const T* t) {
  return Slice(reinterpret_cast<const char*>(t), sizeof(T));
 }

+void ReleaseCacheHandleCleanup(void* arg1, void* arg2);
+
 // Generic resource management object for cache handles that releases the handle
 // when destroyed. Has unique ownership of the handle, so copying it is not
 // allowed, while moving it transfers ownership.
@ -88,7 +83,7 @@ class CacheHandleGuard {
    if (cleanable) {
      if (handle_ != nullptr) {
        assert(cache_);
-        cleanable->RegisterCleanup(&ReleaseCacheHandle, cache_, handle_);
+        cleanable->RegisterCleanup(&ReleaseCacheHandleCleanup, cache_, handle_);
      }
    }
    ResetFields();
@ -115,16 +110,6 @@ class CacheHandleGuard {
    value_ = nullptr;
  }

-  static void ReleaseCacheHandle(void* arg1, void* arg2) {
-    Cache* const cache = static_cast<Cache*>(arg1);
-    assert(cache);
-
-    Cache::Handle* const cache_handle = static_cast<Cache::Handle*>(arg2);
-    assert(cache_handle);
-
-    cache->Release(cache_handle);
-  }
-
 private:
  Cache* cache_ = nullptr;
  Cache::Handle* handle_ = nullptr;
@ -139,7 +124,16 @@ template <typename T>
 std::shared_ptr<T> MakeSharedCacheHandleGuard(Cache* cache,
                                              Cache::Handle* handle) {
  auto wrapper = std::make_shared<CacheHandleGuard<T>>(cache, handle);
-  return std::shared_ptr<T>(wrapper, static_cast<T*>(cache->Value(handle)));
+  return std::shared_ptr<T>(wrapper, GetFromCacheHandle<T>(cache, handle));
 }

+// Given the persistable data (saved) for a block cache entry, parse that
+// into a cache entry object and insert it into the given cache. The charge
+// of the new entry can be returned to the caller through `out_charge`.
+Status WarmInCache(Cache* cache, const Slice& key, const Slice& saved,
+                   Cache::CreateContext* create_context,
+                   const Cache::CacheItemHelper* helper,
+                   Cache::Priority priority = Cache::Priority::LOW,
+                   size_t* out_charge = nullptr);
+
 }  // namespace ROCKSDB_NAMESPACE
--- a/cache/cache_key.cc
+++ b/cache/cache_key.cc
@ -8,7 +8,7 @@
 #include <algorithm>
 #include <atomic>

-#include "rocksdb/cache.h"
+#include "rocksdb/advanced_cache.h"
 #include "table/unique_id_impl.h"
 #include "util/hash.h"
 #include "util/math.h"
--- a/cache/cache_reservation_manager.cc
+++ b/cache/cache_reservation_manager.cc
@ -13,7 +13,6 @@
 #include <cstring>
 #include <memory>

-#include "cache/cache_entry_roles.h"
 #include "rocksdb/cache.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/status.h"
@ -41,17 +40,17 @@ CacheReservationManagerImpl<
 template <CacheEntryRole R>
 CacheReservationManagerImpl<R>::CacheReservationManagerImpl(
    std::shared_ptr<Cache> cache, bool delayed_decrease)
-    : delayed_decrease_(delayed_decrease),
+    : cache_(cache),
+      delayed_decrease_(delayed_decrease),
      cache_allocated_size_(0),
      memory_used_(0) {
  assert(cache != nullptr);
-  cache_ = cache;
 }

 template <CacheEntryRole R>
 CacheReservationManagerImpl<R>::~CacheReservationManagerImpl() {
  for (auto* handle : dummy_handles_) {
-    cache_->Release(handle, true);
+    cache_.ReleaseAndEraseIfLastRef(handle);
  }
 }

@ -115,8 +114,7 @@ Status CacheReservationManagerImpl<R>::IncreaseCacheReservation(
  Status return_status = Status::OK();
  while (new_mem_used > cache_allocated_size_.load(std::memory_order_relaxed)) {
    Cache::Handle* handle = nullptr;
-    return_status = cache_->Insert(GetNextCacheKey(), nullptr, kSizeDummyEntry,
-                                   GetNoopDeleterForRole<R>(), &handle);
+    return_status = cache_.Insert(GetNextCacheKey(), kSizeDummyEntry, &handle);

    if (return_status != Status::OK()) {
      return return_status;
@ -141,7 +139,7 @@ Status CacheReservationManagerImpl<R>::DecreaseCacheReservation(
         cache_allocated_size_.load(std::memory_order_relaxed)) {
    assert(!dummy_handles_.empty());
    auto* handle = dummy_handles_.back();
-    cache_->Release(handle, true);
+    cache_.ReleaseAndEraseIfLastRef(handle);
    dummy_handles_.pop_back();
    cache_allocated_size_ -= kSizeDummyEntry;
  }
@ -169,8 +167,9 @@ Slice CacheReservationManagerImpl<R>::GetNextCacheKey() {
 }

 template <CacheEntryRole R>
-Cache::DeleterFn CacheReservationManagerImpl<R>::TEST_GetNoopDeleterForRole() {
-  return GetNoopDeleterForRole<R>();
+const Cache::CacheItemHelper*
+CacheReservationManagerImpl<R>::TEST_GetCacheItemHelperForRole() {
+  return CacheInterface::GetHelper();
 }

 template class CacheReservationManagerImpl<
--- a/cache/cache_reservation_manager.h
+++ b/cache/cache_reservation_manager.h
@ -18,7 +18,7 @@

 #include "cache/cache_entry_roles.h"
 #include "cache/cache_key.h"
-#include "rocksdb/cache.h"
+#include "cache/typed_cache.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/status.h"
 #include "util/coding.h"
@ -197,10 +197,10 @@ class CacheReservationManagerImpl

  static constexpr std::size_t GetDummyEntrySize() { return kSizeDummyEntry; }

-  // For testing only - it is to help ensure the NoopDeleterForRole<R>
+  // For testing only - it is to help ensure the CacheItemHelperForRole<R>
  // accessed from CacheReservationManagerImpl and the one accessed from the
  // test are from the same translation unit
-  static Cache::DeleterFn TEST_GetNoopDeleterForRole();
+  static const Cache::CacheItemHelper *TEST_GetCacheItemHelperForRole();

 private:
  static constexpr std::size_t kSizeDummyEntry = 256 * 1024;
@ -211,7 +211,8 @@ class CacheReservationManagerImpl
  Status IncreaseCacheReservation(std::size_t new_mem_used);
  Status DecreaseCacheReservation(std::size_t new_mem_used);

-  std::shared_ptr<Cache> cache_;
+  using CacheInterface = PlaceholderSharedCacheInterface<R>;
+  CacheInterface cache_;
  bool delayed_decrease_;
  std::atomic<std::size_t> cache_allocated_size_;
  std::size_t memory_used_;
--- a/cache/cache_test.cc
+++ b/cache/cache_test.cc
@ -16,9 +16,12 @@
 #include <vector>

 #include "cache/lru_cache.h"
+#include "cache/typed_cache.h"
 #include "port/stack_trace.h"
+#include "test_util/secondary_cache_test_util.h"
 #include "test_util/testharness.h"
 #include "util/coding.h"
+#include "util/hash_containers.h"
 #include "util/string_util.h"

 // HyperClockCache only supports 16-byte keys, so some of the tests
@ -55,42 +58,43 @@ int DecodeKey32Bits(const Slice& k) {
  return DecodeFixed32(k.data());
 }

-void* EncodeValue(uintptr_t v) { return reinterpret_cast<void*>(v); }
+Cache::ObjectPtr EncodeValue(uintptr_t v) {
+  return reinterpret_cast<Cache::ObjectPtr>(v);
+}

 int DecodeValue(void* v) {
  return static_cast<int>(reinterpret_cast<uintptr_t>(v));
 }

-void DumbDeleter(const Slice& /*key*/, void* /*value*/) {}
-
-void EraseDeleter1(const Slice& /*key*/, void* value) {
-  Cache* cache = reinterpret_cast<Cache*>(value);
-  cache->Erase("foo");
-}
-
-void EraseDeleter2(const Slice& /*key*/, void* value) {
-  Cache* cache = reinterpret_cast<Cache*>(value);
-  cache->Erase(EncodeKey16Bytes(1234));
-}
-
-const std::string kLRU = "lru";
-const std::string kHyperClock = "hyper_clock";
-
+const Cache::CacheItemHelper kDumbHelper{
+    CacheEntryRole::kMisc,
+    [](Cache::ObjectPtr /*value*/, MemoryAllocator* /*alloc*/) {}};
+
+const Cache::CacheItemHelper kEraseOnDeleteHelper1{
+    CacheEntryRole::kMisc,
+    [](Cache::ObjectPtr value, MemoryAllocator* /*alloc*/) {
+      Cache* cache = static_cast<Cache*>(value);
+      cache->Erase("foo");
+    }};
+
+const Cache::CacheItemHelper kEraseOnDeleteHelper2{
+    CacheEntryRole::kMisc,
+    [](Cache::ObjectPtr value, MemoryAllocator* /*alloc*/) {
+      Cache* cache = static_cast<Cache*>(value);
+      cache->Erase(EncodeKey16Bytes(1234));
+    }};
 }  // anonymous namespace

-class CacheTest : public testing::TestWithParam<std::string> {
+class CacheTest : public testing::Test,
+                  public secondary_cache_test_util::WithCacheTypeParam {
 public:
  static CacheTest* current_;
  static std::string type_;

-  static void Deleter(const Slice& key, void* v) {
-    if (type_ == kHyperClock) {
-      current_->deleted_keys_.push_back(DecodeKey16Bytes(key));
-    } else {
-      current_->deleted_keys_.push_back(DecodeKey32Bits(key));
-    }
+  static void Deleter(Cache::ObjectPtr v, MemoryAllocator*) {
    current_->deleted_values_.push_back(DecodeValue(v));
  }
+  static const Cache::CacheItemHelper kHelper;

  static const int kCacheSize = 1000;
  static const int kNumShardBits = 4;
@ -98,13 +102,10 @@ class CacheTest : public testing::TestWithParam<std::string> {
  static const int kCacheSize2 = 100;
  static const int kNumShardBits2 = 2;

-  std::vector<int> deleted_keys_;
  std::vector<int> deleted_values_;
  std::shared_ptr<Cache> cache_;
  std::shared_ptr<Cache> cache2_;

-  size_t estimated_value_size_ = 1;
-
  CacheTest()
      : cache_(NewCache(kCacheSize, kNumShardBits, false)),
        cache2_(NewCache(kCacheSize2, kNumShardBits2, false)) {
@ -114,41 +115,6 @@ class CacheTest : public testing::TestWithParam<std::string> {

  ~CacheTest() override {}

-  std::shared_ptr<Cache> NewCache(size_t capacity) {
-    auto type = GetParam();
-    if (type == kLRU) {
-      return NewLRUCache(capacity);
-    }
-    if (type == kHyperClock) {
-      return HyperClockCacheOptions(
-                 capacity, estimated_value_size_ /*estimated_value_size*/)
-          .MakeSharedCache();
-    }
-    return nullptr;
-  }
-
-  std::shared_ptr<Cache> NewCache(
-      size_t capacity, int num_shard_bits, bool strict_capacity_limit,
-      CacheMetadataChargePolicy charge_policy = kDontChargeCacheMetadata) {
-    auto type = GetParam();
-    if (type == kLRU) {
-      LRUCacheOptions co;
-      co.capacity = capacity;
-      co.num_shard_bits = num_shard_bits;
-      co.strict_capacity_limit = strict_capacity_limit;
-      co.high_pri_pool_ratio = 0;
-      co.metadata_charge_policy = charge_policy;
-      return NewLRUCache(co);
-    }
-    if (type == kHyperClock) {
-      return HyperClockCacheOptions(capacity, 1 /*estimated_value_size*/,
-                                    num_shard_bits, strict_capacity_limit,
-                                    nullptr /*allocator*/, charge_policy)
-          .MakeSharedCache();
-    }
-    return nullptr;
-  }
-
  // These functions encode/decode keys in test cases that use
  // int keys.
  // Currently, HyperClockCache requires keys to be 16B long, whereas
@ -182,8 +148,8 @@ class CacheTest : public testing::TestWithParam<std::string> {

  void Insert(std::shared_ptr<Cache> cache, int key, int value,
              int charge = 1) {
-    EXPECT_OK(cache->Insert(EncodeKey(key), EncodeValue(value), charge,
-                            &CacheTest::Deleter));
+    EXPECT_OK(cache->Insert(EncodeKey(key), EncodeValue(value), &kHelper,
+                            charge, /*handle*/ nullptr, Cache::Priority::HIGH));
  }

  void Erase(std::shared_ptr<Cache> cache, int key) {
@ -207,6 +173,9 @@ class CacheTest : public testing::TestWithParam<std::string> {
  void Erase2(int key) { Erase(cache2_, key); }
 };

+const Cache::CacheItemHelper CacheTest::kHelper{CacheEntryRole::kMisc,
+                                                &CacheTest::Deleter};
+
 CacheTest* CacheTest::current_;
 std::string CacheTest::type_;

@ -236,10 +205,8 @@ TEST_P(CacheTest, UsageTest) {
      key = EncodeKey(i);
    }
    auto kv_size = key.size() + 5;
-    ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), kv_size,
-                            DumbDeleter));
-    ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
-                                    kv_size, DumbDeleter));
+    ASSERT_OK(cache->Insert(key, value, &kDumbHelper, kv_size));
+    ASSERT_OK(precise_cache->Insert(key, value, &kDumbHelper, kv_size));
    usage += kv_size;
    ASSERT_EQ(usage, cache->GetUsage());
    if (type == kHyperClock) {
@ -262,10 +229,8 @@ TEST_P(CacheTest, UsageTest) {
    } else {
      key = EncodeKey(static_cast<int>(1000 + i));
    }
-    ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5,
-                            DumbDeleter));
-    ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
-                                    key.size() + 5, DumbDeleter));
+    ASSERT_OK(cache->Insert(key, value, &kDumbHelper, key.size() + 5));
+    ASSERT_OK(precise_cache->Insert(key, value, &kDumbHelper, key.size() + 5));
  }

  // the usage should be close to the capacity
@ -320,11 +285,9 @@ TEST_P(CacheTest, PinnedUsageTest) {
    auto kv_size = key.size() + 5;
    Cache::Handle* handle;
    Cache::Handle* handle_in_precise_cache;
-    ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), kv_size,
-                            DumbDeleter, &handle));
+    ASSERT_OK(cache->Insert(key, value, &kDumbHelper, kv_size, &handle));
    assert(handle);
-    ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
-                                    kv_size, DumbDeleter,
+    ASSERT_OK(precise_cache->Insert(key, value, &kDumbHelper, kv_size,
                                    &handle_in_precise_cache));
    assert(handle_in_precise_cache);
    pinned_usage += kv_size;
@ -365,10 +328,8 @@ TEST_P(CacheTest, PinnedUsageTest) {
    } else {
      key = EncodeKey(static_cast<int>(1000 + i));
    }
-    ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5,
-                            DumbDeleter));
-    ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
-                                    key.size() + 5, DumbDeleter));
+    ASSERT_OK(cache->Insert(key, value, &kDumbHelper, key.size() + 5));
+    ASSERT_OK(precise_cache->Insert(key, value, &kDumbHelper, key.size() + 5));
  }
  ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
  ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage());
@ -416,8 +377,7 @@ TEST_P(CacheTest, HitAndMiss) {
  ASSERT_EQ(201, Lookup(200));
  ASSERT_EQ(-1, Lookup(300));

-  ASSERT_EQ(1U, deleted_keys_.size());
-  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(1U, deleted_values_.size());
  if (GetParam() == kHyperClock) {
    ASSERT_EQ(102, deleted_values_[0]);
  } else {
@ -438,21 +398,20 @@ TEST_P(CacheTest, InsertSameKey) {

 TEST_P(CacheTest, Erase) {
  Erase(200);
-  ASSERT_EQ(0U, deleted_keys_.size());
+  ASSERT_EQ(0U, deleted_values_.size());

  Insert(100, 101);
  Insert(200, 201);
  Erase(100);
  ASSERT_EQ(-1, Lookup(100));
  ASSERT_EQ(201, Lookup(200));
-  ASSERT_EQ(1U, deleted_keys_.size());
-  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(1U, deleted_values_.size());
  ASSERT_EQ(101, deleted_values_[0]);

  Erase(100);
  ASSERT_EQ(-1, Lookup(100));
  ASSERT_EQ(201, Lookup(200));
-  ASSERT_EQ(1U, deleted_keys_.size());
+  ASSERT_EQ(1U, deleted_values_.size());
 }

 TEST_P(CacheTest, EntriesArePinned) {
@ -469,23 +428,21 @@ TEST_P(CacheTest, EntriesArePinned) {
  Insert(100, 102);
  Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
  ASSERT_EQ(102, DecodeValue(cache_->Value(h2)));
-  ASSERT_EQ(0U, deleted_keys_.size());
+  ASSERT_EQ(0U, deleted_values_.size());
  ASSERT_EQ(2U, cache_->GetUsage());

  cache_->Release(h1);
-  ASSERT_EQ(1U, deleted_keys_.size());
-  ASSERT_EQ(100, deleted_keys_[0]);
+  ASSERT_EQ(1U, deleted_values_.size());
  ASSERT_EQ(101, deleted_values_[0]);
  ASSERT_EQ(1U, cache_->GetUsage());

  Erase(100);
  ASSERT_EQ(-1, Lookup(100));
-  ASSERT_EQ(1U, deleted_keys_.size());
+  ASSERT_EQ(1U, deleted_values_.size());
  ASSERT_EQ(1U, cache_->GetUsage());

  cache_->Release(h2);
-  ASSERT_EQ(2U, deleted_keys_.size());
-  ASSERT_EQ(100, deleted_keys_[1]);
+  ASSERT_EQ(2U, deleted_values_.size());
  ASSERT_EQ(102, deleted_values_[1]);
  ASSERT_EQ(0U, cache_->GetUsage());
 }
@ -588,9 +545,9 @@ TEST_P(CacheTest, EvictEmptyCache) {
  // Insert an item larger than capacity to trigger eviction on empty cache.
  auto cache = NewCache(1, 0, false);
  if (type == kLRU) {
-    ASSERT_OK(cache->Insert("foo", nullptr, 10, DumbDeleter));
+    ASSERT_OK(cache->Insert("foo", nullptr, &kDumbHelper, 10));
  } else {
-    ASSERT_OK(cache->Insert(EncodeKey(1000), nullptr, 10, DumbDeleter));
+    ASSERT_OK(cache->Insert(EncodeKey(1000), nullptr, &kDumbHelper, 10));
  }
 }

@ -601,19 +558,19 @@ TEST_P(CacheTest, EraseFromDeleter) {
  // the cache at that point.
  std::shared_ptr<Cache> cache = NewCache(10, 0, false);
  std::string foo, bar;
-  Cache::DeleterFn erase_deleter;
+  const Cache::CacheItemHelper* erase_helper;
  if (type == kLRU) {
    foo = "foo";
    bar = "bar";
-    erase_deleter = EraseDeleter1;
+    erase_helper = &kEraseOnDeleteHelper1;
  } else {
    foo = EncodeKey(1234);
    bar = EncodeKey(5678);
-    erase_deleter = EraseDeleter2;
+    erase_helper = &kEraseOnDeleteHelper2;
  }

-  ASSERT_OK(cache->Insert(foo, nullptr, 1, DumbDeleter));
-  ASSERT_OK(cache->Insert(bar, cache.get(), 1, erase_deleter));
+  ASSERT_OK(cache->Insert(foo, nullptr, &kDumbHelper, 1));
+  ASSERT_OK(cache->Insert(bar, cache.get(), erase_helper, 1));

  cache->Erase(bar);
  ASSERT_EQ(nullptr, cache->Lookup(foo));
@ -675,50 +632,51 @@ TEST_P(CacheTest, NewId) {
  ASSERT_NE(a, b);
 }

-class Value {
- public:
-  explicit Value(int v) : v_(v) {}
-
-  int v_;
-};
-
-namespace {
-void deleter(const Slice& /*key*/, void* value) {
-  delete static_cast<Value*>(value);
-}
-}  // namespace
-
 TEST_P(CacheTest, ReleaseAndErase) {
  std::shared_ptr<Cache> cache = NewCache(5, 0, false);
  Cache::Handle* handle;
-  Status s = cache->Insert(EncodeKey(100), EncodeValue(100), 1,
-                           &CacheTest::Deleter, &handle);
+  Status s =
+      cache->Insert(EncodeKey(100), EncodeValue(100), &kHelper, 1, &handle);
  ASSERT_TRUE(s.ok());
  ASSERT_EQ(5U, cache->GetCapacity());
  ASSERT_EQ(1U, cache->GetUsage());
-  ASSERT_EQ(0U, deleted_keys_.size());
+  ASSERT_EQ(0U, deleted_values_.size());
  auto erased = cache->Release(handle, true);
  ASSERT_TRUE(erased);
  // This tests that deleter has been called
-  ASSERT_EQ(1U, deleted_keys_.size());
+  ASSERT_EQ(1U, deleted_values_.size());
 }

 TEST_P(CacheTest, ReleaseWithoutErase) {
  std::shared_ptr<Cache> cache = NewCache(5, 0, false);
  Cache::Handle* handle;
-  Status s = cache->Insert(EncodeKey(100), EncodeValue(100), 1,
-                           &CacheTest::Deleter, &handle);
+  Status s =
+      cache->Insert(EncodeKey(100), EncodeValue(100), &kHelper, 1, &handle);
  ASSERT_TRUE(s.ok());
  ASSERT_EQ(5U, cache->GetCapacity());
  ASSERT_EQ(1U, cache->GetUsage());
-  ASSERT_EQ(0U, deleted_keys_.size());
+  ASSERT_EQ(0U, deleted_values_.size());
  auto erased = cache->Release(handle);
  ASSERT_FALSE(erased);
  // This tests that deleter is not called. When cache has free capacity it is
  // not expected to immediately erase the released items.
-  ASSERT_EQ(0U, deleted_keys_.size());
+  ASSERT_EQ(0U, deleted_values_.size());
 }

+namespace {
+class Value {
+ public:
+  explicit Value(int v) : v_(v) {}
+
+  int v_;
+
+  static constexpr auto kCacheEntryRole = CacheEntryRole::kMisc;
+};
+
+using SharedCache = BasicTypedSharedCacheInterface<Value>;
+using TypedHandle = SharedCache::TypedHandle;
+}  // namespace
+
 TEST_P(CacheTest, SetCapacity) {
  auto type = GetParam();
  if (type == kHyperClock) {
@ -731,19 +689,19 @@ TEST_P(CacheTest, SetCapacity) {
  // let's create a cache with capacity 5,
  // then, insert 5 elements, then increase capacity
  // to 10, returned capacity should be 10, usage=5
-  std::shared_ptr<Cache> cache = NewCache(5, 0, false);
-  std::vector<Cache::Handle*> handles(10);
+  SharedCache cache{NewCache(5, 0, false)};
+  std::vector<TypedHandle*> handles(10);
  // Insert 5 entries, but not releasing.
  for (int i = 0; i < 5; i++) {
    std::string key = EncodeKey(i + 1);
-    Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
+    Status s = cache.Insert(key, new Value(i + 1), 1, &handles[i]);
    ASSERT_TRUE(s.ok());
  }
-  ASSERT_EQ(5U, cache->GetCapacity());
-  ASSERT_EQ(5U, cache->GetUsage());
-  cache->SetCapacity(10);
-  ASSERT_EQ(10U, cache->GetCapacity());
-  ASSERT_EQ(5U, cache->GetUsage());
+  ASSERT_EQ(5U, cache.get()->GetCapacity());
+  ASSERT_EQ(5U, cache.get()->GetUsage());
+  cache.get()->SetCapacity(10);
+  ASSERT_EQ(10U, cache.get()->GetCapacity());
+  ASSERT_EQ(5U, cache.get()->GetUsage());

  // test2: decrease capacity
  // insert 5 more elements to cache, then release 5,
@ -751,77 +709,77 @@ TEST_P(CacheTest, SetCapacity) {
  // and usage should be 7
  for (int i = 5; i < 10; i++) {
    std::string key = EncodeKey(i + 1);
-    Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
+    Status s = cache.Insert(key, new Value(i + 1), 1, &handles[i]);
    ASSERT_TRUE(s.ok());
  }
-  ASSERT_EQ(10U, cache->GetCapacity());
-  ASSERT_EQ(10U, cache->GetUsage());
+  ASSERT_EQ(10U, cache.get()->GetCapacity());
+  ASSERT_EQ(10U, cache.get()->GetUsage());
  for (int i = 0; i < 5; i++) {
-    cache->Release(handles[i]);
+    cache.Release(handles[i]);
  }
-  ASSERT_EQ(10U, cache->GetCapacity());
-  ASSERT_EQ(10U, cache->GetUsage());
-  cache->SetCapacity(7);
-  ASSERT_EQ(7, cache->GetCapacity());
-  ASSERT_EQ(7, cache->GetUsage());
+  ASSERT_EQ(10U, cache.get()->GetCapacity());
+  ASSERT_EQ(10U, cache.get()->GetUsage());
+  cache.get()->SetCapacity(7);
+  ASSERT_EQ(7, cache.get()->GetCapacity());
+  ASSERT_EQ(7, cache.get()->GetUsage());

  // release remaining 5 to keep valgrind happy
  for (int i = 5; i < 10; i++) {
-    cache->Release(handles[i]);
+    cache.Release(handles[i]);
  }

  // Make sure this doesn't crash or upset ASAN/valgrind
-  cache->DisownData();
+  cache.get()->DisownData();
 }

 TEST_P(LRUCacheTest, SetStrictCapacityLimit) {
  // test1: set the flag to false. Insert more keys than capacity. See if they
  // all go through.
-  std::shared_ptr<Cache> cache = NewCache(5, 0, false);
-  std::vector<Cache::Handle*> handles(10);
+  SharedCache cache{NewCache(5, 0, false)};
+  std::vector<TypedHandle*> handles(10);
  Status s;
  for (int i = 0; i < 10; i++) {
    std::string key = EncodeKey(i + 1);
-    s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
+    s = cache.Insert(key, new Value(i + 1), 1, &handles[i]);
    ASSERT_OK(s);
    ASSERT_NE(nullptr, handles[i]);
  }
-  ASSERT_EQ(10, cache->GetUsage());
+  ASSERT_EQ(10, cache.get()->GetUsage());

  // test2: set the flag to true. Insert and check if it fails.
  std::string extra_key = EncodeKey(100);
  Value* extra_value = new Value(0);
-  cache->SetStrictCapacityLimit(true);
-  Cache::Handle* handle;
-  s = cache->Insert(extra_key, extra_value, 1, &deleter, &handle);
+  cache.get()->SetStrictCapacityLimit(true);
+  TypedHandle* handle;
+  s = cache.Insert(extra_key, extra_value, 1, &handle);
  ASSERT_TRUE(s.IsMemoryLimit());
  ASSERT_EQ(nullptr, handle);
-  ASSERT_EQ(10, cache->GetUsage());
+  ASSERT_EQ(10, cache.get()->GetUsage());

  for (int i = 0; i < 10; i++) {
-    cache->Release(handles[i]);
+    cache.Release(handles[i]);
  }

  // test3: init with flag being true.
-  std::shared_ptr<Cache> cache2 = NewCache(5, 0, true);
+  SharedCache cache2{NewCache(5, 0, true)};
  for (int i = 0; i < 5; i++) {
    std::string key = EncodeKey(i + 1);
-    s = cache2->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
+    s = cache2.Insert(key, new Value(i + 1), 1, &handles[i]);
    ASSERT_OK(s);
    ASSERT_NE(nullptr, handles[i]);
  }
-  s = cache2->Insert(extra_key, extra_value, 1, &deleter, &handle);
+  s = cache2.Insert(extra_key, extra_value, 1, &handle);
  ASSERT_TRUE(s.IsMemoryLimit());
  ASSERT_EQ(nullptr, handle);
  // test insert without handle
-  s = cache2->Insert(extra_key, extra_value, 1, &deleter);
+  s = cache2.Insert(extra_key, extra_value, 1);
  // As if the key has been inserted into cache but got evicted immediately.
  ASSERT_OK(s);
-  ASSERT_EQ(5, cache2->GetUsage());
-  ASSERT_EQ(nullptr, cache2->Lookup(extra_key));
+  ASSERT_EQ(5, cache2.get()->GetUsage());
+  ASSERT_EQ(nullptr, cache2.Lookup(extra_key));

  for (int i = 0; i < 5; i++) {
-    cache2->Release(handles[i]);
+    cache2.Release(handles[i]);
  }
 }

@ -829,55 +787,54 @@ TEST_P(CacheTest, OverCapacity) {
  size_t n = 10;

  // a LRUCache with n entries and one shard only
-  std::shared_ptr<Cache> cache = NewCache(n, 0, false);
-
-  std::vector<Cache::Handle*> handles(n + 1);
+  SharedCache cache{NewCache(n, 0, false)};
+  std::vector<TypedHandle*> handles(n + 1);

  // Insert n+1 entries, but not releasing.
  for (int i = 0; i < static_cast<int>(n + 1); i++) {
    std::string key = EncodeKey(i + 1);
-    Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
+    Status s = cache.Insert(key, new Value(i + 1), 1, &handles[i]);
    ASSERT_TRUE(s.ok());
  }

  // Guess what's in the cache now?
  for (int i = 0; i < static_cast<int>(n + 1); i++) {
    std::string key = EncodeKey(i + 1);
-    auto h = cache->Lookup(key);
+    auto h = cache.Lookup(key);
    ASSERT_TRUE(h != nullptr);
-    if (h) cache->Release(h);
+    if (h) cache.Release(h);
  }

  // the cache is over capacity since nothing could be evicted
-  ASSERT_EQ(n + 1U, cache->GetUsage());
+  ASSERT_EQ(n + 1U, cache.get()->GetUsage());
  for (int i = 0; i < static_cast<int>(n + 1); i++) {
-    cache->Release(handles[i]);
+    cache.Release(handles[i]);
  }

  if (GetParam() == kHyperClock) {
    // Make sure eviction is triggered.
-    ASSERT_OK(cache->Insert(EncodeKey(-1), nullptr, 1, &deleter, &handles[0]));
+    ASSERT_OK(cache.Insert(EncodeKey(-1), nullptr, 1, &handles[0]));

    // cache is under capacity now since elements were released
-    ASSERT_GE(n, cache->GetUsage());
+    ASSERT_GE(n, cache.get()->GetUsage());

    // clean up
-    cache->Release(handles[0]);
+    cache.Release(handles[0]);
  } else {
    // LRUCache checks for over-capacity in Release.

    // cache is exactly at capacity now with minimal eviction
-    ASSERT_EQ(n, cache->GetUsage());
+    ASSERT_EQ(n, cache.get()->GetUsage());

    // element 0 is evicted and the rest is there
    // This is consistent with the LRU policy since the element 0
    // was released first
    for (int i = 0; i < static_cast<int>(n + 1); i++) {
      std::string key = EncodeKey(i + 1);
-      auto h = cache->Lookup(key);
+      auto h = cache.Lookup(key);
      if (h) {
        ASSERT_NE(static_cast<size_t>(i), 0U);
-        cache->Release(h);
+        cache.Release(h);
      } else {
        ASSERT_EQ(static_cast<size_t>(i), 0U);
      }
@ -885,40 +842,15 @@ TEST_P(CacheTest, OverCapacity) {
  }
 }

-namespace {
-std::vector<std::pair<int, int>> legacy_callback_state;
-void legacy_callback(void* value, size_t charge) {
-  legacy_callback_state.push_back(
-      {DecodeValue(value), static_cast<int>(charge)});
-}
-};  // namespace
-
-TEST_P(CacheTest, ApplyToAllCacheEntriesTest) {
-  std::vector<std::pair<int, int>> inserted;
-  legacy_callback_state.clear();
-
-  for (int i = 0; i < 10; ++i) {
-    Insert(i, i * 2, i + 1);
-    inserted.push_back({i * 2, i + 1});
-  }
-  cache_->ApplyToAllCacheEntries(legacy_callback, true);
-
-  std::sort(inserted.begin(), inserted.end());
-  std::sort(legacy_callback_state.begin(), legacy_callback_state.end());
-  ASSERT_EQ(inserted.size(), legacy_callback_state.size());
-  for (int i = 0; i < static_cast<int>(inserted.size()); ++i) {
-    EXPECT_EQ(inserted[i], legacy_callback_state[i]);
-  }
-}
-
 TEST_P(CacheTest, ApplyToAllEntriesTest) {
  std::vector<std::string> callback_state;
-  const auto callback = [&](const Slice& key, void* value, size_t charge,
-                            Cache::DeleterFn deleter) {
+  const auto callback = [&](const Slice& key, Cache::ObjectPtr value,
+                            size_t charge,
+                            const Cache::CacheItemHelper* helper) {
    callback_state.push_back(std::to_string(DecodeKey(key)) + "," +
                             std::to_string(DecodeValue(value)) + "," +
                             std::to_string(charge));
-    assert(deleter == &CacheTest::Deleter);
+    assert(helper == &CacheTest::kHelper);
  };

  std::vector<std::string> inserted;
@ -957,8 +889,8 @@ TEST_P(CacheTest, ApplyToAllEntriesDuringResize) {

  // For callback
  int special_count = 0;
-  const auto callback = [&](const Slice&, void*, size_t charge,
-                            Cache::DeleterFn) {
+  const auto callback = [&](const Slice&, Cache::ObjectPtr, size_t charge,
+                            const Cache::CacheItemHelper*) {
    if (charge == static_cast<size_t>(kSpecialCharge)) {
      ++special_count;
    }
@ -1020,13 +952,105 @@ TEST_P(CacheTest, GetChargeAndDeleter) {
  Cache::Handle* h1 = cache_->Lookup(EncodeKey(1));
  ASSERT_EQ(2, DecodeValue(cache_->Value(h1)));
  ASSERT_EQ(1, cache_->GetCharge(h1));
-  ASSERT_EQ(&CacheTest::Deleter, cache_->GetDeleter(h1));
+  ASSERT_EQ(&CacheTest::kHelper, cache_->GetCacheItemHelper(h1));
  cache_->Release(h1);
 }

+namespace {
+// Returns true iff the two entries in `cache` are visited by
+// ApplyToAllEntries in ascending key order. Expects exactly two entries with
+// distinct keys; records a non-fatal test failure and returns false otherwise.
+bool AreTwoCacheKeysOrdered(Cache* cache) {
+  std::vector<std::string> keys;
+  const auto callback = [&](const Slice& key, Cache::ObjectPtr /*value*/,
+                            size_t /*charge*/,
+                            const Cache::CacheItemHelper* /*helper*/) {
+    keys.push_back(key.ToString());
+  };
+  cache->ApplyToAllEntries(callback, /*opts*/ {});
+  EXPECT_EQ(keys.size(), 2U);
+  if (keys.size() != 2U) {
+    // EXPECT_* is non-fatal, so bail out before indexing out of bounds.
+    return false;
+  }
+  EXPECT_NE(keys[0], keys[1]);
+  return keys[0] < keys[1];
+}
+}  // namespace
+
+TEST_P(CacheTest, CacheUniqueSeeds) {
+  // kQuasiRandomHashSeed should generate unique seeds (up to 2 billion before
+  // repeating)
+  UnorderedSet<uint32_t> seeds_seen;
+  // Roughly sqrt(number of possible values) for a decent chance at detecting
+  // a random collision if it's possible (shouldn't be)
+  uint16_t kSamples = 20000;
+  seeds_seen.reserve(kSamples);
+
+  // Hash seed should affect ordering of entries in the table, so we should
+  // have extremely high chance of seeing two entries ordered both ways.
+  bool seen_forward_order = false;
+  bool seen_reverse_order = false;
+
+  for (int i = 0; i < kSamples; ++i) {
+    auto cache = NewCache(2, [=](ShardedCacheOptions& opts) {
+      opts.hash_seed = LRUCacheOptions::kQuasiRandomHashSeed;
+      opts.num_shard_bits = 0;
+      opts.metadata_charge_policy = kDontChargeCacheMetadata;
+    });
+    auto val = cache->GetHashSeed();
+    ASSERT_TRUE(seeds_seen.insert(val).second);
+
+    ASSERT_OK(cache->Insert(EncodeKey(1), nullptr, &kHelper, /*charge*/ 1));
+    ASSERT_OK(cache->Insert(EncodeKey(2), nullptr, &kHelper, /*charge*/ 1));
+
+    if (AreTwoCacheKeysOrdered(cache.get())) {
+      seen_forward_order = true;
+    } else {
+      seen_reverse_order = true;
+    }
+  }
+
+  ASSERT_TRUE(seen_forward_order);
+  ASSERT_TRUE(seen_reverse_order);
+}
+
+TEST_P(CacheTest, CacheHostSeed) {
+  // kHostHashSeed should generate a consistent seed within this process
+  // (and other processes on the same host, but not unit testing that).
+  // And we should be able to use that chosen seed as an explicit option
+  // (for debugging).
+  // And we should verify consistent ordering of entries.
+  uint32_t expected_seed = 0;
+  bool expected_order = false;
+  // 10 iterations -> chance of a random seed falsely appearing consistent
+  // should be low, just 1 in 2^9.
+  for (int i = 0; i < 10; ++i) {
+    auto cache = NewCache(2, [=](ShardedCacheOptions& opts) {
+      if (i != 5) {
+        opts.hash_seed = LRUCacheOptions::kHostHashSeed;
+      } else {
+        // Can be used as explicit seed
+        opts.hash_seed = static_cast<int32_t>(expected_seed);
+        ASSERT_GE(opts.hash_seed, 0);
+      }
+      opts.num_shard_bits = 0;
+      opts.metadata_charge_policy = kDontChargeCacheMetadata;
+    });
+    ASSERT_OK(cache->Insert(EncodeKey(1), nullptr, &kHelper, /*charge*/ 1));
+    ASSERT_OK(cache->Insert(EncodeKey(2), nullptr, &kHelper, /*charge*/ 1));
+    uint32_t val = cache->GetHashSeed();
+    bool order = AreTwoCacheKeysOrdered(cache.get());
+    if (i != 0) {
+      ASSERT_EQ(val, expected_seed);
+      ASSERT_EQ(order, expected_order);
+    } else {
+      expected_seed = val;
+      expected_order = order;
+    }
+  }
+  // Printed for reference in case it's needed to reproduce other unit test
+  // failures on another host
+  fprintf(stderr, "kHostHashSeed -> %u\n", (unsigned)expected_seed);
+}
+
 INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest,
-                        testing::Values(kLRU, kHyperClock));
-INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest, testing::Values(kLRU));
+                        secondary_cache_test_util::GetTestingCacheTypes());
+INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest,
+                        testing::Values(secondary_cache_test_util::kLRU));

 }  // namespace ROCKSDB_NAMESPACE

--- a/cache/charged_cache.cc
+++ b/cache/charged_cache.cc
@ -11,65 +11,57 @@ namespace ROCKSDB_NAMESPACE {

 ChargedCache::ChargedCache(std::shared_ptr<Cache> cache,
                           std::shared_ptr<Cache> block_cache)
-    : cache_(cache),
+    : CacheWrapper(cache),
      cache_res_mgr_(std::make_shared<ConcurrentCacheReservationManager>(
          std::make_shared<
              CacheReservationManagerImpl<CacheEntryRole::kBlobCache>>(
              block_cache))) {}

-Status ChargedCache::Insert(const Slice& key, void* value, size_t charge,
-                            DeleterFn deleter, Handle** handle,
-                            Priority priority) {
-  Status s = cache_->Insert(key, value, charge, deleter, handle, priority);
-  if (s.ok()) {
-    // Insert may cause the cache entry eviction if the cache is full. So we
-    // directly call the reservation manager to update the total memory used
-    // in the cache.
-    assert(cache_res_mgr_);
-    cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
-        .PermitUncheckedError();
-  }
-  return s;
-}
-
-Status ChargedCache::Insert(const Slice& key, void* value,
+Status ChargedCache::Insert(const Slice& key, ObjectPtr obj,
                            const CacheItemHelper* helper, size_t charge,
                            Handle** handle, Priority priority) {
-  Status s = cache_->Insert(key, value, helper, charge, handle, priority);
+  Status s = target_->Insert(key, obj, helper, charge, handle, priority);
  if (s.ok()) {
    // Insert may cause the cache entry eviction if the cache is full. So we
    // directly call the reservation manager to update the total memory used
    // in the cache.
    assert(cache_res_mgr_);
-    cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
+    cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
        .PermitUncheckedError();
  }
  return s;
 }

-Cache::Handle* ChargedCache::Lookup(const Slice& key, Statistics* stats) {
-  return cache_->Lookup(key, stats);
-}
-
 Cache::Handle* ChargedCache::Lookup(const Slice& key,
                                    const CacheItemHelper* helper,
-                                    const CreateCallback& create_cb,
-                                    Priority priority, bool wait,
-                                    Statistics* stats) {
-  auto handle = cache_->Lookup(key, helper, create_cb, priority, wait, stats);
+                                    CreateContext* create_context,
+                                    Priority priority, Statistics* stats) {
+  auto handle = target_->Lookup(key, helper, create_context, priority, stats);
  // Lookup may promote the KV pair from the secondary cache to the primary
  // cache. So we directly call the reservation manager to update the total
  // memory used in the cache.
+  if (helper && helper->create_cb) {
+    assert(cache_res_mgr_);
+    cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
+        .PermitUncheckedError();
+  }
+  return handle;
+}
+
+void ChargedCache::WaitAll(AsyncLookupHandle* async_handles, size_t count) {
+  target_->WaitAll(async_handles, count);
+  // In case of any promotions. Although some could finish by return of
+  // StartAsyncLookup, Wait/WaitAll will generally be used, so simpler to
+  // update here.
  assert(cache_res_mgr_);
-  cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
+  cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
      .PermitUncheckedError();
-  return handle;
 }

 bool ChargedCache::Release(Cache::Handle* handle, bool useful,
                           bool erase_if_last_ref) {
-  size_t memory_used_delta = cache_->GetUsage(handle);
-  bool erased = cache_->Release(handle, useful, erase_if_last_ref);
+  size_t memory_used_delta = target_->GetUsage(handle);
+  bool erased = target_->Release(handle, useful, erase_if_last_ref);
  if (erased) {
    assert(cache_res_mgr_);
    cache_res_mgr_
@ -80,8 +72,8 @@ bool ChargedCache::Release(Cache::Handle* handle, bool useful,
 }

 bool ChargedCache::Release(Cache::Handle* handle, bool erase_if_last_ref) {
-  size_t memory_used_delta = cache_->GetUsage(handle);
-  bool erased = cache_->Release(handle, erase_if_last_ref);
+  size_t memory_used_delta = target_->GetUsage(handle);
+  bool erased = target_->Release(handle, erase_if_last_ref);
  if (erased) {
    assert(cache_res_mgr_);
    cache_res_mgr_
@ -92,25 +84,25 @@ bool ChargedCache::Release(Cache::Handle* handle, bool erase_if_last_ref) {
 }

 void ChargedCache::Erase(const Slice& key) {
-  cache_->Erase(key);
+  target_->Erase(key);
  assert(cache_res_mgr_);
-  cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
+  cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
      .PermitUncheckedError();
 }

 void ChargedCache::EraseUnRefEntries() {
-  cache_->EraseUnRefEntries();
+  target_->EraseUnRefEntries();
  assert(cache_res_mgr_);
-  cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
+  cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
      .PermitUncheckedError();
 }

 void ChargedCache::SetCapacity(size_t capacity) {
-  cache_->SetCapacity(capacity);
+  target_->SetCapacity(capacity);
  // SetCapacity can result in evictions when the cache capacity is decreased,
  // so we would want to update the cache reservation here as well.
  assert(cache_res_mgr_);
-  cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
+  cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
      .PermitUncheckedError();
 }

--- a/cache/charged_cache.h
+++ b/cache/charged_cache.h
@ -8,7 +8,7 @@
 #include <string>

 #include "port/port.h"
-#include "rocksdb/cache.h"
+#include "rocksdb/advanced_cache.h"

 namespace ROCKSDB_NAMESPACE {

@ -17,22 +17,21 @@ class ConcurrentCacheReservationManager;
 // A cache interface which wraps around another cache and takes care of
 // reserving space in block cache towards a single global memory limit, and
 // forwards all the calls to the underlying cache.
-class ChargedCache : public Cache {
+class ChargedCache : public CacheWrapper {
 public:
  ChargedCache(std::shared_ptr<Cache> cache,
               std::shared_ptr<Cache> block_cache);
-  ~ChargedCache() override = default;

-  Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter,
-                Handle** handle, Priority priority) override;
-  Status Insert(const Slice& key, void* value, const CacheItemHelper* helper,
+  Status Insert(const Slice& key, ObjectPtr obj, const CacheItemHelper* helper,
                size_t charge, Handle** handle = nullptr,
                Priority priority = Priority::LOW) override;

-  Cache::Handle* Lookup(const Slice& key, Statistics* stats) override;
  Cache::Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
-                        const CreateCallback& create_cb, Priority priority,
-                        bool wait, Statistics* stats = nullptr) override;
+                        CreateContext* create_context,
+                        Priority priority = Priority::LOW,
+                        Statistics* stats = nullptr) override;
+
+  void WaitAll(AsyncLookupHandle* async_handles, size_t count) override;

  bool Release(Cache::Handle* handle, bool useful,
               bool erase_if_last_ref = false) override;
@ -44,69 +43,9 @@ class ChargedCache : public Cache {
  static const char* kClassName() { return "ChargedCache"; }
  const char* Name() const override { return kClassName(); }

-  uint64_t NewId() override { return cache_->NewId(); }
-
  void SetCapacity(size_t capacity) override;

-  void SetStrictCapacityLimit(bool strict_capacity_limit) override {
-    cache_->SetStrictCapacityLimit(strict_capacity_limit);
-  }
-
-  bool HasStrictCapacityLimit() const override {
-    return cache_->HasStrictCapacityLimit();
-  }
-
-  void* Value(Cache::Handle* handle) override { return cache_->Value(handle); }
-
-  bool IsReady(Cache::Handle* handle) override {
-    return cache_->IsReady(handle);
-  }
-
-  void Wait(Cache::Handle* handle) override { cache_->Wait(handle); }
-
-  void WaitAll(std::vector<Handle*>& handles) override {
-    cache_->WaitAll(handles);
-  }
-
-  bool Ref(Cache::Handle* handle) override { return cache_->Ref(handle); }
-
-  size_t GetCapacity() const override { return cache_->GetCapacity(); }
-
-  size_t GetUsage() const override { return cache_->GetUsage(); }
-
-  size_t GetUsage(Cache::Handle* handle) const override {
-    return cache_->GetUsage(handle);
-  }
-
-  size_t GetPinnedUsage() const override { return cache_->GetPinnedUsage(); }
-
-  size_t GetCharge(Cache::Handle* handle) const override {
-    return cache_->GetCharge(handle);
-  }
-
-  Cache::DeleterFn GetDeleter(Cache::Handle* handle) const override {
-    return cache_->GetDeleter(handle);
-  }
-
-  void ApplyToAllEntries(
-      const std::function<void(const Slice& key, void* value, size_t charge,
-                               Cache::DeleterFn deleter)>& callback,
-      const Cache::ApplyToAllEntriesOptions& opts) override {
-    cache_->ApplyToAllEntries(callback, opts);
-  }
-
-  void ApplyToAllCacheEntries(void (*callback)(void* value, size_t charge),
-                              bool thread_safe) override {
-    cache_->ApplyToAllCacheEntries(callback, thread_safe);
-  }
-
-  std::string GetPrintableOptions() const override {
-    return cache_->GetPrintableOptions();
-  }
-
-  void DisownData() override { return cache_->DisownData(); }
-
-  inline Cache* GetCache() const { return cache_.get(); }
+  inline Cache* GetCache() const { return target_.get(); }

  inline ConcurrentCacheReservationManager* TEST_GetCacheReservationManager()
      const {
@ -114,7 +53,6 @@ class ChargedCache : public Cache {
  }

 private:
-  std::shared_ptr<Cache> cache_;
  std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr_;
 };

--- a/cache/clock_cache.cc
+++ b/cache/clock_cache.cc
--- a/cache/clock_cache.h
+++ b/cache/clock_cache.h
@ -24,6 +24,7 @@
 #include "rocksdb/cache.h"
 #include "rocksdb/secondary_cache.h"
 #include "util/autovector.h"
+#include "util/math.h"

 namespace ROCKSDB_NAMESPACE {

@ -145,7 +146,7 @@ class ClockCacheTest;
 //     (erased by user) but can be read by existing references, and ref count
 //     changed by Ref and Release.
 //
-// A special case is "detached" entries, which are heap-allocated handles
+// A special case is "standalone" entries, which are heap-allocated handles
 // not in the table. They are always Invisible and freed on zero refs.
 //
 // State transitions:
@ -200,8 +201,8 @@ class ClockCacheTest;
 // table occupancy limit has been reached. If strict_capacity_limit=false,
 // we must never fail Insert, and if a Handle* is provided, we have to return
 // a usable Cache handle on success. The solution to this (typically rare)
-// problem is "detached" handles, which are usable by the caller but not
-// actually available for Lookup in the Cache. Detached handles are allocated
+// problem is "standalone" handles, which are usable by the caller but not
+// actually available for Lookup in the Cache. Standalone handles are allocated
 // independently on the heap and specially marked so that they are freed on
 // the heap when their last reference is released.
 //
@ -305,23 +306,17 @@ constexpr double kLoadFactor = 0.7;
 constexpr double kStrictLoadFactor = 0.84;

 struct ClockHandleBasicData {
-  void* value = nullptr;
-  Cache::DeleterFn deleter = nullptr;
+  Cache::ObjectPtr value = nullptr;
+  const Cache::CacheItemHelper* helper = nullptr;
  // A lossless, reversible hash of the fixed-size (16 byte) cache key. This
  // eliminates the need to store a hash separately.
  UniqueId64x2 hashed_key = kNullUniqueId64x2;
  size_t total_charge = 0;

-  // For total_charge_and_flags
-  // "Detached" means the handle is allocated separately from hash table.
-  static constexpr uint64_t kFlagDetached = uint64_t{1} << 63;
-  // Extract just the total charge
-  static constexpr uint64_t kTotalChargeMask = kFlagDetached - 1;
-
  inline size_t GetTotalCharge() const { return total_charge; }

  // Calls deleter (if non-null) on cache key and value
-  void FreeData() const;
+  void FreeData(MemoryAllocator* allocator) const;

  // Required by concept HandleImpl
  const UniqueId64x2& GetHash() const { return hashed_key; }
@ -377,14 +372,134 @@ struct ClockHandle : public ClockHandleBasicData {
  static constexpr uint8_t kMaxCountdown = kHighCountdown;
  // TODO: make these coundown values tuning parameters for eviction?

-  // See above
-  std::atomic<uint64_t> meta{};
+  // See above. Mutable for read reference counting.
+  mutable std::atomic<uint64_t> meta{};
+
+  // Whether this is a "standalone" handle that is independently allocated
+  // with `new` (so must be deleted with `delete`).
+  // TODO: ideally this would be packed into some other data field, such
+  // as upper bits of total_charge, but that incurs a measurable performance
+  // regression.
+  bool standalone = false;

-  // Anticipating use for SecondaryCache support
-  void* reserved_for_future_use = nullptr;
+  inline bool IsStandalone() const { return standalone; }
+
+  inline void SetStandalone() { standalone = true; }
 };  // struct ClockHandle

-class HyperClockTable {
+class BaseClockTable {
+ public:
+  BaseClockTable(CacheMetadataChargePolicy metadata_charge_policy,
+                 MemoryAllocator* allocator,
+                 const Cache::EvictionCallback* eviction_callback,
+                 const uint32_t* hash_seed)
+      : metadata_charge_policy_(metadata_charge_policy),
+        allocator_(allocator),
+        eviction_callback_(*eviction_callback),
+        hash_seed_(*hash_seed) {}
+
+  template <class Table>
+  typename Table::HandleImpl* CreateStandalone(ClockHandleBasicData& proto,
+                                               size_t capacity,
+                                               bool strict_capacity_limit,
+                                               bool allow_uncharged);
+
+  template <class Table>
+  Status Insert(const ClockHandleBasicData& proto,
+                typename Table::HandleImpl** handle, Cache::Priority priority,
+                size_t capacity, bool strict_capacity_limit);
+
+  void Ref(ClockHandle& handle);
+
+  size_t GetOccupancy() const {
+    return occupancy_.load(std::memory_order_relaxed);
+  }
+
+  size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); }
+
+  size_t GetStandaloneUsage() const {
+    return standalone_usage_.load(std::memory_order_relaxed);
+  }
+
+  uint32_t GetHashSeed() const { return hash_seed_; }
+
+  struct EvictionData {
+    size_t freed_charge = 0;
+    size_t freed_count = 0;
+  };
+
+  void TrackAndReleaseEvictedEntry(ClockHandle* h, EvictionData* data);
+
+#ifndef NDEBUG
+  // Acquire N references
+  void TEST_RefN(ClockHandle& handle, size_t n);
+  // Helper for TEST_ReleaseN
+  void TEST_ReleaseNMinus1(ClockHandle* handle, size_t n);
+#endif
+
+ private:  // fns
+  // Creates a "standalone" handle for returning from an Insert operation that
+  // cannot be completed by actually inserting into the table.
+  // Updates `standalone_usage_` but not `usage_` nor `occupancy_`.
+  template <class HandleImpl>
+  HandleImpl* StandaloneInsert(const ClockHandleBasicData& proto);
+
+  // Helper for updating `usage_` for new entry with given `total_charge`
+  // and evicting if needed under strict_capacity_limit=true rules. This
+  // means the operation might fail with Status::MemoryLimit. If
+  // `need_evict_for_occupancy`, then eviction of at least one entry is
+  // required, and the operation should fail if not possible.
+  // NOTE: Otherwise, occupancy_ is not managed in this function
+  template <class Table>
+  Status ChargeUsageMaybeEvictStrict(size_t total_charge, size_t capacity,
+                                     bool need_evict_for_occupancy,
+                                     typename Table::InsertState& state);
+
+  // Helper for updating `usage_` for new entry with given `total_charge`
+  // and evicting if needed under strict_capacity_limit=false rules. This
+  // means that updating `usage_` always succeeds even if forced to exceed
+  // capacity. If `need_evict_for_occupancy`, then eviction of at least one
+  // entry is required, and the operation should return false if such eviction
+  // is not possible. `usage_` is not updated in that case. Otherwise, returns
+  // true, indicating success.
+  // NOTE: occupancy_ is not managed in this function
+  template <class Table>
+  bool ChargeUsageMaybeEvictNonStrict(size_t total_charge, size_t capacity,
+                                      bool need_evict_for_occupancy,
+                                      typename Table::InsertState& state);
+
+ protected:  // data
+  // We partition the following members into different cache lines
+  // to avoid false sharing among Lookup, Release, Erase and Insert
+  // operations in ClockCacheShard.
+
+  // Clock algorithm sweep pointer.
+  std::atomic<uint64_t> clock_pointer_{};
+
+  ALIGN_AS(CACHE_LINE_SIZE)
+  // Number of elements in the table.
+  std::atomic<size_t> occupancy_{};
+
+  // Memory usage by entries tracked by the cache (including standalone)
+  std::atomic<size_t> usage_{};
+
+  // Part of usage by standalone entries (not in table)
+  std::atomic<size_t> standalone_usage_{};
+
+  ALIGN_AS(CACHE_LINE_SIZE)
+  const CacheMetadataChargePolicy metadata_charge_policy_;
+
+  // From Cache, for deleter
+  MemoryAllocator* const allocator_;
+
+  // A reference to Cache::eviction_callback_
+  const Cache::EvictionCallback& eviction_callback_;
+
+  // A reference to ShardedCacheBase::hash_seed_
+  const uint32_t& hash_seed_;
+};
+
+class HyperClockTable : public BaseClockTable {
 public:
  // Target size to be exactly a common cache line size (see static_assert in
  // clock_cache.cc)
@ -393,16 +508,6 @@ class HyperClockTable {
    // up in this slot or a higher one.
    std::atomic<uint32_t> displacements{};

-    // Whether this is a "deteched" handle that is independently allocated
-    // with `new` (so must be deleted with `delete`).
-    // TODO: ideally this would be packed into some other data field, such
-    // as upper bits of total_charge, but that incurs a measurable performance
-    // regression.
-    bool detached = false;
-
-    inline bool IsDetached() const { return detached; }
-
-    inline void SetDetached() { detached = true; }
  };  // struct HandleImpl

  struct Opts {
@ -411,75 +516,70 @@ class HyperClockTable {

  HyperClockTable(size_t capacity, bool strict_capacity_limit,
                  CacheMetadataChargePolicy metadata_charge_policy,
-                  const Opts& opts);
+                  MemoryAllocator* allocator,
+                  const Cache::EvictionCallback* eviction_callback,
+                  const uint32_t* hash_seed, const Opts& opts);
  ~HyperClockTable();

-  Status Insert(const ClockHandleBasicData& proto, HandleImpl** handle,
-                Cache::Priority priority, size_t capacity,
-                bool strict_capacity_limit);
+  // For BaseClockTable::Insert
+  struct InsertState {};
+
+  void StartInsert(InsertState& state);
+
+  // Returns true iff there is room for the proposed number of entries.
+  bool GrowIfNeeded(size_t new_occupancy, InsertState& state);
+
+  HandleImpl* DoInsert(const ClockHandleBasicData& proto,
+                       uint64_t initial_countdown, bool take_ref,
+                       InsertState& state);
+
+  // Runs the clock eviction algorithm trying to reclaim at least
+  // requested_charge. Returns how much is evicted, which could be less
+  // if it appears impossible to evict the requested amount without blocking.
+  void Evict(size_t requested_charge, InsertState& state, EvictionData* data);

  HandleImpl* Lookup(const UniqueId64x2& hashed_key);

  bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref);

-  void Ref(HandleImpl& handle);
-
  void Erase(const UniqueId64x2& hashed_key);

-  void ConstApplyToEntriesRange(std::function<void(const HandleImpl&)> func,
-                                size_t index_begin, size_t index_end,
-                                bool apply_if_will_be_deleted) const;
-
  void EraseUnRefEntries();

  size_t GetTableSize() const { return size_t{1} << length_bits_; }

-  int GetLengthBits() const { return length_bits_; }
-
-  size_t GetOccupancy() const {
-    return occupancy_.load(std::memory_order_relaxed);
-  }
-
  size_t GetOccupancyLimit() const { return occupancy_limit_; }

-  size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); }
+  const HandleImpl* HandlePtr(size_t idx) const { return &array_[idx]; }

-  size_t GetDetachedUsage() const {
-    return detached_usage_.load(std::memory_order_relaxed);
+#ifndef NDEBUG
+  size_t& TEST_MutableOccupancyLimit() const {
+    return const_cast<size_t&>(occupancy_limit_);
  }

-  // Acquire/release N references
-  void TEST_RefN(HandleImpl& handle, size_t n);
+  // Release N references
  void TEST_ReleaseN(HandleImpl* handle, size_t n);
+#endif

 private:  // functions
  // Returns x mod 2^{length_bits_}.
  inline size_t ModTableSize(uint64_t x) {
-    return static_cast<size_t>(x) & length_bits_mask_;
+    return BitwiseAnd(x, length_bits_mask_);
  }

-  // Runs the clock eviction algorithm trying to reclaim at least
-  // requested_charge. Returns how much is evicted, which could be less
-  // if it appears impossible to evict the requested amount without blocking.
-  inline void Evict(size_t requested_charge, size_t* freed_charge,
-                    size_t* freed_count);
-
-  // Returns the first slot in the probe sequence, starting from the given
-  // probe number, with a handle e such that match(e) is true. At every
-  // step, the function first tests whether match(e) holds. If this is false,
-  // it evaluates abort(e) to decide whether the search should be aborted,
-  // and in the affirmative returns -1. For every handle e probed except
-  // the last one, the function runs update(e).
-  // The probe parameter is modified as follows. We say a probe to a handle
-  // e is aborting if match(e) is false and abort(e) is true. Then the final
-  // value of probe is one more than the last non-aborting probe during the
-  // call. This is so that that the variable can be used to keep track of
-  // progress across consecutive calls to FindSlot.
+  // Returns the first slot in the probe sequence with a handle e such that
+  // match_fn(e) is true. At every step, the function first tests whether
+  // match_fn(e) holds. If this is false, it evaluates abort_fn(e) to decide
+  // whether the search should be aborted, and if so, FindSlot immediately
+  // returns nullptr. For every handle e that is not a match and not aborted,
+  // FindSlot runs update_fn(e, is_last) where is_last is set to true iff that
+  // slot will be the last probed because the next would cycle back to the first
+  // slot probed. This function uses templates instead of std::function to
+  // minimize the risk of heap-allocated closures being created.
+  template <typename MatchFn, typename AbortFn, typename UpdateFn>
  inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key,
-                              std::function<bool(HandleImpl*)> match,
-                              std::function<bool(HandleImpl*)> stop,
-                              std::function<void(HandleImpl*)> update,
-                              size_t& probe);
+                              const MatchFn& match_fn, const AbortFn& abort_fn,
+                              const UpdateFn& update_fn);

  // Re-decrement all displacements in probe path starting from beginning
  // until (not including) the given handle
@ -492,32 +592,7 @@ class HyperClockTable {
  // before releasing it so that it can be provided to this function.
  inline void ReclaimEntryUsage(size_t total_charge);

-  // Helper for updating `usage_` for new entry with given `total_charge`
-  // and evicting if needed under strict_capacity_limit=true rules. This
-  // means the operation might fail with Status::MemoryLimit. If
-  // `need_evict_for_occupancy`, then eviction of at least one entry is
-  // required, and the operation should fail if not possible.
-  // NOTE: Otherwise, occupancy_ is not managed in this function
-  inline Status ChargeUsageMaybeEvictStrict(size_t total_charge,
-                                            size_t capacity,
-                                            bool need_evict_for_occupancy);
-
-  // Helper for updating `usage_` for new entry with given `total_charge`
-  // and evicting if needed under strict_capacity_limit=false rules. This
-  // means that updating `usage_` always succeeds even if forced to exceed
-  // capacity. If `need_evict_for_occupancy`, then eviction of at least one
-  // entry is required, and the operation should return false if such eviction
-  // is not possible. `usage_` is not updated in that case. Otherwise, returns
-  // true, indicating success.
-  // NOTE: occupancy_ is not managed in this function
-  inline bool ChargeUsageMaybeEvictNonStrict(size_t total_charge,
-                                             size_t capacity,
-                                             bool need_evict_for_occupancy);
-
-  // Creates a "detached" handle for returning from an Insert operation that
-  // cannot be completed by actually inserting into the table.
-  // Updates `detached_usage_` but not `usage_` nor `occupancy_`.
-  inline HandleImpl* DetachedInsert(const ClockHandleBasicData& proto);
+  MemoryAllocator* GetAllocator() const { return allocator_; }

  // Returns the number of bits used to hash an element in the hash
  // table.
@ -537,24 +612,6 @@ class HyperClockTable {

  // Array of slots comprising the hash table.
  const std::unique_ptr<HandleImpl[]> array_;
-
-  // We partition the following members into different cache lines
-  // to avoid false sharing among Lookup, Release, Erase and Insert
-  // operations in ClockCacheShard.
-
-  ALIGN_AS(CACHE_LINE_SIZE)
-  // Clock algorithm sweep pointer.
-  std::atomic<uint64_t> clock_pointer_{};
-
-  ALIGN_AS(CACHE_LINE_SIZE)
-  // Number of elements in the table.
-  std::atomic<size_t> occupancy_{};
-
-  // Memory usage by entries tracked by the cache (including detached)
-  std::atomic<size_t> usage_{};
-
-  // Part of usage by detached entries (not in table)
-  std::atomic<size_t> detached_usage_{};
 };  // class HyperClockTable

 // A single shard of sharded cache.
@ -563,7 +620,9 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
 public:
  ClockCacheShard(size_t capacity, bool strict_capacity_limit,
                  CacheMetadataChargePolicy metadata_charge_policy,
-                  const typename Table::Opts& opts);
+                  MemoryAllocator* allocator,
+                  const Cache::EvictionCallback* eviction_callback,
+                  const uint32_t* hash_seed, const typename Table::Opts& opts);

  // For CacheShard concept
  using HandleImpl = typename Table::HandleImpl;
@ -573,22 +632,23 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
  static inline uint32_t HashPieceForSharding(HashCref hash) {
    return Upper32of64(hash[0]);
  }
-  static inline HashVal ComputeHash(const Slice& key) {
+  static inline HashVal ComputeHash(const Slice& key, uint32_t seed) {
    assert(key.size() == kCacheKeySize);
    HashVal in;
    HashVal out;
    // NOTE: endian dependence
    // TODO: use GetUnaligned?
    std::memcpy(&in, key.data(), kCacheKeySize);
-    BijectiveHash2x64(in[1], in[0], &out[1], &out[0]);
+    BijectiveHash2x64(in[1], in[0] ^ seed, &out[1], &out[0]);
    return out;
  }

  // For reconstructing key from hashed_key. Requires the caller to provide
  // backing storage for the Slice in `unhashed`
  static inline Slice ReverseHash(const UniqueId64x2& hashed,
-                                  UniqueId64x2* unhashed) {
+                                  UniqueId64x2* unhashed, uint32_t seed) {
    BijectiveUnhash2x64(hashed[1], hashed[0], &(*unhashed)[1], &(*unhashed)[0]);
+    (*unhashed)[0] ^= seed;
    // NOTE: endian dependence
    return Slice(reinterpret_cast<const char*>(unhashed), kCacheKeySize);
  }
@ -600,9 +660,14 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {

  void SetStrictCapacityLimit(bool strict_capacity_limit);

-  Status Insert(const Slice& key, const UniqueId64x2& hashed_key, void* value,
-                size_t charge, Cache::DeleterFn deleter, HandleImpl** handle,
-                Cache::Priority priority);
+  Status Insert(const Slice& key, const UniqueId64x2& hashed_key,
+                Cache::ObjectPtr value, const Cache::CacheItemHelper* helper,
+                size_t charge, HandleImpl** handle, Cache::Priority priority);
+
+  HandleImpl* CreateStandalone(const Slice& key, const UniqueId64x2& hashed_key,
+                               Cache::ObjectPtr obj,
+                               const Cache::CacheItemHelper* helper,
+                               size_t charge, bool allow_uncharged);

  HandleImpl* Lookup(const Slice& key, const UniqueId64x2& hashed_key);

@ -618,7 +683,7 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {

  size_t GetUsage() const;

-  size_t GetDetachedUsage() const;
+  size_t GetStandaloneUsage() const;

  size_t GetPinnedUsage() const;

@ -629,37 +694,30 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
  size_t GetTableAddressCount() const;

  void ApplyToSomeEntries(
-      const std::function<void(const Slice& key, void* value, size_t charge,
-                               DeleterFn deleter)>& callback,
+      const std::function<void(const Slice& key, Cache::ObjectPtr obj,
+                               size_t charge,
+                               const Cache::CacheItemHelper* helper)>& callback,
      size_t average_entries_per_lock, size_t* state);

  void EraseUnRefEntries();

  std::string GetPrintableOptions() const { return std::string{}; }

-  // SecondaryCache not yet supported
-  Status Insert(const Slice& key, const UniqueId64x2& hashed_key, void* value,
-                const Cache::CacheItemHelper* helper, size_t charge,
-                HandleImpl** handle, Cache::Priority priority) {
-    return Insert(key, hashed_key, value, charge, helper->del_cb, handle,
-                  priority);
-  }
-
  HandleImpl* Lookup(const Slice& key, const UniqueId64x2& hashed_key,
                     const Cache::CacheItemHelper* /*helper*/,
-                     const Cache::CreateCallback& /*create_cb*/,
-                     Cache::Priority /*priority*/, bool /*wait*/,
-                     Statistics* /*stats*/) {
+                     Cache::CreateContext* /*create_context*/,
+                     Cache::Priority /*priority*/, Statistics* /*stats*/) {
    return Lookup(key, hashed_key);
  }

-  bool IsReady(HandleImpl* /*handle*/) { return true; }
-
-  void Wait(HandleImpl* /*handle*/) {}
-
+#ifndef NDEBUG
+  size_t& TEST_MutableOccupancyLimit() const {
+    return table_.TEST_MutableOccupancyLimit();
+  }
  // Acquire/release N references
  void TEST_RefN(HandleImpl* handle, size_t n);
  void TEST_ReleaseN(HandleImpl* handle, size_t n);
+#endif

 private:  // data
  Table table_;
@ -679,18 +737,15 @@ class HyperClockCache
 public:
  using Shard = ClockCacheShard<HyperClockTable>;

-  HyperClockCache(size_t capacity, size_t estimated_value_size,
-                  int num_shard_bits, bool strict_capacity_limit,
-                  CacheMetadataChargePolicy metadata_charge_policy,
-                  std::shared_ptr<MemoryAllocator> memory_allocator);
+  explicit HyperClockCache(const HyperClockCacheOptions& opts);

  const char* Name() const override { return "HyperClockCache"; }

-  void* Value(Handle* handle) override;
+  Cache::ObjectPtr Value(Handle* handle) override;

  size_t GetCharge(Handle* handle) const override;

-  DeleterFn GetDeleter(Handle* handle) const override;
+  const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override;

  void ReportProblems(
      const std::shared_ptr<Logger>& /*info_log*/) const override;
--- a/cache/compressed_secondary_cache.cc
+++ b/cache/compressed_secondary_cache.cc
@ -9,7 +9,7 @@
 #include <cstdint>
 #include <memory>

-#include "memory/memory_allocator.h"
+#include "memory/memory_allocator_impl.h"
 #include "monitoring/perf_context_imp.h"
 #include "util/compression.h"
 #include "util/string_util.h"
@ -17,30 +17,24 @@
 namespace ROCKSDB_NAMESPACE {

 CompressedSecondaryCache::CompressedSecondaryCache(
-    size_t capacity, int num_shard_bits, bool strict_capacity_limit,
-    double high_pri_pool_ratio, double low_pri_pool_ratio,
-    std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
-    CacheMetadataChargePolicy metadata_charge_policy,
-    CompressionType compression_type, uint32_t compress_format_version,
-    bool enable_custom_split_merge)
-    : cache_options_(capacity, num_shard_bits, strict_capacity_limit,
-                     high_pri_pool_ratio, low_pri_pool_ratio, memory_allocator,
-                     use_adaptive_mutex, metadata_charge_policy,
-                     compression_type, compress_format_version,
-                     enable_custom_split_merge) {
-  cache_ =
-      NewLRUCache(capacity, num_shard_bits, strict_capacity_limit,
-                  high_pri_pool_ratio, memory_allocator, use_adaptive_mutex,
-                  metadata_charge_policy, low_pri_pool_ratio);
+    const CompressedSecondaryCacheOptions& opts)  // a single options struct replaces the long positional parameter list
+    : cache_(opts.LRUCacheOptions::MakeSharedCache()),  // qualified call: explicitly uses LRUCacheOptions' MakeSharedCache to build the backing cache
+      cache_options_(opts),  // keeps a copy of the options for later Lookup/Insert decisions
+      cache_res_mgr_(std::make_shared<ConcurrentCacheReservationManager>(  // reservation manager constructed over cache_, tagged CacheEntryRole::kMisc
+          std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
+              cache_))) {}  // cache_ is declared before cache_res_mgr_ in the class, so it is already initialized here
+
+CompressedSecondaryCache::~CompressedSecondaryCache() {
+  assert(cache_res_mgr_->GetTotalReservedCacheSize() == 0);  // debug-only check that no reservation is still outstanding at destruction
 }

-CompressedSecondaryCache::~CompressedSecondaryCache() { cache_.reset(); }
-
 std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
-    const Slice& key, const Cache::CreateCallback& create_cb, bool /*wait*/,
-    bool advise_erase, bool& is_in_sec_cache) {
+    const Slice& key, const Cache::CacheItemHelper* helper,
+    Cache::CreateContext* create_context, bool /*wait*/, bool advise_erase,
+    bool& kept_in_sec_cache) {
+  assert(helper);
  std::unique_ptr<SecondaryCacheResultHandle> handle;
-  is_in_sec_cache = false;
+  kept_in_sec_cache = false;
  Cache::Handle* lru_handle = cache_->Lookup(key);
  if (lru_handle == nullptr) {
    return nullptr;
@ -64,12 +58,15 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
    ptr = reinterpret_cast<CacheAllocationPtr*>(handle_value);
    handle_value_charge = cache_->GetCharge(lru_handle);
  }
+  MemoryAllocator* allocator = cache_options_.memory_allocator.get();

  Status s;
-  void* value{nullptr};
+  Cache::ObjectPtr value{nullptr};
  size_t charge{0};
-  if (cache_options_.compression_type == kNoCompression) {
-    s = create_cb(ptr->get(), handle_value_charge, &value, &charge);
+  if (cache_options_.compression_type == kNoCompression ||
+      cache_options_.do_not_compress_roles.Contains(helper->role)) {
+    s = helper->create_cb(Slice(ptr->get(), handle_value_charge),
+                          create_context, allocator, &value, &charge);
  } else {
    UncompressionContext uncompression_context(cache_options_.compression_type);
    UncompressionInfo uncompression_info(uncompression_context,
@ -79,14 +76,14 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
    size_t uncompressed_size{0};
    CacheAllocationPtr uncompressed = UncompressData(
        uncompression_info, (char*)ptr->get(), handle_value_charge,
-        &uncompressed_size, cache_options_.compress_format_version,
-        cache_options_.memory_allocator.get());
+        &uncompressed_size, cache_options_.compress_format_version, allocator);

    if (!uncompressed) {
      cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
      return nullptr;
    }
-    s = create_cb(uncompressed.get(), uncompressed_size, &value, &charge);
+    s = helper->create_cb(Slice(uncompressed.get(), uncompressed_size),
+                          create_context, allocator, &value, &charge);
  }

  if (!s.ok()) {
@ -98,30 +95,32 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
    cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
    // Insert a dummy handle.
    cache_
-        ->Insert(key, /*value=*/nullptr, /*charge=*/0,
-                 GetDeletionCallback(cache_options_.enable_custom_split_merge))
+        ->Insert(key, /*obj=*/nullptr,
+                 GetHelper(cache_options_.enable_custom_split_merge),
+                 /*charge=*/0)
        .PermitUncheckedError();
  } else {
-    is_in_sec_cache = true;
+    kept_in_sec_cache = true;
    cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
  }
  handle.reset(new CompressedSecondaryCacheResultHandle(value, charge));
  return handle;
 }

-Status CompressedSecondaryCache::Insert(const Slice& key, void* value,
+Status CompressedSecondaryCache::Insert(const Slice& key,
+                                        Cache::ObjectPtr value,
                                        const Cache::CacheItemHelper* helper) {
  if (value == nullptr) {
    return Status::InvalidArgument();
  }

  Cache::Handle* lru_handle = cache_->Lookup(key);
-  Cache::DeleterFn del_cb =
-      GetDeletionCallback(cache_options_.enable_custom_split_merge);
+  auto internal_helper = GetHelper(cache_options_.enable_custom_split_merge);
  if (lru_handle == nullptr) {
    PERF_COUNTER_ADD(compressed_sec_cache_insert_dummy_count, 1);
    // Insert a dummy handle if the handle is evicted for the first time.
-    return cache_->Insert(key, /*value=*/nullptr, /*charge=*/0, del_cb);
+    return cache_->Insert(key, /*obj=*/nullptr, internal_helper,
+                          /*charge=*/0);
  } else {
    cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
  }
@ -137,7 +136,8 @@ Status CompressedSecondaryCache::Insert(const Slice& key, void* value,
  Slice val(ptr.get(), size);

  std::string compressed_val;
-  if (cache_options_.compression_type != kNoCompression) {
+  if (cache_options_.compression_type != kNoCompression &&
+      !cache_options_.do_not_compress_roles.Contains(helper->role)) {
    PERF_COUNTER_ADD(compressed_sec_cache_uncompressed_bytes, size);
    CompressionOptions compression_opts;
    CompressionContext compression_context(cache_options_.compression_type);
@ -169,10 +169,10 @@ Status CompressedSecondaryCache::Insert(const Slice& key, void* value,
    size_t charge{0};
    CacheValueChunk* value_chunks_head =
        SplitValueIntoChunks(val, cache_options_.compression_type, charge);
-    return cache_->Insert(key, value_chunks_head, charge, del_cb);
+    return cache_->Insert(key, value_chunks_head, internal_helper, charge);
  } else {
    CacheAllocationPtr* buf = new CacheAllocationPtr(std::move(ptr));
-    return cache_->Insert(key, buf, size, del_cb);
+    return cache_->Insert(key, buf, internal_helper, size);
  }
 }

@ -276,50 +276,43 @@ CacheAllocationPtr CompressedSecondaryCache::MergeChunksIntoValue(
  return ptr;
 }

-Cache::DeleterFn CompressedSecondaryCache::GetDeletionCallback(
-    bool enable_custom_split_merge) {
+const Cache::CacheItemHelper* CompressedSecondaryCache::GetHelper(  // returns the helper (role + deleter) used for entries this class stores in cache_
+    bool enable_custom_split_merge) const {
   if (enable_custom_split_merge) {
-    return [](const Slice& /*key*/, void* obj) {
-      CacheValueChunk* chunks_head = reinterpret_cast<CacheValueChunk*>(obj);
-      while (chunks_head != nullptr) {
-        CacheValueChunk* tmp_chunk = chunks_head;
-        chunks_head = chunks_head->next;
-        tmp_chunk->Free();
-        obj = nullptr;
-      };
-    };
+    static const Cache::CacheItemHelper kHelper{  // function-local static, so the returned pointer stays valid after return
+        CacheEntryRole::kMisc,
+        [](Cache::ObjectPtr obj, MemoryAllocator* /*alloc*/) {  // deleter for split-mode entries: obj is the head of a CacheValueChunk list
+          CacheValueChunk* chunks_head = static_cast<CacheValueChunk*>(obj);
+          while (chunks_head != nullptr) {  // a null obj (dummy entry) skips the loop entirely
+            CacheValueChunk* tmp_chunk = chunks_head;
+            chunks_head = chunks_head->next;  // advance first, then release the node just left behind
+            tmp_chunk->Free();
+            obj = nullptr;  // NOTE(review): obj is a by-value parameter, so this store has no effect outside the lambda
+          };
+        }};
+    return &kHelper;
   } else {
-    return [](const Slice& /*key*/, void* obj) {
-      delete reinterpret_cast<CacheAllocationPtr*>(obj);
-      obj = nullptr;
-    };
+    static const Cache::CacheItemHelper kHelper{  // separate static instance for the non-split mode
+        CacheEntryRole::kMisc,
+        [](Cache::ObjectPtr obj, MemoryAllocator* /*alloc*/) {  // deleter for non-split entries: obj is the CacheAllocationPtr allocated with new in Insert
+          delete static_cast<CacheAllocationPtr*>(obj);  // delete on a null obj (dummy entry) is a no-op
+          obj = nullptr;  // NOTE(review): by-value parameter; this store has no effect outside the lambda
+        }};
+    return &kHelper;
   }
 }

-std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
-    size_t capacity, int num_shard_bits, bool strict_capacity_limit,
-    double high_pri_pool_ratio, double low_pri_pool_ratio,
-    std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
-    CacheMetadataChargePolicy metadata_charge_policy,
-    CompressionType compression_type, uint32_t compress_format_version,
-    bool enable_custom_split_merge) {
-  return std::make_shared<CompressedSecondaryCache>(
-      capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio,
-      low_pri_pool_ratio, memory_allocator, use_adaptive_mutex,
-      metadata_charge_policy, compression_type, compress_format_version,
-      enable_custom_split_merge);
+std::shared_ptr<SecondaryCache>
+CompressedSecondaryCacheOptions::MakeSharedSecondaryCache() const {  // factory now lives on the options struct
+  return std::make_shared<CompressedSecondaryCache>(*this);  // constructs the cache directly from these options
+}
+
+Status CompressedSecondaryCache::Deflate(size_t decrease) {
+  return cache_res_mgr_->UpdateCacheReservation(decrease, /*increase=*/true);  // Deflate *adds* `decrease` bytes to the reservation — presumably shrinking the room left for real entries; confirm against the reservation manager
 }

-std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
-    const CompressedSecondaryCacheOptions& opts) {
-  // The secondary_cache is disabled for this LRUCache instance.
-  assert(opts.secondary_cache == nullptr);
-  return NewCompressedSecondaryCache(
-      opts.capacity, opts.num_shard_bits, opts.strict_capacity_limit,
-      opts.high_pri_pool_ratio, opts.low_pri_pool_ratio, opts.memory_allocator,
-      opts.use_adaptive_mutex, opts.metadata_charge_policy,
-      opts.compression_type, opts.compress_format_version,
-      opts.enable_custom_split_merge);
+Status CompressedSecondaryCache::Inflate(size_t increase) {
+  return cache_res_mgr_->UpdateCacheReservation(increase, /*increase=*/false);  // Inflate *releases* `increase` bytes of reservation — the mirror image of Deflate
 }

 }  // namespace ROCKSDB_NAMESPACE
--- a/cache/compressed_secondary_cache.h
+++ b/cache/compressed_secondary_cache.h
@ -9,8 +9,9 @@
 #include <cstddef>
 #include <memory>

+#include "cache/cache_reservation_manager.h"
 #include "cache/lru_cache.h"
-#include "memory/memory_allocator.h"
+#include "memory/memory_allocator_impl.h"
 #include "rocksdb/secondary_cache.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/status.h"
@ -21,7 +22,7 @@ namespace ROCKSDB_NAMESPACE {

 class CompressedSecondaryCacheResultHandle : public SecondaryCacheResultHandle {
 public:
-  CompressedSecondaryCacheResultHandle(void* value, size_t size)
+  CompressedSecondaryCacheResultHandle(Cache::ObjectPtr value, size_t size)
      : value_(value), size_(size) {}
  ~CompressedSecondaryCacheResultHandle() override = default;

@ -34,12 +35,12 @@ class CompressedSecondaryCacheResultHandle : public SecondaryCacheResultHandle {

  void Wait() override {}

-  void* Value() override { return value_; }
+  Cache::ObjectPtr Value() override { return value_; }

  size_t Size() override { return size_; }

 private:
-  void* value_;
+  Cache::ObjectPtr value_;
  size_t size_;
 };

@ -69,26 +70,19 @@ class CompressedSecondaryCacheResultHandle : public SecondaryCacheResultHandle {

 class CompressedSecondaryCache : public SecondaryCache {
 public:
-  CompressedSecondaryCache(
-      size_t capacity, int num_shard_bits, bool strict_capacity_limit,
-      double high_pri_pool_ratio, double low_pri_pool_ratio,
-      std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
-      bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
-      CacheMetadataChargePolicy metadata_charge_policy =
-          kDefaultCacheMetadataChargePolicy,
-      CompressionType compression_type = CompressionType::kLZ4Compression,
-      uint32_t compress_format_version = 2,
-      bool enable_custom_split_merge = false);
+  explicit CompressedSecondaryCache(
+      const CompressedSecondaryCacheOptions& opts);
  ~CompressedSecondaryCache() override;

  const char* Name() const override { return "CompressedSecondaryCache"; }

-  Status Insert(const Slice& key, void* value,
+  Status Insert(const Slice& key, Cache::ObjectPtr value,
                const Cache::CacheItemHelper* helper) override;

  std::unique_ptr<SecondaryCacheResultHandle> Lookup(
-      const Slice& key, const Cache::CreateCallback& create_cb, bool /*wait*/,
-      bool advise_erase, bool& is_in_sec_cache) override;
+      const Slice& key, const Cache::CacheItemHelper* helper,
+      Cache::CreateContext* create_context, bool /*wait*/, bool advise_erase,
+      bool& kept_in_sec_cache) override;

  bool SupportForceErase() const override { return true; }

@ -100,10 +94,16 @@ class CompressedSecondaryCache : public SecondaryCache {

  Status GetCapacity(size_t& capacity) override;

+  Status Deflate(size_t decrease) override;
+
+  Status Inflate(size_t increase) override;
+
  std::string GetPrintableOptions() const override;

+  size_t TEST_GetUsage() { return cache_->GetUsage(); }
+
 private:
-  friend class CompressedSecondaryCacheTest;
+  friend class CompressedSecondaryCacheTestBase;
  static constexpr std::array<uint16_t, 8> malloc_bin_sizes_{
      128, 256, 512, 1024, 2048, 4096, 8192, 16384};

@ -129,11 +129,12 @@ class CompressedSecondaryCache : public SecondaryCache {
  CacheAllocationPtr MergeChunksIntoValue(const void* chunks_head,
                                          size_t& charge);

-  // An implementation of Cache::DeleterFn.
-  static Cache::DeleterFn GetDeletionCallback(bool enable_custom_split_merge);
+  // TODO: clean up to use cleaner interfaces in typed_cache.h
+  const Cache::CacheItemHelper* GetHelper(bool enable_custom_split_merge) const;
  std::shared_ptr<Cache> cache_;
  CompressedSecondaryCacheOptions cache_options_;
  mutable port::Mutex capacity_mutex_;
+  std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr_;
 };

 }  // namespace ROCKSDB_NAMESPACE
--- a/cache/compressed_secondary_cache_test.cc
+++ b/cache/compressed_secondary_cache_test.cc
@ -5,86 +5,45 @@

 #include "cache/compressed_secondary_cache.h"

+#include <array>
 #include <iterator>
 #include <memory>
 #include <tuple>

+#include "cache/secondary_cache_adapter.h"
 #include "memory/jemalloc_nodump_allocator.h"
 #include "rocksdb/convenience.h"
+#include "test_util/secondary_cache_test_util.h"
 #include "test_util/testharness.h"
 #include "test_util/testutil.h"
+#include "util/cast_util.h"

 namespace ROCKSDB_NAMESPACE {

-class CompressedSecondaryCacheTest : public testing::Test {
- public:
-  CompressedSecondaryCacheTest() : fail_create_(false) {}
-  ~CompressedSecondaryCacheTest() override = default;
-
- protected:
-  class TestItem {
-   public:
-    TestItem(const char* buf, size_t size) : buf_(new char[size]), size_(size) {
-      memcpy(buf_.get(), buf, size);
-    }
-    ~TestItem() = default;
-
-    char* Buf() { return buf_.get(); }
-    [[nodiscard]] size_t Size() const { return size_; }
-
-   private:
-    std::unique_ptr<char[]> buf_;
-    size_t size_;
-  };
+using secondary_cache_test_util::GetTestingCacheTypes;
+using secondary_cache_test_util::WithCacheType;

-  static size_t SizeCallback(void* obj) {
-    return reinterpret_cast<TestItem*>(obj)->Size();
-  }
-
-  static Status SaveToCallback(void* from_obj, size_t from_offset,
-                               size_t length, void* out) {
-    auto item = reinterpret_cast<TestItem*>(from_obj);
-    const char* buf = item->Buf();
-    EXPECT_EQ(length, item->Size());
-    EXPECT_EQ(from_offset, 0);
-    memcpy(out, buf, length);
-    return Status::OK();
-  }
-
-  static void DeletionCallback(const Slice& /*key*/, void* obj) {
-    delete reinterpret_cast<TestItem*>(obj);
-    obj = nullptr;
-  }
+// 16 bytes for HCC compatibility
+const std::string key0 = "____    ____key0";
+const std::string key1 = "____    ____key1";
+const std::string key2 = "____    ____key2";
+const std::string key3 = "____    ____key3";

-  static Cache::CacheItemHelper helper_;
-
-  static Status SaveToCallbackFail(void* /*obj*/, size_t /*offset*/,
-                                   size_t /*size*/, void* /*out*/) {
-    return Status::NotSupported();
-  }
-
-  static Cache::CacheItemHelper helper_fail_;
-
-  Cache::CreateCallback test_item_creator = [&](const void* buf, size_t size,
-                                                void** out_obj,
-                                                size_t* charge) -> Status {
-    if (fail_create_) {
-      return Status::NotSupported();
-    }
-    *out_obj = reinterpret_cast<void*>(new TestItem((char*)buf, size));
-    *charge = size;
-    return Status::OK();
-  };
-
-  void SetFailCreate(bool fail) { fail_create_ = fail; }
+class CompressedSecondaryCacheTestBase : public testing::Test,
+                                         public WithCacheType {
+ public:
+  CompressedSecondaryCacheTestBase() {}
+  ~CompressedSecondaryCacheTestBase() override = default;

+ protected:
  void BasicTestHelper(std::shared_ptr<SecondaryCache> sec_cache,
                       bool sec_cache_is_compressed) {
    get_perf_context()->Reset();
-    bool is_in_sec_cache{true};
+    bool kept_in_sec_cache{true};
    // Lookup a non-existent key.
-    std::unique_ptr<SecondaryCacheResultHandle> handle0 = sec_cache->Lookup(
-        "k0", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
+    std::unique_ptr<SecondaryCacheResultHandle> handle0 =
+        sec_cache->Lookup(key0, GetHelper(), this, true, /*advise_erase=*/true,
+                          kept_in_sec_cache);
    ASSERT_EQ(handle0, nullptr);

    Random rnd(301);
@ -92,25 +51,25 @@ class CompressedSecondaryCacheTest : public testing::Test {
    std::string str1(rnd.RandomString(1000));
    TestItem item1(str1.data(), str1.length());
    // A dummy handle is inserted if the item is inserted for the first time.
-    ASSERT_OK(sec_cache->Insert("k1", &item1,
-                                &CompressedSecondaryCacheTest::helper_));
+    ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper()));
    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 1);
    ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
    ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);

-    std::unique_ptr<SecondaryCacheResultHandle> handle1_1 = sec_cache->Lookup(
-        "k1", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+    std::unique_ptr<SecondaryCacheResultHandle> handle1_1 =
+        sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/false,
+                          kept_in_sec_cache);
    ASSERT_EQ(handle1_1, nullptr);

    // Insert and Lookup the item k1 for the second time and advise erasing it.
-    ASSERT_OK(sec_cache->Insert("k1", &item1,
-                                &CompressedSecondaryCacheTest::helper_));
+    ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper()));
    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1);

-    std::unique_ptr<SecondaryCacheResultHandle> handle1_2 = sec_cache->Lookup(
-        "k1", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
+    std::unique_ptr<SecondaryCacheResultHandle> handle1_2 =
+        sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/true,
+                          kept_in_sec_cache);
    ASSERT_NE(handle1_2, nullptr);
-    ASSERT_FALSE(is_in_sec_cache);
+    ASSERT_FALSE(kept_in_sec_cache);
    if (sec_cache_is_compressed) {
      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
                1000);
@ -127,22 +86,22 @@ class CompressedSecondaryCacheTest : public testing::Test {
    ASSERT_EQ(memcmp(val1->Buf(), item1.Buf(), item1.Size()), 0);

    // Lookup the item k1 again.
-    std::unique_ptr<SecondaryCacheResultHandle> handle1_3 = sec_cache->Lookup(
-        "k1", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
+    std::unique_ptr<SecondaryCacheResultHandle> handle1_3 =
+        sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/true,
+                          kept_in_sec_cache);
    ASSERT_EQ(handle1_3, nullptr);

    // Insert and Lookup the item k2.
    std::string str2(rnd.RandomString(1000));
    TestItem item2(str2.data(), str2.length());
-    ASSERT_OK(sec_cache->Insert("k2", &item2,
-                                &CompressedSecondaryCacheTest::helper_));
+    ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper()));
    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 2);
-    std::unique_ptr<SecondaryCacheResultHandle> handle2_1 = sec_cache->Lookup(
-        "k2", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+    std::unique_ptr<SecondaryCacheResultHandle> handle2_1 =
+        sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/false,
+                          kept_in_sec_cache);
    ASSERT_EQ(handle2_1, nullptr);

-    ASSERT_OK(sec_cache->Insert("k2", &item2,
-                                &CompressedSecondaryCacheTest::helper_));
+    ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper()));
    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2);
    if (sec_cache_is_compressed) {
      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
@ -153,8 +112,9 @@ class CompressedSecondaryCacheTest : public testing::Test {
      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
    }
-    std::unique_ptr<SecondaryCacheResultHandle> handle2_2 = sec_cache->Lookup(
-        "k2", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+    std::unique_ptr<SecondaryCacheResultHandle> handle2_2 =
+        sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/false,
+                          kept_in_sec_cache);
    ASSERT_NE(handle2_2, nullptr);
    std::unique_ptr<TestItem> val2 =
        std::unique_ptr<TestItem>(static_cast<TestItem*>(handle2_2->Value()));
@ -223,28 +183,26 @@ class CompressedSecondaryCacheTest : public testing::Test {
    std::string str1(rnd.RandomString(1000));
    TestItem item1(str1.data(), str1.length());
    // Insert a dummy handle.
-    ASSERT_OK(sec_cache->Insert("k1", &item1,
-                                &CompressedSecondaryCacheTest::helper_));
+    ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper()));
    // Insert k1.
-    ASSERT_OK(sec_cache->Insert("k1", &item1,
-                                &CompressedSecondaryCacheTest::helper_));
+    ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper()));

    // Insert and Lookup the second item.
    std::string str2(rnd.RandomString(200));
    TestItem item2(str2.data(), str2.length());
    // Insert a dummy handle, k1 is not evicted.
-    ASSERT_OK(sec_cache->Insert("k2", &item2,
-                                &CompressedSecondaryCacheTest::helper_));
-    bool is_in_sec_cache{false};
-    std::unique_ptr<SecondaryCacheResultHandle> handle1 = sec_cache->Lookup(
-        "k1", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+    ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper()));
+    bool kept_in_sec_cache{false};
+    std::unique_ptr<SecondaryCacheResultHandle> handle1 =
+        sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/false,
+                          kept_in_sec_cache);
    ASSERT_EQ(handle1, nullptr);

    // Insert k2 and k1 is evicted.
-    ASSERT_OK(sec_cache->Insert("k2", &item2,
-                                &CompressedSecondaryCacheTest::helper_));
-    std::unique_ptr<SecondaryCacheResultHandle> handle2 = sec_cache->Lookup(
-        "k2", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+    ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper()));
+    std::unique_ptr<SecondaryCacheResultHandle> handle2 =
+        sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/false,
+                          kept_in_sec_cache);
    ASSERT_NE(handle2, nullptr);
    std::unique_ptr<TestItem> val2 =
        std::unique_ptr<TestItem>(static_cast<TestItem*>(handle2->Value()));
@ -252,27 +210,26 @@ class CompressedSecondaryCacheTest : public testing::Test {
    ASSERT_EQ(memcmp(val2->Buf(), item2.Buf(), item2.Size()), 0);

    // Insert k1 again and a dummy handle is inserted.
-    ASSERT_OK(sec_cache->Insert("k1", &item1,
-                                &CompressedSecondaryCacheTest::helper_));
+    ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper()));

-    std::unique_ptr<SecondaryCacheResultHandle> handle1_1 = sec_cache->Lookup(
-        "k1", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
+    std::unique_ptr<SecondaryCacheResultHandle> handle1_1 =
+        sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/false,
+                          kept_in_sec_cache);
    ASSERT_EQ(handle1_1, nullptr);

    // Create Fails.
    SetFailCreate(true);
-    std::unique_ptr<SecondaryCacheResultHandle> handle2_1 = sec_cache->Lookup(
-        "k2", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
+    std::unique_ptr<SecondaryCacheResultHandle> handle2_1 =
+        sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/true,
+                          kept_in_sec_cache);
    ASSERT_EQ(handle2_1, nullptr);

    // Save Fails.
    std::string str3 = rnd.RandomString(10);
    TestItem item3(str3.data(), str3.length());
    // The Status is OK because a dummy handle is inserted.
-    ASSERT_OK(sec_cache->Insert("k3", &item3,
-                                &CompressedSecondaryCacheTest::helper_fail_));
-    ASSERT_NOK(sec_cache->Insert("k3", &item3,
-                                 &CompressedSecondaryCacheTest::helper_fail_));
+    ASSERT_OK(sec_cache->Insert(key3, &item3, GetHelperFail()));
+    ASSERT_NOK(sec_cache->Insert(key3, &item3, GetHelperFail()));

    sec_cache.reset();
  }
@ -296,28 +253,22 @@ class CompressedSecondaryCacheTest : public testing::Test {
    secondary_cache_opts.enable_custom_split_merge = enable_custom_split_merge;
    std::shared_ptr<SecondaryCache> secondary_cache =
        NewCompressedSecondaryCache(secondary_cache_opts);
-    LRUCacheOptions lru_cache_opts(
+    std::shared_ptr<Cache> cache = NewCache(
        /*_capacity =*/1300, /*_num_shard_bits =*/0,
-        /*_strict_capacity_limit =*/false, /*_high_pri_pool_ratio =*/0.5,
-        /*_memory_allocator =*/nullptr, kDefaultToAdaptiveMutex,
-        kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio =*/0.0);
-    lru_cache_opts.secondary_cache = secondary_cache;
-    std::shared_ptr<Cache> cache = NewLRUCache(lru_cache_opts);
+        /*_strict_capacity_limit =*/true, secondary_cache);
    std::shared_ptr<Statistics> stats = CreateDBStatistics();

    get_perf_context()->Reset();
    Random rnd(301);
    std::string str1 = rnd.RandomString(1001);
    auto item1_1 = new TestItem(str1.data(), str1.length());
-    ASSERT_OK(cache->Insert(
-        "k1", item1_1, &CompressedSecondaryCacheTest::helper_, str1.length()));
+    ASSERT_OK(cache->Insert(key1, item1_1, GetHelper(), str1.length()));

    std::string str2 = rnd.RandomString(1012);
    auto item2_1 = new TestItem(str2.data(), str2.length());
    // After this Insert, primary cache contains k2 and secondary cache contains
    // k1's dummy item.
-    ASSERT_OK(cache->Insert(
-        "k2", item2_1, &CompressedSecondaryCacheTest::helper_, str2.length()));
+    ASSERT_OK(cache->Insert(key2, item2_1, GetHelper(), str2.length()));
    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 1);
    ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
    ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
@ -326,22 +277,19 @@ class CompressedSecondaryCacheTest : public testing::Test {
    auto item3_1 = new TestItem(str3.data(), str3.length());
    // After this Insert, primary cache contains k3 and secondary cache contains
    // k1's dummy item and k2's dummy item.
-    ASSERT_OK(cache->Insert(
-        "k3", item3_1, &CompressedSecondaryCacheTest::helper_, str3.length()));
+    ASSERT_OK(cache->Insert(key3, item3_1, GetHelper(), str3.length()));
    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 2);

    // After this Insert, primary cache contains k1 and secondary cache contains
    // k1's dummy item, k2's dummy item, and k3's dummy item.
    auto item1_2 = new TestItem(str1.data(), str1.length());
-    ASSERT_OK(cache->Insert(
-        "k1", item1_2, &CompressedSecondaryCacheTest::helper_, str1.length()));
+    ASSERT_OK(cache->Insert(key1, item1_2, GetHelper(), str1.length()));
    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 3);

    // After this Insert, primary cache contains k2 and secondary cache contains
    // k1's item, k2's dummy item, and k3's dummy item.
    auto item2_2 = new TestItem(str2.data(), str2.length());
-    ASSERT_OK(cache->Insert(
-        "k2", item2_2, &CompressedSecondaryCacheTest::helper_, str2.length()));
+    ASSERT_OK(cache->Insert(key2, item2_2, GetHelper(), str2.length()));
    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1);
    if (sec_cache_is_compressed) {
      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
@ -356,8 +304,7 @@ class CompressedSecondaryCacheTest : public testing::Test {
    // After this Insert, primary cache contains k3 and secondary cache contains
    // k1's item and k2's item.
    auto item3_2 = new TestItem(str3.data(), str3.length());
-    ASSERT_OK(cache->Insert(
-        "k3", item3_2, &CompressedSecondaryCacheTest::helper_, str3.length()));
+    ASSERT_OK(cache->Insert(key3, item3_2, GetHelper(), str3.length()));
    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2);
    if (sec_cache_is_compressed) {
      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
@ -370,8 +317,7 @@ class CompressedSecondaryCacheTest : public testing::Test {
    }

    Cache::Handle* handle;
-    handle = cache->Lookup("k3", &CompressedSecondaryCacheTest::helper_,
-                           test_item_creator, Cache::Priority::LOW, true,
+    handle = cache->Lookup(key3, GetHelper(), this, Cache::Priority::LOW,
                           stats.get());
    ASSERT_NE(handle, nullptr);
    auto val3 = static_cast<TestItem*>(cache->Value(handle));
@ -380,15 +326,13 @@ class CompressedSecondaryCacheTest : public testing::Test {
    cache->Release(handle);

    // Lookup an non-existent key.
-    handle = cache->Lookup("k0", &CompressedSecondaryCacheTest::helper_,
-                           test_item_creator, Cache::Priority::LOW, true,
+    handle = cache->Lookup(key0, GetHelper(), this, Cache::Priority::LOW,
                           stats.get());
    ASSERT_EQ(handle, nullptr);

    // This Lookup should just insert a dummy handle in the primary cache
    // and the k1 is still in the secondary cache.
-    handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
-                           test_item_creator, Cache::Priority::LOW, true,
+    handle = cache->Lookup(key1, GetHelper(), this, Cache::Priority::LOW,
                           stats.get());
    ASSERT_NE(handle, nullptr);
    ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 1);
@ -400,8 +344,7 @@ class CompressedSecondaryCacheTest : public testing::Test {
    // This Lookup should erase k1 from the secondary cache and insert
    // it into primary cache; then k3 is demoted.
    // k2 and k3 are in secondary cache.
-    handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
-                           test_item_creator, Cache::Priority::LOW, true,
+    handle = cache->Lookup(key1, GetHelper(), this, Cache::Priority::LOW,
                           stats.get());
    ASSERT_NE(handle, nullptr);
    ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 1);
@ -409,8 +352,7 @@ class CompressedSecondaryCacheTest : public testing::Test {
    cache->Release(handle);

    // k2 is still in secondary cache.
-    handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
-                           test_item_creator, Cache::Priority::LOW, true,
+    handle = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW,
                           stats.get());
    ASSERT_NE(handle, nullptr);
    ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 2);
@ -418,8 +360,7 @@ class CompressedSecondaryCacheTest : public testing::Test {

    // Testing SetCapacity().
    ASSERT_OK(secondary_cache->SetCapacity(0));
-    handle = cache->Lookup("k3", &CompressedSecondaryCacheTest::helper_,
-                           test_item_creator, Cache::Priority::LOW, true,
+    handle = cache->Lookup(key3, GetHelper(), this, Cache::Priority::LOW,
                           stats.get());
    ASSERT_EQ(handle, nullptr);

@ -429,35 +370,30 @@ class CompressedSecondaryCacheTest : public testing::Test {
    ASSERT_EQ(capacity, 7000);
    auto item1_3 = new TestItem(str1.data(), str1.length());
    // After this Insert, primary cache contains k1.
-    ASSERT_OK(cache->Insert(
-        "k1", item1_3, &CompressedSecondaryCacheTest::helper_, str2.length()));
+    ASSERT_OK(cache->Insert(key1, item1_3, GetHelper(), str2.length()));
    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 3);
    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 4);

    auto item2_3 = new TestItem(str2.data(), str2.length());
    // After this Insert, primary cache contains k2 and secondary cache contains
    // k1's dummy item.
-    ASSERT_OK(cache->Insert(
-        "k2", item2_3, &CompressedSecondaryCacheTest::helper_, str1.length()));
+    ASSERT_OK(cache->Insert(key2, item2_3, GetHelper(), str1.length()));
    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 4);

    auto item1_4 = new TestItem(str1.data(), str1.length());
    // After this Insert, primary cache contains k1 and secondary cache contains
    // k1's dummy item and k2's dummy item.
-    ASSERT_OK(cache->Insert(
-        "k1", item1_4, &CompressedSecondaryCacheTest::helper_, str2.length()));
+    ASSERT_OK(cache->Insert(key1, item1_4, GetHelper(), str2.length()));
    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 5);

    auto item2_4 = new TestItem(str2.data(), str2.length());
    // After this Insert, primary cache contains k2 and secondary cache contains
    // k1's real item and k2's dummy item.
-    ASSERT_OK(cache->Insert(
-        "k2", item2_4, &CompressedSecondaryCacheTest::helper_, str2.length()));
+    ASSERT_OK(cache->Insert(key2, item2_4, GetHelper(), str2.length()));
    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 5);
    // This Lookup should just insert a dummy handle in the primary cache
    // and the k1 is still in the secondary cache.
-    handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
-                           test_item_creator, Cache::Priority::LOW, true,
+    handle = cache->Lookup(key1, GetHelper(), this, Cache::Priority::LOW,
                           stats.get());

    ASSERT_NE(handle, nullptr);
@ -485,31 +421,31 @@ class CompressedSecondaryCacheTest : public testing::Test {
    std::shared_ptr<SecondaryCache> secondary_cache =
        NewCompressedSecondaryCache(secondary_cache_opts);

-    LRUCacheOptions opts(
+    std::shared_ptr<Cache> cache = NewCache(
        /*_capacity=*/1300, /*_num_shard_bits=*/0,
-        /*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5,
-        /*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex,
-        kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0);
-    opts.secondary_cache = secondary_cache;
-    std::shared_ptr<Cache> cache = NewLRUCache(opts);
+        /*_strict_capacity_limit=*/false, secondary_cache);

    Random rnd(301);
    std::string str1 = rnd.RandomString(1001);
    auto item1 = std::make_unique<TestItem>(str1.data(), str1.length());
-    ASSERT_NOK(cache->Insert("k1", item1.get(), nullptr, str1.length()));
-    ASSERT_OK(cache->Insert("k1", item1.get(),
-                            &CompressedSecondaryCacheTest::helper_,
-                            str1.length()));
+    ASSERT_OK(cache->Insert(key1, item1.get(), GetHelper(), str1.length()));
    item1.release();  // Appease clang-analyze "potential memory leak"

    Cache::Handle* handle;
-    handle = cache->Lookup("k2", nullptr, test_item_creator,
-                           Cache::Priority::LOW, true);
+    handle = cache->Lookup(key2, nullptr, this, Cache::Priority::LOW);
    ASSERT_EQ(handle, nullptr);
-    handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
-                           test_item_creator, Cache::Priority::LOW, false);
+    handle = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW);
    ASSERT_EQ(handle, nullptr);

+    Cache::AsyncLookupHandle ah;
+    ah.key = key2;
+    ah.helper = GetHelper();
+    ah.create_context = this;
+    ah.priority = Cache::Priority::LOW;
+    cache->StartAsyncLookup(ah);
+    cache->Wait(ah);
+    ASSERT_EQ(ah.Result(), nullptr);
+
    cache.reset();
    secondary_cache.reset();
  }
@ -532,40 +468,29 @@ class CompressedSecondaryCacheTest : public testing::Test {
    std::shared_ptr<SecondaryCache> secondary_cache =
        NewCompressedSecondaryCache(secondary_cache_opts);

-    LRUCacheOptions opts(
+    std::shared_ptr<Cache> cache = NewCache(
        /*_capacity=*/1300, /*_num_shard_bits=*/0,
-        /*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5,
-        /*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex,
-        kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0);
-    opts.secondary_cache = secondary_cache;
-    std::shared_ptr<Cache> cache = NewLRUCache(opts);
+        /*_strict_capacity_limit=*/true, secondary_cache);

    Random rnd(301);
    std::string str1 = rnd.RandomString(1001);
    auto item1 = new TestItem(str1.data(), str1.length());
-    ASSERT_OK(cache->Insert("k1", item1,
-                            &CompressedSecondaryCacheTest::helper_fail_,
-                            str1.length()));
+    ASSERT_OK(cache->Insert(key1, item1, GetHelperFail(), str1.length()));

    std::string str2 = rnd.RandomString(1002);
    auto item2 = new TestItem(str2.data(), str2.length());
    // k1 should be demoted to the secondary cache.
-    ASSERT_OK(cache->Insert("k2", item2,
-                            &CompressedSecondaryCacheTest::helper_fail_,
-                            str2.length()));
+    ASSERT_OK(cache->Insert(key2, item2, GetHelperFail(), str2.length()));

    Cache::Handle* handle;
-    handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_fail_,
-                           test_item_creator, Cache::Priority::LOW, true);
+    handle = cache->Lookup(key2, GetHelperFail(), this, Cache::Priority::LOW);
    ASSERT_NE(handle, nullptr);
    cache->Release(handle);
    // This lookup should fail, since k1 demotion would have failed.
-    handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_fail_,
-                           test_item_creator, Cache::Priority::LOW, true);
+    handle = cache->Lookup(key1, GetHelperFail(), this, Cache::Priority::LOW);
    ASSERT_EQ(handle, nullptr);
    // Since k1 was not promoted, k2 should still be in cache.
-    handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_fail_,
-                           test_item_creator, Cache::Priority::LOW, true);
+    handle = cache->Lookup(key2, GetHelperFail(), this, Cache::Priority::LOW);
    ASSERT_NE(handle, nullptr);
    cache->Release(handle);

@ -591,39 +516,30 @@ class CompressedSecondaryCacheTest : public testing::Test {
    std::shared_ptr<SecondaryCache> secondary_cache =
        NewCompressedSecondaryCache(secondary_cache_opts);

-    LRUCacheOptions opts(
+    std::shared_ptr<Cache> cache = NewCache(
        /*_capacity=*/1300, /*_num_shard_bits=*/0,
-        /*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5,
-        /*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex,
-        kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0);
-    opts.secondary_cache = secondary_cache;
-    std::shared_ptr<Cache> cache = NewLRUCache(opts);
+        /*_strict_capacity_limit=*/true, secondary_cache);

    Random rnd(301);
    std::string str1 = rnd.RandomString(1001);
    auto item1 = new TestItem(str1.data(), str1.length());
-    ASSERT_OK(cache->Insert("k1", item1, &CompressedSecondaryCacheTest::helper_,
-                            str1.length()));
+    ASSERT_OK(cache->Insert(key1, item1, GetHelper(), str1.length()));

    std::string str2 = rnd.RandomString(1002);
    auto item2 = new TestItem(str2.data(), str2.length());
    // k1 should be demoted to the secondary cache.
-    ASSERT_OK(cache->Insert("k2", item2, &CompressedSecondaryCacheTest::helper_,
-                            str2.length()));
+    ASSERT_OK(cache->Insert(key2, item2, GetHelper(), str2.length()));

    Cache::Handle* handle;
    SetFailCreate(true);
-    handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
-                           test_item_creator, Cache::Priority::LOW, true);
+    handle = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW);
    ASSERT_NE(handle, nullptr);
    cache->Release(handle);
    // This lookup should fail, since k1 creation would have failed
-    handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
-                           test_item_creator, Cache::Priority::LOW, true);
+    handle = cache->Lookup(key1, GetHelper(), this, Cache::Priority::LOW);
    ASSERT_EQ(handle, nullptr);
    // Since k1 didn't get promoted, k2 should still be in cache
-    handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
-                           test_item_creator, Cache::Priority::LOW, true);
+    handle = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW);
    ASSERT_NE(handle, nullptr);
    cache->Release(handle);

@ -649,43 +565,34 @@ class CompressedSecondaryCacheTest : public testing::Test {
    std::shared_ptr<SecondaryCache> secondary_cache =
        NewCompressedSecondaryCache(secondary_cache_opts);

-    LRUCacheOptions opts(
+    std::shared_ptr<Cache> cache = NewCache(
        /*_capacity=*/1300, /*_num_shard_bits=*/0,
-        /*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5,
-        /*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex,
-        kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0);
-    opts.secondary_cache = secondary_cache;
-    std::shared_ptr<Cache> cache = NewLRUCache(opts);
+        /*_strict_capacity_limit=*/false, secondary_cache);

    Random rnd(301);
    std::string str1 = rnd.RandomString(1001);
    auto item1_1 = new TestItem(str1.data(), str1.length());
-    ASSERT_OK(cache->Insert(
-        "k1", item1_1, &CompressedSecondaryCacheTest::helper_, str1.length()));
+    ASSERT_OK(cache->Insert(key1, item1_1, GetHelper(), str1.length()));

    std::string str2 = rnd.RandomString(1002);
    std::string str2_clone{str2};
    auto item2 = new TestItem(str2.data(), str2.length());
    // After this Insert, primary cache contains k2 and secondary cache contains
    // k1's dummy item.
-    ASSERT_OK(cache->Insert("k2", item2, &CompressedSecondaryCacheTest::helper_,
-                            str2.length()));
+    ASSERT_OK(cache->Insert(key2, item2, GetHelper(), str2.length()));

    // After this Insert, primary cache contains k1 and secondary cache contains
    // k1's dummy item and k2's dummy item.
    auto item1_2 = new TestItem(str1.data(), str1.length());
-    ASSERT_OK(cache->Insert(
-        "k1", item1_2, &CompressedSecondaryCacheTest::helper_, str1.length()));
+    ASSERT_OK(cache->Insert(key1, item1_2, GetHelper(), str1.length()));

    auto item2_2 = new TestItem(str2.data(), str2.length());
    // After this Insert, primary cache contains k2 and secondary cache contains
    // k1's item and k2's dummy item.
-    ASSERT_OK(cache->Insert(
-        "k2", item2_2, &CompressedSecondaryCacheTest::helper_, str2.length()));
+    ASSERT_OK(cache->Insert(key2, item2_2, GetHelper(), str2.length()));

    Cache::Handle* handle2;
-    handle2 = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
-                            test_item_creator, Cache::Priority::LOW, true);
+    handle2 = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW);
    ASSERT_NE(handle2, nullptr);
    cache->Release(handle2);

@ -693,14 +600,12 @@ class CompressedSecondaryCacheTest : public testing::Test {
    // strict_capacity_limit is true, but the lookup should still succeed.
    // A k1's dummy item is inserted into primary cache.
    Cache::Handle* handle1;
-    handle1 = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
-                            test_item_creator, Cache::Priority::LOW, true);
+    handle1 = cache->Lookup(key1, GetHelper(), this, Cache::Priority::LOW);
    ASSERT_NE(handle1, nullptr);
    cache->Release(handle1);

    // Since k1 didn't get inserted, k2 should still be in cache
-    handle2 = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
-                            test_item_creator, Cache::Priority::LOW, true);
+    handle2 = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW);
    ASSERT_NE(handle2, nullptr);
    cache->Release(handle2);

@ -723,8 +628,9 @@ class CompressedSecondaryCacheTest : public testing::Test {

    using CacheValueChunk = CompressedSecondaryCache::CacheValueChunk;
    std::unique_ptr<CompressedSecondaryCache> sec_cache =
-        std::make_unique<CompressedSecondaryCache>(1000, 0, true, 0.5, 0.0,
-                                                   allocator);
+        std::make_unique<CompressedSecondaryCache>(
+            CompressedSecondaryCacheOptions(1000, 0, true, 0.5, 0.0,
+                                            allocator));
    Random rnd(301);
    // 8500 = 8169 + 233 + 98, so there should be 3 chunks after split.
    size_t str_size{8500};
@ -741,7 +647,7 @@ class CompressedSecondaryCacheTest : public testing::Test {
    current_chunk = current_chunk->next;
    ASSERT_EQ(current_chunk->size, 98);

-    sec_cache->GetDeletionCallback(true)("dummy", chunks_head);
+    sec_cache->GetHelper(true)->del_cb(chunks_head, /*alloc*/ nullptr);
  }

  void MergeChunksIntoValueTest() {
@ -775,7 +681,8 @@ class CompressedSecondaryCacheTest : public testing::Test {
    std::string str = str1 + str2 + str3;

    std::unique_ptr<CompressedSecondaryCache> sec_cache =
-        std::make_unique<CompressedSecondaryCache>(1000, 0, true, 0.5, 0.0);
+        std::make_unique<CompressedSecondaryCache>(
+            CompressedSecondaryCacheOptions(1000, 0, true, 0.5, 0.0));
    size_t charge{0};
    CacheAllocationPtr value =
        sec_cache->MergeChunksIntoValue(chunks_head, charge);
@ -805,8 +712,9 @@ class CompressedSecondaryCacheTest : public testing::Test {

    using CacheValueChunk = CompressedSecondaryCache::CacheValueChunk;
    std::unique_ptr<CompressedSecondaryCache> sec_cache =
-        std::make_unique<CompressedSecondaryCache>(1000, 0, true, 0.5, 0.0,
-                                                   allocator);
+        std::make_unique<CompressedSecondaryCache>(
+            CompressedSecondaryCacheOptions(1000, 0, true, 0.5, 0.0,
+                                            allocator));
    Random rnd(301);
    // 8500 = 8169 + 233 + 98, so there should be 3 chunks after split.
    size_t str_size{8500};
@ -822,31 +730,29 @@ class CompressedSecondaryCacheTest : public testing::Test {
    std::string value_str{value.get(), charge};
    ASSERT_EQ(strcmp(value_str.data(), str.data()), 0);

-    sec_cache->GetDeletionCallback(true)("dummy", chunks_head);
+    sec_cache->GetHelper(true)->del_cb(chunks_head, /*alloc*/ nullptr);
  }
-
- private:
-  bool fail_create_;
 };

-Cache::CacheItemHelper CompressedSecondaryCacheTest::helper_(
-    CompressedSecondaryCacheTest::SizeCallback,
-    CompressedSecondaryCacheTest::SaveToCallback,
-    CompressedSecondaryCacheTest::DeletionCallback);
+// Fixture parameterized on a single std::string. Type() overrides the base
+// class hook and returns that parameter unchanged; the instantiation below
+// feeds it from GetTestingCacheTypes().
+// NOTE(review): presumably the string selects which primary cache
+// implementation the base fixture builds -- confirm against
+// CompressedSecondaryCacheTestBase.
+class CompressedSecondaryCacheTest
+    : public CompressedSecondaryCacheTestBase,
+      public testing::WithParamInterface<std::string> {
+  const std::string& Type() override { return GetParam(); }
+};

-Cache::CacheItemHelper CompressedSecondaryCacheTest::helper_fail_(
-    CompressedSecondaryCacheTest::SizeCallback,
-    CompressedSecondaryCacheTest::SaveToCallbackFail,
-    CompressedSecondaryCacheTest::DeletionCallback);
+INSTANTIATE_TEST_CASE_P(CompressedSecondaryCacheTest,
+                        CompressedSecondaryCacheTest, GetTestingCacheTypes());

 class CompressedSecCacheTestWithCompressAndAllocatorParam
-    : public CompressedSecondaryCacheTest,
-      public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+    : public CompressedSecondaryCacheTestBase,
+      public ::testing::WithParamInterface<
+          std::tuple<bool, bool, std::string>> {
 public:
+  // Test parameter layout: <0> -> sec_cache_is_compressed_,
+  // <1> -> use_jemalloc_, <2> -> the string returned by Type().
  CompressedSecCacheTestWithCompressAndAllocatorParam() {
    sec_cache_is_compressed_ = std::get<0>(GetParam());
    use_jemalloc_ = std::get<1>(GetParam());
  }
+  const std::string& Type() override { return std::get<2>(GetParam()); }
  bool sec_cache_is_compressed_;
  bool use_jemalloc_;
 };
@ -857,20 +763,20 @@ TEST_P(CompressedSecCacheTestWithCompressAndAllocatorParam, BasicTes) {

 INSTANTIATE_TEST_CASE_P(CompressedSecCacheTests,
                        CompressedSecCacheTestWithCompressAndAllocatorParam,
-                        ::testing::Combine(testing::Bool(), testing::Bool()));
+                        ::testing::Combine(testing::Bool(), testing::Bool(),
+                                           GetTestingCacheTypes()));

 class CompressedSecondaryCacheTestWithCompressionParam
-    : public CompressedSecondaryCacheTest,
-      public ::testing::WithParamInterface<bool> {
+    : public CompressedSecondaryCacheTestBase,
+      public ::testing::WithParamInterface<std::tuple<bool, std::string>> {
 public:
+  // Test parameter layout: <0> -> sec_cache_is_compressed_,
+  // <1> -> the string returned by Type().
  CompressedSecondaryCacheTestWithCompressionParam() {
-    sec_cache_is_compressed_ = GetParam();
+    sec_cache_is_compressed_ = std::get<0>(GetParam());
  }
+  const std::string& Type() override { return std::get<1>(GetParam()); }
  bool sec_cache_is_compressed_;
 };

-#ifndef ROCKSDB_LITE
-
 TEST_P(CompressedSecondaryCacheTestWithCompressionParam, BasicTestFromString) {
  std::shared_ptr<SecondaryCache> sec_cache{nullptr};
  std::string sec_cache_uri;
@ -934,7 +840,6 @@ TEST_P(CompressedSecondaryCacheTestWithCompressionParam,
  BasicTestHelper(sec_cache, sec_cache_is_compressed_);
 }

-#endif  // ROCKSDB_LITE

 TEST_P(CompressedSecondaryCacheTestWithCompressionParam, FailsTest) {
  FailsTest(sec_cache_is_compressed_);
@ -960,18 +865,92 @@ TEST_P(CompressedSecondaryCacheTestWithCompressionParam,
  IntegrationFullCapacityTest(sec_cache_is_compressed_);
 }

+// For every cache entry role, inserts the same item twice into a
+// CompressedSecondaryCache configured with a random `do_not_compress_roles`
+// set, reads it back, and uses the perf-context byte counters to check that
+// compression statistics are recorded only for roles that are not excluded
+// (and only when the compressed variant of the test is running).
+TEST_P(CompressedSecondaryCacheTestWithCompressionParam, EntryRoles) {
+  CompressedSecondaryCacheOptions opts;
+  opts.capacity = 2048;
+  opts.num_shard_bits = 0;
+
+  // The compressed variant is skipped when LZ4 is unavailable; the
+  // uncompressed variant turns compression off explicitly.
+  if (sec_cache_is_compressed_) {
+    if (!LZ4_Supported()) {
+      ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
+      return;
+    }
+  } else {
+    opts.compression_type = CompressionType::kNoCompression;
+  }
+
+  // Select a random subset to include, for fast test
+  Random& r = *Random::GetTLSInstance();
+  CacheEntryRoleSet do_not_compress;
+  for (uint32_t i = 0; i < kNumCacheEntryRoles; ++i) {
+    // A few included on average, but decent chance of zero
+    if (r.OneIn(5)) {
+      do_not_compress.Add(static_cast<CacheEntryRole>(i));
+    }
+  }
+  opts.do_not_compress_roles = do_not_compress;
+
+  std::shared_ptr<SecondaryCache> sec_cache = NewCompressedSecondaryCache(opts);
+
+  // Fixed seed to ensure consistent compressibility (doesn't compress)
+  std::string junk(Random(301).RandomString(1000));
+
+  for (uint32_t i = 0; i < kNumCacheEntryRoles; ++i) {
+    CacheEntryRole role = static_cast<CacheEntryRole>(i);
+
+    // Uniquify `junk`
+    junk[0] = static_cast<char>(i);
+    TestItem item{junk.data(), junk.length()};
+    // The key is the first 16 bytes of the uniquified payload.
+    Slice ith_key = Slice(junk.data(), 16);
+
+    // Zero the counters so the exact-value checks below are per-role.
+    get_perf_context()->Reset();
+    // The first insert of a key is expected to be counted as a dummy insert...
+    ASSERT_OK(sec_cache->Insert(ith_key, &item, GetHelper(role)));
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 1U);
+
+    // ...and the second insert of the same key as a real insert.
+    ASSERT_OK(sec_cache->Insert(ith_key, &item, GetHelper(role)));
+    ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1U);
+
+    bool kept_in_sec_cache{true};
+    std::unique_ptr<SecondaryCacheResultHandle> handle =
+        sec_cache->Lookup(ith_key, GetHelper(role), this, true,
+                          /*advise_erase=*/true, kept_in_sec_cache);
+    ASSERT_NE(handle, nullptr);
+
+    // Lookup returns the right data
+    // `val` takes ownership of the object returned by handle->Value().
+    std::unique_ptr<TestItem> val =
+        std::unique_ptr<TestItem>(static_cast<TestItem*>(handle->Value()));
+    ASSERT_NE(val, nullptr);
+    ASSERT_EQ(memcmp(val->Buf(), item.Buf(), item.Size()), 0);
+
+    // Byte counters are expected to be non-zero only when compression is
+    // enabled and this role was not added to do_not_compress. The expected
+    // compressed size (1007) exceeds the input size (1000), matching the note
+    // above that this payload does not compress.
+    bool compressed =
+        sec_cache_is_compressed_ && !do_not_compress.Contains(role);
+    if (compressed) {
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
+                1000);
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes,
+                1007);
+    } else {
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
+      ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
+    }
+  }
+}
+
 INSTANTIATE_TEST_CASE_P(CompressedSecCacheTests,
                        CompressedSecondaryCacheTestWithCompressionParam,
-                        testing::Bool());
+                        testing::Combine(testing::Bool(),
+                                         GetTestingCacheTypes()));

 class CompressedSecCacheTestWithCompressAndSplitParam
-    : public CompressedSecondaryCacheTest,
-      public ::testing::WithParamInterface<std::tuple<bool, bool>> {
+    : public CompressedSecondaryCacheTestBase,
+      public ::testing::WithParamInterface<
+          std::tuple<bool, bool, std::string>> {
 public:
+  // Test parameter layout: <0> -> sec_cache_is_compressed_,
+  // <1> -> enable_custom_split_merge_, <2> -> the string returned by Type().
  CompressedSecCacheTestWithCompressAndSplitParam() {
    sec_cache_is_compressed_ = std::get<0>(GetParam());
    enable_custom_split_merge_ = std::get<1>(GetParam());
  }
+  const std::string& Type() override { return std::get<2>(GetParam()); }
  bool sec_cache_is_compressed_;
  bool enable_custom_split_merge_;
 };
@ -982,20 +961,112 @@ TEST_P(CompressedSecCacheTestWithCompressAndSplitParam, BasicIntegrationTest) {

 INSTANTIATE_TEST_CASE_P(CompressedSecCacheTests,
                        CompressedSecCacheTestWithCompressAndSplitParam,
-                        ::testing::Combine(testing::Bool(), testing::Bool()));
+                        ::testing::Combine(testing::Bool(), testing::Bool(),
+                                           GetTestingCacheTypes()));

-TEST_F(CompressedSecondaryCacheTest, SplitValueIntoChunksTest) {
+// The three tests below each forward to the fixture helper of the same name;
+// they are parameterized (TEST_P) on the fixture's std::string parameter.
+TEST_P(CompressedSecondaryCacheTest, SplitValueIntoChunksTest) {
  SplitValueIntoChunksTest();
 }

-TEST_F(CompressedSecondaryCacheTest, MergeChunksIntoValueTest) {
+TEST_P(CompressedSecondaryCacheTest, MergeChunksIntoValueTest) {
  MergeChunksIntoValueTest();
 }

-TEST_F(CompressedSecondaryCacheTest, SplictValueAndMergeChunksTest) {
+TEST_P(CompressedSecondaryCacheTest, SplictValueAndMergeChunksTest) {
  SplictValueAndMergeChunksTest();
 }

+// Fixture that builds a tiered volatile cache (LRU primary configured with
+// 70 << 20 bytes, compressed secondary configured with 30 << 20 bytes) and a
+// CacheReservationManager bound to it, and exposes the two underlying tiers
+// to the tests.
+class CompressedSecCacheTestWithTiered : public ::testing::Test {
+ public:
+  CompressedSecCacheTestWithTiered() {
+    LRUCacheOptions lru_opts;
+    TieredVolatileCacheOptions opts;
+    lru_opts.capacity = 70 << 20;
+    // Points at a stack local, so it is only valid inside this constructor.
+    // NOTE(review): assumes NewTieredVolatileCache() copies what it needs
+    // before returning -- confirm.
+    opts.cache_opts = &lru_opts;
+    opts.cache_type = PrimaryCacheType::kCacheTypeLRU;
+    opts.comp_cache_opts.capacity = 30 << 20;
+    cache_ = NewTieredVolatileCache(opts);
+    cache_res_mgr_ =
+        std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
+            cache_);
+  }
+
+ protected:
+  // Reservation manager created over cache_ in the constructor.
+  CacheReservationManager* cache_res_mgr() { return cache_res_mgr_.get(); }
+
+  // Casts cache_ to CacheWithSecondaryAdapter and returns what its
+  // TEST_GetCache() hook exposes (used by the tests as the primary tier).
+  Cache* GetCache() {
+    return static_cast_with_check<CacheWithSecondaryAdapter, Cache>(
+               cache_.get())
+        ->TEST_GetCache();
+  }
+
+  // Same cast as GetCache(), returning the adapter's secondary cache.
+  SecondaryCache* GetSecondaryCache() {
+    return static_cast_with_check<CacheWithSecondaryAdapter, Cache>(
+               cache_.get())
+        ->TEST_GetSecondaryCache();
+  }
+
+  // Returns `percent` percent of `val` using integer arithmetic (the
+  // division truncates).
+  size_t GetPercent(size_t val, unsigned int percent) {
+    return static_cast<size_t>(val * percent / 100);
+  }
+
+ private:
+  std::shared_ptr<Cache> cache_;
+  std::shared_ptr<CacheReservationManager> cache_res_mgr_;
+};
+
+// Returns true when val1 lies strictly within `error` of val2, i.e.
+// |val1 - val2| < error.
+//
+// The distance is obtained by subtracting the smaller value from the larger
+// one, so the unsigned arithmetic cannot wrap. Evaluating `val2 - error` or
+// `val2 + error` directly would wrap around when error > val2 (or when the
+// sum exceeds the size_t range) and make the predicate fail spuriously.
+bool CacheUsageWithinBounds(size_t val1, size_t val2, size_t error) {
+  const size_t distance = (val1 > val2) ? (val1 - val2) : (val2 - val1);
+  return distance < error;
+}
+
+// Checks how a reservation made through the tiered cache's reservation
+// manager shows up in the usage of each tier. Expected values, each with a
+// tolerance of 1% of the expected value:
+//   - no reservation:        primary ~ 30 << 20, secondary == 0
+//   - after reserving 10 << 20: primary ~ 37 << 20, secondary ~ 3 << 20
+//   - after releasing to 0:  primary ~ 30 << 20, secondary == 0
+// NOTE(review): the 30 << 20 baseline equals the configured compressed cache
+// capacity and the 7/3 split matches the 70/30 capacity ratio of the fixture;
+// presumably both follow from how the adapter distributes reservations --
+// confirm against CacheWithSecondaryAdapter.
+TEST_F(CompressedSecCacheTestWithTiered, CacheReservationManager) {
+  CompressedSecondaryCache* sec_cache =
+      reinterpret_cast<CompressedSecondaryCache*>(GetSecondaryCache());
+
+  // Use EXPECT_PRED3 instead of EXPECT_NEAR to avoid too many explicit
+  // size_t to double casts
+  EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (30 << 20),
+               GetPercent(30 << 20, 1));
+  EXPECT_EQ(sec_cache->TEST_GetUsage(), 0);
+
+  ASSERT_OK(cache_res_mgr()->UpdateCacheReservation(10 << 20));
+  EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (37 << 20),
+               GetPercent(37 << 20, 1));
+  EXPECT_PRED3(CacheUsageWithinBounds, sec_cache->TEST_GetUsage(), (3 << 20),
+               GetPercent(3 << 20, 1));
+
+  ASSERT_OK(cache_res_mgr()->UpdateCacheReservation(0));
+  EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (30 << 20),
+               GetPercent(30 << 20, 1));
+  EXPECT_EQ(sec_cache->TEST_GetUsage(), 0);
+}
+
+// Same usage expectations as a single 10 << 20 reservation, but the
+// reservation is grown in ten 1 << 20 steps and then shrunk back to zero in
+// ten steps. Usage is checked only before the loops, at the peak, and after
+// the reservation has been fully released.
+TEST_F(CompressedSecCacheTestWithTiered,
+       CacheReservationManagerMultipleUpdate) {
+  CompressedSecondaryCache* sec_cache =
+      reinterpret_cast<CompressedSecondaryCache*>(GetSecondaryCache());
+
+  EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (30 << 20),
+               GetPercent(30 << 20, 1));
+  EXPECT_EQ(sec_cache->TEST_GetUsage(), 0);
+
+  int i;
+  // Grow: total reservation goes 1 << 20, 2 << 20, ..., 10 << 20.
+  for (i = 0; i < 10; ++i) {
+    ASSERT_OK(cache_res_mgr()->UpdateCacheReservation((1 + i) << 20));
+  }
+  EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (37 << 20),
+               GetPercent(37 << 20, 1));
+  EXPECT_PRED3(CacheUsageWithinBounds, sec_cache->TEST_GetUsage(), (3 << 20),
+               GetPercent(3 << 20, 1));
+
+  // Shrink: total reservation goes 9 << 20, 8 << 20, ..., 0.
+  for (i = 10; i > 0; --i) {
+    ASSERT_OK(cache_res_mgr()->UpdateCacheReservation(((i - 1) << 20)));
+  }
+  EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (30 << 20),
+               GetPercent(30 << 20, 1));
+  EXPECT_EQ(sec_cache->TEST_GetUsage(), 0);
+}
+
 }  // namespace ROCKSDB_NAMESPACE

 int main(int argc, char** argv) {
--- a/cache/lru_cache.cc
+++ b/cache/lru_cache.cc
@ -14,28 +14,29 @@
 #include <cstdio>
 #include <cstdlib>

+#include "cache/secondary_cache_adapter.h"
 #include "monitoring/perf_context_imp.h"
-#include "monitoring/statistics.h"
+#include "monitoring/statistics_impl.h"
 #include "port/lang.h"
 #include "util/distributed_mutex.h"

 namespace ROCKSDB_NAMESPACE {
 namespace lru_cache {

-// A distinct pointer value for marking "dummy" cache entries
-void* const kDummyValueMarker = const_cast<char*>("kDummyValueMarker");
-
-LRUHandleTable::LRUHandleTable(int max_upper_hash_bits)
+// Builds an empty table: 2^4 value-initialized (null) bucket heads and zero
+// elements. `allocator` is only stored here (allocator_); the destructor
+// passes it to LRUHandle::Free() when releasing unreferenced entries.
+LRUHandleTable::LRUHandleTable(int max_upper_hash_bits,
+                               MemoryAllocator* allocator)
    : length_bits_(/* historical starting size*/ 4),
      list_(new LRUHandle* [size_t{1} << length_bits_] {}),
      elems_(0),
-      max_length_bits_(max_upper_hash_bits) {}
+      max_length_bits_(max_upper_hash_bits),
+      allocator_(allocator) {}

 LRUHandleTable::~LRUHandleTable() {
+  auto alloc = allocator_;
  ApplyToEntriesRange(
-      [](LRUHandle* h) {
+      [alloc](LRUHandle* h) {
        if (!h->HasRefs()) {
-          h->Free();
+          h->Free(alloc);
        }
      },
      0, size_t{1} << length_bits_);
@ -95,7 +96,7 @@ void LRUHandleTable::Resize() {
  std::unique_ptr<LRUHandle* []> new_list {
    new LRUHandle* [size_t{1} << new_length_bits] {}
  };
-  uint32_t count = 0;
+  [[maybe_unused]] uint32_t count = 0;
  for (uint32_t i = 0; i < old_length; i++) {
    LRUHandle* h = list_[i];
    while (h != nullptr) {
@ -118,7 +119,8 @@ LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit,
                             double low_pri_pool_ratio, bool use_adaptive_mutex,
                             CacheMetadataChargePolicy metadata_charge_policy,
                             int max_upper_hash_bits,
-                             SecondaryCache* secondary_cache)
+                             MemoryAllocator* allocator,
+                             const Cache::EvictionCallback* eviction_callback)
    : CacheShardBase(metadata_charge_policy),
      capacity_(0),
      high_pri_pool_usage_(0),
@ -128,11 +130,11 @@ LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit,
      high_pri_pool_capacity_(0),
      low_pri_pool_ratio_(low_pri_pool_ratio),
      low_pri_pool_capacity_(0),
-      table_(max_upper_hash_bits),
+      table_(max_upper_hash_bits, allocator),
      usage_(0),
      lru_usage_(0),
      mutex_(use_adaptive_mutex),
-      secondary_cache_(secondary_cache) {
+      eviction_callback_(*eviction_callback) {
  // Make empty circular linked list.
  lru_.next = &lru_;
  lru_.prev = &lru_;
@ -159,13 +161,14 @@ void LRUCacheShard::EraseUnRefEntries() {
  }

  for (auto entry : last_reference_list) {
-    entry->Free();
+    entry->Free(table_.GetAllocator());
  }
 }

 void LRUCacheShard::ApplyToSomeEntries(
-    const std::function<void(const Slice& key, void* value, size_t charge,
-                             DeleterFn deleter)>& callback,
+    const std::function<void(const Slice& key, Cache::ObjectPtr value,
+                             size_t charge,
+                             const Cache::CacheItemHelper* helper)>& callback,
    size_t average_entries_per_lock, size_t* state) {
  // The state is essentially going to be the starting hash, which works
  // nicely even if we resize between calls because we use upper-most
@ -192,11 +195,8 @@ void LRUCacheShard::ApplyToSomeEntries(
  table_.ApplyToEntriesRange(
      [callback,
       metadata_charge_policy = metadata_charge_policy_](LRUHandle* h) {
-        DeleterFn deleter = h->IsSecondaryCacheCompatible()
-                                ? h->info_.helper->del_cb
-                                : h->info_.deleter;
        callback(h->key(), h->value, h->GetCharge(metadata_charge_policy),
-                 deleter);
+                 h->helper);
      },
      index_begin, index_end);
 }
@ -334,16 +334,19 @@ void LRUCacheShard::EvictFromLRU(size_t charge,
  }
 }

-void LRUCacheShard::TryInsertIntoSecondaryCache(
-    autovector<LRUHandle*> evicted_handles) {
-  for (auto entry : evicted_handles) {
-    if (secondary_cache_ && entry->IsSecondaryCacheCompatible() &&
-        !entry->IsInSecondaryCache()) {
-      secondary_cache_->Insert(entry->key(), entry->value, entry->info_.helper)
-          .PermitUncheckedError();
+void LRUCacheShard::NotifyEvicted(
+    const autovector<LRUHandle*>& evicted_handles) {
+  MemoryAllocator* alloc = table_.GetAllocator();  // handed to Free() below
+  for (LRUHandle* entry : evicted_handles) {
+    if (eviction_callback_ &&  // skipped when no callback is set
+        eviction_callback_(entry->key(),
+                           reinterpret_cast<Cache::Handle*>(entry))) {
+      // Callback returned true and now owns obj; free only the handle
+      free(entry);
+    } else {
+      // Not taken: full Free() here, outside of mutex for performance.
+      entry->Free(alloc);
     }
-    // Free the entries here outside of mutex for performance reasons.
-    entry->Free();
   }
 }

@ -357,7 +360,7 @@ void LRUCacheShard::SetCapacity(size_t capacity) {
    EvictFromLRU(0, &last_reference_list);
  }

-  TryInsertIntoSecondaryCache(last_reference_list);
+  NotifyEvicted(last_reference_list);
 }

 void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
@ -365,8 +368,7 @@ void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
  strict_capacity_limit_ = strict_capacity_limit;
 }

-Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle,
-                                 bool free_handle_on_fail) {
+Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle) {
  Status s = Status::OK();
  autovector<LRUHandle*> last_reference_list;

@ -385,10 +387,9 @@ Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle,
        // into cache and get evicted immediately.
        last_reference_list.push_back(e);
      } else {
-        if (free_handle_on_fail) {
-          free(e);
-          *handle = nullptr;
-        }
+        free(e);
+        e = nullptr;
+        *handle = nullptr;
        s = Status::MemoryLimit("Insert failed due to LRU cache being full.");
      }
    } else {
@ -420,192 +421,27 @@ Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle,
    }
  }

-  TryInsertIntoSecondaryCache(last_reference_list);
+  NotifyEvicted(last_reference_list);

  return s;
 }

-void LRUCacheShard::Promote(LRUHandle* e) {
-  SecondaryCacheResultHandle* secondary_handle = e->sec_handle;
-
-  assert(secondary_handle->IsReady());
-  // e is not thread-shared here; OK to modify "immutable" fields as well as
-  // "mutable" (normally requiring mutex)
-  e->SetIsPending(false);
-  e->value = secondary_handle->Value();
-  assert(e->total_charge == 0);
-  size_t value_size = secondary_handle->Size();
-  delete secondary_handle;
-
-  if (e->value) {
-    e->CalcTotalCharge(value_size, metadata_charge_policy_);
-    Status s;
-    if (e->IsStandalone()) {
-      assert(secondary_cache_ && secondary_cache_->SupportForceErase());
-
-      // Insert a dummy handle and return a standalone handle to caller.
-      // Charge the standalone handle.
-      autovector<LRUHandle*> last_reference_list;
-      bool free_standalone_handle{false};
-      {
-        DMutexLock l(mutex_);
-
-        // Free the space following strict LRU policy until enough space
-        // is freed or the lru list is empty.
-        EvictFromLRU(e->total_charge, &last_reference_list);
-
-        if ((usage_ + e->total_charge) > capacity_ && strict_capacity_limit_) {
-          free_standalone_handle = true;
-        } else {
-          usage_ += e->total_charge;
-        }
-      }
-
-      TryInsertIntoSecondaryCache(last_reference_list);
-      if (free_standalone_handle) {
-        e->Unref();
-        e->Free();
-        e = nullptr;
-      } else {
-        PERF_COUNTER_ADD(block_cache_standalone_handle_count, 1);
-      }
-
-      // Insert a dummy handle into the primary cache. This dummy handle is
-      // not IsSecondaryCacheCompatible().
-      // FIXME? This should not overwrite an existing non-dummy entry in the
-      // rare case that one exists
-      Cache::Priority priority =
-          e->IsHighPri() ? Cache::Priority::HIGH : Cache::Priority::LOW;
-      s = Insert(e->key(), e->hash, kDummyValueMarker, /*charge=*/0,
-                 /*deleter=*/nullptr, /*helper=*/nullptr, /*handle=*/nullptr,
-                 priority);
-    } else {
-      e->SetInCache(true);
-      LRUHandle* handle = e;
-      // This InsertItem() could fail if the cache is over capacity and
-      // strict_capacity_limit_ is true. In such a case, we don't want
-      // InsertItem() to free the handle, since the item is already in memory
-      // and the caller will most likely just read it from disk if we erase it
-      // here.
-      s = InsertItem(e, &handle, /*free_handle_on_fail=*/false);
-      if (s.ok()) {
-        PERF_COUNTER_ADD(block_cache_real_handle_count, 1);
-      }
-    }
-
-    if (!s.ok()) {
-      // Item is in memory, but not accounted against the cache capacity.
-      // When the handle is released, the item should get deleted.
-      assert(!e->InCache());
-    }
-  } else {
-    // Secondary cache lookup failed. The caller will take care of detecting
-    // this and eventually releasing e.
-    assert(!e->value);
-    assert(!e->InCache());
-  }
-}
-
 LRUHandle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash,
-                                 const Cache::CacheItemHelper* helper,
-                                 const Cache::CreateCallback& create_cb,
-                                 Cache::Priority priority, bool wait,
-                                 Statistics* stats) {
-  LRUHandle* e = nullptr;
-  bool found_dummy_entry{false};
-  {
-    DMutexLock l(mutex_);
-    e = table_.Lookup(key, hash);
-    if (e != nullptr) {
-      assert(e->InCache());
-      if (e->value == kDummyValueMarker) {
-        // For a dummy handle, if it was retrieved from secondary cache,
-        // it may still exist in secondary cache.
-        // If the handle exists in secondary cache, the value should be
-        // erased from sec cache and be inserted into primary cache.
-        found_dummy_entry = true;
-        // Let the dummy entry be overwritten
-        e = nullptr;
-      } else {
-        if (!e->HasRefs()) {
-          // The entry is in LRU since it's in hash and has no external
-          // references.
-          LRU_Remove(e);
-        }
-        e->Ref();
-        e->SetHit();
-      }
-    }
-  }
-
-  // If handle table lookup failed or the handle is a dummy one, allocate
-  // a handle outside the mutex if we re going to lookup in the secondary cache.
-  //
-  // When a block is firstly Lookup from CompressedSecondaryCache, we just
-  // insert a dummy block into the primary cache (charging the actual size of
-  // the block) and don't erase the block from CompressedSecondaryCache. A
-  // standalone handle is returned to the caller. Only if the block is hit
-  // again, we erase it from CompressedSecondaryCache and add it into the
-  // primary cache.
-  if (!e && secondary_cache_ && helper && helper->saveto_cb) {
-    // For objects from the secondary cache, we expect the caller to provide
-    // a way to create/delete the primary cache object. The only case where
-    // a deleter would not be required is for dummy entries inserted for
-    // accounting purposes, which we won't demote to the secondary cache
-    // anyway.
-    assert(create_cb && helper->del_cb);
-    bool is_in_sec_cache{false};
-    std::unique_ptr<SecondaryCacheResultHandle> secondary_handle =
-        secondary_cache_->Lookup(key, create_cb, wait, found_dummy_entry,
-                                 is_in_sec_cache);
-    if (secondary_handle != nullptr) {
-      e = static_cast<LRUHandle*>(malloc(sizeof(LRUHandle) - 1 + key.size()));
-
-      e->m_flags = 0;
-      e->im_flags = 0;
-      e->SetSecondaryCacheCompatible(true);
-      e->info_.helper = helper;
-      e->key_length = key.size();
-      e->hash = hash;
-      e->refs = 0;
-      e->next = e->prev = nullptr;
-      e->SetPriority(priority);
-      memcpy(e->key_data, key.data(), key.size());
-      e->value = nullptr;
-      e->sec_handle = secondary_handle.release();
-      e->total_charge = 0;
-      e->Ref();
-      e->SetIsInSecondaryCache(is_in_sec_cache);
-      e->SetIsStandalone(secondary_cache_->SupportForceErase() &&
-                         !found_dummy_entry);
-
-      if (wait) {
-        Promote(e);
-        if (e) {
-          if (!e->value) {
-            // The secondary cache returned a handle, but the lookup failed.
-            e->Unref();
-            e->Free();
-            e = nullptr;
-          } else {
-            PERF_COUNTER_ADD(secondary_cache_hit_count, 1);
-            RecordTick(stats, SECONDARY_CACHE_HITS);
-          }
-        }
-      } else {
-        // If wait is false, we always return a handle and let the caller
-        // release the handle after checking for success or failure.
-        e->SetIsPending(true);
-        // This may be slightly inaccurate, if the lookup eventually fails.
-        // But the probability is very low.
-        PERF_COUNTER_ADD(secondary_cache_hit_count, 1);
-        RecordTick(stats, SECONDARY_CACHE_HITS);
-      }
-    } else {
-      // Caller will most likely overwrite the dummy entry with an Insert
-      // after this Lookup fails
-      assert(e == nullptr);
+                                 const Cache::CacheItemHelper* /*helper*/,
+                                 Cache::CreateContext* /*create_context*/,
+                                 Cache::Priority /*priority*/,
+                                 Statistics* /*stats*/) {
+  DMutexLock l(mutex_);
+  LRUHandle* e = table_.Lookup(key, hash);
+  if (e != nullptr) {
+    assert(e->InCache());
+    if (!e->HasRefs()) {
+      // The entry is in LRU since it's in hash and has no external
+      // references.
+      LRU_Remove(e);
    }
+    e->Ref();
+    e->SetHit();
  }
  return e;
 }
@ -614,8 +450,6 @@ bool LRUCacheShard::Ref(LRUHandle* e) {
  DMutexLock l(mutex_);
  // To create another reference - entry must be already externally referenced.
  assert(e->HasRefs());
-  // Pending handles are not for sharing
-  assert(!e->IsPending());
  e->Ref();
  return true;
 }
@ -639,14 +473,13 @@ bool LRUCacheShard::Release(LRUHandle* e, bool /*useful*/,
  if (e == nullptr) {
    return false;
  }
-  bool last_reference = false;
-  // Must Wait or WaitAll first on pending handles. Otherwise, would leak
-  // a secondary cache handle.
-  assert(!e->IsPending());
+  bool must_free;
+  bool was_in_cache;
  {
    DMutexLock l(mutex_);
-    last_reference = e->Unref();
-    if (last_reference && e->InCache()) {
+    must_free = e->Unref();
+    was_in_cache = e->InCache();
+    if (must_free && was_in_cache) {
      // The item is still in cache, and nobody else holds a reference to it.
      if (usage_ > capacity_ || erase_if_last_ref) {
        // The LRU list must be empty since the cache is full.
@ -657,28 +490,39 @@ bool LRUCacheShard::Release(LRUHandle* e, bool /*useful*/,
      } else {
        // Put the item back on the LRU list, and don't free it.
        LRU_Insert(e);
-        last_reference = false;
+        must_free = false;
      }
    }
-    // If it was the last reference, then decrement the cache usage.
-    if (last_reference) {
+    // If about to be freed, then decrement the cache usage.
+    if (must_free) {
      assert(usage_ >= e->total_charge);
      usage_ -= e->total_charge;
    }
  }

  // Free the entry here outside of mutex for performance reasons.
-  if (last_reference) {
-    e->Free();
+  if (must_free) {
+    // Only call eviction callback if we're sure no one requested erasure
+    // FIXME: disabled because of test churn
+    if (false && was_in_cache && !erase_if_last_ref && eviction_callback_ &&
+        eviction_callback_(e->key(), reinterpret_cast<Cache::Handle*>(e))) {
+      // Callback took ownership of obj; just free handle
+      free(e);
+    } else {
+      e->Free(table_.GetAllocator());
+    }
  }
-  return last_reference;
+  return must_free;
 }

-Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
-                             size_t charge,
-                             void (*deleter)(const Slice& key, void* value),
-                             const Cache::CacheItemHelper* helper,
-                             LRUHandle** handle, Cache::Priority priority) {
+LRUHandle* LRUCacheShard::CreateHandle(const Slice& key, uint32_t hash,
+                                       Cache::ObjectPtr value,
+                                       const Cache::CacheItemHelper* helper,
+                                       size_t charge) {
+  assert(helper);
+  // value == nullptr is reserved for indicating failure in SecondaryCache
+  assert(!(helper->IsSecondaryCacheCompatible() && value == nullptr));
+
  // Allocate the memory here outside of the mutex.
  // If the cache is full, we'll have to release it.
  // It shouldn't happen very often though.
@ -688,27 +532,58 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
  e->value = value;
  e->m_flags = 0;
  e->im_flags = 0;
-  if (helper) {
-    // Use only one of the two parameters
-    assert(deleter == nullptr);
-    // value == nullptr is reserved for indicating failure for when secondary
-    // cache compatible
-    assert(value != nullptr);
-    e->SetSecondaryCacheCompatible(true);
-    e->info_.helper = helper;
-  } else {
-    e->info_.deleter = deleter;
-  }
+  e->helper = helper;
  e->key_length = key.size();
  e->hash = hash;
  e->refs = 0;
  e->next = e->prev = nullptr;
-  e->SetInCache(true);
-  e->SetPriority(priority);
  memcpy(e->key_data, key.data(), key.size());
  e->CalcTotalCharge(charge, metadata_charge_policy_);

-  return InsertItem(e, handle, /* free_handle_on_fail */ true);
+  return e;
+}
+
+Status LRUCacheShard::Insert(const Slice& key, uint32_t hash,
+                             Cache::ObjectPtr value,
+                             const Cache::CacheItemHelper* helper,
+                             size_t charge, LRUHandle** handle,
+                             Cache::Priority priority) {
+  LRUHandle* e = CreateHandle(key, hash, value, helper, charge);  // refs == 0
+  e->SetPriority(priority);
+  e->SetInCache(true);  // unlike CreateStandalone(), which leaves it unset
+  return InsertItem(e, handle);  // frees e itself when rejecting the insert
+}
+
+LRUHandle* LRUCacheShard::CreateStandalone(const Slice& key, uint32_t hash,
+                                           Cache::ObjectPtr value,
+                                           const Cache::CacheItemHelper* helper,
+                                           size_t charge,
+                                           bool allow_uncharged) {
+  LRUHandle* e = CreateHandle(key, hash, value, helper, charge);
+  e->SetIsStandalone(true);  // e is never added to table_ in this function
+  e->Ref();  // the returned handle carries this one reference
+
+  autovector<LRUHandle*> last_reference_list;  // filled by EvictFromLRU below
+
+  {
+    DMutexLock l(mutex_);
+
+    EvictFromLRU(e->total_charge, &last_reference_list);
+
+    if (strict_capacity_limit_ && (usage_ + e->total_charge) > capacity_) {
+      if (allow_uncharged) {
+        e->total_charge = 0;  // keep the handle but leave usage_ untouched
+      } else {
+        free(e);  // frees the handle only; helper->del_cb is not invoked
+        e = nullptr;  // caller receives nullptr
+      }
+    } else {
+      usage_ += e->total_charge;
+    }
+  }
+
+  NotifyEvicted(last_reference_list);  // runs after the mutex scope closes
+  return e;
 }

 void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {
@ -733,18 +608,8 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {
  // Free the entry here outside of mutex for performance reasons.
  // last_reference will only be true if e != nullptr.
  if (last_reference) {
-    e->Free();
-  }
-}
-
-bool LRUCacheShard::IsReady(LRUHandle* e) {
-  bool ready = true;
-  if (e->IsPending()) {
-    assert(secondary_cache_);
-    assert(e->sec_handle);
-    ready = e->sec_handle->IsReady();
+    e->Free(table_.GetAllocator());
  }
-  return ready;
 }

 size_t LRUCacheShard::GetUsage() const {
@ -781,30 +646,20 @@ void LRUCacheShard::AppendPrintableOptions(std::string& str) const {
  str.append(buffer);
 }

-LRUCache::LRUCache(size_t capacity, int num_shard_bits,
-                   bool strict_capacity_limit, double high_pri_pool_ratio,
-                   double low_pri_pool_ratio,
-                   std::shared_ptr<MemoryAllocator> allocator,
-                   bool use_adaptive_mutex,
-                   CacheMetadataChargePolicy metadata_charge_policy,
-                   std::shared_ptr<SecondaryCache> _secondary_cache)
-    : ShardedCache(capacity, num_shard_bits, strict_capacity_limit,
-                   std::move(allocator)),
-      secondary_cache_(std::move(_secondary_cache)) {
+LRUCache::LRUCache(const LRUCacheOptions& opts) : ShardedCache(opts) {
  size_t per_shard = GetPerShardCapacity();
-  SecondaryCache* secondary_cache = secondary_cache_.get();
-  InitShards([=](LRUCacheShard* cs) {
-    new (cs) LRUCacheShard(
-        per_shard, strict_capacity_limit, high_pri_pool_ratio,
-        low_pri_pool_ratio, use_adaptive_mutex, metadata_charge_policy,
-        /* max_upper_hash_bits */ 32 - num_shard_bits, secondary_cache);
+  MemoryAllocator* alloc = memory_allocator();
+  InitShards([&](LRUCacheShard* cs) {
+    new (cs) LRUCacheShard(per_shard, opts.strict_capacity_limit,
+                           opts.high_pri_pool_ratio, opts.low_pri_pool_ratio,
+                           opts.use_adaptive_mutex, opts.metadata_charge_policy,
+                           /* max_upper_hash_bits */ 32 - opts.num_shard_bits,
+                           alloc, &eviction_callback_);
  });
 }

-void* LRUCache::Value(Handle* handle) {
+Cache::ObjectPtr LRUCache::Value(Handle* handle) {
  auto h = reinterpret_cast<const LRUHandle*>(handle);
-  assert(!h->IsPending() || h->value == nullptr);
-  assert(h->value != kDummyValueMarker);
  return h->value;
 }

@ -813,13 +668,10 @@ size_t LRUCache::GetCharge(Handle* handle) const {
      GetShard(0).metadata_charge_policy_);
 }

-Cache::DeleterFn LRUCache::GetDeleter(Handle* handle) const {
+const Cache::CacheItemHelper* LRUCache::GetCacheItemHelper(
+    Handle* handle) const {
  auto h = reinterpret_cast<const LRUHandle*>(handle);
-  if (h->IsSecondaryCacheCompatible()) {
-    return h->info_.helper->del_cb;
-  } else {
-    return h->info_.deleter;
-  }
+  return h->helper;
 }

 size_t LRUCache::TEST_GetLRUSize() {
@ -830,51 +682,9 @@ double LRUCache::GetHighPriPoolRatio() {
  return GetShard(0).GetHighPriPoolRatio();
 }

-void LRUCache::WaitAll(std::vector<Handle*>& handles) {
-  if (secondary_cache_) {
-    std::vector<SecondaryCacheResultHandle*> sec_handles;
-    sec_handles.reserve(handles.size());
-    for (Handle* handle : handles) {
-      if (!handle) {
-        continue;
-      }
-      LRUHandle* lru_handle = reinterpret_cast<LRUHandle*>(handle);
-      if (!lru_handle->IsPending()) {
-        continue;
-      }
-      sec_handles.emplace_back(lru_handle->sec_handle);
-    }
-    secondary_cache_->WaitAll(sec_handles);
-    for (Handle* handle : handles) {
-      if (!handle) {
-        continue;
-      }
-      LRUHandle* lru_handle = reinterpret_cast<LRUHandle*>(handle);
-      if (!lru_handle->IsPending()) {
-        continue;
-      }
-      GetShard(lru_handle->hash).Promote(lru_handle);
-    }
-  }
-}
-
-void LRUCache::AppendPrintableOptions(std::string& str) const {
-  ShardedCache::AppendPrintableOptions(str);  // options from shard
-  if (secondary_cache_) {
-    str.append("  secondary_cache:\n");
-    str.append(secondary_cache_->GetPrintableOptions());
-  }
-}
-
 }  // namespace lru_cache

-std::shared_ptr<Cache> NewLRUCache(
-    size_t capacity, int num_shard_bits, bool strict_capacity_limit,
-    double high_pri_pool_ratio,
-    std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
-    CacheMetadataChargePolicy metadata_charge_policy,
-    const std::shared_ptr<SecondaryCache>& secondary_cache,
-    double low_pri_pool_ratio) {
+std::shared_ptr<Cache> LRUCacheOptions::MakeSharedCache() const {
  if (num_shard_bits >= 20) {
    return nullptr;  // The cache cannot be sharded into too many fine pieces.
  }
@ -890,32 +700,24 @@ std::shared_ptr<Cache> NewLRUCache(
    // Invalid high_pri_pool_ratio and low_pri_pool_ratio combination
    return nullptr;
  }
-  if (num_shard_bits < 0) {
-    num_shard_bits = GetDefaultCacheShardBits(capacity);
-  }
-  return std::make_shared<LRUCache>(
-      capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio,
-      low_pri_pool_ratio, std::move(memory_allocator), use_adaptive_mutex,
-      metadata_charge_policy, secondary_cache);
-}
-
-std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts) {
-  return NewLRUCache(cache_opts.capacity, cache_opts.num_shard_bits,
-                     cache_opts.strict_capacity_limit,
-                     cache_opts.high_pri_pool_ratio,
-                     cache_opts.memory_allocator, cache_opts.use_adaptive_mutex,
-                     cache_opts.metadata_charge_policy,
-                     cache_opts.secondary_cache, cache_opts.low_pri_pool_ratio);
-}
-
-std::shared_ptr<Cache> NewLRUCache(
-    size_t capacity, int num_shard_bits, bool strict_capacity_limit,
-    double high_pri_pool_ratio,
-    std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
-    CacheMetadataChargePolicy metadata_charge_policy,
-    double low_pri_pool_ratio) {
-  return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit,
-                     high_pri_pool_ratio, memory_allocator, use_adaptive_mutex,
-                     metadata_charge_policy, nullptr, low_pri_pool_ratio);
+  // For sanitized options
+  LRUCacheOptions opts = *this;
+  if (opts.num_shard_bits < 0) {
+    opts.num_shard_bits = GetDefaultCacheShardBits(capacity);
+  }
+  std::shared_ptr<Cache> cache = std::make_shared<LRUCache>(opts);
+  if (secondary_cache) {
+    cache = std::make_shared<CacheWithSecondaryAdapter>(cache, secondary_cache);
+  }
+  return cache;
+}
+
+std::shared_ptr<RowCache> LRUCacheOptions::MakeSharedRowCache() const {
+  if (secondary_cache) {
+    // A secondary cache is not allowed for a RowCache; report failure
+    return nullptr;
+  }
+  // Returning a Cache here works only while RowCache is an alias for Cache
+  return MakeSharedCache();
 }
 }  // namespace ROCKSDB_NAMESPACE
--- a/cache/lru_cache.h
+++ b/cache/lru_cache.h
@ -13,9 +13,9 @@

 #include "cache/sharded_cache.h"
 #include "port/lang.h"
+#include "port/likely.h"
 #include "port/malloc.h"
 #include "port/port.h"
-#include "rocksdb/secondary_cache.h"
 #include "util/autovector.h"
 #include "util/distributed_mutex.h"

@ -48,19 +48,9 @@ namespace lru_cache {
 // While refs > 0, public properties like value and deleter must not change.

 struct LRUHandle {
-  void* value;
-  union Info {
-    Info() {}
-    ~Info() {}
-    Cache::DeleterFn deleter;
-    const Cache::CacheItemHelper* helper;
-  } info_;
-  // An entry is not added to the LRUHandleTable until the secondary cache
-  // lookup is complete, so its safe to have this union.
-  union {
-    LRUHandle* next_hash;
-    SecondaryCacheResultHandle* sec_handle;
-  };
+  Cache::ObjectPtr value;
+  const Cache::CacheItemHelper* helper;
+  LRUHandle* next_hash;
  LRUHandle* next;
  LRUHandle* prev;
  size_t total_charge;  // TODO(opt): Only allow uint32_t?
@ -93,14 +83,8 @@ struct LRUHandle {
    IM_IS_HIGH_PRI = (1 << 0),
    // Whether this entry is low priority entry.
    IM_IS_LOW_PRI = (1 << 1),
-    // Can this be inserted into the secondary cache.
-    IM_IS_SECONDARY_CACHE_COMPATIBLE = (1 << 2),
-    // Is the handle still being read from a lower tier.
-    IM_IS_PENDING = (1 << 3),
-    // Whether this handle is still in a lower tier
-    IM_IS_IN_SECONDARY_CACHE = (1 << 4),
    // Marks result handles that should not be inserted into cache
-    IM_IS_STANDALONE = (1 << 5),
+    IM_IS_STANDALONE = (1 << 2),
  };

  // Beginning of the key (MUST BE THE LAST FIELD IN THIS STRUCT!)
@ -130,13 +114,6 @@ struct LRUHandle {
  bool IsLowPri() const { return im_flags & IM_IS_LOW_PRI; }
  bool InLowPriPool() const { return m_flags & M_IN_LOW_PRI_POOL; }
  bool HasHit() const { return m_flags & M_HAS_HIT; }
-  bool IsSecondaryCacheCompatible() const {
-    return im_flags & IM_IS_SECONDARY_CACHE_COMPATIBLE;
-  }
-  bool IsPending() const { return im_flags & IM_IS_PENDING; }
-  bool IsInSecondaryCache() const {
-    return im_flags & IM_IS_IN_SECONDARY_CACHE;
-  }
  bool IsStandalone() const { return im_flags & IM_IS_STANDALONE; }

  void SetInCache(bool in_cache) {
@ -178,30 +155,6 @@ struct LRUHandle {

  void SetHit() { m_flags |= M_HAS_HIT; }

-  void SetSecondaryCacheCompatible(bool compat) {
-    if (compat) {
-      im_flags |= IM_IS_SECONDARY_CACHE_COMPATIBLE;
-    } else {
-      im_flags &= ~IM_IS_SECONDARY_CACHE_COMPATIBLE;
-    }
-  }
-
-  void SetIsPending(bool pending) {
-    if (pending) {
-      im_flags |= IM_IS_PENDING;
-    } else {
-      im_flags &= ~IM_IS_PENDING;
-    }
-  }
-
-  void SetIsInSecondaryCache(bool is_in_secondary_cache) {
-    if (is_in_secondary_cache) {
-      im_flags |= IM_IS_IN_SECONDARY_CACHE;
-    } else {
-      im_flags &= ~IM_IS_IN_SECONDARY_CACHE;
-    }
-  }
-
  void SetIsStandalone(bool is_standalone) {
    if (is_standalone) {
      im_flags |= IM_IS_STANDALONE;
@ -210,22 +163,11 @@ struct LRUHandle {
    }
  }

-  void Free() {
+  void Free(MemoryAllocator* allocator) {
    assert(refs == 0);
-
-    if (!IsSecondaryCacheCompatible() && info_.deleter) {
-      (*info_.deleter)(key(), value);
-    } else if (IsSecondaryCacheCompatible()) {
-      if (IsPending()) {
-        assert(sec_handle != nullptr);
-        SecondaryCacheResultHandle* tmp_sec_handle = sec_handle;
-        tmp_sec_handle->Wait();
-        value = tmp_sec_handle->Value();
-        delete tmp_sec_handle;
-      }
-      if (value) {
-        (*info_.helper->del_cb)(key(), value);
-      }
+    assert(helper);
+    if (helper->del_cb) {
+      helper->del_cb(value, allocator);
    }

    free(this);
@ -267,7 +209,7 @@ struct LRUHandle {
 // 4.4.3's builtin hashtable.
 class LRUHandleTable {
 public:
-  explicit LRUHandleTable(int max_upper_hash_bits);
+  explicit LRUHandleTable(int max_upper_hash_bits, MemoryAllocator* allocator);
  ~LRUHandleTable();

  LRUHandle* Lookup(const Slice& key, uint32_t hash);
@ -291,6 +233,8 @@ class LRUHandleTable {

  size_t GetOccupancyCount() const { return elems_; }

+  MemoryAllocator* GetAllocator() const { return allocator_; }
+
 private:
  // Return a pointer to slot that points to a cache entry that
  // matches key/hash.  If there is no such cache entry, return a
@ -312,16 +256,22 @@ class LRUHandleTable {

  // Set from max_upper_hash_bits (see constructor).
  const int max_length_bits_;
+
+  // From Cache, needed for delete
+  MemoryAllocator* const allocator_;
 };

 // A single shard of sharded cache.
 class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
 public:
+  // NOTE: the eviction_callback ptr is saved, as it is assumed to be kept
+  // alive in Cache.
  LRUCacheShard(size_t capacity, bool strict_capacity_limit,
                double high_pri_pool_ratio, double low_pri_pool_ratio,
                bool use_adaptive_mutex,
                CacheMetadataChargePolicy metadata_charge_policy,
-                int max_upper_hash_bits, SecondaryCache* secondary_cache);
+                int max_upper_hash_bits, MemoryAllocator* allocator,
+                const Cache::EvictionCallback* eviction_callback);

 public:  // Type definitions expected as parameter to ShardedCache
  using HandleImpl = LRUHandle;
@ -329,8 +279,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
  using HashCref = uint32_t;

 public:  // Function definitions expected as parameter to ShardedCache
-  static inline HashVal ComputeHash(const Slice& key) {
-    return Lower32of64(GetSliceNPHash64(key));
+  static inline HashVal ComputeHash(const Slice& key, uint32_t seed) {
+    return Lower32of64(GetSliceNPHash64(key, seed));
  }

  // Separate from constructor so caller can easily make an array of LRUCache
@ -348,29 +298,21 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
  void SetLowPriorityPoolRatio(double low_pri_pool_ratio);

  // Like Cache methods, but with an extra "hash" parameter.
-  inline Status Insert(const Slice& key, uint32_t hash, void* value,
-                       size_t charge, Cache::DeleterFn deleter,
-                       LRUHandle** handle, Cache::Priority priority) {
-    return Insert(key, hash, value, charge, deleter, nullptr, handle, priority);
-  }
-  inline Status Insert(const Slice& key, uint32_t hash, void* value,
-                       const Cache::CacheItemHelper* helper, size_t charge,
-                       LRUHandle** handle, Cache::Priority priority) {
-    assert(helper);
-    return Insert(key, hash, value, charge, nullptr, helper, handle, priority);
-  }
-  // If helper_cb is null, the values of the following arguments don't matter.
+  Status Insert(const Slice& key, uint32_t hash, Cache::ObjectPtr value,
+                const Cache::CacheItemHelper* helper, size_t charge,
+                LRUHandle** handle, Cache::Priority priority);
+
+  LRUHandle* CreateStandalone(const Slice& key, uint32_t hash,
+                              Cache::ObjectPtr obj,
+                              const Cache::CacheItemHelper* helper,
+                              size_t charge, bool allow_uncharged);
+
  LRUHandle* Lookup(const Slice& key, uint32_t hash,
                    const Cache::CacheItemHelper* helper,
-                    const Cache::CreateCallback& create_cb,
-                    Cache::Priority priority, bool wait, Statistics* stats);
-  inline LRUHandle* Lookup(const Slice& key, uint32_t hash) {
-    return Lookup(key, hash, nullptr, nullptr, Cache::Priority::LOW, true,
-                  nullptr);
-  }
+                    Cache::CreateContext* create_context,
+                    Cache::Priority priority, Statistics* stats);
+
  bool Release(LRUHandle* handle, bool useful, bool erase_if_last_ref);
-  bool IsReady(LRUHandle* /*handle*/);
-  void Wait(LRUHandle* /*handle*/) {}
  bool Ref(LRUHandle* handle);
  void Erase(const Slice& key, uint32_t hash);

@ -384,8 +326,9 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
  size_t GetTableAddressCount() const;

  void ApplyToSomeEntries(
-      const std::function<void(const Slice& key, void* value, size_t charge,
-                               DeleterFn deleter)>& callback,
+      const std::function<void(const Slice& key, Cache::ObjectPtr value,
+                               size_t charge,
+                               const Cache::CacheItemHelper* helper)>& callback,
      size_t average_entries_per_lock, size_t* state);

  void EraseUnRefEntries();
@ -409,23 +352,10 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
 private:
  friend class LRUCache;
  // Insert an item into the hash table and, if handle is null, insert into
-  // the LRU list. Older items are evicted as necessary. If the cache is full
-  // and free_handle_on_fail is true, the item is deleted and handle is set to
-  // nullptr.
-  Status InsertItem(LRUHandle* item, LRUHandle** handle,
-                    bool free_handle_on_fail);
-  Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
-                DeleterFn deleter, const Cache::CacheItemHelper* helper,
-                LRUHandle** handle, Cache::Priority priority);
-  // Promote an item looked up from the secondary cache to the LRU cache.
-  // The item may be still in the secondary cache.
-  // It is only inserted into the hash table and not the LRU list, and only
-  // if the cache is not at full capacity, as is the case during Insert.  The
-  // caller should hold a reference on the LRUHandle. When the caller releases
-  // the last reference, the item is added to the LRU list.
-  // The item is promoted to the high pri or low pri pool as specified by the
-  // caller in Lookup.
-  void Promote(LRUHandle* e);
+  // the LRU list. Older items are evicted as necessary. Frees `item` on
+  // non-OK status.
+  Status InsertItem(LRUHandle* item, LRUHandle** handle);
+
  void LRU_Remove(LRUHandle* e);
  void LRU_Insert(LRUHandle* e);

@ -439,8 +369,11 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
  // holding the mutex_.
  void EvictFromLRU(size_t charge, autovector<LRUHandle*>* deleted);

-  // Try to insert the evicted handles into the secondary cache.
-  void TryInsertIntoSecondaryCache(autovector<LRUHandle*> evicted_handles);
+  void NotifyEvicted(const autovector<LRUHandle*>& evicted_handles);
+
+  LRUHandle* CreateHandle(const Slice& key, uint32_t hash,
+                          Cache::ObjectPtr value,
+                          const Cache::CacheItemHelper* helper, size_t charge);

  // Initialized before use.
  size_t capacity_;
@ -503,8 +436,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
  // don't mind mutex_ invoking the non-const actions.
  mutable DMutex mutex_;

-  // Owned by LRUCache
-  SecondaryCache* secondary_cache_;
+  // A reference to Cache::eviction_callback_
+  const Cache::EvictionCallback& eviction_callback_;
 };

 class LRUCache
@ -513,28 +446,16 @@ class LRUCache
 #endif
    : public ShardedCache<LRUCacheShard> {
 public:
-  LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
-           double high_pri_pool_ratio, double low_pri_pool_ratio,
-           std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
-           bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
-           CacheMetadataChargePolicy metadata_charge_policy =
-               kDontChargeCacheMetadata,
-           std::shared_ptr<SecondaryCache> secondary_cache = nullptr);
+  explicit LRUCache(const LRUCacheOptions& opts);
  const char* Name() const override { return "LRUCache"; }
-  void* Value(Handle* handle) override;
+  ObjectPtr Value(Handle* handle) override;
  size_t GetCharge(Handle* handle) const override;
-  DeleterFn GetDeleter(Handle* handle) const override;
-  void WaitAll(std::vector<Handle*>& handles) override;
+  const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override;

  // Retrieves number of elements in LRU, for unit test purpose only.
  size_t TEST_GetLRUSize();
  // Retrieves high pri pool ratio.
  double GetHighPriPoolRatio();
-
-  void AppendPrintableOptions(std::string& str) const override;
-
- private:
-  std::shared_ptr<SecondaryCache> secondary_cache_;
 };

 }  // namespace lru_cache
--- a/cache/lru_cache_test.cc
+++ b/cache/lru_cache_test.cc
--- a/cache/secondary_cache.cc
+++ b/cache/secondary_cache.cc
@ -11,20 +11,32 @@ namespace ROCKSDB_NAMESPACE {

 namespace {

-size_t SliceSize(void* obj) { return static_cast<Slice*>(obj)->size(); }
+void NoopDelete(Cache::ObjectPtr, MemoryAllocator*) {}

-Status SliceSaveTo(void* from_obj, size_t from_offset, size_t length,
-                   void* out) {
+size_t SliceSize(Cache::ObjectPtr obj) {
+  return static_cast<Slice*>(obj)->size();
+}
+
+Status SliceSaveTo(Cache::ObjectPtr from_obj, size_t from_offset, size_t length,
+                   char* out) {
  const Slice& slice = *static_cast<Slice*>(from_obj);
  std::memcpy(out, slice.data() + from_offset, length);
  return Status::OK();
 }

+Status FailCreate(const Slice&, Cache::CreateContext*, MemoryAllocator*,
+                  Cache::ObjectPtr*, size_t*) {
+  return Status::NotSupported("Only for dumping data into SecondaryCache");
+}
+
 }  // namespace

 Status SecondaryCache::InsertSaved(const Slice& key, const Slice& saved) {
+  static Cache::CacheItemHelper helper_no_secondary{CacheEntryRole::kMisc,
+                                                    &NoopDelete};
  static Cache::CacheItemHelper helper{
-      &SliceSize, &SliceSaveTo, GetNoopDeleterForRole<CacheEntryRole::kMisc>()};
+      CacheEntryRole::kMisc, &NoopDelete, &SliceSize,
+      &SliceSaveTo,          &FailCreate, &helper_no_secondary};
  // NOTE: depends on Insert() being synchronous, not keeping pointer `&saved`
  return Insert(key, const_cast<Slice*>(&saved), &helper);
 }
--- a/cache/secondary_cache_adapter.cc
+++ b/cache/secondary_cache_adapter.cc
@ -0,0 +1,433 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#include "cache/secondary_cache_adapter.h"
+
+#include "monitoring/perf_context_imp.h"
+#include "util/cast_util.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+namespace {
+// A distinct pointer value for marking "dummy" cache entries
+struct Dummy {
+  char val[7] = "kDummy";
+};
+const Dummy kDummy{};
+Cache::ObjectPtr const kDummyObj = const_cast<Dummy*>(&kDummy);
+}  // namespace
+
+// When CacheWithSecondaryAdapter is constructed with the distribute_cache_res
+// parameter set to true, it manages the entire memory budget across the
+// primary and secondary cache. The secondary cache is assumed to be in
+// memory, such as the CompressedSecondaryCache. When a placeholder entry
+// is inserted by a CacheReservationManager instance to reserve memory,
+// the CacheWithSecondaryAdapter ensures that the reservation is distributed
+// proportionally across the primary/secondary caches.
+//
+// The primary block cache is initially sized to the sum of the primary cache
+// budget + the secondary cache budget, as follows -
+//   |---------    Primary Cache Configured Capacity  -----------|
+//   |---Secondary Cache Budget----|----Primary Cache Budget-----|
+//
+// A ConcurrentCacheReservationManager member in the CacheWithSecondaryAdapter,
+// pri_cache_res_,
+// is used to help with tracking the distribution of memory reservations.
+// Initially, it accounts for the entire secondary cache budget as a
+// reservation against the primary cache. This shrinks the usable capacity of
+// the primary cache to the budget that the user originally desired.
+//
+//   |--Reservation for Sec Cache--|-Pri Cache Usable Capacity---|
+//
+// When a reservation placeholder is inserted into the adapter, it is inserted
+// directly into the primary cache. This means the entire charge of the
+// placeholder is counted against the primary cache. To compensate and count
+// a portion of it against the secondary cache, the secondary cache Deflate()
+// method is called to shrink it. Since the Deflate() causes the secondary
+// actual usage to shrink, it is reflected here by releasing an equal amount
+// from the pri_cache_res_ reservation. The Deflate() in the secondary cache
+// can be, but is not required to be, implemented using its own cache
+// reservation manager.
+//
+// For example, if the pri/sec ratio is 70/30, and the combined capacity is
+// 100MB, the intermediate and final  state after inserting a reservation
+// placeholder for 10MB would be as follows -
+//
+//   |-Reservation for Sec Cache-|-Pri Cache Usable Capacity-|---R---|
+// 1. After inserting the placeholder in primary
+//   |-------  30MB -------------|------- 60MB -------------|-10MB--|
+// 2. After deflating the secondary and adjusting the reservation for
+//    secondary against the primary
+//   |-------  27MB -------------|------- 63MB -------------|-10MB--|
+//
+// Likewise, when the user inserted placeholder is released, the secondary
+// cache Inflate() method is called to grow it, and the pri_cache_res_
+// reservation is increased by an equal amount.
+//
+// Another way of implementing this would have been to simply split the user
+// reservation into primary and secondary components. However, this would
+// require allocating a structure to track the associated secondary cache
+// reservation, which adds some complexity and overhead.
+//
+CacheWithSecondaryAdapter::CacheWithSecondaryAdapter(
+    std::shared_ptr<Cache> target,
+    std::shared_ptr<SecondaryCache> secondary_cache, bool distribute_cache_res)
+    : CacheWrapper(std::move(target)),
+      secondary_cache_(std::move(secondary_cache)),
+      distribute_cache_res_(distribute_cache_res) {
+  // Route evictions from the wrapped cache through EvictionHandler(). The
+  // lambda captures `this`; the destructor resets the callback.
+  target_->SetEvictionCallback([this](const Slice& key, Handle* handle) {
+    return EvictionHandler(key, handle);
+  });
+  if (distribute_cache_res_) {
+    size_t sec_capacity = 0;
+    pri_cache_res_ = std::make_shared<ConcurrentCacheReservationManager>(
+        std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
+            target_));
+    Status s = secondary_cache_->GetCapacity(sec_capacity);
+    assert(s.ok());
+    // Initially, the primary cache is sized to uncompressed cache budget plus
+    // compressed secondary cache budget. The secondary cache budget is then
+    // taken away from the primary cache through cache reservations. Later,
+    // when a placeholder entry is inserted by the caller, it's inserted
+    // into the primary cache and the portion that should be assigned to the
+    // secondary cache is freed from the reservation.
+    s = pri_cache_res_->UpdateCacheReservation(sec_capacity);
+    assert(s.ok());
+    // Fraction of the wrapped cache's capacity that belongs to the secondary
+    // cache; used to split placeholder charges in Insert() and Release().
+    // NOTE(review): divides by target_->GetCapacity() without a zero check --
+    // confirm callers never wrap a zero-capacity cache in this mode.
+    sec_cache_res_ratio_ = (double)sec_capacity / target_->GetCapacity();
+  }
+}
+
+// Resets the eviction callback on the wrapped cache and, in debug builds with
+// reservation distribution enabled, checks that the reservation held against
+// the primary cache equals the secondary cache's capacity.
+CacheWithSecondaryAdapter::~CacheWithSecondaryAdapter() {
+  // `*this` will be destroyed before `*target_`, so we have to prevent
+  // use after free
+  target_->SetEvictionCallback({});
+#ifndef NDEBUG
+  if (distribute_cache_res_) {
+    size_t sec_capacity = 0;
+    Status s = secondary_cache_->GetCapacity(sec_capacity);
+    assert(s.ok());
+    assert(pri_cache_res_->GetTotalReservedCacheSize() == sec_capacity);
+  }
+#endif  // NDEBUG
+}
+
+// Eviction callback installed on the wrapped cache by the constructor.
+// Offers the evicted entry's object to the secondary cache when the entry's
+// helper is secondary-cache compatible and the object is not the dummy
+// marker. The status of that insertion is deliberately ignored.
+// Always returns false.
+bool CacheWithSecondaryAdapter::EvictionHandler(const Slice& key,
+                                                Handle* handle) {
+  auto helper = GetCacheItemHelper(handle);
+  if (helper->IsSecondaryCacheCompatible()) {
+    auto obj = target_->Value(handle);
+    // Ignore dummy entry
+    if (obj != kDummyObj) {
+      // Spill into secondary cache.
+      secondary_cache_->Insert(key, obj, helper).PermitUncheckedError();
+    }
+  }
+  // Never takes ownership of obj
+  return false;
+}
+
+// If `*handle` is non-null and refers to a dummy entry (its value is
+// kDummyObj), releases it on the wrapped cache (passing `erase` as the
+// erase-if-last-ref flag), sets `*handle` to nullptr and returns true.
+// Otherwise leaves `*handle` untouched and returns false.
+bool CacheWithSecondaryAdapter::ProcessDummyResult(Cache::Handle** handle,
+                                                   bool erase) {
+  if (*handle && target_->Value(*handle) == kDummyObj) {
+    target_->Release(*handle, erase);
+    *handle = nullptr;
+    return true;
+  } else {
+    return false;
+  }
+}
+
+// Invokes the helper's delete callback on `obj`, passing this cache's memory
+// allocator. Does nothing if the helper has no delete callback.
+void CacheWithSecondaryAdapter::CleanupCacheObject(
+    ObjectPtr obj, const CacheItemHelper* helper) {
+  if (helper->del_cb) {
+    helper->del_cb(obj, memory_allocator());
+  }
+}
+
+// Turns a ready secondary cache lookup result into a primary cache handle.
+//
+// Requires secondary_handle->IsReady() (asserted). Returns nullptr when the
+// secondary handle carries no object. Otherwise records hit statistics and
+// then either
+//  * creates a standalone handle and inserts a zero-charge dummy entry under
+//    `key`, when the secondary cache supports force erase and the caller did
+//    not find a dummy entry; or
+//  * inserts the object as a regular entry, falling back to a standalone
+//    handle if that insertion fails.
+Cache::Handle* CacheWithSecondaryAdapter::Promote(
+    std::unique_ptr<SecondaryCacheResultHandle>&& secondary_handle,
+    const Slice& key, const CacheItemHelper* helper, Priority priority,
+    Statistics* stats, bool found_dummy_entry, bool kept_in_sec_cache) {
+  assert(secondary_handle->IsReady());
+
+  ObjectPtr obj = secondary_handle->Value();
+  if (!obj) {
+    // Nothing found.
+    return nullptr;
+  }
+  // Found something.
+  // Per-role hit counters; roles other than filter/index/data only count
+  // toward the aggregate counters below.
+  switch (helper->role) {
+    case CacheEntryRole::kFilterBlock:
+      RecordTick(stats, SECONDARY_CACHE_FILTER_HITS);
+      break;
+    case CacheEntryRole::kIndexBlock:
+      RecordTick(stats, SECONDARY_CACHE_INDEX_HITS);
+      break;
+    case CacheEntryRole::kDataBlock:
+      RecordTick(stats, SECONDARY_CACHE_DATA_HITS);
+      break;
+    default:
+      break;
+  }
+  PERF_COUNTER_ADD(secondary_cache_hit_count, 1);
+  RecordTick(stats, SECONDARY_CACHE_HITS);
+
+  // Note: SecondaryCache::Size() is really charge (from the CreateCallback)
+  size_t charge = secondary_handle->Size();
+  Handle* result = nullptr;
+  // Insert into primary cache, possibly as a standalone+dummy entries.
+  if (secondary_cache_->SupportForceErase() && !found_dummy_entry) {
+    // Create standalone and insert dummy
+    // Allow standalone to be created even if cache is full, to avoid
+    // reading the entry from storage.
+    result =
+        CreateStandalone(key, obj, helper, charge, /*allow_uncharged*/ true);
+    assert(result);
+    PERF_COUNTER_ADD(block_cache_standalone_handle_count, 1);
+
+    // Insert dummy to record recent use
+    // TODO: try to avoid case where inserting this dummy could overwrite a
+    // regular entry
+    Status s = Insert(key, kDummyObj, &kNoopCacheItemHelper, /*charge=*/0,
+                      /*handle=*/nullptr, priority);
+    s.PermitUncheckedError();
+    // Nothing to do or clean up on dummy insertion failure
+  } else {
+    // Insert regular entry into primary cache.
+    // Don't allow it to spill into secondary cache again if it was kept there.
+    Status s = Insert(
+        key, obj, kept_in_sec_cache ? helper->without_secondary_compat : helper,
+        charge, &result, priority);
+    if (s.ok()) {
+      assert(result);
+      PERF_COUNTER_ADD(block_cache_real_handle_count, 1);
+    } else {
+      // Create standalone result instead, even if cache is full, to avoid
+      // reading the entry from storage.
+      result =
+          CreateStandalone(key, obj, helper, charge, /*allow_uncharged*/ true);
+      assert(result);
+      PERF_COUNTER_ADD(block_cache_standalone_handle_count, 1);
+    }
+  }
+  return result;
+}
+
+// Inserts into the wrapped cache. When the insertion succeeds, `value` is
+// null (a reservation placeholder) and reservation distribution is enabled,
+// the secondary cache's share of `charge` (charge * sec_cache_res_ratio_) is
+// taken from the secondary cache via Deflate() and the same amount is
+// released from the pri_cache_res_ reservation.
+// Returns the status of the last operation performed.
+Status CacheWithSecondaryAdapter::Insert(const Slice& key, ObjectPtr value,
+                                         const CacheItemHelper* helper,
+                                         size_t charge, Handle** handle,
+                                         Priority priority) {
+  Status s = target_->Insert(key, value, helper, charge, handle, priority);
+  if (s.ok() && value == nullptr && distribute_cache_res_) {
+    size_t sec_charge = static_cast<size_t>(charge * (sec_cache_res_ratio_));
+    s = secondary_cache_->Deflate(sec_charge);
+    assert(s.ok());
+    s = pri_cache_res_->UpdateCacheReservation(sec_charge, /*increase=*/false);
+    assert(s.ok());
+  }
+
+  return s;
+}
+
+// Synchronous lookup: consults the wrapped cache first. A dummy entry found
+// there is released and treated as a miss. On a miss with a
+// secondary-cache-compatible helper, the secondary cache is queried (waiting
+// for the result) and any hit is promoted via Promote().
+// Returns the resulting handle, or nullptr if nothing was found.
+Cache::Handle* CacheWithSecondaryAdapter::Lookup(const Slice& key,
+                                                 const CacheItemHelper* helper,
+                                                 CreateContext* create_context,
+                                                 Priority priority,
+                                                 Statistics* stats) {
+  // NOTE: we could just StartAsyncLookup() and Wait(), but this should be a bit
+  // more efficient
+  Handle* result =
+      target_->Lookup(key, helper, create_context, priority, stats);
+  bool secondary_compatible = helper && helper->IsSecondaryCacheCompatible();
+  // A dummy result is released and nulled here; it is erased only when the
+  // lookup is going to continue into the secondary cache.
+  bool found_dummy_entry =
+      ProcessDummyResult(&result, /*erase=*/secondary_compatible);
+  if (!result && secondary_compatible) {
+    // Try our secondary cache
+    bool kept_in_sec_cache = false;
+    std::unique_ptr<SecondaryCacheResultHandle> secondary_handle =
+        secondary_cache_->Lookup(key, helper, create_context, /*wait*/ true,
+                                 found_dummy_entry, /*out*/ kept_in_sec_cache);
+    if (secondary_handle) {
+      result = Promote(std::move(secondary_handle), key, helper, priority,
+                       stats, found_dummy_entry, kept_in_sec_cache);
+    }
+  }
+  return result;
+}
+
+// Releases `handle` on the wrapped cache and returns its result. Before
+// doing so, when `erase_if_last_ref` is set, the handle's value is null (a
+// reservation placeholder) and reservation distribution is enabled, the
+// secondary cache's share of the handle's charge is given back via Inflate()
+// and the pri_cache_res_ reservation is increased by the same amount.
+// NOTE(review): the adjustment runs whenever erase_if_last_ref is true, not
+// only when this is actually the last reference -- confirm that placeholder
+// handles are never shared.
+bool CacheWithSecondaryAdapter::Release(Handle* handle,
+                                        bool erase_if_last_ref) {
+  if (erase_if_last_ref) {
+    ObjectPtr v = target_->Value(handle);
+    if (v == nullptr && distribute_cache_res_) {
+      size_t charge = target_->GetCharge(handle);
+      size_t sec_charge = static_cast<size_t>(charge * (sec_cache_res_ratio_));
+      Status s = secondary_cache_->Inflate(sec_charge);
+      assert(s.ok());
+      s = pri_cache_res_->UpdateCacheReservation(sec_charge, /*increase=*/true);
+      assert(s.ok());
+    }
+  }
+  return target_->Release(handle, erase_if_last_ref);
+}
+
+// Returns the wrapped cache's value for `handle`. Asserts (debug builds
+// only) that the value is not the internal dummy marker.
+Cache::ObjectPtr CacheWithSecondaryAdapter::Value(Handle* handle) {
+  ObjectPtr v = target_->Value(handle);
+  // TODO with stacked secondaries: might fail in EvictionHandler
+  assert(v != kDummyObj);
+  return v;
+}
+
+// Starts a non-waiting lookup on this adapter's secondary cache for a
+// request that is neither pending nor resolved (both asserted). If the
+// secondary cache returns a handle, ownership of it is transferred to
+// `async_handle.pending_handle` and `pending_cache` is set to this adapter's
+// secondary cache.
+void CacheWithSecondaryAdapter::StartAsyncLookupOnMySecondary(
+    AsyncLookupHandle& async_handle) {
+  assert(!async_handle.IsPending());
+  assert(async_handle.result_handle == nullptr);
+
+  std::unique_ptr<SecondaryCacheResultHandle> secondary_handle =
+      secondary_cache_->Lookup(async_handle.key, async_handle.helper,
+                               async_handle.create_context, /*wait*/ false,
+                               async_handle.found_dummy_entry,
+                               /*out*/ async_handle.kept_in_sec_cache);
+  if (secondary_handle) {
+    // TODO with stacked secondaries: Check & process if already ready?
+    // Raw pointer handed over here is re-wrapped in a unique_ptr by WaitAll().
+    async_handle.pending_handle = secondary_handle.release();
+    async_handle.pending_cache = secondary_cache_.get();
+  }
+}
+
+// Starts an asynchronous lookup: delegates to the wrapped cache first. If
+// that leaves the request not pending, any dummy result is released and
+// recorded in `found_dummy_entry`; if there is still no result and the
+// helper is secondary-cache compatible, a lookup is started on this
+// adapter's secondary cache.
+void CacheWithSecondaryAdapter::StartAsyncLookup(
+    AsyncLookupHandle& async_handle) {
+  target_->StartAsyncLookup(async_handle);
+  if (!async_handle.IsPending()) {
+    bool secondary_compatible =
+        async_handle.helper &&
+        async_handle.helper->IsSecondaryCacheCompatible();
+    async_handle.found_dummy_entry |= ProcessDummyResult(
+        &async_handle.result_handle, /*erase=*/secondary_compatible);
+
+    if (async_handle.Result() == nullptr && secondary_compatible) {
+      // Not found and not pending on another secondary cache
+      StartAsyncLookupOnMySecondary(async_handle);
+    }
+  }
+}
+
+// Completes a batch of asynchronous lookups.
+//
+// Steps, in order:
+//  1. Split pending requests into those waiting on this adapter's secondary
+//     cache and those waiting on some other ("inner") cache.
+//  2. If any inner requests exist, let the wrapped cache finish them.
+//  3. For inner requests that produced no result, start a lookup on this
+//     adapter's secondary cache.
+//  4. Wait on all lookups pending on this adapter's secondary cache.
+//  5. Promote each of those results and store it in `result_handle`.
+void CacheWithSecondaryAdapter::WaitAll(AsyncLookupHandle* async_handles,
+                                        size_t count) {
+  if (count == 0) {
+    // Nothing to do
+    return;
+  }
+  // Requests that are pending on *my* secondary cache, at the start of this
+  // function
+  std::vector<AsyncLookupHandle*> my_pending;
+  // Requests that are pending on an "inner" secondary cache (managed somewhere
+  // under target_), as of the start of this function
+  std::vector<AsyncLookupHandle*> inner_pending;
+
+  // Initial accounting of pending handles, excluding those already handled
+  // by "outer" secondary caches. (See cur->pending_cache = nullptr.)
+  for (size_t i = 0; i < count; ++i) {
+    AsyncLookupHandle* cur = async_handles + i;
+    if (cur->pending_cache) {
+      assert(cur->IsPending());
+      assert(cur->helper);
+      assert(cur->helper->IsSecondaryCacheCompatible());
+      if (cur->pending_cache == secondary_cache_.get()) {
+        my_pending.push_back(cur);
+        // Mark as "to be handled by this caller"
+        cur->pending_cache = nullptr;
+      } else {
+        // Remember as potentially needing a lookup in my secondary
+        inner_pending.push_back(cur);
+      }
+    }
+  }
+
+  // Wait on inner-most cache lookups first
+  // TODO with stacked secondaries: because we are not using proper
+  // async/await constructs here yet, there is a false synchronization point
+  // here where all the results at one level are needed before initiating
+  // any lookups at the next level. Probably not a big deal, but worth noting.
+  if (!inner_pending.empty()) {
+    target_->WaitAll(async_handles, count);
+  }
+
+  // For those that failed to find something, convert to lookup in my
+  // secondary cache.
+  for (AsyncLookupHandle* cur : inner_pending) {
+    if (cur->Result() == nullptr) {
+      // Not found, try my secondary
+      StartAsyncLookupOnMySecondary(*cur);
+      if (cur->IsPending()) {
+        assert(cur->pending_cache == secondary_cache_.get());
+        my_pending.push_back(cur);
+        // Mark as "to be handled by this caller"
+        cur->pending_cache = nullptr;
+      }
+    }
+  }
+
+  // Wait on all lookups on my secondary cache
+  {
+    std::vector<SecondaryCacheResultHandle*> my_secondary_handles;
+    for (AsyncLookupHandle* cur : my_pending) {
+      my_secondary_handles.push_back(cur->pending_handle);
+    }
+    secondary_cache_->WaitAll(std::move(my_secondary_handles));
+  }
+
+  // Process results
+  for (AsyncLookupHandle* cur : my_pending) {
+    // Re-take ownership of the raw handle stored by
+    // StartAsyncLookupOnMySecondary() so it is destroyed after promotion.
+    std::unique_ptr<SecondaryCacheResultHandle> secondary_handle(
+        cur->pending_handle);
+    cur->pending_handle = nullptr;
+    cur->result_handle = Promote(
+        std::move(secondary_handle), cur->key, cur->helper, cur->priority,
+        cur->stats, cur->found_dummy_entry, cur->kept_in_sec_cache);
+    assert(cur->pending_cache == nullptr);
+  }
+}
+
+// Returns the wrapped cache's printable options followed by a
+// "  secondary_cache:" line and the secondary cache's printable options.
+std::string CacheWithSecondaryAdapter::GetPrintableOptions() const {
+  std::string str = target_->GetPrintableOptions();
+  str.append("  secondary_cache:\n");
+  str.append(secondary_cache_->GetPrintableOptions());
+  return str;
+}
+
+// Reports the wrapped cache's name rather than a name of its own.
+const char* CacheWithSecondaryAdapter::Name() const {
+  // To the user, at least for now, configure the underlying cache with
+  // a secondary cache. So we pretend to be that cache
+  return target_->Name();
+}
+
+// Builds a tiered cache: a primary block cache (LRU or HyperClock) wrapped in
+// a CacheWithSecondaryAdapter over a compressed secondary cache, with
+// reservation distribution enabled.
+//
+// The primary cache is created with its own budget plus the compressed
+// secondary cache budget, because the adapter's constructor reserves the
+// whole secondary capacity against the primary cache when distribution is
+// enabled. This sizing is applied to both supported primary cache types.
+//
+// Returns nullptr if `opts.cache_opts` is null, the cache type is not
+// recognized, or either underlying cache could not be created.
+std::shared_ptr<Cache> NewTieredVolatileCache(
+    TieredVolatileCacheOptions& opts) {
+  if (!opts.cache_opts) {
+    return nullptr;
+  }
+
+  std::shared_ptr<Cache> cache;
+  if (opts.cache_type == PrimaryCacheType::kCacheTypeLRU) {
+    LRUCacheOptions cache_opts =
+        *(static_cast_with_check<LRUCacheOptions, ShardedCacheOptions>(
+            opts.cache_opts));
+    cache_opts.capacity += opts.comp_cache_opts.capacity;
+    cache = cache_opts.MakeSharedCache();
+  } else if (opts.cache_type == PrimaryCacheType::kCacheTypeHCC) {
+    HyperClockCacheOptions cache_opts =
+        *(static_cast_with_check<HyperClockCacheOptions, ShardedCacheOptions>(
+            opts.cache_opts));
+    // Same sizing as the LRU branch: without this, the adapter would carve
+    // the secondary budget out of the primary budget alone.
+    cache_opts.capacity += opts.comp_cache_opts.capacity;
+    cache = cache_opts.MakeSharedCache();
+  } else {
+    return nullptr;
+  }
+  // The adapter's constructor dereferences both caches unconditionally, so
+  // bail out here if either could not be constructed.
+  if (!cache) {
+    return nullptr;
+  }
+  std::shared_ptr<SecondaryCache> sec_cache =
+      NewCompressedSecondaryCache(opts.comp_cache_opts);
+  if (!sec_cache) {
+    return nullptr;
+  }
+
+  return std::make_shared<CacheWithSecondaryAdapter>(cache, sec_cache, true);
+}
+}  // namespace ROCKSDB_NAMESPACE
--- a/cache/secondary_cache_adapter.h
+++ b/cache/secondary_cache_adapter.h
@ -0,0 +1,76 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include "cache/cache_reservation_manager.h"
+#include "rocksdb/secondary_cache.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// Cache wrapper that combines a primary Cache (`target`) with a
+// SecondaryCache behind the Cache interface. Lookups that miss in the
+// primary cache may be served from the secondary cache, and entries evicted
+// from the primary cache may be offered to the secondary cache.
+//
+// With `distribute_cache_res` set, reservation placeholders (null value,
+// non-zero charge) inserted through this wrapper are split proportionally
+// between the primary and secondary caches.
+class CacheWithSecondaryAdapter : public CacheWrapper {
+ public:
+  explicit CacheWithSecondaryAdapter(
+      std::shared_ptr<Cache> target,
+      std::shared_ptr<SecondaryCache> secondary_cache,
+      bool distribute_cache_res = false);
+
+  ~CacheWithSecondaryAdapter() override;
+
+  Status Insert(const Slice& key, ObjectPtr value,
+                const CacheItemHelper* helper, size_t charge,
+                Handle** handle = nullptr,
+                Priority priority = Priority::LOW) override;
+
+  Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
+                 CreateContext* create_context,
+                 Priority priority = Priority::LOW,
+                 Statistics* stats = nullptr) override;
+
+  using Cache::Release;
+  bool Release(Handle* handle, bool erase_if_last_ref = false) override;
+
+  ObjectPtr Value(Handle* handle) override;
+
+  void StartAsyncLookup(AsyncLookupHandle& async_handle) override;
+
+  void WaitAll(AsyncLookupHandle* async_handles, size_t count) override;
+
+  std::string GetPrintableOptions() const override;
+
+  // Reports the wrapped cache's name.
+  const char* Name() const override;
+
+  // Test-only accessors for the wrapped primary and secondary caches.
+  Cache* TEST_GetCache() { return target_.get(); }
+
+  SecondaryCache* TEST_GetSecondaryCache() { return secondary_cache_.get(); }
+
+ private:
+  // Callback registered with the primary cache for evicted entries.
+  bool EvictionHandler(const Slice& key, Handle* handle);
+
+  void StartAsyncLookupOnMySecondary(AsyncLookupHandle& async_handle);
+
+  // Converts a completed secondary cache result into a primary cache handle.
+  Handle* Promote(
+      std::unique_ptr<SecondaryCacheResultHandle>&& secondary_handle,
+      const Slice& key, const CacheItemHelper* helper, Priority priority,
+      Statistics* stats, bool found_dummy_entry, bool kept_in_sec_cache);
+
+  // Releases and clears `*handle` if it refers to an internal dummy entry;
+  // returns whether it did.
+  bool ProcessDummyResult(Cache::Handle** handle, bool erase);
+
+  void CleanupCacheObject(ObjectPtr obj, const CacheItemHelper* helper);
+
+  std::shared_ptr<SecondaryCache> secondary_cache_;
+  // Whether to proportionally distribute cache memory reservations, i.e
+  // placeholder entries with null value and a non-zero charge, across
+  // the primary and secondary caches.
+  bool distribute_cache_res_;
+  // A cache reservation manager to keep track of secondary cache memory
+  // usage by reserving equivalent capacity against the primary cache
+  std::shared_ptr<ConcurrentCacheReservationManager> pri_cache_res_;
+  // Fraction of a cache memory reservation to be assigned to the secondary
+  // cache. Only assigned by the constructor when distribute_cache_res_ is
+  // true, so it is zero-initialized here to never hold an indeterminate
+  // value.
+  double sec_cache_res_ratio_ = 0.0;
+};
+
+}  // namespace ROCKSDB_NAMESPACE
--- a/cache/sharded_cache.cc
+++ b/cache/sharded_cache.cc
@ -13,20 +13,57 @@
 #include <cstdint>
 #include <memory>

+#include "env/unique_id_gen.h"
+#include "rocksdb/env.h"
 #include "util/hash.h"
 #include "util/math.h"
 #include "util/mutexlock.h"

 namespace ROCKSDB_NAMESPACE {
+namespace {
+// The generated seeds must fit in 31 bits so that
+// ShardedCacheOptions::hash_seed can be set to it explicitly, for
+// diagnostic/debugging purposes.
+constexpr uint32_t kSeedMask = 0x7fffffff;
+// Maps the ShardedCacheOptions::hash_seed setting to the seed actually used.
+//  * A non-negative option value is used as the seed unchanged.
+//  * kHostHashSeed yields a hash of the host name masked to 31 bits, or, if
+//    the host name cannot be obtained, a masked value taken from a
+//    function-local static generator.
+//  * Any other value yields the generator's next value, scrambled and
+//    masked to 31 bits.
+uint32_t DetermineSeed(int32_t hash_seed_option) {
+  if (hash_seed_option >= 0) {
+    // User-specified exact seed
+    return static_cast<uint32_t>(hash_seed_option);
+  }
+  // Shared by all calls; function-local static.
+  static SemiStructuredUniqueIdGen gen;
+  if (hash_seed_option == ShardedCacheOptions::kHostHashSeed) {
+    std::string hostname;
+    Status s = Env::Default()->GetHostNameString(&hostname);
+    if (s.ok()) {
+      return GetSliceHash(hostname) & kSeedMask;
+    } else {
+      // Fall back on something stable within the process.
+      return BitwiseAnd(gen.GetBaseUpper(), kSeedMask);
+    }
+  } else {
+    // for kQuasiRandomHashSeed and fallback
+    uint32_t val = gen.GenerateNext<uint32_t>() & kSeedMask;
+    // Perform some 31-bit bijective transformations so that we get
+    // quasirandom, not just incrementing. (An incrementing seed from a
+    // random starting point would be fine, but hard to describe in a name.)
+    // See https://en.wikipedia.org/wiki/Quasirandom and using a murmur-like
+    // transformation here for our bijection in the lower 31 bits.
+    // See https://en.wikipedia.org/wiki/MurmurHash
+    val *= /*31-bit prime*/ 1150630961;
+    val ^= (val & kSeedMask) >> 17;
+    val *= /*31-bit prime*/ 1320603883;
+    return val & kSeedMask;
+  }
+}
+}  // namespace

-ShardedCacheBase::ShardedCacheBase(size_t capacity, int num_shard_bits,
-                                   bool strict_capacity_limit,
-                                   std::shared_ptr<MemoryAllocator> allocator)
-    : Cache(std::move(allocator)),
+ShardedCacheBase::ShardedCacheBase(const ShardedCacheOptions& opts)
+    : Cache(opts.memory_allocator),
      last_id_(1),
-      shard_mask_((uint32_t{1} << num_shard_bits) - 1),
-      strict_capacity_limit_(strict_capacity_limit),
-      capacity_(capacity) {}
+      shard_mask_((uint32_t{1} << opts.num_shard_bits) - 1),
+      hash_seed_(DetermineSeed(opts.hash_seed)),
+      strict_capacity_limit_(opts.strict_capacity_limit),
+      capacity_(opts.capacity) {}

 size_t ShardedCacheBase::ComputePerShardCapacity(size_t capacity) const {
  uint32_t num_shards = GetNumShards();
--- a/cache/sharded_cache.h
+++ b/cache/sharded_cache.h
@ -15,7 +15,7 @@

 #include "port/lang.h"
 #include "port/port.h"
-#include "rocksdb/cache.h"
+#include "rocksdb/advanced_cache.h"
 #include "util/hash.h"
 #include "util/mutexlock.h"

@ -34,8 +34,8 @@ class CacheShardBase {
  std::string GetPrintableOptions() const { return ""; }
  using HashVal = uint64_t;
  using HashCref = uint64_t;
-  static inline HashVal ComputeHash(const Slice& key) {
-    return GetSliceNPHash64(key);
+  static inline HashVal ComputeHash(const Slice& key, uint32_t seed) {
+    return GetSliceNPHash64(key, seed);
  }
  static inline uint32_t HashPieceForSharding(HashCref hash) {
    return Lower32of64(hash);
@ -49,21 +49,19 @@ class CacheShardBase {
    HashCref GetHash() const;
    ...
  };
-  Status Insert(const Slice& key, HashCref hash, void* value, size_t charge,
-                DeleterFn deleter, HandleImpl** handle,
-                Cache::Priority priority) = 0;
-  Status Insert(const Slice& key, HashCref hash, void* value,
+  Status Insert(const Slice& key, HashCref hash, Cache::ObjectPtr value,
                const Cache::CacheItemHelper* helper, size_t charge,
-                HandleImpl** handle, Cache::Priority priority) = 0;
-  HandleImpl* Lookup(const Slice& key, HashCref hash) = 0;
+                HandleImpl** handle, Cache::Priority priority,
+                bool standalone) = 0;
+  HandleImpl* CreateStandalone(const Slice& key, HashCref hash,
+                               ObjectPtr obj, const CacheItemHelper* helper,
+                               size_t charge, bool allow_uncharged) = 0;
  HandleImpl* Lookup(const Slice& key, HashCref hash,
                        const Cache::CacheItemHelper* helper,
-                        const Cache::CreateCallback& create_cb,
-                        Cache::Priority priority, bool wait,
+                        Cache::CreateContext* create_context,
+                        Cache::Priority priority,
                        Statistics* stats) = 0;
  bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref) = 0;
-  bool IsReady(HandleImpl* handle) = 0;
-  void Wait(HandleImpl* handle) = 0;
  bool Ref(HandleImpl* handle) = 0;
  void Erase(const Slice& key, HashCref hash) = 0;
  void SetCapacity(size_t capacity) = 0;
@ -77,8 +75,9 @@ class CacheShardBase {
  // *state == 0 and implementation sets *state = SIZE_MAX to indicate
  // completion.
  void ApplyToSomeEntries(
-      const std::function<void(const Slice& key, void* value, size_t charge,
-                               DeleterFn deleter)>& callback,
+      const std::function<void(const Slice& key, ObjectPtr value,
+                               size_t charge,
+                               const Cache::CacheItemHelper* helper)>& callback,
      size_t average_entries_per_lock, size_t* state) = 0;
  void EraseUnRefEntries() = 0;
  */
@ -90,9 +89,7 @@ class CacheShardBase {
 // Portions of ShardedCache that do not depend on the template parameter
 class ShardedCacheBase : public Cache {
 public:
-  ShardedCacheBase(size_t capacity, int num_shard_bits,
-                   bool strict_capacity_limit,
-                   std::shared_ptr<MemoryAllocator> memory_allocator);
+  explicit ShardedCacheBase(const ShardedCacheOptions& opts);
  virtual ~ShardedCacheBase() = default;

  int GetNumShardBits() const;
@ -107,6 +104,8 @@ class ShardedCacheBase : public Cache {
  size_t GetUsage(Handle* handle) const override;
  std::string GetPrintableOptions() const override;

+  uint32_t GetHashSeed() const override { return hash_seed_; }
+
 protected:  // fns
  virtual void AppendPrintableOptions(std::string& str) const = 0;
  size_t GetPerShardCapacity() const;
@ -115,6 +114,7 @@ class ShardedCacheBase : public Cache {
 protected:                        // data
  std::atomic<uint64_t> last_id_;  // For NewId
  const uint32_t shard_mask_;
+  const uint32_t hash_seed_;

  // Dynamic configuration parameters, guarded by config_mutex_
  bool strict_capacity_limit_;
@ -135,10 +135,8 @@ class ShardedCache : public ShardedCacheBase {
  using HashCref = typename CacheShard::HashCref;
  using HandleImpl = typename CacheShard::HandleImpl;

-  ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
-               std::shared_ptr<MemoryAllocator> allocator)
-      : ShardedCacheBase(capacity, num_shard_bits, strict_capacity_limit,
-                         allocator),
+  explicit ShardedCache(const ShardedCacheOptions& opts)
+      : ShardedCacheBase(opts),
        shards_(reinterpret_cast<CacheShard*>(port::cacheline_aligned_alloc(
            sizeof(CacheShard) * GetNumShards()))),
        destroy_shards_in_dtor_(false) {}
@ -172,41 +170,38 @@ class ShardedCache : public ShardedCacheBase {
        [s_c_l](CacheShard* cs) { cs->SetStrictCapacityLimit(s_c_l); });
  }

-  Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter,
-                Handle** handle, Priority priority) override {
-    HashVal hash = CacheShard::ComputeHash(key);
-    auto h_out = reinterpret_cast<HandleImpl**>(handle);
-    return GetShard(hash).Insert(key, hash, value, charge, deleter, h_out,
-                                 priority);
-  }
-  Status Insert(const Slice& key, void* value, const CacheItemHelper* helper,
+  Status Insert(const Slice& key, ObjectPtr obj, const CacheItemHelper* helper,
                size_t charge, Handle** handle = nullptr,
                Priority priority = Priority::LOW) override {
-    if (!helper) {
-      return Status::InvalidArgument();
-    }
-    HashVal hash = CacheShard::ComputeHash(key);
+    assert(helper);
+    HashVal hash = CacheShard::ComputeHash(key, hash_seed_);
    auto h_out = reinterpret_cast<HandleImpl**>(handle);
-    return GetShard(hash).Insert(key, hash, value, helper, charge, h_out,
+    return GetShard(hash).Insert(key, hash, obj, helper, charge, h_out,
                                 priority);
  }

-  Handle* Lookup(const Slice& key, Statistics* /*stats*/) override {
-    HashVal hash = CacheShard::ComputeHash(key);
-    HandleImpl* result = GetShard(hash).Lookup(key, hash);
+  Handle* CreateStandalone(const Slice& key, ObjectPtr obj,
+                           const CacheItemHelper* helper, size_t charge,
+                           bool allow_uncharged) override {
+    assert(helper);
+    HashVal hash = CacheShard::ComputeHash(key, hash_seed_);
+    HandleImpl* result = GetShard(hash).CreateStandalone(
+        key, hash, obj, helper, charge, allow_uncharged);
    return reinterpret_cast<Handle*>(result);
  }
-  Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
-                 const CreateCallback& create_cb, Priority priority, bool wait,
+
+  Handle* Lookup(const Slice& key, const CacheItemHelper* helper = nullptr,
+                 CreateContext* create_context = nullptr,
+                 Priority priority = Priority::LOW,
                 Statistics* stats = nullptr) override {
-    HashVal hash = CacheShard::ComputeHash(key);
-    HandleImpl* result = GetShard(hash).Lookup(key, hash, helper, create_cb,
-                                               priority, wait, stats);
+    HashVal hash = CacheShard::ComputeHash(key, hash_seed_);
+    HandleImpl* result = GetShard(hash).Lookup(key, hash, helper,
+                                               create_context, priority, stats);
    return reinterpret_cast<Handle*>(result);
  }

  void Erase(const Slice& key) override {
-    HashVal hash = CacheShard::ComputeHash(key);
+    HashVal hash = CacheShard::ComputeHash(key, hash_seed_);
    GetShard(hash).Erase(key, hash);
  }

@ -215,14 +210,6 @@ class ShardedCache : public ShardedCacheBase {
    auto h = reinterpret_cast<HandleImpl*>(handle);
    return GetShard(h->GetHash()).Release(h, useful, erase_if_last_ref);
  }
-  bool IsReady(Handle* handle) override {
-    auto h = reinterpret_cast<HandleImpl*>(handle);
-    return GetShard(h->GetHash()).IsReady(h);
-  }
-  void Wait(Handle* handle) override {
-    auto h = reinterpret_cast<HandleImpl*>(handle);
-    GetShard(h->GetHash()).Wait(h);
-  }
  bool Ref(Handle* handle) override {
    auto h = reinterpret_cast<HandleImpl*>(handle);
    return GetShard(h->GetHash()).Ref(h);
@ -238,14 +225,14 @@ class ShardedCache : public ShardedCacheBase {
    return SumOverShards2(&CacheShard::GetPinnedUsage);
  }
  size_t GetOccupancyCount() const override {
-    return SumOverShards2(&CacheShard::GetPinnedUsage);
+    return SumOverShards2(&CacheShard::GetOccupancyCount);
  }
  size_t GetTableAddressCount() const override {
    return SumOverShards2(&CacheShard::GetTableAddressCount);
  }
  void ApplyToAllEntries(
-      const std::function<void(const Slice& key, void* value, size_t charge,
-                               DeleterFn deleter)>& callback,
+      const std::function<void(const Slice& key, ObjectPtr value, size_t charge,
+                               const CacheItemHelper* helper)>& callback,
      const ApplyToAllEntriesOptions& opts) override {
    uint32_t num_shards = GetNumShards();
    // Iterate over part of each shard, rotating between shards, to
--- a/cache/typed_cache.h
+++ b/cache/typed_cache.h
@ -0,0 +1,375 @@
+//  Copyright (c) Meta Platforms, Inc. and affiliates.
+//  This source code is licensed under both the GPLv2 (found in the
+//  COPYING file in the root directory) and Apache 2.0 License
+//  (found in the LICENSE.Apache file in the root directory).
+
+// APIs for accessing Cache in a type-safe and convenient way. Cache is kept
+// at a low, thin level of abstraction so that different implementations can
+// be plugged in, but these wrappers provide clean, convenient access to the
+// most common operations.
+//
+// A number of template classes are needed for sharing common structure. The
+// key classes are these:
+//
+// * PlaceholderCacheInterface - Used for making cache reservations, with
+// entries that have a charge but no value.
+// * BasicTypedCacheInterface<TValue> - Used for primary cache storage of
+// objects of type TValue.
+// * FullTypedCacheInterface<TValue, TCreateContext> - Used for secondary cache
+// compatible storage of objects of type TValue.
+// * For each of these, there's a "Shared" version
+// (e.g. FullTypedSharedCacheInterface) that holds a shared_ptr to the Cache,
+// rather than assuming external ownership by holding only a raw `Cache*`.
+
+#pragma once
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <type_traits>
+
+#include "cache/cache_helpers.h"
+#include "rocksdb/advanced_cache.h"
+#include "rocksdb/advanced_options.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+// For future consideration:
+// * Pass in value to Insert with std::unique_ptr& to simplify ownership
+//   transfer logic in callers
+// * Make key type a template parameter (e.g. useful for table cache)
+// * Closer integration with CacheHandleGuard (opt-in, so not always
+//   paying the extra overhead)
+
+#define CACHE_TYPE_DEFS()                     \
+  using Priority = Cache::Priority;           \
+  using Handle = Cache::Handle;               \
+  using ObjectPtr = Cache::ObjectPtr;         \
+  using CreateContext = Cache::CreateContext; \
+  using CacheItemHelper = Cache::CacheItemHelper /* caller ; */
+
+template <typename CachePtr>
+class BaseCacheInterface {
+ public:
+  CACHE_TYPE_DEFS();
+
+  // Wraps `cache`, whose type CachePtr is chosen by the instantiation (the
+  // aliases in this file use `Cache*` or `std::shared_ptr<Cache>`).
+  // Deliberately not `explicit` so a cache pointer converts implicitly.
+  /*implicit*/ BaseCacheInterface(CachePtr cache) : cache_(std::move(cache)) {}
+
+  // True iff the wrapped pointer is non-null.
+  explicit operator bool() const noexcept { return cache_ != nullptr; }
+
+  // Raw pointer to the wrapped Cache, whatever CachePtr is.
+  Cache* get() const { return &*cache_; }
+
+  // Forwards to Cache::Release() for `handle`.
+  void Release(Handle* handle) { cache_->Release(handle); }
+
+  // Like Release(), but passes erase_if_last_ref=true to the cache.
+  void ReleaseAndEraseIfLastRef(Handle* handle) {
+    constexpr bool kEraseIfLastRef = true;
+    cache_->Release(handle, kEraseIfLastRef);
+  }
+
+  // Registers ReleaseCacheHandleCleanup on `cleanable`, with the raw cache
+  // pointer and `handle` as its two arguments.
+  void RegisterReleaseAsCleanup(Handle* handle, Cleanable& cleanable) {
+    Cache* const raw_cache = get();
+    cleanable.RegisterCleanup(&ReleaseCacheHandleCleanup, raw_cache, handle);
+  }
+
+ protected:
+  CachePtr cache_;
+};
+
+// PlaceholderCacheInterface - Used for making cache reservations, with
+// entries that have a charge but no value. CacheEntryRole is required as
+// a template parameter.
+template <CacheEntryRole kRole, typename CachePtr = Cache*>
+class PlaceholderCacheInterface : public BaseCacheInterface<CachePtr> {
+ public:
+  CACHE_TYPE_DEFS();
+  using BaseCacheInterface<CachePtr>::BaseCacheInterface;
+
+  // Returns the single CacheItemHelper for this role. It is constructed
+  // from the role alone (no callbacks are supplied).
+  static const Cache::CacheItemHelper* GetHelper() {
+    static const Cache::CacheItemHelper kPlaceholderHelper{kRole};
+    return &kPlaceholderHelper;
+  }
+
+  // Inserts an entry under `key` that carries `charge` but a null value.
+  // `handle` is forwarded to Cache::Insert() unchanged.
+  Status Insert(const Slice& key, size_t charge, Handle** handle) {
+    const Cache::CacheItemHelper* const helper = GetHelper();
+    return this->cache_->Insert(key, /*value=*/nullptr, helper, charge,
+                                handle);
+  }
+};
+
+template <CacheEntryRole kRole>
+using PlaceholderSharedCacheInterface =
+    PlaceholderCacheInterface<kRole, std::shared_ptr<Cache>>;
+
+template <class TValue>
+class BasicTypedCacheHelperFns {
+ public:
+  CACHE_TYPE_DEFS();
+  // Pointer-to-element type, e.g. char* when TValue is char[].
+  using TValuePtr = std::remove_extent_t<TValue>*;
+
+ protected:
+  // Typed pointer -> untyped ObjectPtr (implicit conversion).
+  static ObjectPtr UpCastValue(TValuePtr value) { return value; }
+
+  // Untyped ObjectPtr -> typed pointer.
+  static TValuePtr DownCastValue(ObjectPtr value) {
+    return static_cast<TValuePtr>(value);
+  }
+
+  // Destroys a cached object. Used as the helper's deleter callback.
+  static void Delete(ObjectPtr value, MemoryAllocator* allocator) {
+    TValuePtr typed = DownCastValue(value);
+    // FIXME: Currently, no callers actually allocate the ObjectPtr objects
+    // using the custom allocator, just subobjects that keep a reference to
+    // the allocator themselves (with CacheAllocationPtr). The allocator
+    // path therefore stays compiled but switched off.
+    if (/*DISABLED*/ false && allocator) {
+      if constexpr (std::is_destructible_v<TValue>) {
+        typed->~TValue();
+      }
+      allocator->Deallocate(value);
+      return;
+    }
+    // Like delete but properly handles TValue=char[] etc.
+    std::default_delete<TValue>{}(typed);
+  }
+};
+
+// In its own class to try to minimize the number of distinct CacheItemHelper
+// instances (e.g. don't vary by CachePtr)
+template <class TValue, CacheEntryRole kRole>
+class BasicTypedCacheHelper : public BasicTypedCacheHelperFns<TValue> {
+ public:
+  // Returns the one helper instance for this (TValue, kRole) pair. It
+  // carries the role and the inherited Delete() callback only.
+  static const Cache::CacheItemHelper* GetBasicHelper() {
+    static const Cache::CacheItemHelper kBasicHelper{
+        kRole, &BasicTypedCacheHelper::Delete};
+    return &kBasicHelper;
+  }
+};
+
+// BasicTypedCacheInterface - Used for primary cache storage of objects of
+// type TValue, which can be cleaned up with std::default_delete<TValue>. The
+// role is provided by TValue::kCacheEntryRole or given in an optional
+// template parameter.
+template <class TValue, CacheEntryRole kRole = TValue::kCacheEntryRole,
+          typename CachePtr = Cache*>
+class BasicTypedCacheInterface : public BaseCacheInterface<CachePtr>,
+                                 public BasicTypedCacheHelper<TValue, kRole> {
+ public:
+  CACHE_TYPE_DEFS();
+  using typename BasicTypedCacheHelperFns<TValue>::TValuePtr;
+  using BasicTypedCacheHelper<TValue, kRole>::GetBasicHelper;
+  // Inherit the converting constructor from BaseCacheInterface.
+  using BaseCacheInterface<CachePtr>::BaseCacheInterface;
+
+  // Handle type specific to this interface instantiation; it adds no state
+  // to Cache::Handle.
+  struct TypedHandle : public Handle {};
+
+  // Async lookup handle whose Result() is returned as a TypedHandle*.
+  struct TypedAsyncLookupHandle : public Cache::AsyncLookupHandle {
+    TypedHandle* Result() {
+      auto* untyped = Cache::AsyncLookupHandle::Result();
+      return reinterpret_cast<TypedHandle*>(untyped);
+    }
+  };
+
+  // Inserts `value` under `key` using the basic helper (role + deleter).
+  // `handle`, if non-null, is forwarded to Cache::Insert() as an untyped
+  // Handle**.
+  Status Insert(const Slice& key, TValuePtr value, size_t charge,
+                TypedHandle** handle = nullptr,
+                Priority priority = Priority::LOW) {
+    ObjectPtr obj = BasicTypedCacheHelperFns<TValue>::UpCastValue(value);
+    Handle** out_handle = reinterpret_cast<Handle**>(handle);
+    return this->cache_->Insert(key, obj, GetBasicHelper(), charge,
+                                out_handle, priority);
+  }
+
+  // Forwards to Cache::BasicLookup() and returns the result as a typed
+  // handle.
+  TypedHandle* Lookup(const Slice& key, Statistics* stats = nullptr) {
+    Handle* found = this->cache_->BasicLookup(key, stats);
+    return reinterpret_cast<TypedHandle*>(found);
+  }
+
+  // Starts an async lookup; the caller must not have set a helper on
+  // `async_handle`.
+  void StartAsyncLookup(TypedAsyncLookupHandle& async_handle) {
+    assert(async_handle.helper == nullptr);
+    this->cache_->StartAsyncLookup(async_handle);
+  }
+
+  // Wraps `handle` in a CacheHandleGuard; a null handle yields an empty
+  // guard.
+  CacheHandleGuard<TValue> Guard(TypedHandle* handle) {
+    if (!handle) {
+      return {};
+    }
+    return CacheHandleGuard<TValue>(&*this->cache_, handle);
+  }
+
+  // Like Guard(), but produced by MakeSharedCacheHandleGuard; a null handle
+  // yields an empty shared_ptr.
+  std::shared_ptr<TValue> SharedGuard(TypedHandle* handle) {
+    if (!handle) {
+      return {};
+    }
+    return MakeSharedCacheHandleGuard<TValue>(&*this->cache_, handle);
+  }
+
+  // Returns the cached object behind `handle`, cast back to its typed
+  // pointer.
+  TValuePtr Value(TypedHandle* handle) {
+    ObjectPtr untyped = this->cache_->Value(handle);
+    return BasicTypedCacheHelperFns<TValue>::DownCastValue(untyped);
+  }
+};
+
+// BasicTypedSharedCacheInterface - Like BasicTypedCacheInterface but with a
+// shared_ptr<Cache> for keeping Cache alive.
+template <class TValue, CacheEntryRole kRole = TValue::kCacheEntryRole>
+using BasicTypedSharedCacheInterface =
+    BasicTypedCacheInterface<TValue, kRole, std::shared_ptr<Cache>>;
+
+// TValue must implement ContentSlice() and ~TValue
+// TCreateContext must implement Create(std::unique_ptr<TValue>*, ...)
+template <class TValue, class TCreateContext>
+class FullTypedCacheHelperFns : public BasicTypedCacheHelperFns<TValue> {
+ public:
+  CACHE_TYPE_DEFS();
+
+ protected:
+  using typename BasicTypedCacheHelperFns<TValue>::TValuePtr;
+  using BasicTypedCacheHelperFns<TValue>::DownCastValue;
+  using BasicTypedCacheHelperFns<TValue>::UpCastValue;
+
+  // Size callback: the byte length of the object's ContentSlice().
+  static size_t Size(ObjectPtr v) {
+    return DownCastValue(v)->ContentSlice().size();
+  }
+
+  // Save callback: copies `length` bytes of the object's ContentSlice(),
+  // starting at `from_offset`, into `out`. Always returns OK; the range is
+  // only checked by assertions.
+  static Status SaveTo(ObjectPtr v, size_t from_offset, size_t length,
+                       char* out) {
+    auto content = DownCastValue(v)->ContentSlice();
+    assert(from_offset < content.size());
+    assert(from_offset + length <= content.size());
+    auto first = content.data() + from_offset;
+    std::copy(first, first + length, out);
+    return Status::OK();
+  }
+
+  // Create callback: asks TCreateContext::Create() to build a TValue from
+  // `data`, then hands the raw pointer to the caller through `out_obj`.
+  // Always returns OK.
+  static Status Create(const Slice& data, CreateContext* context,
+                       MemoryAllocator* allocator, ObjectPtr* out_obj,
+                       size_t* out_charge) {
+    std::unique_ptr<TValue> created;
+    if constexpr (sizeof(TCreateContext) > 0) {
+      auto* typed_context = static_cast<TCreateContext*>(context);
+      typed_context->Create(&created, out_charge, data, allocator);
+    } else {
+      TCreateContext::Create(&created, out_charge, data, allocator);
+    }
+    *out_obj = UpCastValue(created.release());
+    return Status::OK();
+  }
+};
+
+// In its own class to try to minimize the number of distinct CacheItemHelper
+// instances (e.g. don't vary by CachePtr)
+template <class TValue, class TCreateContext, CacheEntryRole kRole>
+class FullTypedCacheHelper
+    : public FullTypedCacheHelperFns<TValue, TCreateContext> {
+ public:
+  // Returns the one full helper for this (TValue, TCreateContext, kRole)
+  // combination: role, Delete/Size/SaveTo/Create callbacks, plus a pointer
+  // to the corresponding basic helper.
+  static const Cache::CacheItemHelper* GetFullHelper() {
+    using Self = FullTypedCacheHelper;
+    static const Cache::CacheItemHelper kFullHelper{
+        kRole,
+        &Self::Delete,
+        &Self::Size,
+        &Self::SaveTo,
+        &Self::Create,
+        BasicTypedCacheHelper<TValue, kRole>::GetBasicHelper()};
+    return &kFullHelper;
+  }
+};
+
+// FullTypedCacheInterface - Used for secondary cache compatible storage of
+// objects of type TValue. In addition to BasicTypedCacheInterface constraints,
+// we require TValue::ContentSlice() to return persistable data. This
+// simplifies usage for the normal case of simple secondary cache compatibility
+// (can give you a Slice to the data already in memory). In addition to
+// TCreateContext performing the role of Cache::CreateContext, it is also
+// expected to provide a function Create(std::unique_ptr<TValue>* value,
+// size_t* out_charge, const Slice& data, MemoryAllocator* allocator) for
+// creating new TValue.
+template <class TValue, class TCreateContext,
+          CacheEntryRole kRole = TValue::kCacheEntryRole,
+          typename CachePtr = Cache*>
+class FullTypedCacheInterface
+    : public BasicTypedCacheInterface<TValue, kRole, CachePtr>,
+      public FullTypedCacheHelper<TValue, TCreateContext, kRole> {
+ public:
+  CACHE_TYPE_DEFS();
+  using typename BasicTypedCacheInterface<TValue, kRole, CachePtr>::TypedHandle;
+  using typename BasicTypedCacheInterface<TValue, kRole,
+                                          CachePtr>::TypedAsyncLookupHandle;
+  using typename BasicTypedCacheHelperFns<TValue>::TValuePtr;
+  using BasicTypedCacheHelper<TValue, kRole>::GetBasicHelper;
+  using FullTypedCacheHelper<TValue, TCreateContext, kRole>::GetFullHelper;
+  using BasicTypedCacheHelperFns<TValue>::UpCastValue;
+  using BasicTypedCacheHelperFns<TValue>::DownCastValue;
+  // ctor
+  using BasicTypedCacheInterface<TValue, kRole,
+                                 CachePtr>::BasicTypedCacheInterface;
+
+  // Insert with SecondaryCache compatibility (subject to CacheTier).
+  // (Basic Insert() also inherited.)
+  // The full helper is used only when `lowest_used_cache_tier` is
+  // kNonVolatileBlockTier; otherwise the basic helper is used.
+  inline Status InsertFull(
+      const Slice& key, TValuePtr value, size_t charge,
+      TypedHandle** handle = nullptr, Priority priority = Priority::LOW,
+      CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier) {
+    auto untyped_handle = reinterpret_cast<Handle**>(handle);
+    auto helper = lowest_used_cache_tier == CacheTier::kNonVolatileBlockTier
+                      ? GetFullHelper()
+                      : GetBasicHelper();
+    return this->cache_->Insert(key, UpCastValue(value), helper, charge,
+                                untyped_handle, priority);
+  }
+
+  // Like SecondaryCache::InsertSaved, with SecondaryCache compatibility
+  // (subject to CacheTier).
+  // Builds a new object from `data` with the full helper's create callback
+  // and inserts it without returning a handle. If `out_charge` is non-null
+  // it receives the charge reported by the create callback (0 if the
+  // callback reported none). On any non-OK return the object created here,
+  // if any, has been destroyed.
+  inline Status InsertSaved(
+      const Slice& key, const Slice& data, TCreateContext* create_context,
+      Priority priority = Priority::LOW,
+      CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier,
+      size_t* out_charge = nullptr) {
+    // Initialized so that a create_cb which fails before writing its
+    // out-parameters cannot make the code below read indeterminate values.
+    ObjectPtr value = nullptr;
+    size_t charge = 0;
+    Status st = GetFullHelper()->create_cb(data, create_context,
+                                           this->cache_->memory_allocator(),
+                                           &value, &charge);
+    if (out_charge) {
+      *out_charge = charge;
+    }
+    if (st.ok()) {
+      st = InsertFull(key, DownCastValue(value), charge, nullptr /*handle*/,
+                      priority, lowest_used_cache_tier);
+    }
+    if (!st.ok()) {
+      // Either creation or insertion failed, so the cache did not take
+      // ownership and the object must be destroyed here to avoid a leak.
+      // Deleting a null `value` is a no-op.
+      GetFullHelper()->del_cb(value, this->cache_->memory_allocator());
+    }
+    return st;
+  }
+
+  // Lookup with SecondaryCache support (subject to CacheTier).
+  // (Basic Lookup() also inherited.)
+  // With kNonVolatileBlockTier the lookup passes the full helper and
+  // `create_context` to Cache::Lookup(); otherwise it falls back to the
+  // basic Lookup(), which ignores `create_context` and `priority`.
+  inline TypedHandle* LookupFull(
+      const Slice& key, TCreateContext* create_context = nullptr,
+      Priority priority = Priority::LOW, Statistics* stats = nullptr,
+      CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier) {
+    if (lowest_used_cache_tier == CacheTier::kNonVolatileBlockTier) {
+      return reinterpret_cast<TypedHandle*>(this->cache_->Lookup(
+          key, GetFullHelper(), create_context, priority, stats));
+    } else {
+      return BasicTypedCacheInterface<TValue, kRole, CachePtr>::Lookup(key,
+                                                                       stats);
+    }
+  }
+
+  // Async counterpart of LookupFull(): sets the full helper on
+  // `async_handle` only for kNonVolatileBlockTier; otherwise defers to the
+  // basic StartAsyncLookup(), which requires the helper to be unset.
+  inline void StartAsyncLookupFull(
+      TypedAsyncLookupHandle& async_handle,
+      CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier) {
+    if (lowest_used_cache_tier == CacheTier::kNonVolatileBlockTier) {
+      async_handle.helper = GetFullHelper();
+      this->cache_->StartAsyncLookup(async_handle);
+    } else {
+      BasicTypedCacheInterface<TValue, kRole, CachePtr>::StartAsyncLookup(
+          async_handle);
+    }
+  }
+};
+
+// FullTypedSharedCacheInterface - Like FullTypedCacheInterface but with a
+// shared_ptr<Cache> for keeping Cache alive.
+template <class TValue, class TCreateContext,
+          CacheEntryRole kRole = TValue::kCacheEntryRole>
+using FullTypedSharedCacheInterface =
+    FullTypedCacheInterface<TValue, TCreateContext, kRole,
+                            std::shared_ptr<Cache>>;
+
+#undef CACHE_TYPE_DEFS
+
+}  // namespace ROCKSDB_NAMESPACE
--- a/coverage/coverage_test.sh
+++ b/coverage/coverage_test.sh
@ -12,7 +12,7 @@ fi
 ROOT=".."
 # Fetch right version of gcov
 if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
-  source $ROOT/build_tools/fbcode_config_platform009.sh
+  source $ROOT/build_tools/fbcode_config_platform010.sh
  GCOV=$GCC_BASE/bin/gcov
 else
  GCOV=$(which gcov)
--- a/crash_test.mk
+++ b/crash_test.mk
@ -21,6 +21,8 @@ CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) --
 	blackbox_crash_test_with_multiops_wp_txn \
 	crash_test_with_tiered_storage blackbox_crash_test_with_tiered_storage \
 	whitebox_crash_test_with_tiered_storage \
+	whitebox_crash_test_with_optimistic_txn \
+	blackbox_crash_test_with_optimistic_txn \

 crash_test: $(DB_STRESS_CMD)
 # Do not parallelize
@ -37,6 +39,11 @@ crash_test_with_txn: $(DB_STRESS_CMD)
 	$(CRASHTEST_MAKE) whitebox_crash_test_with_txn
 	$(CRASHTEST_MAKE) blackbox_crash_test_with_txn

+crash_test_with_optimistic_txn: $(DB_STRESS_CMD)
+# Do not parallelize
+	$(CRASHTEST_MAKE) whitebox_crash_test_with_optimistic_txn
+	$(CRASHTEST_MAKE) blackbox_crash_test_with_optimistic_txn
+
 crash_test_with_best_efforts_recovery: blackbox_crash_test_with_best_efforts_recovery

 crash_test_with_ts: $(DB_STRESS_CMD)
@ -80,6 +87,9 @@ blackbox_crash_test_with_multiops_wp_txn: $(DB_STRESS_CMD)
 blackbox_crash_test_with_tiered_storage: $(DB_STRESS_CMD)
 	$(CRASHTEST_PY) --test_tiered_storage blackbox $(CRASH_TEST_EXT_ARGS)

+blackbox_crash_test_with_optimistic_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --optimistic_txn blackbox $(CRASH_TEST_EXT_ARGS)
+
 ifeq ($(CRASH_TEST_KILL_ODD),)
  CRASH_TEST_KILL_ODD=888887
 endif
@ -105,3 +115,7 @@ whitebox_crash_test_with_ts: $(DB_STRESS_CMD)
 whitebox_crash_test_with_tiered_storage: $(DB_STRESS_CMD)
 	$(CRASHTEST_PY) --test_tiered_storage whitebox --random_kill_odd \
      $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
+
+whitebox_crash_test_with_optimistic_txn: $(DB_STRESS_CMD)
+	$(CRASHTEST_PY) --optimistic_txn whitebox --random_kill_odd \
+      $(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
--- a/db/arena_wrapped_db_iter.cc
+++ b/db/arena_wrapped_db_iter.cc
@ -47,6 +47,11 @@ void ArenaWrappedDBIter::Init(
  read_options_ = read_options;
  allow_refresh_ = allow_refresh;
  memtable_range_tombstone_iter_ = nullptr;
+
+  if (!CheckFSFeatureSupport(env->GetFileSystem().get(),
+                             FSSupportedOps::kAsyncIO)) {
+    read_options_.async_io = false;
+  }
 }

 Status ArenaWrappedDBIter::Refresh() {
--- a/db/blob/blob_contents.cc
+++ b/db/blob/blob_contents.cc
@ -13,12 +13,6 @@

 namespace ROCKSDB_NAMESPACE {

-std::unique_ptr<BlobContents> BlobContents::Create(
-    CacheAllocationPtr&& allocation, size_t size) {
-  return std::unique_ptr<BlobContents>(
-      new BlobContents(std::move(allocation), size));
-}
-
 size_t BlobContents::ApproximateMemoryUsage() const {
  size_t usage = 0;

@ -45,46 +39,4 @@ size_t BlobContents::ApproximateMemoryUsage() const {
  return usage;
 }

-size_t BlobContents::SizeCallback(void* obj) {
-  assert(obj);
-
-  return static_cast<const BlobContents*>(obj)->size();
-}
-
-Status BlobContents::SaveToCallback(void* from_obj, size_t from_offset,
-                                    size_t length, void* out) {
-  assert(from_obj);
-
-  const BlobContents* buf = static_cast<const BlobContents*>(from_obj);
-  assert(buf->size() >= from_offset + length);
-
-  memcpy(out, buf->data().data() + from_offset, length);
-
-  return Status::OK();
-}
-
-Cache::CacheItemHelper* BlobContents::GetCacheItemHelper() {
-  static Cache::CacheItemHelper cache_helper(
-      &SizeCallback, &SaveToCallback,
-      GetCacheEntryDeleterForRole<BlobContents, CacheEntryRole::kBlobValue>());
-
-  return &cache_helper;
-}
-
-Status BlobContents::CreateCallback(CacheAllocationPtr&& allocation,
-                                    const void* buf, size_t size,
-                                    void** out_obj, size_t* charge) {
-  assert(allocation);
-
-  memcpy(allocation.get(), buf, size);
-
-  std::unique_ptr<BlobContents> obj = Create(std::move(allocation), size);
-  BlobContents* const contents = obj.release();
-
-  *out_obj = contents;
-  *charge = contents->ApproximateMemoryUsage();
-
-  return Status::OK();
-}
-
 }  // namespace ROCKSDB_NAMESPACE
--- a/db/blob/blob_contents.h
+++ b/db/blob/blob_contents.h
@ -7,8 +7,8 @@

 #include <memory>

-#include "memory/memory_allocator.h"
-#include "rocksdb/cache.h"
+#include "memory/memory_allocator_impl.h"
+#include "rocksdb/advanced_cache.h"
 #include "rocksdb/rocksdb_namespace.h"
 #include "rocksdb/slice.h"
 #include "rocksdb/status.h"
@ -18,8 +18,8 @@ namespace ROCKSDB_NAMESPACE {
 // A class representing a single uncompressed value read from a blob file.
 class BlobContents {
 public:
-  static std::unique_ptr<BlobContents> Create(CacheAllocationPtr&& allocation,
-                                              size_t size);
+  BlobContents(CacheAllocationPtr&& allocation, size_t size)
+      : allocation_(std::move(allocation)), data_(allocation_.get(), size) {}

  BlobContents(const BlobContents&) = delete;
  BlobContents& operator=(const BlobContents&) = delete;
@ -34,23 +34,26 @@ class BlobContents {

  size_t ApproximateMemoryUsage() const;

-  // Callbacks for secondary cache
-  static size_t SizeCallback(void* obj);
-
-  static Status SaveToCallback(void* from_obj, size_t from_offset,
-                               size_t length, void* out);
-
-  static Cache::CacheItemHelper* GetCacheItemHelper();
-
-  static Status CreateCallback(CacheAllocationPtr&& allocation, const void* buf,
-                               size_t size, void** out_obj, size_t* charge);
+  // For TypedCacheInterface
+  const Slice& ContentSlice() const { return data_; }
+  static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kBlobValue;

 private:
-  BlobContents(CacheAllocationPtr&& allocation, size_t size)
-      : allocation_(std::move(allocation)), data_(allocation_.get(), size) {}
-
  CacheAllocationPtr allocation_;
  Slice data_;
 };

+// Create-context for BlobContents, for use with the typed cache interface.
+class BlobContentsCreator : public Cache::CreateContext {
+ public:
+  // Copies `contents` into a block obtained via AllocateAndCopyBlock() with
+  // `alloc`, wraps it in a new BlobContents stored in `*out`, and, when
+  // `out_charge` is non-null, reports the object's approximate memory usage.
+  static void Create(std::unique_ptr<BlobContents>* out, size_t* out_charge,
+                     const Slice& contents, MemoryAllocator* alloc) {
+    auto block = AllocateAndCopyBlock(contents, alloc);
+    out->reset(new BlobContents(std::move(block), contents.size()));
+    if (out_charge != nullptr) {
+      *out_charge = (*out)->ApproximateMemoryUsage();
+    }
+  }
+};
+
 }  // namespace ROCKSDB_NAMESPACE
--- a/db/blob/blob_counting_iterator.h
+++ b/db/blob/blob_counting_iterator.h
@ -134,13 +134,6 @@ class BlobCountingIterator : public InternalIterator {
    if (!iter_->Valid()) {
      status_ = iter_->status();
      return;
-    } else if (iter_->IsDeleteRangeSentinelKey()) {
-      // CompactionMergingIterator emits range tombstones, and range tombstone
-      // keys can be truncated at file boundaries. This means the range
-      // tombstone keys can have op_type kTypeBlobIndex.
-      // This could crash the ProcessInFlow() call below since
-      // value is empty for these keys.
-      return;
    }

    TEST_SYNC_POINT(
--- a/db/blob/blob_file_builder.cc
+++ b/db/blob/blob_file_builder.cc
@ -13,6 +13,7 @@
 #include "db/blob/blob_index.h"
 #include "db/blob/blob_log_format.h"
 #include "db/blob/blob_log_writer.h"
+#include "db/blob/blob_source.h"
 #include "db/event_helpers.h"
 #include "db/version_set.h"
 #include "file/filename.h"
@ -258,6 +259,7 @@ Status BlobFileBuilder::CompressBlobIfNeeded(
    return Status::OK();
  }

+  // TODO: allow user CompressionOptions, including max_compressed_bytes_per_kb
  CompressionOptions opts;
  CompressionContext context(blob_compression_type_);
  constexpr uint64_t sample_for_compression = 0;
@ -393,7 +395,7 @@ Status BlobFileBuilder::PutBlobIntoCacheIfNeeded(const Slice& blob,
                                                 uint64_t blob_offset) const {
  Status s = Status::OK();

-  auto blob_cache = immutable_options_->blob_cache;
+  BlobSource::SharedCacheInterface blob_cache{immutable_options_->blob_cache};
  auto statistics = immutable_options_->statistics.get();
  bool warm_cache =
      prepopulate_blob_cache_ == PrepopulateBlobCache::kFlushOnly &&
@ -407,34 +409,12 @@ Status BlobFileBuilder::PutBlobIntoCacheIfNeeded(const Slice& blob,

    const Cache::Priority priority = Cache::Priority::BOTTOM;

-    // Objects to be put into the cache have to be heap-allocated and
-    // self-contained, i.e. own their contents. The Cache has to be able to
-    // take unique ownership of them.
-    CacheAllocationPtr allocation =
-        AllocateBlock(blob.size(), blob_cache->memory_allocator());
-    memcpy(allocation.get(), blob.data(), blob.size());
-    std::unique_ptr<BlobContents> buf =
-        BlobContents::Create(std::move(allocation), blob.size());
-
-    Cache::CacheItemHelper* const cache_item_helper =
-        BlobContents::GetCacheItemHelper();
-    assert(cache_item_helper);
-
-    if (immutable_options_->lowest_used_cache_tier ==
-        CacheTier::kNonVolatileBlockTier) {
-      s = blob_cache->Insert(key, buf.get(), cache_item_helper,
-                             buf->ApproximateMemoryUsage(),
-                             nullptr /* cache_handle */, priority);
-    } else {
-      s = blob_cache->Insert(key, buf.get(), buf->ApproximateMemoryUsage(),
-                             cache_item_helper->del_cb,
-                             nullptr /* cache_handle */, priority);
-    }
+    s = blob_cache.InsertSaved(key, blob, nullptr /*context*/, priority,
+                               immutable_options_->lowest_used_cache_tier);

    if (s.ok()) {
      RecordTick(statistics, BLOB_DB_CACHE_ADD);
-      RecordTick(statistics, BLOB_DB_CACHE_BYTES_WRITE, buf->size());
-      buf.release();
+      RecordTick(statistics, BLOB_DB_CACHE_BYTES_WRITE, blob.size());
    } else {
      RecordTick(statistics, BLOB_DB_CACHE_ADD_FAILURES);
    }
--- a/db/blob/blob_file_cache.cc
+++ b/db/blob/blob_file_cache.cc
@ -25,7 +25,7 @@ BlobFileCache::BlobFileCache(Cache* cache,
                             HistogramImpl* blob_file_read_hist,
                             const std::shared_ptr<IOTracer>& io_tracer)
    : cache_(cache),
-      mutex_(kNumberOfMutexStripes, kGetSliceNPHash64UnseededFnPtr),
+      mutex_(kNumberOfMutexStripes),
      immutable_options_(immutable_options),
      file_options_(file_options),
      column_family_id_(column_family_id),
@ -37,29 +37,29 @@ BlobFileCache::BlobFileCache(Cache* cache,
 }

 Status BlobFileCache::GetBlobFileReader(
-    uint64_t blob_file_number,
+    const ReadOptions& read_options, uint64_t blob_file_number,
    CacheHandleGuard<BlobFileReader>* blob_file_reader) {
  assert(blob_file_reader);
  assert(blob_file_reader->IsEmpty());

-  const Slice key = GetSlice(&blob_file_number);
+  const Slice key = GetSliceForKey(&blob_file_number);

  assert(cache_);

-  Cache::Handle* handle = cache_->Lookup(key);
+  TypedHandle* handle = cache_.Lookup(key);
  if (handle) {
-    *blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
+    *blob_file_reader = cache_.Guard(handle);
    return Status::OK();
  }

  TEST_SYNC_POINT("BlobFileCache::GetBlobFileReader:DoubleCheck");

  // Check again while holding mutex
-  MutexLock lock(mutex_.get(key));
+  MutexLock lock(&mutex_.Get(key));

-  handle = cache_->Lookup(key);
+  handle = cache_.Lookup(key);
  if (handle) {
-    *blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
+    *blob_file_reader = cache_.Guard(handle);
    return Status::OK();
  }

@ -73,7 +73,7 @@ Status BlobFileCache::GetBlobFileReader(
  {
    assert(file_options_);
    const Status s = BlobFileReader::Create(
-        *immutable_options_, *file_options_, column_family_id_,
+        *immutable_options_, read_options, *file_options_, column_family_id_,
        blob_file_read_hist_, blob_file_number, io_tracer_, &reader);
    if (!s.ok()) {
      RecordTick(statistics, NO_FILE_ERRORS);
@ -84,8 +84,7 @@ Status BlobFileCache::GetBlobFileReader(
  {
    constexpr size_t charge = 1;

-    const Status s = cache_->Insert(key, reader.get(), charge,
-                                    &DeleteCacheEntry<BlobFileReader>, &handle);
+    const Status s = cache_.Insert(key, reader.get(), charge, &handle);
    if (!s.ok()) {
      RecordTick(statistics, NO_FILE_ERRORS);
      return s;
@ -94,7 +93,7 @@ Status BlobFileCache::GetBlobFileReader(

  reader.release();

-  *blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
+  *blob_file_reader = cache_.Guard(handle);

  return Status::OK();
 }
--- a/db/blob/blob_file_cache.h
+++ b/db/blob/blob_file_cache.h
@ -7,7 +7,8 @@

 #include <cinttypes>

-#include "cache/cache_helpers.h"
+#include "cache/typed_cache.h"
+#include "db/blob/blob_file_reader.h"
 #include "rocksdb/rocksdb_namespace.h"
 #include "util/mutexlock.h"

@ -18,7 +19,6 @@ struct ImmutableOptions;
 struct FileOptions;
 class HistogramImpl;
 class Status;
-class BlobFileReader;
 class Slice;
 class IOTracer;

@ -32,14 +32,18 @@ class BlobFileCache {
  BlobFileCache(const BlobFileCache&) = delete;
  BlobFileCache& operator=(const BlobFileCache&) = delete;

-  Status GetBlobFileReader(uint64_t blob_file_number,
+  Status GetBlobFileReader(const ReadOptions& read_options,
+                           uint64_t blob_file_number,
                           CacheHandleGuard<BlobFileReader>* blob_file_reader);

 private:
-  Cache* cache_;
+  using CacheInterface =
+      BasicTypedCacheInterface<BlobFileReader, CacheEntryRole::kMisc>;
+  using TypedHandle = CacheInterface::TypedHandle;
+  CacheInterface cache_;
  // Note: mutex_ below is used to guard against multiple threads racing to open
  // the same file.
-  Striped<port::Mutex, Slice> mutex_;
+  Striped<CacheAlignedWrapper<port::Mutex>> mutex_;
  const ImmutableOptions* immutable_options_;
  const FileOptions* file_options_;
  uint32_t column_family_id_;
--- a/db/blob/blob_file_cache_test.cc
+++ b/db/blob/blob_file_cache_test.cc
@ -118,7 +118,9 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader) {
  // First try: reader should be opened and put in cache
  CacheHandleGuard<BlobFileReader> first;

-  ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first));
+  const ReadOptions read_options;
+  ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, blob_file_number,
+                                              &first));
  ASSERT_NE(first.GetValue(), nullptr);
  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
@ -126,7 +128,8 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader) {
  // Second try: reader should be served from cache
  CacheHandleGuard<BlobFileReader> second;

-  ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second));
+  ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, blob_file_number,
+                                              &second));
  ASSERT_NE(second.GetValue(), nullptr);
  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
@ -163,19 +166,21 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader_Race) {
  CacheHandleGuard<BlobFileReader> first;
  CacheHandleGuard<BlobFileReader> second;

+  const ReadOptions read_options;
  SyncPoint::GetInstance()->SetCallBack(
      "BlobFileCache::GetBlobFileReader:DoubleCheck", [&](void* /* arg */) {
        // Disabling sync points to prevent infinite recursion
        SyncPoint::GetInstance()->DisableProcessing();
-
-        ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second));
+        ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options,
+                                                    blob_file_number, &second));
        ASSERT_NE(second.GetValue(), nullptr);
        ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
        ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
      });
  SyncPoint::GetInstance()->EnableProcessing();

-  ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first));
+  ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, blob_file_number,
+                                              &first));
  ASSERT_NE(first.GetValue(), nullptr);
  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
@ -213,8 +218,10 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader_IOError) {

  CacheHandleGuard<BlobFileReader> reader;

+  const ReadOptions read_options;
  ASSERT_TRUE(
-      blob_file_cache.GetBlobFileReader(blob_file_number, &reader).IsIOError());
+      blob_file_cache.GetBlobFileReader(read_options, blob_file_number, &reader)
+          .IsIOError());
  ASSERT_EQ(reader.GetValue(), nullptr);
  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1);
@ -253,8 +260,10 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader_CacheFull) {
  // strict_capacity_limit is set
  CacheHandleGuard<BlobFileReader> reader;

-  ASSERT_TRUE(blob_file_cache.GetBlobFileReader(blob_file_number, &reader)
-                  .IsMemoryLimit());
+  const ReadOptions read_options;
+  ASSERT_TRUE(
+      blob_file_cache.GetBlobFileReader(read_options, blob_file_number, &reader)
+          .IsMemoryLimit());
  ASSERT_EQ(reader.GetValue(), nullptr);
  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
  ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1);
--- a/db/blob/blob_file_completion_callback.h
+++ b/db/blob/blob_file_completion_callback.h
@ -23,32 +23,19 @@ class BlobFileCompletionCallback {
      const std::vector<std::shared_ptr<EventListener>>& listeners,
      const std::string& dbname)
      : event_logger_(event_logger), listeners_(listeners), dbname_(dbname) {
-#ifndef ROCKSDB_LITE
    sst_file_manager_ = sst_file_manager;
    mutex_ = mutex;
    error_handler_ = error_handler;
-#else
-    (void)sst_file_manager;
-    (void)mutex;
-    (void)error_handler;
-#endif  // ROCKSDB_LITE
  }

  void OnBlobFileCreationStarted(const std::string& file_name,
                                 const std::string& column_family_name,
                                 int job_id,
                                 BlobFileCreationReason creation_reason) {
-#ifndef ROCKSDB_LITE
    // Notify the listeners.
    EventHelpers::NotifyBlobFileCreationStarted(listeners_, dbname_,
                                                column_family_name, file_name,
                                                job_id, creation_reason);
-#else
-    (void)file_name;
-    (void)column_family_name;
-    (void)job_id;
-    (void)creation_reason;
-#endif
  }

  Status OnBlobFileCompleted(const std::string& file_name,
@ -61,7 +48,6 @@ class BlobFileCompletionCallback {
                             uint64_t blob_count, uint64_t blob_bytes) {
    Status s;

-#ifndef ROCKSDB_LITE
    auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager_);
    if (sfm) {
      // Report new blob files to SstFileManagerImpl
@ -74,7 +60,6 @@ class BlobFileCompletionCallback {
        error_handler_->SetBGError(s, BackgroundErrorReason::kFlush);
      }
    }
-#endif  // !ROCKSDB_LITE

    // Notify the listeners.
    EventHelpers::LogAndNotifyBlobFileCreationFinished(
@ -89,11 +74,9 @@ class BlobFileCompletionCallback {
  }

 private:
-#ifndef ROCKSDB_LITE
  SstFileManager* sst_file_manager_;
  InstrumentedMutex* mutex_;
  ErrorHandler* error_handler_;
-#endif  // ROCKSDB_LITE
  EventLogger* event_logger_;
  std::vector<std::shared_ptr<EventListener>> listeners_;
  std::string dbname_;
--- a/db/blob/blob_file_reader.cc
+++ b/db/blob/blob_file_reader.cc
@ -12,7 +12,7 @@
 #include "db/blob/blob_log_format.h"
 #include "file/file_prefetch_buffer.h"
 #include "file/filename.h"
-#include "monitoring/statistics.h"
+#include "monitoring/statistics_impl.h"
 #include "options/cf_options.h"
 #include "rocksdb/file_system.h"
 #include "rocksdb/slice.h"
@ -26,9 +26,10 @@
 namespace ROCKSDB_NAMESPACE {

 Status BlobFileReader::Create(
-    const ImmutableOptions& immutable_options, const FileOptions& file_options,
-    uint32_t column_family_id, HistogramImpl* blob_file_read_hist,
-    uint64_t blob_file_number, const std::shared_ptr<IOTracer>& io_tracer,
+    const ImmutableOptions& immutable_options, const ReadOptions& read_options,
+    const FileOptions& file_options, uint32_t column_family_id,
+    HistogramImpl* blob_file_read_hist, uint64_t blob_file_number,
+    const std::shared_ptr<IOTracer>& io_tracer,
    std::unique_ptr<BlobFileReader>* blob_file_reader) {
  assert(blob_file_reader);
  assert(!*blob_file_reader);
@ -52,15 +53,17 @@ Status BlobFileReader::Create(
  CompressionType compression_type = kNoCompression;

  {
-    const Status s = ReadHeader(file_reader.get(), column_family_id, statistics,
-                                &compression_type);
+    const Status s =
+        ReadHeader(file_reader.get(), read_options, column_family_id,
+                   statistics, &compression_type);
    if (!s.ok()) {
      return s;
    }
  }

  {
-    const Status s = ReadFooter(file_reader.get(), file_size, statistics);
+    const Status s =
+        ReadFooter(file_reader.get(), read_options, file_size, statistics);
    if (!s.ok()) {
      return s;
    }
@ -134,6 +137,7 @@ Status BlobFileReader::OpenFile(
 }

 Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader,
+                                  const ReadOptions& read_options,
                                  uint32_t column_family_id,
                                  Statistics* statistics,
                                  CompressionType* compression_type) {
@ -151,9 +155,10 @@ Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader,
    constexpr size_t read_size = BlobLogHeader::kSize;

    // TODO: rate limit reading headers from blob files.
-    const Status s = ReadFromFile(file_reader, read_offset, read_size,
-                                  statistics, &header_slice, &buf, &aligned_buf,
-                                  Env::IO_TOTAL /* rate_limiter_priority */);
+    const Status s =
+        ReadFromFile(file_reader, read_options, read_offset, read_size,
+                     statistics, &header_slice, &buf, &aligned_buf,
+                     Env::IO_TOTAL /* rate_limiter_priority */);
    if (!s.ok()) {
      return s;
    }
@ -187,6 +192,7 @@ Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader,
 }

 Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader,
+                                  const ReadOptions& read_options,
                                  uint64_t file_size, Statistics* statistics) {
  assert(file_size >= BlobLogHeader::kSize + BlobLogFooter::kSize);
  assert(file_reader);
@ -202,9 +208,10 @@ Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader,
    constexpr size_t read_size = BlobLogFooter::kSize;

    // TODO: rate limit reading footers from blob files.
-    const Status s = ReadFromFile(file_reader, read_offset, read_size,
-                                  statistics, &footer_slice, &buf, &aligned_buf,
-                                  Env::IO_TOTAL /* rate_limiter_priority */);
+    const Status s =
+        ReadFromFile(file_reader, read_options, read_offset, read_size,
+                     statistics, &footer_slice, &buf, &aligned_buf,
+                     Env::IO_TOTAL /* rate_limiter_priority */);
    if (!s.ok()) {
      return s;
    }
@ -232,6 +239,7 @@ Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader,
 }

 Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader,
+                                    const ReadOptions& read_options,
                                    uint64_t read_offset, size_t read_size,
                                    Statistics* statistics, Slice* slice,
                                    Buffer* buf, AlignedBuf* aligned_buf,
@ -246,17 +254,23 @@ Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader,

  Status s;

+  IOOptions io_options;
+  s = file_reader->PrepareIOOptions(read_options, io_options);
+  if (!s.ok()) {
+    return s;
+  }
+
  if (file_reader->use_direct_io()) {
    constexpr char* scratch = nullptr;

-    s = file_reader->Read(IOOptions(), read_offset, read_size, slice, scratch,
+    s = file_reader->Read(io_options, read_offset, read_size, slice, scratch,
                          aligned_buf, rate_limiter_priority);
  } else {
    buf->reset(new char[read_size]);
    constexpr AlignedBuf* aligned_scratch = nullptr;

-    s = file_reader->Read(IOOptions(), read_offset, read_size, slice,
-                          buf->get(), aligned_scratch, rate_limiter_priority);
+    s = file_reader->Read(io_options, read_offset, read_size, slice, buf->get(),
+                          aligned_scratch, rate_limiter_priority);
  }

  if (!s.ok()) {
@ -324,8 +338,13 @@ Status BlobFileReader::GetBlob(
    Status s;
    constexpr bool for_compaction = true;

+    IOOptions io_options;
+    s = file_reader_->PrepareIOOptions(read_options, io_options);
+    if (!s.ok()) {
+      return s;
+    }
    prefetched = prefetch_buffer->TryReadFromCache(
-        IOOptions(), file_reader_.get(), record_offset,
+        io_options, file_reader_.get(), record_offset,
        static_cast<size_t>(record_size), &record_slice, &s,
        read_options.rate_limiter_priority, for_compaction);
    if (!s.ok()) {
@ -338,10 +357,10 @@ Status BlobFileReader::GetBlob(
    PERF_COUNTER_ADD(blob_read_count, 1);
    PERF_COUNTER_ADD(blob_read_byte, record_size);
    PERF_TIMER_GUARD(blob_read_time);
-    const Status s = ReadFromFile(file_reader_.get(), record_offset,
-                                  static_cast<size_t>(record_size), statistics_,
-                                  &record_slice, &buf, &aligned_buf,
-                                  read_options.rate_limiter_priority);
+    const Status s = ReadFromFile(
+        file_reader_.get(), read_options, record_offset,
+        static_cast<size_t>(record_size), statistics_, &record_slice, &buf,
+        &aligned_buf, read_options.rate_limiter_priority);
    if (!s.ok()) {
      return s;
    }
@ -420,11 +439,11 @@ void BlobFileReader::MultiGetBlob(
    assert(req->offset >= adjustment);
    adjustments.push_back(adjustment);

-    FSReadRequest read_req = {};
+    FSReadRequest read_req;
    read_req.offset = req->offset - adjustment;
    read_req.len = req->len + adjustment;
-    read_reqs.emplace_back(read_req);
    total_len += read_req.len;
+    read_reqs.emplace_back(std::move(read_req));
  }

  RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, total_len);
@ -569,12 +588,7 @@ Status BlobFileReader::UncompressBlobIfNeeded(
  assert(result);

  if (compression_type == kNoCompression) {
-    CacheAllocationPtr allocation =
-        AllocateBlock(value_slice.size(), allocator);
-    memcpy(allocation.get(), value_slice.data(), value_slice.size());
-
-    *result = BlobContents::Create(std::move(allocation), value_slice.size());
-
+    BlobContentsCreator::Create(result, nullptr, value_slice, allocator);
    return Status::OK();
  }

@ -602,7 +616,7 @@ Status BlobFileReader::UncompressBlobIfNeeded(
    return Status::Corruption("Unable to uncompress blob");
  }

-  *result = BlobContents::Create(std::move(output), uncompressed_size);
+  result->reset(new BlobContents(std::move(output), uncompressed_size));

  return Status::OK();
 }
--- a/db/blob/blob_file_reader.h
+++ b/db/blob/blob_file_reader.h
@ -29,6 +29,7 @@ class Statistics;
 class BlobFileReader {
 public:
  static Status Create(const ImmutableOptions& immutable_options,
+                       const ReadOptions& read_options,
                       const FileOptions& file_options,
                       uint32_t column_family_id,
                       HistogramImpl* blob_file_read_hist,
@ -74,15 +75,18 @@ class BlobFileReader {
                         std::unique_ptr<RandomAccessFileReader>* file_reader);

  static Status ReadHeader(const RandomAccessFileReader* file_reader,
+                           const ReadOptions& read_options,
                           uint32_t column_family_id, Statistics* statistics,
                           CompressionType* compression_type);

  static Status ReadFooter(const RandomAccessFileReader* file_reader,
-                           uint64_t file_size, Statistics* statistics);
+                           const ReadOptions& read_options, uint64_t file_size,
+                           Statistics* statistics);

  using Buffer = std::unique_ptr<char[]>;

  static Status ReadFromFile(const RandomAccessFileReader* file_reader,
+                             const ReadOptions& read_options,
                             uint64_t read_offset, size_t read_size,
                             Statistics* statistics, Slice* slice, Buffer* buf,
                             AlignedBuf* aligned_buf,
--- a/db/blob/blob_file_reader_test.cc
+++ b/db/blob/blob_file_reader_test.cc
@ -172,12 +172,12 @@ TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) {

  std::unique_ptr<BlobFileReader> reader;

+  ReadOptions read_options;
  ASSERT_OK(BlobFileReader::Create(
-      immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
-      blob_file_number, nullptr /*IOTracer*/, &reader));
+      immutable_options, read_options, FileOptions(), column_family_id,
+      blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader));

  // Make sure the blob can be retrieved with and without checksum verification
-  ReadOptions read_options;
  read_options.verify_checksums = false;

  constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
@ -479,11 +479,11 @@ TEST_F(BlobFileReaderTest, Malformed) {
  constexpr HistogramImpl* blob_file_read_hist = nullptr;

  std::unique_ptr<BlobFileReader> reader;
-
-  ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
-                                     column_family_id, blob_file_read_hist,
-                                     blob_file_number, nullptr /*IOTracer*/,
-                                     &reader)
+  const ReadOptions read_options;
+  ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options,
+                                     FileOptions(), column_family_id,
+                                     blob_file_read_hist, blob_file_number,
+                                     nullptr /*IOTracer*/, &reader)
                  .IsCorruption());
 }

@ -513,11 +513,11 @@ TEST_F(BlobFileReaderTest, TTL) {
  constexpr HistogramImpl* blob_file_read_hist = nullptr;

  std::unique_ptr<BlobFileReader> reader;
-
-  ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
-                                     column_family_id, blob_file_read_hist,
-                                     blob_file_number, nullptr /*IOTracer*/,
-                                     &reader)
+  const ReadOptions read_options;
+  ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options,
+                                     FileOptions(), column_family_id,
+                                     blob_file_read_hist, blob_file_number,
+                                     nullptr /*IOTracer*/, &reader)
                  .IsCorruption());
 }

@ -552,11 +552,11 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInHeader) {
  constexpr HistogramImpl* blob_file_read_hist = nullptr;

  std::unique_ptr<BlobFileReader> reader;
-
-  ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
-                                     column_family_id, blob_file_read_hist,
-                                     blob_file_number, nullptr /*IOTracer*/,
-                                     &reader)
+  const ReadOptions read_options;
+  ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options,
+                                     FileOptions(), column_family_id,
+                                     blob_file_read_hist, blob_file_number,
+                                     nullptr /*IOTracer*/, &reader)
                  .IsCorruption());
 }

@ -591,11 +591,11 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInFooter) {
  constexpr HistogramImpl* blob_file_read_hist = nullptr;

  std::unique_ptr<BlobFileReader> reader;
-
-  ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
-                                     column_family_id, blob_file_read_hist,
-                                     blob_file_number, nullptr /*IOTracer*/,
-                                     &reader)
+  const ReadOptions read_options;
+  ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options,
+                                     FileOptions(), column_family_id,
+                                     blob_file_read_hist, blob_file_number,
+                                     nullptr /*IOTracer*/, &reader)
                  .IsCorruption());
 }

@ -629,9 +629,9 @@ TEST_F(BlobFileReaderTest, IncorrectColumnFamily) {
  std::unique_ptr<BlobFileReader> reader;

  constexpr uint32_t incorrect_column_family_id = 2;
-
-  ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
-                                     incorrect_column_family_id,
+  const ReadOptions read_options;
+  ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options,
+                                     FileOptions(), incorrect_column_family_id,
                                     blob_file_read_hist, blob_file_number,
                                     nullptr /*IOTracer*/, &reader)
                  .IsCorruption());
@ -664,10 +664,10 @@ TEST_F(BlobFileReaderTest, BlobCRCError) {
  constexpr HistogramImpl* blob_file_read_hist = nullptr;

  std::unique_ptr<BlobFileReader> reader;
-
+  const ReadOptions read_options;
  ASSERT_OK(BlobFileReader::Create(
-      immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
-      blob_file_number, nullptr /*IOTracer*/, &reader));
+      immutable_options, read_options, FileOptions(), column_family_id,
+      blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader));

  SyncPoint::GetInstance()->SetCallBack(
      "BlobFileReader::VerifyBlob:CheckBlobCRC", [](void* arg) {
@ -728,13 +728,12 @@ TEST_F(BlobFileReaderTest, Compression) {
  constexpr HistogramImpl* blob_file_read_hist = nullptr;

  std::unique_ptr<BlobFileReader> reader;
-
+  ReadOptions read_options;
  ASSERT_OK(BlobFileReader::Create(
-      immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
-      blob_file_number, nullptr /*IOTracer*/, &reader));
+      immutable_options, read_options, FileOptions(), column_family_id,
+      blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader));

  // Make sure the blob can be retrieved with and without checksum verification
-  ReadOptions read_options;
  read_options.verify_checksums = false;

  constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
@ -803,10 +802,10 @@ TEST_F(BlobFileReaderTest, UncompressionError) {
  constexpr HistogramImpl* blob_file_read_hist = nullptr;

  std::unique_ptr<BlobFileReader> reader;
-
+  const ReadOptions read_options;
  ASSERT_OK(BlobFileReader::Create(
-      immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
-      blob_file_number, nullptr /*IOTracer*/, &reader));
+      immutable_options, read_options, FileOptions(), column_family_id,
+      blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader));

  SyncPoint::GetInstance()->SetCallBack(
      "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", [](void* arg) {
@ -895,10 +894,10 @@ TEST_P(BlobFileReaderIOErrorTest, IOError) {
  constexpr HistogramImpl* blob_file_read_hist = nullptr;

  std::unique_ptr<BlobFileReader> reader;
-
+  const ReadOptions read_options;
  const Status s = BlobFileReader::Create(
-      immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
-      blob_file_number, nullptr /*IOTracer*/, &reader);
+      immutable_options, read_options, FileOptions(), column_family_id,
+      blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader);

  const bool fail_during_create =
      (sync_point_ != "BlobFileReader::GetBlob:ReadFromFile");
@ -983,10 +982,10 @@ TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) {
  constexpr HistogramImpl* blob_file_read_hist = nullptr;

  std::unique_ptr<BlobFileReader> reader;
-
+  const ReadOptions read_options;
  const Status s = BlobFileReader::Create(
-      immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
-      blob_file_number, nullptr /*IOTracer*/, &reader);
+      immutable_options, read_options, FileOptions(), column_family_id,
+      blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader);

  const bool fail_during_create =
      sync_point_ != "BlobFileReader::GetBlob:TamperWithResult";
--- a/db/blob/blob_log_sequential_reader.cc
+++ b/db/blob/blob_log_sequential_reader.cc
@ -7,7 +7,7 @@
 #include "db/blob/blob_log_sequential_reader.h"

 #include "file/random_access_file_reader.h"
-#include "monitoring/statistics.h"
+#include "monitoring/statistics_impl.h"
 #include "util/stop_watch.h"

 namespace ROCKSDB_NAMESPACE {
--- a/db/blob/blob_log_writer.cc
+++ b/db/blob/blob_log_writer.cc
@ -10,7 +10,7 @@

 #include "db/blob/blob_log_format.h"
 #include "file/writable_file_writer.h"
-#include "monitoring/statistics.h"
+#include "monitoring/statistics_impl.h"
 #include "rocksdb/system_clock.h"
 #include "test_util/sync_point.h"
 #include "util/coding.h"
--- a/db/blob/blob_source.cc
+++ b/db/blob/blob_source.cc
@ -13,7 +13,7 @@
 #include "db/blob/blob_contents.h"
 #include "db/blob/blob_file_reader.h"
 #include "db/blob/blob_log_format.h"
-#include "monitoring/statistics.h"
+#include "monitoring/statistics_impl.h"
 #include "options/cf_options.h"
 #include "table/get_context.h"
 #include "table/multiget_context.h"
@ -30,16 +30,14 @@ BlobSource::BlobSource(const ImmutableOptions* immutable_options,
      blob_file_cache_(blob_file_cache),
      blob_cache_(immutable_options->blob_cache),
      lowest_used_cache_tier_(immutable_options->lowest_used_cache_tier) {
-#ifndef ROCKSDB_LITE
  auto bbto =
      immutable_options->table_factory->GetOptions<BlockBasedTableOptions>();
  if (bbto &&
      bbto->cache_usage_options.options_overrides.at(CacheEntryRole::kBlobCache)
              .charged == CacheEntryRoleOptions::Decision::kEnabled) {
-    blob_cache_ = std::make_shared<ChargedCache>(immutable_options->blob_cache,
-                                                 bbto->block_cache);
+    blob_cache_ = SharedCacheInterface{std::make_shared<ChargedCache>(
+        immutable_options->blob_cache, bbto->block_cache)};
  }
-#endif  // ROCKSDB_LITE
 }

 BlobSource::~BlobSource() = default;
@ -82,9 +80,8 @@ Status BlobSource::PutBlobIntoCache(
  assert(cached_blob);
  assert(cached_blob->IsEmpty());

-  Cache::Handle* cache_handle = nullptr;
+  TypedHandle* cache_handle = nullptr;
  const Status s = InsertEntryIntoCache(cache_key, blob->get(),
-                                        (*blob)->ApproximateMemoryUsage(),
                                        &cache_handle, Cache::Priority::BOTTOM);
  if (s.ok()) {
    blob->release();
@ -106,26 +103,10 @@ Status BlobSource::PutBlobIntoCache(
  return s;
 }

-Cache::Handle* BlobSource::GetEntryFromCache(const Slice& key) const {
-  Cache::Handle* cache_handle = nullptr;
-
-  if (lowest_used_cache_tier_ == CacheTier::kNonVolatileBlockTier) {
-    Cache::CreateCallback create_cb =
-        [allocator = blob_cache_->memory_allocator()](
-            const void* buf, size_t size, void** out_obj,
-            size_t* charge) -> Status {
-      return BlobContents::CreateCallback(AllocateBlock(size, allocator), buf,
-                                          size, out_obj, charge);
-    };
-
-    cache_handle = blob_cache_->Lookup(key, BlobContents::GetCacheItemHelper(),
-                                       create_cb, Cache::Priority::BOTTOM,
-                                       true /* wait_for_cache */, statistics_);
-  } else {
-    cache_handle = blob_cache_->Lookup(key, statistics_);
-  }
-
-  return cache_handle;
+BlobSource::TypedHandle* BlobSource::GetEntryFromCache(const Slice& key) const {
+  return blob_cache_.LookupFull(key, nullptr /* context */,
+                                Cache::Priority::BOTTOM, statistics_,
+                                lowest_used_cache_tier_);
 }

 void BlobSource::PinCachedBlob(CacheHandleGuard<BlobContents>* cached_blob,
@ -166,24 +147,11 @@ void BlobSource::PinOwnedBlob(std::unique_ptr<BlobContents>* owned_blob,
 }

 Status BlobSource::InsertEntryIntoCache(const Slice& key, BlobContents* value,
-                                        size_t charge,
-                                        Cache::Handle** cache_handle,
+                                        TypedHandle** cache_handle,
                                        Cache::Priority priority) const {
-  Status s;
-
-  Cache::CacheItemHelper* const cache_item_helper =
-      BlobContents::GetCacheItemHelper();
-  assert(cache_item_helper);
-
-  if (lowest_used_cache_tier_ == CacheTier::kNonVolatileBlockTier) {
-    s = blob_cache_->Insert(key, value, cache_item_helper, charge, cache_handle,
-                            priority);
-  } else {
-    s = blob_cache_->Insert(key, value, charge, cache_item_helper->del_cb,
-                            cache_handle, priority);
-  }
-
-  return s;
+  return blob_cache_.InsertFull(key, value, value->ApproximateMemoryUsage(),
+                                cache_handle, priority,
+                                lowest_used_cache_tier_);
 }

 Status BlobSource::GetBlob(const ReadOptions& read_options,
@ -241,7 +209,8 @@ Status BlobSource::GetBlob(const ReadOptions& read_options,

  {
    CacheHandleGuard<BlobFileReader> blob_file_reader;
-    s = blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader);
+    s = blob_file_cache_->GetBlobFileReader(read_options, file_number,
+                                            &blob_file_reader);
    if (!s.ok()) {
      return s;
    }
@ -252,9 +221,10 @@ Status BlobSource::GetBlob(const ReadOptions& read_options,
      return Status::Corruption("Compression type mismatch when reading blob");
    }

-    MemoryAllocator* const allocator = (blob_cache_ && read_options.fill_cache)
-                                           ? blob_cache_->memory_allocator()
-                                           : nullptr;
+    MemoryAllocator* const allocator =
+        (blob_cache_ && read_options.fill_cache)
+            ? blob_cache_.get()->memory_allocator()
+            : nullptr;

    uint64_t read_size = 0;
    s = blob_file_reader.GetValue()->GetBlob(
@ -403,8 +373,8 @@ void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options,
    }

    CacheHandleGuard<BlobFileReader> blob_file_reader;
-    Status s =
-        blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader);
+    Status s = blob_file_cache_->GetBlobFileReader(read_options, file_number,
+                                                   &blob_file_reader);
    if (!s.ok()) {
      for (size_t i = 0; i < _blob_reqs.size(); ++i) {
        BlobReadRequest* const req = _blob_reqs[i].first;
@ -418,9 +388,10 @@ void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options,

    assert(blob_file_reader.GetValue());

-    MemoryAllocator* const allocator = (blob_cache_ && read_options.fill_cache)
-                                           ? blob_cache_->memory_allocator()
-                                           : nullptr;
+    MemoryAllocator* const allocator =
+        (blob_cache_ && read_options.fill_cache)
+            ? blob_cache_.get()->memory_allocator()
+            : nullptr;

    blob_file_reader.GetValue()->MultiGetBlob(read_options, allocator,
                                              _blob_reqs, &_bytes_read);
--- a/db/blob/blob_source.h
+++ b/db/blob/blob_source.h
@ -8,8 +8,9 @@
 #include <cinttypes>
 #include <memory>

-#include "cache/cache_helpers.h"
 #include "cache/cache_key.h"
+#include "cache/typed_cache.h"
+#include "db/blob/blob_contents.h"
 #include "db/blob/blob_file_cache.h"
 #include "db/blob/blob_read_request.h"
 #include "rocksdb/cache.h"
@ -23,7 +24,6 @@ struct ImmutableOptions;
 class Status;
 class FilePrefetchBuffer;
 class Slice;
-class BlobContents;

 // BlobSource is a class that provides universal access to blobs, regardless of
 // whether they are in the blob cache, secondary cache, or (remote) storage.
@ -95,9 +95,9 @@ class BlobSource {
                               uint64_t* bytes_read);

  inline Status GetBlobFileReader(
-      uint64_t blob_file_number,
+      const ReadOptions& read_options, uint64_t blob_file_number,
      CacheHandleGuard<BlobFileReader>* blob_file_reader) {
-    return blob_file_cache_->GetBlobFileReader(blob_file_number,
+    return blob_file_cache_->GetBlobFileReader(read_options, blob_file_number,
                                               blob_file_reader);
  }

@ -106,6 +106,14 @@ class BlobSource {
  bool TEST_BlobInCache(uint64_t file_number, uint64_t file_size,
                        uint64_t offset, size_t* charge = nullptr) const;

+  // For TypedSharedCacheInterface
+  void Create(BlobContents** out, const char* buf, size_t size,
+              MemoryAllocator* alloc);
+
+  using SharedCacheInterface =
+      FullTypedSharedCacheInterface<BlobContents, BlobContentsCreator>;
+  using TypedHandle = SharedCacheInterface::TypedHandle;
+
 private:
  Status GetBlobFromCache(const Slice& cache_key,
                          CacheHandleGuard<BlobContents>* cached_blob) const;
@ -120,10 +128,10 @@ class BlobSource {
  static void PinOwnedBlob(std::unique_ptr<BlobContents>* owned_blob,
                           PinnableSlice* value);

-  Cache::Handle* GetEntryFromCache(const Slice& key) const;
+  TypedHandle* GetEntryFromCache(const Slice& key) const;

  Status InsertEntryIntoCache(const Slice& key, BlobContents* value,
-                              size_t charge, Cache::Handle** cache_handle,
+                              TypedHandle** cache_handle,
                              Cache::Priority priority) const;

  inline CacheKey GetCacheKey(uint64_t file_number, uint64_t /*file_size*/,
@ -141,7 +149,7 @@ class BlobSource {
  BlobFileCache* blob_file_cache_;

  // A cache to store uncompressed blobs.
-  std::shared_ptr<Cache> blob_cache_;
+  mutable SharedCacheInterface blob_cache_;

  // The control option of how the cache tiers will be used. Currently rocksdb
  // support block/blob cache (volatile tier) and secondary cache (this tier
--- a/db/blob/blob_source_test.cc
+++ b/db/blob/blob_source_test.cc
@ -517,7 +517,8 @@ TEST_F(BlobSourceTest, GetCompressedBlobs) {
                  compression, blob_offsets, blob_sizes);

    CacheHandleGuard<BlobFileReader> blob_file_reader;
-    ASSERT_OK(blob_source.GetBlobFileReader(file_number, &blob_file_reader));
+    ASSERT_OK(blob_source.GetBlobFileReader(read_options, file_number,
+                                            &blob_file_reader));
    ASSERT_NE(blob_file_reader.GetValue(), nullptr);

    const uint64_t file_size = blob_file_reader.GetValue()->GetFileSize();
@ -1139,26 +1140,18 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
                         blob_file_cache.get());

  CacheHandleGuard<BlobFileReader> file_reader;
-  ASSERT_OK(blob_source.GetBlobFileReader(file_number, &file_reader));
+  ReadOptions read_options;
+  ASSERT_OK(
+      blob_source.GetBlobFileReader(read_options, file_number, &file_reader));
  ASSERT_NE(file_reader.GetValue(), nullptr);
  const uint64_t file_size = file_reader.GetValue()->GetFileSize();
  ASSERT_EQ(file_reader.GetValue()->GetCompressionType(), kNoCompression);

-  ReadOptions read_options;
  read_options.verify_checksums = true;

  auto blob_cache = options_.blob_cache;
  auto secondary_cache = lru_cache_opts_.secondary_cache;

-  Cache::CreateCallback create_cb = [](const void* buf, size_t size,
-                                       void** out_obj,
-                                       size_t* charge) -> Status {
-    CacheAllocationPtr allocation(new char[size]);
-
-    return BlobContents::CreateCallback(std::move(allocation), buf, size,
-                                        out_obj, charge);
-  };
-
  {
    // GetBlob
    std::vector<PinnableSlice> values(keys.size());
@ -1219,15 +1212,16 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
    {
      CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[0]);
      const Slice key0 = cache_key.AsSlice();
-      auto handle0 = blob_cache->Lookup(key0, statistics);
+      auto handle0 = blob_cache->BasicLookup(key0, statistics);
      ASSERT_EQ(handle0, nullptr);

      // key0's item should be in the secondary cache.
-      bool is_in_sec_cache = false;
-      auto sec_handle0 =
-          secondary_cache->Lookup(key0, create_cb, true,
-                                  /*advise_erase=*/true, is_in_sec_cache);
-      ASSERT_FALSE(is_in_sec_cache);
+      bool kept_in_sec_cache = false;
+      auto sec_handle0 = secondary_cache->Lookup(
+          key0, BlobSource::SharedCacheInterface::GetFullHelper(),
+          /*context*/ nullptr, true,
+          /*advise_erase=*/true, kept_in_sec_cache);
+      ASSERT_FALSE(kept_in_sec_cache);
      ASSERT_NE(sec_handle0, nullptr);
      ASSERT_TRUE(sec_handle0->IsReady());
      auto value = static_cast<BlobContents*>(sec_handle0->Value());
@ -1246,15 +1240,16 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
    {
      CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[1]);
      const Slice key1 = cache_key.AsSlice();
-      auto handle1 = blob_cache->Lookup(key1, statistics);
+      auto handle1 = blob_cache->BasicLookup(key1, statistics);
      ASSERT_NE(handle1, nullptr);
      blob_cache->Release(handle1);

-      bool is_in_sec_cache = false;
-      auto sec_handle1 =
-          secondary_cache->Lookup(key1, create_cb, true,
-                                  /*advise_erase=*/true, is_in_sec_cache);
-      ASSERT_FALSE(is_in_sec_cache);
+      bool kept_in_sec_cache = false;
+      auto sec_handle1 = secondary_cache->Lookup(
+          key1, BlobSource::SharedCacheInterface::GetFullHelper(),
+          /*context*/ nullptr, true,
+          /*advise_erase=*/true, kept_in_sec_cache);
+      ASSERT_FALSE(kept_in_sec_cache);
      ASSERT_EQ(sec_handle1, nullptr);

      ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
@ -1276,7 +1271,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
      // key0 should be in the primary cache.
      CacheKey cache_key0 = base_cache_key.WithOffset(blob_offsets[0]);
      const Slice key0 = cache_key0.AsSlice();
-      auto handle0 = blob_cache->Lookup(key0, statistics);
+      auto handle0 = blob_cache->BasicLookup(key0, statistics);
      ASSERT_NE(handle0, nullptr);
      auto value = static_cast<BlobContents*>(blob_cache->Value(handle0));
      ASSERT_NE(value, nullptr);
@ -1286,12 +1281,12 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
      // key1 is not in the primary cache and is in the secondary cache.
      CacheKey cache_key1 = base_cache_key.WithOffset(blob_offsets[1]);
      const Slice key1 = cache_key1.AsSlice();
-      auto handle1 = blob_cache->Lookup(key1, statistics);
+      auto handle1 = blob_cache->BasicLookup(key1, statistics);
      ASSERT_EQ(handle1, nullptr);

      // erase key0 from the primary cache.
      blob_cache->Erase(key0);
-      handle0 = blob_cache->Lookup(key0, statistics);
+      handle0 = blob_cache->BasicLookup(key0, statistics);
      ASSERT_EQ(handle0, nullptr);

      // key1 promotion should succeed due to the primary cache being empty. we
@ -1307,7 +1302,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
      // in the secondary cache. So, the primary cache's Lookup() without
      // secondary cache support cannot see it. (NOTE: The dummy handle used
      // to be a leaky abstraction but not anymore.)
-      handle1 = blob_cache->Lookup(key1, statistics);
+      handle1 = blob_cache->BasicLookup(key1, statistics);
      ASSERT_EQ(handle1, nullptr);

      // But after another access, it is promoted to primary cache
@ -1315,7 +1310,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
                                               blob_offsets[1]));

      // And Lookup() can find it (without secondary cache support)
-      handle1 = blob_cache->Lookup(key1, statistics);
+      handle1 = blob_cache->BasicLookup(key1, statistics);
      ASSERT_NE(handle1, nullptr);
      ASSERT_NE(blob_cache->Value(handle1), nullptr);
      blob_cache->Release(handle1);
@ -1379,7 +1374,7 @@ class BlobSourceCacheReservationTest : public DBTestBase {

  static constexpr std::size_t kSizeDummyEntry = CacheReservationManagerImpl<
      CacheEntryRole::kBlobCache>::GetDummyEntrySize();
-  static constexpr std::size_t kCacheCapacity = 1 * kSizeDummyEntry;
+  static constexpr std::size_t kCacheCapacity = 2 * kSizeDummyEntry;
  static constexpr int kNumShardBits = 0;  // 2^0 shard

  static constexpr uint32_t kColumnFamilyId = 1;
@ -1398,7 +1393,6 @@ class BlobSourceCacheReservationTest : public DBTestBase {
  std::string db_session_id_;
 };

-#ifndef ROCKSDB_LITE
 TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) {
  options_.cf_paths.emplace_back(
      test::PerThreadDBPath(
@ -1513,11 +1507,10 @@ TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) {
  }
 }

-TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservationOnFullCache) {
+TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservation) {
  options_.cf_paths.emplace_back(
      test::PerThreadDBPath(
-          env_,
-          "BlobSourceCacheReservationTest_IncreaseCacheReservationOnFullCache"),
+          env_, "BlobSourceCacheReservationTest_IncreaseCacheReservation"),
      0);

  GenerateKeysAndBlobs();
@ -1525,7 +1518,7 @@ TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservationOnFullCache) {
  DestroyAndReopen(options_);

  ImmutableOptions immutable_options(options_);
-  constexpr size_t blob_size = kSizeDummyEntry / (kNumBlobs / 2);
+  constexpr size_t blob_size = 24 << 10;  // 24KB
  for (size_t i = 0; i < kNumBlobs; ++i) {
    blob_file_size_ -= blobs_[i].size();  // old blob size
    blob_strs_[i].resize(blob_size, '@');
@ -1583,11 +1576,6 @@ TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservationOnFullCache) {

    std::vector<PinnableSlice> values(keys_.size());

-    // Since we resized each blob to be kSizeDummyEntry / (num_blobs / 2), we
-    // can't fit all the blobs in the cache at the same time, which means we
-    // should observe cache evictions once we reach the cache's capacity.
-    // Due to the overhead of the cache and the BlobContents objects, as well as
-    // jemalloc bin sizes, this happens after inserting seven blobs.
    uint64_t blob_bytes = 0;
    for (size_t i = 0; i < kNumBlobs; ++i) {
      ASSERT_OK(blob_source.GetBlob(
@ -1598,22 +1586,21 @@ TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservationOnFullCache) {
      // Release cache handle
      values[i].Reset();

-      if (i < kNumBlobs / 2 - 1) {
-        size_t charge = 0;
-        ASSERT_TRUE(blob_source.TEST_BlobInCache(
-            kBlobFileNumber, blob_file_size_, blob_offsets[i], &charge));
+      size_t charge = 0;
+      ASSERT_TRUE(blob_source.TEST_BlobInCache(kBlobFileNumber, blob_file_size_,
+                                               blob_offsets[i], &charge));

-        blob_bytes += charge;
-      }
+      blob_bytes += charge;

-      ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry);
+      ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(),
+                (blob_bytes <= kSizeDummyEntry) ? kSizeDummyEntry
+                                                : (2 * kSizeDummyEntry));
      ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes);
      ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(),
                options_.blob_cache->GetUsage());
    }
  }
 }
-#endif  // ROCKSDB_LITE

 }  // namespace ROCKSDB_NAMESPACE

--- a/db/blob/db_blob_basic_test.cc
+++ b/db/blob/db_blob_basic_test.cc
@ -11,6 +11,7 @@
 #include "db/blob/blob_index.h"
 #include "db/blob/blob_log_format.h"
 #include "db/db_test_util.h"
+#include "db/db_with_timestamp_test_util.h"
 #include "port/stack_trace.h"
 #include "test_util/sync_point.h"
 #include "utilities/fault_injection_env.h"
@ -584,7 +585,6 @@ TEST_F(DBBlobBasicTest, MultiGetBlobsFromCache) {
  }
 }

-#ifndef ROCKSDB_LITE
 TEST_F(DBBlobBasicTest, MultiGetWithDirectIO) {
  Options options = GetDefaultOptions();

@ -773,7 +773,6 @@ TEST_F(DBBlobBasicTest, MultiGetWithDirectIO) {
    ASSERT_EQ(values[2], second_blob);
  }
 }
-#endif  // !ROCKSDB_LITE

 TEST_F(DBBlobBasicTest, MultiGetBlobsFromMultipleFiles) {
  Options options = GetDefaultOptions();
@ -1062,7 +1061,6 @@ TEST_F(DBBlobBasicTest, GetBlob_IndexWithInvalidFileNumber) {
                  .IsCorruption());
 }

-#ifndef ROCKSDB_LITE
 TEST_F(DBBlobBasicTest, GenerateIOTracing) {
  Options options = GetDefaultOptions();
  options.enable_blob_files = true;
@ -1117,7 +1115,6 @@ TEST_F(DBBlobBasicTest, GenerateIOTracing) {
    ASSERT_GT(blob_files_op_count, 2);
  }
 }
-#endif  // !ROCKSDB_LITE

 TEST_F(DBBlobBasicTest, BestEffortsRecovery_MissingNewestBlobFile) {
  Options options = GetDefaultOptions();
@ -1219,7 +1216,6 @@ TEST_F(DBBlobBasicTest, MultiGetMergeBlobWithPut) {
  ASSERT_EQ(values[2], "v2_0");
 }

-#ifndef ROCKSDB_LITE
 TEST_F(DBBlobBasicTest, Properties) {
  Options options = GetDefaultOptions();
  options.enable_blob_files = true;
@ -1382,7 +1378,6 @@ TEST_F(DBBlobBasicTest, PropertiesMultiVersion) {
                 BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) +
                 blob_size + BlobLogFooter::kSize));
 }
-#endif  // !ROCKSDB_LITE

 class DBBlobBasicIOErrorTest : public DBBlobBasicTest,
                               public testing::WithParamInterface<std::string> {
@ -1632,7 +1627,6 @@ TEST_F(DBBlobBasicTest, WarmCacheWithBlobsDuringFlush) {
            options.statistics->getTickerCount(BLOB_DB_CACHE_ADD));
 }

-#ifndef ROCKSDB_LITE
 TEST_F(DBBlobBasicTest, DynamicallyWarmCacheDuringFlush) {
  Options options = GetDefaultOptions();

@ -1700,7 +1694,6 @@ TEST_F(DBBlobBasicTest, DynamicallyWarmCacheDuringFlush) {
                              /*end=*/nullptr));
  EXPECT_EQ(0, options.statistics->getTickerCount(BLOB_DB_CACHE_ADD));
 }
-#endif  // !ROCKSDB_LITE

 TEST_F(DBBlobBasicTest, WarmCacheWithBlobsSecondary) {
  CompressedSecondaryCacheOptions secondary_cache_opts;
@ -1779,6 +1772,461 @@ TEST_F(DBBlobBasicTest, WarmCacheWithBlobsSecondary) {
            1);
 }

+TEST_F(DBBlobBasicTest, GetEntityBlob) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+
+  Reopen(options);
+
+  constexpr char key[] = "key";
+  constexpr char blob_value[] = "blob_value";
+
+  constexpr char other_key[] = "other_key";
+  constexpr char other_blob_value[] = "other_blob_value";
+
+  ASSERT_OK(Put(key, blob_value));
+  ASSERT_OK(Put(other_key, other_blob_value));
+
+  ASSERT_OK(Flush());
+
+  WideColumns expected_columns{{kDefaultWideColumnName, blob_value}};
+  WideColumns other_expected_columns{
+      {kDefaultWideColumnName, other_blob_value}};
+
+  {
+    PinnableWideColumns result;
+    ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), key,
+                             &result));
+    ASSERT_EQ(result.columns(), expected_columns);
+  }
+
+  {
+    PinnableWideColumns result;
+    ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
+                             other_key, &result));
+
+    ASSERT_EQ(result.columns(), other_expected_columns);
+  }
+
+  {
+    constexpr size_t num_keys = 2;
+
+    std::array<Slice, num_keys> keys{{key, other_key}};
+    std::array<PinnableWideColumns, num_keys> results;
+    std::array<Status, num_keys> statuses;
+
+    db_->MultiGetEntity(ReadOptions(), db_->DefaultColumnFamily(), num_keys,
+                        &keys[0], &results[0], &statuses[0]);
+
+    ASSERT_OK(statuses[0]);
+    ASSERT_EQ(results[0].columns(), expected_columns);
+
+    ASSERT_OK(statuses[1]);
+    ASSERT_EQ(results[1].columns(), other_expected_columns);
+  }
+}
+
+class DBBlobWithTimestampTest : public DBBasicTestWithTimestampBase {
+ protected:
+  DBBlobWithTimestampTest()
+      : DBBasicTestWithTimestampBase("db_blob_with_timestamp_test") {}
+};
+
+TEST_F(DBBlobWithTimestampTest, GetBlob) {
+  Options options = GetDefaultOptions();
+  options.create_if_missing = true;
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+
+  DestroyAndReopen(options);
+  WriteOptions write_opts;
+  const std::string ts = Timestamp(1, 0);
+  constexpr char key[] = "key";
+  constexpr char blob_value[] = "blob_value";
+
+  ASSERT_OK(db_->Put(write_opts, key, ts, blob_value));
+
+  ASSERT_OK(Flush());
+
+  const std::string read_ts = Timestamp(2, 0);
+  Slice read_ts_slice(read_ts);
+  ReadOptions read_opts;
+  read_opts.timestamp = &read_ts_slice;
+  std::string value;
+  ASSERT_OK(db_->Get(read_opts, key, &value));
+  ASSERT_EQ(value, blob_value);
+}
+
+TEST_F(DBBlobWithTimestampTest, MultiGetBlobs) {
+  constexpr size_t min_blob_size = 6;
+
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.min_blob_size = min_blob_size;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+
+  DestroyAndReopen(options);
+
+  // Put then retrieve three key-values. The first value is below the size limit
+  // and is thus stored inline; the other two are stored separately as blobs.
+  constexpr size_t num_keys = 3;
+
+  constexpr char first_key[] = "first_key";
+  constexpr char first_value[] = "short";
+  static_assert(sizeof(first_value) - 1 < min_blob_size,
+                "first_value too long to be inlined");
+
+  DestroyAndReopen(options);
+  WriteOptions write_opts;
+  const std::string ts = Timestamp(1, 0);
+  ASSERT_OK(db_->Put(write_opts, first_key, ts, first_value));
+
+  constexpr char second_key[] = "second_key";
+  constexpr char second_value[] = "long_value";
+  static_assert(sizeof(second_value) - 1 >= min_blob_size,
+                "second_value too short to be stored as blob");
+
+  ASSERT_OK(db_->Put(write_opts, second_key, ts, second_value));
+
+  constexpr char third_key[] = "third_key";
+  constexpr char third_value[] = "other_long_value";
+  static_assert(sizeof(third_value) - 1 >= min_blob_size,
+                "third_value too short to be stored as blob");
+
+  ASSERT_OK(db_->Put(write_opts, third_key, ts, third_value));
+
+  ASSERT_OK(Flush());
+
+  ReadOptions read_options;
+  const std::string read_ts = Timestamp(2, 0);
+  Slice read_ts_slice(read_ts);
+  read_options.timestamp = &read_ts_slice;
+  std::array<Slice, num_keys> keys{{first_key, second_key, third_key}};
+
+  {
+    std::array<PinnableSlice, num_keys> values;
+    std::array<Status, num_keys> statuses;
+
+    db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
+                  &values[0], &statuses[0]);
+
+    ASSERT_OK(statuses[0]);
+    ASSERT_EQ(values[0], first_value);
+
+    ASSERT_OK(statuses[1]);
+    ASSERT_EQ(values[1], second_value);
+
+    ASSERT_OK(statuses[2]);
+    ASSERT_EQ(values[2], third_value);
+  }
+}
+
+TEST_F(DBBlobWithTimestampTest, GetMergeBlobWithPut) {
+  Options options = GetDefaultOptions();
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+
+  DestroyAndReopen(options);
+
+  WriteOptions write_opts;
+  const std::string ts = Timestamp(1, 0);
+  ASSERT_OK(db_->Put(write_opts, "Key1", ts, "v1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(
+      db_->Merge(write_opts, db_->DefaultColumnFamily(), "Key1", ts, "v2"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(
+      db_->Merge(write_opts, db_->DefaultColumnFamily(), "Key1", ts, "v3"));
+  ASSERT_OK(Flush());
+
+  std::string value;
+  const std::string read_ts = Timestamp(2, 0);
+  Slice read_ts_slice(read_ts);
+  ReadOptions read_opts;
+  read_opts.timestamp = &read_ts_slice;
+  ASSERT_OK(db_->Get(read_opts, "Key1", &value));
+  ASSERT_EQ(value, "v1,v2,v3");
+}
+
+TEST_F(DBBlobWithTimestampTest, MultiGetMergeBlobWithPut) {
+  constexpr size_t num_keys = 3;
+
+  Options options = GetDefaultOptions();
+  options.merge_operator = MergeOperators::CreateStringAppendOperator();
+  options.enable_blob_files = true;
+  options.min_blob_size = 0;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+
+  DestroyAndReopen(options);
+
+  WriteOptions write_opts;
+  const std::string ts = Timestamp(1, 0);
+
+  ASSERT_OK(db_->Put(write_opts, "Key0", ts, "v0_0"));
+  ASSERT_OK(db_->Put(write_opts, "Key1", ts, "v1_0"));
+  ASSERT_OK(db_->Put(write_opts, "Key2", ts, "v2_0"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(
+      db_->Merge(write_opts, db_->DefaultColumnFamily(), "Key0", ts, "v0_1"));
+  ASSERT_OK(
+      db_->Merge(write_opts, db_->DefaultColumnFamily(), "Key1", ts, "v1_1"));
+  ASSERT_OK(Flush());
+  ASSERT_OK(
+      db_->Merge(write_opts, db_->DefaultColumnFamily(), "Key0", ts, "v0_2"));
+  ASSERT_OK(Flush());
+
+  const std::string read_ts = Timestamp(2, 0);
+  Slice read_ts_slice(read_ts);
+  ReadOptions read_opts;
+  read_opts.timestamp = &read_ts_slice;
+  std::array<Slice, num_keys> keys{{"Key0", "Key1", "Key2"}};
+  std::array<PinnableSlice, num_keys> values;
+  std::array<Status, num_keys> statuses;
+
+  db_->MultiGet(read_opts, db_->DefaultColumnFamily(), num_keys, &keys[0],
+                &values[0], &statuses[0]);
+
+  ASSERT_OK(statuses[0]);
+  ASSERT_EQ(values[0], "v0_0,v0_1,v0_2");
+
+  ASSERT_OK(statuses[1]);
+  ASSERT_EQ(values[1], "v1_0,v1_1");
+
+  ASSERT_OK(statuses[2]);
+  ASSERT_EQ(values[2], "v2_0");
+}
+
+TEST_F(DBBlobWithTimestampTest, IterateBlobs) {
+  Options options = GetDefaultOptions();
+  options.enable_blob_files = true;
+  options.create_if_missing = true;
+  const size_t kTimestampSize = Timestamp(0, 0).size();
+  TestComparator test_cmp(kTimestampSize);
+  options.comparator = &test_cmp;
+
+  DestroyAndReopen(options);
+
+  int num_blobs = 5;
+  std::vector<std::string> keys;
+  std::vector<std::string> blobs;
+
+  WriteOptions write_opts;
+  std::vector<std::string> write_timestamps = {Timestamp(1, 0),
+                                               Timestamp(2, 0)};
+
+  // For each key in ["key0", ... "keyi", ...], write two versions:
+  // Timestamp(1, 0), "blobi0"
+  // Timestamp(2, 0), "blobi1"
+  for (int i = 0; i < num_blobs; i++) {
+    keys.push_back("key" + std::to_string(i));
+    blobs.push_back("blob" + std::to_string(i));
+    for (size_t j = 0; j < write_timestamps.size(); j++) {
+      ASSERT_OK(db_->Put(write_opts, keys[i], write_timestamps[j],
+                         blobs[i] + std::to_string(j)));
+    }
+  }
+  ASSERT_OK(Flush());
+
+  ReadOptions read_options;
+  std::vector<std::string> read_timestamps = {Timestamp(0, 0), Timestamp(3, 0)};
+  Slice ts_upper_bound(read_timestamps[1]);
+  read_options.timestamp = &ts_upper_bound;
+
+  auto check_iter_entry =
+      [](const Iterator* iter, const std::string& expected_key,
+         const std::string& expected_ts, const std::string& expected_value,
+         bool key_is_internal = true) {
+        ASSERT_OK(iter->status());
+        if (key_is_internal) {
+          std::string expected_ukey_and_ts;
+          expected_ukey_and_ts.assign(expected_key.data(), expected_key.size());
+          expected_ukey_and_ts.append(expected_ts.data(), expected_ts.size());
+
+          ParsedInternalKey parsed_ikey;
+          ASSERT_OK(ParseInternalKey(iter->key(), &parsed_ikey,
+                                     true /* log_err_key */));
+          ASSERT_EQ(parsed_ikey.user_key, expected_ukey_and_ts);
+        } else {
+          ASSERT_EQ(iter->key(), expected_key);
+        }
+        ASSERT_EQ(iter->timestamp(), expected_ts);
+        ASSERT_EQ(iter->value(), expected_value);
+      };
+
+  // Forward iterating one version of each key, get in this order:
+  // [("key0", Timestamp(2, 0), "blob01"),
+  //  ("key1", Timestamp(2, 0), "blob11")...]
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_OK(iter->status());
+
+    iter->SeekToFirst();
+    for (int i = 0; i < num_blobs; i++) {
+      check_iter_entry(iter.get(), keys[i], write_timestamps[1],
+                       blobs[i] + std::to_string(1), /*key_is_internal*/ false);
+      iter->Next();
+    }
+  }
+
+  // Forward iteration, then reverse to backward.
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_OK(iter->status());
+
+    iter->SeekToFirst();
+    for (int i = 0; i < num_blobs * 2 - 1; i++) {
+      if (i < num_blobs) {
+        check_iter_entry(iter.get(), keys[i], write_timestamps[1],
+                         blobs[i] + std::to_string(1),
+                         /*key_is_internal*/ false);
+        if (i != num_blobs - 1) {
+          iter->Next();
+        }
+      } else {
+        if (i != num_blobs) {
+          check_iter_entry(iter.get(), keys[num_blobs * 2 - 1 - i],
+                           write_timestamps[1],
+                           blobs[num_blobs * 2 - 1 - i] + std::to_string(1),
+                           /*key_is_internal*/ false);
+        }
+        iter->Prev();
+      }
+    }
+  }
+
+  // Backward iterating one version of each key, get in this order:
+  // [("key4", Timestamp(2, 0), "blob41"),
+  //  ("key3", Timestamp(2, 0), "blob31")...]
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_OK(iter->status());
+
+    iter->SeekToLast();
+    for (int i = 0; i < num_blobs; i++) {
+      check_iter_entry(iter.get(), keys[num_blobs - 1 - i], write_timestamps[1],
+                       blobs[num_blobs - 1 - i] + std::to_string(1),
+                       /*key_is_internal*/ false);
+      iter->Prev();
+    }
+  }
+
+  // Backward iteration, then reverse to forward.
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_OK(iter->status());
+
+    iter->SeekToLast();
+    for (int i = 0; i < num_blobs * 2 - 1; i++) {
+      if (i < num_blobs) {
+        check_iter_entry(iter.get(), keys[num_blobs - 1 - i],
+                         write_timestamps[1],
+                         blobs[num_blobs - 1 - i] + std::to_string(1),
+                         /*key_is_internal*/ false);
+        if (i != num_blobs - 1) {
+          iter->Prev();
+        }
+      } else {
+        if (i != num_blobs) {
+          check_iter_entry(iter.get(), keys[i - num_blobs], write_timestamps[1],
+                           blobs[i - num_blobs] + std::to_string(1),
+                           /*key_is_internal*/ false);
+        }
+        iter->Next();
+      }
+    }
+  }
+
+  Slice ts_lower_bound(read_timestamps[0]);
+  read_options.iter_start_ts = &ts_lower_bound;
+  // Forward iterating multiple versions of the same key, get in this order:
+  // [("key0", Timestamp(2, 0), "blob01"),
+  //  ("key0", Timestamp(1, 0), "blob00"),
+  //  ("key1", Timestamp(2, 0), "blob11")...]
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_OK(iter->status());
+
+    iter->SeekToFirst();
+    for (int i = 0; i < num_blobs; i++) {
+      for (size_t j = write_timestamps.size(); j > 0; --j) {
+        check_iter_entry(iter.get(), keys[i], write_timestamps[j - 1],
+                         blobs[i] + std::to_string(j - 1));
+        iter->Next();
+      }
+    }
+  }
+
+  // Backward iterating multiple versions of the same key, get in this order:
+  // [("key4", Timestamp(1, 0), "blob40"),
+  //  ("key4", Timestamp(2, 0), "blob41"),
+  //  ("key3", Timestamp(1, 0), "blob30")...]
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_OK(iter->status());
+
+    iter->SeekToLast();
+    for (int i = num_blobs; i > 0; i--) {
+      for (size_t j = 0; j < write_timestamps.size(); j++) {
+        check_iter_entry(iter.get(), keys[i - 1], write_timestamps[j],
+                         blobs[i - 1] + std::to_string(j));
+        iter->Prev();
+      }
+    }
+  }
+
+  int upper_bound_idx = num_blobs - 2;
+  int lower_bound_idx = 1;
+  Slice upper_bound_slice(keys[upper_bound_idx]);
+  Slice lower_bound_slice(keys[lower_bound_idx]);
+  read_options.iterate_upper_bound = &upper_bound_slice;
+  read_options.iterate_lower_bound = &lower_bound_slice;
+
+  // Forward iteration with upper and lower bound.
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_OK(iter->status());
+
+    iter->SeekToFirst();
+    for (int i = lower_bound_idx; i < upper_bound_idx; i++) {
+      for (size_t j = write_timestamps.size(); j > 0; --j) {
+        check_iter_entry(iter.get(), keys[i], write_timestamps[j - 1],
+                         blobs[i] + std::to_string(j - 1));
+        iter->Next();
+      }
+    }
+  }
+
+  // Backward iteration with upper and lower bound.
+  {
+    std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
+    ASSERT_OK(iter->status());
+
+    iter->SeekToLast();
+    for (int i = upper_bound_idx; i > lower_bound_idx; i--) {
+      for (size_t j = 0; j < write_timestamps.size(); j++) {
+        check_iter_entry(iter.get(), keys[i - 1], write_timestamps[j],
+                         blobs[i - 1] + std::to_string(j));
+        iter->Prev();
+      }
+    }
+  }
+}
+
 }  // namespace ROCKSDB_NAMESPACE

 int main(int argc, char** argv) {
--- a/db/blob/db_blob_compaction_test.cc
+++ b/db/blob/db_blob_compaction_test.cc
@ -16,7 +16,6 @@ class DBBlobCompactionTest : public DBTestBase {
  explicit DBBlobCompactionTest()
      : DBTestBase("db_blob_compaction_test", /*env_do_fsync=*/false) {}

-#ifndef ROCKSDB_LITE
  const std::vector<InternalStats::CompactionStats>& GetCompactionStats() {
    VersionSet* const versions = dbfull()->GetVersionSet();
    assert(versions);
@ -30,7 +29,6 @@ class DBBlobCompactionTest : public DBTestBase {

    return internal_stats->TEST_GetCompactionStats();
  }
-#endif  // ROCKSDB_LITE
 };

 namespace {
@ -250,7 +248,6 @@ TEST_F(DBBlobCompactionTest, FilterByKeyLength) {
  ASSERT_OK(db_->Get(ReadOptions(), long_key, &value));
  ASSERT_EQ("value", value);

-#ifndef ROCKSDB_LITE
  const auto& compaction_stats = GetCompactionStats();
  ASSERT_GE(compaction_stats.size(), 2);

@ -258,7 +255,6 @@ TEST_F(DBBlobCompactionTest, FilterByKeyLength) {
  // this involves neither reading nor writing blobs
  ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
  ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
-#endif  // ROCKSDB_LITE

  Close();
 }
@ -299,7 +295,6 @@ TEST_F(DBBlobCompactionTest, FilterByValueLength) {
    ASSERT_EQ(long_value, value);
  }

-#ifndef ROCKSDB_LITE
  const auto& compaction_stats = GetCompactionStats();
  ASSERT_GE(compaction_stats.size(), 2);

@ -307,12 +302,10 @@ TEST_F(DBBlobCompactionTest, FilterByValueLength) {
  // this involves reading but not writing blobs
  ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
  ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
-#endif  // ROCKSDB_LITE

  Close();
 }

-#ifndef ROCKSDB_LITE
 TEST_F(DBBlobCompactionTest, BlobCompactWithStartingLevel) {
  Options options = GetDefaultOptions();

@ -388,7 +381,6 @@ TEST_F(DBBlobCompactionTest, BlobCompactWithStartingLevel) {

  Close();
 }
-#endif

 TEST_F(DBBlobCompactionTest, BlindWriteFilter) {
  Options options = GetDefaultOptions();
@ -413,7 +405,6 @@ TEST_F(DBBlobCompactionTest, BlindWriteFilter) {
    ASSERT_EQ(new_blob_value, Get(key));
  }

-#ifndef ROCKSDB_LITE
  const auto& compaction_stats = GetCompactionStats();
  ASSERT_GE(compaction_stats.size(), 2);

@ -421,7 +412,6 @@ TEST_F(DBBlobCompactionTest, BlindWriteFilter) {
  // this involves writing but not reading blobs
  ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
  ASSERT_GT(compaction_stats[1].bytes_written_blob, 0);
-#endif  // ROCKSDB_LITE

  Close();
 }
@ -540,7 +530,6 @@ TEST_F(DBBlobCompactionTest, CompactionFilter) {
    ASSERT_EQ(kv.second + std::string(padding), Get(kv.first));
  }

-#ifndef ROCKSDB_LITE
  const auto& compaction_stats = GetCompactionStats();
  ASSERT_GE(compaction_stats.size(), 2);

@ -548,7 +537,6 @@ TEST_F(DBBlobCompactionTest, CompactionFilter) {
  // this involves reading and writing blobs
  ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
  ASSERT_GT(compaction_stats[1].bytes_written_blob, 0);
-#endif  // ROCKSDB_LITE

  Close();
 }
@ -606,7 +594,6 @@ TEST_F(DBBlobCompactionTest, CompactionFilterReadBlobAndKeep) {
                              /*end=*/nullptr));
  ASSERT_EQ(blob_files, GetBlobFileNumbers());

-#ifndef ROCKSDB_LITE
  const auto& compaction_stats = GetCompactionStats();
  ASSERT_GE(compaction_stats.size(), 2);

@ -614,7 +601,6 @@ TEST_F(DBBlobCompactionTest, CompactionFilterReadBlobAndKeep) {
  // this involves reading but not writing blobs
  ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
  ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
-#endif  // ROCKSDB_LITE

  Close();
 }
--- a/db/blob/db_blob_corruption_test.cc
+++ b/db/blob/db_blob_corruption_test.cc
@ -34,7 +34,6 @@ class DBBlobCorruptionTest : public DBTestBase {
  }
 };

-#ifndef ROCKSDB_LITE
 TEST_F(DBBlobCorruptionTest, VerifyWholeBlobFileChecksum) {
  Options options = GetDefaultOptions();
  options.enable_blob_files = true;
@ -71,7 +70,6 @@ TEST_F(DBBlobCorruptionTest, VerifyWholeBlobFileChecksum) {
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
 }
-#endif  // !ROCKSDB_LITE
 }  // namespace ROCKSDB_NAMESPACE

 int main(int argc, char** argv) {
--- a/db/blob/db_blob_index_test.cc
+++ b/db/blob/db_blob_index_test.cc
@ -131,9 +131,7 @@ class DBBlobIndexTest : public DBTestBase {
        ASSERT_OK(Flush());
        ASSERT_OK(
            dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
-#ifndef ROCKSDB_LITE
        ASSERT_EQ("0,1", FilesPerLevel());
-#endif  // !ROCKSDB_LITE
        break;
    }
  }
@ -459,7 +457,6 @@ TEST_F(DBBlobIndexTest, Iterate) {
    verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
           create_blob_iterator, check_is_blob(false));

-#ifndef ROCKSDB_LITE
    // Iterator with blob support and using seek.
    ASSERT_OK(dbfull()->SetOptions(
        cfh(), {{"max_sequential_skip_in_iterations", "0"}}));
@ -484,7 +481,6 @@ TEST_F(DBBlobIndexTest, Iterate) {
           create_blob_iterator, check_is_blob(false));
    verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
           create_blob_iterator, check_is_blob(false));
-#endif  // !ROCKSDB_LITE

    for (auto* snapshot : snapshots) {
      dbfull()->ReleaseSnapshot(snapshot);
@ -584,12 +580,10 @@ TEST_F(DBBlobIndexTest, IntegratedBlobIterate) {
  Status expected_status;
  verify(1, expected_status, expected_value);

-#ifndef ROCKSDB_LITE
  // Test DBIter::FindValueForCurrentKeyUsingSeek flow.
  ASSERT_OK(dbfull()->SetOptions(cfh(),
                                 {{"max_sequential_skip_in_iterations", "0"}}));
  verify(1, expected_status, expected_value);
-#endif  // !ROCKSDB_LITE
 }

 }  // namespace ROCKSDB_NAMESPACE
--- a/db/builder.cc
+++ b/db/builder.cc
@ -15,6 +15,7 @@

 #include "db/blob/blob_file_builder.h"
 #include "db/compaction/compaction_iterator.h"
+#include "db/dbformat.h"
 #include "db/event_helpers.h"
 #include "db/internal_stats.h"
 #include "db/merge_helper.h"
@ -56,8 +57,8 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
 Status BuildTable(
    const std::string& dbname, VersionSet* versions,
    const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
-    const FileOptions& file_options, TableCache* table_cache,
-    InternalIterator* iter,
+    const FileOptions& file_options, const ReadOptions& read_options,
+    TableCache* table_cache, InternalIterator* iter,
    std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
        range_del_iters,
    FileMetaData* meta, std::vector<BlobFileAddition>* blob_file_additions,
@ -107,11 +108,9 @@ Status BuildTable(
  std::vector<std::string> blob_file_paths;
  std::string file_checksum = kUnknownFileChecksum;
  std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
-#ifndef ROCKSDB_LITE
  EventHelpers::NotifyTableFileCreationStarted(ioptions.listeners, dbname,
                                               tboptions.column_family_name,
                                               fname, job_id, tboptions.reason);
-#endif  // !ROCKSDB_LITE
  Env* env = db_options.env;
  assert(env);
  FileSystem* fs = db_options.fs.get();
@ -176,10 +175,10 @@ Status BuildTable(
      builder = NewTableBuilder(tboptions, file_writer.get());
    }

+    auto ucmp = tboptions.internal_comparator.user_comparator();
    MergeHelper merge(
-        env, tboptions.internal_comparator.user_comparator(),
-        ioptions.merge_operator.get(), compaction_filter.get(), ioptions.logger,
-        true /* internal key corruption is not ok */,
+        env, ucmp, ioptions.merge_operator.get(), compaction_filter.get(),
+        ioptions.logger, true /* internal key corruption is not ok */,
        snapshots.empty() ? 0 : snapshots.back(), snapshot_checker);

    std::unique_ptr<BlobFileBuilder> blob_file_builder(
@ -197,32 +196,49 @@ Status BuildTable(

    const std::atomic<bool> kManualCompactionCanceledFalse{false};
    CompactionIterator c_iter(
-        iter, tboptions.internal_comparator.user_comparator(), &merge,
-        kMaxSequenceNumber, &snapshots, earliest_write_conflict_snapshot,
-        job_snapshot, snapshot_checker, env,
+        iter, ucmp, &merge, kMaxSequenceNumber, &snapshots,
+        earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env,
        ShouldReportDetailedTime(env, ioptions.stats),
        true /* internal key corruption is not ok */, range_del_agg.get(),
        blob_file_builder.get(), ioptions.allow_data_in_errors,
        ioptions.enforce_single_del_contracts,
        /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
+        true /* must_count_input_entries */,
        /*compaction=*/nullptr, compaction_filter.get(),
        /*shutting_down=*/nullptr, db_options.info_log, full_history_ts_low);

+    const size_t ts_sz = ucmp->timestamp_size();
+    const bool strip_timestamp =
+        ts_sz > 0 && !ioptions.persist_user_defined_timestamps;
+
+    std::string key_after_flush_buf;
    c_iter.SeekToFirst();
    for (; c_iter.Valid(); c_iter.Next()) {
      const Slice& key = c_iter.key();
      const Slice& value = c_iter.value();
      const ParsedInternalKey& ikey = c_iter.ikey();
-      // Generate a rolling 64-bit hash of the key and values
-      // Note :
-      // Here "key" integrates 'sequence_number'+'kType'+'user key'.
-      s = output_validator.Add(key, value);
+      Slice key_after_flush = key;
+      // If user-defined timestamps will be stripped from the user key after
+      // flush, the in-memory version of the key acts logically the same as one
+      // with a minimum timestamp. We update the timestamp here so the file
+      // boundaries, output validator, and block builder all see the effect of
+      // the stripping.
+      if (strip_timestamp) {
+        key_after_flush_buf.clear();
+        ReplaceInternalKeyWithMinTimestamp(&key_after_flush_buf, key, ts_sz);
+        key_after_flush = key_after_flush_buf;
+      }
+
+      //  Generate a rolling 64-bit hash of the key and values
+      //  Note :
+      //  Here "key" integrates 'sequence_number'+'kType'+'user key'.
+      s = output_validator.Add(key_after_flush, value);
      if (!s.ok()) {
        break;
      }
-      builder->Add(key, value);
+      builder->Add(key_after_flush, value);

-      s = meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type);
+      s = meta->UpdateBoundaries(key_after_flush, value, ikey.sequence,
+                                 ikey.type);
      if (!s.ok()) {
        break;
      }
@ -242,21 +258,28 @@ Status BuildTable(

    if (s.ok()) {
      auto range_del_it = range_del_agg->NewIterator();
+      Slice last_tombstone_start_user_key{};
      for (range_del_it->SeekToFirst(); range_del_it->Valid();
           range_del_it->Next()) {
        auto tombstone = range_del_it->Tombstone();
        auto kv = tombstone.Serialize();
+        // TODO(yuzhangyu): handle range deletion for UDT in memtables only.
        builder->Add(kv.first.Encode(), kv.second);
        InternalKey tombstone_end = tombstone.SerializeEndKey();
        meta->UpdateBoundariesForRange(kv.first, tombstone_end, tombstone.seq_,
                                       tboptions.internal_comparator);
        if (version) {
-          SizeApproximationOptions approx_opts;
-          approx_opts.files_size_error_margin = 0.1;
-          meta->compensated_range_deletion_size += versions->ApproximateSize(
-              approx_opts, version, kv.first.Encode(), tombstone_end.Encode(),
-              0 /* start_level */, -1 /* end_level */,
-              TableReaderCaller::kFlush);
+          if (last_tombstone_start_user_key.empty() ||
+              ucmp->CompareWithoutTimestamp(last_tombstone_start_user_key,
+                                            range_del_it->start_key()) < 0) {
+            SizeApproximationOptions approx_opts;
+            approx_opts.files_size_error_margin = 0.1;
+            meta->compensated_range_deletion_size += versions->ApproximateSize(
+                approx_opts, read_options, version, kv.first.Encode(),
+                tombstone_end.Encode(), 0 /* start_level */, -1 /* end_level */,
+                TableReaderCaller::kFlush);
+          }
+          last_tombstone_start_user_key = range_del_it->start_key();
        }
      }
    }
@ -264,8 +287,9 @@ Status BuildTable(
    TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable");
    const bool empty = builder->IsEmpty();
    if (num_input_entries != nullptr) {
+      assert(c_iter.HasNumInputEntryScanned());
      *num_input_entries =
-          c_iter.num_input_entry_scanned() + num_unfragmented_tombstones;
+          c_iter.NumInputEntryScanned() + num_unfragmented_tombstones;
    }
    if (!s.ok() || empty) {
      builder->Abandon();
@ -288,7 +312,10 @@ Status BuildTable(
    if (s.ok() && !empty) {
      uint64_t file_size = builder->FileSize();
      meta->fd.file_size = file_size;
+      meta->tail_size = builder->GetTailSize();
      meta->marked_for_compaction = builder->NeedCompact();
+      meta->user_defined_timestamps_persisted =
+          ioptions.persist_user_defined_timestamps;
      assert(meta->fd.GetFileSize() > 0);
      tp = builder
               ->GetTableProperties();  // refresh now that builder is finished
@ -348,6 +375,8 @@ Status BuildTable(
      s = *io_status;
    }

+    // TODO(yuzhangyu): handle the key copy in the blob when ts should be
+    // stripped.
    if (blob_file_builder) {
      if (s.ok()) {
        s = blob_file_builder->Finish();
@ -366,7 +395,6 @@ Status BuildTable(
      // here because this is a special case after we finish the table building.
      // No matter whether use_direct_io_for_flush_and_compaction is true,
      // the goal is to cache it here for further user reads.
-      ReadOptions read_options;
      std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
          read_options, file_options, tboptions.internal_comparator, *meta,
          nullptr /* range_del_agg */, mutable_cf_options.prefix_extractor,
@ -378,7 +406,8 @@ Status BuildTable(
          MaxFileSizeForL0MetaPin(mutable_cf_options),
          /*smallest_compaction_key=*/nullptr,
          /*largest_compaction_key*/ nullptr,
-          /*allow_unprepared_value*/ false));
+          /*allow_unprepared_value*/ false,
+          mutable_cf_options.block_protection_bytes_per_key));
      s = it->status();
      if (s.ok() && paranoid_file_checks) {
        OutputValidator file_validator(tboptions.internal_comparator,
--- a/db/builder.h
+++ b/db/builder.h
@ -53,8 +53,8 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
 extern Status BuildTable(
    const std::string& dbname, VersionSet* versions,
    const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
-    const FileOptions& file_options, TableCache* table_cache,
-    InternalIterator* iter,
+    const FileOptions& file_options, const ReadOptions& read_options,
+    TableCache* table_cache, InternalIterator* iter,
    std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
        range_del_iters,
    FileMetaData* meta, std::vector<BlobFileAddition>* blob_file_additions,
--- a/db/c.cc
+++ b/db/c.cc
@ -7,8 +7,6 @@
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

-#ifndef ROCKSDB_LITE
-
 #include "rocksdb/c.h"

 #include <cstdlib>
@ -17,7 +15,7 @@
 #include <vector>

 #include "port/port.h"
-#include "rocksdb/cache.h"
+#include "rocksdb/advanced_cache.h"
 #include "rocksdb/compaction_filter.h"
 #include "rocksdb/comparator.h"
 #include "rocksdb/convenience.h"
@ -48,6 +46,7 @@
 #include "rocksdb/utilities/write_batch_with_index.h"
 #include "rocksdb/write_batch.h"
 #include "utilities/merge_operators.h"
+#include "rocksdb/env_encryption.h"

 using ROCKSDB_NAMESPACE::BackupEngine;
 using ROCKSDB_NAMESPACE::BackupEngineOptions;
@ -69,6 +68,7 @@ using ROCKSDB_NAMESPACE::CompactionOptionsFIFO;
 using ROCKSDB_NAMESPACE::CompactRangeOptions;
 using ROCKSDB_NAMESPACE::Comparator;
 using ROCKSDB_NAMESPACE::CompressionType;
+using ROCKSDB_NAMESPACE::ConfigOptions;
 using ROCKSDB_NAMESPACE::CuckooTableOptions;
 using ROCKSDB_NAMESPACE::DB;
 using ROCKSDB_NAMESPACE::DBOptions;
@ -78,6 +78,8 @@ using ROCKSDB_NAMESPACE::EnvOptions;
 using ROCKSDB_NAMESPACE::FileLock;
 using ROCKSDB_NAMESPACE::FilterPolicy;
 using ROCKSDB_NAMESPACE::FlushOptions;
+using ROCKSDB_NAMESPACE::HistogramData;
+using ROCKSDB_NAMESPACE::HyperClockCacheOptions;
 using ROCKSDB_NAMESPACE::InfoLogLevel;
 using ROCKSDB_NAMESPACE::IngestExternalFileOptions;
 using ROCKSDB_NAMESPACE::Iterator;
@ -208,6 +210,9 @@ struct rocksdb_logger_t {
 struct rocksdb_lru_cache_options_t {
  LRUCacheOptions rep;
 };
+// C-API wrapper that holds a HyperClockCacheOptions value in `rep`.
+struct rocksdb_hyper_clock_cache_options_t {
+  HyperClockCacheOptions rep;
+};
 struct rocksdb_memory_allocator_t {
  std::shared_ptr<MemoryAllocator> rep;
 };
@ -276,6 +281,11 @@ struct rocksdb_compactionfiltercontext_t {
  CompactionFilter::Context rep;
 };

+// C-API wrapper that holds a HistogramData value in `rep`.
+struct rocksdb_statistics_histogram_data_t {
+  // Value-initializes the wrapped HistogramData.
+  rocksdb_statistics_histogram_data_t() : rep() {}
+  HistogramData rep;
+};
+
 struct rocksdb_compactionfilter_t : public CompactionFilter {
  void* state_;
  void (*destructor_)(void*);
@ -1054,6 +1064,36 @@ rocksdb_column_family_handle_t* rocksdb_create_column_family(
  return handle;
 }

+// Creates `num_column_families` column families that all use
+// `column_family_options`, named by the C strings in `column_family_names`.
+//
+// Returns a malloc()-allocated array of newly allocated handle wrappers and
+// stores its length in `*lencfs`. The length is the number of handles that
+// CreateColumnFamilies() actually produced, not `num_column_families`. The
+// status of that call is reported through `errptr` via SaveError().
+rocksdb_column_family_handle_t** rocksdb_create_column_families(
+    rocksdb_t* db, const rocksdb_options_t* column_family_options,
+    int num_column_families, const char* const* column_family_names,
+    size_t* lencfs, char** errptr) {
+  std::vector<ColumnFamilyHandle*> handles;
+  std::vector<std::string> names;
+  if (num_column_families > 0) {
+    names.reserve(static_cast<size_t>(num_column_families));
+  }
+  // Use `<` (as rocksdb_flush_cfs does) so that a negative count is treated
+  // as empty instead of reading past the end of `column_family_names`.
+  for (int i = 0; i < num_column_families; ++i) {
+    names.emplace_back(column_family_names[i]);
+  }
+  SaveError(errptr, db->rep->CreateColumnFamilies(
+                        ColumnFamilyOptions(column_family_options->rep), names,
+                        &handles));
+
+  const size_t num_handles = handles.size();
+  *lencfs = num_handles;
+  rocksdb_column_family_handle_t** c_handles =
+      static_cast<rocksdb_column_family_handle_t**>(
+          malloc(sizeof(rocksdb_column_family_handle_t*) * num_handles));
+  // Wrap every C++ handle in its own heap-allocated C handle.
+  for (size_t i = 0; i < num_handles; ++i) {
+    c_handles[i] = new rocksdb_column_family_handle_t;
+    c_handles[i]->rep = handles[i];
+  }
+
+  return c_handles;
+}
+
+// Releases the handle array itself with free(); the handle wrappers the
+// array points to are not touched here.
+void rocksdb_create_column_families_destroy(
+    rocksdb_column_family_handle_t** list) {
+  free(static_cast<void*>(list));
+}
+
 rocksdb_column_family_handle_t* rocksdb_create_column_family_with_ttl(
    rocksdb_t* db, const rocksdb_options_t* column_family_options,
    const char* column_family_name, int ttl, char** errptr) {
@ -1805,6 +1845,17 @@ void rocksdb_flush_cf(rocksdb_t* db, const rocksdb_flushoptions_t* options,
  SaveError(errptr, db->rep->Flush(options->rep, column_family->rep));
 }

+// Flushes the given column families with a single Flush() call and reports
+// the resulting status through `errptr`.
+void rocksdb_flush_cfs(rocksdb_t* db, const rocksdb_flushoptions_t* options,
+                       rocksdb_column_family_handle_t** column_families,
+                       int num_column_families, char** errptr) {
+  // Unwrap each C handle into the C++ handle it carries.
+  std::vector<ColumnFamilyHandle*> cf_handles;
+  for (int idx = 0; idx < num_column_families; ++idx) {
+    rocksdb_column_family_handle_t* wrapper = column_families[idx];
+    cf_handles.push_back(wrapper->rep);
+  }
+
+  Status flush_status = db->rep->Flush(options->rep, cf_handles);
+  SaveError(errptr, flush_status);
+}
+
 void rocksdb_flush_wal(rocksdb_t* db, unsigned char sync, char** errptr) {
  SaveError(errptr, db->rep->FlushWAL(sync));
 }
@ -2498,8 +2549,12 @@ void rocksdb_load_latest_options(
    rocksdb_options_t*** list_column_family_options, char** errptr) {
  DBOptions db_opt;
  std::vector<ColumnFamilyDescriptor> cf_descs;
-  Status s = LoadLatestOptions(std::string(db_path), env->rep, &db_opt,
-                               &cf_descs, ignore_unknown_options, &cache->rep);
+  ConfigOptions config_opts;
+  config_opts.ignore_unknown_options = ignore_unknown_options;
+  config_opts.input_strings_escaped = true;
+  config_opts.env = env->rep;
+  Status s = LoadLatestOptions(config_opts, std::string(db_path), &db_opt,
+                               &cf_descs, &cache->rep);
  if (s.ok()) {
    char** cf_names = (char**)malloc(cf_descs.size() * sizeof(char*));
    rocksdb_options_t** cf_options = (rocksdb_options_t**)malloc(
@ -2620,14 +2675,6 @@ void rocksdb_block_based_options_set_block_cache(
  }
 }

-void rocksdb_block_based_options_set_block_cache_compressed(
-    rocksdb_block_based_table_options_t* options,
-    rocksdb_cache_t* block_cache_compressed) {
-  if (block_cache_compressed) {
-    options->rep.block_cache_compressed = block_cache_compressed->rep;
-  }
-}
-
 void rocksdb_block_based_options_set_whole_key_filtering(
    rocksdb_block_based_table_options_t* options, unsigned char v) {
  options->rep.whole_key_filtering = v;
@ -2983,6 +3030,29 @@ void rocksdb_options_enable_statistics(rocksdb_options_t* opt) {
  opt->rep.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
 }

+// Sets the stats level on the options' statistics object, clamping `level`
+// into [rocksdb_statistics_level_disable_all, rocksdb_statistics_level_all].
+// Does nothing when no statistics object is attached.
+void rocksdb_options_set_statistics_level(rocksdb_options_t* opt, int level) {
+  auto& stats = opt->rep.statistics;
+  if (!stats) {
+    return;
+  }
+
+  // Apply the lower bound first, then the upper bound.
+  int clamped = level < rocksdb_statistics_level_disable_all
+                    ? rocksdb_statistics_level_disable_all
+                    : level;
+  clamped = clamped > rocksdb_statistics_level_all
+                ? rocksdb_statistics_level_all
+                : clamped;
+  stats->set_stats_level(static_cast<ROCKSDB_NAMESPACE::StatsLevel>(clamped));
+}
+
+// Returns the stats level of the attached statistics object as an int, or
+// StatsLevel::kDisableAll when no statistics object is attached.
+int rocksdb_options_get_statistics_level(rocksdb_options_t* opt) {
+  const auto& stats = opt->rep.statistics;
+  if (stats) {
+    return static_cast<int>(stats->get_stats_level());
+  }
+  return ROCKSDB_NAMESPACE::StatsLevel::kDisableAll;
+}
+
 void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
                                                      unsigned char val) {
  opt->rep.skip_stats_update_on_db_open = val;
@ -3730,16 +3800,21 @@ void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t* opt,
      ROCKSDB_NAMESPACE::NewHashLinkListRepFactory(bucket_count));
 }

-void rocksdb_options_set_plain_table_factory(rocksdb_options_t* opt,
-                                             uint32_t user_key_len,
-                                             int bloom_bits_per_key,
-                                             double hash_table_ratio,
-                                             size_t index_sparseness) {
+void rocksdb_options_set_plain_table_factory(
+    rocksdb_options_t* opt, uint32_t user_key_len, int bloom_bits_per_key,
+    double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size,
+    char encoding_type, unsigned char full_scan_mode,
+    unsigned char store_index_in_file) {
  ROCKSDB_NAMESPACE::PlainTableOptions options;
  options.user_key_len = user_key_len;
  options.bloom_bits_per_key = bloom_bits_per_key;
  options.hash_table_ratio = hash_table_ratio;
  options.index_sparseness = index_sparseness;
+  options.huge_page_tlb_size = huge_page_tlb_size;
+  options.encoding_type =
+      static_cast<ROCKSDB_NAMESPACE::EncodingType>(encoding_type);
+  options.full_scan_mode = full_scan_mode;
+  options.store_index_in_file = store_index_in_file;

  ROCKSDB_NAMESPACE::TableFactory* factory =
      ROCKSDB_NAMESPACE::NewPlainTableFactory(options);
@ -3817,6 +3892,26 @@ char* rocksdb_options_statistics_get_string(rocksdb_options_t* opt) {
  return nullptr;
 }

+// Returns the count of ticker `ticker_type` from the attached statistics
+// object, or 0 when no statistics object is attached.
+uint64_t rocksdb_options_statistics_get_ticker_count(rocksdb_options_t* opt,
+                                                     uint32_t ticker_type) {
+  ROCKSDB_NAMESPACE::Statistics* stats = opt->rep.statistics.get();
+  if (stats == nullptr) {
+    return 0;
+  }
+  return stats->getTickerCount(ticker_type);
+}
+
+// Fills `data` with histogram `type` from the attached statistics object.
+// When no statistics object is attached, `data` is reset to a
+// default-constructed value instead.
+void rocksdb_options_statistics_get_histogram_data(
+    rocksdb_options_t* opt, uint32_t type,
+    rocksdb_statistics_histogram_data_t* const data) {
+  ROCKSDB_NAMESPACE::Statistics* stats = opt->rep.statistics.get();
+  if (stats == nullptr) {
+    *data = rocksdb_statistics_histogram_data_t{};
+    return;
+  }
+  stats->histogramData(type, &data->rep);
+}
+
 void rocksdb_options_set_ratelimiter(rocksdb_options_t* opt,
                                     rocksdb_ratelimiter_t* limiter) {
  if (limiter) {
@ -3878,6 +3973,15 @@ void rocksdb_options_add_compact_on_deletion_collector_factory(
  opt->rep.table_properties_collector_factories.emplace_back(compact_on_del);
 }

+// Appends a compact-on-deletion collector factory, built from the given
+// window size, deletion trigger and deletion ratio, to the options' list of
+// table properties collector factories.
+void rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio(
+    rocksdb_options_t* opt, size_t window_size, size_t num_dels_trigger,
+    double deletion_ratio) {
+  auto& factories = opt->rep.table_properties_collector_factories;
+  std::shared_ptr<ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory> factory =
+      NewCompactOnDeletionCollectorFactory(window_size, num_dels_trigger,
+                                           deletion_ratio);
+  factories.emplace_back(factory);
+}
+
 void rocksdb_set_perf_level(int v) {
  PerfLevel level = static_cast<PerfLevel>(v);
  SetPerfLevel(level);
@ -4054,6 +4158,8 @@ uint64_t rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context,
      return rep->blob_decompress_time;
    case rocksdb_internal_range_del_reseek_count:
      return rep->internal_range_del_reseek_count;
+    case rocksdb_block_read_cpu_time:
+      return rep->block_read_cpu_time;
    default:
      break;
  }
@ -4449,6 +4555,15 @@ rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t* opt) {
  return opt->rep.io_timeout.count();
 }

+// Stores `v` into the read options' async_io field.
+void rocksdb_readoptions_set_async_io(rocksdb_readoptions_t* opt,
+                                      unsigned char v) {
+  auto& read_opts = opt->rep;
+  read_opts.async_io = v;
+}
+
+// Returns the read options' async_io field.
+unsigned char rocksdb_readoptions_get_async_io(rocksdb_readoptions_t* opt) {
+  const auto& read_opts = opt->rep;
+  return read_opts.async_io;
+}
+
 void rocksdb_readoptions_set_timestamp(rocksdb_readoptions_t* opt,
                                       const char* ts, size_t tslen) {
  if (ts == nullptr) {
@ -4660,12 +4775,59 @@ rocksdb_cache_t* rocksdb_cache_create_lru_with_strict_capacity_limit(
 }

 rocksdb_cache_t* rocksdb_cache_create_lru_opts(
-    rocksdb_lru_cache_options_t* opt) {
+    const rocksdb_lru_cache_options_t* opt) {
  rocksdb_cache_t* c = new rocksdb_cache_t;
  c->rep = NewLRUCache(opt->rep);
  return c;
 }

+// Allocates an options wrapper initialized from `capacity` and
+// `estimated_entry_charge`. Release it with
+// rocksdb_hyper_clock_cache_options_destroy().
+rocksdb_hyper_clock_cache_options_t* rocksdb_hyper_clock_cache_options_create(
+    size_t capacity, size_t estimated_entry_charge) {
+  auto* wrapper = new rocksdb_hyper_clock_cache_options_t{
+      HyperClockCacheOptions(capacity, estimated_entry_charge)};
+  return wrapper;
+}
+
+// Deletes an options wrapper; `delete` on a null pointer is a no-op.
+void rocksdb_hyper_clock_cache_options_destroy(
+    rocksdb_hyper_clock_cache_options_t* opt) {
+  rocksdb_hyper_clock_cache_options_t* doomed = opt;
+  delete doomed;
+}
+
+// Overwrites the capacity stored in the options.
+void rocksdb_hyper_clock_cache_options_set_capacity(
+    rocksdb_hyper_clock_cache_options_t* opts, size_t capacity) {
+  HyperClockCacheOptions& cache_opts = opts->rep;
+  cache_opts.capacity = capacity;
+}
+
+// Overwrites the estimated entry charge stored in the options.
+void rocksdb_hyper_clock_cache_options_set_estimated_entry_charge(
+    rocksdb_hyper_clock_cache_options_t* opts, size_t estimated_entry_charge) {
+  HyperClockCacheOptions& cache_opts = opts->rep;
+  cache_opts.estimated_entry_charge = estimated_entry_charge;
+}
+
+// Overwrites the number of shard bits stored in the options.
+void rocksdb_hyper_clock_cache_options_set_num_shard_bits(
+    rocksdb_hyper_clock_cache_options_t* opts, int num_shard_bits) {
+  HyperClockCacheOptions& cache_opts = opts->rep;
+  cache_opts.num_shard_bits = num_shard_bits;
+}
+
+// Copies the shared allocator held by `memory_allocator` into the options.
+void rocksdb_hyper_clock_cache_options_set_memory_allocator(
+    rocksdb_hyper_clock_cache_options_t* opts,
+    rocksdb_memory_allocator_t* memory_allocator) {
+  HyperClockCacheOptions& cache_opts = opts->rep;
+  cache_opts.memory_allocator = memory_allocator->rep;
+}
+
+// Builds a cache from HyperClockCacheOptions constructed with `capacity` and
+// `estimated_entry_charge`, and returns it in a newly allocated wrapper.
+rocksdb_cache_t* rocksdb_cache_create_hyper_clock(
+    size_t capacity, size_t estimated_entry_charge) {
+  HyperClockCacheOptions cache_opts(capacity, estimated_entry_charge);
+  rocksdb_cache_t* wrapper = new rocksdb_cache_t;
+  wrapper->rep = cache_opts.MakeSharedCache();
+  return wrapper;
+}
+
+// Builds a cache from the supplied options and returns it in a newly
+// allocated wrapper; `opts` is only read.
+rocksdb_cache_t* rocksdb_cache_create_hyper_clock_opts(
+    const rocksdb_hyper_clock_cache_options_t* opts) {
+  const HyperClockCacheOptions& cache_opts = opts->rep;
+  rocksdb_cache_t* wrapper = new rocksdb_cache_t;
+  wrapper->rep = cache_opts.MakeSharedCache();
+  return wrapper;
+}
+
 void rocksdb_cache_destroy(rocksdb_cache_t* cache) { delete cache; }

 void rocksdb_cache_disown_data(rocksdb_cache_t* cache) {
@ -4676,18 +4838,26 @@ void rocksdb_cache_set_capacity(rocksdb_cache_t* cache, size_t capacity) {
  cache->rep->SetCapacity(capacity);
 }

-size_t rocksdb_cache_get_capacity(rocksdb_cache_t* cache) {
+size_t rocksdb_cache_get_capacity(const rocksdb_cache_t* cache) {
  return cache->rep->GetCapacity();
 }

-size_t rocksdb_cache_get_usage(rocksdb_cache_t* cache) {
+size_t rocksdb_cache_get_usage(const rocksdb_cache_t* cache) {
  return cache->rep->GetUsage();
 }

-size_t rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache) {
+size_t rocksdb_cache_get_pinned_usage(const rocksdb_cache_t* cache) {
  return cache->rep->GetPinnedUsage();
 }

+// Forwards to the wrapped cache's GetTableAddressCount().
+size_t rocksdb_cache_get_table_address_count(const rocksdb_cache_t* cache) {
+  const size_t address_count = cache->rep->GetTableAddressCount();
+  return address_count;
+}
+
+// Forwards to the wrapped cache's GetOccupancyCount().
+size_t rocksdb_cache_get_occupancy_count(const rocksdb_cache_t* cache) {
+  const size_t occupancy = cache->rep->GetOccupancyCount();
+  return occupancy;
+}
+
 rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path,
                                        uint64_t target_size) {
  rocksdb_dbpath_t* result = new rocksdb_dbpath_t;
@ -4705,6 +4875,20 @@ rocksdb_env_t* rocksdb_create_default_env() {
  return result;
 }

+// Creates an Env that wraps Env::Default() with the "ippcp" encryption
+// provider, keyed by `key`.
+//
+// `key` must point to at least 32 readable bytes: exactly 32 bytes are handed
+// to the provider as the cipher key. Returns nullptr when `key` is null, when
+// the provider cannot be created, or when it rejects the key. These failures
+// are checked at runtime rather than with assert(), which is compiled out of
+// NDEBUG builds.
+rocksdb_env_t* rocksdb_create_encrypted_env(const char* key) {
+  if (key == nullptr) {
+    return nullptr;
+  }
+
+  // Spell the namespace through ROCKSDB_NAMESPACE so the file also builds
+  // when the library is compiled under an alternative namespace.
+  std::shared_ptr<ROCKSDB_NAMESPACE::EncryptionProvider> provider;
+  Status status = ROCKSDB_NAMESPACE::EncryptionProvider::CreateFromString(
+      ConfigOptions(), "ippcp", &provider);
+  if (!status.ok() || provider == nullptr) {
+    return nullptr;
+  }
+
+  status = provider->AddCipher("", key, 32, false);
+  if (!status.ok()) {
+    return nullptr;
+  }
+
+  rocksdb_env_t* result = new rocksdb_env_t;
+  result->rep = ROCKSDB_NAMESPACE::NewEncryptedEnv(Env::Default(), provider);
+  // This Env is a freshly created wrapper, not the process-wide default one,
+  // so it must not be flagged as the default Env.
+  result->is_default = false;
+  return result;
+}
+
 rocksdb_env_t* rocksdb_create_mem_env() {
  rocksdb_env_t* result = new rocksdb_env_t;
  result->rep = ROCKSDB_NAMESPACE::NewMemEnv(Env::Default());
@ -4906,6 +5090,12 @@ void rocksdb_ingestexternalfileoptions_set_ingest_behind(
  opt->rep.ingest_behind = ingest_behind;
 }

+// Stores the flag into the ingest options' fail_if_not_bottommost_level
+// field.
+void rocksdb_ingestexternalfileoptions_set_fail_if_not_bottommost_level(
+    rocksdb_ingestexternalfileoptions_t* opt,
+    unsigned char fail_if_not_bottommost_level) {
+  auto& ingest_opts = opt->rep;
+  ingest_opts.fail_if_not_bottommost_level = fail_if_not_bottommost_level;
+}
+
 void rocksdb_ingestexternalfileoptions_destroy(
    rocksdb_ingestexternalfileoptions_t* opt) {
  delete opt;
@ -5067,6 +5257,17 @@ rocksdb_fifo_compaction_options_t* rocksdb_fifo_compaction_options_create() {
  return result;
 }

+// Stores `allow_compaction` into the FIFO compaction options.
+void rocksdb_fifo_compaction_options_set_allow_compaction(
+    rocksdb_fifo_compaction_options_t* fifo_opts,
+    unsigned char allow_compaction) {
+  auto& fifo = fifo_opts->rep;
+  fifo.allow_compaction = allow_compaction;
+}
+
+// Returns the allow_compaction field of the FIFO compaction options.
+unsigned char rocksdb_fifo_compaction_options_get_allow_compaction(
+    rocksdb_fifo_compaction_options_t* fifo_opts) {
+  const auto& fifo = fifo_opts->rep;
+  return fifo.allow_compaction;
+}
+
 void rocksdb_fifo_compaction_options_set_max_table_files_size(
    rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size) {
  fifo_opts->rep.max_table_files_size = size;
@ -5487,6 +5688,20 @@ int rocksdb_transactiondb_property_int(rocksdb_transactiondb_t* db,
  }
 }

+// Returns a newly allocated rocksdb_t wrapper whose `rep` is the base DB
+// reported by the transaction DB, or nullptr when GetBaseDB() returns null.
+rocksdb_t* rocksdb_transactiondb_get_base_db(rocksdb_transactiondb_t* txn_db) {
+  DB* const underlying = txn_db->rep->GetBaseDB();
+  if (underlying == nullptr) {
+    return nullptr;
+  }
+
+  rocksdb_t* wrapper = new rocksdb_t;
+  wrapper->rep = underlying;
+  return wrapper;
+}
+
+// Deletes the rocksdb_t wrapper object passed in.
+void rocksdb_transactiondb_close_base_db(rocksdb_t* base_db) {
+  delete base_db;
+}
+
 rocksdb_transaction_t* rocksdb_transaction_begin(
    rocksdb_transactiondb_t* txn_db,
    const rocksdb_writeoptions_t* write_options,
@ -5771,6 +5986,35 @@ void rocksdb_transaction_multi_get(rocksdb_transaction_t* txn,
  }
 }

+// Looks up `num_keys` keys through Transaction::MultiGetForUpdate and reports
+// one result per key in the caller-provided output arrays:
+//   - success:   values_list[i] is a CopyString() copy of the value,
+//                values_list_sizes[i] its length, errs[i] is nullptr;
+//   - not found: values_list[i] and errs[i] are nullptr, size is 0;
+//   - other failure: values_list[i] is nullptr, size is 0, and errs[i] is a
+//                strdup()'d copy of the status text.
+void rocksdb_transaction_multi_get_for_update(
+    rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, char** values_list,
+    size_t* values_list_sizes, char** errs) {
+  // Build Slice views over the caller's key buffers (no copies are made).
+  std::vector<Slice> key_slices(num_keys);
+  for (size_t k = 0; k < num_keys; ++k) {
+    key_slices[k] = Slice(keys_list[k], keys_list_sizes[k]);
+  }
+
+  std::vector<std::string> fetched(num_keys);
+  std::vector<Status> outcomes =
+      txn->rep->MultiGetForUpdate(options->rep, key_slices, &fetched);
+
+  for (size_t k = 0; k < num_keys; ++k) {
+    const Status& outcome = outcomes[k];
+    if (outcome.ok()) {
+      values_list[k] = CopyString(fetched[k]);
+      values_list_sizes[k] = fetched[k].size();
+      errs[k] = nullptr;
+      continue;
+    }
+
+    values_list[k] = nullptr;
+    values_list_sizes[k] = 0;
+    // A missing key is not reported as an error.
+    errs[k] = outcome.IsNotFound() ? nullptr
+                                   : strdup(outcome.ToString().c_str());
+  }
+}
+
 void rocksdb_transaction_multi_get_cf(
    rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
    const rocksdb_column_family_handle_t* const* column_families,
@ -5803,6 +6047,38 @@ void rocksdb_transaction_multi_get_cf(
  }
 }

+// Column-family variant of the multi-key get-for-update: key i is looked up
+// in column_families[i]. Results are reported per key in the caller-provided
+// output arrays:
+//   - success:   values_list[i] is a CopyString() copy of the value,
+//                values_list_sizes[i] its length, errs[i] is nullptr;
+//   - not found: values_list[i] and errs[i] are nullptr, size is 0;
+//   - other failure: values_list[i] is nullptr, size is 0, and errs[i] is a
+//                strdup()'d copy of the status text.
+void rocksdb_transaction_multi_get_for_update_cf(
+    rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+    const rocksdb_column_family_handle_t* const* column_families,
+    size_t num_keys, const char* const* keys_list,
+    const size_t* keys_list_sizes, char** values_list,
+    size_t* values_list_sizes, char** errs) {
+  // Unwrap the C handles and build Slice views over the caller's key buffers.
+  std::vector<ColumnFamilyHandle*> handles(num_keys);
+  std::vector<Slice> key_slices(num_keys);
+  for (size_t k = 0; k < num_keys; ++k) {
+    handles[k] = column_families[k]->rep;
+    key_slices[k] = Slice(keys_list[k], keys_list_sizes[k]);
+  }
+
+  std::vector<std::string> fetched(num_keys);
+  std::vector<Status> outcomes =
+      txn->rep->MultiGetForUpdate(options->rep, handles, key_slices, &fetched);
+
+  for (size_t k = 0; k < num_keys; ++k) {
+    const Status& outcome = outcomes[k];
+    if (outcome.ok()) {
+      values_list[k] = CopyString(fetched[k]);
+      values_list_sizes[k] = fetched[k].size();
+      errs[k] = nullptr;
+      continue;
+    }
+
+    values_list[k] = nullptr;
+    values_list_sizes[k] = 0;
+    // A missing key is not reported as an error.
+    errs[k] = outcome.IsNotFound() ? nullptr
+                                   : strdup(outcome.ToString().c_str());
+  }
+}
+
 // Read a key outside a transaction
 char* rocksdb_transactiondb_get(rocksdb_transactiondb_t* txn_db,
                                const rocksdb_readoptions_t* options,
@ -6104,6 +6380,18 @@ void rocksdb_transactiondb_flush_cf(
  SaveError(errptr, txn_db->rep->Flush(options->rep, column_family->rep));
 }

+// Flushes the listed column families of a transaction DB with a single
+// Flush() call. The resulting status is handed to SaveError together with
+// `errptr`.
+void rocksdb_transactiondb_flush_cfs(
+    rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options,
+    rocksdb_column_family_handle_t** column_families, int num_column_families,
+    char** errptr) {
+  // Unwrap each C handle into the ColumnFamilyHandle* it carries.
+  std::vector<ColumnFamilyHandle*> handles;
+  for (int idx = 0; idx < num_column_families; ++idx) {
+    ColumnFamilyHandle* unwrapped = column_families[idx]->rep;
+    handles.push_back(unwrapped);
+  }
+
+  Status flush_status = txn_db->rep->Flush(options->rep, handles);
+  SaveError(errptr, flush_status);
+}
+
 rocksdb_checkpoint_t* rocksdb_transactiondb_checkpoint_object_create(
    rocksdb_transactiondb_t* txn_db, char** errptr) {
  Checkpoint* checkpoint;
@ -6391,6 +6679,59 @@ void rocksdb_enable_manual_compaction(rocksdb_t* db) {
  db->rep->EnableManualCompaction();
 }

-}  // end extern "C"
+// Allocates a value-initialized histogram-data object (`{}` initializer).
+// Release it with rocksdb_statistics_histogram_data_destroy().
+rocksdb_statistics_histogram_data_t*
+rocksdb_statistics_histogram_data_create() {
+  return new rocksdb_statistics_histogram_data_t{};
+}
+
+// Deletes a histogram-data object created by
+// rocksdb_statistics_histogram_data_create().
+void rocksdb_statistics_histogram_data_destroy(
+    rocksdb_statistics_histogram_data_t* data) {
+  delete data;
+}
+
+// Returns the `median` field of the wrapped histogram data.
+double rocksdb_statistics_histogram_data_get_median(
+    rocksdb_statistics_histogram_data_t* data) {
+  return data->rep.median;
+}
+
+// Returns the `percentile95` field of the wrapped histogram data.
+double rocksdb_statistics_histogram_data_get_p95(
+    rocksdb_statistics_histogram_data_t* data) {
+  return data->rep.percentile95;
+}
+
+// Returns the `percentile99` field of the wrapped histogram data.
+double rocksdb_statistics_histogram_data_get_p99(
+    rocksdb_statistics_histogram_data_t* data) {
+  return data->rep.percentile99;
+}
+
+// Returns the `average` field of the wrapped histogram data.
+double rocksdb_statistics_histogram_data_get_average(
+    rocksdb_statistics_histogram_data_t* data) {
+  return data->rep.average;
+}
+
+// Returns the `standard_deviation` field of the wrapped histogram data.
+double rocksdb_statistics_histogram_data_get_std_dev(
+    rocksdb_statistics_histogram_data_t* data) {
+  return data->rep.standard_deviation;
+}
+
+// Returns the `max` field of the wrapped histogram data.
+double rocksdb_statistics_histogram_data_get_max(
+    rocksdb_statistics_histogram_data_t* data) {
+  return data->rep.max;
+}
+
+// Returns the `count` field of the wrapped histogram data.
+uint64_t rocksdb_statistics_histogram_data_get_count(
+    rocksdb_statistics_histogram_data_t* data) {
+  return data->rep.count;
+}
+
+// Returns the `sum` field of the wrapped histogram data.
+uint64_t rocksdb_statistics_histogram_data_get_sum(
+    rocksdb_statistics_histogram_data_t* data) {
+  return data->rep.sum;
+}

-#endif  // !ROCKSDB_LITE
+// Returns the `min` field of the wrapped histogram data.
+double rocksdb_statistics_histogram_data_get_min(
+    rocksdb_statistics_histogram_data_t* data) {
+  return data->rep.min;
+}
+
+}  // end extern "C"
--- a/db/c_test.c
+++ b/db/c_test.c
@ -3,17 +3,14 @@
   found in the LICENSE file. See the AUTHORS file for names of contributors. */
 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.

-#include <stdio.h>
-
-#ifndef ROCKSDB_LITE  // Lite does not support C API
+#include "rocksdb/c.h"

 #include <assert.h>
 #include <stddef.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
-
-#include "rocksdb/c.h"
 #ifndef OS_WIN
 #include <unistd.h>
 #endif
@ -490,6 +487,19 @@ static void CheckTxnPinGetCF(rocksdb_transaction_t* txn,
  rocksdb_pinnableslice_destroy(p);
 }

+// Test helper: reads `key` inside `txn` with
+// rocksdb_transaction_get_for_update, requires that no error was reported,
+// compares the result against `expected` via CheckEqual, and releases the
+// returned buffer with Free.
+// NOTE(review): the literal `true` is presumably the exclusive-lock flag;
+// confirm against the declaration in rocksdb/c.h.
+static void CheckTxnGetForUpdate(rocksdb_transaction_t* txn,
+                                 const rocksdb_readoptions_t* options,
+                                 const char* key, const char* expected) {
+  char* failure = NULL;
+  size_t got_len;
+  char* got = rocksdb_transaction_get_for_update(
+      txn, options, key, strlen(key), &got_len, true, &failure);
+  CheckNoError(failure);
+  CheckEqual(expected, got, got_len);
+  Free(&got);
+}
+
 static void CheckTxnDBGet(rocksdb_transactiondb_t* txn_db,
                          const rocksdb_readoptions_t* options, const char* key,
                          const char* expected) {
@ -517,6 +527,20 @@ static void CheckTxnDBGetCF(rocksdb_transactiondb_t* txn_db,
  Free(&val);
 }

+// Test helper: column-family variant of the get-for-update check. Reads `key`
+// from `column_family` inside `txn` with rocksdb_transaction_get_for_update_cf,
+// requires that no error was reported, compares the result against `expected`
+// via CheckEqual, and releases the returned buffer with Free.
+// NOTE(review): the literal `true` is presumably the exclusive-lock flag;
+// confirm against the declaration in rocksdb/c.h.
+static void CheckTxnGetForUpdateCF(
+    rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
+    rocksdb_column_family_handle_t* column_family, const char* key,
+    const char* expected) {
+  char* failure = NULL;
+  size_t got_len;
+  char* got = rocksdb_transaction_get_for_update_cf(
+      txn, options, column_family, key, strlen(key), &got_len, true, &failure);
+  CheckNoError(failure);
+  CheckEqual(expected, got, got_len);
+  Free(&got);
+}
+
 static void CheckTxnDBPinGet(rocksdb_transactiondb_t* txn_db,
                             const rocksdb_readoptions_t* options,
                             const char* key, const char* expected) {
@ -696,6 +720,8 @@ int main(int argc, char** argv) {

  rocksdb_options_add_compact_on_deletion_collector_factory(options, 10000,
                                                            10001);
+  rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio(
+      options, 10000, 10001, 0.0);

  StartPhase("destroy");
  rocksdb_destroy_db(options, dbname, &err);
@ -1647,7 +1673,8 @@ int main(int argc, char** argv) {
    rocksdb_options_set_prefix_extractor(
        options, rocksdb_slicetransform_create_fixed_prefix(3));
    rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4);
-    rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16);
+    rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16, 0, 0, 0,
+                                            0);
    rocksdb_options_set_allow_concurrent_memtable_write(options, 0);

    db = rocksdb_open(options, dbname, &err);
@ -2033,6 +2060,15 @@ int main(int argc, char** argv) {
    CheckCondition(29.0 ==
                   rocksdb_options_get_experimental_mempurge_threshold(o));

+    CheckCondition(rocksdb_statistics_level_disable_all ==
+                   rocksdb_options_get_statistics_level(o));
+    rocksdb_options_enable_statistics(o);
+    CheckCondition(rocksdb_statistics_level_disable_all !=
+                   rocksdb_options_get_statistics_level(o));
+    rocksdb_options_set_statistics_level(o, rocksdb_statistics_level_all);
+    CheckCondition(rocksdb_statistics_level_all ==
+                   rocksdb_options_get_statistics_level(o));
+
    /* Blob Options */
    rocksdb_options_set_enable_blob_files(o, 1);
    CheckCondition(1 == rocksdb_options_get_enable_blob_files(o));
@ -2572,6 +2608,9 @@ int main(int argc, char** argv) {
    rocksdb_readoptions_set_io_timeout(ro, 400);
    CheckCondition(400 == rocksdb_readoptions_get_io_timeout(ro));

+    rocksdb_readoptions_set_async_io(ro, 1);
+    CheckCondition(1 == rocksdb_readoptions_get_async_io(ro));
+
    rocksdb_readoptions_destroy(ro);
  }

@ -3091,6 +3130,17 @@ int main(int argc, char** argv) {
    CheckTxnDBGetCF(txn_db, roptions, cfh, "cf_foo", NULL);
    CheckTxnDBPinGetCF(txn_db, roptions, cfh, "cf_foo", NULL);

+    // memory usage
+    rocksdb_t* base_db = rocksdb_transactiondb_get_base_db(txn_db);
+    rocksdb_memory_consumers_t* consumers = rocksdb_memory_consumers_create();
+    rocksdb_memory_consumers_add_db(consumers, base_db);
+    rocksdb_memory_usage_t* usage =
+        rocksdb_approximate_memory_usage_create(consumers, &err);
+    CheckNoError(err);
+    rocksdb_approximate_memory_usage_destroy(usage);
+    rocksdb_memory_consumers_destroy(consumers);
+    rocksdb_transactiondb_close_base_db(base_db);
+
    // flush
    rocksdb_flushoptions_t* flush_options = rocksdb_flushoptions_create();
    rocksdb_flushoptions_set_wait(flush_options, 1);
@ -3203,6 +3253,120 @@ int main(int argc, char** argv) {
    rocksdb_transactiondb_options_destroy(txn_db_options);
  }

+  StartPhase("transactions_multi_get_for_update");
+  {
+    // open a TransactionDB
+    txn_db_options = rocksdb_transactiondb_options_create();
+    rocksdb_transactiondb_options_set_transaction_lock_timeout(txn_db_options,
+                                                               0);
+    txn_options = rocksdb_transaction_options_create();
+    rocksdb_options_set_create_if_missing(options, 1);
+    txn_db = rocksdb_transactiondb_open(options, txn_db_options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_transactiondb_put(txn_db, woptions, "foo", 3, "hey", 3, &err);
+    CheckNoError(err);
+    rocksdb_transactiondb_put(txn_db, woptions, "bar", 3, "hello", 5, &err);
+    CheckNoError(err);
+
+    // begin transactions
+    txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
+    rocksdb_transaction_t* txn2 =
+        rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
+
+    // multi get
+    {
+      const char* keys[2] = {"foo", "bar"};
+      const size_t keys_sizes[2] = {3, 3};
+      char* vals[2];
+      size_t vals_sizes[2];
+      char* errs[2];
+      const char* expected[2] = {"hey", "hello"};
+      rocksdb_transaction_multi_get_for_update(
+          txn, roptions, 2, keys, keys_sizes, vals, vals_sizes, errs);
+      CheckMultiGetValues(2, vals, vals_sizes, errs, expected);
+    }
+
+    char* conflict_err = NULL;
+    size_t val_len;
+    rocksdb_transaction_get_for_update(txn2, roptions, "foo", 3, &val_len, true,
+                                       &conflict_err);
+    // get-for-update conflict
+    CheckCondition(conflict_err != NULL);
+    Free(&conflict_err);
+
+    // commit
+    rocksdb_transaction_commit(txn, &err);
+    CheckNoError(err);
+
+    // should work after the first txn is committed
+    CheckTxnGetForUpdate(txn2, roptions, "foo", "hey");
+
+    // commit the second one
+    rocksdb_transaction_commit(txn2, &err);
+    CheckNoError(err);
+
+    // destroy txns
+    rocksdb_transaction_destroy(txn);
+    rocksdb_transaction_destroy(txn2);
+
+    // same for column families
+
+    rocksdb_column_family_handle_t* cfh;
+    cfh = rocksdb_transactiondb_create_column_family(txn_db, options,
+                                                     "txn_db_cf", &err);
+    CheckNoError(err);
+
+    rocksdb_transactiondb_put_cf(txn_db, woptions, cfh, "cf_foo", 6, "cf_hello",
+                                 8, &err);
+    CheckNoError(err);
+    rocksdb_transactiondb_put_cf(txn_db, woptions, cfh, "cf_bar", 6, "cf_hey",
+                                 6, &err);
+    CheckNoError(err);
+
+    txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
+    txn2 = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
+
+    {
+      const rocksdb_column_family_handle_t* get_handles[2] = {cfh, cfh};
+      const char* keys[2] = {"cf_foo", "cf_bar"};
+      const size_t keys_sizes[2] = {6, 6};
+      char* vals[2];
+      size_t vals_sizes[2];
+      char* errs[2];
+      const char* expected[2] = {"cf_hello", "cf_hey"};
+      rocksdb_transaction_multi_get_for_update_cf(txn, roptions, get_handles, 2,
+                                                  keys, keys_sizes, vals,
+                                                  vals_sizes, errs);
+      CheckMultiGetValues(2, vals, vals_sizes, errs, expected);
+    }
+
+    char* conflict_err_cf = NULL;
+    size_t val_len_cf;
+    rocksdb_transaction_get_for_update_cf(txn2, roptions, cfh, "cf_foo", 6,
+                                          &val_len_cf, true, &conflict_err_cf);
+    CheckCondition(conflict_err_cf != NULL);
+    Free(&conflict_err_cf);
+
+    rocksdb_transaction_commit(txn, &err);
+    CheckNoError(err);
+
+    CheckTxnGetForUpdateCF(txn2, roptions, cfh, "cf_foo", "cf_hello");
+
+    rocksdb_transaction_commit(txn2, &err);
+    CheckNoError(err);
+
+    // close and destroy
+    rocksdb_column_family_handle_destroy(cfh);
+    rocksdb_transaction_destroy(txn);
+    rocksdb_transaction_destroy(txn2);
+    rocksdb_transactiondb_close(txn_db);
+    rocksdb_destroy_db(options, dbname, &err);
+    CheckNoError(err);
+    rocksdb_transaction_options_destroy(txn_options);
+    rocksdb_transactiondb_options_destroy(txn_db_options);
+  }
+
  StartPhase("optimistic_transactions");
  {
    rocksdb_options_t* db_options = rocksdb_options_create();
@ -3232,8 +3396,19 @@ int main(int argc, char** argv) {
    rocksdb_put(db, woptions, "key", 3, "value", 5, &err);
    CheckNoError(err);
    rocksdb_column_family_handle_t *cfh1, *cfh2;
-    cfh1 = rocksdb_create_column_family(db, db_options, "txn_db_cf1", &err);
-    cfh2 = rocksdb_create_column_family(db, db_options, "txn_db_cf2", &err);
+    char** list_const_cf_names = (char**)malloc(2 * sizeof(char*));
+    list_const_cf_names[0] = "txn_db_cf1";
+    list_const_cf_names[1] = "txn_db_cf2";
+    size_t cflen;
+    rocksdb_column_family_handle_t** list_cfh = rocksdb_create_column_families(
+        db, db_options, 2, (const char* const*)list_const_cf_names, &cflen,
+        &err);
+    free(list_const_cf_names);
+    CheckNoError(err);
+    CheckCondition(cflen == 2);
+    cfh1 = list_cfh[0];
+    cfh2 = list_cfh[1];
+    rocksdb_create_column_families_destroy(list_cfh);
    txn = rocksdb_optimistictransaction_begin(otxn_db, woptions, otxn_options,
                                              NULL);
    rocksdb_transaction_put_cf(txn, cfh1, "key_cf1", 7, "val_cf1", 7, &err);
@ -3447,6 +3622,71 @@ int main(int argc, char** argv) {
    rocksdb_readoptions_destroy(ropts);
  }

+  StartPhase("statistics");
+  {
+    const uint32_t BYTES_WRITTEN_TICKER = 40;
+    const uint32_t DB_WRITE_HIST = 1;
+
+    rocksdb_statistics_histogram_data_t* hist =
+        rocksdb_statistics_histogram_data_create();
+    {
+      // zero by default
+      CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_median(hist));
+      CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_p95(hist));
+      CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_p99(hist));
+      CheckCondition(0.0 ==
+                     rocksdb_statistics_histogram_data_get_average(hist));
+      CheckCondition(0.0 ==
+                     rocksdb_statistics_histogram_data_get_std_dev(hist));
+      CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_max(hist));
+      CheckCondition(0 == rocksdb_statistics_histogram_data_get_count(hist));
+      CheckCondition(0 == rocksdb_statistics_histogram_data_get_sum(hist));
+      CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_min(hist));
+    }
+
+    rocksdb_close(db);
+    rocksdb_destroy_db(options, dbname, &err);
+    CheckNoError(err);
+
+    rocksdb_options_enable_statistics(options);
+    rocksdb_options_set_statistics_level(options, rocksdb_statistics_level_all);
+
+    db = rocksdb_open(options, dbname, &err);
+    CheckNoError(err);
+
+    CheckCondition(0 == rocksdb_options_statistics_get_ticker_count(
+                            options, BYTES_WRITTEN_TICKER));
+    rocksdb_options_statistics_get_histogram_data(options, DB_WRITE_HIST, hist);
+    CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_median(hist));
+    CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_p95(hist));
+    CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_p99(hist));
+    CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_average(hist));
+    CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_std_dev(hist));
+    CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_max(hist));
+    CheckCondition(0 == rocksdb_statistics_histogram_data_get_count(hist));
+    CheckCondition(0 == rocksdb_statistics_histogram_data_get_sum(hist));
+
+    int i;
+    for (i = 0; i < 10; ++i) {
+      char key = '0' + (char)i;
+      rocksdb_put(db, woptions, &key, 1, "", 1, &err);
+      CheckNoError(err);
+    }
+    CheckCondition(0 != rocksdb_options_statistics_get_ticker_count(
+                            options, BYTES_WRITTEN_TICKER));
+    rocksdb_options_statistics_get_histogram_data(options, DB_WRITE_HIST, hist);
+    CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_median(hist));
+    CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_p95(hist));
+    CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_p99(hist));
+    CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_average(hist));
+    CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_std_dev(hist));
+    CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_max(hist));
+    CheckCondition(0 != rocksdb_statistics_histogram_data_get_count(hist));
+    CheckCondition(0 != rocksdb_statistics_histogram_data_get_sum(hist));
+
+    rocksdb_statistics_histogram_data_destroy(hist);
+  }
+
  StartPhase("cancel_all_background_work");
  rocksdb_cancel_all_background_work(db, 1);

@ -3465,12 +3705,3 @@ int main(int argc, char** argv) {
  fprintf(stderr, "PASS\n");
  return 0;
 }
-
-#else
-
-int main(void) {
-  fprintf(stderr, "SKIPPED\n");
-  return 0;
-}
-
-#endif  // !ROCKSDB_LITE
--- a/db/column_family.cc
+++ b/db/column_family.cc
@ -53,11 +53,9 @@ ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(

 ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
  if (cfd_ != nullptr) {
-#ifndef ROCKSDB_LITE
    for (auto& listener : cfd_->ioptions()->listeners) {
      listener->OnColumnFamilyHandleDeletionStarted(this);
    }
-#endif  // ROCKSDB_LITE
    // Job id == 0 means that this is not our background process, but rather
    // user thread
    // Need to hold some shared pointers owned by the initial_cf_options
@ -88,15 +86,10 @@ const std::string& ColumnFamilyHandleImpl::GetName() const {
 }

 Status ColumnFamilyHandleImpl::GetDescriptor(ColumnFamilyDescriptor* desc) {
-#ifndef ROCKSDB_LITE
  // accessing mutable cf-options requires db mutex.
  InstrumentedMutexLock l(mutex_);
  *desc = ColumnFamilyDescriptor(cfd()->GetName(), cfd()->GetLatestCFOptions());
  return Status::OK();
-#else
-  (void)desc;
-  return Status::NotSupported();
-#endif  // !ROCKSDB_LITE
 }

 const Comparator* ColumnFamilyHandleImpl::GetComparator() const {
@ -347,7 +340,6 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
        result.hard_pending_compaction_bytes_limit;
  }

-#ifndef ROCKSDB_LITE
  // When the DB is stopped, it's possible that there are some .trash files that
  // were not deleted yet, when we open the DB we will find these .trash files
  // and schedule them to be deleted (or delete immediately if SstFileManager
@ -359,7 +351,6 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
                                      result.cf_paths[i].path)
        .PermitUncheckedError();
  }
-#endif

  if (result.cf_paths.empty()) {
    result.cf_paths = db_options.db_paths;
@ -391,8 +382,9 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,

  const uint64_t kAdjustedTtl = 30 * 24 * 60 * 60;
  if (result.ttl == kDefaultTtl) {
-    if (is_block_based_table &&
-        result.compaction_style != kCompactionStyleFIFO) {
+    if (is_block_based_table) {
+      // FIFO also requires max_open_files=-1, which is checked in
+      // ValidateOptions().
      result.ttl = kAdjustedTtl;
    } else {
      result.ttl = 0;
@ -400,40 +392,35 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
  }

  const uint64_t kAdjustedPeriodicCompSecs = 30 * 24 * 60 * 60;
-
-  // Turn on periodic compactions and set them to occur once every 30 days if
-  // compaction filters are used and periodic_compaction_seconds is set to the
-  // default value.
-  if (result.compaction_style != kCompactionStyleFIFO) {
+  if (result.compaction_style == kCompactionStyleLevel) {
    if ((result.compaction_filter != nullptr ||
         result.compaction_filter_factory != nullptr) &&
        result.periodic_compaction_seconds == kDefaultPeriodicCompSecs &&
        is_block_based_table) {
      result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
    }
-  } else {
-    // result.compaction_style == kCompactionStyleFIFO
-    if (result.ttl == 0) {
-      if (is_block_based_table) {
-        if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) {
-          result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
-        }
-        result.ttl = result.periodic_compaction_seconds;
-      }
-    } else if (result.periodic_compaction_seconds != 0) {
-      result.ttl = std::min(result.ttl, result.periodic_compaction_seconds);
+  } else if (result.compaction_style == kCompactionStyleUniversal) {
+    if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs &&
+        is_block_based_table) {
+      result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
+    }
+  } else if (result.compaction_style == kCompactionStyleFIFO) {
+    if (result.periodic_compaction_seconds != kDefaultPeriodicCompSecs) {
+      ROCKS_LOG_WARN(
+          db_options.info_log.get(),
+          "periodic_compaction_seconds does not support FIFO compaction. You "
+          "may want to set option TTL instead.");
    }
  }

-  // TTL compactions would work similar to Periodic Compactions in Universal in
-  // most of the cases. So, if ttl is set, execute the periodic compaction
-  // codepath.
-  if (result.compaction_style == kCompactionStyleUniversal && result.ttl != 0) {
-    if (result.periodic_compaction_seconds != 0) {
+  // For universal compaction, `ttl` and `periodic_compaction_seconds` mean the
+  // same thing, take the stricter value.
+  if (result.compaction_style == kCompactionStyleUniversal) {
+    if (result.periodic_compaction_seconds == 0) {
+      result.periodic_compaction_seconds = result.ttl;
+    } else if (result.ttl != 0) {
      result.periodic_compaction_seconds =
          std::min(result.ttl, result.periodic_compaction_seconds);
-    } else {
-      result.periodic_compaction_seconds = result.ttl;
    }
  }

@ -557,7 +544,6 @@ ColumnFamilyData::ColumnFamilyData(
      next_(nullptr),
      prev_(nullptr),
      log_number_(0),
-      flush_reason_(FlushReason::kOthers),
      column_family_set_(column_family_set),
      queued_for_flush_(false),
      queued_for_compaction_(false),
@ -603,7 +589,6 @@ ColumnFamilyData::ColumnFamilyData(
    if (ioptions_.compaction_style == kCompactionStyleLevel) {
      compaction_picker_.reset(
          new LevelCompactionPicker(ioptions_, &internal_comparator_));
-#ifndef ROCKSDB_LITE
    } else if (ioptions_.compaction_style == kCompactionStyleUniversal) {
      compaction_picker_.reset(
          new UniversalCompactionPicker(ioptions_, &internal_comparator_));
@ -617,7 +602,6 @@ ColumnFamilyData::ColumnFamilyData(
                     "Column family %s does not use any background compaction. "
                     "Compactions can only be done via CompactFiles\n",
                     GetName().c_str());
-#endif  // !ROCKSDB_LITE
    } else {
      ROCKS_LOG_ERROR(ioptions_.logger,
                      "Unable to recognize the specified compaction style %d. "
@ -881,7 +865,7 @@ int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger,
 }
 }  // anonymous namespace

-std::pair<WriteStallCondition, ColumnFamilyData::WriteStallCause>
+std::pair<WriteStallCondition, WriteStallCause>
 ColumnFamilyData::GetWriteStallConditionAndCause(
    int num_unflushed_memtables, int num_l0_files,
    uint64_t num_compaction_needed_bytes,
@ -954,7 +938,8 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions(
      internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_STOPS, 1);
      if (compaction_picker_->IsLevel0CompactionInProgress()) {
        internal_stats_->AddCFStats(
-            InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_STOPS, 1);
+            InternalStats::L0_FILE_COUNT_LIMIT_STOPS_WITH_ONGOING_COMPACTION,
+            1);
      }
      ROCKS_LOG_WARN(ioptions_.logger,
                     "[%s] Stopping writes because we have %d level-0 files",
@ -975,7 +960,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions(
          SetupDelay(write_controller, compaction_needed_bytes,
                     prev_compaction_needed_bytes_, was_stopped,
                     mutable_cf_options.disable_auto_compactions);
-      internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_SLOWDOWNS, 1);
+      internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_DELAYS, 1);
      ROCKS_LOG_WARN(
          ioptions_.logger,
          "[%s] Stalling writes because we have %d immutable memtables "
@ -993,11 +978,11 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions(
          SetupDelay(write_controller, compaction_needed_bytes,
                     prev_compaction_needed_bytes_, was_stopped || near_stop,
                     mutable_cf_options.disable_auto_compactions);
-      internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_SLOWDOWNS,
-                                  1);
+      internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_DELAYS, 1);
      if (compaction_picker_->IsLevel0CompactionInProgress()) {
        internal_stats_->AddCFStats(
-            InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS, 1);
+            InternalStats::L0_FILE_COUNT_LIMIT_DELAYS_WITH_ONGOING_COMPACTION,
+            1);
      }
      ROCKS_LOG_WARN(ioptions_.logger,
                     "[%s] Stalling writes because we have %d level-0 files "
@ -1023,7 +1008,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions(
                     prev_compaction_needed_bytes_, was_stopped || near_stop,
                     mutable_cf_options.disable_auto_compactions);
      internal_stats_->AddCFStats(
-          InternalStats::PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, 1);
+          InternalStats::PENDING_COMPACTION_BYTES_LIMIT_DELAYS, 1);
      ROCKS_LOG_WARN(
          ioptions_.logger,
          "[%s] Stalling writes because of estimated pending compaction "
@ -1152,6 +1137,7 @@ Status ColumnFamilyData::RangesOverlapWithMemtables(
  *overlap = false;
  // Create an InternalIterator over all unflushed memtables
  Arena arena;
+  // TODO: plumb Env::IOActivity
  ReadOptions read_opts;
  read_opts.total_order_seek = true;
  MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena);
@ -1255,30 +1241,11 @@ SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(DBImpl* db) {
  // (if no Scrape happens).
  assert(ptr != SuperVersion::kSVInUse);
  SuperVersion* sv = static_cast<SuperVersion*>(ptr);
-  if (sv == SuperVersion::kSVObsolete ||
-      sv->version_number != super_version_number_.load()) {
+  if (sv == SuperVersion::kSVObsolete) {
    RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_ACQUIRES);
-    SuperVersion* sv_to_delete = nullptr;
-
-    if (sv && sv->Unref()) {
-      RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_CLEANUPS);
-      db->mutex()->Lock();
-      // NOTE: underlying resources held by superversion (sst files) might
-      // not be released until the next background job.
-      sv->Cleanup();
-      if (db->immutable_db_options().avoid_unnecessary_blocking_io) {
-        db->AddSuperVersionsToFreeQueue(sv);
-        db->SchedulePurge();
-      } else {
-        sv_to_delete = sv;
-      }
-    } else {
-      db->mutex()->Lock();
-    }
+    db->mutex()->Lock();
    sv = super_version_->Ref();
    db->mutex()->Unlock();
-
-    delete sv_to_delete;
  }
  assert(sv != nullptr);
  return sv;
@ -1409,6 +1376,33 @@ Status ColumnFamilyData::ValidateOptions(
    }
  }

+  // Extra restrictions that apply when the comparator carries user-defined
+  // timestamps (timestamp_size() > 0) but the column family is configured not
+  // to persist them.
+  const auto* ucmp = cf_options.comparator;
+  assert(ucmp);
+  if (ucmp->timestamp_size() > 0 &&
+      !cf_options.persist_user_defined_timestamps) {
+    if (db_options.atomic_flush) {
+      return Status::NotSupported(
+          "Not persisting user-defined timestamps feature is not supported"
+          " in combination with atomic flush.");
+    }
+    if (db_options.allow_concurrent_memtable_write) {
+      return Status::NotSupported(
+          "Not persisting user-defined timestamps feature is not supported"
+          " in combination with concurrent memtable write.");
+    }
+    // Only comparators whose name is longer than, and ends with, ".u64ts" are
+    // accepted.
+    const char* comparator_name = cf_options.comparator->Name();
+    size_t name_size = strlen(comparator_name);
+    const char* suffix = ".u64ts";
+    size_t suffix_size = strlen(suffix);
+    if (name_size <= suffix_size ||
+        strcmp(comparator_name + name_size - suffix_size, suffix) != 0) {
+      return Status::NotSupported(
+          "Not persisting user-defined timestamps "
+          "feature only support user-defined timestamps formatted as "
+          "uint64_t.");
+    }
+  }
+
  if (cf_options.enable_blob_garbage_collection) {
    if (cf_options.blob_garbage_collection_age_cutoff < 0.0 ||
        cf_options.blob_garbage_collection_age_cutoff > 1.0) {
@ -1438,10 +1432,40 @@ Status ColumnFamilyData::ValidateOptions(
        "Memtable per key-value checksum protection only supports 0, 1, 2, 4 "
        "or 8 bytes per key.");
  }
+  if (std::find(supported.begin(), supported.end(),
+                cf_options.block_protection_bytes_per_key) == supported.end()) {
+    return Status::NotSupported(
+        "Block per key-value checksum protection only supports 0, 1, 2, 4 "
+        "or 8 bytes per key.");
+  }
+
+  if (!cf_options.compaction_options_fifo.file_temperature_age_thresholds
+           .empty()) {
+    if (cf_options.compaction_style != kCompactionStyleFIFO) {
+      return Status::NotSupported(
+          "Option file_temperature_age_thresholds only supports FIFO "
+          "compaction.");
+    } else if (cf_options.num_levels > 1) {
+      return Status::NotSupported(
+          "Option file_temperature_age_thresholds is only supported when "
+          "num_levels = 1.");
+    } else {
+      const auto& ages =
+          cf_options.compaction_options_fifo.file_temperature_age_thresholds;
+      assert(ages.size() >= 1);  // non-empty (outer check); size() - 1 is safe
+      // Ages must be strictly increasing: equal neighbours are rejected too.
+      for (size_t i = 0; i < ages.size() - 1; ++i) {
+        if (ages[i].age >= ages[i + 1].age) {
+          return Status::NotSupported(
+              "Option file_temperature_age_thresholds requires elements to be "
+              "sorted in increasing order with respect to `age` field.");
+        }
+      }
+    }
+  }
  return s;
 }

-#ifndef ROCKSDB_LITE
 Status ColumnFamilyData::SetOptions(
    const DBOptions& db_opts,
    const std::unordered_map<std::string, std::string>& options_map) {
@ -1460,7 +1484,6 @@ Status ColumnFamilyData::SetOptions(
  }
  return s;
 }
-#endif  // ROCKSDB_LITE

 // REQUIRES: DB mutex held
 Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) {
@ -1519,6 +1542,43 @@ FSDirectory* ColumnFamilyData::GetDataDir(size_t path_id) const {
  return data_dirs_[path_id].get();
 }

+bool ColumnFamilyData::ShouldPostponeFlushToRetainUDT(
+    uint64_t max_memtable_id) {
+  const Comparator* ucmp = user_comparator();
+  const size_t ts_sz = ucmp->timestamp_size();
+  if (ts_sz == 0 || ioptions_.persist_user_defined_timestamps) {
+    return false;  // no UDTs, or UDTs are persisted: nothing to retain
+  }
+  // If users set the `persist_user_defined_timestamps` flag to false, they
+  // should also set the `full_history_ts_low` flag to indicate the range of
+  // user-defined timestamps to retain in memory. Otherwise, we do not
+  // explicitly postpone flush to retain UDTs.
+  const std::string& full_history_ts_low = GetFullHistoryTsLow();
+  if (full_history_ts_low.empty()) {
+    return false;  // no cutoff configured
+  }
+#ifndef NDEBUG  // debug-only: feeds the ordering assert in the loop
+  Slice last_table_newest_udt;
+#endif /* !NDEBUG */
+  for (const Slice& table_newest_udt :
+       imm()->GetTablesNewestUDT(max_memtable_id)) {
+    assert(table_newest_udt.size() == full_history_ts_low.size());
+    assert(last_table_newest_udt.empty() ||
+           ucmp->CompareTimestamp(table_newest_udt, last_table_newest_udt) >=
+               0);
+    // Tables are visited in ascending memtable ID, so their newest UDTs are
+    // non-decreasing (asserted above). The first table whose newest UDT is at
+    // or above `full_history_ts_low` is enough to postpone the flush.
+    if (ucmp->CompareTimestamp(table_newest_udt, full_history_ts_low) >= 0) {
+      return true;
+    }
+#ifndef NDEBUG
+    last_table_newest_udt = table_newest_udt;
+#endif /* !NDEBUG */
+  }
+  return false;  // every table's newest UDT is below the cutoff
+}
+
 void ColumnFamilyData::RecoverEpochNumbers() {
  assert(current_);
  auto* vstorage = current_->storage_info();
@ -1621,6 +1681,13 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
      db_id_, db_session_id_);
  column_families_.insert({name, id});
  column_family_data_.insert({id, new_cfd});
+  auto ucmp = new_cfd->user_comparator();
+  assert(ucmp);
+  size_t ts_sz = ucmp->timestamp_size();
+  running_ts_sz_.insert({id, ts_sz});
+  if (ts_sz > 0) {
+    ts_sz_for_record_.insert({id, ts_sz});
+  }
  max_column_family_ = std::max(max_column_family_, id);
  // add to linked list
  new_cfd->next_ = dummy_cfd_;
@ -1636,10 +1703,13 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(

 // under a DB mutex AND from a write thread
 void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
-  auto cfd_iter = column_family_data_.find(cfd->GetID());
+  uint32_t cf_id = cfd->GetID();
+  auto cfd_iter = column_family_data_.find(cf_id);
  assert(cfd_iter != column_family_data_.end());
  column_family_data_.erase(cfd_iter);
  column_families_.erase(cfd->GetName());
+  running_ts_sz_.erase(cf_id);
+  ts_sz_for_record_.erase(cf_id);
 }

 // under a DB mutex OR from a write thread
--- a/db/column_family.h
+++ b/db/column_family.h
@ -310,10 +310,6 @@ class ColumnFamilyData {
  void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
  uint64_t GetLogNumber() const { return log_number_; }

-  void SetFlushReason(FlushReason flush_reason) {
-    flush_reason_ = flush_reason;
-  }
-  FlushReason GetFlushReason() const { return flush_reason_; }
  // thread-safe
  const FileOptions* soptions() const;
  const ImmutableOptions* ioptions() const { return &ioptions_; }
@ -339,12 +335,10 @@ class ColumnFamilyData {
  // Validate CF options against DB options
  static Status ValidateOptions(const DBOptions& db_options,
                                const ColumnFamilyOptions& cf_options);
-#ifndef ROCKSDB_LITE
  // REQUIRES: DB mutex held
  Status SetOptions(
      const DBOptions& db_options,
      const std::unordered_map<std::string, std::string>& options_map);
-#endif  // ROCKSDB_LITE

  InternalStats* internal_stats() { return internal_stats_.get(); }

@ -468,12 +462,6 @@ class ColumnFamilyData {
  bool queued_for_flush() { return queued_for_flush_; }
  bool queued_for_compaction() { return queued_for_compaction_; }

-  enum class WriteStallCause {
-    kNone,
-    kMemtableLimit,
-    kL0FileCountLimit,
-    kPendingCompactionBytes,
-  };
  static std::pair<WriteStallCondition, WriteStallCause>
  GetWriteStallConditionAndCause(
      int num_unflushed_memtables, int num_l0_files,
@ -518,6 +506,12 @@ class ColumnFamilyData {
    return full_history_ts_low_;
  }

+  // REQUIRES: DB mutex held.
+  // Return true if flushing up to MemTables with ID `max_memtable_id`
+  // should be postponed to retain user-defined timestamps according to the
+  // user's setting. Called by background flush job.
+  bool ShouldPostponeFlushToRetainUDT(uint64_t max_memtable_id);
+
  ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
  WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; }
  std::shared_ptr<CacheReservationManager>
@ -616,8 +610,6 @@ class ColumnFamilyData {
  // recovered from
  uint64_t log_number_;

-  std::atomic<FlushReason> flush_reason_;
-
  // An object that keeps all the compaction stats
  // and picks the next compaction
  std::unique_ptr<CompactionPicker> compaction_picker_;
@ -719,6 +711,16 @@ class ColumnFamilySet {
                                       Version* dummy_version,
                                       const ColumnFamilyOptions& options);

+  const UnorderedMap<uint32_t, size_t>& GetRunningColumnFamiliesTimestampSize()
+      const {
+    return running_ts_sz_;  // cf id -> timestamp size, all running CFs
+  }
+
+  const UnorderedMap<uint32_t, size_t>&
+  GetColumnFamiliesTimestampSizeForRecord() const {
+    return ts_sz_for_record_;  // cf id -> timestamp size, non-zero sizes only
+  }
+
  iterator begin() { return iterator(dummy_cfd_->next_); }
  iterator end() { return iterator(dummy_cfd_); }

@ -744,6 +746,15 @@ class ColumnFamilySet {
  UnorderedMap<std::string, uint32_t> column_families_;
  UnorderedMap<uint32_t, ColumnFamilyData*> column_family_data_;

+  // Mutating / reading `running_ts_sz_` and `ts_sz_for_record_` follow
+  // the same requirements as `column_families_` and `column_family_data_`.
+  // Mapping from column family id to user-defined timestamp size for all
+  // running column families.
+  UnorderedMap<uint32_t, size_t> running_ts_sz_;
+  // Mapping from column family id to user-defined timestamp size for
+  // column families with non-zero user-defined timestamp size.
+  UnorderedMap<uint32_t, size_t> ts_sz_for_record_;
+
  uint32_t max_column_family_;
  const FileOptions file_options_;

--- a/db/column_family_test.cc
+++ b/db/column_family_test.cc
@ -17,6 +17,7 @@
 #include "options/options_parser.h"
 #include "port/port.h"
 #include "port/stack_trace.h"
+#include "rocksdb/comparator.h"
 #include "rocksdb/convenience.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
@ -63,6 +64,9 @@ class ColumnFamilyTestBase : public testing::Test {
    db_options_.create_if_missing = true;
    db_options_.fail_if_options_file_error = true;
    db_options_.env = env_;
+  }
+
+  void SetUp() override {
    EXPECT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_)));
  }

@ -71,11 +75,7 @@ class ColumnFamilyTestBase : public testing::Test {
    for (auto h : handles_) {
      ColumnFamilyDescriptor cfdescriptor;
      Status s = h->GetDescriptor(&cfdescriptor);
-#ifdef ROCKSDB_LITE
-      EXPECT_TRUE(s.IsNotSupported());
-#else
      EXPECT_OK(s);
-#endif  // ROCKSDB_LITE
      column_families.push_back(cfdescriptor);
    }
    ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
@ -197,12 +197,10 @@ class ColumnFamilyTestBase : public testing::Test {
                               &db_);
  }

-#ifndef ROCKSDB_LITE  // ReadOnlyDB is not supported
  void AssertOpenReadOnly(std::vector<std::string> cf,
                          std::vector<ColumnFamilyOptions> options = {}) {
    ASSERT_OK(OpenReadOnly(cf, options));
  }
-#endif  // !ROCKSDB_LITE

  void Open(std::vector<std::string> cf,
            std::vector<ColumnFamilyOptions> options = {}) {
@ -224,27 +222,16 @@ class ColumnFamilyTestBase : public testing::Test {
  }

  bool IsDbWriteStopped() {
-#ifndef ROCKSDB_LITE
    uint64_t v;
    EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.is-write-stopped", &v));
    return (v == 1);
-#else
-    return dbfull()->TEST_write_controler().IsStopped();
-#endif  // !ROCKSDB_LITE
  }

  uint64_t GetDbDelayedWriteRate() {
-#ifndef ROCKSDB_LITE
    uint64_t v;
    EXPECT_TRUE(
        dbfull()->GetIntProperty("rocksdb.actual-delayed-write-rate", &v));
    return v;
-#else
-    if (!dbfull()->TEST_write_controler().NeedsDelay()) {
-      return 0;
-    }
-    return dbfull()->TEST_write_controler().delayed_write_rate();
-#endif  // !ROCKSDB_LITE
  }

  void Destroy(const std::vector<ColumnFamilyDescriptor>& column_families =
@ -267,7 +254,6 @@ class ColumnFamilyTestBase : public testing::Test {
          db_->CreateColumnFamily(current_cf_opt, cfs[i], &handles_[cfi]));
      names_[cfi] = cfs[i];

-#ifndef ROCKSDB_LITE  // RocksDBLite does not support GetDescriptor
      // Verify the CF options of the returned CF handle.
      ColumnFamilyDescriptor desc;
      ASSERT_OK(handles_[cfi]->GetDescriptor(&desc));
@ -276,7 +262,6 @@ class ColumnFamilyTestBase : public testing::Test {
      ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
          ConfigOptions(), desc.options,
          SanitizeOptions(dbfull()->immutable_db_options(), current_cf_opt)));
-#endif  // !ROCKSDB_LITE
      cfi++;
    }
  }
@ -325,7 +310,6 @@ class ColumnFamilyTestBase : public testing::Test {
    ASSERT_OK(db_->FlushWAL(/*sync=*/false));
  }

-#ifndef ROCKSDB_LITE  // TEST functions in DB are not supported in lite
  void WaitForFlush(int cf) {
    ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
  }
@ -339,7 +323,6 @@ class ColumnFamilyTestBase : public testing::Test {
  void AssertMaxTotalInMemoryState(uint64_t value) {
    ASSERT_EQ(value, MaxTotalInMemoryState());
  }
-#endif  // !ROCKSDB_LITE

  Status Put(int cf, const std::string& key, const std::string& value) {
    return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value));
@ -377,7 +360,6 @@ class ColumnFamilyTestBase : public testing::Test {
                       "rocksdb.num-files-at-level" + std::to_string(level));
  }

-#ifndef ROCKSDB_LITE
  // Return spread of files per level
  std::string FilesPerLevel(int cf) {
    std::string result;
@ -394,31 +376,19 @@ class ColumnFamilyTestBase : public testing::Test {
    result.resize(last_non_zero_offset);
    return result;
  }
-#endif

  void AssertFilesPerLevel(const std::string& value, int cf) {
-#ifndef ROCKSDB_LITE
    ASSERT_EQ(value, FilesPerLevel(cf));
-#else
-    (void)value;
-    (void)cf;
-#endif
  }

-#ifndef ROCKSDB_LITE  // GetLiveFilesMetaData is not supported
  int CountLiveFiles() {
    std::vector<LiveFileMetaData> metadata;
    db_->GetLiveFilesMetaData(&metadata);
    return static_cast<int>(metadata.size());
  }
-#endif  // !ROCKSDB_LITE

  void AssertCountLiveFiles(int expected_value) {
-#ifndef ROCKSDB_LITE
    ASSERT_EQ(expected_value, CountLiveFiles());
-#else
-    (void)expected_value;
-#endif
  }

  // Do n memtable flushes, each of which produces an sstable
@ -432,7 +402,6 @@ class ColumnFamilyTestBase : public testing::Test {
    }
  }

-#ifndef ROCKSDB_LITE  // GetSortedWalFiles is not supported
  int CountLiveLogFiles() {
    int micros_wait_for_log_deletion = 20000;
    env_->SleepForMicroseconds(micros_wait_for_log_deletion);
@ -461,25 +430,18 @@ class ColumnFamilyTestBase : public testing::Test {
    return ret;
    return 0;
  }
-#endif  // !ROCKSDB_LITE

  void AssertCountLiveLogFiles(int value) {
-#ifndef ROCKSDB_LITE  // GetSortedWalFiles is not supported
    ASSERT_EQ(value, CountLiveLogFiles());
-#else
-    (void)value;
-#endif  // !ROCKSDB_LITE
  }

  void AssertNumberOfImmutableMemtables(std::vector<int> num_per_cf) {
    assert(num_per_cf.size() == handles_.size());

-#ifndef ROCKSDB_LITE  // GetProperty is not supported in lite
    for (size_t i = 0; i < num_per_cf.size(); ++i) {
      ASSERT_EQ(num_per_cf[i], GetProperty(static_cast<int>(i),
                                           "rocksdb.num-immutable-mem-table"));
    }
-#endif  // !ROCKSDB_LITE
  }

  void CopyFile(const std::string& source, const std::string& destination,
@ -575,7 +537,6 @@ TEST_P(ColumnFamilyTest, DontReuseColumnFamilyID) {
  }
 }

-#ifndef ROCKSDB_LITE
 TEST_P(ColumnFamilyTest, CreateCFRaceWithGetAggProperty) {
  Open();

@ -598,7 +559,6 @@ TEST_P(ColumnFamilyTest, CreateCFRaceWithGetAggProperty) {

  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
 }
-#endif  // !ROCKSDB_LITE

 class FlushEmptyCFTestWithParam
    : public ColumnFamilyTestBase,
@ -942,7 +902,6 @@ TEST_P(ColumnFamilyTest, IgnoreRecoveredLog) {
  }
 }

-#ifndef ROCKSDB_LITE  // TEST functions used are not supported
 TEST_P(ColumnFamilyTest, FlushTest) {
  Open();
  CreateColumnFamiliesAndReopen({"one", "two"});
@ -1057,7 +1016,6 @@ TEST_P(ColumnFamilyTest, LogDeletionTest) {
  AssertCountLiveLogFiles(4);
  Close();
 }
-#endif  // !ROCKSDB_LITE

 TEST_P(ColumnFamilyTest, CrashAfterFlush) {
  std::unique_ptr<FaultInjectionTestEnv> fault_env(
@ -1097,7 +1055,6 @@ TEST_P(ColumnFamilyTest, OpenNonexistentColumnFamily) {
  ASSERT_TRUE(TryOpen({"default", "dne"}).IsInvalidArgument());
 }

-#ifndef ROCKSDB_LITE  // WaitForFlush() is not supported
 // Makes sure that obsolete log files get deleted
 TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) {
  // disable flushing stale column families
@ -1205,14 +1162,12 @@ TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) {
  AssertCountLiveLogFiles(7);
  Close();
 }
-#endif  // !ROCKSDB_LITE

 // The test is commented out because we want to test that snapshot is
 // not created for memtables not supported it, but There isn't a memtable
 // that doesn't support snapshot right now. If we have one later, we can
 // re-enable the test.
 //
-// #ifndef ROCKSDB_LITE  // Cuckoo is not supported in lite
 //   TEST_P(ColumnFamilyTest, MemtableNotSupportSnapshot) {
 //   db_options_.allow_concurrent_memtable_write = false;
 //   Open();
@ -1232,7 +1187,6 @@ TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) {
 //   {second}); auto* s3 = dbfull()->GetSnapshot(); ASSERT_TRUE(s3 == nullptr);
 //   Close();
 // }
-// #endif  // !ROCKSDB_LITE

 class TestComparator : public Comparator {
  int Compare(const ROCKSDB_NAMESPACE::Slice& /*a*/,
@ -1299,13 +1253,13 @@ TEST_P(ColumnFamilyTest, DifferentMergeOperators) {
  Close();
 }

-#ifndef ROCKSDB_LITE  // WaitForFlush() is not supported
 TEST_P(ColumnFamilyTest, DifferentCompactionStyles) {
  Open();
  CreateColumnFamilies({"one", "two"});
  ColumnFamilyOptions default_cf, one, two;
  db_options_.max_open_files = 20;  // only 10 files in file cache

+  default_cf.level_compaction_dynamic_level_bytes = false;
  default_cf.compaction_style = kCompactionStyleLevel;
  default_cf.num_levels = 3;
  default_cf.write_buffer_size = 64 << 10;  // 64KB
@ -1323,6 +1277,7 @@ TEST_P(ColumnFamilyTest, DifferentCompactionStyles) {
  one.level0_file_num_compaction_trigger = 4;
  one.write_buffer_size = 120000;

+  two.level_compaction_dynamic_level_bytes = false;
  two.compaction_style = kCompactionStyleLevel;
  two.num_levels = 4;
  two.level0_file_num_compaction_trigger = 3;
@ -1367,9 +1322,7 @@ TEST_P(ColumnFamilyTest, DifferentCompactionStyles) {

  Close();
 }
-#endif  // !ROCKSDB_LITE

-#ifndef ROCKSDB_LITE
 // Sync points not supported in RocksDB Lite

 TEST_P(ColumnFamilyTest, MultipleManualCompactions) {
@ -1379,6 +1332,7 @@ TEST_P(ColumnFamilyTest, MultipleManualCompactions) {
  db_options_.max_open_files = 20;  // only 10 files in file cache
  db_options_.max_background_compactions = 3;

+  default_cf.level_compaction_dynamic_level_bytes = false;
  default_cf.compaction_style = kCompactionStyleLevel;
  default_cf.num_levels = 3;
  default_cf.write_buffer_size = 64 << 10;  // 64KB
@ -1395,6 +1349,7 @@ TEST_P(ColumnFamilyTest, MultipleManualCompactions) {
  one.level0_file_num_compaction_trigger = 4;
  one.write_buffer_size = 120000;

+  two.level_compaction_dynamic_level_bytes = false;
  two.compaction_style = kCompactionStyleLevel;
  two.num_levels = 4;
  two.level0_file_num_compaction_trigger = 3;
@ -1477,13 +1432,14 @@ TEST_P(ColumnFamilyTest, AutomaticAndManualCompactions) {
  db_options_.max_open_files = 20;  // only 10 files in file cache
  db_options_.max_background_compactions = 3;

+  default_cf.level_compaction_dynamic_level_bytes = false;
  default_cf.compaction_style = kCompactionStyleLevel;
  default_cf.num_levels = 3;
  default_cf.write_buffer_size = 64 << 10;  // 64KB
  default_cf.target_file_size_base = 30 << 10;
  default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
-  ;
+
  table_options.no_block_cache = true;
  default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));

@ -1494,6 +1450,7 @@ TEST_P(ColumnFamilyTest, AutomaticAndManualCompactions) {
  one.level0_file_num_compaction_trigger = 4;
  one.write_buffer_size = 120000;

+  two.level_compaction_dynamic_level_bytes = false;
  two.compaction_style = kCompactionStyleLevel;
  two.num_levels = 4;
  two.level0_file_num_compaction_trigger = 3;
@ -1572,13 +1529,14 @@ TEST_P(ColumnFamilyTest, ManualAndAutomaticCompactions) {
  db_options_.max_open_files = 20;  // only 10 files in file cache
  db_options_.max_background_compactions = 3;

+  default_cf.level_compaction_dynamic_level_bytes = false;
  default_cf.compaction_style = kCompactionStyleLevel;
  default_cf.num_levels = 3;
  default_cf.write_buffer_size = 64 << 10;  // 64KB
  default_cf.target_file_size_base = 30 << 10;
  default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
  BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
-  ;
+
  table_options.no_block_cache = true;
  default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));

@ -1589,6 +1547,7 @@ TEST_P(ColumnFamilyTest, ManualAndAutomaticCompactions) {
  one.level0_file_num_compaction_trigger = 4;
  one.write_buffer_size = 120000;

+  two.level_compaction_dynamic_level_bytes = false;
  two.compaction_style = kCompactionStyleLevel;
  two.num_levels = 4;
  two.level0_file_num_compaction_trigger = 3;
@ -2033,9 +1992,7 @@ TEST_P(ColumnFamilyTest, SameCFAutomaticManualCompactions) {
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
 }
-#endif  // !ROCKSDB_LITE

-#ifndef ROCKSDB_LITE  // Tailing iterator not supported
 namespace {
 std::string IterStatus(Iterator* iter) {
  std::string result;
@ -2093,9 +2050,7 @@ TEST_P(ColumnFamilyTest, NewIteratorsTest) {
    Destroy();
  }
 }
-#endif  // !ROCKSDB_LITE

-#ifndef ROCKSDB_LITE  // ReadOnlyDB is not supported
 TEST_P(ColumnFamilyTest, ReadOnlyDBTest) {
  Open();
  CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
@ -2144,9 +2099,7 @@ TEST_P(ColumnFamilyTest, ReadOnlyDBTest) {
  s = OpenReadOnly({"one", "four"});
  ASSERT_TRUE(!s.ok());
 }
-#endif  // !ROCKSDB_LITE

-#ifndef ROCKSDB_LITE  //  WaitForFlush() is not supported in lite
 TEST_P(ColumnFamilyTest, DontRollEmptyLogs) {
  Open();
  CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
@ -2168,9 +2121,7 @@ TEST_P(ColumnFamilyTest, DontRollEmptyLogs) {
  ASSERT_EQ(static_cast<size_t>(total_new_writable_files), handles_.size() + 1);
  Close();
 }
-#endif  // !ROCKSDB_LITE

-#ifndef ROCKSDB_LITE  //  WaitForCompaction() is not supported in lite
 TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) {
  Open();
  CreateColumnFamilies({"one", "two"});
@ -2217,7 +2168,6 @@ TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) {
  ASSERT_EQ(0, dbfull()->TEST_total_log_size());
  Close();
 }
-#endif  // !ROCKSDB_LITE

 TEST_P(ColumnFamilyTest, CreateMissingColumnFamilies) {
  Status s = TryOpen({"one", "two"});
@ -2457,8 +2407,6 @@ TEST_P(ColumnFamilyTest, FlushAndDropRaceCondition) {
  Destroy();
 }

-#ifndef ROCKSDB_LITE
-// skipped as persisting options is not supported in ROCKSDB_LITE
 namespace {
 std::atomic<int> test_stage(0);
 std::atomic<bool> ordered_by_writethread(false);
@ -2540,7 +2488,6 @@ TEST_P(ColumnFamilyTest, CreateAndDropRace) {
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
 }
-#endif  // !ROCKSDB_LITE

 TEST_P(ColumnFamilyTest, WriteStallSingleColumnFamily) {
  const uint64_t kBaseRate = 800000u;
@ -2950,7 +2897,6 @@ TEST_P(ColumnFamilyTest, CreateDropAndDestroy) {
  ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh));
 }

-#ifndef ROCKSDB_LITE
 TEST_P(ColumnFamilyTest, CreateDropAndDestroyWithoutFileDeletion) {
  ColumnFamilyHandle* cfh;
  Open();
@ -3005,9 +2951,7 @@ TEST_P(ColumnFamilyTest, FlushCloseWALFiles) {
  db_options_.env = env_;
  Close();
 }
-#endif  // !ROCKSDB_LITE

-#ifndef ROCKSDB_LITE  // WaitForFlush() is not supported
 TEST_P(ColumnFamilyTest, IteratorCloseWALFile1) {
  SpecialEnv env(Env::Default());
  db_options_.env = &env;
@ -3114,9 +3058,7 @@ TEST_P(ColumnFamilyTest, IteratorCloseWALFile2) {
  db_options_.env = env_;
  Close();
 }
-#endif  // !ROCKSDB_LITE

-#ifndef ROCKSDB_LITE  // TEST functions are not supported in lite
 TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) {
  SpecialEnv env(Env::Default());
  // Allow both of flush and purge job to schedule.
@ -3192,7 +3134,6 @@ TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) {
  db_options_.env = env_;
  Close();
 }
-#endif  // !ROCKSDB_LITE

 // Disable on windows because SyncWAL requires env->IsSyncThreadSafe()
 // to return true which is not so in unbuffered mode.
@ -3443,6 +3384,205 @@ TEST(ColumnFamilyTest, ValidateMemtableKVChecksumOption) {
  ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
 }

+// Tests the flushing behavior of a column family to retain user-defined
+// timestamps when `persist_user_defined_timestamps` is false.
+class ColumnFamilyRetainUDTTest : public ColumnFamilyTestBase {
+ public:
+  ColumnFamilyRetainUDTTest() : ColumnFamilyTestBase(kLatestFormatVersion) {}
+
+  void SetUp() override {  // configure non-persisted u64 UDTs, then base SetUp
+    db_options_.allow_concurrent_memtable_write = false;
+    column_family_options_.comparator =
+        test::BytewiseComparatorWithU64TsWrapper();
+    column_family_options_.persist_user_defined_timestamps = false;
+    ColumnFamilyTestBase::SetUp();
+  }
+
+  Status Put(int cf, const std::string& key, const std::string& ts,
+             const std::string& value) {
+    return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(ts),
+                    Slice(value));  // timestamped Put overload
+  }
+};
+
+class TestTsComparator : public Comparator {  // 8-byte ts, no ".u64ts" name
+ public:
+  TestTsComparator() : Comparator(8 /*ts_sz*/) {}
+
+  int Compare(const ROCKSDB_NAMESPACE::Slice& /*a*/,
+              const ROCKSDB_NAMESPACE::Slice& /*b*/) const override {
+    return 0;  // every pair of keys compares equal
+  }
+  const char* Name() const override { return "TestTs"; }
+  void FindShortestSeparator(
+      std::string* /*start*/,
+      const ROCKSDB_NAMESPACE::Slice& /*limit*/) const override {}
+  void FindShortSuccessor(std::string* /*key*/) const override {}
+};
+
+TEST_F(ColumnFamilyRetainUDTTest, SanityCheck) {
+  Open();
+  ColumnFamilyOptions cf_options;
+  cf_options.persist_user_defined_timestamps = false;
+  TestTsComparator test_comparator;
+  cf_options.comparator = &test_comparator;
+  ColumnFamilyHandle* handle;  // out-param; creation below should fail
+  // The feature only supports uint64_t-formatted timestamps, which is checked
+  // via the comparator name's ".u64ts" suffix; "TestTs" does not have it.
+  ASSERT_TRUE(
+      db_->CreateColumnFamily(cf_options, "pikachu", &handle).IsNotSupported());
+
+  Destroy();
+  // Not persisting user-defined timestamps feature doesn't work in combination
+  // with atomic flush.
+  db_options_.atomic_flush = true;
+  ASSERT_TRUE(TryOpen({"default"}).IsNotSupported());
+
+  // Not persisting user-defined timestamps feature doesn't work in combination
+  // with concurrent memtable write.
+  db_options_.atomic_flush = false;
+  db_options_.allow_concurrent_memtable_write = true;
+  ASSERT_TRUE(TryOpen({"default"}).IsNotSupported());
+  Close();
+}
+
+TEST_F(ColumnFamilyRetainUDTTest, FullHistoryTsLowNotSet) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundFlush:CheckFlushRequest:cb", [&](void* arg) {
+        ASSERT_NE(nullptr, arg);
+        auto reschedule_count = *static_cast<int*>(arg);
+        ASSERT_EQ(1, reschedule_count);  // presumably 1 == never rescheduled
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  Open();
+  std::string write_ts;
+  PutFixed64(&write_ts, 1);
+  ASSERT_OK(Put(0, "foo", write_ts, "v1"));
+  // No `full_history_ts_low` explicitly set by user, flush is continued
+  // without checking if its UDTs expired.
+  ASSERT_OK(Flush(0));
+
+  // After flush, `full_history_ts_low` should be automatically advanced to
+  // the effective cutoff timestamp: write_ts + 1
+  std::string cutoff_ts;
+  PutFixed64(&cutoff_ts, 2);
+  std::string effective_full_history_ts_low;
+  ASSERT_OK(
+      db_->GetFullHistoryTsLow(handles_[0], &effective_full_history_ts_low));
+  ASSERT_EQ(cutoff_ts, effective_full_history_ts_low);
+  Close();
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(ColumnFamilyRetainUDTTest, AllKeysExpired) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundFlush:CheckFlushRequest:cb", [&](void* arg) {
+        ASSERT_NE(nullptr, arg);
+        auto reschedule_count = *static_cast<int*>(arg);
+        ASSERT_EQ(1, reschedule_count);  // presumably 1 == never rescheduled
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  Open();
+  std::string write_ts;
+  PutFixed64(&write_ts, 1);
+  ASSERT_OK(Put(0, "foo", write_ts, "v1"));
+  std::string cutoff_ts;
+  PutFixed64(&cutoff_ts, 3);  // above write_ts, so "foo" is already expired
+  ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], cutoff_ts));
+  // All keys expired w.r.t the configured `full_history_ts_low`, flush continue
+  // without the need for a re-schedule.
+  ASSERT_OK(Flush(0));
+
+  // `full_history_ts_low` stays unchanged after flush.
+  std::string effective_full_history_ts_low;
+  ASSERT_OK(
+      db_->GetFullHistoryTsLow(handles_[0], &effective_full_history_ts_low));
+  ASSERT_EQ(cutoff_ts, effective_full_history_ts_low);
+  Close();
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+TEST_F(ColumnFamilyRetainUDTTest, NotAllKeysExpiredFlushToAvoidWriteStall) {
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundFlush:CheckFlushRequest:cb", [&](void* arg) {
+        ASSERT_NE(nullptr, arg);
+        auto reschedule_count = *static_cast<int*>(arg);
+        ASSERT_EQ(1, reschedule_count);  // presumably 1 == never rescheduled
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+  Open();
+  std::string cutoff_ts;
+  std::string write_ts;
+  PutFixed64(&write_ts, 1);
+  ASSERT_OK(Put(0, "foo", write_ts, "v1"));
+  PutFixed64(&cutoff_ts, 1);  // cutoff == write_ts, so "foo" has not expired
+  ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], cutoff_ts));
+  ASSERT_OK(db_->SetOptions(handles_[0], {{"max_write_buffer_number", "1"}}));
+  // Not all keys expired, but flush is continued without a re-schedule because
+  // of risk of write stall.
+  ASSERT_OK(Flush(0));
+
+  // After flush, `full_history_ts_low` should be automatically advanced to
+  // the effective cutoff timestamp: write_ts + 1
+  std::string effective_full_history_ts_low;
+  ASSERT_OK(
+      db_->GetFullHistoryTsLow(handles_[0], &effective_full_history_ts_low));
+
+  cutoff_ts.clear();
+  PutFixed64(&cutoff_ts, 2);
+  ASSERT_EQ(cutoff_ts, effective_full_history_ts_low);
+  Close();
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
+TEST_F(ColumnFamilyRetainUDTTest, NotAllKeysExpiredFlushRescheduled) {
+  std::string cutoff_ts;
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::AfterRetainUDTReschedule:cb", [&](void* /*arg*/) {
+        // Increasing full_history_ts_low so all keys expired after the initial
+        // FlushRequest is rescheduled
+        cutoff_ts.clear();
+        PutFixed64(&cutoff_ts, 3);
+        ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], cutoff_ts));
+      });
+  SyncPoint::GetInstance()->SetCallBack(
+      "DBImpl::BackgroundFlush:CheckFlushRequest:cb", [&](void* arg) {
+        ASSERT_NE(nullptr, arg);
+        auto reschedule_count = *static_cast<int*>(arg);
+        ASSERT_EQ(2, reschedule_count);  // presumably 2 == rescheduled once
+      });
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Open();
+  std::string write_ts;
+  PutFixed64(&write_ts, 1);
+  ASSERT_OK(Put(0, "foo", write_ts, "v1"));
+  PutFixed64(&cutoff_ts, 1);  // cutoff == write_ts, so "foo" has not expired
+  ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], cutoff_ts));
+  // Not all keys expired, and there is no risk of write stall. Flush is
+  // rescheduled. The actual flush happens after `full_history_ts_low` is
+  // increased to mark all keys expired.
+  ASSERT_OK(Flush(0));
+
+  std::string effective_full_history_ts_low;
+  ASSERT_OK(
+      db_->GetFullHistoryTsLow(handles_[0], &effective_full_history_ts_low));
+  // `full_history_ts_low` stays at the value (3) set by the callback above.
+  ASSERT_EQ(cutoff_ts, effective_full_history_ts_low);
+  Close();
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+}
+
 }  // namespace ROCKSDB_NAMESPACE

 int main(int argc, char** argv) {
--- a/db/compact_files_test.cc
+++ b/db/compact_files_test.cc
@ -3,7 +3,6 @@
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).

-#ifndef ROCKSDB_LITE

 #include <mutex>
 #include <string>
@ -67,6 +66,7 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) {
  const int kWriteBufferSize = 10000;
  const int kLevel0Trigger = 2;
  options.create_if_missing = true;
+  options.level_compaction_dynamic_level_bytes = false;
  options.compaction_style = kCompactionStyleLevel;
  // Small slowdown and stop trigger for experimental purpose.
  options.level0_slowdown_writes_trigger = 20;
@ -121,7 +121,9 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) {
 TEST_F(CompactFilesTest, MultipleLevel) {
  Options options;
  options.create_if_missing = true;
-  options.level_compaction_dynamic_level_bytes = true;
+  // Otherwise a background compaction may be triggered to
+  // drain the unnecessary levels
+  options.level_compaction_dynamic_level_bytes = false;
  options.num_levels = 6;
  // Add listener
  FlushedFileCollector* collector = new FlushedFileCollector();
@ -182,7 +184,6 @@ TEST_F(CompactFilesTest, MultipleLevel) {
  for (int invalid_output_level = 0; invalid_output_level < 5;
       invalid_output_level++) {
    s = db->CompactFiles(CompactionOptions(), files, invalid_output_level);
-    std::cout << s.ToString() << std::endl;
    ASSERT_TRUE(s.IsInvalidArgument());
  }

@ -359,6 +360,7 @@ TEST_F(CompactFilesTest, CompactionFilterWithGetSv) {
  std::shared_ptr<FilterWithGet> cf(new FilterWithGet());

  Options options;
+  options.level_compaction_dynamic_level_bytes = false;
  options.create_if_missing = true;
  options.compaction_filter = cf.get();

@ -401,6 +403,7 @@ TEST_F(CompactFilesTest, SentinelCompressionType) {
                                CompactionStyle::kCompactionStyleNone}) {
    ASSERT_OK(DestroyDB(db_name_, Options()));
    Options options;
+    options.level_compaction_dynamic_level_bytes = false;
    options.compaction_style = compaction_style;
    // L0: Snappy, L1: ZSTD, L2: Snappy
    options.compression_per_level = {CompressionType::kSnappyCompression,
@ -490,13 +493,3 @@ int main(int argc, char** argv) {
  return RUN_ALL_TESTS();
 }

-#else
-#include <stdio.h>
-
-int main(int /*argc*/, char** /*argv*/) {
-  fprintf(stderr,
-          "SKIPPED as DBImpl::CompactFiles is not supported in ROCKSDB_LITE\n");
-  return 0;
-}
-
-#endif  // !ROCKSDB_LITE
--- a/db/compaction/compaction.cc
+++ b/db/compaction/compaction.cc
@ -13,6 +13,7 @@
 #include <vector>

 #include "db/column_family.h"
+#include "logging/logging.h"
 #include "rocksdb/compaction_filter.h"
 #include "rocksdb/sst_partitioner.h"
 #include "test_util/sync_point.h"
@ -20,14 +21,16 @@

 namespace ROCKSDB_NAMESPACE {

-int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
-                      const InternalKey& b) {
-  auto c = user_cmp->CompareWithoutTimestamp(a.user_key(), b.user_key());
+const uint64_t kRangeTombstoneSentinel =
+    PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
+
+int sstableKeyCompare(const Comparator* uc, const Slice& a, const Slice& b) {
+  auto c = uc->CompareWithoutTimestamp(ExtractUserKey(a), ExtractUserKey(b));
  if (c != 0) {
    return c;
  }
-  auto a_footer = ExtractInternalKeyFooter(a.Encode());
-  auto b_footer = ExtractInternalKeyFooter(b.Encode());
+  auto a_footer = ExtractInternalKeyFooter(a);
+  auto b_footer = ExtractInternalKeyFooter(b);
  if (a_footer == kRangeTombstoneSentinel) {
    if (b_footer != kRangeTombstoneSentinel) {
      return -1;
@ -201,6 +204,34 @@ bool Compaction::IsFullCompaction(
  return num_files_in_compaction == total_num_files;
 }

+// Returns the table properties gathered for this compaction, keyed by table
+// file name.
+//
+// On the first call, the properties of every file in every input level are
+// read through `input_version_` and stored in `table_properties_`; later calls
+// skip the loading step and return the same map. A file whose properties
+// cannot be loaded is reported via the info log and left out of the map; the
+// call itself does not fail.
+const TablePropertiesCollection& Compaction::GetTableProperties() {
+  if (!input_table_properties_initialized_) {
+    const ReadOptions read_options(Env::IOActivity::kCompaction);
+    for (size_t i = 0; i < num_input_levels(); ++i) {
+      for (const FileMetaData* fmd : *(this->inputs(i))) {
+        std::shared_ptr<const TableProperties> tp;
+        std::string file_name =
+            TableFileName(immutable_options_.cf_paths, fmd->fd.GetNumber(),
+                          fmd->fd.GetPathId());
+        Status s = input_version_->GetTableProperties(read_options, &tp, fmd,
+                                                      &file_name);
+        if (s.ok()) {
+          table_properties_[file_name] = tp;
+        } else {
+          ROCKS_LOG_ERROR(immutable_options_.info_log,
+                          "Unable to load table properties for file %" PRIu64
+                          " --- %s\n",
+                          fmd->fd.GetNumber(), s.ToString().c_str());
+        }
+      }
+    }
+
+    // Mark as done even if some files failed to load, so the inputs are
+    // scanned only once.
+    input_table_properties_initialized_ = true;
+  }
+
+  return table_properties_;
+}
+
 Compaction::Compaction(
    VersionStorageInfo* vstorage, const ImmutableOptions& _immutable_options,
    const MutableCFOptions& _mutable_cf_options,
@ -462,6 +493,11 @@ bool Compaction::IsTrivialMove() const {
    return false;
  }

+  if (compaction_reason_ == CompactionReason::kChangeTemperature) {
+    // Changing temperature usually requires rewriting the file.
+    return false;
+  }
+
  // Used in universal compaction, where trivial move can be done if the
  // input files are non overlapping
  if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) &&
@ -478,26 +514,25 @@ bool Compaction::IsTrivialMove() const {

  // assert inputs_.size() == 1

-  std::unique_ptr<SstPartitioner> partitioner = CreateSstPartitioner();
-
-  for (const auto& file : inputs_.front().files) {
-    std::vector<FileMetaData*> file_grand_parents;
-    if (output_level_ + 1 >= number_levels_) {
-      continue;
-    }
-    input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest,
-                                          &file->largest, &file_grand_parents);
-    const auto compaction_size =
-        file->fd.GetFileSize() + TotalFileSize(file_grand_parents);
-    if (compaction_size > max_compaction_bytes_) {
-      return false;
-    }
-
-    if (partitioner.get() != nullptr) {
-      if (!partitioner->CanDoTrivialMove(file->smallest.user_key(),
-                                         file->largest.user_key())) {
+  if (output_level_ + 1 < number_levels_) {
+    std::unique_ptr<SstPartitioner> partitioner = CreateSstPartitioner();
+    for (const auto& file : inputs_.front().files) {
+      std::vector<FileMetaData*> file_grand_parents;
+      input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest,
+                                            &file->largest,
+                                            &file_grand_parents);
+      const auto compaction_size =
+          file->fd.GetFileSize() + TotalFileSize(file_grand_parents);
+      if (compaction_size > max_compaction_bytes_) {
        return false;
      }
+
+      if (partitioner.get() != nullptr) {
+        if (!partitioner->CanDoTrivialMove(file->smallest.user_key(),
+                                           file->largest.user_key())) {
+          return false;
+        }
+      }
    }
  }

@ -555,6 +590,49 @@ bool Compaction::KeyNotExistsBeyondOutputLevel(
  return false;
 }

+// Returns true only when the user key range [begin_key, end_key) is known not
+// to overlap any file in a level beyond `output_level_`.
+//
+// - If this compaction outputs to the bottommost level, there is nothing
+//   beyond it, so the answer is always true.
+// - For leveled compaction with a non-zero output level, each deeper level is
+//   scanned for a file overlapping the range.
+// - In every other case the function conservatively returns false.
+//
+// `level_ptrs` must have one entry per level. Entry `lvl` is the index of the
+// first file in that level not yet ruled out; it is advanced past files whose
+// largest user key is below `begin_key`, and the updated value is left in the
+// caller's vector.
+// NOTE(review): presumably callers query ranges in increasing key order so
+// that the saved positions stay valid -- confirm against the call sites.
+bool Compaction::KeyRangeNotExistsBeyondOutputLevel(
+    const Slice& begin_key, const Slice& end_key,
+    std::vector<size_t>* level_ptrs) const {
+  assert(input_version_ != nullptr);
+  assert(level_ptrs != nullptr);
+  assert(level_ptrs->size() == static_cast<size_t>(number_levels_));
+  assert(cfd_->user_comparator()->CompareWithoutTimestamp(begin_key, end_key) <
+         0);
+  if (bottommost_level_) {
+    return true /* does not overlap */;
+  } else if (output_level_ != 0 &&
+             cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
+    const Comparator* user_cmp = cfd_->user_comparator();
+    for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
+      const std::vector<FileMetaData*>& files =
+          input_vstorage_->LevelFiles(lvl);
+      for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) {
+        auto* f = files[level_ptrs->at(lvl)];
+        // Advance until the first file with begin_key <= f->largest.user_key()
+        if (user_cmp->CompareWithoutTimestamp(begin_key,
+                                              f->largest.user_key()) > 0) {
+          continue;
+        }
+        // We know that the previous file prev_f, if exists, has
+        // prev_f->largest.user_key() < begin_key.
+        if (user_cmp->CompareWithoutTimestamp(end_key,
+                                              f->smallest.user_key()) <= 0) {
+          // not overlapping with this level
+          break;
+        } else {
+          // We have:
+          // - begin_key < end_key,
+          // - begin_key <= f->largest.user_key(), and
+          // - end_key > f->smallest.user_key()
+          return false /* overlap */;
+        }
+      }
+      // Reaching here means no file in this level overlaps the range; move
+      // on to the next deeper level.
+    }
+    return true /* does not overlap */;
+  }
+  return false /* overlaps */;
+}
+
 // Mark (or clear) each file that is being compacted
 void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
  for (size_t i = 0; i < num_input_levels(); i++) {
--- a/db/compaction/compaction.h
+++ b/db/compaction/compaction.h
@ -18,8 +18,6 @@ namespace ROCKSDB_NAMESPACE {
 // The file contains class Compaction, as well as some helper functions
 // and data structures used by the class.

-const uint64_t kRangeTombstoneSentinel =
-    PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
 // Utility for comparing sstable boundary keys. Returns -1 if either a or b is
 // null which provides the property that a==null indicates a key that is less
 // than any key and b==null indicates a key that is greater than any key. Note
@ -33,8 +31,19 @@ const uint64_t kRangeTombstoneSentinel =
 // that key never appears in the database. We don't want adjacent sstables to
 // be considered overlapping if they are separated by the range tombstone
 // sentinel.
-int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
-                      const InternalKey& b);
+int sstableKeyCompare(const Comparator* user_cmp, const Slice&, const Slice&);
+inline int sstableKeyCompare(const Comparator* user_cmp, const Slice& a,
+                             const InternalKey& b) {
+  return sstableKeyCompare(user_cmp, a, b.Encode());
+}
+inline int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                             const Slice& b) {
+  return sstableKeyCompare(user_cmp, a.Encode(), b);
+}
+inline int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
+                             const InternalKey& b) {
+  return sstableKeyCompare(user_cmp, a.Encode(), b.Encode());
+}
 int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
                      const InternalKey& b);
 int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
@ -205,10 +214,18 @@ class Compaction {
  void AddInputDeletions(VersionEdit* edit);

  // Returns true if the available information we have guarantees that
-  // the input "user_key" does not exist in any level beyond "output_level()".
+  // the input "user_key" does not exist in any level beyond `output_level()`.
  bool KeyNotExistsBeyondOutputLevel(const Slice& user_key,
                                     std::vector<size_t>* level_ptrs) const;

+  // Returns true if the user key range [begin_key, end_key) does not exist
+  // in any level beyond `output_level()`.
+  // Used for checking range tombstones, so we assume begin_key < end_key.
+  // begin_key and end_key should include timestamp if enabled.
+  bool KeyRangeNotExistsBeyondOutputLevel(
+      const Slice& begin_key, const Slice& end_key,
+      std::vector<size_t>* level_ptrs) const;
+
  // Clear all files to indicate that they are not being compacted
  // Delete this compaction from the list of running compactions.
  //
@ -309,12 +326,16 @@ class Compaction {
      int output_level, VersionStorageInfo* vstorage,
      const std::vector<CompactionInputFiles>& inputs);

-  TablePropertiesCollection GetOutputTableProperties() const {
-    return output_table_properties_;
-  }
+  // If called before a compaction finishes, will return
+  // table properties of all compaction input files.
+  // If called after a compaction finished, will return
+  // table properties of all compaction input and output files.
+  const TablePropertiesCollection& GetTableProperties();

-  void SetOutputTableProperties(TablePropertiesCollection tp) {
-    output_table_properties_ = std::move(tp);
+  void SetOutputTableProperties(
+      const std::string& file_name,
+      const std::shared_ptr<const TableProperties>& tp) {
+    table_properties_[file_name] = tp;
  }

  Slice GetSmallestUserKey() const { return smallest_user_key_; }
@ -501,8 +522,9 @@ class Compaction {
  // Does input compression match the output compression?
  bool InputCompressionMatchesOutput() const;

+  bool input_table_properties_initialized_ = false;
-  // table properties of output files
+  // table properties of compaction input and output files
-  TablePropertiesCollection output_table_properties_;
+  TablePropertiesCollection table_properties_;

  // smallest user keys in compaction
  // includes timestamp if user-defined timestamp is enabled.
@ -548,13 +570,16 @@ struct PerKeyPlacementContext {
  const Slice value;
  const SequenceNumber seq_num;

-  bool output_to_penultimate_level;
+  bool& output_to_penultimate_level;

  PerKeyPlacementContext(int _level, Slice _key, Slice _value,
-                         SequenceNumber _seq_num)
-      : level(_level), key(_key), value(_value), seq_num(_seq_num) {
-    output_to_penultimate_level = false;
-  }
+                         SequenceNumber _seq_num,
+                         bool& _output_to_penultimate_level)
+      : level(_level),
+        key(_key),
+        value(_value),
+        seq_num(_seq_num),
+        output_to_penultimate_level(_output_to_penultimate_level) {}
 };
 #endif /* !NDEBUG */

--- a/db/compaction/compaction_iterator.cc
+++ b/db/compaction/compaction_iterator.cc
@ -13,6 +13,7 @@
 #include "db/blob/blob_index.h"
 #include "db/blob/prefetch_buffer_collection.h"
 #include "db/snapshot_checker.h"
+#include "db/wide/wide_column_serialization.h"
 #include "logging/logging.h"
 #include "port/likely.h"
 #include "rocksdb/listener.h"
@ -30,7 +31,8 @@ CompactionIterator::CompactionIterator(
    BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
    bool enforce_single_del_contracts,
    const std::atomic<bool>& manual_compaction_canceled,
-    const Compaction* compaction, const CompactionFilter* compaction_filter,
+    bool must_count_input_entries, const Compaction* compaction,
+    const CompactionFilter* compaction_filter,
    const std::atomic<bool>* shutting_down,
    const std::shared_ptr<Logger> info_log,
    const std::string* full_history_ts_low,
@ -44,8 +46,9 @@ CompactionIterator::CompactionIterator(
          manual_compaction_canceled,
          std::unique_ptr<CompactionProxy>(
              compaction ? new RealCompaction(compaction) : nullptr),
-          compaction_filter, shutting_down, info_log, full_history_ts_low,
-          preserve_time_min_seqno, preclude_last_level_min_seqno) {}
+          must_count_input_entries, compaction_filter, shutting_down, info_log,
+          full_history_ts_low, preserve_time_min_seqno,
+          preclude_last_level_min_seqno) {}

 CompactionIterator::CompactionIterator(
    InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
@ -57,15 +60,14 @@ CompactionIterator::CompactionIterator(
    BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
    bool enforce_single_del_contracts,
    const std::atomic<bool>& manual_compaction_canceled,
-    std::unique_ptr<CompactionProxy> compaction,
+    std::unique_ptr<CompactionProxy> compaction, bool must_count_input_entries,
    const CompactionFilter* compaction_filter,
    const std::atomic<bool>* shutting_down,
    const std::shared_ptr<Logger> info_log,
    const std::string* full_history_ts_low,
    const SequenceNumber preserve_time_min_seqno,
    const SequenceNumber preclude_last_level_min_seqno)
-    : input_(input, cmp,
-             !compaction || compaction->DoesInputReferenceBlobFiles()),
+    : input_(input, cmp, must_count_input_entries),
      cmp_(cmp),
      merge_helper_(merge_helper),
      snapshots_(snapshots),
@ -124,6 +126,9 @@ CompactionIterator::CompactionIterator(
         timestamp_size_ == full_history_ts_low_->size());
 #endif
  input_.SetPinnedItersMgr(&pinned_iters_mgr_);
+  // The default `merge_until_status_` does not need to be checked since it is
+  // overwritten as soon as `MergeUntil()` is called
+  merge_until_status_.PermitUncheckedError();
  TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get());
 }

@ -178,6 +183,20 @@ void CompactionIterator::Next() {
      ikey_.user_key = current_key_.GetUserKey();
      validity_info_.SetValid(ValidContext::kMerge1);
    } else {
+      if (merge_until_status_.IsMergeInProgress()) {
+        // `Status::MergeInProgress()` tells us that the previous `MergeUntil()`
+        // produced only merge operands. Those merge operands were accessed and
+        // written out using `merge_out_iter_`. Since `merge_out_iter_` is
+        // exhausted at this point, all merge operands have been written out.
+        //
+        // Still, there may be a base value (PUT, DELETE, SINGLEDEL, etc.) that
+        // needs to be written out. Normally, `CompactionIterator` would skip it
+        // on the basis that it has already output something in the same
+        // snapshot stripe. To prevent this, we reset `has_current_user_key_` to
+        // trick the future iteration from finding out the snapshot stripe is
+        // unchanged.
+        has_current_user_key_ = false;
+      }
      // We consumed all pinned merge operands, release pinned iterators
      pinned_iters_mgr_.ReleasePinnedData();
      // MergeHelper moves the iterator to the first record after the merged
@ -204,39 +223,47 @@ void CompactionIterator::Next() {

 bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
                                              Slice* skip_until) {
-  // TODO: support compaction filter for wide-column entities
-  if (!compaction_filter_ ||
-      (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex)) {
+  if (!compaction_filter_) {
    return true;
  }
-  bool error = false;
-  // If the user has specified a compaction filter and the sequence
-  // number is greater than any external snapshot, then invoke the
-  // filter. If the return value of the compaction filter is true,
-  // replace the entry with a deletion marker.
-  CompactionFilter::Decision filter = CompactionFilter::Decision::kUndetermined;
-  compaction_filter_value_.clear();
-  compaction_filter_skip_until_.Clear();
+
+  if (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex &&
+      ikey_.type != kTypeWideColumnEntity) {
+    return true;
+  }
+
+  CompactionFilter::Decision decision =
+      CompactionFilter::Decision::kUndetermined;
  CompactionFilter::ValueType value_type =
      ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue
-                               : CompactionFilter::ValueType::kBlobIndex;
+      : ikey_.type == kTypeBlobIndex
+          ? CompactionFilter::ValueType::kBlobIndex
+          : CompactionFilter::ValueType::kWideColumnEntity;
+
  // Hack: pass internal key to BlobIndexCompactionFilter since it needs
  // to get sequence number.
  assert(compaction_filter_);
-  Slice& filter_key =
-      (ikey_.type == kTypeValue ||
+  const Slice& filter_key =
+      (ikey_.type != kTypeBlobIndex ||
       !compaction_filter_->IsStackedBlobDbInternalCompactionFilter())
          ? ikey_.user_key
          : key_;
+
+  compaction_filter_value_.clear();
+  compaction_filter_skip_until_.Clear();
+
+  std::vector<std::pair<std::string, std::string>> new_columns;
+
  {
    StopWatchNano timer(clock_, report_detailed_time_);
-    if (kTypeBlobIndex == ikey_.type) {
-      filter = compaction_filter_->FilterBlobByKey(
+
+    if (ikey_.type == kTypeBlobIndex) {
+      decision = compaction_filter_->FilterBlobByKey(
          level_, filter_key, &compaction_filter_value_,
          compaction_filter_skip_until_.rep());
-      if (CompactionFilter::Decision::kUndetermined == filter &&
+      if (decision == CompactionFilter::Decision::kUndetermined &&
          !compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
-        if (compaction_ == nullptr) {
+        if (!compaction_) {
          status_ =
              Status::Corruption("Unexpected blob index outside of compaction");
          validity_info_.Invalidate();
@ -282,33 +309,61 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
        value_type = CompactionFilter::ValueType::kValue;
      }
    }
-    if (CompactionFilter::Decision::kUndetermined == filter) {
-      filter = compaction_filter_->FilterV2(
-          level_, filter_key, value_type,
-          blob_value_.empty() ? value_ : blob_value_, &compaction_filter_value_,
+
+    if (decision == CompactionFilter::Decision::kUndetermined) {
+      const Slice* existing_val = nullptr;
+      const WideColumns* existing_col = nullptr;
+
+      WideColumns existing_columns;
+
+      if (ikey_.type != kTypeWideColumnEntity) {
+        if (!blob_value_.empty()) {
+          existing_val = &blob_value_;
+        } else {
+          existing_val = &value_;
+        }
+      } else {
+        Slice value_copy = value_;
+        const Status s =
+            WideColumnSerialization::Deserialize(value_copy, existing_columns);
+
+        if (!s.ok()) {
+          status_ = s;
+          validity_info_.Invalidate();
+          return false;
+        }
+
+        existing_col = &existing_columns;
+      }
+
+      decision = compaction_filter_->FilterV3(
+          level_, filter_key, value_type, existing_val, existing_col,
+          &compaction_filter_value_, &new_columns,
          compaction_filter_skip_until_.rep());
    }
+
    iter_stats_.total_filter_time +=
        env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0;
  }

-  if (CompactionFilter::Decision::kUndetermined == filter) {
-    // Should not reach here, since FilterV2 should never return kUndetermined.
-    status_ =
-        Status::NotSupported("FilterV2() should never return kUndetermined");
+  if (decision == CompactionFilter::Decision::kUndetermined) {
+    // Should not reach here, since FilterV2/FilterV3 should never return
+    // kUndetermined.
+    status_ = Status::NotSupported(
+        "FilterV2/FilterV3 should never return kUndetermined");
    validity_info_.Invalidate();
    return false;
  }

-  if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil &&
+  if (decision == CompactionFilter::Decision::kRemoveAndSkipUntil &&
      cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <=
          0) {
    // Can't skip to a key smaller than the current one.
-    // Keep the key as per FilterV2 documentation.
-    filter = CompactionFilter::Decision::kKeep;
+    // Keep the key as per FilterV2/FilterV3 documentation.
+    decision = CompactionFilter::Decision::kKeep;
  }

-  if (filter == CompactionFilter::Decision::kRemove) {
+  if (decision == CompactionFilter::Decision::kRemove) {
    // convert the current key to a delete; key_ is pointing into
    // current_key_ at this point, so updating current_key_ updates key()
    ikey_.type = kTypeDeletion;
@ -316,7 +371,7 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
    // no value associated with delete
    value_.clear();
    iter_stats_.num_record_drop_user++;
-  } else if (filter == CompactionFilter::Decision::kPurge) {
+  } else if (decision == CompactionFilter::Decision::kPurge) {
    // convert the current key to a single delete; key_ is pointing into
    // current_key_ at this point, so updating current_key_ updates key()
    ikey_.type = kTypeSingleDeletion;
@ -324,19 +379,19 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
    // no value associated with single delete
    value_.clear();
    iter_stats_.num_record_drop_user++;
-  } else if (filter == CompactionFilter::Decision::kChangeValue) {
-    if (ikey_.type == kTypeBlobIndex) {
-      // value transfer from blob file to inlined data
+  } else if (decision == CompactionFilter::Decision::kChangeValue) {
+    if (ikey_.type != kTypeValue) {
      ikey_.type = kTypeValue;
-      current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+      current_key_.UpdateInternalKey(ikey_.sequence, kTypeValue);
    }
+
    value_ = compaction_filter_value_;
-  } else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) {
+  } else if (decision == CompactionFilter::Decision::kRemoveAndSkipUntil) {
    *need_skip = true;
    compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber,
                                                     kValueTypeForSeek);
    *skip_until = compaction_filter_skip_until_.Encode();
-  } else if (filter == CompactionFilter::Decision::kChangeBlobIndex) {
+  } else if (decision == CompactionFilter::Decision::kChangeBlobIndex) {
    // Only the StackableDB-based BlobDB impl's compaction filter should return
    // kChangeBlobIndex. Decision about rewriting blob and changing blob index
    // in the integrated BlobDB impl is made in subsequent call to
@ -348,23 +403,56 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
      validity_info_.Invalidate();
      return false;
    }
-    if (ikey_.type == kTypeValue) {
-      // value transfer from inlined data to blob file
+
+    if (ikey_.type != kTypeBlobIndex) {
      ikey_.type = kTypeBlobIndex;
-      current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
+      current_key_.UpdateInternalKey(ikey_.sequence, kTypeBlobIndex);
    }
+
    value_ = compaction_filter_value_;
-  } else if (filter == CompactionFilter::Decision::kIOError) {
+  } else if (decision == CompactionFilter::Decision::kIOError) {
    if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
      status_ = Status::NotSupported(
          "CompactionFilter for integrated BlobDB should not return kIOError");
      validity_info_.Invalidate();
      return false;
    }
+
    status_ = Status::IOError("Failed to access blob during compaction filter");
-    error = true;
+    validity_info_.Invalidate();
+    return false;
+  } else if (decision == CompactionFilter::Decision::kChangeWideColumnEntity) {
+    WideColumns sorted_columns;
+
+    sorted_columns.reserve(new_columns.size());
+    for (const auto& column : new_columns) {
+      sorted_columns.emplace_back(column.first, column.second);
+    }
+
+    std::sort(sorted_columns.begin(), sorted_columns.end(),
+              [](const WideColumn& lhs, const WideColumn& rhs) {
+                return lhs.name().compare(rhs.name()) < 0;
+              });
+
+    {
+      const Status s = WideColumnSerialization::Serialize(
+          sorted_columns, compaction_filter_value_);
+      if (!s.ok()) {
+        status_ = s;
+        validity_info_.Invalidate();
+        return false;
+      }
+    }
+
+    if (ikey_.type != kTypeWideColumnEntity) {
+      ikey_.type = kTypeWideColumnEntity;
+      current_key_.UpdateInternalKey(ikey_.sequence, kTypeWideColumnEntity);
+    }
+
+    value_ = compaction_filter_value_;
  }
-  return !error;
+
+  return true;
 }

 void CompactionIterator::NextFromInput() {
@ -895,14 +983,15 @@ void CompactionIterator::NextFromInput() {
      // have hit (A)
      // We encapsulate the merge related state machine in a different
      // object to minimize change to the existing flow.
-      Status s = merge_helper_->MergeUntil(
+      merge_until_status_ = merge_helper_->MergeUntil(
          &input_, range_del_agg_, prev_snapshot, bottommost_level_,
          allow_data_in_errors_, blob_fetcher_.get(), full_history_ts_low_,
          prefetch_buffers_.get(), &iter_stats_);
      merge_out_iter_.SeekToFirst();

-      if (!s.ok() && !s.IsMergeInProgress()) {
-        status_ = s;
+      if (!merge_until_status_.ok() &&
+          !merge_until_status_.IsMergeInProgress()) {
+        status_ = merge_until_status_;
        return;
      } else if (merge_out_iter_.Valid()) {
        // NOTE: key, value, and ikey_ refer to old entries.
@ -1113,17 +1202,7 @@ void CompactionIterator::GarbageCollectBlobIfNeeded() {

 void CompactionIterator::DecideOutputLevel() {
  assert(compaction_->SupportsPerKeyPlacement());
-#ifndef NDEBUG
-  // Could be overridden by unittest
-  PerKeyPlacementContext context(level_, ikey_.user_key, value_,
-                                 ikey_.sequence);
-  TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
-                           &context);
-  output_to_penultimate_level_ = context.output_to_penultimate_level;
-#else
  output_to_penultimate_level_ = false;
-#endif  // NDEBUG
-
  // if the key is newer than the cutoff sequence or within the earliest
  // snapshot, it should output to the penultimate level.
  if (ikey_.sequence > preclude_last_level_min_seqno_ ||
@ -1131,6 +1210,17 @@ void CompactionIterator::DecideOutputLevel() {
    output_to_penultimate_level_ = true;
  }

+#ifndef NDEBUG
+  // Could be overridden by unittest
+  PerKeyPlacementContext context(level_, ikey_.user_key, value_, ikey_.sequence,
+                                 output_to_penultimate_level_);
+  TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
+                           &context);
+  if (ikey_.sequence > earliest_snapshot_) {
+    output_to_penultimate_level_ = true;
+  }
+#endif  // NDEBUG
+
  if (output_to_penultimate_level_) {
    // If it's decided to output to the penultimate level, but unsafe to do so,
    // still output to the last level. For example, moving the data from a lower
@ -1323,6 +1413,7 @@ std::unique_ptr<BlobFetcher> CompactionIterator::CreateBlobFetcherIfNeeded(
  }

  ReadOptions read_options;
+  read_options.io_activity = Env::IOActivity::kCompaction;
  read_options.fill_cache = false;

  return std::unique_ptr<BlobFetcher>(new BlobFetcher(version, read_options));
--- a/db/compaction/compaction_iterator.h
+++ b/db/compaction/compaction_iterator.h
@ -38,15 +38,18 @@ class SequenceIterWrapper : public InternalIterator {
  bool Valid() const override { return inner_iter_->Valid(); }
  Status status() const override { return inner_iter_->status(); }
  void Next() override {
-    num_itered_++;
+    if (!inner_iter_->IsDeleteRangeSentinelKey()) {
+      num_itered_++;
+    }
    inner_iter_->Next();
  }
  void Seek(const Slice& target) override {
    if (!need_count_entries_) {
+      has_num_itered_ = false;
      inner_iter_->Seek(target);
    } else {
-      // For flush cases, we need to count total number of entries, so we
-      // do Next() rather than Seek().
+      // Need to count total number of entries,
+      // so we do Next() rather than Seek().
      while (inner_iter_->Valid() &&
             icmp_.Compare(inner_iter_->key(), target) < 0) {
        Next();
@ -62,7 +65,8 @@ class SequenceIterWrapper : public InternalIterator {
  void SeekForPrev(const Slice& /* target */) override { assert(false); }
  void SeekToLast() override { assert(false); }

-  uint64_t num_itered() const { return num_itered_; }
+  uint64_t NumItered() const { return num_itered_; }
+  bool HasNumItered() const { return has_num_itered_; }
  bool IsDeleteRangeSentinelKey() const override {
    assert(Valid());
    return inner_iter_->IsDeleteRangeSentinelKey();
@ -73,6 +77,7 @@ class SequenceIterWrapper : public InternalIterator {
  InternalIterator* inner_iter_;  // not owned
  uint64_t num_itered_ = 0;
  bool need_count_entries_;
+  bool has_num_itered_ = true;
 };

 class CompactionIterator {
@ -189,6 +194,10 @@ class CompactionIterator {
    const Compaction* compaction_;
  };

+  // @param must_count_input_entries  if true, `NumInputEntryScanned()` will
+  // return the number of input keys scanned. If false, `NumInputEntryScanned()`
+  // will return this number if no Seek was called on `input`. User should call
+  // `HasNumInputEntryScanned()` first in this case.
  CompactionIterator(
      InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
      SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
@ -199,7 +208,7 @@ class CompactionIterator {
      BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
      bool enforce_single_del_contracts,
      const std::atomic<bool>& manual_compaction_canceled,
-      const Compaction* compaction = nullptr,
+      bool must_count_input_entries, const Compaction* compaction = nullptr,
      const CompactionFilter* compaction_filter = nullptr,
      const std::atomic<bool>* shutting_down = nullptr,
      const std::shared_ptr<Logger> info_log = nullptr,
@ -219,6 +228,7 @@ class CompactionIterator {
      bool enforce_single_del_contracts,
      const std::atomic<bool>& manual_compaction_canceled,
      std::unique_ptr<CompactionProxy> compaction,
+      bool must_count_input_entries,
      const CompactionFilter* compaction_filter = nullptr,
      const std::atomic<bool>* shutting_down = nullptr,
      const std::shared_ptr<Logger> info_log = nullptr,
@ -253,7 +263,8 @@ class CompactionIterator {
    return current_user_key_;
  }
  const CompactionIterationStats& iter_stats() const { return iter_stats_; }
-  uint64_t num_input_entry_scanned() const { return input_.num_itered(); }
+  bool HasNumInputEntryScanned() const { return input_.HasNumItered(); }
+  uint64_t NumInputEntryScanned() const { return input_.NumItered(); }
  // If the current key should be placed on penultimate level, only valid if
  // per_key_placement is supported
  bool output_to_penultimate_level() const {
@ -444,6 +455,7 @@ class CompactionIterator {
  bool clear_and_output_next_key_ = false;

  MergeOutputIterator merge_out_iter_;
+  Status merge_until_status_;
  // PinnedIteratorsManager used to pin input_ Iterator blocks while reading
  // merge operands and then releasing them after consuming them.
  PinnedIteratorsManager pinned_iters_mgr_;
--- a/db/compaction/compaction_iterator_test.cc
+++ b/db/compaction/compaction_iterator_test.cc
@ -293,8 +293,8 @@ class CompactionIteratorTest : public testing::TestWithParam<bool> {
        nullptr /* blob_file_builder */, true /*allow_data_in_errors*/,
        true /*enforce_single_del_contracts*/,
        /*manual_compaction_canceled=*/kManualCompactionCanceledFalse_,
-        std::move(compaction), filter, &shutting_down_, /*info_log=*/nullptr,
-        full_history_ts_low));
+        std::move(compaction), /*must_count_input_entries=*/false, filter,
+        &shutting_down_, /*info_log=*/nullptr, full_history_ts_low));
  }

  void AddSnapshot(SequenceNumber snapshot,
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@ -192,8 +192,8 @@ CompactionJob::CompactionJob(
  assert(log_buffer_ != nullptr);

  const auto* cfd = compact_->compaction->column_family_data();
-  ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
-                                    db_options_.enable_thread_tracking);
+  ThreadStatusUtil::SetEnableTracking(db_options_.enable_thread_tracking);
+  ThreadStatusUtil::SetColumnFamily(cfd);
  ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
  ReportStartedCompaction(compaction);
 }
@ -204,10 +204,6 @@ CompactionJob::~CompactionJob() {
 }

 void CompactionJob::ReportStartedCompaction(Compaction* compaction) {
-  const auto* cfd = compact_->compaction->column_family_data();
-  ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
-                                    db_options_.enable_thread_tracking);
-
  ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID,
                                               job_id_);

@ -264,7 +260,7 @@ void CompactionJob::Prepare() {
    StopWatch sw(db_options_.clock, stats_, SUBCOMPACTION_SETUP_TIME);
    GenSubcompactionBoundaries();
  }
-  if (boundaries_.size() > 1) {
+  if (boundaries_.size() >= 1) {
    for (size_t i = 0; i <= boundaries_.size(); i++) {
      compact_->sub_compact_states.emplace_back(
          c, (i != 0) ? std::optional<Slice>(boundaries_[i - 1]) : std::nullopt,
@ -291,12 +287,14 @@ void CompactionJob::Prepare() {
               c->immutable_options()->preclude_last_level_data_seconds);

  if (preserve_time_duration > 0) {
+    const ReadOptions read_options(Env::IOActivity::kCompaction);
    // setup seqno_time_mapping_
    seqno_time_mapping_.SetMaxTimeDuration(preserve_time_duration);
    for (const auto& each_level : *c->inputs()) {
      for (const auto& fmd : each_level.files) {
        std::shared_ptr<const TableProperties> tp;
-        Status s = cfd->current()->GetTableProperties(&tp, fmd, nullptr);
+        Status s =
+            cfd->current()->GetTableProperties(read_options, &tp, fmd, nullptr);
        if (s.ok()) {
          seqno_time_mapping_.Add(tp->seqno_to_time_mapping)
              .PermitUncheckedError();
@ -472,7 +470,7 @@ void CompactionJob::GenSubcompactionBoundaries() {
  // overlap with N-1 other ranges. Since we requested a relatively large number
  // (128) of ranges from each input files, even N range overlapping would
  // cause relatively small inaccuracy.
-
+  const ReadOptions read_options(Env::IOActivity::kCompaction);
  auto* c = compact_->compaction;
  if (c->max_subcompactions() <= 1 &&
      !(c->immutable_options()->compaction_pri == kRoundRobin &&
@ -506,7 +504,9 @@ void CompactionJob::GenSubcompactionBoundaries() {
        FileMetaData* f = flevel->files[i].file_metadata;
        std::vector<TableReader::Anchor> my_anchors;
        Status s = cfd->table_cache()->ApproximateKeyAnchors(
-            ReadOptions(), icomp, *f, my_anchors);
+            read_options, icomp, *f,
+            c->mutable_cf_options()->block_protection_bytes_per_key,
+            my_anchors);
        if (!s.ok() || my_anchors.empty()) {
          my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize());
        }
@ -722,11 +722,12 @@ Status CompactionJob::Run() {
        // use_direct_io_for_flush_and_compaction is true, we will regard this
        // verification as user reads since the goal is to cache it here for
        // further user reads
-        ReadOptions read_options;
+        const ReadOptions verify_table_read_options(
+            Env::IOActivity::kCompaction);
        InternalIterator* iter = cfd->table_cache()->NewIterator(
-            read_options, file_options_, cfd->internal_comparator(),
-            files_output[file_idx]->meta, /*range_del_agg=*/nullptr,
-            prefix_extractor,
+            verify_table_read_options, file_options_,
+            cfd->internal_comparator(), files_output[file_idx]->meta,
+            /*range_del_agg=*/nullptr, prefix_extractor,
            /*table_reader_ptr=*/nullptr,
            cfd->internal_stats()->GetFileReadHist(
                compact_->compaction->output_level()),
@ -736,7 +737,9 @@ Status CompactionJob::Run() {
                *compact_->compaction->mutable_cf_options()),
            /*smallest_compaction_key=*/nullptr,
            /*largest_compaction_key=*/nullptr,
-            /*allow_unprepared_value=*/false);
+            /*allow_unprepared_value=*/false,
+            compact_->compaction->mutable_cf_options()
+                ->block_protection_bytes_per_key);
        auto s = iter->status();

        if (s.ok() && paranoid_file_checks_) {
@ -793,20 +796,51 @@ Status CompactionJob::Run() {
      auto fn =
          TableFileName(state.compaction->immutable_options()->cf_paths,
                        output.meta.fd.GetNumber(), output.meta.fd.GetPathId());
-      tp[fn] = output.table_properties;
+      compact_->compaction->SetOutputTableProperties(fn,
+                                                     output.table_properties);
    }
  }
-  compact_->compaction->SetOutputTableProperties(std::move(tp));

-  // Finish up all book-keeping to unify the subcompaction results
+  // Finish up all bookkeeping to unify the subcompaction results.
  compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
-  UpdateCompactionStats();
-
+  uint64_t num_input_range_del = 0;
+  bool ok = UpdateCompactionStats(&num_input_range_del);
+  // (Sub)compactions returned ok, do sanity check on the number of input keys.
+  if (status.ok() && ok && compaction_job_stats_->has_num_input_records) {
+    size_t ts_sz = compact_->compaction->column_family_data()
+                       ->user_comparator()
+                       ->timestamp_size();
+    // When trim_ts_ is non-empty, CompactionIterator takes
+    // HistoryTrimmingIterator as input iterator and sees a trimmed view of
+    // input keys. So the number of keys it processed is not suitable for
+    // verification here.
+    // TODO: support verification when trim_ts_ is non-empty.
+    if (!(ts_sz > 0 && !trim_ts_.empty()) &&
+        db_options_.compaction_verify_record_count) {
+      assert(compaction_stats_.stats.num_input_records > 0);
+      // TODO: verify the number of range deletion entries.
+      uint64_t expected =
+          compaction_stats_.stats.num_input_records - num_input_range_del;
+      uint64_t actual = compaction_job_stats_->num_input_records;
+      if (expected != actual) {
+        std::string msg =
+            "Total number of input records: " + std::to_string(expected) +
+            ", but processed " + std::to_string(actual) + " records.";
+        ROCKS_LOG_WARN(
+            db_options_.info_log, "[%s] [JOB %d] Compaction %s",
+            compact_->compaction->column_family_data()->GetName().c_str(),
+            job_context_->job_id, msg.c_str());
+        status = Status::Corruption(
+            "Compaction number of input keys does not match number of keys "
+            "processed.");
+      }
+    }
+  }
  RecordCompactionIOStats();
  LogFlush(db_options_.info_log);
  TEST_SYNC_POINT("CompactionJob::Run():End");
-
  compact_->status = status;
+  TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():EndStatusSet", &status);
  return status;
 }

@ -978,7 +1012,6 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) {

 void CompactionJob::NotifyOnSubcompactionBegin(
    SubcompactionState* sub_compact) {
-#ifndef ROCKSDB_LITE
  Compaction* c = compact_->compaction;

  if (db_options_.listeners.empty()) {
@ -1004,14 +1037,10 @@ void CompactionJob::NotifyOnSubcompactionBegin(
  }
  info.status.PermitUncheckedError();

-#else
-  (void)sub_compact;
-#endif  // ROCKSDB_LITE
 }

 void CompactionJob::NotifyOnSubcompactionCompleted(
    SubcompactionState* sub_compact) {
-#ifndef ROCKSDB_LITE

  if (db_options_.listeners.empty()) {
    return;
@ -1032,16 +1061,11 @@ void CompactionJob::NotifyOnSubcompactionCompleted(
  for (const auto& listener : db_options_.listeners) {
    listener->OnSubcompactionCompleted(info);
  }
-#else
-  (void)sub_compact;
-#endif  // ROCKSDB_LITE
 }

 void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
  assert(sub_compact);
  assert(sub_compact->compaction);
-
-#ifndef ROCKSDB_LITE
  if (db_options_.compaction_service) {
    CompactionServiceJobStatus comp_status =
        ProcessKeyValueCompactionWithCompactionService(sub_compact);
@ -1052,7 +1076,6 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
    // fallback to local compaction
    assert(comp_status == CompactionServiceJobStatus::kUseLocal);
  }
-#endif  // !ROCKSDB_LITE

  uint64_t prev_cpu_micros = db_options_.clock->CPUMicros();

@ -1093,6 +1116,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
  read_options.verify_checksums = true;
  read_options.fill_cache = false;
  read_options.rate_limiter_priority = GetRateLimiterPriority();
+  read_options.io_activity = Env::IOActivity::kCompaction;
  // Compaction iterators shouldn't be confined to a single prefix.
  // Compactions use Seek() for
  // (a) concurrent compactions,
@ -1103,17 +1127,17 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
  // GenSubcompactionBoundaries doesn't strip away the timestamp.
  size_t ts_sz = cfd->user_comparator()->timestamp_size();
  if (start.has_value()) {
-    read_options.iterate_lower_bound = &start.value();
+    read_options.iterate_lower_bound = &(*start);
    if (ts_sz > 0) {
-      start_without_ts = StripTimestampFromUserKey(start.value(), ts_sz);
-      read_options.iterate_lower_bound = &start_without_ts.value();
+      start_without_ts = StripTimestampFromUserKey(*start, ts_sz);
+      read_options.iterate_lower_bound = &(*start_without_ts);
    }
  }
  if (end.has_value()) {
-    read_options.iterate_upper_bound = &end.value();
+    read_options.iterate_upper_bound = &(*end);
    if (ts_sz > 0) {
-      end_without_ts = StripTimestampFromUserKey(end.value(), ts_sz);
-      read_options.iterate_upper_bound = &end_without_ts.value();
+      end_without_ts = StripTimestampFromUserKey(*end, ts_sz);
+      read_options.iterate_upper_bound = &(*end_without_ts);
    }
  }

@ -1128,6 +1152,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
  IterKey end_ikey;
  Slice start_slice;
  Slice end_slice;
+  Slice start_user_key{};
+  Slice end_user_key{};

  static constexpr char kMaxTs[] =
      "\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff";
@ -1143,21 +1169,22 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
  }

  if (start.has_value()) {
-    start_ikey.SetInternalKey(start.value(), kMaxSequenceNumber,
-                              kValueTypeForSeek);
+    start_ikey.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek);
    if (ts_sz > 0) {
      start_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
                                   &ts_slice);
    }
    start_slice = start_ikey.GetInternalKey();
+    start_user_key = start_ikey.GetUserKey();
  }
  if (end.has_value()) {
-    end_ikey.SetInternalKey(end.value(), kMaxSequenceNumber, kValueTypeForSeek);
+    end_ikey.SetInternalKey(*end, kMaxSequenceNumber, kValueTypeForSeek);
    if (ts_sz > 0) {
      end_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
                                 &ts_slice);
    }
    end_slice = end_ikey.GetInternalKey();
+    end_user_key = end_ikey.GetUserKey();
  }

  std::unique_ptr<InternalIterator> clip;
@ -1256,6 +1283,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
      /*expect_valid_internal_key=*/true, range_del_agg.get(),
      blob_file_builder.get(), db_options_.allow_data_in_errors,
      db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
+      sub_compact->compaction
+          ->DoesInputReferenceBlobFiles() /* must_count_input_entries */,
      sub_compact->compaction, compaction_filter, shutting_down_,
      db_options_.info_log, full_history_ts_low, preserve_time_min_seqno_,
      preclude_last_level_min_seqno_);
@ -1273,11 +1302,15 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
      [this, sub_compact](CompactionOutputs& outputs) {
        return this->OpenCompactionOutputFile(sub_compact, outputs);
      };
+
  const CompactionFileCloseFunc close_file_func =
-      [this, sub_compact](CompactionOutputs& outputs, const Status& status,
-                          const Slice& next_table_min_key) {
-        return this->FinishCompactionOutputFile(status, sub_compact, outputs,
-                                                next_table_min_key);
+      [this, sub_compact, start_user_key, end_user_key](
+          CompactionOutputs& outputs, const Status& status,
+          const Slice& next_table_min_key) {
+        return this->FinishCompactionOutputFile(
+            status, sub_compact, outputs, next_table_min_key,
+            sub_compact->start.has_value() ? &start_user_key : nullptr,
+            sub_compact->end.has_value() ? &end_user_key : nullptr);
      };

  Status status;
@ -1288,8 +1321,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
  while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) {
    // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
    // returns true.
-    assert(!end.has_value() || cfd->user_comparator()->Compare(
-                                   c_iter->user_key(), end.value()) < 0);
+    assert(!end.has_value() ||
+           cfd->user_comparator()->Compare(c_iter->user_key(), *end) < 0);

    if (c_iter_stats.num_input_records % kRecordStatsEvery ==
        kRecordStatsEvery - 1) {
@ -1316,8 +1349,25 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
    if (c_iter->status().IsManualCompactionPaused()) {
      break;
    }
+
+#ifndef NDEBUG
+    bool stop = false;
+    TEST_SYNC_POINT_CALLBACK("CompactionJob::ProcessKeyValueCompaction()::stop",
+                             static_cast<void*>(&stop));
+    if (stop) {
+      break;
+    }
+#endif  // NDEBUG
  }

+  // This number may not be accurate when CompactionIterator was created
+  // with `must_count_input_entries=false`.
+  assert(!sub_compact->compaction->DoesInputReferenceBlobFiles() ||
+         c_iter->HasNumInputEntryScanned());
+  sub_compact->compaction_job_stats.has_num_input_records =
+      c_iter->HasNumInputEntryScanned();
+  sub_compact->compaction_job_stats.num_input_records =
+      c_iter->NumInputEntryScanned();
  sub_compact->compaction_job_stats.num_blobs_read =
      c_iter_stats.num_blobs_read;
  sub_compact->compaction_job_stats.total_blob_bytes_read =
@ -1467,7 +1517,8 @@ void CompactionJob::RecordDroppedKeys(

 Status CompactionJob::FinishCompactionOutputFile(
    const Status& input_status, SubcompactionState* sub_compact,
-    CompactionOutputs& outputs, const Slice& next_table_min_key) {
+    CompactionOutputs& outputs, const Slice& next_table_min_key,
+    const Slice* comp_start_user_key, const Slice* comp_end_user_key) {
  AutoThreadOperationStageUpdater stage_updater(
      ThreadStatus::STAGE_COMPACTION_SYNC_FILE);
  assert(sub_compact != nullptr);
@ -1497,12 +1548,10 @@ Status CompactionJob::FinishCompactionOutputFile(
    // output_to_penultimate_level compaction here, as it's only used to decide
    // if range dels could be dropped.
    if (outputs.HasRangeDel()) {
-      s = outputs.AddRangeDels(
-          sub_compact->start.has_value() ? &(sub_compact->start.value())
-                                         : nullptr,
-          sub_compact->end.has_value() ? &(sub_compact->end.value()) : nullptr,
-          range_del_out_stats, bottommost_level_, cfd->internal_comparator(),
-          earliest_snapshot, next_table_min_key, full_history_ts_low_);
+      s = outputs.AddRangeDels(comp_start_user_key, comp_end_user_key,
+                               range_del_out_stats, bottommost_level_,
+                               cfd->internal_comparator(), earliest_snapshot,
+                               next_table_min_key, full_history_ts_low_);
    }
    RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats);
    TEST_SYNC_POINT("CompactionJob::FinishCompactionOutputFile1");
@ -1614,7 +1663,6 @@ Status CompactionJob::FinishCompactionOutputFile(
      TableFileCreationReason::kCompaction, status_for_listener, file_checksum,
      file_checksum_func_name);

-#ifndef ROCKSDB_LITE
  // Report new file to SstFileManagerImpl
  auto sfm =
      static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
@ -1633,7 +1681,6 @@ Status CompactionJob::FinishCompactionOutputFile(
      db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction);
    }
  }
-#endif

  outputs.ResetBuilder();
  return s;
@ -1645,6 +1692,7 @@ Status CompactionJob::InstallCompactionResults(

  db_mutex_->AssertHeld();

+  const ReadOptions read_options(Env::IOActivity::kCompaction);
  auto* compaction = compact_->compaction;
  assert(compaction);

@ -1722,8 +1770,8 @@ Status CompactionJob::InstallCompactionResults(
  }

  return versions_->LogAndApply(compaction->column_family_data(),
-                                mutable_cf_options, edit, db_mutex_,
-                                db_directory_);
+                                mutable_cf_options, read_options, edit,
+                                db_mutex_, db_directory_);
 }

 void CompactionJob::RecordCompactionIOStats() {
@ -1758,11 +1806,9 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
  std::string fname = GetTableFileName(file_number);
  // Fire events.
  ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
-#ifndef ROCKSDB_LITE
  EventHelpers::NotifyTableFileCreationStarted(
      cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_,
      TableFileCreationReason::kCompaction);
-#endif  // !ROCKSDB_LITE
  // Make the output file
  std::unique_ptr<FSWritableFile> writable_file;
 #ifndef NDEBUG
@ -1821,10 +1867,10 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
  uint64_t current_time = static_cast<uint64_t>(temp_current_time);
  InternalKey tmp_start, tmp_end;
  if (sub_compact->start.has_value()) {
-    tmp_start.SetMinPossibleForUserKey(sub_compact->start.value());
+    tmp_start.SetMinPossibleForUserKey(*(sub_compact->start));
  }
  if (sub_compact->end.has_value()) {
-    tmp_end.SetMinPossibleForUserKey(sub_compact->end.value());
+    tmp_end.SetMinPossibleForUserKey(*(sub_compact->end));
  }
  uint64_t oldest_ancester_time =
      sub_compact->compaction->MinInputFileOldestAncesterTime(
@ -1899,7 +1945,6 @@ void CompactionJob::CleanupCompaction() {
  compact_ = nullptr;
 }

-#ifndef ROCKSDB_LITE
 namespace {
 void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
  assert(prefix_length > 0);
@ -1908,25 +1953,53 @@ void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
 }
 }  // namespace

-#endif  // !ROCKSDB_LITE
-
-void CompactionJob::UpdateCompactionStats() {
+bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) {
  assert(compact_);

  Compaction* compaction = compact_->compaction;
  compaction_stats_.stats.num_input_files_in_non_output_levels = 0;
  compaction_stats_.stats.num_input_files_in_output_level = 0;
+
+  bool has_error = false;
+  const ReadOptions read_options(Env::IOActivity::kCompaction);
+  const auto& input_table_properties = compaction->GetTableProperties();
  for (int input_level = 0;
       input_level < static_cast<int>(compaction->num_input_levels());
       ++input_level) {
+    size_t num_input_files = compaction->num_input_files(input_level);
+    uint64_t* bytes_read;
    if (compaction->level(input_level) != compaction->output_level()) {
-      UpdateCompactionInputStatsHelper(
-          &compaction_stats_.stats.num_input_files_in_non_output_levels,
-          &compaction_stats_.stats.bytes_read_non_output_levels, input_level);
+      compaction_stats_.stats.num_input_files_in_non_output_levels +=
+          static_cast<int>(num_input_files);
+      bytes_read = &compaction_stats_.stats.bytes_read_non_output_levels;
    } else {
-      UpdateCompactionInputStatsHelper(
-          &compaction_stats_.stats.num_input_files_in_output_level,
-          &compaction_stats_.stats.bytes_read_output_level, input_level);
+      compaction_stats_.stats.num_input_files_in_output_level +=
+          static_cast<int>(num_input_files);
+      bytes_read = &compaction_stats_.stats.bytes_read_output_level;
+    }
+    for (size_t i = 0; i < num_input_files; ++i) {
+      const FileMetaData* file_meta = compaction->input(input_level, i);
+      *bytes_read += file_meta->fd.GetFileSize();
+      uint64_t file_input_entries = file_meta->num_entries;
+      uint64_t file_num_range_del = file_meta->num_range_deletions;
+      if (file_input_entries == 0) {
+        uint64_t file_number = file_meta->fd.GetNumber();
+        // Try getting info from table property
+        std::string fn =
+            TableFileName(compaction->immutable_options()->cf_paths,
+                          file_number, file_meta->fd.GetPathId());
+        const auto& tp = input_table_properties.find(fn);
+        if (tp != input_table_properties.end()) {
+          file_input_entries = tp->second->num_entries;
+          file_num_range_del = tp->second->num_range_deletions;
+        } else {
+          has_error = true;
+        }
+      }
+      compaction_stats_.stats.num_input_records += file_input_entries;
+      if (num_input_range_del) {
+        *num_input_range_del += file_num_range_del;
+      }
    }
  }

@ -1936,26 +2009,11 @@ void CompactionJob::UpdateCompactionStats() {

  compaction_stats_.stats.num_dropped_records =
      compaction_stats_.DroppedRecords();
-}
-
-void CompactionJob::UpdateCompactionInputStatsHelper(int* num_files,
-                                                     uint64_t* bytes_read,
-                                                     int input_level) {
-  const Compaction* compaction = compact_->compaction;
-  auto num_input_files = compaction->num_input_files(input_level);
-  *num_files += static_cast<int>(num_input_files);
-
-  for (size_t i = 0; i < num_input_files; ++i) {
-    const auto* file_meta = compaction->input(input_level, i);
-    *bytes_read += file_meta->fd.GetFileSize();
-    compaction_stats_.stats.num_input_records +=
-        static_cast<uint64_t>(file_meta->num_entries);
-  }
+  return !has_error;
 }

 void CompactionJob::UpdateCompactionJobStats(
    const InternalStats::CompactionStats& stats) const {
-#ifndef ROCKSDB_LITE
  compaction_job_stats_->elapsed_micros = stats.micros;

  // input information
@ -1982,9 +2040,6 @@ void CompactionJob::UpdateCompactionJobStats(
    CopyPrefix(compact_->LargestUserKey(), CompactionJobStats::kMaxPrefixLength,
               &compaction_job_stats_->largest_output_key_prefix);
  }
-#else
-  (void)stats;
-#endif  // !ROCKSDB_LITE
 }

 void CompactionJob::LogCompaction() {
--- a/db/compaction/compaction_job.h
+++ b/db/compaction/compaction_job.h
@ -192,7 +192,21 @@ class CompactionJob {
  IOStatus io_status() const { return io_status_; }

 protected:
-  void UpdateCompactionStats();
+  // Update the following stats in compaction_stats_.stats
+  // - num_input_files_in_non_output_levels
+  // - num_input_files_in_output_level
+  // - bytes_read_non_output_levels
+  // - bytes_read_output_level
+  // - num_input_records
+  // - bytes_read_blob
+  // - num_dropped_records
+  //
+  // @param num_input_range_del if non-null, incremented by the number of range
+  // deletion entries in this compaction input (caller initializes it).
+  //
+  // Returns true iff compaction_stats_.stats.num_input_records and
+  // num_input_range_del are calculated successfully.
+  bool UpdateCompactionStats(uint64_t* num_input_range_del = nullptr);
  void LogCompaction();
  virtual void RecordCompactionIOStats();
  void CleanupCompaction();
@ -256,7 +270,9 @@ class CompactionJob {
  Status FinishCompactionOutputFile(const Status& input_status,
                                    SubcompactionState* sub_compact,
                                    CompactionOutputs& outputs,
-                                    const Slice& next_table_min_key);
+                                    const Slice& next_table_min_key,
+                                    const Slice* comp_start_user_key,
+                                    const Slice* comp_end_user_key);
  Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options);
  Status OpenCompactionOutputFile(SubcompactionState* sub_compact,
                                  CompactionOutputs& outputs);
@ -265,9 +281,6 @@ class CompactionJob {
  void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats,
                         CompactionJobStats* compaction_job_stats = nullptr);

-  void UpdateCompactionInputStatsHelper(int* num_files, uint64_t* bytes_read,
-                                        int input_level);
-
  void NotifyOnSubcompactionBegin(SubcompactionState* sub_compact);

  void NotifyOnSubcompactionCompleted(SubcompactionState* sub_compact);
--- a/db/compaction/compaction_job_stats_test.cc
+++ b/db/compaction/compaction_job_stats_test.cc
@ -24,7 +24,7 @@
 #include "db/write_batch_internal.h"
 #include "env/mock_env.h"
 #include "file/filename.h"
-#include "monitoring/statistics.h"
+#include "monitoring/statistics_impl.h"
 #include "monitoring/thread_status_util.h"
 #include "port/stack_trace.h"
 #include "rocksdb/cache.h"
@ -54,12 +54,11 @@
 #include "util/compression.h"
 #include "util/hash.h"
 #include "util/mutexlock.h"
-#include "util/rate_limiter.h"
+#include "util/rate_limiter_impl.h"
 #include "util/string_util.h"
 #include "utilities/merge_operators.h"

 #if !defined(IOS_CROSS_COMPILE)
-#ifndef ROCKSDB_LITE
 namespace ROCKSDB_NAMESPACE {

 static std::string RandomString(Random* rnd, int len, double ratio) {
@ -617,6 +616,7 @@ TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) {
  // via AddExpectedStats().
  auto* stats_checker = new CompactionJobStatsChecker();
  Options options;
+  options.level_compaction_dynamic_level_bytes = false;
  options.listeners.emplace_back(stats_checker);
  options.create_if_missing = true;
  // just enough setting to hold off auto-compaction.
@ -816,6 +816,7 @@ TEST_P(CompactionJobStatsTest, DeletionStatsTest) {
  // what we expect.
  auto* stats_checker = new CompactionJobDeletionStatsChecker();
  Options options;
+  options.level_compaction_dynamic_level_bytes = false;
  options.listeners.emplace_back(stats_checker);
  options.create_if_missing = true;
  options.level0_file_num_compaction_trigger = kTestScale + 1;
@ -959,15 +960,6 @@ int main(int argc, char** argv) {
  return RUN_ALL_TESTS();
 }

-#else
-#include <stdio.h>
-
-int main(int /*argc*/, char** /*argv*/) {
-  fprintf(stderr, "SKIPPED, not supported in ROCKSDB_LITE\n");
-  return 0;
-}
-
-#endif  // !ROCKSDB_LITE

 #else

--- a/db/compaction/compaction_job_test.cc
+++ b/db/compaction/compaction_job_test.cc
@ -3,7 +3,6 @@
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).

-#ifndef ROCKSDB_LITE

 #include "db/compaction/compaction_job.h"

@ -374,7 +373,7 @@ class CompactionJobTestBase : public testing::Test {
    } else if (table_type_ == TableTypeForTest::kMockTable) {
      file_size = 10;
      EXPECT_OK(mock_table_factory_->CreateMockTable(
-          env_, GenerateFileName(file_number), std::move(contents)));
+          env_, GenerateFileName(file_number), contents));
    } else {
      assert(false);
    }
@ -387,12 +386,13 @@ class CompactionJobTestBase : public testing::Test {
        kUnknownFileCreationTime,
        versions_->GetColumnFamilySet()->GetDefault()->NewEpochNumber(),
        kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2,
-        0);
+        /*compensated_range_deletion_size=*/0, /*tail_size=*/0,
+        /*user_defined_timestamps_persisted=*/true);

    mutex_.Lock();
-    EXPECT_OK(
-        versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
-                               mutable_cf_options_, &edit, &mutex_, nullptr));
+    EXPECT_OK(versions_->LogAndApply(
+        versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_,
+        read_options_, &edit, &mutex_, nullptr));
    mutex_.Unlock();
  }

@ -455,7 +455,8 @@ class CompactionJobTestBase : public testing::Test {
      Status s = cf_options_.table_factory->NewTableReader(
          read_opts,
          TableReaderOptions(*cfd->ioptions(), nullptr, FileOptions(),
-                             cfd_->internal_comparator()),
+                             cfd_->internal_comparator(),
+                             0 /* block_protection_bytes_per_key */),
          std::move(freader), file_size, &table_reader, false);
      ASSERT_OK(s);
      assert(table_reader);
@ -654,11 +655,12 @@ class CompactionJobTestBase : public testing::Test {
    ASSERT_TRUE(full_history_ts_low_.empty() ||
                ucmp_->timestamp_size() == full_history_ts_low_.size());
    const std::atomic<bool> kManualCompactionCanceledFalse{false};
+    JobContext job_context(1, false /* create_superversion */);
    CompactionJob compaction_job(
        0, &compaction, db_options_, mutable_db_options_, env_options_,
        versions_.get(), &shutting_down_, &log_buffer, nullptr, nullptr,
        nullptr, nullptr, &mutex_, &error_handler_, snapshots,
-        earliest_write_conflict_snapshot, snapshot_checker, nullptr,
+        earliest_write_conflict_snapshot, snapshot_checker, &job_context,
        table_cache_, &event_logger, false, false, dbname_,
        &compaction_job_stats_, Env::Priority::USER, nullptr /* IOTracer */,
        /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
@ -728,6 +730,7 @@ class CompactionJobTestBase : public testing::Test {
  ColumnFamilyOptions cf_options_;
  MutableCFOptions mutable_cf_options_;
  MutableDBOptions mutable_db_options_;
+  const ReadOptions read_options_;
  std::shared_ptr<Cache> table_cache_;
  WriteController write_controller_;
  WriteBufferManager write_buffer_manager_;
@ -2441,14 +2444,3 @@ int main(int argc, char** argv) {
  RegisterCustomObjects(argc, argv);
  return RUN_ALL_TESTS();
 }
-
-#else
-#include <stdio.h>
-
-int main(int /*argc*/, char** /*argv*/) {
-  fprintf(stderr,
-          "SKIPPED as CompactionJobStats is not supported in ROCKSDB_LITE\n");
-  return 0;
-}
-
-#endif  // ROCKSDB_LITE
--- a/db/compaction/compaction_outputs.cc
+++ b/db/compaction/compaction_outputs.cc
@ -43,7 +43,10 @@ Status CompactionOutputs::Finish(const Status& intput_status,
  const uint64_t current_bytes = builder_->FileSize();
  if (s.ok()) {
    meta->fd.file_size = current_bytes;
+    meta->tail_size = builder_->GetTailSize();
    meta->marked_for_compaction = builder_->NeedCompact();
+    meta->user_defined_timestamps_persisted = static_cast<bool>(
+        builder_->GetTableProperties().user_defined_timestamps_persisted);
  }
  current_output().finished = true;
  stats_.bytes_written += current_bytes;
@ -76,6 +79,46 @@ IOStatus CompactionOutputs::WriterSyncClose(const Status& input_status,
  return io_s;
 }

+bool CompactionOutputs::UpdateFilesToCutForTTLStates(
+    const Slice& internal_key) {
+  if (!files_to_cut_for_ttl_.empty()) {
+    const InternalKeyComparator* icmp =
+        &compaction_->column_family_data()->internal_comparator();
+    if (cur_files_to_cut_for_ttl_ != -1) {
+      // Previous key was inside a tracked file; cut once this key passes it
+      if (icmp->Compare(internal_key,
+                        files_to_cut_for_ttl_[cur_files_to_cut_for_ttl_]
+                            ->largest.Encode()) > 0) {
+        next_files_to_cut_for_ttl_ = cur_files_to_cut_for_ttl_ + 1;
+        cur_files_to_cut_for_ttl_ = -1;
+        return true;
+      }
+    } else {
+      // Previous key was not inside a file; look for this key's position
+      while (next_files_to_cut_for_ttl_ <
+             static_cast<int>(files_to_cut_for_ttl_.size())) {
+        if (icmp->Compare(internal_key,
+                          files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
+                              ->smallest.Encode()) >= 0) {
+          if (icmp->Compare(internal_key,
+                            files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
+                                ->largest.Encode()) <= 0) {
+            // Within the current file
+            cur_files_to_cut_for_ttl_ = next_files_to_cut_for_ttl_;
+            return true;
+          }
+          // Beyond the current file; advance to the next candidate
+          next_files_to_cut_for_ttl_++;
+        } else {
+          // Still in the gap before the next file
+          break;
+        }
+      }
+    }
+  }
+  return false;
+}
+
 size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(
    const Slice& internal_key) {
  size_t curr_key_boundary_switched_num = 0;
@ -84,11 +127,6 @@ size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(
  if (grandparents.empty()) {
    return curr_key_boundary_switched_num;
  }
-  assert(!internal_key.empty());
-  InternalKey ikey;
-  ikey.DecodeFrom(internal_key);
-  assert(ikey.Valid());
-
  const Comparator* ucmp = compaction_->column_family_data()->user_comparator();

  // Move the grandparent_index_ to the file containing the current user_key.
@ -96,7 +134,7 @@ size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(
  // index points to the last file containing the key.
  while (grandparent_index_ < grandparents.size()) {
    if (being_grandparent_gap_) {
-      if (sstableKeyCompare(ucmp, ikey,
+      if (sstableKeyCompare(ucmp, internal_key,
                            grandparents[grandparent_index_]->smallest) < 0) {
        break;
      }
@ -109,13 +147,13 @@ size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(
      being_grandparent_gap_ = false;
    } else {
      int cmp_result = sstableKeyCompare(
-          ucmp, ikey, grandparents[grandparent_index_]->largest);
+          ucmp, internal_key, grandparents[grandparent_index_]->largest);
      // If it's same key, make sure grandparent_index_ is pointing to the last
      // one.
      if (cmp_result < 0 ||
          (cmp_result == 0 &&
           (grandparent_index_ == grandparents.size() - 1 ||
-            sstableKeyCompare(ucmp, ikey,
+            sstableKeyCompare(ucmp, internal_key,
                              grandparents[grandparent_index_ + 1]->smallest) <
                0))) {
        break;
@ -185,18 +223,39 @@ uint64_t CompactionOutputs::GetCurrentKeyGrandparentOverlappedBytes(

 bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
  assert(c_iter.Valid());
-
-  // always update grandparent information like overlapped file number, size
-  // etc.
  const Slice& internal_key = c_iter.key();
+#ifndef NDEBUG
+  bool should_stop = false;
+  std::pair<bool*, const Slice> p{&should_stop, internal_key};
+  TEST_SYNC_POINT_CALLBACK(
+      "CompactionOutputs::ShouldStopBefore::manual_decision", (void*)&p);
+  if (should_stop) {
+    return true;
+  }
+#endif  // NDEBUG
  const uint64_t previous_overlapped_bytes = grandparent_overlapped_bytes_;
-  size_t num_grandparent_boundaries_crossed =
-      UpdateGrandparentBoundaryInfo(internal_key);
+  const InternalKeyComparator* icmp =
+      &compaction_->column_family_data()->internal_comparator();
+  size_t num_grandparent_boundaries_crossed = 0;
+  bool should_stop_for_ttl = false;
+  // Always update grandparent information like overlapped file number, size
+  // etc., and TTL states.
+  // If compaction_->output_level() == 0, there is no need to update grandparent
+  // info, and `grandparents` should be empty.
+  if (compaction_->output_level() > 0) {
+    num_grandparent_boundaries_crossed =
+        UpdateGrandparentBoundaryInfo(internal_key);
+    should_stop_for_ttl = UpdateFilesToCutForTTLStates(internal_key);
+  }

  if (!HasBuilder()) {
    return false;
  }

+  if (should_stop_for_ttl) {
+    return true;
+  }
+
  // If there's user defined partitioner, check that first
  if (partitioner_ && partitioner_->ShouldPartition(PartitionerRequest(
                          last_key_for_partitioner_, c_iter.user_key(),
@ -214,9 +273,6 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
    return true;
  }

-  const InternalKeyComparator* icmp =
-      &compaction_->column_family_data()->internal_comparator();
-
  // Check if it needs to split for RoundRobin
  // Invalid local_output_split_key indicates that we do not need to split
  if (local_output_split_key_ != nullptr && !is_split_) {
@ -290,41 +346,6 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
    }
  }

-  // check ttl file boundaries if there's any
-  if (!files_to_cut_for_ttl_.empty()) {
-    if (cur_files_to_cut_for_ttl_ != -1) {
-      // Previous key is inside the range of a file
-      if (icmp->Compare(internal_key,
-                        files_to_cut_for_ttl_[cur_files_to_cut_for_ttl_]
-                            ->largest.Encode()) > 0) {
-        next_files_to_cut_for_ttl_ = cur_files_to_cut_for_ttl_ + 1;
-        cur_files_to_cut_for_ttl_ = -1;
-        return true;
-      }
-    } else {
-      // Look for the key position
-      while (next_files_to_cut_for_ttl_ <
-             static_cast<int>(files_to_cut_for_ttl_.size())) {
-        if (icmp->Compare(internal_key,
-                          files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
-                              ->smallest.Encode()) >= 0) {
-          if (icmp->Compare(internal_key,
-                            files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
-                                ->largest.Encode()) <= 0) {
-            // With in the current file
-            cur_files_to_cut_for_ttl_ = next_files_to_cut_for_ttl_;
-            return true;
-          }
-          // Beyond the current file
-          next_files_to_cut_for_ttl_++;
-        } else {
-          // Still fall into the gap
-          break;
-        }
-      }
-    }
-  }
-
  return false;
 }

@ -404,114 +425,182 @@ Status CompactionOutputs::AddToOutput(
  return s;
 }

+namespace {
+void SetMaxSeqAndTs(InternalKey& internal_key, const Slice& user_key,
+                    const size_t ts_sz) {
+  if (ts_sz) {  // non-zero timestamp size: append an all-0xff timestamp
+    static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
+    if (ts_sz <= strlen(kTsMax)) {  // short enough to slice from kTsMax
+      internal_key = InternalKey(user_key, kMaxSequenceNumber,
+                                 kTypeRangeDeletion, Slice(kTsMax, ts_sz));
+    } else {  // longer than kTsMax: materialize ts_sz 0xff bytes
+      internal_key =
+          InternalKey(user_key, kMaxSequenceNumber, kTypeRangeDeletion,
+                      std::string(ts_sz, '\xff'));
+    }
+  } else {  // ts_sz == 0: no timestamp to append
+    internal_key.Set(user_key, kMaxSequenceNumber, kTypeRangeDeletion);
+  }
+}
+}  // namespace
+
 Status CompactionOutputs::AddRangeDels(
    const Slice* comp_start_user_key, const Slice* comp_end_user_key,
    CompactionIterationStats& range_del_out_stats, bool bottommost_level,
    const InternalKeyComparator& icmp, SequenceNumber earliest_snapshot,
    const Slice& next_table_min_key, const std::string& full_history_ts_low) {
+  // The following example does not happen since
+  // CompactionOutputs::ShouldStopBefore() always returns false for the first
+  // point key. But we should consider removing this dependency. Suppose for the
+  // first compaction output file,
+  //  - next_table_min_key.user_key == comp_start_user_key
+  //  - no point key is in the output file
+  //  - there is a range tombstone @seqno to be added that covers
+  //  comp_start_user_key
+  // Then meta.smallest will be set to comp_start_user_key@seqno
+  // and meta.largest will be set to comp_start_user_key@kMaxSequenceNumber
+  // which violates the assumption that meta.smallest should be <= meta.largest.
  assert(HasRangeDel());
  FileMetaData& meta = current_output().meta;
  const Comparator* ucmp = icmp.user_comparator();
-
+  InternalKey lower_bound_buf, upper_bound_buf;
  Slice lower_bound_guard, upper_bound_guard;
  std::string smallest_user_key;
  const Slice *lower_bound, *upper_bound;
-  bool lower_bound_from_sub_compact = false;
-  bool lower_bound_from_range_tombstone = false;
+
+  // We first determine the internal key lower_bound and upper_bound for
+  // this output file. All and only range tombstones that overlap with
+  // [lower_bound, upper_bound] should be added to this file. File
+  // boundaries (meta.smallest/largest) should be updated accordingly when
+  // extended by range tombstones.
  size_t output_size = outputs_.size();
  if (output_size == 1) {
-    // For the first output table, include range tombstones before the min
-    // key but after the subcompaction boundary.
-    lower_bound = comp_start_user_key;
-    lower_bound_from_sub_compact = true;
-  } else if (range_tombstone_lower_bound_.size() > 0) {
-    assert(meta.smallest.size() == 0 ||
-           icmp.Compare(range_tombstone_lower_bound_, meta.smallest) <= 0);
-    lower_bound_guard = range_tombstone_lower_bound_.user_key();
-    lower_bound = &lower_bound_guard;
-    lower_bound_from_range_tombstone = true;
-  } else if (meta.smallest.size() > 0) {
+    // This is the first file in the subcompaction.
+    //
+    // When outputting a range tombstone that spans a subcompaction boundary,
+    // the files on either side of that boundary need to include that
+    // boundary's user key. Otherwise, the spanning range tombstone would lose
+    // coverage.
+    //
+    // To achieve this while preventing files from overlapping in internal key
+    // (an LSM invariant violation), we allow the earlier file to include the
+    // boundary user key up to `kMaxSequenceNumber,kTypeRangeDeletion`. The
+    // later file can begin at the boundary user key at the newest key version
+    // it contains. At this point that version number is unknown since we have
+    // not processed the range tombstones yet, so permit any version. Same story
+    // applies to timestamp, and a non-nullptr `comp_start_user_key` should have
+    // `kMaxTs` here, which similarly permits any timestamp.
+    if (comp_start_user_key) {
+      lower_bound_buf.Set(*comp_start_user_key, kMaxSequenceNumber,
+                          kTypeRangeDeletion);
+      lower_bound_guard = lower_bound_buf.Encode();
+      lower_bound = &lower_bound_guard;
+    } else {
+      lower_bound = nullptr;
+    }
+  } else {
    // For subsequent output tables, only include range tombstones from min
    // key onwards since the previous file was extended to contain range
    // tombstones falling before min key.
-    smallest_user_key = meta.smallest.user_key().ToString(false /*hex*/);
-    lower_bound_guard = Slice(smallest_user_key);
-    lower_bound = &lower_bound_guard;
-  } else {
-    lower_bound = nullptr;
-  }
-  if (!next_table_min_key.empty()) {
-    // This may be the last file in the subcompaction in some cases, so we
-    // need to compare the end key of subcompaction with the next file start
-    // key. When the end key is chosen by the subcompaction, we know that
-    // it must be the biggest key in output file. Therefore, it is safe to
-    // use the smaller key as the upper bound of the output file, to ensure
-    // that there is no overlapping between different output files.
-    upper_bound_guard = ExtractUserKey(next_table_min_key);
-    if (comp_end_user_key != nullptr &&
-        ucmp->CompareWithoutTimestamp(upper_bound_guard, *comp_end_user_key) >=
-            0) {
-      upper_bound = comp_end_user_key;
+    if (range_tombstone_lower_bound_.size() > 0) {
+      assert(meta.smallest.size() == 0 ||
+             icmp.Compare(range_tombstone_lower_bound_, meta.smallest) < 0);
+      lower_bound_guard = range_tombstone_lower_bound_.Encode();
    } else {
+      assert(meta.smallest.size() > 0);
+      lower_bound_guard = meta.smallest.Encode();
+    }
+    lower_bound = &lower_bound_guard;
+  }
+
+  const size_t ts_sz = ucmp->timestamp_size();
+  if (next_table_min_key.empty()) {
+    // Last file of the subcompaction.
+    if (comp_end_user_key) {
+      upper_bound_buf.Set(*comp_end_user_key, kMaxSequenceNumber,
+                          kTypeRangeDeletion);
+      upper_bound_guard = upper_bound_buf.Encode();
      upper_bound = &upper_bound_guard;
+    } else {
+      upper_bound = nullptr;
    }
  } else {
-    // This is the last file in the subcompaction, so extend until the
-    // subcompaction ends.
-    upper_bound = comp_end_user_key;
-  }
-  bool has_overlapping_endpoints;
-  if (upper_bound != nullptr && meta.largest.size() > 0) {
-    has_overlapping_endpoints = ucmp->CompareWithoutTimestamp(
-                                    meta.largest.user_key(), *upper_bound) == 0;
-  } else {
-    has_overlapping_endpoints = false;
+    // There is another file coming whose coverage will begin at
+    // `next_table_min_key`. The current file needs to extend range tombstone
+    // coverage through its own keys (through `meta.largest`) and through user
+    // keys preceding `next_table_min_key`'s user key.
+    ParsedInternalKey next_table_min_key_parsed;
+    ParseInternalKey(next_table_min_key, &next_table_min_key_parsed,
+                     false /* log_err_key */)
+        .PermitUncheckedError();
+    assert(next_table_min_key_parsed.sequence < kMaxSequenceNumber);
+    assert(meta.largest.size() == 0 ||
+           icmp.Compare(meta.largest.Encode(), next_table_min_key) < 0);
+    assert(!lower_bound || icmp.Compare(*lower_bound, next_table_min_key) <= 0);
+    if (meta.largest.size() > 0 &&
+        ucmp->EqualWithoutTimestamp(meta.largest.user_key(),
+                                    next_table_min_key_parsed.user_key)) {
+      // Caution: this assumes meta.largest.Encode() lives longer than
+      // upper_bound, which is only true if meta.largest is never updated.
+      // This just happens to be the case here since meta.largest serves
+      // as the upper_bound.
+      upper_bound_guard = meta.largest.Encode();
+    } else {
+      SetMaxSeqAndTs(upper_bound_buf, next_table_min_key_parsed.user_key,
+                     ts_sz);
+      upper_bound_guard = upper_bound_buf.Encode();
+    }
+    upper_bound = &upper_bound_guard;
+  }
+  if (lower_bound && upper_bound &&
+      icmp.Compare(*lower_bound, *upper_bound) > 0) {
+    assert(meta.smallest.size() == 0 &&
+           ucmp->EqualWithoutTimestamp(ExtractUserKey(*lower_bound),
+                                       ExtractUserKey(*upper_bound)));
+    // This can only happen when lower_bound has the same user key as
+    // next_table_min_key and there is no point key in the current
+    // compaction output file.
+    return Status::OK();
  }
-
  // The end key of the subcompaction must be bigger or equal to the upper
  // bound. If the end of subcompaction is null or the upper bound is null,
  // it means that this file is the last file in the compaction. So there
  // will be no overlapping between this file and others.
  assert(comp_end_user_key == nullptr || upper_bound == nullptr ||
-         ucmp->CompareWithoutTimestamp(*upper_bound, *comp_end_user_key) <= 0);
-  auto it = range_del_agg_->NewIterator(lower_bound, upper_bound,
-                                        has_overlapping_endpoints);
-  // Position the range tombstone output iterator. There may be tombstone
-  // fragments that are entirely out of range, so make sure that we do not
-  // include those.
-  if (lower_bound != nullptr) {
-    it->Seek(*lower_bound);
-  } else {
-    it->SeekToFirst();
-  }
-  for (; it->Valid(); it->Next()) {
+         ucmp->CompareWithoutTimestamp(ExtractUserKey(*upper_bound),
+                                       *comp_end_user_key) <= 0);
+  auto it = range_del_agg_->NewIterator(lower_bound, upper_bound);
+  Slice last_tombstone_start_user_key{};
+  bool reached_lower_bound = false;
+  const ReadOptions read_options(Env::IOActivity::kCompaction);
+  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    auto tombstone = it->Tombstone();
-    if (upper_bound != nullptr) {
-      int cmp =
-          ucmp->CompareWithoutTimestamp(*upper_bound, tombstone.start_key_);
-      if ((has_overlapping_endpoints && cmp < 0) ||
-          (!has_overlapping_endpoints && cmp <= 0)) {
-        // Tombstones starting after upper_bound only need to be included in
-        // the next table. If the current SST ends before upper_bound, i.e.,
-        // `has_overlapping_endpoints == false`, we can also skip over range
-        // tombstones that start exactly at upper_bound. Such range
-        // tombstones will be included in the next file and are not relevant
-        // to the point keys or endpoints of the current file.
-        break;
-      }
+    auto kv = tombstone.Serialize();
+    InternalKey tombstone_end = tombstone.SerializeEndKey();
+    // TODO: the underlying iterator should support clamping the bounds.
+    // tombstone_end.Encode is of form user_key@kMaxSeqno
+    // if it is equal to lower_bound, there is no need to include
+    // such range tombstone.
+    if (!reached_lower_bound && lower_bound &&
+        icmp.Compare(tombstone_end.Encode(), *lower_bound) <= 0) {
+      continue;
    }
+    assert(!lower_bound ||
+           icmp.Compare(*lower_bound, tombstone_end.Encode()) <= 0);
+    reached_lower_bound = true;

-    const size_t ts_sz = ucmp->timestamp_size();
    // Garbage collection for range tombstones.
    // If user-defined timestamp is enabled, range tombstones are dropped if
    // they are at bottommost_level, below full_history_ts_low and not visible
    // in any snapshot. trim_ts_ is passed to the constructor for
    // range_del_agg_, and range_del_agg_ internally drops tombstones above
    // trim_ts_.
-    if (bottommost_level && tombstone.seq_ <= earliest_snapshot &&
+    bool consider_drop =
+        tombstone.seq_ <= earliest_snapshot &&
        (ts_sz == 0 ||
         (!full_history_ts_low.empty() &&
-          ucmp->CompareTimestamp(tombstone.ts_, full_history_ts_low) < 0))) {
+          ucmp->CompareTimestamp(tombstone.ts_, full_history_ts_low) < 0));
+    if (consider_drop && bottommost_level) {
      // TODO(andrewkr): tombstones that span multiple output files are
      // counted for each compaction output file, so lots of double
      // counting.
@ -520,149 +609,126 @@ Status CompactionOutputs::AddRangeDels(
      continue;
    }

-    auto kv = tombstone.Serialize();
    assert(lower_bound == nullptr ||
-           ucmp->CompareWithoutTimestamp(*lower_bound, kv.second) < 0);
-    // Range tombstone is not supported by output validator yet.
-    builder_->Add(kv.first.Encode(), kv.second);
-    InternalKey tombstone_start = std::move(kv.first);
-    InternalKey smallest_candidate{tombstone_start};
-    if (lower_bound != nullptr &&
-        ucmp->CompareWithoutTimestamp(smallest_candidate.user_key(),
-                                      *lower_bound) <= 0) {
-      // Pretend the smallest key has the same user key as lower_bound
-      // (the max key in the previous table or subcompaction) in order for
-      // files to appear key-space partitioned.
-      if (lower_bound_from_sub_compact) {
-        // When lower_bound is chosen by a subcompaction
-        // (lower_bound_from_sub_compact), we know that subcompactions over
-        // smaller keys cannot contain any keys at lower_bound. We also know
-        // that smaller subcompactions exist, because otherwise the
-        // subcompaction woud be unbounded on the left. As a result, we know
-        // that no other files on the output level will contain actual keys at
-        // lower_bound (an output file may have a largest key of
-        // lower_bound@kMaxSequenceNumber, but this only indicates a large range
-        // tombstone was truncated). Therefore, it is safe to use the
-        // tombstone's sequence number, to ensure that keys at lower_bound at
-        // lower levels are covered by truncated tombstones.
-        if (ts_sz) {
-          assert(tombstone.ts_.size() == ts_sz);
-          smallest_candidate = InternalKey(*lower_bound, tombstone.seq_,
-                                           kTypeRangeDeletion, tombstone.ts_);
-        } else {
-          smallest_candidate =
-              InternalKey(*lower_bound, tombstone.seq_, kTypeRangeDeletion);
-        }
-      } else if (lower_bound_from_range_tombstone) {
-        // When lower_bound is chosen from a range tombtone start key:
-        // Range tombstone keys can be truncated at file boundaries of the files
-        // that contain them.
-        //
-        // If this lower bound is from a range tombstone key that is not
-        // truncated, i.e., it was not truncated when reading from the input
-        // files, then its sequence number and `op_type` will be
-        // kMaxSequenceNumber and kTypeRangeDeletion (see
-        // TruncatedRangeDelIterator::start_key()). In this case, when this key
-        // was used as the upper bound to cut the previous compaction output
-        // file, the previous file's largest key could have the same value as
-        // this key (see the upperbound logic below). To guarantee
-        // non-overlapping ranges between output files, we use the range
-        // tombstone's actual sequence number (tombstone.seq_) for the lower
-        // bound of this file. If this range tombstone key is truncated, then
-        // the previous file's largest key will be smaller than this range
-        // tombstone key, so we can use it as the lower bound directly.
-        if (ExtractInternalKeyFooter(range_tombstone_lower_bound_.Encode()) ==
-            kRangeTombstoneSentinel) {
-          if (ts_sz) {
-            smallest_candidate =
-                InternalKey(range_tombstone_lower_bound_.user_key(),
-                            tombstone.seq_, kTypeRangeDeletion, tombstone.ts_);
-          } else {
-            smallest_candidate =
-                InternalKey(range_tombstone_lower_bound_.user_key(),
-                            tombstone.seq_, kTypeRangeDeletion);
-          }
-        } else {
-          assert(GetInternalKeySeqno(range_tombstone_lower_bound_.Encode()) <
-                 kMaxSequenceNumber);
-          smallest_candidate = range_tombstone_lower_bound_;
-        }
-      } else {
-        // If lower_bound was chosen by the smallest data key in the file,
-        // choose lowest seqnum so this file's smallest internal key comes
-        // after the previous file's largest. The fake seqnum is OK because
-        // the read path's file-picking code only considers user key.
-        smallest_candidate = InternalKey(*lower_bound, 0, kTypeRangeDeletion);
-      }
+           ucmp->CompareWithoutTimestamp(ExtractUserKey(*lower_bound),
+                                         kv.second) < 0);
+    InternalKey tombstone_start = kv.first;
+    if (lower_bound &&
+        ucmp->CompareWithoutTimestamp(tombstone_start.user_key(),
+                                      ExtractUserKey(*lower_bound)) < 0) {
+      // This just updates the non-timestamp portion of `tombstone_start`'s user
+      // key. Ideally there would be a simpler API usage
+      ParsedInternalKey tombstone_start_parsed;
+      ParseInternalKey(tombstone_start.Encode(), &tombstone_start_parsed,
+                       false /* log_err_key */)
+          .PermitUncheckedError();
+      // timestamp should be from where sequence number is from, which is from
+      // tombstone in this case
+      std::string ts =
+          tombstone_start_parsed.GetTimestamp(ucmp->timestamp_size())
+              .ToString();
+      tombstone_start_parsed.user_key = ExtractUserKey(*lower_bound);
+      tombstone_start.SetFrom(tombstone_start_parsed, ts);
    }
-    InternalKey tombstone_end = tombstone.SerializeEndKey();
-    InternalKey largest_candidate{tombstone_end};
    if (upper_bound != nullptr &&
-        ucmp->CompareWithoutTimestamp(*upper_bound,
-                                      largest_candidate.user_key()) <= 0) {
-      // Pretend the largest key has the same user key as upper_bound (the
-      // min key in the following table or subcompaction) in order for files
-      // to appear key-space partitioned.
-      //
-      // Choose highest seqnum so this file's largest internal key comes
-      // before the next file's/subcompaction's smallest. The fake seqnum is
-      // OK because the read path's file-picking code only considers the
-      // user key portion.
-      //
-      // Note Seek() also creates InternalKey with (user_key,
-      // kMaxSequenceNumber), but with kTypeDeletion (0x7) instead of
-      // kTypeRangeDeletion (0xF), so the range tombstone comes before the
-      // Seek() key in InternalKey's ordering. So Seek() will look in the
-      // next file for the user key
-      if (ts_sz) {
-        static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
-        if (ts_sz <= strlen(kTsMax)) {
-          largest_candidate =
-              InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion,
-                          Slice(kTsMax, ts_sz));
-        } else {
-          largest_candidate =
-              InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion,
-                          std::string(ts_sz, '\xff'));
-        }
-      } else {
-        largest_candidate =
-            InternalKey(*upper_bound, kMaxSequenceNumber, kTypeRangeDeletion);
-      }
+        icmp.Compare(*upper_bound, tombstone_start.Encode()) < 0) {
+      break;
    }
-#ifndef NDEBUG
-    SequenceNumber smallest_ikey_seqnum = kMaxSequenceNumber;
-    if (meta.smallest.size() > 0) {
-      smallest_ikey_seqnum = GetInternalKeySeqno(meta.smallest.Encode());
+    if (lower_bound &&
+        icmp.Compare(tombstone_start.Encode(), *lower_bound) < 0) {
+      tombstone_start.DecodeFrom(*lower_bound);
    }
-#endif
-    meta.UpdateBoundariesForRange(smallest_candidate, largest_candidate,
+    if (upper_bound && icmp.Compare(*upper_bound, tombstone_end.Encode()) < 0) {
+      tombstone_end.DecodeFrom(*upper_bound);
+    }
+    if (consider_drop && compaction_->KeyRangeNotExistsBeyondOutputLevel(
+                             tombstone_start.user_key(),
+                             tombstone_end.user_key(), &level_ptrs_)) {
+      range_del_out_stats.num_range_del_drop_obsolete++;
+      range_del_out_stats.num_record_drop_obsolete++;
+      continue;
+    }
+    // Here we show that *only* range tombstones that overlap with
+    // [lower_bound, upper_bound] are added to the current file, and
+    // sanity checking invariants that should hold:
+    // - [tombstone_start, tombstone_end] overlaps with [lower_bound,
+    // upper_bound]
+    // - meta.smallest <= meta.largest
+    // Corresponding assertions are made; the proof is broken if any of them
+    // fails.
+    // TODO: show that *all* range tombstones that overlap with
+    //  [lower_bound, upper_bound] are added.
+    // TODO: show some invariant that boundaries are correctly updated.
+    //
+    // Note that `tombstone_start` is updated in the if condition above; we use
+    // tombstone_start to refer to its initial value, i.e.,
+    // it->Tombstone().first, and use tombstone_start* to refer to its value
+    // after the update.
+    //
+    // To show [lower_bound, upper_bound] overlaps with [tombstone_start,
+    // tombstone_end]:
+    // lower_bound <= upper_bound from the if condition right after all
+    // bounds are initialized. We assume each tombstone fragment has
+    // start_key.user_key < end_key.user_key, so
+    // tombstone_start < tombstone_end by
+    // FragmentedTombstoneIterator::Tombstone(). So these two ranges are both
+    // non-empty. The flag `reached_lower_bound` and the if logic before it
+    // ensure lower_bound <= tombstone_end. tombstone_start is only updated
+    // if it has a smaller user_key than lower_bound user_key, so
+    // tombstone_start <= tombstone_start*. The above if condition implies
+    // tombstone_start* <= upper_bound. So we have
+    // tombstone_start <= upper_bound and lower_bound <= tombstone_end
+    // and the two ranges overlap.
+    //
+    // To show meta.smallest <= meta.largest:
+    // From the implementation of UpdateBoundariesForRange(), it suffices to
+    // prove that when it is first called in this function, its parameters
+    // satisfy `start <= end`, where start = max(tombstone_start*, lower_bound)
+    // and end = min(tombstone_end, upper_bound). From the above proof we have
+    // lower_bound <= tombstone_end and lower_bound <= upper_bound. We only need
+    // to show that tombstone_start* <= min(tombstone_end, upper_bound).
+    // Note that tombstone_start*.user_key = max(tombstone_start.user_key,
+    // lower_bound.user_key). Assume tombstone_end always has
+    // kMaxSequenceNumber and lower_bound.seqno < kMaxSequenceNumber.
+    // Since lower_bound <= tombstone_end and lower_bound.seqno <
+    // tombstone_end.seqno (in absolute number order, not internal key order),
+    // lower_bound.user_key < tombstone_end.user_key.
+    // Since lower_bound.user_key < tombstone_end.user_key and
+    // tombstone_start.user_key < tombstone_end.user_key, tombstone_start* <
+    // tombstone_end. Since tombstone_start* <= upper_bound from the above proof
+    // and tombstone_start* < tombstone_end, tombstone_start* <=
+    // min(tombstone_end, upper_bound), so the two ranges overlap.
+
+    // Range tombstone is not supported by output validator yet.
+    builder_->Add(kv.first.Encode(), kv.second);
+    assert(icmp.Compare(tombstone_start, tombstone_end) <= 0);
+    meta.UpdateBoundariesForRange(tombstone_start, tombstone_end,
                                  tombstone.seq_, icmp);
    if (!bottommost_level) {
-      // Range tombstones are truncated at file boundaries
-      if (icmp.Compare(tombstone_start, meta.smallest) < 0) {
-        tombstone_start = meta.smallest;
-      }
-      if (icmp.Compare(tombstone_end, meta.largest) > 0) {
-        tombstone_end = meta.largest;
+      bool start_user_key_changed =
+          last_tombstone_start_user_key.empty() ||
+          ucmp->CompareWithoutTimestamp(last_tombstone_start_user_key,
+                                        it->start_key()) < 0;
+      last_tombstone_start_user_key = it->start_key();
+      if (start_user_key_changed) {
+        // If tombstone_start >= tombstone_end, then either no key range is
+        // covered, or they have the same user key. If they have the same
+        // user key, then the internal key range should only be within this
+        // level, and no keys from older levels are covered.
+        if (ucmp->CompareWithoutTimestamp(tombstone_start.user_key(),
+                                          tombstone_end.user_key()) < 0) {
+          SizeApproximationOptions approx_opts;
+          approx_opts.files_size_error_margin = 0.1;
+          auto approximate_covered_size =
+              compaction_->input_version()->version_set()->ApproximateSize(
+                  approx_opts, read_options, compaction_->input_version(),
+                  tombstone_start.Encode(), tombstone_end.Encode(),
+                  compaction_->output_level() + 1 /* start_level */,
+                  -1 /* end_level */, kCompaction);
+          meta.compensated_range_deletion_size += approximate_covered_size;
+        }
      }
-      SizeApproximationOptions approx_opts;
-      approx_opts.files_size_error_margin = 0.1;
-      auto approximate_covered_size =
-          compaction_->input_version()->version_set()->ApproximateSize(
-              approx_opts, compaction_->input_version(),
-              tombstone_start.Encode(), tombstone_end.Encode(),
-              compaction_->output_level() + 1 /* start_level */,
-              -1 /* end_level */, kCompaction);
-      meta.compensated_range_deletion_size += approximate_covered_size;
    }
-    // The smallest key in a file is used for range tombstone truncation, so
-    // it cannot have a seqnum of 0 (unless the smallest data key in a file
-    // has a seqnum of 0). Otherwise, the truncated tombstone may expose
-    // deleted keys at lower levels.
-    assert(smallest_ikey_seqnum == 0 || lower_bound_from_range_tombstone ||
-           ExtractInternalKeyFooter(meta.smallest.Encode()) !=
-               PackSequenceAndType(0, kTypeRangeDeletion));
  }
  return Status::OK();
 }
@ -719,6 +785,8 @@ CompactionOutputs::CompactionOutputs(const Compaction* compaction,
  if (compaction->output_level() != 0) {
    FillFilesToCutForTtl();
  }
+
+  level_ptrs_ = std::vector<size_t>(compaction_->number_levels(), 0);
 }

 }  // namespace ROCKSDB_NAMESPACE
--- a/db/compaction/compaction_outputs.h
+++ b/db/compaction/compaction_outputs.h
@ -167,9 +167,15 @@ class CompactionOutputs {
    current_output_file_size_ = 0;
  }

-  // Add range-dels from the aggregator to the current output file
+  // Add range deletions from the range_del_agg_ to the current output file.
+  // Input parameters, `range_tombstone_lower_bound_` and current output's
+  // metadata determine the bounds on range deletions to add. Updates output
+  // file metadata boundary if extended by range tombstones.
+  //
  // @param comp_start_user_key and comp_end_user_key include timestamp if
-  // user-defined timestamp is enabled.
+  // user-defined timestamp is enabled. Their timestamp should be max timestamp.
+  // @param next_table_min_key internal key lower bound for the next compaction
+  // output.
  // @param full_history_ts_low used for range tombstone garbage collection.
  Status AddRangeDels(const Slice* comp_start_user_key,
                      const Slice* comp_end_user_key,
@ -200,10 +206,10 @@ class CompactionOutputs {
      // We may only split the output when the cursor is in the range. Split
      if ((!end.has_value() ||
           icmp->user_comparator()->Compare(
-               ExtractUserKey(output_split_key->Encode()), end.value()) < 0) &&
-          (!start.has_value() || icmp->user_comparator()->Compare(
-                                     ExtractUserKey(output_split_key->Encode()),
-                                     start.value()) > 0)) {
+               ExtractUserKey(output_split_key->Encode()), *end) < 0) &&
+          (!start.has_value() ||
+           icmp->user_comparator()->Compare(
+               ExtractUserKey(output_split_key->Encode()), *start) > 0)) {
        local_output_split_key_ = output_split_key;
      }
    }
@ -221,6 +227,13 @@ class CompactionOutputs {
    }
  }

+  // Updates states related to file cutting for TTL.
+  // Returns a boolean value indicating whether the current
+  // compaction output file should be cut before `internal_key`.
+  //
+  // @param internal_key the current key to be added to output.
+  bool UpdateFilesToCutForTTLStates(const Slice& internal_key);
+
  // update tracked grandparents information like grandparent index, if it's
  // in the gap between 2 grandparent files, accumulated grandparent files size
  // etc.
@ -343,6 +356,15 @@ class CompactionOutputs {
  // The smallest key of the current output file, this is set when current
  // output file's smallest key is a range tombstone start key.
  InternalKey range_tombstone_lower_bound_;
+
+  // Used for calls to compaction->KeyRangeNotExistsBeyondOutputLevel() in
+  // CompactionOutputs::AddRangeDels().
+  // level_ptrs_[i] holds the index of the file that was checked during the
+  // last call to compaction->KeyRangeNotExistsBeyondOutputLevel(). This allows
+  // future calls to the function to pick up where they left off, since range
+  // tombstones are added to the output file in increasing key order within
+  // each subcompaction.
+  std::vector<size_t> level_ptrs_;
 };

 // helper struct to concatenate the last level and penultimate level outputs
--- a/db/compaction/compaction_picker.cc
+++ b/db/compaction/compaction_picker.cc
@ -20,7 +20,7 @@
 #include "file/filename.h"
 #include "logging/log_buffer.h"
 #include "logging/logging.h"
-#include "monitoring/statistics.h"
+#include "monitoring/statistics_impl.h"
 #include "test_util/sync_point.h"
 #include "util/random.h"
 #include "util/string_util.h"
@ -611,23 +611,21 @@ Compaction* CompactionPicker::CompactRange(
    // Universal compaction with more than one level always compacts all the
    // files together to the last level.
    assert(vstorage->num_levels() > 1);
+    int max_output_level =
+        vstorage->MaxOutputLevel(ioptions_.allow_ingest_behind);
    // DBImpl::CompactRange() set output level to be the last level
-    if (ioptions_.allow_ingest_behind) {
-      assert(output_level == vstorage->num_levels() - 2);
-    } else {
-      assert(output_level == vstorage->num_levels() - 1);
-    }
+    assert(output_level == max_output_level);
    // DBImpl::RunManualCompaction will make full range for universal compaction
    assert(begin == nullptr);
    assert(end == nullptr);
    *compaction_end = nullptr;

    int start_level = 0;
-    for (; start_level < vstorage->num_levels() &&
+    for (; start_level <= max_output_level &&
           vstorage->NumLevelFiles(start_level) == 0;
         start_level++) {
    }
-    if (start_level == vstorage->num_levels()) {
+    if (start_level > max_output_level) {
      return nullptr;
    }

@ -637,9 +635,9 @@ Compaction* CompactionPicker::CompactRange(
      return nullptr;
    }

-    std::vector<CompactionInputFiles> inputs(vstorage->num_levels() -
+    std::vector<CompactionInputFiles> inputs(max_output_level + 1 -
                                             start_level);
-    for (int level = start_level; level < vstorage->num_levels(); level++) {
+    for (int level = start_level; level <= max_output_level; level++) {
      inputs[level - start_level].level = level;
      auto& files = inputs[level - start_level].files;
      for (FileMetaData* f : vstorage->LevelFiles(level)) {
@ -753,8 +751,10 @@ Compaction* CompactionPicker::CompactRange(

  // for BOTTOM LEVEL compaction only, use max_file_num_to_ignore to filter out
  // files that are created during the current compaction.
-  if (compact_range_options.bottommost_level_compaction ==
-          BottommostLevelCompaction::kForceOptimized &&
+  if ((compact_range_options.bottommost_level_compaction ==
+           BottommostLevelCompaction::kForceOptimized ||
+       compact_range_options.bottommost_level_compaction ==
+           BottommostLevelCompaction::kIfHaveCompactionFilter) &&
      max_file_num_to_ignore != std::numeric_limits<uint64_t>::max()) {
    assert(input_level == output_level);
    // inputs_shrunk holds a continuous subset of input files which were all
@ -877,7 +877,6 @@ Compaction* CompactionPicker::CompactRange(
  return compaction;
 }

-#ifndef ROCKSDB_LITE
 namespace {
 // Test whether two files have overlapping key-ranges.
 bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a,
@ -1116,7 +1115,6 @@ Status CompactionPicker::SanitizeCompactionInputFiles(

  return Status::OK();
 }
-#endif  // !ROCKSDB_LITE

 void CompactionPicker::RegisterCompaction(Compaction* c) {
  if (c == nullptr) {
--- a/db/compaction/compaction_picker.h
+++ b/db/compaction/compaction_picker.h
@ -93,11 +93,9 @@ class CompactionPicker {
 // into a valid one by adding more files, the function will return a
 // non-ok status with specific reason.
 //
-#ifndef ROCKSDB_LITE
  Status SanitizeCompactionInputFiles(std::unordered_set<uint64_t>* input_files,
                                      const ColumnFamilyMetaData& cf_meta,
                                      const int output_level) const;
-#endif  // ROCKSDB_LITE

  // Free up the files that participated in a compaction
  //
@ -229,11 +227,9 @@ class CompactionPicker {

 // A helper function to SanitizeCompactionInputFiles() that
 // sanitizes "input_files" by adding necessary files.
-#ifndef ROCKSDB_LITE
  virtual Status SanitizeCompactionInputFilesForAllLevels(
      std::unordered_set<uint64_t>* input_files,
      const ColumnFamilyMetaData& cf_meta, const int output_level) const;
-#endif  // ROCKSDB_LITE

  // Keeps track of all compactions that are running on Level0.
  // Protected by DB mutex
@ -246,7 +242,6 @@ class CompactionPicker {
  const InternalKeyComparator* const icmp_;
 };

-#ifndef ROCKSDB_LITE
 // A dummy compaction that never triggers any automatic
 // compaction.
 class NullCompactionPicker : public CompactionPicker {
@ -287,7 +282,6 @@ class NullCompactionPicker : public CompactionPicker {
    return false;
  }
 };
-#endif  // !ROCKSDB_LITE

 // Attempts to find an intra L0 compaction conforming to the given parameters.
 //
--- a/db/compaction/compaction_picker_fifo.cc
+++ b/db/compaction/compaction_picker_fifo.cc
@ -8,7 +8,6 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

 #include "db/compaction/compaction_picker_fifo.h"
-#ifndef ROCKSDB_LITE

 #include <cinttypes>
 #include <string>
@ -17,6 +16,7 @@
 #include "db/column_family.h"
 #include "logging/log_buffer.h"
 #include "logging/logging.h"
+#include "options/options_helper.h"
 #include "util/string_util.h"

 namespace ROCKSDB_NAMESPACE {
@ -285,31 +285,36 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
  return c;
 }

-Compaction* FIFOCompactionPicker::PickCompactionToWarm(
+Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction(
    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
    LogBuffer* log_buffer) {
-  if (mutable_cf_options.compaction_options_fifo.age_for_warm == 0) {
+  const std::vector<FileTemperatureAge>& ages =
+      mutable_cf_options.compaction_options_fifo
+          .file_temperature_age_thresholds;
+  if (ages.empty()) {
    return nullptr;
  }

-  // PickCompactionToWarm is only triggered if there is no non-L0 files.
-  for (int level = 1; level < vstorage->num_levels(); ++level) {
-    if (GetTotalFilesSize(vstorage->LevelFiles(level)) > 0) {
-      return nullptr;
-    }
+  // Does not apply to multi-level FIFO.
+  if (vstorage->num_levels() > 1) {
+    return nullptr;
  }

  const int kLevel0 = 0;
  const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
+  if (level_files.empty()) {
+    return nullptr;
+  }

  int64_t _current_time;
  auto status = ioptions_.clock->GetCurrentTime(&_current_time);
  if (!status.ok()) {
-    ROCKS_LOG_BUFFER(log_buffer,
-                     "[%s] FIFO compaction: Couldn't get current time: %s. "
-                     "Not doing compactions based on warm threshold. ",
-                     cf_name.c_str(), status.ToString().c_str());
+    ROCKS_LOG_BUFFER(
+        log_buffer,
+        "[%s] FIFO compaction: Couldn't get current time: %s. "
+        "Not doing compactions based on file temperature-age threshold. ",
+        cf_name.c_str(), status.ToString().c_str());
    return nullptr;
  }
  const uint64_t current_time = static_cast<uint64_t>(_current_time);
@ -328,56 +333,77 @@ Compaction* FIFOCompactionPicker::PickCompactionToWarm(
  inputs[0].level = 0;

  // avoid underflow
-  if (current_time > mutable_cf_options.compaction_options_fifo.age_for_warm) {
-    uint64_t create_time_threshold =
-        current_time - mutable_cf_options.compaction_options_fifo.age_for_warm;
+  uint64_t min_age = ages[0].age;
+  // kLastTemperature means target temperature is to be determined.
+  Temperature compaction_target_temp = Temperature::kLastTemperature;
+  if (current_time > min_age) {
+    uint64_t create_time_threshold = current_time - min_age;
    uint64_t compaction_size = 0;
-    // We will ideally identify a file qualifying for warm tier by knowing
-    // the timestamp for the youngest entry in the file. However, right now
-    // we don't have the information. We infer it by looking at timestamp
-    // of the next file's (which is just younger) oldest entry's timestamp.
-    FileMetaData* prev_file = nullptr;
-    for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
-      FileMetaData* f = *ritr;
-      assert(f);
-      if (f->being_compacted) {
-        // Right now this probably won't happen as we never try to schedule
-        // two compactions in parallel, so here we just simply don't schedule
-        // anything.
+    // Ideally we would identify a file qualifying for temperature change by
+    // knowing the timestamp of the youngest entry in the file. However, right
+    // now we don't have that information. We infer it from the timestamp of
+    // the oldest entry in the previous file (which is just younger).
+    Temperature cur_target_temp;
+    // avoid index underflow
+    assert(level_files.size() >= 1);
+    for (size_t index = level_files.size() - 1; index >= 1; --index) {
+      // Try to add cur_file to compaction inputs.
+      FileMetaData* cur_file = level_files[index];
+      // prev_file is just younger than cur_file
+      FileMetaData* prev_file = level_files[index - 1];
+      if (cur_file->being_compacted) {
+        // Should not happen since we check for
+        // `level0_compactions_in_progress_` above. Here we simply don't
+        // schedule anything.
        return nullptr;
      }
-      uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
-      if (oldest_ancester_time == kUnknownOldestAncesterTime) {
+      uint64_t oldest_ancestor_time = prev_file->TryGetOldestAncesterTime();
+      if (oldest_ancestor_time == kUnknownOldestAncesterTime) {
        // Older files might not have enough information. It is possible to
        // handle these files by looking at newer files, but maintaining the
        // logic isn't worth it.
        break;
      }
-      if (oldest_ancester_time > create_time_threshold) {
-        // The previous file (which has slightly older data) doesn't qualify
-        // for warm tier.
+      if (oldest_ancestor_time > create_time_threshold) {
+        // cur_file is too fresh
        break;
      }
-      if (prev_file != nullptr) {
-        compaction_size += prev_file->fd.GetFileSize();
-        if (compaction_size > mutable_cf_options.max_compaction_bytes) {
+      cur_target_temp = ages[0].temperature;
+      for (size_t i = 1; i < ages.size(); ++i) {
+        if (current_time >= ages[i].age &&
+            oldest_ancestor_time <= current_time - ages[i].age) {
+          cur_target_temp = ages[i].temperature;
+        }
+      }
+      if (cur_file->temperature == cur_target_temp) {
+        if (inputs[0].empty()) {
+          continue;
+        } else {
          break;
        }
-        inputs[0].files.push_back(prev_file);
-        ROCKS_LOG_BUFFER(log_buffer,
-                         "[%s] FIFO compaction: picking file %" PRIu64
-                         " with next file's oldest time %" PRIu64 " for warm",
-                         cf_name.c_str(), prev_file->fd.GetNumber(),
-                         oldest_ancester_time);
      }
-      if (f->temperature == Temperature::kUnknown ||
-          f->temperature == Temperature::kHot) {
-        prev_file = f;
-      } else if (!inputs[0].files.empty()) {
-        // A warm file newer than files picked.
+
+      // cur_file needs to change temperature
+      if (compaction_target_temp == Temperature::kLastTemperature) {
+        assert(inputs[0].empty());
+        compaction_target_temp = cur_target_temp;
+      } else if (cur_target_temp != compaction_target_temp) {
+        assert(!inputs[0].empty());
+        break;
+      }
+      if (inputs[0].empty() || compaction_size + cur_file->fd.GetFileSize() <=
+                                   mutable_cf_options.max_compaction_bytes) {
+        inputs[0].files.push_back(cur_file);
+        compaction_size += cur_file->fd.GetFileSize();
+        ROCKS_LOG_BUFFER(
+            log_buffer,
+            "[%s] FIFO compaction: picking file %" PRIu64
+            " with next file's oldest time %" PRIu64 " for temperature %s.",
+            cf_name.c_str(), cur_file->fd.GetNumber(), oldest_ancestor_time,
+            temperature_to_string[cur_target_temp].c_str());
+      }
+      if (compaction_size > mutable_cf_options.max_compaction_bytes) {
        break;
-      } else {
-        assert(prev_file == nullptr);
      }
    }
  }
@ -391,7 +417,7 @@ Compaction* FIFOCompactionPicker::PickCompactionToWarm(
      std::move(inputs), 0, 0 /* output file size limit */,
      0 /* max compaction bytes, not applicable */, 0 /* output path ID */,
      mutable_cf_options.compression, mutable_cf_options.compression_opts,
-      Temperature::kWarm,
+      compaction_target_temp,
      /* max_subcompactions */ 0, {}, /* is manual */ false, /* trim_ts */ "",
      vstorage->CompactionScore(0),
      /* is deletion compaction */ false, /* l0_files_might_overlap */ true,
@ -413,8 +439,8 @@ Compaction* FIFOCompactionPicker::PickCompaction(
                           vstorage, log_buffer);
  }
  if (c == nullptr) {
-    c = PickCompactionToWarm(cf_name, mutable_cf_options, mutable_db_options,
-                             vstorage, log_buffer);
+    c = PickTemperatureChangeCompaction(
+        cf_name, mutable_cf_options, mutable_db_options, vstorage, log_buffer);
  }
  RegisterCompaction(c);
  return c;
@ -443,4 +469,3 @@ Compaction* FIFOCompactionPicker::CompactRange(
 }

 }  // namespace ROCKSDB_NAMESPACE
-#endif  // !ROCKSDB_LITE
--- a/db/compaction/compaction_picker_fifo.h
+++ b/db/compaction/compaction_picker_fifo.h
@ -8,7 +8,6 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

 #pragma once
-#ifndef ROCKSDB_LITE

 #include "db/compaction/compaction_picker.h"

@ -53,11 +52,9 @@ class FIFOCompactionPicker : public CompactionPicker {
                                 VersionStorageInfo* version,
                                 LogBuffer* log_buffer);

-  Compaction* PickCompactionToWarm(const std::string& cf_name,
-                                   const MutableCFOptions& mutable_cf_options,
-                                   const MutableDBOptions& mutable_db_options,
-                                   VersionStorageInfo* version,
-                                   LogBuffer* log_buffer);
+  Compaction* PickTemperatureChangeCompaction(
+      const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
+      const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
+      LogBuffer* log_buffer);
 };
 }  // namespace ROCKSDB_NAMESPACE
-#endif  // !ROCKSDB_LITE
--- a/db/compaction/compaction_picker_level.cc
+++ b/db/compaction/compaction_picker_level.cc
@ -83,7 +83,7 @@ class LevelCompactionBuilder {

  Compaction* GetCompaction();

-  // For the specfied level, pick a file that we want to compact.
+  // From `start_level_`, pick files to compact to `output_level_`.
  // Returns false if there is no file to compact.
  // If it returns true, inputs->files.size() will be exactly one for
  // all compaction priorities except round-robin. For round-robin,
@ -107,8 +107,9 @@ class LevelCompactionBuilder {
  bool PickIntraL0Compaction();

  // Return true if TrivialMove is extended. `start_index` is the index of
-  // the intiial file picked, which should already be in `start_level_inputs_`.
-  bool TryExtendNonL0TrivialMove(int start_index);
+  // the initial file picked, which should already be in `start_level_inputs_`.
+  bool TryExtendNonL0TrivialMove(int start_index,
+                                 bool only_expand_right = false);

  // Picks a file from level_files to compact.
  // level_files is a vector of (level, file metadata) in ascending order of
@ -355,7 +356,8 @@ void LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion() {
  vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
                                  &output_level_inputs.files);
  if (output_level_inputs.empty()) {
-    if (TryExtendNonL0TrivialMove((int)start_index)) {
+    if (TryExtendNonL0TrivialMove((int)start_index,
+                                  true /* only_expand_right */)) {
      return;
    }
  }
@ -501,6 +503,16 @@ Compaction* LevelCompactionBuilder::PickCompaction() {
 }

 Compaction* LevelCompactionBuilder::GetCompaction() {
+  // TryPickL0TrivialMove() does not apply to the case when compacting L0 to an
+  // empty output level. So the L0 file is picked in PickFileToCompact() by
+  // compaction score. We may still be able to do a trivial move when this file
+  // does not overlap with other L0s. This happens when
+  // compaction_inputs_[0].size() == 1 since SetupOtherL0FilesIfNeeded() did not
+  // pull in more L0s.
+  assert(!compaction_inputs_.empty());
+  bool l0_files_might_overlap =
+      start_level_ == 0 && !is_l0_trivial_move_ &&
+      (compaction_inputs_.size() > 1 || compaction_inputs_[0].size() > 1);
  auto c = new Compaction(
      vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
      std::move(compaction_inputs_), output_level_,
@ -515,8 +527,7 @@ Compaction* LevelCompactionBuilder::GetCompaction() {
      Temperature::kUnknown,
      /* max_subcompactions */ 0, std::move(grandparents_), is_manual_,
      /* trim_ts */ "", start_level_score_, false /* deletion_compaction */,
-      /* l0_files_might_overlap */ start_level_ == 0 && !is_l0_trivial_move_,
-      compaction_reason_);
+      l0_files_might_overlap, compaction_reason_);

  // If it's level 0 compaction, make sure we don't execute any other level 0
  // compactions in parallel
@ -653,7 +664,8 @@ bool LevelCompactionBuilder::TryPickL0TrivialMove() {
  return false;
 }

-bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index) {
+bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index,
+                                                       bool only_expand_right) {
  if (start_level_inputs_.size() == 1 &&
      (ioptions_.db_paths.empty() || ioptions_.db_paths.size() == 1) &&
      (mutable_cf_options_.compression_per_level.empty())) {
@ -670,6 +682,7 @@ bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index) {
    size_t total_size = initial_file->fd.GetFileSize();
    CompactionInputFiles output_level_inputs;
    output_level_inputs.level = output_level_;
+    // Expand towards right
    for (int i = start_index + 1;
         i < static_cast<int>(level_files.size()) &&
         start_level_inputs_.size() < kMaxMultiTrivialMove;
@ -702,6 +715,37 @@ bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index) {
      }
      start_level_inputs_.files.push_back(next_file);
    }
+    // Expand towards left
+    if (!only_expand_right) {
+      for (int i = start_index - 1;
+           i >= 0 && start_level_inputs_.size() < kMaxMultiTrivialMove; i--) {
+        FileMetaData* next_file = level_files[i];
+        if (next_file->being_compacted) {
+          break;
+        }
+        vstorage_->GetOverlappingInputs(output_level_, &(next_file->smallest),
+                                        &(initial_file->largest),
+                                        &output_level_inputs.files);
+        if (!output_level_inputs.empty()) {
+          break;
+        }
+        if (i > 0 && compaction_picker_->icmp()
+                             ->user_comparator()
+                             ->CompareWithoutTimestamp(
+                                 next_file->smallest.user_key(),
+                                 level_files[i - 1]->largest.user_key()) == 0) {
+          // Not a clean up after adding the next file. Skip.
+          break;
+        }
+        total_size += next_file->fd.GetFileSize();
+        if (total_size > mutable_cf_options_.max_compaction_bytes) {
+          break;
+        }
+        // keep `files` sorted in increasing order by key range
+        start_level_inputs_.files.insert(start_level_inputs_.files.begin(),
+                                         next_file);
+      }
+    }
    return start_level_inputs_.size() > 1;
  }
  return false;
@ -785,7 +829,10 @@ bool LevelCompactionBuilder::PickFileToCompact() {
    vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
                                    &output_level_inputs.files);
    if (output_level_inputs.empty()) {
-      if (TryExtendNonL0TrivialMove(index)) {
+      if (start_level_ > 0 &&
+          TryExtendNonL0TrivialMove(index,
+                                    ioptions_.compaction_pri ==
+                                        kRoundRobin /* only_expand_right */)) {
        break;
      }
    } else {
--- a/db/compaction/compaction_picker_test.cc
+++ b/db/compaction/compaction_picker_test.cc
@ -70,6 +70,11 @@ class CompactionPickerTestBase : public testing::Test {
    mutable_cf_options_.RefreshDerivedOptions(ioptions_);
    ioptions_.cf_paths.emplace_back("dummy",
                                    std::numeric_limits<uint64_t>::max());
+    // When the default value of this option is true, universal compaction
+    // tests can encounter assertion failure since SanitizeOption() is
+    // not run to set this option to false. So we do the sanitization
+    // here. Tests that test this option set this option to true explicitly.
+    ioptions_.level_compaction_dynamic_level_bytes = false;
  }

  ~CompactionPickerTestBase() override {}
@ -148,7 +153,8 @@ class CompactionPickerTestBase : public testing::Test {
        smallest_seq, largest_seq, marked_for_compact, temperature,
        kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
        kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum,
-        kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
+        kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0,
+        true /* user_defined_timestamps_persisted */);
    f->compensated_file_size =
        (compensated_file_size != 0) ? compensated_file_size : file_size;
    f->oldest_ancester_time = oldest_ancestor_time;
@ -482,8 +488,6 @@ TEST_F(CompactionPickerTest, LevelTriggerDynamic4) {
  ASSERT_EQ(num_levels - 1, compaction->output_level());
 }

-// Universal and FIFO Compactions are not supported in ROCKSDB_LITE
-#ifndef ROCKSDB_LITE
 TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
  NewVersionStorage(1, kCompactionStyleUniversal);
  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
@ -507,7 +511,7 @@ TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {

 TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) {
  const uint64_t kFileSize = 100000;
-  NewVersionStorage(1, kCompactionStyleUniversal);
+  NewVersionStorage(3 /* num_levels */, kCompactionStyleUniversal);
  ioptions_.allow_ingest_behind = true;
  ioptions_.num_levels = 3;
  UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
@ -534,6 +538,14 @@ TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) {

  // output level should be the one above the bottom-most
  ASSERT_EQ(1, compaction->output_level());
+
+  // input should not include the reserved level
+  const std::vector<CompactionInputFiles>* inputs = compaction->inputs();
+  for (const auto& compaction_input : *inputs) {
+    if (!compaction_input.empty()) {
+      ASSERT_LT(compaction_input.level, 2);
+    }
+  }
 }
 // Tests if the files can be trivially moved in multi level
 // universal compaction when allow_trivial_move option is set
@ -1007,29 +1019,28 @@ TEST_F(CompactionPickerTest, NeedsCompactionFIFO) {
  }
 }

-TEST_F(CompactionPickerTest, FIFOToWarm1) {
+TEST_F(CompactionPickerTest, FIFOToCold1) {
  NewVersionStorage(1, kCompactionStyleFIFO);
  const uint64_t kFileSize = 100000;
  const uint64_t kMaxSize = kFileSize * 100000;
-  uint64_t kWarmThreshold = 2000;
+  uint64_t kColdThreshold = 2000;

  fifo_options_.max_table_files_size = kMaxSize;
-  fifo_options_.age_for_warm = kWarmThreshold;
+  fifo_options_.file_temperature_age_thresholds = {
+      {Temperature::kCold, kColdThreshold}};
  mutable_cf_options_.compaction_options_fifo = fifo_options_;
-  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 100;
  mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);

  int64_t current_time = 0;
  ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
  uint64_t threshold_time =
-      static_cast<uint64_t>(current_time) - kWarmThreshold;
-  Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
-      Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
-  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
-      Temperature::kUnknown, threshold_time + 100);
-  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
-      Temperature::kUnknown, threshold_time - 2000);
+      static_cast<uint64_t>(current_time) - kColdThreshold;
+  Add(0 /* level */, 4U /* file_number */, "260", "300", 1 * kFileSize, 0, 2500,
+      2600, 0, true, Temperature::kUnknown,
+      threshold_time - 2000 /* oldest_ancestor_time */);
+  // Qualifies for compaction to kCold.
  Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
      Temperature::kUnknown, threshold_time - 3000);
  UpdateVersionStorageInfo();
@ -1039,33 +1050,36 @@ TEST_F(CompactionPickerTest, FIFOToWarm1) {
      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
      &log_buffer_));
  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(compaction->compaction_reason(),
+            CompactionReason::kChangeTemperature);
+  ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
  ASSERT_EQ(1U, compaction->num_input_files(0));
  ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
 }

-TEST_F(CompactionPickerTest, FIFOToWarm2) {
+TEST_F(CompactionPickerTest, FIFOToCold2) {
  NewVersionStorage(1, kCompactionStyleFIFO);
  const uint64_t kFileSize = 100000;
  const uint64_t kMaxSize = kFileSize * 100000;
-  uint64_t kWarmThreshold = 2000;
+  uint64_t kColdThreshold = 2000;

  fifo_options_.max_table_files_size = kMaxSize;
-  fifo_options_.age_for_warm = kWarmThreshold;
+  fifo_options_.file_temperature_age_thresholds = {
+      {Temperature::kCold, kColdThreshold}};
  mutable_cf_options_.compaction_options_fifo = fifo_options_;
-  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 100;
  mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);

  int64_t current_time = 0;
  ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
  uint64_t threshold_time =
-      static_cast<uint64_t>(current_time) - kWarmThreshold;
+      static_cast<uint64_t>(current_time) - kColdThreshold;
  Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
      Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
-  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
-      Temperature::kUnknown, threshold_time + 100);
  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
-      Temperature::kUnknown, threshold_time - 2000);
+      Temperature::kUnknown, threshold_time);
+  // The following two files qualify for compaction to kCold.
  Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
      Temperature::kUnknown, threshold_time - 3000);
  Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
@ -1077,34 +1091,40 @@ TEST_F(CompactionPickerTest, FIFOToWarm2) {
      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
      &log_buffer_));
  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(compaction->compaction_reason(),
+            CompactionReason::kChangeTemperature);
+  ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
  ASSERT_EQ(2U, compaction->num_input_files(0));
  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
  ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
 }

-TEST_F(CompactionPickerTest, FIFOToWarmMaxSize) {
+TEST_F(CompactionPickerTest, FIFOToColdMaxCompactionSize) {
  NewVersionStorage(1, kCompactionStyleFIFO);
  const uint64_t kFileSize = 100000;
  const uint64_t kMaxSize = kFileSize * 100000;
-  uint64_t kWarmThreshold = 2000;
+  uint64_t kColdThreshold = 2000;

  fifo_options_.max_table_files_size = kMaxSize;
-  fifo_options_.age_for_warm = kWarmThreshold;
+  fifo_options_.file_temperature_age_thresholds = {
+      {Temperature::kCold, kColdThreshold}};
  mutable_cf_options_.compaction_options_fifo = fifo_options_;
-  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 100;
  mutable_cf_options_.max_compaction_bytes = kFileSize * 9;
  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);

  int64_t current_time = 0;
  ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
  uint64_t threshold_time =
-      static_cast<uint64_t>(current_time) - kWarmThreshold;
+      static_cast<uint64_t>(current_time) - kColdThreshold;
  Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
      Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
      Temperature::kUnknown, threshold_time + 100);
  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
      Temperature::kUnknown, threshold_time - 2000);
+  // The following files qualify for compaction to kCold, but only the last
+  // two should be included to respect `max_compaction_bytes`.
  Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
      Temperature::kUnknown, threshold_time - 3000);
  Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
@ -1118,40 +1138,45 @@ TEST_F(CompactionPickerTest, FIFOToWarmMaxSize) {
      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
      &log_buffer_));
  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(compaction->compaction_reason(),
+            CompactionReason::kChangeTemperature);
+  ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
  ASSERT_EQ(2U, compaction->num_input_files(0));
  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
 }

-TEST_F(CompactionPickerTest, FIFOToWarmWithExistingWarm) {
+TEST_F(CompactionPickerTest, FIFOToColdWithExistingCold) {
  NewVersionStorage(1, kCompactionStyleFIFO);
  const uint64_t kFileSize = 100000;
  const uint64_t kMaxSize = kFileSize * 100000;
-  uint64_t kWarmThreshold = 2000;
+  uint64_t kColdThreshold = 2000;

  fifo_options_.max_table_files_size = kMaxSize;
-  fifo_options_.age_for_warm = kWarmThreshold;
+  fifo_options_.file_temperature_age_thresholds = {
+      {Temperature::kCold, kColdThreshold}};
  mutable_cf_options_.compaction_options_fifo = fifo_options_;
-  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 100;
  mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);

  int64_t current_time = 0;
  ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
  uint64_t threshold_time =
-      static_cast<uint64_t>(current_time) - kWarmThreshold;
+      static_cast<uint64_t>(current_time) - kColdThreshold;
  Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
      Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
      Temperature::kUnknown, threshold_time + 100);
  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
      Temperature::kUnknown, threshold_time - 2000);
+  // The following two files qualify for compaction to kCold.
  Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
      Temperature::kUnknown, threshold_time - 3000);
  Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
      Temperature::kUnknown, threshold_time - 4000);
  Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
-      Temperature::kWarm, threshold_time - 5000);
+      Temperature::kCold, threshold_time - 5000);
  UpdateVersionStorageInfo();

  ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
@ -1159,28 +1184,32 @@ TEST_F(CompactionPickerTest, FIFOToWarmWithExistingWarm) {
      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
      &log_buffer_));
  ASSERT_TRUE(compaction.get() != nullptr);
-  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(compaction->compaction_reason(),
+            CompactionReason::kChangeTemperature);
+  ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->num_input_files(0));
  ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
 }

-TEST_F(CompactionPickerTest, FIFOToWarmWithOngoing) {
+TEST_F(CompactionPickerTest, FIFOToColdWithHotBetweenCold) {
  NewVersionStorage(1, kCompactionStyleFIFO);
  const uint64_t kFileSize = 100000;
  const uint64_t kMaxSize = kFileSize * 100000;
-  uint64_t kWarmThreshold = 2000;
+  uint64_t kColdThreshold = 2000;

  fifo_options_.max_table_files_size = kMaxSize;
-  fifo_options_.age_for_warm = kWarmThreshold;
+  fifo_options_.file_temperature_age_thresholds = {
+      {Temperature::kCold, kColdThreshold}};
  mutable_cf_options_.compaction_options_fifo = fifo_options_;
-  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 100;
  mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);

  int64_t current_time = 0;
  ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
  uint64_t threshold_time =
-      static_cast<uint64_t>(current_time) - kWarmThreshold;
+      static_cast<uint64_t>(current_time) - kColdThreshold;
  Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
      Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
@ -1188,65 +1217,78 @@ TEST_F(CompactionPickerTest, FIFOToWarmWithOngoing) {
  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
      Temperature::kUnknown, threshold_time - 2000);
  Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
-      Temperature::kUnknown, threshold_time - 3000);
+      Temperature::kCold, threshold_time - 3000);
+  // Qualifies for compaction to kCold.
  Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
      Temperature::kUnknown, threshold_time - 4000);
  Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
-      Temperature::kWarm, threshold_time - 5000);
-  file_map_[2].first->being_compacted = true;
+      Temperature::kCold, threshold_time - 5000);
  UpdateVersionStorageInfo();

  ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
  std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
      &log_buffer_));
-  // Stop if a file is being compacted
-  ASSERT_TRUE(compaction.get() == nullptr);
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(compaction->compaction_reason(),
+            CompactionReason::kChangeTemperature);
+  ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
+  ASSERT_EQ(1U, compaction->num_input_files(0));
+  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
 }

-TEST_F(CompactionPickerTest, FIFOToWarmWithHotBetweenWarms) {
+TEST_F(CompactionPickerTest, FIFOToColdAndWarm) {
  NewVersionStorage(1, kCompactionStyleFIFO);
  const uint64_t kFileSize = 100000;
  const uint64_t kMaxSize = kFileSize * 100000;
-  uint64_t kWarmThreshold = 2000;
+  uint64_t kWarmThreshold = 10000;
+  uint64_t kHotThreshold = 2000;

  fifo_options_.max_table_files_size = kMaxSize;
-  fifo_options_.age_for_warm = kWarmThreshold;
+  // Test that multiple thresholds work.
+  fifo_options_.file_temperature_age_thresholds = {
+      {Temperature::kHot, kHotThreshold}, {Temperature::kWarm, kWarmThreshold}};
  mutable_cf_options_.compaction_options_fifo = fifo_options_;
-  mutable_cf_options_.level0_file_num_compaction_trigger = 2;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 100;
  mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
  FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);

  int64_t current_time = 0;
  ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
-  uint64_t threshold_time =
+  uint64_t hot_threshold_time =
+      static_cast<uint64_t>(current_time) - kHotThreshold;
+  uint64_t warm_threshold_time =
      static_cast<uint64_t>(current_time) - kWarmThreshold;
  Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
      Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
  Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
-      Temperature::kUnknown, threshold_time + 100);
+      Temperature::kUnknown, hot_threshold_time + 100);
  Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
-      Temperature::kUnknown, threshold_time - 2000);
+      Temperature::kUnknown, hot_threshold_time - 200);
+  // Qualifies for Hot
  Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
-      Temperature::kWarm, threshold_time - 3000);
+      Temperature::kUnknown, warm_threshold_time - 100);
+  // Qualifies for Warm
  Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
-      Temperature::kUnknown, threshold_time - 4000);
+      Temperature::kUnknown, warm_threshold_time - 4000);
  Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
-      Temperature::kWarm, threshold_time - 5000);
+      Temperature::kUnknown, warm_threshold_time - 5000);
  UpdateVersionStorageInfo();

  ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
  std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
      &log_buffer_));
-  // Stop if a file is being compacted
  ASSERT_TRUE(compaction.get() != nullptr);
-  ASSERT_EQ(1U, compaction->num_input_files(0));
-  ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(compaction->compaction_reason(),
+            CompactionReason::kChangeTemperature);
+  // Assumes compaction picker picks older files first.
+  ASSERT_EQ(compaction->output_temperature(), Temperature::kWarm);
+  ASSERT_EQ(2U, compaction->num_input_files(0));
+  ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
 }

-#endif  // ROCKSDB_LITE
-
 TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) {
  NewVersionStorage(6, kCompactionStyleLevel);
  ioptions_.compaction_pri = kMinOverlappingRatio;
@ -2523,6 +2565,61 @@ TEST_F(CompactionPickerTest, L0TrivialMoveWholeL0) {
  ASSERT_TRUE(compaction->IsTrivialMove());
 }

+TEST_F(CompactionPickerTest, NonL0TrivialMoveExtendBothDirection) {
+  mutable_cf_options_.max_bytes_for_level_base = 5000;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+  mutable_cf_options_.max_compaction_bytes = 10000000u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  NewVersionStorage(6, kCompactionStyleLevel);
+
+  Add(1, 1U, "300", "350", 3000U, 0, 710, 800, 3000U);
+  Add(1, 2U, "600", "651", 3001U, 0, 610, 700, 3001U);
+  Add(1, 3U, "700", "750", 3000U, 0, 500, 550, 3000U);
+  Add(2, 4U, "800", "850", 4000U, 0, 150, 200, 4000U);
+
+  UpdateVersionStorageInfo();
+  // File #2 should be picked first, and the compaction should expand in both
+  // directions to include files #1 and #3.
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1, compaction->num_input_levels());
+  ASSERT_EQ(3, compaction->num_input_files(0));
+  ASSERT_EQ(1, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_EQ(2, compaction->input(0, 1)->fd.GetNumber());
+  ASSERT_EQ(3, compaction->input(0, 2)->fd.GetNumber());
+  ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
+TEST_F(CompactionPickerTest, L0TrivialMoveToEmptyLevel) {
+  mutable_cf_options_.max_bytes_for_level_base = 5000;
+  mutable_cf_options_.level0_file_num_compaction_trigger = 4;
+  mutable_cf_options_.max_compaction_bytes = 10000000u;
+  ioptions_.level_compaction_dynamic_level_bytes = false;
+  NewVersionStorage(6, kCompactionStyleLevel);
+
+  // File 2 will be picked first, which by itself is trivially movable.
+  // There was a bug where the compaction also picked files 3 and 4
+  // (and then file 1, since it overlaps with the key range),
+  // which made the compaction not trivially movable.
+  Add(0, 1U, "450", "599", 3000U, 0, 710, 800, 3000U);
+  Add(0, 2U, "600", "651", 3001U, 0, 610, 700, 3001U);
+  Add(0, 3U, "300", "350", 3000U, 0, 500, 550, 3000U);
+  Add(0, 4U, "500", "550", 2999U, 0, 300, 350, 2999U);
+
+  UpdateVersionStorageInfo();
+
+  std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
+      cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
+      &log_buffer_));
+  ASSERT_TRUE(compaction.get() != nullptr);
+  ASSERT_EQ(1, compaction->num_input_levels());
+  ASSERT_EQ(1, compaction->num_input_files(0));
+  ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber());
+  ASSERT_TRUE(compaction->IsTrivialMove());
+}
+
 TEST_F(CompactionPickerTest, IsTrivialMoveOffSstPartitioned) {
  mutable_cf_options_.max_bytes_for_level_base = 10000u;
  mutable_cf_options_.max_compaction_bytes = 10001u;
@ -2873,7 +2970,6 @@ TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) {
  ASSERT_EQ(0, compaction->output_level());
 }

-#ifndef ROCKSDB_LITE
 TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) {
  const uint64_t kFileSize = 100000;

@ -3982,7 +4078,6 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
 INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompactionPickerTest,
                        PerKeyPlacementCompactionPickerTest, ::testing::Bool());

-#endif  // ROCKSDB_LITE

 }  // namespace ROCKSDB_NAMESPACE

--- a/db/compaction/compaction_picker_universal.cc
+++ b/db/compaction/compaction_picker_universal.cc
@ -8,7 +8,6 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

 #include "db/compaction/compaction_picker_universal.h"
-#ifndef ROCKSDB_LITE

 #include <cinttypes>
 #include <limits>
@ -20,7 +19,7 @@
 #include "file/filename.h"
 #include "logging/log_buffer.h"
 #include "logging/logging.h"
-#include "monitoring/statistics.h"
+#include "monitoring/statistics_impl.h"
 #include "test_util/sync_point.h"
 #include "util/random.h"
 #include "util/string_util.h"
@ -134,8 +133,8 @@ class UniversalCompactionBuilder {
  UniversalCompactionPicker* picker_;
  LogBuffer* log_buffer_;

-  static std::vector<SortedRun> CalculateSortedRuns(
-      const VersionStorageInfo& vstorage);
+  static std::vector<UniversalCompactionBuilder::SortedRun> CalculateSortedRuns(
+      const VersionStorageInfo& vstorage, int last_level);

  // Pick a path ID to place a newly generated file, with its estimated file
  // size.
@ -340,13 +339,13 @@ void UniversalCompactionBuilder::SortedRun::DumpSizeInfo(

 std::vector<UniversalCompactionBuilder::SortedRun>
 UniversalCompactionBuilder::CalculateSortedRuns(
-    const VersionStorageInfo& vstorage) {
+    const VersionStorageInfo& vstorage, int last_level) {
  std::vector<UniversalCompactionBuilder::SortedRun> ret;
  for (FileMetaData* f : vstorage.LevelFiles(0)) {
    ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size,
                     f->being_compacted);
  }
-  for (int level = 1; level < vstorage.num_levels(); level++) {
+  for (int level = 1; level <= last_level; level++) {
    uint64_t total_compensated_size = 0U;
    uint64_t total_size = 0U;
    bool being_compacted = false;
@ -375,7 +374,9 @@ UniversalCompactionBuilder::CalculateSortedRuns(
 Compaction* UniversalCompactionBuilder::PickCompaction() {
  const int kLevel0 = 0;
  score_ = vstorage_->CompactionScore(kLevel0);
-  sorted_runs_ = CalculateSortedRuns(*vstorage_);
+  int max_output_level =
+      vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
+  sorted_runs_ = CalculateSortedRuns(*vstorage_, max_output_level);

  if (sorted_runs_.size() == 0 ||
      (vstorage_->FilesMarkedForPeriodicCompaction().empty() &&
@ -472,6 +473,8 @@ Compaction* UniversalCompactionBuilder::PickCompaction() {
        "UniversalCompactionBuilder::PickCompaction:Return", nullptr);
    return nullptr;
  }
+  assert(c->output_level() <=
+         vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind));

  if (mutable_cf_options_.compaction_options_universal.allow_trivial_move ==
          true &&
@ -699,22 +702,18 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
      GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
  int start_level = sorted_runs_[start_index].level;
  int output_level;
+  // last level is reserved for the files ingested behind
+  int max_output_level =
+      vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
  if (first_index_after == sorted_runs_.size()) {
-    output_level = vstorage_->num_levels() - 1;
+    output_level = max_output_level;
  } else if (sorted_runs_[first_index_after].level == 0) {
    output_level = 0;
  } else {
    output_level = sorted_runs_[first_index_after].level - 1;
  }

-  // last level is reserved for the files ingested behind
-  if (ioptions_.allow_ingest_behind &&
-      (output_level == vstorage_->num_levels() - 1)) {
-    assert(output_level > 1);
-    output_level--;
-  }
-
-  std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+  std::vector<CompactionInputFiles> inputs(max_output_level + 1);
  for (size_t i = 0; i < inputs.size(); ++i) {
    inputs[i].level = start_level + static_cast<int>(i);
  }
@ -1193,8 +1192,10 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
      return nullptr;
    }

+    int max_output_level =
+        vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
    // Pick the first non-empty level after the start_level
-    for (output_level = start_level + 1; output_level < vstorage_->num_levels();
+    for (output_level = start_level + 1; output_level <= max_output_level;
         output_level++) {
      if (vstorage_->NumLevelFiles(output_level) != 0) {
        break;
@ -1202,9 +1203,9 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
    }

    // If all higher levels are empty, pick the highest level as output level
-    if (output_level == vstorage_->num_levels()) {
+    if (output_level > max_output_level) {
      if (start_level == 0) {
-        output_level = vstorage_->num_levels() - 1;
+        output_level = max_output_level;
      } else {
        // If start level is non-zero and all higher levels are empty, this
        // compaction will translate into a trivial move. Since the idea is
@ -1213,11 +1214,7 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
        return nullptr;
      }
    }
-    if (ioptions_.allow_ingest_behind &&
-        output_level == vstorage_->num_levels() - 1) {
-      assert(output_level > 1);
-      output_level--;
-    }
+    assert(output_level <= max_output_level);

    if (output_level != 0) {
      if (start_level == 0) {
@ -1294,8 +1291,9 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
  uint32_t path_id =
      GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
  int start_level = sorted_runs_[start_index].level;
-
-  std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
+  int max_output_level =
+      vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
+  std::vector<CompactionInputFiles> inputs(max_output_level + 1);
  for (size_t i = 0; i < inputs.size(); ++i) {
    inputs[i].level = start_level + static_cast<int>(i);
  }
@ -1332,13 +1330,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(

  int output_level;
  if (end_index == sorted_runs_.size() - 1) {
-    // output files at the last level, unless it's reserved
-    output_level = vstorage_->num_levels() - 1;
-    // last level is reserved for the files ingested behind
-    if (ioptions_.allow_ingest_behind) {
-      assert(output_level > 1);
-      output_level--;
-    }
+    output_level = max_output_level;
  } else {
    // if it's not including all sorted_runs, it can only output to the level
    // above the `end_index + 1` sorted_run.
@ -1451,4 +1443,3 @@ uint64_t UniversalCompactionBuilder::GetMaxOverlappingBytes() const {
 }
 }  // namespace ROCKSDB_NAMESPACE

-#endif  // !ROCKSDB_LITE
--- a/db/compaction/compaction_picker_universal.h
+++ b/db/compaction/compaction_picker_universal.h
@ -8,7 +8,6 @@
 // found in the LICENSE file. See the AUTHORS file for names of contributors.

 #pragma once
-#ifndef ROCKSDB_LITE

 #include "db/compaction/compaction_picker.h"

@ -29,4 +28,3 @@ class UniversalCompactionPicker : public CompactionPicker {
      const VersionStorageInfo* vstorage) const override;
 };
 }  // namespace ROCKSDB_NAMESPACE
-#endif  // !ROCKSDB_LITE
--- a/db/compaction/compaction_service_job.cc
+++ b/db/compaction/compaction_service_job.cc
@ -16,7 +16,6 @@
 #include "options/options_helper.h"
 #include "rocksdb/utilities/options_type.h"

-#ifndef ROCKSDB_LITE
 namespace ROCKSDB_NAMESPACE {
 class SubcompactionState;

@ -832,4 +831,3 @@ bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other,
 #endif  // NDEBUG
 }  // namespace ROCKSDB_NAMESPACE

-#endif  // !ROCKSDB_LITE
--- a/db/compaction/compaction_service_test.cc
+++ b/db/compaction/compaction_service_test.cc
@ -3,7 +3,6 @@
 //  COPYING file in the root directory) and Apache 2.0 License
 //  (found in the LICENSE.Apache file in the root directory).

-#ifndef ROCKSDB_LITE

 #include "db/db_test_util.h"
 #include "port/stack_trace.h"
@ -929,7 +928,7 @@ TEST_F(CompactionServiceTest, TablePropertiesCollector) {
    }
    ASSERT_OK(Flush());
  }
-  ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
+  ASSERT_OK(dbfull()->TEST_WaitForCompact());

  ASSERT_OK(db_->GetPropertiesOfAllTables(&fname_to_props));

@ -953,14 +952,3 @@ int main(int argc, char** argv) {
  RegisterCustomObjects(argc, argv);
  return RUN_ALL_TESTS();
 }
-
-#else
-#include <stdio.h>
-
-int main(int /*argc*/, char** /*argv*/) {
-  fprintf(stderr,
-          "SKIPPED as CompactionService is not supported in ROCKSDB_LITE\n");
-  return 0;
-}
-
-#endif  // ROCKSDB_LITE
--- a/db/compaction/sst_partitioner.cc
+++ b/db/compaction/sst_partitioner.cc
@ -15,11 +15,9 @@
 namespace ROCKSDB_NAMESPACE {
 static std::unordered_map<std::string, OptionTypeInfo>
    sst_fixed_prefix_type_info = {
-#ifndef ROCKSDB_LITE
        {"length",
         {0, OptionType::kSizeT, OptionVerificationType::kNormal,
          OptionTypeFlags::kNone}},
-#endif  // ROCKSDB_LITE
 };

 SstPartitionerFixedPrefixFactory::SstPartitionerFixedPrefixFactory(size_t len)
@ -58,7 +56,6 @@ std::shared_ptr<SstPartitionerFactory> NewSstPartitionerFixedPrefixFactory(
  return std::make_shared<SstPartitionerFixedPrefixFactory>(prefix_len);
 }

-#ifndef ROCKSDB_LITE
 namespace {
 static int RegisterSstPartitionerFactories(ObjectLibrary& library,
                                           const std::string& /*arg*/) {
@ -73,18 +70,14 @@ static int RegisterSstPartitionerFactories(ObjectLibrary& library,
  return 1;
 }
 }  // namespace
-#endif  // ROCKSDB_LITE

 Status SstPartitionerFactory::CreateFromString(
    const ConfigOptions& options, const std::string& value,
    std::shared_ptr<SstPartitionerFactory>* result) {
-#ifndef ROCKSDB_LITE
  static std::once_flag once;
  std::call_once(once, [&]() {
    RegisterSstPartitionerFactories(*(ObjectLibrary::Default().get()), "");
  });
-#endif  // ROCKSDB_LITE
-  return LoadSharedObject<SstPartitionerFactory>(options, value, nullptr,
-                                                 result);
+  return LoadSharedObject<SstPartitionerFactory>(options, value, result);
 }
 }  // namespace ROCKSDB_NAMESPACE
--- a/Show More
+++ b/Show More