Compare commits

...

360 Commits

Author SHA1 Message Date
Niko PLP e542f605ac Revert "trying with memcpy in openssl" 1 year ago
Niko PLP 1f28422b79 Revert "randomize only the IV part of prefix" 1 year ago
Niko PLP 2776c764d0 copying key when received 1 year ago
Niko PLP 85d59d7c73 randomize only the IV part of prefix 1 year ago
Niko PLP 62d62a9d12 trying with memcpy in openssl 1 year ago
Niko PLP d334e861a1 fix intermittent segfault discovered in test in openbsd. new ctx for every encrypt 1 year ago
Niko PLP 56e77c4c66 fix for android build 1 year ago
Niko PLP 4de4c7d0d6 removed unnecessary header in openssl plugin 1 year ago
Niko PLP 82db9514a3 remove typo in makefile 1 year ago
Niko PLP ba14cbef4f fix openssl includes 1 year ago
Niko PLP 2d05557bbc remove include for windows 1 year ago
Niko PLP b5f9a498ce gitignore for test 1 year ago
Niko PLP cb0a82ea93 openssl plugin for data at rest encryption in rocksdb 1 year ago
Niko PLP 76238c71dd adding ippcp windows binary library 1 year ago
Niko PLP 93c3369d39 fix tests 1 year ago
Niko PLP e393b5183a separating binary lib of ippcp by target platform 1 year ago
Niko PLP a397fbd0e2 adding rocksdb_create_encrypted_env 1 year ago
Niko PLP 217f764211 adding binary library file 1 year ago
Niko PLP 3d515daf29 intel IPPCP library source and binaries included in plugin 1 year ago
Niko PLP 293fe1abf8 upgrade platform version of macos to 10.14 because of C++17 standard 1 year ago
Niko PLP d5fb4a8763 plugin for AES CTR encryption with IPPCP intel library 1 year ago
Niko PLP e8f69a734a fix CXX not initialized early enough in Makefile on openbsd 1 year ago
Tpt f773c1d376 Makes MockFileSystem::GetAbsolutePath work on Windows 1 year ago
Hui Xiao 09882a52d6 Prepare for deprecation of Options::access_hint_on_compaction_start (#11658) 1 year ago
Vardhan 87a21d08fe Add an option to trigger flush when the number of range deletions reach a threshold (#11358) 1 year ago
Peter Dillinger f9de217353 Some cache_bench enhancements (#11661) 1 year ago
Andrew Kryczka cf95821fb6 Update for 8.5.fb branch cut (#11642) 1 year ago
Peter Dillinger f4e4039f00 Add some more bit operations to internal APIs (#11660) 1 year ago
amatveev-cf 946d1009bc Expand Statistics support in the C API (#11263) 1 year ago
Jay Huh 9a2a6db2a9 Use C++17 [[fallthrough]] in transaction_test.cc (#11663) 1 year ago
Peter Dillinger bb8fcc0044 db_stress: Reinstate Transaction::Rollback() calls before destruction (#11656) 1 year ago
Peter Dillinger 7a1b0207e6 format_version=6 and context-aware block checksums (#9058) 1 year ago
Peter Dillinger b3c54186ab Allow TryAgain in db_stress with optimistic txn, and refactoring (#11653) 1 year ago
Peter Dillinger c205a217e6 Strip leading and trailing whitespace for unreleased_history entries (#11652) 1 year ago
Changyu Bi 6a0f637633 Compare the number of input keys and processed keys for compactions (#11571) 1 year ago
Yu Zhang 5dd8c114bb Add a UDT comparator for ReverseBytewiseComparator to object library (#11647) 1 year ago
akankshamahajan 63a5125a52 Fix use_after_free bug when underlying FS enables kFSBuffer (#11645) 1 year ago
Yu Zhang c24ef26ca7 Support switching on / off UDT together with in-Memtable-only feature (#11623) 1 year ago
Yu Zhang 4ea7b796b7 Respect cutoff timestamp during flush (#11599) 1 year ago
Changyu Bi 5c2a063c49 Clarify usage for options `ttl` and `periodic_compaction_seconds` for universal compaction (#11552) 1 year ago
ywave 9cc0986ae2 Fix comment in WriteBatchWithIndex::NewIteratorWithBase (#11636) 1 year ago
Peter Dillinger c41122b1a0 Even more HyperClockCache refactoring (#11630) 1 year ago
zhangyuxiang.ax 1567108fc1 Add missing table properties in plaintable GetTableProperties() (#11267) 1 year ago
Hui Xiao 629605d645 Move prefetching responsibility to page cache for compaction read under non directIO usecase (#11631) 1 year ago
darionyaphet df543460d5 Remove some useless qualifier (#11596) 1 year ago
Rémi Calixte 6628ff12d6 Extend C API to expose base db of transaction db (#11562) 1 year ago
ywave 86634885eb Fix typo in comment (#11617) 1 year ago
darionyaphet 64b0439bc1 fix typo (#11595) 1 year ago
shuzz 2f712235ab optimized code (#11614) 1 year ago
zhutao aeda36e925 add exe and script path check (#11621) 1 year ago
huangmengbin 98d0f6ec08 fix: VersionSet::DumpManifest (#11605) 1 year ago
Dan Wang 8a7b9888d4 Fix the sync point SanitizeOptions::AfterChangeMaxOpenFiles which is not executed in db_compaction_test (#11583) 1 year ago
Muhammad 977aae53d2 Allow rocksdb library to be usable with CMake's `FetchContent` API (#11575) 1 year ago
Andrew Kryczka 05c3b8ecac Prepare for specialized interface for row cache (#11620) 1 year ago
Chad Austin ff0d618c7f add a missing include (#11624) 1 year ago
Peter Dillinger 846db9d7b1 Refactor ClockCache ApplyToEntries (#11609) 1 year ago
Changyu Bi 662a1c99f6 Verify number of keys flushed during DB open (#11611) 1 year ago
akankshamahajan 749b179c04 Remove reallocation of AlignedBuffer in direct_io sync reads if already aligned (#11600) 1 year ago
Peter Dillinger b1b6f87fbe Some small improvements to HyperClockCache (#11601) 1 year ago
leipeng bc0db33483 Optimize about sstableKeyCompare (#11610) 1 year ago
Peter Dillinger c3c84b3397 Refactor (Hyper)ClockCache code for upcoming changes (#11572) 1 year ago
Changyu Bi 854eb76a8c Improve error message when an SST file in MANIFEST is not found (#11573) 1 year ago
weedge 1a7c741977 fix: std::optional value() build error on older macOS SDK (#11574) 1 year ago
Yu Zhang f74526341d Handle file boundaries when timestamps should not be persisted (#11578) 1 year ago
Yu Zhang baf37a0e81 Fix a unit test hole for recovering UDTs with WAL files (#11577) 1 year ago
Changyu Bi 1f410ff95f Make `rocksdb_options_add_compact_on_deletion_collector_factory` backward compatible (#11593) 1 year ago
Changyu Bi df082c8d1d Deprecate option `periodic_compaction_seconds` for FIFO compaction (#11550) 1 year ago
Changyu Bi c53d604f41 `sst_dump --command=verify` should verify block checksums (#11576) 1 year ago
leipeng 25b08eb438 MemTable::Add: first_seqno_.compare_exchange_weak to earliest_seqno_ (#11398) 1 year ago
darionyaphet f4e304f987 Simplify conditional judgment (#11580) 1 year ago
Yu Zhang 15053f3ab4 Logically strip timestamp during flush (#11557) 2 years ago
Griffin Smith bfdc91017c C-API: Expose remaining PlainTableOptions (#11442) 2 years ago
Akanksha Mahajan 5187ac2af3 Add skip_tmpdir_check arg in crash script (#11539) 2 years ago
Jay Schmidek f7aa70a72f Add create_column_families to C api (#9527) 2 years ago
Yelso Honnr 5732cf50e1 Add OpenBSD Support (#11255) 2 years ago
Yingchun Lai 44524cf5da remove duplicate comments in EncryptedEnv (#11549) 2 years ago
JUBIN CHHEDA b14c0b0602 Update secondary_cache_adapter.cc (#11566) 2 years ago
akankshamahajan 94c247bff8 Update HISTORY.md for branch cut for 8.4.fb (#11565) 2 years ago
akankshamahajan ff1cc8a63e Fix extra prefetching when num_file_reads_for_auto_readahead is 1 in async_io (#11560) 2 years ago
Changyu Bi ca50ccc71a Add CreateColumnFamilyWithImport to `StackableDB` and `DBImplReadOnly` (#11556) 2 years ago
akankshamahajan fbd2f563bb Add an interface to provide support for underlying FS to pass their own buffer during reads (#11324) 2 years ago
Yu Zhang fb5748decf Fix crash_test crash (#11554) 2 years ago
Peter Dillinger 05a1d52e77 Use FaultInjectionTestFS in transaction_test, clarify Close() APIs (#11499) 2 years ago
Yu Zhang 7521478b43 Record the `persist_user_defined_timestamps` flag in manifest (#11515) 2 years ago
Peter Dillinger 98c6d7fd80 Internal API for generating semi-random salt (#11331) 2 years ago
Alexandre Lavigne 2926e0718c Add missing parameter in C API (#11542) 2 years ago
Levi Tamasi 022d89549d Attempt to deflake DBWALTestWithEnrichedEnv.SkipDeletedWALs (#11537) 2 years ago
Levi Tamasi b3edb87341 Initialize StressTest::optimistic_txn_db_ in ctor (#11547) 2 years ago
Jay Huh 17d5200504 Stress/Crash Test for OptimisticTransactionDB (#11513) 2 years ago
Hui Xiao 1da9ac2363 Add UT to test BG read qps behavior during upgrade for pr11406 (#11522) 2 years ago
Yu Zhang 66499780b2 Fix error case memory bug in GetHostName() (#11544) 2 years ago
Yu Zhang b421a8c21b Add a ticker to track number of trash files deleted in background thread (#11540) 2 years ago
Changyu Bi bc04ec85db Make option `level_compaction_dynamic_level_bytes` true by default (#11525) 2 years ago
yaphet 253bc91953 Move the status judgment into the block (#11534) 2 years ago
darionyaphet 9f774baaa8 Support Error Recovery Retry Flush in GetFlushReasonString (#11536) 2 years ago
mayue.fight fa878a0107 Support to create a CF by importing multiple non-overlapping CFs (#11378) 2 years ago
Peter Dillinger 70bf5ef093 Avoid destroying default PosixEnv, safely (#11538) 2 years ago
Changyu Bi 15e8a843d9 Do not include last level in compaction when `allow_ingest_behind=true` (#11489) 2 years ago
Andrew Kryczka cac3240cbf add property "rocksdb.obsolete-sst-files-size" (#11533) 2 years ago
Changyu Bi e178f9e477 Fix failed CI job "Check buck targets and code format" (#11532) 2 years ago
Hui Xiao 6041e50eba Fix info_log comment in SSTFileManager (#11530) 2 years ago
Yu Zhang a2a90f8998 Fix typo in twitter link (#11529) 2 years ago
Ignat Loskutov 7c67aee4a0 statistics.cc: fix mistype (#11509) 2 years ago
Ignat Loskutov 05fcacdb42 Add missing stopwatch and perf timer to DBImplReadOnly (#11521) 2 years ago
Yu Zhang 77dda0d9d8 Fix use after move in data block hash index (#11505) 2 years ago
Peter Dillinger 2b2994c8db Fix old comment about HyperClockCache and SecondaryCache (#11517) 2 years ago
Hui Xiao 3093d98c78 Fix higher read qps during db open caused by pr 11406 (#11516) 2 years ago
Changyu Bi 2e8cc98ab2 Fix subcompaction bug to allow running two subcompactions (#11501) 2 years ago
leipeng ddfcbea3e1 IterKey: change space_[32] to 39 to utilize padding space (#10633) 2 years ago
Changyu Bi 633c738a98 Fix unit test `DBRangeDelTest.NonBottommostCompactionDropRangetombstone` (#11512) 2 years ago
Yu Zhang 4dafa5b220 switch to use RocksDB UnorderedMap (#11507) 2 years ago
Changyu Bi 4aa52d89cf Drop range tombstone during non-bottommost compaction (#11459) 2 years ago
Andrew Kryczka 687a2a0d9a Small improvements to DBGet microbenchmark (#11498) 2 years ago
Peter Dillinger 7a9b264f36 Some fixes to unreleased_history/ (#11504) 2 years ago
Changyu Bi 71ca9a1dcd Log correct compaction score for Universal Compaction (#11487) 2 years ago
Changyu Bi e95cc1217d `CompactRange()` always compacts to bottommost level for leveled compaction (#11468) 2 years ago
Yu Zhang 9f7877f246 Add support to strip / pad timestamp when creating / reading a block based table (#11495) 2 years ago
Changyu Bi 9f1ce6d804 Make `unreleased_history/release.sh` work on macOS (#11494) 2 years ago
darionyaphet 68a9cd21f2 Support single delete help message in ldb (#11493) 2 years ago
Jay Huh 87bc929db3 Flush option in WaitForCompact() (#11483) 2 years ago
Yu Zhang 56ca9e3106 Logging timestamp size record in WAL and use it during recovery (#11471) 2 years ago
Peter Dillinger 8848ec92dd Better management of unreleased HISTORY (#11481) 2 years ago
Changyu Bi e1c7209beb Fix flaky test: `DBCompactionTest.WaitForCompactShutdownWhileWaiting` (#11488) 2 years ago
anand76 fcc358baf2 Integrate CacheReservationManager with compressed secondary cache (#11449) 2 years ago
Andrew Kryczka 3e7fc88167 add WriteBatch::Release() (#11482) 2 years ago
Soli de1dd4ca19 Tweak on IsTrivialMove() (#11467) 2 years ago
Jay Huh 23f4e9ad63 Move WaitForCompect() change entry to Unreleased in History file (#11479) 2 years ago
Jay Huh 81aeb15988 Add WaitForCompact with WaitForCompactOptions to public API (#11436) 2 years ago
Yu Zhang d1ae7f6c41 Add support to strip / pad timestamp when writing / reading a block (#11472) 2 years ago
Hui Xiao dcc6fc99f9 Fix StopWatch bug; Remove setting `record_read_stats` (#11474) 2 years ago
Peter Dillinger e8710303d9 Document SyncPoint::LoadDependency (#11477) 2 years ago
Peter Dillinger 17bc27741f Improve memory efficiency of many OptimisticTransactionDBs (#11439) 2 years ago
Alan Paxton 93e0715fad Implement missing compactrangeoptions from Java API (#10880) 2 years ago
rogertyang 28bf7ba77d remove unnecessary code in super version getter (#11452) 2 years ago
akankshamahajan bf9e864235 Update db_crashtest.py for support for dir creation on remote storage (#11448) 2 years ago
Yu Zhang 68cc429be2 Fix stress test failure caused by #11424 (#11470) 2 years ago
akankshamahajan 53e0b2fe6f Fix regression script for async_io benchmarks (#11462) 2 years ago
Yu Zhang 11ebddb1d4 Add utils to use for handling user defined timestamp size record in WAL (#11451) 2 years ago
Yu Zhang ffb5f1f445 Refactor WriteUnpreparedStressTest to be a unit test (#11424) 2 years ago
FishAndBird 5b945adf60 fix typo in detecting HAVE_AUXV_GETAUXVAL (#10913) 2 years ago
Alan Paxton 6eb3770b46 Repair/instate jemalloc build on M1 (#11257) 2 years ago
Yu Zhang 509116c53b Update HISTORY.md/version.h/format compatibility test for 8.3 release (#11464) 2 years ago
Peter Dillinger 39f5846ec7 Much better stats for seeks and prefix filtering (#11460) 2 years ago
Peter Dillinger 4067acabca Compatibility step for separating BlockCache and GeneralCache APIs (#11450) 2 years ago
mayue.fight 8d8eb0e77e Support Clip DB to KeyRange (#11379) 2 years ago
Hui Xiao 7263f51d50 Improve comment of ExpectedValue in db stress (#11456) 2 years ago
Hui Xiao 50046869a4 Add `rocksdb.file.read.db.open.micros` (#11455) 2 years ago
Alan Paxton e110d713e0 Minimal RocksJava compliance with Java 8 language level (EB 1046) (#10951) 2 years ago
Jay Huh 586d78b31e Remove wait_unscheduled from waitForCompact internal API (#11443) 2 years ago
Peter Dillinger 206fdea3d9 Change internal headers with duplicate names (#11408) 2 years ago
Hui Xiao 5fc57eec2b Support parallel read and write/delete to same key in NonBatchedOpsStressTest (#11058) 2 years ago
Andrew Kryczka fb636f2498 Fix write stall stats dump format (#11445) 2 years ago
anand76 2084cdf237 Delete temp OPTIONS file on failure to write it (#11423) 2 years ago
Andrew Kryczka 113f3250f1 Add block checksum mismatch ticker stat (#11438) 2 years ago
Yu Zhang 47235dda9e Add support in log writer and reader for a user-defined timestamp size record (#11433) 2 years ago
Changyu Bi 8827cd0618 Support compacting files to different temperatures in FIFO compaction (#11428) 2 years ago
Jay Huh 7531cbda91 Clean up rate limiter refill logic (#11425) 2 years ago
Peter Dillinger 459969e993 Simplify detection of x86 CPU features (#11419) 2 years ago
Peter Dillinger f4a02f2c52 Add hash_seed to Caches (#11391) 2 years ago
akankshamahajan 6ba4717f35 Fix build error: variable 'base_level' may be uninitialized (#11435) 2 years ago
Hui Xiao 8f763bdeab Record and use the tail size to prefetch table tail (#11406) 2 years ago
Peter Dillinger e1d1c50317 Organize + modernize ReadOptions (#11430) 2 years ago
Akanksha koul 736b3c4909 Added encryption plugin based on Intel open-source ipp-crypto library (#11429) 2 years ago
Peter Dillinger a5909f8864 Clarify io_activity (#11427) 2 years ago
Changyu Bi a11f1e12ca Fix flaky test `DBTestUniversalManualCompactionOutputPathId.ManualCompactionOutputPathId` (#11412) 2 years ago
nccx c81d58016b Add more users (#11369) 2 years ago
clundro 50b33ebb1b remove redundant move (#11418) 2 years ago
leipeng a475e9f746 DBIter::FindValueForCurrentKey: remove unused Status s (#11394) 2 years ago
anand76 03a892a9fb Delete empty WAL files on reopen (#11409) 2 years ago
Peter Dillinger 41a7fbf758 Avoid long parameter lists configuring Caches (#11386) 2 years ago
Peter Dillinger e0e318f370 Optionally support lldb for stack traces and debugger attach (#11413) 2 years ago
Peter Dillinger 76a40286b0 Fix duplicate symbols in linking with buck2 (#11421) 2 years ago
Andrew Kryczka 925d8252e5 Shard JemallocNodumpAllocator (#11400) 2 years ago
Levi Tamasi d3ed796855 Deflake some old BlobDB test cases (#11417) 2 years ago
Changyu Bi 62fc15f009 Block per key-value checksum (#11287) 2 years ago
leipeng 40d69b59ad DBImpl::MultiGet: delete unused var `superversions_to_delete` (#11395) 2 years ago
Hui Xiao 3622cfa34a Add back io_uring stress test hack with DbStressFSWrapper for FS not supporting read async (#11404) 2 years ago
Peter Dillinger 46dbcfd799 Start version 8.3 (#11405) 2 years ago
Peter Dillinger a2c1f57358 Fix compression tests^2 (#11403) 2 years ago
Peter Dillinger fb63d9b4ee Fix compression tests when snappy not available (#11396) 2 years ago
Peter Dillinger d79be3dca2 Changes and enhancements to compression stats, thresholds (#11388) 2 years ago
Changyu Bi adc9001f20 Improve error message from `SanityCheckCFOptions()` for merge_operator (#11393) 2 years ago
Hui Xiao 151242ce46 Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288) 2 years ago
Andrew Kryczka 0a774a102f Clarify `SstFileWriter::DeleteRange()` ordering requirements (#11390) 2 years ago
Andrew Kryczka 6cac4c79d4 Fix race condition in db_stress checkpoint cleanup (#11389) 2 years ago
Changyu Bi 43e9a60bb2 Always allow L0->L1 trivial move during manual compaction (#11375) 2 years ago
Andrew Kryczka bd80433c73 Set -source 8 in CMAKE_JAVA_COMPILE_FLAGS (#11385) 2 years ago
Peter Dillinger 9b698cda51 Update GeneralTableTest::ApproximateOffsetOfCompressed values (#11384) 2 years ago
Andrew Kryczka f3818948e8 Deflake DBWriteTest.LockWALInEffect (#11382) 2 years ago
Andrew Kryczka b8555ba470 Deflake DBBloomFilterTest.OptimizeFiltersForHits (#11383) 2 years ago
Murali Vilayannur 226ee25d30 Block fetch CPU time counters in perf context (#11342) 2 years ago
mayue.fight 4d72f48e57 Fix the wrong calculation of largest_key in import_column_family_job (#11381) 2 years ago
Changyu Bi ba16e8eee7 Try to pick more files in `LevelCompactionBuilder::TryExtendNonL0TrivialMove()` (#11347) 2 years ago
mayue.fight 9500d90d1b Fix several bugs in ImportColumnFamilyTest (#11372) 2 years ago
Jeff Palm 6b67b561bc util/ribbon_test.cc: avoid ambiguous reversed operator error in c++20 (#11371) 2 years ago
Yu Zhang 647cd73674 Initial add UDT in memtable only option (#11362) 2 years ago
Andrew Kryczka 760b773f58 fix optimization-disabled test builds with platform010 (#11361) 2 years ago
Niklas Fiekas d5a9c0c937 C-API: Constify cache functions where possible (#11243) 2 years ago
Zdenek Korcak c8552d8c63 fix bad implementation of ShardedCache::GetOccupancyCount (#11325) 2 years ago
nccx d30bb3d14a Add PaxosStore to USERS (#11357) 2 years ago
leipeng b2c4bc5f73 Makefile: fix a typo: PLATFORM_CFLAGS to PLATFORM_CCFLAGS (#11348) 2 years ago
nccx 140dd93b57 Remove deprecated integration tests from README.md (#11354) 2 years ago
Changyu Bi 64cead919f Initialize `lowest_unnecessary_level_` in `VersionStorageInfo` constructor (#11359) 2 years ago
Peter Dillinger f9db0c6e9c Refactor block cache tracing w/improved MultiGet (#11339) 2 years ago
Changyu Bi f631138e1c Better support for merge operation with data block hash index (#11356) 2 years ago
Wentian Guo 0578d9f951 Filter table files by timestamp: Get operator (#11332) 2 years ago
Changyu Bi b3c43a5b99 Drain unnecessary levels when `level_compaction_dynamic_level_bytes=true` (#11340) 2 years ago
anand76 0623c5b903 Ensure VerifyFileChecksums reads don't exceed readahead_size (#11328) 2 years ago
Hui Xiao 7f5b9f40cb Fix initialization-order-fiasco in write_stall_stats.cc (#11355) 2 years ago
Andrew Kryczka b45738622a Use user-provided ReadOptions for metadata block reads more often (#11208) 2 years ago
Peter Dillinger 03ccb1cd42 Re-clarify SecondaryCache API (#11316) 2 years ago
Peter Dillinger 3c17930ede Change default block cache from 8MB to 32MB (#11350) 2 years ago
Niklas Fiekas e5a560ec98 Expose cache occupancy via C API (#11327) 2 years ago
Peter Dillinger b4d78189b3 Fix gflags_compat.h (#11346) 2 years ago
anand76 891ced8b15 Remove platform009 and default to platform010 (#11333) 2 years ago
Hui Xiao 39c29372bf Add `SetAllowStall()` (#11335) 2 years ago
Levi Tamasi 0efd7b4ba1 Extend the stress test coverage of MultiGetEntity (#11336) 2 years ago
Hui Xiao c14eb134ed Add experimental PerfContext counters for db iterator Prev/Next/Seek* APIs (#11320) 2 years ago
Changyu Bi 601320164b Trivially move files down when opening db with level_compaction_dynamic_l… (#11321) 2 years ago
karemta-orday 40c2ec6d08 Add in-transaction multi-get-for-update to the C interface (#11107) 2 years ago
Andrew Kryczka 9f8cdc8ad6 validate SstFileWriter range tombstones cover positive ranges (#11322) 2 years ago
Levi Tamasi 57abdea389 Backport an internal change to regression_build_test.sh (#11319) 2 years ago
Tobias Ruck 8f6c2a2cc0 Remove unused `#include <stdarg.h>` in include/rocksdb/c.h (#11302) 2 years ago
sdong b92bc04ab0 Deflake DBCompactionTest.CancelCompactionWaitingOnConflict (#11318) 2 years ago
sdong cea81cad66 Disabling some IO error assertion in EnvLogger (#11314) 2 years ago
Andrew Kryczka 8c445407b7 Specify precedence in `SstFileWriter::DeleteRange()` API contract (#11309) 2 years ago
Levi Tamasi 87de4fee6b Updates for the 8.1 release (HISTORY, version.h, compatibility tests) (#11307) 2 years ago
Hui Xiao cb58477185 New stat rocksdb.{cf|db}-write-stall-stats exposed in a structural way (#11300) 2 years ago
Peter Dillinger 204fcff751 HyperClockCache support for SecondaryCache, with refactoring (#11301) 2 years ago
anand76 eac6b6d0cd Ignore async_io ReadOption if FileSystem doesn't support it (#11296) 2 years ago
Levi Tamasi a72d55c99d Increase the stress test coverage of GetEntity (#11303) 2 years ago
hackingthekernel 291300ece8 add c-api for allowing FIFO compaction (#11156) 2 years ago
Peter Dillinger ccaa3225b0 Simplify tracking entries already in SecondaryCache (#11299) 2 years ago
nccx 664dabda8f Add Microsoft Bing as a user (#11270) 2 years ago
Hui Xiao bab5f9a6f2 Add new stat rocksdb.table.open.prefetch.tail.read.bytes, rocksdb.table.open.prefetch.tail.{miss|hit} (#11265) 2 years ago
Peter Dillinger 601efe3cf2 Misc cleanup of block cache code (#11291) 2 years ago
Hui Xiao 11cb6af6e5 Fix bug of prematurely excluded CF in atomic flush contains unflushed data that should've been included in the atomic flush (#11148) 2 years ago
Peter Dillinger 2a23bee963 Use CacheWrapper in more places (#11295) 2 years ago
Levi Tamasi 49881921cd Rename a recently added PerfContext counter (#11294) 2 years ago
Peter Dillinger 648e972f30 Document DB::Resume(), fix LockWALInEffect test (#11290) 2 years ago
Changyu Bi 9aa3b6f9ae Support range deletion tombstones in `CreateColumnFamilyWithImport` (#11252) 2 years ago
Alan Paxton fbd603d04a Reverse wrong order of parameter names for Java WriteBatchWithIndex#iteratorWithBase (#11280) 2 years ago
Jaepil Jeong 969d4e1dd2 Fix compile errors in Clang due to unused variables depending on the build configuration (#11234) 2 years ago
zhangliangkai1992 7a07afe82e DBWithTTLImpl::IsStale overflow when ttl is 15 years (#11279) 2 years ago
Alan Paxton daeec505a4 Add instructions for installing googlebenchmark (#11282) 2 years ago
akankshamahajan 1de697628e Fix hang in async_io benchmarks in regression script (#11285) 2 years ago
Levi Tamasi 1d52438504 Add a PerfContext counter for merge operands applied in point lookups (#11284) 2 years ago
akankshamahajan 6c65bf1743 Decrease duration time for internally debugging the regression_script (#11283) 2 years ago
Peter Dillinger e01073252b Tests verifying non-zero checksums of zero bytes (#11260) 2 years ago
akankshamahajan 13357de0c2 Add support for parameters setting related to async_io benchmarks (#11262) 2 years ago
Levi Tamasi a1a3b23346 Deflake/fix BlobSourceCacheReservationTest.IncreaseCacheReservationOnFullCache (#11273) 2 years ago
Peter Dillinger 50e9b3f9c7 Default print stack traces with GDB on Linux (#11272) 2 years ago
Peter Dillinger e168c1b1a4 Use FaultInjectionTestFS in DBWriteTest.LockWALInEffect (#11271) 2 years ago
Igor Canadi ddde1e6af8 Avoid ColumnFamilyDescriptor copy (#10978) 2 years ago
Changyu Bi d053926fa2 Improve documentation for MergingIterator (#11161) 2 years ago
Levi Tamasi 95d67f3646 Fix/clarify/extend the API comments of CompactionFilter (#11261) 2 years ago
Yu Zhang 8dfcfd4e90 Fix backward iteration issue when user defined timestamp is enabled in BlobDB (#11258) 2 years ago
anand76 cf09917c18 Add filter/index/data secondary cache hits stats (#11246) 2 years ago
yihuang b7e73501d8 fix: add extern and ROCKSDB_LIBRARY_API to two c apis (#11217) 2 years ago
Levi Tamasi 3c9eed688e Enable moving a string or PinnableSlice into PinnableWideColumns (#11248) 2 years ago
Yu Zhang af7872ffd1 Fix a TestGet failure when user defined timestamp is enabled (#11249) 2 years ago
Yu Zhang f007b8fdea Support iter_start_ts in integrated BlobDB (#11244) 2 years ago
Changyu Bi 229297d1b8 Refactor AddRangeDels() + consider range tombstone during compaction file cutting (#11113) 2 years ago
ywave 9fa9becf53 fix -Wrange-loop-analysis in Apple clang version 12.0.0 (clang-1200.0.32.29) (#11240) 2 years ago
Andrew Kryczka 286080456c Update HISTORY.md and version.h for 8.0 release (#11238) 2 years ago
anand76 476b01579c Revert enabling IO uring in db_stress (#11242) 2 years ago
Changyu Bi 1b48ecc2c6 Fix an assertion failure in DBIter::SeekToLast() when user-defined timestamp is enabled (#11223) 2 years ago
leipeng ea85148b78 DBIter::FindNextUserEntryInternal: do not PrepareValue for `Delete` (#11211) 2 years ago
Changyu Bi ebfca2cf00 Fix comment for option `periodic_compaction_seconds` (#11227) 2 years ago
HuangYi 83bc03a99a add c api to set option fail_if_not_bottommost_level (#11158) 2 years ago
HuangYi cfe50f7e77 add c api for HyperClockCache (#11110) 2 years ago
Matt Jurik 142b18d00b C-API: Support multi-CF flush (#11112) 2 years ago
Andrew Kryczka fcd816d534 Add missing override keyword in env_win.h functions (#11232) 2 years ago
Alan Paxton d47126875b Fix Java API ComparatorOptions use after delete error (#11176) 2 years ago
mrambacher b6640c3117 Remove FactoryFunc from LoadXXXObject (#11203) 2 years ago
Andrew Kryczka 25e1365227 Merge operator failed subcode (#11231) 2 years ago
Andrew Kryczka 6aef1a05d6 Use CacheDependencies() at start of ApproximateKeyAnchors() (#11230) 2 years ago
akankshamahajan 68e4581c67 Return NotSupported in scan if IOUring not supported and enable IOUring in db_stress for async_io testing (#11197) 2 years ago
Peter Dillinger 64a1f7670f Customize CompressedSecondaryCache by block kind (#11204) 2 years ago
Peter Dillinger 88056ea6cb Re-add memory_allocator.h include from cache.h (#11229) 2 years ago
Levi Tamasi ab22e79824 Support using MultiGetEntity as verification method in stress tests (#11228) 2 years ago
Levi Tamasi 94ec433833 Mention the new MultiGetEntity API in HISTORY.md (#11226) 2 years ago
Levi Tamasi 9794acb597 Add a new MultiGetEntity API (#11222) 2 years ago
akankshamahajan 6d5e8604f1 Fix regression script for async_io benchmarks (#11224) 2 years ago
sdong 1969815fa8 Remove docs/Gemfile.lock and update github-pages version (#11173) 2 years ago
Yu Zhang c19672c187 Enable crash test to run BlobDB together with user-defined timestamp (#11199) 2 years ago
Wentian Guo 42d6652ba2 remove dependency on options.h for port_posix.h andport_win.h (#11214) 2 years ago
akankshamahajan a72f591825 Fix a minor bug in the regression script during assigning value (#11215) 2 years ago
akankshamahajan ab2157fa6f Extend existing benchmarks seekrandom and multiread to run with async_io (#11170) 2 years ago
Peter Dillinger 3cacd4b4ec Put Cache and CacheWrapper in new public header (#11192) 2 years ago
Peter Dillinger b7747bbc9f Attempt fix flaky DBWriteTest.LockWALInEffect (#11209) 2 years ago
Peter Dillinger 34bb3ddc43 Improve SmallEnumSet (#11178) 2 years ago
Peter Dillinger ee5305fabb Mitigate presumed OOM in CircleCI (#11206) 2 years ago
anand76 77b61abc7b Fix bug in WAL streaming uncompression (#11198) 2 years ago
Levi Tamasi 876d281592 Add compaction filter support for wide-column entities (#11196) 2 years ago
Hui Xiao 6650ca244e Remove a couple deprecated convenience.h APIs (#11120) 2 years ago
Peter Dillinger b5827c806c Revert to LIB_MODE=static for optimized builds (#11195) 2 years ago
Symious 68fa90ca43 Add kForceOptimized option to jni (#11181) 2 years ago
Peter Dillinger 54d72085b5 Fix regression_test.sh for LIB_MODE=shared default (#11194) 2 years ago
Hui Xiao 9b66331388 Simplify TEST_F(DBWALTest, FixSyncWalOnObseletedWalWithNewManifestCausingMissingWAL) (#11186) 2 years ago
Peter Dillinger 92e8874654 Fix more CircleCI jobs for LIB_MODE=shared default (#11193) 2 years ago
Alan Paxton 4a51900700 CI Benchmarking. Reduce runtime further as overhead appears to have risen. (#11189) 2 years ago
Peter Dillinger 27cf09172c Fix compile gettid on older Linux (#11184) 2 years ago
Peter Dillinger cf756ed916 Use LIB_MODE=shared build by default with make (#11168) 2 years ago
Peter Dillinger e17f31057b Support stack traces with gdb (and debugger invocation) (#11150) 2 years ago
Peter Dillinger 0cf1008fe3 Deprecate write_global_seqno and default to false (#11179) 2 years ago
Peter Dillinger 390cc0b156 Ensure LockWAL() stall cleared for UnlockWAL() return (#11172) 2 years ago
anand76 63da9cfa26 Return any errors returned by ReadAsync to the MultiGet caller (#11171) 2 years ago
Yu Zhang 701a19cc83 Enable crash test for user-defined timestamp and BlobDB combination (#11163) 2 years ago
changyubi fec5c8deb8 Remove NUMA setting for benchmark-linux (#11180) 2 years ago
Alan Paxton 6781009ee8 CI Benchmarking. Small configuration changes based on performance analysis. (#11074) 2 years ago
Andrew Kryczka 6af16ac7c1 Update HISTORY.md for #11136 (#11177) 2 years ago
Levi Tamasi df680b24ef Clean up InvokeFilterIfNeeded a bit (#11174) 2 years ago
Andrew Kryczka 071c33846d Allow canceling manual compaction while waiting for conflicting compaction (#11165) 2 years ago
Levi Tamasi 753d4d5078 Support using GetEntity as a verification method in the non-batched stress tests (#11144) 2 years ago
Levi Tamasi a82021c3d0 Fix a bug where GetEntity would expose a blob reference (#11162) 2 years ago
Peter Dillinger 94e3beec77 Cleanup, improve, stress test LockWAL() (#11143) 2 years ago
sdong 36174d89a6 DB Stress to fix a false assertion (#11164) 2 years ago
Yu Zhang 24ac53d81a Use user key on sst file for blob verification for Get and MultiGet (#11105) 2 years ago
akankshamahajan 79e57a39a3 Move ExternalSSTTestEnv to FileSystemWrapper (#11139) 2 years ago
sdong 4720ba4391 Remove RocksDB LITE (#11147) 2 years ago
Yu Zhang 6943ff6e50 Remove deprecated util functions in options_util.h (#11126) 2 years ago
Andrew Kryczka 97c1024d3e Include db_stress verification method in failure message (#11133) 2 years ago
Changyu Bi c94c8fcbd4 Remove deprecated FileSystem::Load() (#11122) 2 years ago
Karim TAAM a1e92bd956 use verify checksum option in block based table reader Open() (#11099) 2 years ago
Andrew Kryczka b44cbbf709 Fix GetMergeOperands() returning MergeInProgress (#11136) 2 years ago
dependabot[bot] dcf93b7b3e Bump commonmarker from 0.23.6 to 0.23.7 in /docs (#11128) 2 years ago
Levi Tamasi a6cfdd4eda Fix the HISTORY.md entry related to the removed statistics (#11140) 2 years ago
akankshamahajan 986c5b9d4e Migrate TestEnv in listener_test.cc to FileSystemWrapper (#11125) 2 years ago
sdong e808858ae0 Remove Stats related to compressed block cache (#11135) 2 years ago
Levi Tamasi 6da2e20df3 Remove more obsolete statistics (#11131) 2 years ago
Heiko Becker 88edfbfb5e Fix build with gcc 13 by including <cstdint> (#11118) 2 years ago
Andrew Kryczka 6a5071ceb5 Support PutEntity in trace analyzer (#11127) 2 years ago
Peter Dillinger 546e213c4f Fix DelayWrite() calls for two_write_queues (#11130) 2 years ago
Peter Dillinger 9afa0f05ad Remove deprecated Env::LoadEnv() (#11121) 2 years ago
Levi Tamasi 99e559533d Remove some deprecated/obsolete statistics from the API (#11123) 2 years ago
anand76 bcbab59c55 Migrate ErrorEnv from EnvWrapper to FileSystemWrapper (#11124) 2 years ago
sdong 2800aa069a Remove compressed block cache (#11117) 2 years ago
Peter Dillinger 4a9185340d A better contract for best_efforts_recovery (#11085) 2 years ago
Changyu Bi e0ea0dc6bd Improve documentation for `allow_ingest_behind` (#11119) 2 years ago
Hui Xiao 86fa2592be Fix data race on `ColumnFamilyData::flush_reason` by letting FlushRequest/Job owns flush_reason instead of CFD (#11111) 2 years ago
Hui Xiao 7e7548477c Update HISTORY.md/version.h/format compatiblity test for 7.10 release (#11114) 2 years ago
Andrew Kryczka b7fbcefda8 Add API to limit blast radius of merge operator failure (#11092) 2 years ago
akankshamahajan bde65052c4 Enhance async scan prefetch unit tests (#11087) 2 years ago
codeoos f4a5446cab Fix error maybe-uninitialized #11100 (#11101) 2 years ago
leipeng a5bcbcd8be remove unused InternalIteratorBase::is_mutable_ (#11104) 2 years ago
Peter Dillinger fd911f9655 Upgrade xxhash.h to latest dev (#11098) 2 years ago
Changyu Bi e9d6a0d7ce Fix asan failure caused by range tombstone start key use-after-free (#11106) 2 years ago
akankshamahajan bd4b8d6487 Fix crash in block_cache_trace_analyzer if reference key is null in case of MultiGet (#11042) 2 years ago
Changyu Bi 4d0f9a995c Consider TTL compaction file cutting earlier to prevent small output file (#11075) 2 years ago
Changyu Bi 6a82b68788 Avoid counting extra range tombstone compensated size in `AddRangeDels()` (#11091) 2 years ago
Changyu Bi f515d9d203 Revert #10802 Consider range tombstone in compaction output file cutting (#11089) 2 years ago
leipeng 3941c34950 db_bench: let -benchmark=compact respect -subcompactions (#11077) 2 years ago
Wenlong Zhang 1cfe3528a2 support loongarch64 for rocksdb (#10036) 2 years ago
anand76 a510880346 Add a unit test for async prefetch fix in #11049 (#11084) 2 years ago
Peter Dillinger 9f7801c5f1 Major Cache refactoring, CPU efficiency improvement (#10975) 2 years ago
Changyu Bi 0a2d3b663a Fix some unit test failure in ExternalSSTFileBasicTest (#11070) 2 years ago
Niklas Fiekas ff04fb154b Add C API for ReadOptions::async_io (#11062) 2 years ago
ehds 4737e1d41b fix shared state used after free (#11059) 2 years ago
Hui Xiao b965a5a80e Add back Options::CompactionOptionsFIFO::allow_compaction to stress/crash test (#11063) 2 years ago
Files changed:
1. .circleci/config.yml (70)
2. .github/workflows/sanity_check.yml (4)
3. .gitignore (2)
4. CMakeLists.txt (99)
5. HISTORY.md (170)
6. INSTALL.md (22)
7. Makefile (124)
8. PLUGINS.md (1)
9. README.md (2)
10. ROCKSDB_LITE.md (21)
11. TARGETS (362)
12. USERS.md (34)
13. buckifier/buckify_rocksdb.py (13)
14. build_tools/build_detect_platform (169)
15. build_tools/dependencies_platform009.sh (22)
16. build_tools/fbcode_config.sh (2)
17. build_tools/fbcode_config_platform009.sh (170)
18. build_tools/fbcode_config_platform010.sh (2)
19. build_tools/regression_build_test.sh (2)
20. build_tools/update_dependencies.sh (43)
21. cache/cache.cc (59)
22. cache/cache_bench_tool.cc (259)
23. cache/cache_entry_roles.cc (30)
24. cache/cache_entry_roles.h (83)
25. cache/cache_entry_stats.h (27)
26. cache/cache_helpers.cc (40)
27. cache/cache_helpers.h (36)
28. cache/cache_key.cc (2)
29. cache/cache_reservation_manager.cc (17)
30. cache/cache_reservation_manager.h (9)
31. cache/cache_test.cc (428)
32. cache/charged_cache.cc (70)
33. cache/charged_cache.h (80)
34. cache/clock_cache.cc (971)
35. cache/clock_cache.h (345)
36. cache/compressed_secondary_cache.cc (141)
37. cache/compressed_secondary_cache.h (41)
38. cache/compressed_secondary_cache_test.cc (563)
39. cache/lru_cache.cc (510)
40. cache/lru_cache.h (179)
41. cache/lru_cache_test.cc (971)
42. cache/secondary_cache.cc (20)
43. cache/secondary_cache_adapter.cc (433)
44. cache/secondary_cache_adapter.h (76)
45. cache/sharded_cache.cc (51)
46. cache/sharded_cache.h (97)
47. cache/typed_cache.h (375)
48. coverage/coverage_test.sh (2)
49. crash_test.mk (14)
50. db/arena_wrapped_db_iter.cc (5)
51. db/blob/blob_contents.cc (48)
52. db/blob/blob_contents.h (37)
53. db/blob/blob_counting_iterator.h (7)
54. db/blob/blob_file_builder.cc (32)
55. db/blob/blob_file_cache.cc (23)
56. db/blob/blob_file_cache.h (14)
57. db/blob/blob_file_cache_test.cc (25)
58. db/blob/blob_file_completion_callback.h (17)
59. db/blob/blob_file_reader.cc (74)
60. db/blob/blob_file_reader.h (6)
61. db/blob/blob_file_reader_test.cc (83)
62. db/blob/blob_log_sequential_reader.cc (2)
63. db/blob/blob_log_writer.cc (2)
64. db/blob/blob_source.cc (77)
65. db/blob/blob_source.h (22)
66. db/blob/blob_source_test.cc (83)
67. db/blob/db_blob_basic_test.cc (464)
68. db/blob/db_blob_compaction_test.cc (14)
69. db/blob/db_blob_corruption_test.cc (2)
70. db/blob/db_blob_index_test.cc (6)
71. db/builder.cc (79)
72. db/builder.h (4)
73. db/c.cc (389)
74. db/c_test.c (265)
75. db/column_family.cc (206)
76. db/column_family.h (39)
77. db/column_family_test.cc (286)
78. db/compact_files_test.cc (19)
79. db/compaction/compaction.cc (124)
80. db/compaction/compaction.h (57)
81. db/compaction/compaction_iterator.cc (215)
82. db/compaction/compaction_iterator.h (24)
83. db/compaction/compaction_iterator_test.cc (4)
84. db/compaction/compaction_job.cc (237)
85. db/compaction/compaction_job.h (23)
86. db/compaction/compaction_job_stats_test.cc (16)
87. db/compaction/compaction_job_test.cc (30)
88. db/compaction/compaction_outputs.cc (580)
89. db/compaction/compaction_outputs.h (34)
90. db/compaction/compaction_picker.cc (26)
91. db/compaction/compaction_picker.h (6)
92. db/compaction/compaction_picker_fifo.cc (129)
93. db/compaction/compaction_picker_fifo.h (11)
94. db/compaction/compaction_picker_level.cc (63)
95. db/compaction/compaction_picker_test.cc (219)
96. db/compaction/compaction_picker_universal.cc (59)
97. db/compaction/compaction_picker_universal.h (2)
98. db/compaction/compaction_service_job.cc (2)
99. db/compaction/compaction_service_test.cc (14)
100. db/compaction/sst_partitioner.cc (9)
Some files were not shown because too many files have changed in this diff.

.circleci/config.yml:
@@ -152,15 +152,15 @@ commands:
steps:
- run:
name: "Test low-variance benchmarks"
command: ./tools/benchmark_ci.py --db_dir /tmp/rocksdb-benchmark-datadir --output_dir /tmp/benchmark-results --num_keys 10000000
command: ./tools/benchmark_ci.py --db_dir /tmp/rocksdb-benchmark-datadir --output_dir /tmp/benchmark-results --num_keys 20000000
environment:
LD_LIBRARY_PATH: /usr/local/lib
# How long to run parts of the test(s)
DURATION_RO: 400
DURATION_RW: 700
DURATION_RO: 300
DURATION_RW: 500
# Keep threads within physical capacity of server (much lower than default)
NUM_THREADS: 1
MAX_BACKGROUND_JOBS: 3
MAX_BACKGROUND_JOBS: 4
# Don't run a couple of "optional" initial tests
CI_TESTS_ONLY: "true"
# Reduce configured size of levels to ensure more levels in the leveled compaction LSM tree
@@ -170,7 +170,11 @@ commands:
# The benchmark host has 32GB memory
# The following values are tailored to work with that
# Note, tests may not exercise the targeted issues if the memory is increased on new test hosts.
COMPRESSION_TYPE: "none"
CACHE_INDEX_AND_FILTER_BLOCKS: 1
MIN_LEVEL_TO_COMPRESS: 3
CACHE_SIZE_MB: 10240
MB_WRITE_PER_SEC: 2
post-benchmarks:
steps:
@@ -269,12 +273,12 @@ jobs:
./sst_dump --help | grep -E -q 'Supported compression types: kNoCompression$' # Verify no compiled in compression
- post-steps
build-linux-shared_lib-alt_namespace-status_checked:
build-linux-static_lib-alt_namespace-status_checked:
executor: linux-docker
resource_class: 2xlarge
steps:
- pre-steps
- run: ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 ROCKSDB_MODIFY_NPHASH=1 LIB_MODE=shared OPT="-DROCKSDB_NAMESPACE=alternative_rocksdb_ns" make V=1 -j32 check
- run: ASSERT_STATUS_CHECKED=1 TEST_UINT128_COMPAT=1 ROCKSDB_MODIFY_NPHASH=1 LIB_MODE=static OPT="-DROCKSDB_NAMESPACE=alternative_rocksdb_ns" make V=1 -j24 check
- post-steps
build-linux-release:
@@ -282,11 +286,21 @@ jobs:
resource_class: 2xlarge
steps:
- checkout # check out the code in the project directory
- run: make V=1 -j32 LIB_MODE=shared release
- run: ls librocksdb.so # ensure shared lib built
- run: ./db_stress --version # ensure with gflags
- run: make clean
- run: make V=1 -j32 release
- run: ls librocksdb.a # ensure static lib built
- run: ./db_stress --version # ensure with gflags
- run: make clean
- run: apt-get remove -y libgflags-dev
- run: make V=1 -j32 LIB_MODE=shared release
- run: ls librocksdb.so # ensure shared lib built
- run: if ./db_stress --version; then false; else true; fi # ensure without gflags
- run: make clean
- run: make V=1 -j32 release
- run: ls librocksdb.a # ensure static lib built
- run: if ./db_stress --version; then false; else true; fi # ensure without gflags
- post-steps
@@ -302,27 +316,6 @@ jobs:
- run: USE_RTTI=1 DEBUG_LEVEL=0 make V=1 -j16 static_lib tools db_bench
- run: if ./db_stress --version; then false; else true; fi # ensure without gflags
build-linux-lite:
executor: linux-docker
resource_class: large
steps:
- pre-steps
- run: LITE=1 make V=1 J=8 -j8 check
- post-steps
build-linux-lite-release:
executor: linux-docker
resource_class: large
steps:
- checkout # check out the code in the project directory
- run: LITE=1 make V=1 -j8 release
- run: ./db_stress --version # ensure with gflags
- run: make clean
- run: apt-get remove -y libgflags-dev
- run: LITE=1 make V=1 -j8 release
- run: if ./db_stress --version; then false; else true; fi # ensure without gflags
- post-steps
build-linux-clang-no_test_run:
executor: linux-docker
resource_class: xlarge
@@ -427,7 +420,10 @@ jobs:
steps:
- checkout # check out the code in the project directory
- run: apt-get update -y && apt-get install -y libgflags-dev
- run: make V=1 -j8 unity_test
- run:
name: "Unity build"
command: make V=1 -j8 unity_test
no_output_timeout: 20m
- run: make V=1 -j8 -k check-headers # could be moved to a different build
- post-steps
@@ -438,7 +434,7 @@ jobs:
- pre-steps
- setup-folly
- build-folly
- run: USE_FOLLY=1 CC=gcc-7 CXX=g++-7 V=1 make -j32 check
- run: USE_FOLLY=1 LIB_MODE=static CC=gcc-7 CXX=g++-7 V=1 make -j32 check # TODO: LIB_MODE only to work around unresolved linker failures
- post-steps
build-linux-gcc-7-with-folly-lite-no-test:
@@ -484,7 +480,7 @@ jobs:
resource_class: 2xlarge
steps:
- pre-steps
- run: CC=gcc-11 CXX=g++-11 V=1 make -j32 all microbench
- run: LIB_MODE=static CC=gcc-11 CXX=g++-11 V=1 make -j32 all microbench # TODO: LIB_MODE only to work around unresolved linker failures
- post-steps
build-linux-clang-13-no_test_run:
@@ -503,7 +499,7 @@ jobs:
- pre-steps
- setup-folly
- build-folly
- run: CC=clang-13 CXX=clang++-13 USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check
- run: CC=clang-13 CXX=clang++-13 LIB_MODE=static USE_CLANG=1 USE_FOLLY=1 COMPILE_WITH_UBSAN=1 COMPILE_WITH_ASAN=1 make -j32 check # TODO: LIB_MODE only to work around unresolved linker failures
- post-steps
# This job is only to make sure the microbench tests are able to run, the benchmark result is not meaningful as the CI host is changing.
@@ -520,7 +516,7 @@ jobs:
resource_class: large
steps:
- pre-steps
- run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=960 --max_key=2500000' blackbox_crash_test_with_atomic_flush
- run: ulimit -S -n `ulimit -H -n` && make V=1 -j8 CRASH_TEST_EXT_ARGS='--duration=960 --max_key=2500000 --use_io_uring=0' blackbox_crash_test_with_atomic_flush
- post-steps
build-linux-crashtest-tiered-storage-bb:
@@ -530,7 +526,7 @@ jobs:
- pre-steps
- run:
name: "run crashtest"
command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS=--duration=10800 blackbox_crash_test_with_tiered_storage
command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS='--duration=10800 --use_io_uring=0' blackbox_crash_test_with_tiered_storage
no_output_timeout: 100m
- post-steps
@@ -541,7 +537,7 @@ jobs:
- pre-steps
- run:
name: "run crashtest"
command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS=--duration=10800 whitebox_crash_test_with_tiered_storage
command: ulimit -S -n `ulimit -H -n` && make V=1 -j32 CRASH_TEST_EXT_ARGS='--duration=10800 --use_io_uring=0' whitebox_crash_test_with_tiered_storage
no_output_timeout: 100m
- post-steps
@@ -821,18 +817,16 @@ workflows:
- build-linux-cmake-with-folly-coroutines
- build-linux-cmake-with-benchmark
- build-linux-encrypted_env-no_compression
- build-linux-lite
jobs-linux-run-tests-san:
jobs:
- build-linux-clang10-asan
- build-linux-clang10-ubsan
- build-linux-clang10-mini-tsan
- build-linux-shared_lib-alt_namespace-status_checked
- build-linux-static_lib-alt_namespace-status_checked
jobs-linux-no-test-run:
jobs:
- build-linux-release
- build-linux-release-rtti
- build-linux-lite-release
- build-examples
- build-fuzzers
- build-linux-clang-no_test_run

.github/workflows/sanity_check.yml:
@@ -33,9 +33,7 @@ jobs:
run: pip install argparse
- name: Download clang-format-diff.py
uses: wei/wget@v1
with:
args: https://raw.githubusercontent.com/llvm/llvm-project/release/12.x/clang/tools/clang-format/clang-format-diff.py
run: wget https://raw.githubusercontent.com/llvm/llvm-project/release/12.x/clang/tools/clang-format/clang-format-diff.py
- name: Check format
run: VERBOSE_CHECK=1 make check-format

.gitignore (vendored): 2 changes

@@ -95,3 +95,5 @@ fuzz/crash-*
cmake-build-*
third-party/folly/
.cache
*.sublime-*

CMakeLists.txt:
@@ -245,41 +245,40 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
endif(HAS_S390X_MARCH_NATIVE)
endif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
option(PORTABLE "build a portable binary" OFF)
option(FORCE_SSE42 "force building with SSE4.2, even when PORTABLE=ON" OFF)
option(FORCE_AVX "force building with AVX, even when PORTABLE=ON" OFF)
option(FORCE_AVX2 "force building with AVX2, even when PORTABLE=ON" OFF)
if(PORTABLE)
add_definitions(-DROCKSDB_PORTABLE)
# MSVC does not need a separate compiler flag to enable SSE4.2; if nmmintrin.h
# is available, it is available by default.
if(FORCE_SSE42 AND NOT MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2 -mpclmul")
endif()
if(MSVC)
if(FORCE_AVX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
endif()
# MSVC automatically enables BMI / lzcnt with AVX2.
if(FORCE_AVX2)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
endif()
else()
if(FORCE_AVX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx")
endif()
if(FORCE_AVX2)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -mbmi -mlzcnt")
endif()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64")
CHECK_C_COMPILER_FLAG("-march=loongarch64" HAS_LOONGARCH64)
if(HAS_LOONGARCH64)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mcpu=loongarch64 -mtune=loongarch64")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=loongarch64 -mtune=loongarch64")
endif(HAS_LOONGARCH64)
endif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64")
set(PORTABLE 0 CACHE STRING "Minimum CPU arch to support, or 0 = current CPU, 1 = baseline CPU")
if(PORTABLE STREQUAL 1)
# Usually nothing to do; compiler default is typically the most general
if(NOT MSVC)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z196")
endif()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^loongarch64")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=loongarch64")
endif()
endif()
elseif(PORTABLE MATCHES [^0]+)
# Name of a CPU arch spec or feature set to require
if(MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:${PORTABLE}")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${PORTABLE}")
endif()
else()
if(MSVC)
# NOTE: No auto-detection of current CPU, but instead assume some useful
# level of optimization is supported
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
else()
# Require instruction set from current CPU (with some legacy or opt-out
# exceptions)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^s390x" AND NOT HAS_S390X_MARCH_NATIVE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z196")
elseif(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64" AND NOT HAS_ARMV8_CRC)
@@ -294,25 +293,6 @@ if(NOT MSVC)
set(CMAKE_REQUIRED_FLAGS "-msse4.2 -mpclmul")
endif()
CHECK_CXX_SOURCE_COMPILES("
#include <cstdint>
#include <nmmintrin.h>
#include <wmmintrin.h>
int main() {
volatile uint32_t x = _mm_crc32_u32(0, 0);
const auto a = _mm_set_epi64x(0, 0);
const auto b = _mm_set_epi64x(0, 0);
const auto c = _mm_clmulepi64_si128(a, b, 0x00);
auto d = _mm_cvtsi128_si64(c);
}
" HAVE_SSE42)
if(HAVE_SSE42)
add_definitions(-DHAVE_SSE42)
add_definitions(-DHAVE_PCLMUL)
elseif(FORCE_SSE42)
message(FATAL_ERROR "FORCE_SSE42=ON but unable to compile with SSE4.2 enabled")
endif()
# Check if -latomic is required or not
if (NOT MSVC)
set(CMAKE_REQUIRED_FLAGS "--std=c++17")
@@ -485,12 +465,6 @@ if(CMAKE_COMPILER_IS_GNUCXX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-builtin-memcmp")
endif()
option(ROCKSDB_LITE "Build RocksDBLite version" OFF)
if(ROCKSDB_LITE)
add_definitions(-DROCKSDB_LITE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -Os")
endif()
if(CMAKE_SYSTEM_NAME MATCHES "Cygwin")
add_definitions(-fno-builtin-memcmp -DCYGWIN)
elseif(CMAKE_SYSTEM_NAME MATCHES "Darwin")
@@ -573,7 +547,7 @@ if(HAVE_SCHED_GETCPU)
add_definitions(-DROCKSDB_SCHED_GETCPU_PRESENT)
endif()
check_cxx_symbol_exists(getauxval auvx.h HAVE_AUXV_GETAUXVAL)
check_cxx_symbol_exists(getauxval "sys/auxv.h" HAVE_AUXV_GETAUXVAL)
if(HAVE_AUXV_GETAUXVAL)
add_definitions(-DROCKSDB_AUXV_GETAUXVAL_PRESENT)
endif()
@@ -649,12 +623,14 @@ set(SOURCES
cache/cache.cc
cache/cache_entry_roles.cc
cache/cache_key.cc
cache/cache_helpers.cc
cache/cache_reservation_manager.cc
cache/charged_cache.cc
cache/clock_cache.cc
cache/compressed_secondary_cache.cc
cache/lru_cache.cc
cache/secondary_cache.cc
cache/secondary_cache_adapter.cc
cache/sharded_cache.cc
db/arena_wrapped_db_iter.cc
db/blob/blob_contents.cc
@@ -741,6 +717,7 @@ set(SOURCES
db/write_batch.cc
db/write_batch_base.cc
db/write_controller.cc
db/write_stall_stats.cc
db/write_thread.cc
env/composite_env.cc
env/env.cc
@@ -806,6 +783,7 @@ set(SOURCES
table/block_based/block_based_table_iterator.cc
table/block_based/block_based_table_reader.cc
table/block_based/block_builder.cc
table/block_based/block_cache.cc
table/block_based/block_prefetcher.cc
table/block_based/block_prefix_index.cc
table/block_based/data_block_hash_index.cc
@@ -873,6 +851,7 @@ set(SOURCES
util/compression_context_cache.cc
util/concurrent_task_limiter_impl.cc
util/crc32c.cc
util/data_structure.cc
util/dynamic_bloom.cc
util/hash.cc
util/murmurhash.cc
@@ -886,6 +865,8 @@ set(SOURCES
util/string_util.cc
util/thread_local.cc
util/threadpool_imp.cc
util/udt_util.cc
util/write_batch_util.cc
util/xxhash.cc
utilities/agg_merge/agg_merge.cc
utilities/backup/backup_engine.cc
@@ -1000,12 +981,6 @@ if ( ROCKSDB_PLUGINS )
endforeach()
endif()
if(HAVE_SSE42 AND NOT MSVC)
set_source_files_properties(
util/crc32c.cc
PROPERTIES COMPILE_FLAGS "-msse4.2 -mpclmul")
endif()
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64")
list(APPEND SOURCES
util/crc32c_ppc.c
@@ -1271,6 +1246,7 @@ if(WITH_TESTS OR WITH_BENCHMARK_TOOLS)
add_subdirectory(third-party/gtest-1.8.1/fused-src/gtest)
add_library(testharness STATIC
test_util/mock_time_env.cc
test_util/secondary_cache_test_util.cc
test_util/testharness.cc)
target_link_libraries(testharness gtest)
endif()
@@ -1316,6 +1292,7 @@ if(WITH_TESTS)
db/db_bloom_filter_test.cc
db/db_compaction_filter_test.cc
db/db_compaction_test.cc
db/db_clip_test.cc
db/db_dynamic_level_test.cc
db/db_encryption_test.cc
db/db_flush_test.cc
@@ -1446,6 +1423,7 @@ if(WITH_TESTS)
util/timer_test.cc
util/thread_list_test.cc
util/thread_local_test.cc
util/udt_util_test.cc
util/work_queue_test.cc
utilities/agg_merge/agg_merge_test.cc
utilities/backup/backup_engine_test.cc
@@ -1614,3 +1592,6 @@ option(WITH_BENCHMARK "build benchmark tests" OFF)
if(WITH_BENCHMARK)
add_subdirectory(${PROJECT_SOURCE_DIR}/microbench/)
endif()
target_include_directories(${PROJECT_NAME} PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>)

HISTORY.md:
@@ -1,9 +1,164 @@
# Rocksdb Change Log
## Unreleased
> NOTE: Entries for next release do not go here. Follow instructions in `unreleased_history/README.txt`
## 8.5.0 (07/21/2023)
### Public API Changes
* Removed recently added APIs `GeneralCache` and `MakeSharedGeneralCache()` as our plan changed to stop exposing a general-purpose cache interface. The old forms of these APIs, `Cache` and `NewLRUCache()`, are still available, although general-purpose caching support will be dropped eventually.
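For illustration, a minimal C++ sketch of the still-available `Cache`/`NewLRUCache()` path (the cache size is an arbitrary example):

```cpp
#include <memory>

#include "rocksdb/cache.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

int main() {
  // The long-standing block cache API remains available after the
  // GeneralCache removal: create an LRU cache and attach it to the
  // block-based table factory.
  std::shared_ptr<rocksdb::Cache> cache = rocksdb::NewLRUCache(32 << 20);

  rocksdb::BlockBasedTableOptions table_options;
  table_options.block_cache = cache;

  rocksdb::Options options;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return 0;
}
```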
### Behavior Changes
* Option `periodic_compaction_seconds` no longer supports FIFO compaction: setting it has no effect on FIFO compactions. FIFO compaction users should set only the `ttl` option instead (see the sketch after this list).
* Moved prefetching responsibility to the page cache for compaction reads in the non-direct-IO use case.
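For illustration, a minimal sketch of the FIFO `ttl` guidance above (the TTL value is an arbitrary example):

```cpp
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.compaction_style = rocksdb::kCompactionStyleFIFO;
  // With FIFO compaction, configure `ttl` (in seconds);
  // `periodic_compaction_seconds` no longer has any effect here.
  options.ttl = 30 * 24 * 60 * 60;  // roughly 30 days
  return 0;
}
```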
### Performance Improvements
* In the direct IO case, if the buffer passed in by the caller is already aligned, RandomAccessFileReader::Read avoids allocating a new buffer and the extra memcpy, and uses the already-aligned caller buffer directly.
* Small efficiency improvement to HyperClockCache by reducing chance of compiler-generated heap allocations
### Bug Fixes
* Fix use_after_free bug in async_io MultiReads when the underlying FS enables kFSBuffer. With kFSBuffer, the underlying FS passes its own buffer instead of using the RocksDB-provided scratch in FSReadRequest. This is currently an experimental feature.
## 8.4.0 (06/26/2023)
### New Features
* Add FSReadRequest::fs_scratch, a data buffer allocated and provided by the underlying FileSystem to RocksDB during reads, for use when the FS wants to supply its own buffer with data instead of filling the RocksDB-provided FSReadRequest::scratch. This can save CPU by avoiding a copy from the file system's buffer to the RocksDB buffer. More details on how to use/enable it are in file_system.h. Currently it is supported only for MultiRead (async + sync) with non-direct IO.
* Start logging non-zero user-defined timestamp sizes in WAL to signal user key format in subsequent records and use it during recovery. This change will break recovery from WAL files written by early versions that contain user-defined timestamps. The workaround is to ensure there are no WAL files to recover (i.e. by flushing before close) before upgrade.
* Added new property "rocksdb.obsolete-sst-files-size" that reports the size of SST files that have become obsolete but have not yet been deleted or scheduled for deletion.
* Start to record the value of the flag `AdvancedColumnFamilyOptions.persist_user_defined_timestamps` in the manifest and table properties for an SST file when it is created, and use the recorded flag when creating a table reader for the SST file. This flag is only explicitly recorded if it is false.
* Add a new option OptimisticTransactionDBOptions::shared_lock_buckets that enables sharing mutexes for validating transactions between DB instances, for better balancing of memory efficiency and validation contention across DB instances. Different column families and DBs also now use different hash seeds in this validation, so that the same set of key names will not contend across DBs or column families. (See the sketch after this list.)
* Add a new ticker `rocksdb.files.marked.trash.deleted` to track the number of trash files deleted by background thread from the trash queue.
* Add an API NewTieredVolatileCache() in include/rocksdb/cache.h to allocate an instance of a block cache with a primary block cache tier and a compressed secondary cache tier. A cache of this type distributes memory reservations against the block cache (such as WriteBufferManager and table reader memory) proportionally across both the primary and compressed secondary cache.
* Add `WaitForCompact()` to wait for all flush and compaction jobs to finish, including jobs that are queued but not yet scheduled (see the sketch after this list).
* Add `WriteBatch::Release()` that releases the batch's serialized data to the caller.
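For illustration, a minimal C++ sketch of three of the additions above (`shared_lock_buckets`, `WaitForCompact()`, `WriteBatch::Release()`); the bucket count, flag values, and keys are arbitrary examples, and `MakeSharedOccLockBuckets()` is the factory introduced alongside the lock-bucket option:

```cpp
#include <cassert>
#include <string>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/utilities/optimistic_transaction_db.h"
#include "rocksdb/write_batch.h"

// Share one set of validation lock buckets across OptimisticTransactionDB
// instances (16384 buckets is an arbitrary choice).
rocksdb::OptimisticTransactionDBOptions MakeOccOptions() {
  rocksdb::OptimisticTransactionDBOptions occ_opts;
  occ_opts.shared_lock_buckets = rocksdb::MakeSharedOccLockBuckets(16384);
  return occ_opts;
}

// Wait for all background work, including jobs that are queued but not
// yet scheduled; `flush` additionally flushes memtables first.
void DrainBackgroundWork(rocksdb::DB* db) {
  rocksdb::WaitForCompactOptions wait_opts;
  wait_opts.flush = true;
  rocksdb::Status s = db->WaitForCompact(wait_opts);
  assert(s.ok());
}

// WriteBatch::Release() moves the serialized batch out to the caller,
// leaving the batch empty; the returned string can later be used to
// reconstruct a WriteBatch.
std::string SerializeBatch() {
  rocksdb::WriteBatch batch;
  batch.Put("key", "value").PermitUncheckedError();
  return batch.Release();
}
```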
### Public API Changes
* Add C API `rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio` (see the sketch after this list).
* Change the FileSystem::use_async_io() API to a SupportedOps API in order to extend it to the various operations supported by the underlying FileSystem. Right now it contains FSSupportedOps::kAsyncIO and FSSupportedOps::kFSBuffer. More details about FSSupportedOps are in filesystem.h.
* Add new tickers: `rocksdb.error.handler.bg.error.count`, `rocksdb.error.handler.bg.io.error.count`, `rocksdb.error.handler.bg.retryable.io.error.count` to replace the misspelled ones: `rocksdb.error.handler.bg.errro.count`, `rocksdb.error.handler.bg.io.errro.count`, `rocksdb.error.handler.bg.retryable.io.errro.count` ('error' instead of 'errro'). Users should switch to use the new tickers before 9.0 release as the misspelled old tickers will be completely removed then.
* Overload the API CreateColumnFamilyWithImport() to support creating a ColumnFamily by importing multiple ColumnFamilies. It requires that the CFs do not overlap in user key range.
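For callers, a short sketch of querying the new bitmask (replacing `use_async_io()`); `fs` is an assumed `std::shared_ptr<rocksdb::FileSystem>`:

```cpp
// Caller-side sketch: query the supported-ops bitmask instead of the removed
// use_async_io().
int64_t supported_ops = 0;
fs->SupportedOps(supported_ops);
const bool supports_async_io =
    (supported_ops & (1 << rocksdb::FSSupportedOps::kAsyncIO)) != 0;
const bool supports_fs_buffer =
    (supported_ops & (1 << rocksdb::FSSupportedOps::kFSBuffer)) != 0;
```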
### Behavior Changes
* Change the default value for option `level_compaction_dynamic_level_bytes` to true. This affects users who use leveled compaction and do not set this option explicitly. These users may see additional background compactions following DB open. These compactions help to shape the LSM according to `level_compaction_dynamic_level_bytes` such that the size of each level Ln is approximately the size of Ln-1 multiplied by `max_bytes_for_level_multiplier`. Turning on this option has other benefits too: see more detail in the wiki: https://github.com/facebook/rocksdb/wiki/Leveled-Compaction#option-level_compaction_dynamic_level_bytes-and-levels-target-size and in the option comment in advanced_options.h (#11525). A minimal opt-out sketch appears after this list.
* For Leveled Compaction users, `CompactRange()` will now always try to compact to the last non-empty level. (#11468)
* For Leveled Compaction users, `CompactRange()` with `bottommost_level_compaction = BottommostLevelCompaction::kIfHaveCompactionFilter` will behave similarly to `kForceOptimized` in that it will skip files created during this manual compaction when compacting files in the bottommost level. (#11468)
* RocksDB will try to drop range tombstones during non-bottommost compaction when it is safe to do so. (#11459)
* When a DB is opened with `allow_ingest_behind=true` (currently only Universal compaction is supported), files in the last level, i.e. the ingested files, will not be included in any compaction. (#11489)
* Statistics `rocksdb.sst.read.micros` scope is expanded to all SST reads except for file ingestion and column family import (some compaction reads were previously excluded).
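A minimal opt-out sketch for the `level_compaction_dynamic_level_bytes` default change above, for users who want to keep the old LSM shape and defer the extra compactions:

```cpp
#include <rocksdb/options.h>

rocksdb::Options options;
// The default is now true; setting it back to false explicitly preserves the
// pre-8.4 behavior and avoids the one-time reshaping compactions after open.
options.level_compaction_dynamic_level_bytes = false;
```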
### Bug Fixes
* Reduced cases of illegally using Env::Default() during static destruction by never destroying the internal PosixEnv itself (except for builds checking for memory leaks). (#11538)
* Fix extra prefetching during seek in async_io when BlockBasedTableOptions.num_file_reads_for_auto_readahead is 1, which led to more reads than required.
* Fix a bug where compactions that are qualified to be run as 2 subcompactions were only run as one subcompaction.
* Fix a use-after-move bug in block.cc.
## 8.3.0 (05/19/2023)
### New Features
* Introduced a new option `block_protection_bytes_per_key`, which can be used to enable per key-value integrity protection for in-memory blocks in the block cache (#11287). A hedged configuration sketch appears after this list.
* Added `JemallocAllocatorOptions::num_arenas`. Setting `num_arenas > 1` may mitigate mutex contention in the allocator, particularly in scenarios where block allocations commonly bypass jemalloc tcache.
* Improve the operational safety of publishing a DB or SST files to many hosts by using different block cache hash seeds on different hosts. The exact behavior is controlled by new option `ShardedCacheOptions::hash_seed`, which also documents the solved problem in more detail.
* Introduced a new option `CompactionOptionsFIFO::file_temperature_age_thresholds` that allows FIFO compaction to compact files to different temperatures based on key age (#11428).
* Added a new ticker stat to count how many times RocksDB detected a corruption while verifying a block checksum: `BLOCK_CHECKSUM_MISMATCH_COUNT`.
* New statistics `rocksdb.file.read.db.open.micros` that measures read time of block-based SST tables or blob files during db open.
* New statistics tickers for various iterator seek behaviors and relevant filtering, as \*`_LEVEL_SEEK_`\*. (#11460)
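A hedged configuration sketch for `block_protection_bytes_per_key`; the placement on `Options` and the set of valid values are assumptions by analogy with `memtable_protection_bytes_per_key`:

```cpp
#include <rocksdb/options.h>

rocksdb::Options options;
// Assumption: valid values are 0 (disabled), 1, 2, 4, or 8 protection bytes
// per key-value entry in in-memory blocks (#11287).
options.block_protection_bytes_per_key = 8;
```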
### Public API Changes
* EXPERIMENTAL: Add new API `DB::ClipColumnFamily` to clip the keys in a CF to a certain range. It physically deletes all keys outside the range, including tombstones. A usage sketch appears after this list.
* Add `MakeSharedCache()` construction functions to various cache Options objects, and deprecate the `NewWhateverCache()` functions with long parameter lists.
* Changed the meaning of various Bloom filter stats (prefix vs. whole key), with iterator-related filtering only being tracked in the new \*`_LEVEL_SEEK_`\* stats. (#11460)
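A usage sketch for the experimental `DB::ClipColumnFamily` entry above; the exact parameter list is an assumption modeled on other range-based DB APIs, so verify against db.h:

```cpp
// Hedged sketch: keep only keys in ["b", "m"); everything outside the range,
// including tombstones, is physically deleted. `db` is an open rocksdb::DB*.
rocksdb::Status s = db->ClipColumnFamily(db->DefaultColumnFamily(),
                                         rocksdb::Slice("b"),
                                         rocksdb::Slice("m"));
// Check s.ok() before relying on the clipped state.
```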
### Behavior changes
* For x86, CPU features are no longer detected at runtime nor in build scripts, but in source code using common preprocessor defines. This will likely unlock some small performance improvements on some newer hardware, but could hurt performance of the kCRC32c checksum, which is no longer the default, on some "portable" builds. See PR #11419 for details.
### Bug Fixes
* Delete an empty WAL file on DB open if the log number is less than the min log number to keep.
* Delete temp OPTIONS file on DB open if there is a failure to write it out or rename it.
### Performance Improvements
* Improved the I/O efficiency of prefetching SST metadata by recording more information in the DB manifest. Opening files written with previous versions will still rely on heuristics for how much to prefetch (#11406).
## 8.2.0 (04/24/2023)
### Public API Changes
* `SstFileWriter::DeleteRange()` now returns `Status::InvalidArgument` if the range's end key comes before its start key according to the user comparator. Previously the behavior was undefined. A small sketch appears after this list.
* Add `multi_get_for_update` to C API.
* Remove unnecessary constructor for CompressionOptions.
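A small sketch of the new `SstFileWriter::DeleteRange()` behavior, assuming the default bytewise comparator:

```cpp
#include <cassert>

#include <rocksdb/sst_file_writer.h>

rocksdb::Options options;
rocksdb::SstFileWriter writer(rocksdb::EnvOptions(), options);
rocksdb::Status s = writer.Open("/tmp/example.sst");
assert(s.ok());
// With the bytewise comparator, end key "a" sorts before start key "z", so
// this now returns Status::InvalidArgument instead of being undefined.
s = writer.DeleteRange("z", "a");
assert(s.IsInvalidArgument());
```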
### Behavior changes
* Changed default block cache size from an 8MB to 32MB LRUCache, which increases the default number of cache shards from 16 to 64. This change is intended to minimize cache mutex contention under stress conditions. See https://github.com/facebook/rocksdb/wiki/Block-Cache for more information.
* For level compaction with `level_compaction_dynamic_level_bytes=true`, RocksDB now trivially moves levels down to fill LSM starting from bottommost level during DB open. See more in comments for option `level_compaction_dynamic_level_bytes` (#11321).
* User-provided `ReadOptions` take effect for more reads of non-`CacheEntryRole::kDataBlock` blocks.
* For level compaction with `level_compaction_dynamic_level_bytes=true`, RocksDB now drains unnecessary levels through background compaction automatically (#11340). This together with #11321 makes it automatic to migrate other compaction settings to level compaction with `level_compaction_dynamic_level_bytes=true`. In addition, a live DB that becomes smaller will now have unnecessary levels drained which can help to reduce read and space amp.
* If `CompactRange()` is called with `CompactRangeOptions::bottommost_level_compaction=kForce*` to compact from L0 to L1, RocksDB now will try to do a trivial move from L0 to L1 and then do an intra-L1 compaction, instead of an L0 to L1 compaction with trivial move disabled (#11375).
### Bug Fixes
* In the DB::VerifyFileChecksums API, ensure that file system reads of SST files are equal to the readahead_size in ReadOptions, if specified. Previously, each read was 2x the readahead_size.
* In block cache tracing, fixed some cases of bad hit/miss information (and more) with MultiGet.
### New Features
* Add experimental `PerfContext` counters `iter_{next|prev|seek}_count` for the DB iterator, each counting the number of times the corresponding API is called. A usage sketch appears after this list.
* Allow runtime changes to whether `WriteBufferManager` allows stall or not by calling `SetAllowStall()`.
* Added statistics tickers BYTES_COMPRESSED_FROM, BYTES_COMPRESSED_TO, BYTES_COMPRESSION_BYPASSED, BYTES_COMPRESSION_REJECTED, NUMBER_BLOCK_COMPRESSION_BYPASSED, and NUMBER_BLOCK_COMPRESSION_REJECTED. Disabled/deprecated histograms BYTES_COMPRESSED and BYTES_DECOMPRESSED, and ticker NUMBER_BLOCK_NOT_COMPRESSED. The new tickers offer more insight into compression ratios, rejected vs. disabled compression, etc. (#11388)
* New statistics `rocksdb.file.read.{flush|compaction}.micros` that measure read time of block-based SST tables or blob files during flush or compaction.
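A usage sketch for the iterator `PerfContext` counters above, using the standard perf-context workflow (`db` is an open `rocksdb::DB*`):

```cpp
#include <memory>

#include <rocksdb/db.h>
#include <rocksdb/iterator.h>
#include <rocksdb/perf_context.h>
#include <rocksdb/perf_level.h>

// Enable counting, reset the per-thread context, iterate, then read counters.
rocksdb::SetPerfLevel(rocksdb::PerfLevel::kEnableCount);
rocksdb::get_perf_context()->Reset();

std::unique_ptr<rocksdb::Iterator> it(db->NewIterator(rocksdb::ReadOptions()));
for (it->SeekToFirst(); it->Valid(); it->Next()) {
  // ... use it->key() / it->value() ...
}

uint64_t seek_count = rocksdb::get_perf_context()->iter_seek_count;
uint64_t next_count = rocksdb::get_perf_context()->iter_next_count;
```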
## 8.1.0 (03/18/2023)
### Behavior changes
* Compaction output file cutting logic now considers range tombstone start keys. For example, the SST partitioner may now receive a PartitionerRequest for range tombstone start keys.
* If the async_io ReadOption is specified for MultiGet or NewIterator on a platform that doesn't support IO uring, the option is ignored and synchronous IO is used.
### Bug Fixes
* Fixed an issue for backward iteration when user defined timestamp is enabled in combination with BlobDB.
* Fixed a couple of cases where a Merge operand encountered during iteration wasn't reflected in the `internal_merge_count` PerfContext counter.
* Fixed a bug in CreateColumnFamilyWithImport()/ExportColumnFamily() which did not support range tombstones (#11252).
* Fixed a bug where a column family excluded from an atomic flush contained unflushed data that should have been included in the atomic flush (i.e., data with seqno less than the max seqno of the atomic flush), leading to potential data loss in the excluded column family when `WriteOptions::disableWAL == true` (#11148).
### New Features
* Add statistics rocksdb.secondary.cache.filter.hits, rocksdb.secondary.cache.index.hits, and rocksdb.secondary.cache.data.hits
* Added a new PerfContext counter `internal_merge_point_lookup_count` which tracks the number of Merge operands applied while serving point lookup queries.
* Add new statistics rocksdb.table.open.prefetch.tail.read.bytes, rocksdb.table.open.prefetch.tail.{miss|hit}
* Add support for SecondaryCache with HyperClockCache (`HyperClockCacheOptions` inherits `secondary_cache` option from `ShardedCacheOptions`)
* Add new db properties `rocksdb.cf-write-stall-stats`, `rocksdb.db-write-stall-stats` and APIs to examine them in a structured way. In particular, users of `GetMapProperty()` with property `kCFWriteStallStats`/`kDBWriteStallStats` can now use the functions in `WriteStallStatsMapKeys` to find stats in the map. A reading sketch appears after this list.
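A sketch of reading the new structured write-stall stats; only the property and helper names from the entry above are assumed:

```cpp
#include <map>
#include <string>

#include <rocksdb/db.h>

std::map<std::string, std::string> stall_stats;
if (db->GetMapProperty(rocksdb::DB::Properties::kCFWriteStallStats,
                       &stall_stats)) {
  // Keys can be interpreted with the helpers in WriteStallStatsMapKeys;
  // here we simply walk the raw map.
  for (const auto& entry : stall_stats) {
    // entry.first is the stat key, entry.second its value.
  }
}
```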
### Public API Changes
* Changed various functions and features in `Cache` that are mostly relevant to custom implementations or wrappers. Especially, asynchronous lookup functionality is moved from `Lookup()` to a new `StartAsyncLookup()` function.
## 8.0.0 (02/19/2023)
### Behavior changes
* `ReadOptions::verify_checksums=false` disables checksum verification for more reads of non-`CacheEntryRole::kDataBlock` blocks.
* In case of scan with async_io enabled, if posix doesn't support IO uring, a Status::NotSupported error will be returned to the user. Previously that error was swallowed and reads fell back to synchronous reads.
### Bug Fixes
* Fixed a data race on `ColumnFamilyData::flush_reason` caused by concurrent flushes.
* Fixed an issue in `Get` and `MultiGet` when user-defined timestamps is enabled in combination with BlobDB.
* Fixed some atypical behaviors for `LockWAL()` such as allowing concurrent/recursive use and not expecting `UnlockWAL()` after non-OK result. See API comments.
* Fixed a feature interaction bug where for blobs `GetEntity` would expose the blob reference instead of the blob value.
* Fixed `DisableManualCompaction()` and `CompactRangeOptions::canceled` to cancel compactions even when they are waiting on conflicting compactions to finish
* Fixed a bug in which a successful `GetMergeOperands()` could transiently return `Status::MergeInProgress()`
* Return the correct error (Status::NotSupported()) to MultiGet caller when ReadOptions::async_io flag is true and IO uring is not enabled. Previously, Status::Corruption() was being returned when the actual failure was lack of async IO support.
* Fixed a bug in DB open/recovery from a compressed WAL that was caused due to incorrect handling of certain record fragments with the same offset within a WAL block.
### Feature Removal
* Remove RocksDB Lite.
* The feature block_cache_compressed is removed. Statistics related to it are removed too.
* Remove deprecated Env::LoadEnv(). Use Env::CreateFromString() instead.
* Remove deprecated FileSystem::Load(). Use FileSystem::CreateFromString() instead.
* Removed the deprecated version of these utility functions and the corresponding Java bindings: `LoadOptionsFromFile`, `LoadLatestOptions`, `CheckOptionsCompatibility`.
* Remove the FactoryFunc from the LoadObject method from the Customizable helper methods.
### Public API Changes
* Moved rarely-needed Cache class definition to new advanced_cache.h, and added a CacheWrapper class to advanced_cache.h. Minor changes to SimCache API definitions.
* Completely removed the following deprecated/obsolete statistics: the tickers `BLOCK_CACHE_INDEX_BYTES_EVICT`, `BLOCK_CACHE_FILTER_BYTES_EVICT`, `BLOOM_FILTER_MICROS`, `NO_FILE_CLOSES`, `STALL_L0_SLOWDOWN_MICROS`, `STALL_MEMTABLE_COMPACTION_MICROS`, `STALL_L0_NUM_FILES_MICROS`, `RATE_LIMIT_DELAY_MILLIS`, `NO_ITERATORS`, `NUMBER_FILTERED_DELETES`, `WRITE_TIMEDOUT`, `BLOB_DB_GC_NUM_KEYS_OVERWRITTEN`, `BLOB_DB_GC_NUM_KEYS_EXPIRED`, `BLOB_DB_GC_BYTES_OVERWRITTEN`, `BLOB_DB_GC_BYTES_EXPIRED`, `BLOCK_CACHE_COMPRESSION_DICT_BYTES_EVICT` as well as the histograms `STALL_L0_SLOWDOWN_COUNT`, `STALL_MEMTABLE_COMPACTION_COUNT`, `STALL_L0_NUM_FILES_COUNT`, `HARD_RATE_LIMIT_DELAY_COUNT`, `SOFT_RATE_LIMIT_DELAY_COUNT`, `BLOB_DB_GC_MICROS`, and `NUM_DATA_BLOCKS_READ_PER_LEVEL`. Note that as a result, the C++ enum values of the still supported statistics have changed. Developers are advised to not rely on the actual numeric values.
* Deprecated IngestExternalFileOptions::write_global_seqno and change default to false. This option only needs to be set to true to generate a DB compatible with RocksDB versions before 5.16.0.
* Remove deprecated APIs `GetColumnFamilyOptionsFrom{Map|String}(const ColumnFamilyOptions&, ..)`, `GetDBOptionsFrom{Map|String}(const DBOptions&, ..)`, `GetBlockBasedTableOptionsFrom{Map|String}(const BlockBasedTableOptions& table_options, ..)` and ` GetPlainTableOptionsFrom{Map|String}(const PlainTableOptions& table_options,..)`.
* Added a subcode of `Status::Corruption`, `Status::SubCode::kMergeOperatorFailed`, for users to identify corruption failures originating in the merge operator, as opposed to RocksDB's internally identified data corruptions
### Build Changes
* The `make` build now builds a shared library by default instead of a static library. Use `LIB_MODE=static` to override.
### New Features
* Compaction filters are now supported for wide-column entities by means of the `FilterV3` API. See the comment of the API for more details.
* Added `do_not_compress_roles` to `CompressedSecondaryCacheOptions` to disable compression on certain kinds of block. Filter blocks are now not compressed by CompressedSecondaryCache by default.
* Added a new `MultiGetEntity` API that enables batched wide-column point lookups. See the API comments for more details; a hedged usage sketch follows this list.
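A hedged sketch of `MultiGetEntity`; the overload shown (parallel arrays of keys, `PinnableWideColumns`, and `Status`) is an assumption modeled on `MultiGet`, so check the API comments:

```cpp
#include <rocksdb/db.h>
#include <rocksdb/wide_columns.h>

// Assumed overload: keys in, one PinnableWideColumns and one Status per key.
constexpr size_t kNumKeys = 2;
rocksdb::Slice keys[kNumKeys] = {"user1", "user2"};
rocksdb::PinnableWideColumns results[kNumKeys];
rocksdb::Status statuses[kNumKeys];

db->MultiGetEntity(rocksdb::ReadOptions(), db->DefaultColumnFamily(),
                   kNumKeys, keys, results, statuses);

for (size_t i = 0; i < kNumKeys; ++i) {
  if (statuses[i].ok()) {
    for (const auto& col : results[i].columns()) {
      // col.name() / col.value() are the wide-column name and value.
    }
  }
}
```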
## 7.10.0 (01/23/2023)
### Behavior changes
* Make best-efforts recovery verify SST unique ID before Version construction (#10962)
* Introduce `epoch_number` and sort L0 files by `epoch_number` instead of `largest_seqno`. `epoch_number` represents the order in which a file was flushed or ingested/imported. A compaction output file is assigned the minimum `epoch_number` among its input files. For L0, a larger `epoch_number` indicates a newer L0 file.
* Compaction output file cutting logic now considers range tombstone start keys. For example, the SST partitioner may now receive a PartitionerRequest for range tombstone start keys.
### Bug Fixes
* Fixed a regression in iterator where range tombstones after `iterate_upper_bound` were processed.
@ -16,13 +171,20 @@
* Fixed a heap use after free bug in async scan prefetching when the scan thread and another thread try to read and load the same seek block into cache.
* Fixed a heap use after free in async scan prefetching if dictionary compression is enabled, in which case sync read of the compression dictionary gets mixed with async prefetching
* Fixed a data race bug where `CompactRange()` under `change_level=true` acts on a range overlapping with an ongoing file ingestion for level compaction. This will either result in overlapping file range corruption at a certain level, caught by `force_consistency_checks=true`, or potentially two identical keys, both with seqno 0, in two different levels (i.e., new data ends up in a lower/older level). The latter will be caught by an assertion in debug builds but will go silently wrong and return incorrect results in release builds. This fix is general, so it also replaces previous fixes to a similar problem for `CompactFiles()` (#4665), general `CompactRange()`, and auto compaction (commits 5c64fb6 and 87dfc1d).
* Fixed a bug in compaction output cutting where small output files were produced because TTL file cutting states were not being updated (#11075).
### New Features
* When an SstPartitionerFactory is configured, CompactRange() now automatically selects for compaction any files overlapping a partition boundary that is in the compaction range, even if no actual entries are in the requested compaction range. With this feature, manual compaction can be used to (re-)establish SST partition points when SstPartitioner changes, without a full compaction.
* Add BackupEngine feature to exclude files from backup that are known to be backed up elsewhere, using `CreateBackupOptions::exclude_files_callback`. To restore the DB, the excluded files must be provided in alternative backup directories using `RestoreOptions::alternate_dirs`.
### Public API Changes
* Substantial changes have been made to the Cache class to support internal development goals. Direct use of Cache class members is discouraged and further breaking modifications are expected in the future. SecondaryCache has some related changes and implementations will need to be updated. (Unlike Cache, SecondaryCache is still intended to support user implementations, and disruptive changes will be avoided.) (#10975)
* Add `MergeOperationOutput::op_failure_scope` for merge operator users to control the blast radius of merge operator failures. Existing merge operator users do not need to make any change to preserve the old behavior.
### Performance Improvements
* Updated xxHash source code, which should improve kXXH3 checksum speed, at least on ARM (#11098).
* Improved CPU efficiency of DB reads, from block cache access improvements (#10975).
## 7.9.0 (11/21/2022)
### Performance Improvements
* Fixed an iterator performance regression for delete range users when scanning through a consecutive sequence of range tombstones (#10877).

@ -20,12 +20,15 @@ There are few options when compiling RocksDB:
depend on gflags. You will need to have gflags installed to run `make all`. This will compile RocksDB in debug mode. Don't
use binaries compiled by `make all` in production.
* By default the binary we produce is optimized for the platform you're compiling on
(`-march=native` or the equivalent). SSE4.2 will thus be enabled automatically if your
CPU supports it. To print a warning if your CPU does not support SSE4.2, build with
`USE_SSE=1 make static_lib` or, if using CMake, `cmake -DFORCE_SSE42=ON`. If you want
to build a portable binary, add `PORTABLE=1` before your make commands, like this:
`PORTABLE=1 make static_lib`.
* By default the binary we produce is optimized for the CPU you're compiling on
(`-march=native` or the equivalent). To build a binary compatible with the most
general architecture supported by your CPU and compiler, set `PORTABLE=1` for
the build, but performance will suffer as many operations benefit from newer
and wider instructions. In addition to `PORTABLE=0` (default) and `PORTABLE=1`,
it can be set to an architecture name recognized by your compiler. For example,
on 64-bit x86, a reasonable compromise is `PORTABLE=haswell` which supports
many or most of the available optimizations while still being compatible with
most processors made since roughly 2013.
## Dependencies
@ -48,6 +51,11 @@ to build a portable binary, add `PORTABLE=1` before your make commands, like thi
* If you wish to build the RocksJava static target, then cmake is required for building Snappy.
* If you wish to run microbench (e.g, `make microbench`, `make ribbon_bench` or `cmake -DWITH_BENCHMARK=1`), Google benchmark >= 1.6.0 is needed.
* You can do the following to install Google benchmark. These commands are copied from `./build_tools/ubuntu20_image/Dockerfile`:
`$ git clone --depth 1 --branch v1.7.0 https://github.com/google/benchmark.git ~/benchmark`
`$ cd ~/benchmark && mkdir build && cd build && cmake .. -GNinja -DCMAKE_BUILD_TYPE=Release -DBENCHMARK_ENABLE_GTEST_TESTS=0 && ninja && ninja install`
## Supported platforms
@ -178,7 +186,7 @@ to build a portable binary, add `PORTABLE=1` before your make commands, like thi
gmake rocksdbjava
* **iOS**:
* Run: `TARGET_OS=IOS make static_lib`. When building the project which uses rocksdb iOS library, make sure to define two important pre-processing macros: `ROCKSDB_LITE` and `IOS_CROSS_COMPILE`.
* Run: `TARGET_OS=IOS make static_lib`. When building the project which uses the rocksdb iOS library, make sure to define an important pre-processing macro: `IOS_CROSS_COMPILE`.
* **Windows** (Visual Studio 2017 and up):
* Read and follow the instructions at CMakeLists.txt

@ -44,13 +44,6 @@ quoted_perl_command = $(subst ','\'',$(perl_command))
# Set the default DEBUG_LEVEL to 1
DEBUG_LEVEL?=1
# LIB_MODE says whether or not to use/build "shared" or "static" libraries.
# Mode "static" means to link against static libraries (.a)
# Mode "shared" means to link against shared libraries (.so, .sl, .dylib, etc)
#
# Set the default LIB_MODE to static
LIB_MODE?=static
# OBJ_DIR is where the object files reside. Default to the current directory
OBJ_DIR?=.
@ -81,29 +74,42 @@ else ifneq ($(filter jtest rocksdbjava%, $(MAKECMDGOALS)),)
endif
endif
$(info $$DEBUG_LEVEL is ${DEBUG_LEVEL})
# Lite build flag.
LITE ?= 0
ifeq ($(LITE), 0)
ifneq ($(filter -DROCKSDB_LITE,$(OPT)),)
# Be backward compatible and support older format where OPT=-DROCKSDB_LITE is
# specified instead of LITE=1 on the command line.
LITE=1
endif
else ifeq ($(LITE), 1)
ifeq ($(filter -DROCKSDB_LITE,$(OPT)),)
OPT += -DROCKSDB_LITE
endif
# LIB_MODE says whether or not to use/build "shared" or "static" libraries.
# Mode "static" means to link against static libraries (.a)
# Mode "shared" means to link against shared libraries (.so, .sl, .dylib, etc)
#
ifeq ($(DEBUG_LEVEL), 0)
# For optimized, set the default LIB_MODE to static for code size/efficiency
LIB_MODE?=static
else
# For debug, set the default LIB_MODE to shared for efficient `make check` etc.
LIB_MODE?=shared
endif
$(info $$DEBUG_LEVEL is $(DEBUG_LEVEL), $$LIB_MODE is $(LIB_MODE))
# Detect what platform we're building on.
# Export some common variables that might have been passed as Make variables
# instead of environment variables.
dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; \
export CXXFLAGS="$(EXTRA_CXXFLAGS)"; \
export LDFLAGS="$(EXTRA_LDFLAGS)"; \
export COMPILE_WITH_ASAN="$(COMPILE_WITH_ASAN)"; \
export COMPILE_WITH_TSAN="$(COMPILE_WITH_TSAN)"; \
export COMPILE_WITH_UBSAN="$(COMPILE_WITH_UBSAN)"; \
export PORTABLE="$(PORTABLE)"; \
export ROCKSDB_NO_FBCODE="$(ROCKSDB_NO_FBCODE)"; \
export USE_CLANG="$(USE_CLANG)"; \
export LIB_MODE="$(LIB_MODE)"; \
export ROCKSDB_CXX_STANDARD="$(ROCKSDB_CXX_STANDARD)"; \
export USE_FOLLY="$(USE_FOLLY)"; \
"$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk"))
# this file is generated by the previous line to set build flags and sources
include make_config.mk
# Figure out optimize level.
ifneq ($(DEBUG_LEVEL), 2)
ifeq ($(LITE), 0)
OPTIMIZE_LEVEL ?= -O2
else
OPTIMIZE_LEVEL ?= -Os
endif
endif
# `OPTIMIZE_LEVEL` is empty when the user does not set it and `DEBUG_LEVEL=2`.
# In that case, the compiler default (`-O0` for gcc and clang) will be used.
@ -236,25 +242,6 @@ am__v_AR_1 =
AM_LINK = $(AM_V_CCLD)$(CXX) -L. $(patsubst lib%.a, -l%, $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^)) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
AM_SHARE = $(AM_V_CCLD) $(CXX) $(PLATFORM_SHARED_LDFLAGS)$@ -L. $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^) $(EXEC_LDFLAGS) $(LDFLAGS) -o $@
# Detect what platform we're building on.
# Export some common variables that might have been passed as Make variables
# instead of environment variables.
dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; \
export CXXFLAGS="$(EXTRA_CXXFLAGS)"; \
export LDFLAGS="$(EXTRA_LDFLAGS)"; \
export COMPILE_WITH_ASAN="$(COMPILE_WITH_ASAN)"; \
export COMPILE_WITH_TSAN="$(COMPILE_WITH_TSAN)"; \
export COMPILE_WITH_UBSAN="$(COMPILE_WITH_UBSAN)"; \
export PORTABLE="$(PORTABLE)"; \
export ROCKSDB_NO_FBCODE="$(ROCKSDB_NO_FBCODE)"; \
export USE_CLANG="$(USE_CLANG)"; \
export LIB_MODE="$(LIB_MODE)"; \
export ROCKSDB_CXX_STANDARD="$(ROCKSDB_CXX_STANDARD)"; \
export USE_FOLLY="$(USE_FOLLY)"; \
"$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk"))
# this file is generated by the previous line to set build flags and sources
include make_config.mk
ROCKSDB_PLUGIN_MKS = $(foreach plugin, $(ROCKSDB_PLUGINS), plugin/$(plugin)/*.mk)
include $(ROCKSDB_PLUGIN_MKS)
ROCKSDB_PLUGIN_PROTO =ROCKSDB_NAMESPACE::ObjectLibrary\&, const std::string\&
@ -337,13 +324,6 @@ endif
ifeq ($(PLATFORM), OS_SOLARIS)
PLATFORM_CXXFLAGS += -D _GLIBCXX_USE_C99
endif
ifneq ($(filter -DROCKSDB_LITE,$(OPT)),)
# found
CFLAGS += -fno-exceptions
CXXFLAGS += -fno-exceptions
# LUA is not supported under ROCKSDB_LITE
LUA_PATH =
endif
ifeq ($(LIB_MODE),shared)
# So that binaries are executable from build location, in addition to install location
@ -357,8 +337,8 @@ ifneq ($(MACHINE), arm64)
# linking with jemalloc (as it won't be arm64-compatible) and remove some other options
# set during platform detection
DISABLE_JEMALLOC=1
PLATFORM_CFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CFLAGS))
PLATFORM_CXXFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CXXFLAGS))
PLATFORM_CCFLAGS := $(filter-out -march=native, $(PLATFORM_CCFLAGS))
PLATFORM_CXXFLAGS := $(filter-out -march=native, $(PLATFORM_CXXFLAGS))
endif
endif
endif
@ -559,7 +539,7 @@ endif
ifdef USE_CLANG
# Used by some teams in Facebook
WARNING_FLAGS += -Wshift-sign-overflow
WARNING_FLAGS += -Wshift-sign-overflow -Wambiguous-reversed-operator
endif
ifeq ($(PLATFORM), OS_OPENBSD)
@ -1082,13 +1062,11 @@ check: all
rm -rf $(TEST_TMPDIR)
ifneq ($(PLATFORM), OS_AIX)
$(PYTHON) tools/check_all_python.py
ifeq ($(filter -DROCKSDB_LITE,$(OPT)),)
ifndef ASSERT_STATUS_CHECKED # not yet working with these tests
$(PYTHON) tools/ldb_test.py
sh tools/rocksdb_dump_test.sh
endif
endif
endif
ifndef SKIP_FORMAT_BUCK_CHECKS
$(MAKE) check-format
$(MAKE) check-buck-targets
@ -1244,9 +1222,9 @@ clean: clean-ext-libraries-all clean-rocks clean-rocksjava
clean-not-downloaded: clean-ext-libraries-bin clean-rocks clean-not-downloaded-rocksjava
clean-rocks:
echo shared=$(ALL_SHARED_LIBS)
echo static=$(ALL_STATIC_LIBS)
rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(PARALLEL_TEST) $(ALL_STATIC_LIBS) $(ALL_SHARED_LIBS) $(MICROBENCHS)
# Not practical to exactly match all versions/variants in naming (e.g. debug or not)
rm -f ${LIBNAME}*.so* ${LIBNAME}*.a
rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(PARALLEL_TEST) $(MICROBENCHS)
rm -rf $(CLEAN_FILES) ios-x86 ios-arm scan_build_report
$(FIND) . -name "*.[oda]" -exec rm -f {} \;
$(FIND) . -type f \( -name "*.gcda" -o -name "*.gcno" \) -exec rm -f {} \;
@ -1439,6 +1417,9 @@ thread_local_test: $(OBJ_DIR)/util/thread_local_test.o $(TEST_LIBRARY) $(LIBRARY
work_queue_test: $(OBJ_DIR)/util/work_queue_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
udt_util_test: $(OBJ_DIR)/util/udt_util_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
corruption_test: $(OBJ_DIR)/db/corruption_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
@ -1502,6 +1483,9 @@ db_compaction_filter_test: $(OBJ_DIR)/db/db_compaction_filter_test.o $(TEST_LIBR
db_compaction_test: $(OBJ_DIR)/db/db_compaction_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
db_clip_test: $(OBJ_DIR)/db/db_clip_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
db_dynamic_level_test: $(OBJ_DIR)/db/db_dynamic_level_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
@ -2070,7 +2054,7 @@ JAVA_INCLUDE = -I$(JAVA_HOME)/include/ -I$(JAVA_HOME)/include/linux
ifeq ($(PLATFORM), OS_SOLARIS)
ARCH := $(shell isainfo -b)
else ifeq ($(PLATFORM), OS_OPENBSD)
ifneq (,$(filter amd64 ppc64 ppc64le s390x arm64 aarch64 sparc64, $(MACHINE)))
ifneq (,$(filter amd64 ppc64 ppc64le s390x arm64 aarch64 sparc64 loongarch64, $(MACHINE)))
ARCH := 64
else
ARCH := 32
@ -2091,7 +2075,7 @@ ifneq ($(origin JNI_LIBC), undefined)
endif
ifeq (,$(ROCKSDBJNILIB))
ifneq (,$(filter ppc% s390x arm64 aarch64 sparc64, $(MACHINE)))
ifneq (,$(filter ppc% s390x arm64 aarch64 sparc64 loongarch64, $(MACHINE)))
ROCKSDBJNILIB = librocksdbjni-linux-$(MACHINE)$(JNI_LIBC_POSTFIX).so
else
ROCKSDBJNILIB = librocksdbjni-linux$(ARCH)$(JNI_LIBC_POSTFIX).so
@ -2457,6 +2441,8 @@ checkout_folly:
@# NOTE: this hack is required for gcc in some cases
perl -pi -e 's/(__has_include.<experimental.memory_resource>.)/__cpp_rtti && $$1/' third-party/folly/folly/memory/MemoryResource.h
CXX_M_FLAGS = $(filter -m%, $(CXXFLAGS))
build_folly:
FOLLY_INST_PATH=`cd third-party/folly; $(PYTHON) build/fbcode_builder/getdeps.py show-inst-dir`; \
if [ "$$FOLLY_INST_PATH" ]; then \
@ -2467,8 +2453,8 @@ build_folly:
fi
# Restore the original version of Invoke.h with boost dependency
cd third-party/folly && ${GIT_COMMAND} checkout folly/functional/Invoke.h
cd third-party/folly && MAYBE_AVX2=`echo $(CXXFLAGS) | grep -o -- -DHAVE_AVX2 | sed 's/-DHAVE_AVX2/-mavx2/g' || true` && \
CXXFLAGS=" $$MAYBE_AVX2 -DHAVE_CXX11_ATOMIC " $(PYTHON) build/fbcode_builder/getdeps.py build --no-tests
cd third-party/folly && \
CXXFLAGS=" $(CXX_M_FLAGS) -DHAVE_CXX11_ATOMIC " $(PYTHON) build/fbcode_builder/getdeps.py build --no-tests
# ---------------------------------------------------------------------------
# Build size testing
@ -2489,18 +2475,6 @@ build_size:
$(REPORT_BUILD_STATISTIC) rocksdb.build_size.shared_lib $$(stat --printf="%s" `readlink -f librocksdb.so`)
strip `readlink -f librocksdb.so`
$(REPORT_BUILD_STATISTIC) rocksdb.build_size.shared_lib_stripped $$(stat --printf="%s" `readlink -f librocksdb.so`)
# === lite build, static ===
$(MAKE) clean
$(MAKE) LITE=1 static_lib
$(REPORT_BUILD_STATISTIC) rocksdb.build_size.static_lib_lite $$(stat --printf="%s" librocksdb.a)
strip librocksdb.a
$(REPORT_BUILD_STATISTIC) rocksdb.build_size.static_lib_lite_stripped $$(stat --printf="%s" librocksdb.a)
# === lite build, shared ===
$(MAKE) clean
$(MAKE) LITE=1 shared_lib
$(REPORT_BUILD_STATISTIC) rocksdb.build_size.shared_lib_lite $$(stat --printf="%s" `readlink -f librocksdb.so`)
strip `readlink -f librocksdb.so`
$(REPORT_BUILD_STATISTIC) rocksdb.build_size.shared_lib_lite_stripped $$(stat --printf="%s" `readlink -f librocksdb.so`)
# ---------------------------------------------------------------------------
# Platform-specific compilation

@ -5,3 +5,4 @@ This is the list of all known third-party plugins for RocksDB. If something is m
* [ZenFS](https://github.com/westerndigitalcorporation/zenfs): a file system for zoned block devices
* [RADOS](https://github.com/riversand963/rocksdb-rados-env): an Env used for interacting with RADOS. Migrated from RocksDB main repo.
* [PMEM](https://github.com/pmem/pmem-rocksdb-plugin): a collection of plugins to enable Persistent Memory on RocksDB.
* [IPPCP](https://github.com/intel/ippcp-plugin-rocksdb): a plugin to enable encryption on RocksDB based on Intel optimized open source IPP-Crypto library.

@ -1,8 +1,6 @@
## RocksDB: A Persistent Key-Value Store for Flash and RAM Storage
[![CircleCI Status](https://circleci.com/gh/facebook/rocksdb.svg?style=svg)](https://circleci.com/gh/facebook/rocksdb)
[![Appveyor Build status](https://ci.appveyor.com/api/projects/status/fbgfu0so3afcno78/branch/main?svg=true)](https://ci.appveyor.com/project/Facebook/rocksdb/branch/main)
[![PPC64le Build Status](http://140-211-168-68-openstack.osuosl.org:8080/buildStatus/icon?job=rocksdb&style=plastic)](http://140-211-168-68-openstack.osuosl.org:8080/job/rocksdb)
RocksDB is developed and maintained by Facebook Database Engineering Team.
It is built on earlier work on [LevelDB](https://github.com/google/leveldb) by Sanjay Ghemawat (sanjay@google.com)

@ -1,21 +0,0 @@
# RocksDBLite
RocksDBLite is a project focused on mobile use cases, which don't need many of the fancy features we've built for server workloads and are very sensitive to binary size. For that reason, we added a compile flag, ROCKSDB_LITE, that comments out a lot of the nonessential code and keeps the binary lean.
Some examples of the features disabled by ROCKSDB_LITE:
* compiled-in support for LDB tool
* No backup engine
* No support for replication (which we provide in the form of TransactionalIterator)
* No advanced monitoring tools
* No special-purpose memtables that are highly optimized for specific use cases
* No Transactions
When adding a new big feature to RocksDB, please add ROCKSDB_LITE compile guard if:
* Nobody from mobile really needs your feature,
* Your feature is adding a lot of weight to the binary.
Don't add ROCKSDB_LITE compile guard if:
* It would introduce a lot of code complexity. Compile guards make code harder to read. It's a trade-off.
* Your feature is not adding a lot of weight.
If unsure, ask. :)

@ -11,6 +11,7 @@ load("//rocks/buckifier:defs.bzl", "cpp_library_wrapper","rocks_cpp_library_wrap
cpp_library_wrapper(name="rocksdb_lib", srcs=[
"cache/cache.cc",
"cache/cache_entry_roles.cc",
"cache/cache_helpers.cc",
"cache/cache_key.cc",
"cache/cache_reservation_manager.cc",
"cache/charged_cache.cc",
@ -18,6 +19,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"cache/compressed_secondary_cache.cc",
"cache/lru_cache.cc",
"cache/secondary_cache.cc",
"cache/secondary_cache_adapter.cc",
"cache/sharded_cache.cc",
"db/arena_wrapped_db_iter.cc",
"db/blob/blob_contents.cc",
@ -104,6 +106,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"db/write_batch.cc",
"db/write_batch_base.cc",
"db/write_controller.cc",
"db/write_stall_stats.cc",
"db/write_thread.cc",
"env/composite_env.cc",
"env/env.cc",
@ -180,6 +183,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"table/block_based/block_based_table_iterator.cc",
"table/block_based/block_based_table_reader.cc",
"table/block_based/block_builder.cc",
"table/block_based/block_cache.cc",
"table/block_based/block_prefetcher.cc",
"table/block_based/block_prefix_index.cc",
"table/block_based/data_block_footer.cc",
@ -246,6 +250,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"util/concurrent_task_limiter_impl.cc",
"util/crc32c.cc",
"util/crc32c_arm64.cc",
"util/data_structure.cc",
"util/dynamic_bloom.cc",
"util/file_checksum_helper.cc",
"util/hash.cc",
@ -259,6 +264,8 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"util/string_util.cc",
"util/thread_local.cc",
"util/threadpool_imp.cc",
"util/udt_util.cc",
"util/write_batch_util.cc",
"util/xxhash.cc",
"utilities/agg_merge/agg_merge.cc",
"utilities/backup/backup_engine.cc",
@ -349,352 +356,14 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"//folly/synchronization:distributed_mutex",
], headers=None, link_whole=False, extra_test_libs=False)
cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[
"cache/cache.cc",
"cache/cache_entry_roles.cc",
"cache/cache_key.cc",
"cache/cache_reservation_manager.cc",
"cache/charged_cache.cc",
"cache/clock_cache.cc",
"cache/compressed_secondary_cache.cc",
"cache/lru_cache.cc",
"cache/secondary_cache.cc",
"cache/sharded_cache.cc",
"db/arena_wrapped_db_iter.cc",
"db/blob/blob_contents.cc",
"db/blob/blob_fetcher.cc",
"db/blob/blob_file_addition.cc",
"db/blob/blob_file_builder.cc",
"db/blob/blob_file_cache.cc",
"db/blob/blob_file_garbage.cc",
"db/blob/blob_file_meta.cc",
"db/blob/blob_file_reader.cc",
"db/blob/blob_garbage_meter.cc",
"db/blob/blob_log_format.cc",
"db/blob/blob_log_sequential_reader.cc",
"db/blob/blob_log_writer.cc",
"db/blob/blob_source.cc",
"db/blob/prefetch_buffer_collection.cc",
"db/builder.cc",
"db/c.cc",
"db/column_family.cc",
"db/compaction/compaction.cc",
"db/compaction/compaction_iterator.cc",
"db/compaction/compaction_job.cc",
"db/compaction/compaction_outputs.cc",
"db/compaction/compaction_picker.cc",
"db/compaction/compaction_picker_fifo.cc",
"db/compaction/compaction_picker_level.cc",
"db/compaction/compaction_picker_universal.cc",
"db/compaction/compaction_service_job.cc",
"db/compaction/compaction_state.cc",
"db/compaction/sst_partitioner.cc",
"db/compaction/subcompaction_state.cc",
"db/convenience.cc",
"db/db_filesnapshot.cc",
"db/db_impl/compacted_db_impl.cc",
"db/db_impl/db_impl.cc",
"db/db_impl/db_impl_compaction_flush.cc",
"db/db_impl/db_impl_debug.cc",
"db/db_impl/db_impl_experimental.cc",
"db/db_impl/db_impl_files.cc",
"db/db_impl/db_impl_open.cc",
"db/db_impl/db_impl_readonly.cc",
"db/db_impl/db_impl_secondary.cc",
"db/db_impl/db_impl_write.cc",
"db/db_info_dumper.cc",
"db/db_iter.cc",
"db/dbformat.cc",
"db/error_handler.cc",
"db/event_helpers.cc",
"db/experimental.cc",
"db/external_sst_file_ingestion_job.cc",
"db/file_indexer.cc",
"db/flush_job.cc",
"db/flush_scheduler.cc",
"db/forward_iterator.cc",
"db/import_column_family_job.cc",
"db/internal_stats.cc",
"db/log_reader.cc",
"db/log_writer.cc",
"db/logs_with_prep_tracker.cc",
"db/malloc_stats.cc",
"db/memtable.cc",
"db/memtable_list.cc",
"db/merge_helper.cc",
"db/merge_operator.cc",
"db/output_validator.cc",
"db/periodic_task_scheduler.cc",
"db/range_del_aggregator.cc",
"db/range_tombstone_fragmenter.cc",
"db/repair.cc",
"db/seqno_to_time_mapping.cc",
"db/snapshot_impl.cc",
"db/table_cache.cc",
"db/table_properties_collector.cc",
"db/transaction_log_impl.cc",
"db/trim_history_scheduler.cc",
"db/version_builder.cc",
"db/version_edit.cc",
"db/version_edit_handler.cc",
"db/version_set.cc",
"db/wal_edit.cc",
"db/wal_manager.cc",
"db/wide/wide_column_serialization.cc",
"db/wide/wide_columns.cc",
"db/write_batch.cc",
"db/write_batch_base.cc",
"db/write_controller.cc",
"db/write_thread.cc",
"env/composite_env.cc",
"env/env.cc",
"env/env_chroot.cc",
"env/env_encryption.cc",
"env/env_posix.cc",
"env/file_system.cc",
"env/file_system_tracer.cc",
"env/fs_posix.cc",
"env/fs_remap.cc",
"env/io_posix.cc",
"env/mock_env.cc",
"env/unique_id_gen.cc",
"file/delete_scheduler.cc",
"file/file_prefetch_buffer.cc",
"file/file_util.cc",
"file/filename.cc",
"file/line_file_reader.cc",
"file/random_access_file_reader.cc",
"file/read_write_util.cc",
"file/readahead_raf.cc",
"file/sequence_file_reader.cc",
"file/sst_file_manager_impl.cc",
"file/writable_file_writer.cc",
"logging/auto_roll_logger.cc",
"logging/event_logger.cc",
"logging/log_buffer.cc",
"memory/arena.cc",
"memory/concurrent_arena.cc",
"memory/jemalloc_nodump_allocator.cc",
"memory/memkind_kmem_allocator.cc",
"memory/memory_allocator.cc",
"memtable/alloc_tracker.cc",
"memtable/hash_linklist_rep.cc",
"memtable/hash_skiplist_rep.cc",
"memtable/skiplistrep.cc",
"memtable/vectorrep.cc",
"memtable/write_buffer_manager.cc",
"monitoring/histogram.cc",
"monitoring/histogram_windowing.cc",
"monitoring/in_memory_stats_history.cc",
"monitoring/instrumented_mutex.cc",
"monitoring/iostats_context.cc",
"monitoring/perf_context.cc",
"monitoring/perf_level.cc",
"monitoring/persistent_stats_history.cc",
"monitoring/statistics.cc",
"monitoring/thread_status_impl.cc",
"monitoring/thread_status_updater.cc",
"monitoring/thread_status_updater_debug.cc",
"monitoring/thread_status_util.cc",
"monitoring/thread_status_util_debug.cc",
"options/cf_options.cc",
"options/configurable.cc",
"options/customizable.cc",
"options/db_options.cc",
"options/options.cc",
"options/options_helper.cc",
"options/options_parser.cc",
"port/mmap.cc",
"port/port_posix.cc",
"port/stack_trace.cc",
"port/win/env_default.cc",
"port/win/env_win.cc",
"port/win/io_win.cc",
"port/win/port_win.cc",
"port/win/win_logger.cc",
"port/win/win_thread.cc",
"table/adaptive/adaptive_table_factory.cc",
"table/block_based/binary_search_index_reader.cc",
"table/block_based/block.cc",
"table/block_based/block_based_table_builder.cc",
"table/block_based/block_based_table_factory.cc",
"table/block_based/block_based_table_iterator.cc",
"table/block_based/block_based_table_reader.cc",
"table/block_based/block_builder.cc",
"table/block_based/block_prefetcher.cc",
"table/block_based/block_prefix_index.cc",
"table/block_based/data_block_footer.cc",
"table/block_based/data_block_hash_index.cc",
"table/block_based/filter_block_reader_common.cc",
"table/block_based/filter_policy.cc",
"table/block_based/flush_block_policy.cc",
"table/block_based/full_filter_block.cc",
"table/block_based/hash_index_reader.cc",
"table/block_based/index_builder.cc",
"table/block_based/index_reader_common.cc",
"table/block_based/parsed_full_filter_block.cc",
"table/block_based/partitioned_filter_block.cc",
"table/block_based/partitioned_index_iterator.cc",
"table/block_based/partitioned_index_reader.cc",
"table/block_based/reader_common.cc",
"table/block_based/uncompression_dict_reader.cc",
"table/block_fetcher.cc",
"table/compaction_merging_iterator.cc",
"table/cuckoo/cuckoo_table_builder.cc",
"table/cuckoo/cuckoo_table_factory.cc",
"table/cuckoo/cuckoo_table_reader.cc",
"table/format.cc",
"table/get_context.cc",
"table/iterator.cc",
"table/merging_iterator.cc",
"table/meta_blocks.cc",
"table/persistent_cache_helper.cc",
"table/plain/plain_table_bloom.cc",
"table/plain/plain_table_builder.cc",
"table/plain/plain_table_factory.cc",
"table/plain/plain_table_index.cc",
"table/plain/plain_table_key_coding.cc",
"table/plain/plain_table_reader.cc",
"table/sst_file_dumper.cc",
"table/sst_file_reader.cc",
"table/sst_file_writer.cc",
"table/table_factory.cc",
"table/table_properties.cc",
"table/two_level_iterator.cc",
"table/unique_id.cc",
"test_util/sync_point.cc",
"test_util/sync_point_impl.cc",
"test_util/transaction_test_util.cc",
"tools/dump/db_dump_tool.cc",
"tools/io_tracer_parser_tool.cc",
"tools/ldb_cmd.cc",
"tools/ldb_tool.cc",
"tools/sst_dump_tool.cc",
"trace_replay/block_cache_tracer.cc",
"trace_replay/io_tracer.cc",
"trace_replay/trace_record.cc",
"trace_replay/trace_record_handler.cc",
"trace_replay/trace_record_result.cc",
"trace_replay/trace_replay.cc",
"util/async_file_reader.cc",
"util/build_version.cc",
"util/cleanable.cc",
"util/coding.cc",
"util/compaction_job_stats_impl.cc",
"util/comparator.cc",
"util/compression.cc",
"util/compression_context_cache.cc",
"util/concurrent_task_limiter_impl.cc",
"util/crc32c.cc",
"util/crc32c_arm64.cc",
"util/dynamic_bloom.cc",
"util/file_checksum_helper.cc",
"util/hash.cc",
"util/murmurhash.cc",
"util/random.cc",
"util/rate_limiter.cc",
"util/ribbon_config.cc",
"util/slice.cc",
"util/status.cc",
"util/stderr_logger.cc",
"util/string_util.cc",
"util/thread_local.cc",
"util/threadpool_imp.cc",
"util/xxhash.cc",
"utilities/agg_merge/agg_merge.cc",
"utilities/backup/backup_engine.cc",
"utilities/blob_db/blob_compaction_filter.cc",
"utilities/blob_db/blob_db.cc",
"utilities/blob_db/blob_db_impl.cc",
"utilities/blob_db/blob_db_impl_filesnapshot.cc",
"utilities/blob_db/blob_dump_tool.cc",
"utilities/blob_db/blob_file.cc",
"utilities/cache_dump_load.cc",
"utilities/cache_dump_load_impl.cc",
"utilities/cassandra/cassandra_compaction_filter.cc",
"utilities/cassandra/format.cc",
"utilities/cassandra/merge_operator.cc",
"utilities/checkpoint/checkpoint_impl.cc",
"utilities/compaction_filters.cc",
"utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc",
"utilities/convenience/info_log_finder.cc",
"utilities/counted_fs.cc",
"utilities/debug.cc",
"utilities/env_mirror.cc",
"utilities/env_timed.cc",
"utilities/fault_injection_env.cc",
"utilities/fault_injection_fs.cc",
"utilities/fault_injection_secondary_cache.cc",
"utilities/leveldb_options/leveldb_options.cc",
"utilities/memory/memory_util.cc",
"utilities/merge_operators.cc",
"utilities/merge_operators/bytesxor.cc",
"utilities/merge_operators/max.cc",
"utilities/merge_operators/put.cc",
"utilities/merge_operators/sortlist.cc",
"utilities/merge_operators/string_append/stringappend.cc",
"utilities/merge_operators/string_append/stringappend2.cc",
"utilities/merge_operators/uint64add.cc",
"utilities/object_registry.cc",
"utilities/option_change_migration/option_change_migration.cc",
"utilities/options/options_util.cc",
"utilities/persistent_cache/block_cache_tier.cc",
"utilities/persistent_cache/block_cache_tier_file.cc",
"utilities/persistent_cache/block_cache_tier_metadata.cc",
"utilities/persistent_cache/persistent_cache_tier.cc",
"utilities/persistent_cache/volatile_tier_impl.cc",
"utilities/simulator_cache/cache_simulator.cc",
"utilities/simulator_cache/sim_cache.cc",
"utilities/table_properties_collectors/compact_on_deletion_collector.cc",
"utilities/trace/file_trace_reader_writer.cc",
"utilities/trace/replayer_impl.cc",
"utilities/transactions/lock/lock_manager.cc",
"utilities/transactions/lock/point/point_lock_manager.cc",
"utilities/transactions/lock/point/point_lock_tracker.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/concurrent_tree.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/keyrange.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/lock_request.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/locktree.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/manager.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/range_buffer.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/treenode.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/txnid_set.cc",
"utilities/transactions/lock/range/range_tree/lib/locktree/wfg.cc",
"utilities/transactions/lock/range/range_tree/lib/standalone_port.cc",
"utilities/transactions/lock/range/range_tree/lib/util/dbt.cc",
"utilities/transactions/lock/range/range_tree/lib/util/memarena.cc",
"utilities/transactions/lock/range/range_tree/range_tree_lock_manager.cc",
"utilities/transactions/lock/range/range_tree/range_tree_lock_tracker.cc",
"utilities/transactions/optimistic_transaction.cc",
"utilities/transactions/optimistic_transaction_db_impl.cc",
"utilities/transactions/pessimistic_transaction.cc",
"utilities/transactions/pessimistic_transaction_db.cc",
"utilities/transactions/snapshot_checker.cc",
"utilities/transactions/transaction_base.cc",
"utilities/transactions/transaction_db_mutex_impl.cc",
"utilities/transactions/transaction_util.cc",
"utilities/transactions/write_prepared_txn.cc",
"utilities/transactions/write_prepared_txn_db.cc",
"utilities/transactions/write_unprepared_txn.cc",
"utilities/transactions/write_unprepared_txn_db.cc",
"utilities/ttl/db_ttl_impl.cc",
"utilities/wal_filter.cc",
"utilities/write_batch_with_index/write_batch_with_index.cc",
"utilities/write_batch_with_index/write_batch_with_index_internal.cc",
], deps=[
"//folly/container:f14_hash",
"//folly/experimental/coro:blocking_wait",
"//folly/experimental/coro:collect",
"//folly/experimental/coro:coroutine",
"//folly/experimental/coro:task",
"//folly/synchronization:distributed_mutex",
], headers=None, link_whole=True, extra_test_libs=False)
cpp_library_wrapper(name="rocksdb_whole_archive_lib", srcs=[], deps=[":rocksdb_lib"], headers=None, link_whole=True, extra_test_libs=False)
cpp_library_wrapper(name="rocksdb_test_lib", srcs=[
"db/db_test_util.cc",
"db/db_with_timestamp_test_util.cc",
"table/mock_table.cc",
"test_util/mock_time_env.cc",
"test_util/secondary_cache_test_util.cc",
"test_util/testharness.cc",
"test_util/testutil.cc",
"tools/block_cache_analyzer/block_cache_trace_analyzer.cc",
@ -725,6 +394,7 @@ rocks_cpp_library_wrapper(name="rocksdb_stress_lib", srcs=[
"db_stress_tool/db_stress_test_base.cc",
"db_stress_tool/db_stress_tool.cc",
"db_stress_tool/expected_state.cc",
"db_stress_tool/expected_value.cc",
"db_stress_tool/multi_ops_txns_stress.cc",
"db_stress_tool/no_batched_ops_stress.cc",
"test_util/testutil.cc",
@ -5082,6 +4752,12 @@ cpp_unittest_wrapper(name="db_bloom_filter_test",
extra_compiler_flags=[])
cpp_unittest_wrapper(name="db_clip_test",
srcs=["db/db_clip_test.cc"],
deps=[":rocksdb_test_lib"],
extra_compiler_flags=[])
cpp_unittest_wrapper(name="db_compaction_filter_test",
srcs=["db/db_compaction_filter_test.cc"],
deps=[":rocksdb_test_lib"],
@ -5834,6 +5510,12 @@ cpp_unittest_wrapper(name="ttl_test",
extra_compiler_flags=[])
cpp_unittest_wrapper(name="udt_util_test",
srcs=["util/udt_util_test.cc"],
deps=[":rocksdb_test_lib"],
extra_compiler_flags=[])
cpp_unittest_wrapper(name="util_merge_operators_test",
srcs=["utilities/util_merge_operators_test.cc"],
deps=[":rocksdb_test_lib"],

@ -15,6 +15,28 @@ At Facebook, we use RocksDB as storage engines in multiple data management servi
[2] https://code.facebook.com/posts/357056558062811/logdevice-a-distributed-data-store-for-logs/
## Bilibili
[Bilibili](bilibili.com) [uses](https://www.alluxio.io/blog/when-ai-meets-alluxio-at-bilibili-building-an-efficient-ai-platform-for-data-preprocessing-and-model-training/) Alluxio to speed up its ML training workloads, and Alluxio uses RocksDB to store its filesystem metadata, so Bilibili uses RocksDB.
Bilibili's [real-time platform](https://www.alibabacloud.com/blog/architecture-and-practices-of-bilibilis-real-time-platform_596676) uses Flink, and uses RocksDB as Flink's state store.
## TikTok
TikTok, or its parent company ByteDance, uses RocksDB as the storage engine for some storage systems, such as its distributed graph database [ByteGraph](https://vldb.org/pvldb/vol15/p3306-li.pdf).
Also, TikTok uses [Alluxio](alluxio.io) to [speed up Presto queries](https://www.alluxio.io/resources/videos/improving-presto-performance-with-alluxio-at-tiktok/), and Alluxio stores the files' metadata in RocksDB.
## FoundationDB
[FoundationDB](https://www.foundationdb.org/) [uses](https://github.com/apple/foundationdb/blob/377f1f692da6ab2fe5bdac57035651db3e5fb66d/fdbserver/KeyValueStoreRocksDB.actor.cpp) RocksDB to implement a [key-value store interface](https://github.com/apple/foundationdb/blob/377f1f692da6ab2fe5bdac57035651db3e5fb66d/fdbserver/KeyValueStoreRocksDB.actor.cpp#L1127) in its server backend.
## Apple
Apple [uses](https://opensource.apple.com/projects/foundationdb/) FoundationDB, so it also uses RocksDB.
## Snowflake
Snowflake [uses](https://www.snowflake.com/blog/how-foundationdb-powers-snowflake-metadata-forward/) FoundationDB, so it also uses RocksDB.
## Microsoft
The Bing search engine from Microsoft uses RocksDB as the storage engine for its web data platform: https://blogs.bing.com/Engineering-Blog/october-2021/RocksDB-in-Microsoft-Bing
## LinkedIn
Two different use cases at LinkedIn are using RocksDB as a storage engine:
@ -26,6 +48,9 @@ Learn more about those use cases in a Tech Talk by Ankit Gupta and Naveen Somasu
## Yahoo
Yahoo is using RocksDB as a storage engine for their biggest distributed data store Sherpa. Learn more about it here: http://yahooeng.tumblr.com/post/120730204806/sherpa-scales-new-heights
## Tencent
[PaxosStore](https://github.com/Tencent/paxosstore) is a distributed database supporting WeChat. It uses RocksDB as its storage engine.
## Baidu
[Apache Doris](http://doris.apache.org/master/en/) is an MPP analytical database engine released by Baidu. It [uses RocksDB](http://doris.apache.org/master/en/administrator-guide/operation/tablet-meta-tool.html) to manage its tablets' metadata.
@ -79,9 +104,18 @@ quasardb uses a heavily tuned RocksDB as its persistence layer.
## TiKV
[TiKV](https://github.com/pingcap/tikv) is a GEO-replicated, high-performance, distributed, transactional key-value database. TiKV is powered by Rust and Raft. TiKV uses RocksDB as its persistence layer.
## TiDB
[TiDB](https://github.com/pingcap/tidb) uses the TiKV distributed key-value database, so it uses RocksDB.
## PingCAP
[PingCAP](https://www.pingcap.com/) is the company behind TiDB; its cloud database service uses RocksDB.
## Apache Spark
[Spark Structured Streaming](https://docs.databricks.com/structured-streaming/rocksdb-state-store.html) uses RocksDB as the local state store.
## Databricks
[Databricks](https://www.databricks.com/) [replaces AWS RDS with TiDB](https://www.pingcap.com/case-study/how-databricks-tackles-the-scalability-limit-with-a-mysql-alternative/) for scalability, so it uses RocksDB.
## Apache Flink
[Apache Flink](https://flink.apache.org/news/2016/03/08/release-1.0.0.html) uses RocksDB to store state locally on a machine.

@ -26,7 +26,7 @@ from util import ColorString
# $python3 buckifier/buckify_rocksdb.py \
# '{"fake": {
# "extra_deps": [":test_dep", "//fakes/module:mock1"],
# "extra_compiler_flags": ["-DROCKSDB_LITE", "-Os"]
# "extra_compiler_flags": ["-DFOO_BAR", "-Os"]
# }
# }'
# (Generated TARGETS file has test_dep and mock1 as dependencies for RocksDB
@ -154,16 +154,9 @@ def generate_targets(repo_path, deps_map):
# rocksdb_whole_archive_lib
TARGETS.add_library(
"rocksdb_whole_archive_lib",
src_mk["LIB_SOURCES"] +
# always add range_tree, it's only excluded on ppc64, which we don't use internally
src_mk["RANGE_TREE_SOURCES"] + src_mk["TOOL_LIB_SOURCES"],
[],
deps=[
"//folly/container:f14_hash",
"//folly/experimental/coro:blocking_wait",
"//folly/experimental/coro:collect",
"//folly/experimental/coro:coroutine",
"//folly/experimental/coro:task",
"//folly/synchronization:distributed_mutex",
":rocksdb_lib",
],
headers=None,
extra_external_deps="",

@ -63,13 +63,7 @@ if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then
if [ "$LIB_MODE" == "shared" ]; then
PIC_BUILD=1
fi
if [ -n "$ROCKSDB_FBCODE_BUILD_WITH_PLATFORM010" ]; then
source "$PWD/build_tools/fbcode_config_platform010.sh"
elif [ -n "$ROCKSDB_FBCODE_BUILD_WITH_PLATFORM009" ]; then
source "$PWD/build_tools/fbcode_config_platform009.sh"
else
source "$PWD/build_tools/fbcode_config_platform009.sh"
fi
source "$PWD/build_tools/fbcode_config_platform010.sh"
fi
# Delete existing output, if it exists
@ -154,7 +148,7 @@ case "$TARGET_OS" in
;;
IOS)
PLATFORM=IOS
COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE -DROCKSDB_LITE"
COMMON_FLAGS="$COMMON_FLAGS -DOS_MACOSX -DIOS_CROSS_COMPILE "
PLATFORM_SHARED_EXT=dylib
PLATFORM_SHARED_LDFLAGS="-dynamiclib -install_name "
CROSS_COMPILE=true
@ -425,7 +419,7 @@ EOF
if ! test $ROCKSDB_DISABLE_JEMALLOC; then
# Test whether jemalloc is available
if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS -x c++ - -o test.o -ljemalloc \
if echo 'int main() {}' | $CXX $PLATFORM_CXXFLAGS $LDFLAGS -x c++ - -o test.o -ljemalloc \
2>/dev/null; then
# This will enable some preprocessor identifiers in the Makefile
JEMALLOC=1
@ -434,12 +428,19 @@ EOF
WITH_JEMALLOC_FLAG=1
# check for JEMALLOC installed with HomeBrew
if [ "$PLATFORM" == "OS_MACOSX" ]; then
if [ "$TARGET_ARCHITECTURE" = "arm64" ]; then
# on M1 Macs, homebrew installs here instead of /usr/local
JEMALLOC_PREFIX="/opt/homebrew"
else
JEMALLOC_PREFIX="/usr/local"
fi
if hash brew 2>/dev/null && brew ls --versions jemalloc > /dev/null; then
JEMALLOC_VER=$(brew ls --versions jemalloc | tail -n 1 | cut -f 2 -d ' ')
JEMALLOC_INCLUDE="-I/usr/local/Cellar/jemalloc/${JEMALLOC_VER}/include"
JEMALLOC_LIB="/usr/local/Cellar/jemalloc/${JEMALLOC_VER}/lib/libjemalloc_pic.a"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS $JEMALLOC_LIB"
JAVA_STATIC_LDFLAGS="$JAVA_STATIC_LDFLAGS $JEMALLOC_LIB"
JEMALLOC_INCLUDE="-I${JEMALLOC_PREFIX}/Cellar/jemalloc/${JEMALLOC_VER}/include"
JEMALLOC_LIB="${JEMALLOC_PREFIX}/Cellar/jemalloc/${JEMALLOC_VER}/lib/libjemalloc_pic.a"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -L${JEMALLOC_PREFIX}/lib $JEMALLOC_LIB"
JAVA_LDFLAGS="$JAVA_LDFLAGS -L${JEMALLOC_PREFIX}/lib $JEMALLOC_LIB"
JAVA_STATIC_LDFLAGS="$JAVA_STATIC_LDFLAGS -L${JEMALLOC_PREFIX}/lib $JEMALLOC_LIB"
fi
fi
fi
@ -627,7 +628,7 @@ EOF
fi
fi
if test "0$PORTABLE" -eq 0; then
if [ "$PORTABLE" == "" ] || [ "$PORTABLE" == 0 ]; then
if test -n "`echo $TARGET_ARCHITECTURE | grep ^ppc64`"; then
# Tune for this POWER processor, treating '+' models as base models
POWER=`LD_SHOW_AUXV=1 /bin/true | grep AT_PLATFORM | grep -E -o power[0-9]+`
@ -650,37 +651,36 @@ if test "0$PORTABLE" -eq 0; then
COMMON_FLAGS="$COMMON_FLAGS -march=${RISC_ISA}"
elif [ "$TARGET_OS" == "IOS" ]; then
COMMON_FLAGS="$COMMON_FLAGS"
elif [ "$TARGET_OS" == "AIX" ] || [ "$TARGET_OS" == "SunOS" ]; then
# TODO: Not sure why we don't use -march=native on these OSes
if test "$USE_SSE"; then
TRY_SSE_ETC="1"
fi
else
COMMON_FLAGS="$COMMON_FLAGS -march=native "
fi
else
# PORTABLE=1
if test "$USE_SSE"; then
TRY_SSE_ETC="1"
fi
if test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then
COMMON_FLAGS="$COMMON_FLAGS -march=z196 "
fi
if test -n "`echo $TARGET_ARCHITECTURE | grep ^riscv64`"; then
RISC_ISA=$(cat /proc/cpuinfo | grep isa | head -1 | cut --delimiter=: -f 2 | cut -b 2-)
COMMON_FLAGS="$COMMON_FLAGS -march=${RISC_ISA}"
# PORTABLE specified
if [ "$PORTABLE" == 1 ]; then
if test -n "`echo $TARGET_ARCHITECTURE | grep ^s390x`"; then
COMMON_FLAGS="$COMMON_FLAGS -march=z196 "
elif test -n "`echo $TARGET_ARCHITECTURE | grep ^riscv64`"; then
RISC_ISA=$(cat /proc/cpuinfo | grep isa | head -1 | cut --delimiter=: -f 2 | cut -b 2-)
COMMON_FLAGS="$COMMON_FLAGS -march=${RISC_ISA}"
elif test "$USE_SSE"; then
# USE_SSE is DEPRECATED
# This is a rough approximation of the old USE_SSE behavior
COMMON_FLAGS="$COMMON_FLAGS -march=haswell"
fi
# Other than those cases, not setting -march= here.
else
# Assume PORTABLE is a minimum assumed cpu type, e.g. PORTABLE=haswell
COMMON_FLAGS="$COMMON_FLAGS -march=${PORTABLE}"
fi
if [[ "${PLATFORM}" == "OS_MACOSX" ]]; then
# For portability compile for macOS 10.13 (2017) or newer
COMMON_FLAGS="$COMMON_FLAGS -mmacosx-version-min=10.13"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -mmacosx-version-min=10.13"
# For portability compile for macOS 10.14 or newer
COMMON_FLAGS="$COMMON_FLAGS -mmacosx-version-min=10.14"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -mmacosx-version-min=10.14"
# -mmacosx-version-min must come first here.
PLATFORM_SHARED_LDFLAGS="-mmacosx-version-min=10.13 $PLATFORM_SHARED_LDFLAGS"
PLATFORM_CMAKE_FLAGS="-DCMAKE_OSX_DEPLOYMENT_TARGET=10.13"
JAVA_STATIC_DEPS_COMMON_FLAGS="-mmacosx-version-min=10.13"
PLATFORM_SHARED_LDFLAGS="-mmacosx-version-min=10.14 $PLATFORM_SHARED_LDFLAGS"
PLATFORM_CMAKE_FLAGS="-DCMAKE_OSX_DEPLOYMENT_TARGET=10.14"
JAVA_STATIC_DEPS_COMMON_FLAGS="-mmacosx-version-min=10.14"
JAVA_STATIC_DEPS_LDFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS"
JAVA_STATIC_DEPS_CCFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS"
JAVA_STATIC_DEPS_CXXFLAGS="$JAVA_STATIC_DEPS_COMMON_FLAGS"
@ -704,101 +704,6 @@ EOF
fi
fi
if test "$TRY_SSE_ETC"; then
# The USE_SSE flag now means "attempt to compile with widely-available
# Intel architecture extensions utilized by specific optimizations in the
# source code." It's a qualifier on PORTABLE=1 that means "mostly portable."
# It doesn't even really check that your current CPU is compatible.
#
# SSE4.2 available since nehalem, ca. 2008-2010
# Includes POPCNT for BitsSetToOne, BitParity
TRY_SSE42="-msse4.2"
# PCLMUL available since westmere, ca. 2010-2011
TRY_PCLMUL="-mpclmul"
# AVX2 available since haswell, ca. 2013-2015
TRY_AVX2="-mavx2"
# BMI available since haswell, ca. 2013-2015
# Primarily for TZCNT for CountTrailingZeroBits
TRY_BMI="-mbmi"
# LZCNT available since haswell, ca. 2013-2015
# For FloorLog2
TRY_LZCNT="-mlzcnt"
fi
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_SSE42 -x c++ - -o test.o 2>/dev/null <<EOF
#include <cstdint>
#include <nmmintrin.h>
int main() {
volatile uint32_t x = _mm_crc32_u32(0, 0);
(void)x;
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS $TRY_SSE42 -DHAVE_SSE42"
elif test "$USE_SSE"; then
echo "warning: USE_SSE specified but compiler could not use SSE intrinsics, disabling" >&2
fi
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_PCLMUL -x c++ - -o test.o 2>/dev/null <<EOF
#include <cstdint>
#include <wmmintrin.h>
int main() {
const auto a = _mm_set_epi64x(0, 0);
const auto b = _mm_set_epi64x(0, 0);
const auto c = _mm_clmulepi64_si128(a, b, 0x00);
auto d = _mm_cvtsi128_si64(c);
(void)d;
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS $TRY_PCLMUL -DHAVE_PCLMUL"
elif test "$USE_SSE"; then
echo "warning: USE_SSE specified but compiler could not use PCLMUL intrinsics, disabling" >&2
fi
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_AVX2 -x c++ - -o test.o 2>/dev/null <<EOF
#include <cstdint>
#include <immintrin.h>
int main() {
const auto a = _mm256_setr_epi32(0, 1, 2, 3, 4, 7, 6, 5);
const auto b = _mm256_permutevar8x32_epi32(a, a);
(void)b;
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS $TRY_AVX2 -DHAVE_AVX2"
elif test "$USE_SSE"; then
echo "warning: USE_SSE specified but compiler could not use AVX2 intrinsics, disabling" >&2
fi
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_BMI -x c++ - -o test.o 2>/dev/null <<EOF
#include <cstdint>
#include <immintrin.h>
int main(int argc, char *argv[]) {
(void)argv;
return (int)_tzcnt_u64((uint64_t)argc);
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS $TRY_BMI -DHAVE_BMI"
elif test "$USE_SSE"; then
echo "warning: USE_SSE specified but compiler could not use BMI intrinsics, disabling" >&2
fi
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS $TRY_LZCNT -x c++ - -o test.o 2>/dev/null <<EOF
#include <cstdint>
#include <immintrin.h>
int main(int argc, char *argv[]) {
(void)argv;
return (int)_lzcnt_u64((uint64_t)argc);
}
EOF
if [ "$?" = 0 ]; then
COMMON_FLAGS="$COMMON_FLAGS $TRY_LZCNT -DHAVE_LZCNT"
elif test "$USE_SSE"; then
echo "warning: USE_SSE specified but compiler could not use LZCNT intrinsics, disabling" >&2
fi
$CXX $PLATFORM_CXXFLAGS $COMMON_FLAGS -x c++ - -o test.o 2>/dev/null <<EOF
#include <cstdint>
int main() {

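Each probe above only tests whether the compiler accepts the given flag and intrinsic; the HAVE_* macro it appends to COMMON_FLAGS is what the C++ sources key off. A minimal sketch of the consuming side (the function name and fallback are illustrative, assuming only the HAVE_SSE42 define produced by this script):

#include <cstdint>
#ifdef HAVE_SSE42
#include <nmmintrin.h>
#endif

// Hypothetical helper: use the same hardware CRC32C instruction the probe
// compiles when the build detected SSE4.2, otherwise take a portable path.
inline uint32_t Crc32cWord(uint32_t crc, uint32_t word) {
#ifdef HAVE_SSE42
  return _mm_crc32_u32(crc, word);
#else
  // Placeholder fallback only; a real build would call a software CRC here.
  return crc ^ word;
#endif
}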
@ -1,22 +0,0 @@
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
GCC_BASE=/mnt/gvfs/third-party2/gcc/1795efe5f06778c15a92c8f9a2aba5dc496d9d4d/9.x/centos7-native/3bed279
CLANG_BASE=/mnt/gvfs/third-party2/llvm-fb/7318eaac22659b6ff2fe43918e4b69fd0772a8a7/9.0.0/platform009/651ee30
LIBGCC_BASE=/mnt/gvfs/third-party2/libgcc/4959b39cfbe5965a37c861c4c327fa7c5c759b87/9.x/platform009/9202ce7
GLIBC_BASE=/mnt/gvfs/third-party2/glibc/45ce3375cdc77ecb2520bbf8f0ecddd3f98efd7a/2.30/platform009/f259413
SNAPPY_BASE=/mnt/gvfs/third-party2/snappy/be4de3205e029101b18aa8103daa696c2bef3b19/1.1.3/platform009/7f3b187
ZLIB_BASE=/mnt/gvfs/third-party2/zlib/3c160ac5c67e257501e24c6c1d00ad5e01d73db6/1.2.8/platform009/7f3b187
BZIP2_BASE=/mnt/gvfs/third-party2/bzip2/73a237ac5bc0a5f5d67b39b8d253cfebaab88684/1.0.6/platform009/7f3b187
LZ4_BASE=/mnt/gvfs/third-party2/lz4/6ca38d3c390be2774d61a300f151464bbd632d62/1.9.1/platform009/7f3b187
ZSTD_BASE=/mnt/gvfs/third-party2/zstd/64c58a207d2495e83abc57a500a956df09b79a7c/1.4.x/platform009/ba86d1f
GFLAGS_BASE=/mnt/gvfs/third-party2/gflags/824d0a8a5abb5b121afd1b35fc3896407ea50092/2.2.0/platform009/7f3b187
JEMALLOC_BASE=/mnt/gvfs/third-party2/jemalloc/b62912d333ef33f9760efa6219dbe3fe6abb3b0e/master/platform009/c305944
NUMA_BASE=/mnt/gvfs/third-party2/numa/0af65f71e23a67bf65dc91b11f95caa39325c432/2.0.11/platform009/7f3b187
LIBUNWIND_BASE=/mnt/gvfs/third-party2/libunwind/02486dac347645d31dce116f44e1de3177315be2/1.4/platform009/5191652
TBB_BASE=/mnt/gvfs/third-party2/tbb/2e0ec671e550bfca347300bf3f789d9c0fff24ad/2018_U5/platform009/7f3b187
LIBURING_BASE=/mnt/gvfs/third-party2/liburing/70dbd9cfee63a25611417d09433a86d7711b3990/20200729/platform009/7f3b187
KERNEL_HEADERS_BASE=/mnt/gvfs/third-party2/kernel-headers/32b8a2407b634df3f8f948ba373fc4acc6a18296/fb/platform009/da39a3e
BINUTILS_BASE=/mnt/gvfs/third-party2/binutils/08634589372fa5f237bfd374e8c644a8364e78c1/2.32/platform009/ba86d1f/
VALGRIND_BASE=/mnt/gvfs/third-party2/valgrind/6ae525939ad02e5e676855082fbbc7828dbafeac/3.15.0/platform009/7f3b187
LUA_BASE=/mnt/gvfs/third-party2/lua/162efd9561a3d21f6869f4814011e9cf1b3ff4dc/5.3.4/platform009/a6271c4
BENCHMARK_BASE=/mnt/gvfs/third-party2/benchmark/30bf49ad6414325e17f3425b0edcb64239427ae3/1.6.1/platform009/7f3b187
GLOG_BASE=/mnt/gvfs/third-party2/glog/32d751bd5673375b438158717ab6a57c1cc57e3d/0.3.2_fb/platform009/10a364d

@ -147,7 +147,7 @@ else
fi
CFLAGS+=" $DEPS_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DHAVE_SSE42"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT"
CXXFLAGS+=" $CFLAGS"
EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS"

@ -1,170 +0,0 @@
#!/bin/sh
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#
# Set environment variables so that we can compile rocksdb using
# fbcode settings. It uses the latest g++ and clang compilers and also
# uses jemalloc
# Environment variables that change the behavior of this script:
# PIC_BUILD -- if true, it will only take PIC versions of libraries from fbcode. Libraries that don't have a PIC variant will not be included
BASEDIR=`dirname $BASH_SOURCE`
source "$BASEDIR/dependencies_platform009.sh"
CFLAGS=""
# libgcc
LIBGCC_INCLUDE="$LIBGCC_BASE/include/c++/9.3.0 -I $LIBGCC_BASE/include/c++/9.3.0/backward"
LIBGCC_LIBS=" -L $LIBGCC_BASE/lib"
# glibc
GLIBC_INCLUDE="$GLIBC_BASE/include"
GLIBC_LIBS=" -L $GLIBC_BASE/lib"
if test -z $PIC_BUILD; then
MAYBE_PIC=
else
MAYBE_PIC=_pic
fi
if ! test $ROCKSDB_DISABLE_SNAPPY; then
# snappy
SNAPPY_INCLUDE=" -I $SNAPPY_BASE/include/"
SNAPPY_LIBS=" $SNAPPY_BASE/lib/libsnappy${MAYBE_PIC}.a"
CFLAGS+=" -DSNAPPY"
fi
if ! test $ROCKSDB_DISABLE_ZLIB; then
# location of zlib headers and libraries
ZLIB_INCLUDE=" -I $ZLIB_BASE/include/"
ZLIB_LIBS=" $ZLIB_BASE/lib/libz${MAYBE_PIC}.a"
CFLAGS+=" -DZLIB"
fi
if ! test $ROCKSDB_DISABLE_BZIP; then
# location of bzip headers and libraries
BZIP_INCLUDE=" -I $BZIP2_BASE/include/"
BZIP_LIBS=" $BZIP2_BASE/lib/libbz2${MAYBE_PIC}.a"
CFLAGS+=" -DBZIP2"
fi
if ! test $ROCKSDB_DISABLE_LZ4; then
LZ4_INCLUDE=" -I $LZ4_BASE/include/"
LZ4_LIBS=" $LZ4_BASE/lib/liblz4${MAYBE_PIC}.a"
CFLAGS+=" -DLZ4"
fi
if ! test $ROCKSDB_DISABLE_ZSTD; then
ZSTD_INCLUDE=" -I $ZSTD_BASE/include/"
ZSTD_LIBS=" $ZSTD_BASE/lib/libzstd${MAYBE_PIC}.a"
CFLAGS+=" -DZSTD"
fi
# location of gflags headers and libraries
GFLAGS_INCLUDE=" -I $GFLAGS_BASE/include/"
GFLAGS_LIBS=" $GFLAGS_BASE/lib/libgflags${MAYBE_PIC}.a"
CFLAGS+=" -DGFLAGS=gflags"
BENCHMARK_INCLUDE=" -I $BENCHMARK_BASE/include/"
BENCHMARK_LIBS=" $BENCHMARK_BASE/lib/libbenchmark${MAYBE_PIC}.a"
GLOG_INCLUDE=" -I $GLOG_BASE/include/"
GLOG_LIBS=" $GLOG_BASE/lib/libglog${MAYBE_PIC}.a"
# location of jemalloc
JEMALLOC_INCLUDE=" -I $JEMALLOC_BASE/include/"
JEMALLOC_LIB=" $JEMALLOC_BASE/lib/libjemalloc${MAYBE_PIC}.a"
# location of numa
NUMA_INCLUDE=" -I $NUMA_BASE/include/"
NUMA_LIB=" $NUMA_BASE/lib/libnuma${MAYBE_PIC}.a"
CFLAGS+=" -DNUMA"
# location of libunwind
LIBUNWIND="$LIBUNWIND_BASE/lib/libunwind${MAYBE_PIC}.a"
# location of TBB
TBB_INCLUDE=" -isystem $TBB_BASE/include/"
TBB_LIBS="$TBB_BASE/lib/libtbb${MAYBE_PIC}.a"
CFLAGS+=" -DTBB"
# location of LIBURING
LIBURING_INCLUDE=" -isystem $LIBURING_BASE/include/"
LIBURING_LIBS="$LIBURING_BASE/lib/liburing${MAYBE_PIC}.a"
CFLAGS+=" -DLIBURING"
test "$USE_SSE" || USE_SSE=1
export USE_SSE
test "$PORTABLE" || PORTABLE=1
export PORTABLE
BINUTILS="$BINUTILS_BASE/bin"
AR="$BINUTILS/ar"
AS="$BINUTILS/as"
DEPS_INCLUDE="$SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $ZSTD_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE $TBB_INCLUDE $LIBURING_INCLUDE $BENCHMARK_INCLUDE $GLOG_INCLUDE"
STDLIBS="-L $GCC_BASE/lib64"
CLANG_BIN="$CLANG_BASE/bin"
CLANG_LIB="$CLANG_BASE/lib"
CLANG_SRC="$CLANG_BASE/../../src"
CLANG_ANALYZER="$CLANG_BIN/clang++"
CLANG_SCAN_BUILD="$CLANG_SRC/llvm/clang/tools/scan-build/bin/scan-build"
if [ -z "$USE_CLANG" ]; then
# gcc
CC="$GCC_BASE/bin/gcc"
CXX="$GCC_BASE/bin/g++"
AR="$GCC_BASE/bin/gcc-ar"
CFLAGS+=" -B$BINUTILS"
CFLAGS+=" -isystem $LIBGCC_INCLUDE"
CFLAGS+=" -isystem $GLIBC_INCLUDE"
JEMALLOC=1
else
# clang
CLANG_INCLUDE="$CLANG_LIB/clang/stable/include"
CC="$CLANG_BIN/clang"
CXX="$CLANG_BIN/clang++"
AR="$CLANG_BIN/llvm-ar"
KERNEL_HEADERS_INCLUDE="$KERNEL_HEADERS_BASE/include"
CFLAGS+=" -B$BINUTILS -nostdinc -nostdlib"
CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/9.x "
CFLAGS+=" -isystem $LIBGCC_BASE/include/c++/9.x/x86_64-facebook-linux "
CFLAGS+=" -isystem $GLIBC_INCLUDE"
CFLAGS+=" -isystem $LIBGCC_INCLUDE"
CFLAGS+=" -isystem $CLANG_INCLUDE"
CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux "
CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE "
CFLAGS+=" -Wno-expansion-to-defined "
CXXFLAGS="-nostdinc++"
fi
CFLAGS+=" $DEPS_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DHAVE_SSE42 -DROCKSDB_IOURING_PRESENT"
CXXFLAGS+=" $CFLAGS"
EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS $LIBURING_LIBS $BENCHMARK_LIBS"
EXEC_LDFLAGS+=" -Wl,--dynamic-linker,/usr/local/fbcode/platform009/lib/ld.so"
EXEC_LDFLAGS+=" $LIBUNWIND"
EXEC_LDFLAGS+=" -Wl,-rpath=/usr/local/fbcode/platform009/lib"
EXEC_LDFLAGS+=" -Wl,-rpath=$GCC_BASE/lib64"
# required by libtbb
EXEC_LDFLAGS+=" -ldl"
PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS $STDLIBS -lgcc -lstdc++"
PLATFORM_LDFLAGS+=" -B$BINUTILS"
EXEC_LDFLAGS_SHARED="$SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $TBB_LIBS $LIBURING_LIBS $BENCHMARK_LIBS"
VALGRIND_VER="$VALGRIND_BASE/bin/"
# lua not supported because it's on track for deprecation, I think
LUA_PATH=
LUA_LIB=
export CC CXX AR AS CFLAGS CXXFLAGS EXEC_LDFLAGS EXEC_LDFLAGS_SHARED VALGRIND_VER JEMALLOC_LIB JEMALLOC_INCLUDE CLANG_ANALYZER CLANG_SCAN_BUILD LUA_PATH LUA_LIB

@ -154,7 +154,7 @@ CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE/linux "
CFLAGS+=" -isystem $KERNEL_HEADERS_INCLUDE "
CFLAGS+=" $DEPS_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DHAVE_SSE42 -DROCKSDB_IOURING_PRESENT"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -DROCKSDB_IOURING_PRESENT"
CXXFLAGS+=" $CFLAGS"
EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB $TBB_LIBS $LIBURING_LIBS $BENCHMARK_LIBS"

@ -360,7 +360,7 @@ function send_to_ods {
echo >&2 "ERROR: Key $key doesn't have a value."
return
fi
curl --silent "https://www.intern.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=$key&value=$value" \
curl --silent "https://www.facebook.com/intern/agent/ods_set.php?entity=rocksdb_build&key=$key&value=$value" \
--connect-timeout 60
}

@ -104,46 +104,3 @@ get_lib_base valgrind LATEST platform010
get_lib_base lua 5.3.4 platform010
git diff $OUTPUT
###########################################################
# platform009 dependencies #
###########################################################
OUTPUT="$BASEDIR/dependencies_platform009.sh"
rm -f "$OUTPUT"
touch "$OUTPUT"
echo "Writing dependencies to $OUTPUT"
# Compilers locations
GCC_BASE=`readlink -f $TP2_LATEST/gcc/9.x/centos7-native/*/`
CLANG_BASE=`readlink -f $TP2_LATEST/llvm-fb/9.0.0/platform009/*/`
log_header
log_variable GCC_BASE
log_variable CLANG_BASE
# Libraries locations
get_lib_base libgcc 9.x platform009
get_lib_base glibc 2.30 platform009
get_lib_base snappy LATEST platform009
get_lib_base zlib LATEST platform009
get_lib_base bzip2 LATEST platform009
get_lib_base lz4 LATEST platform009
get_lib_base zstd LATEST platform009
get_lib_base gflags LATEST platform009
get_lib_base jemalloc LATEST platform009
get_lib_base numa LATEST platform009
get_lib_base libunwind LATEST platform009
get_lib_base tbb 2018_U5 platform009
get_lib_base liburing LATEST platform009
get_lib_base benchmark LATEST platform009
get_lib_base kernel-headers fb platform009
get_lib_base binutils LATEST centos7-native
get_lib_base valgrind LATEST platform009
get_lib_base lua 5.3.4 platform009
git diff $OUTPUT

cache/cache.cc

@ -16,7 +16,8 @@
#include "util/string_util.h"
namespace ROCKSDB_NAMESPACE {
#ifndef ROCKSDB_LITE
const Cache::CacheItemHelper kNoopCacheItemHelper{};
static std::unordered_map<std::string, OptionTypeInfo>
lru_cache_options_type_info = {
{"capacity",
@ -64,7 +65,6 @@ static std::unordered_map<std::string, OptionTypeInfo>
OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
};
#endif // ROCKSDB_LITE
Status SecondaryCache::CreateFromString(
const ConfigOptions& config_options, const std::string& value,
@ -75,7 +75,6 @@ Status SecondaryCache::CreateFromString(
Status status;
std::shared_ptr<SecondaryCache> sec_cache;
#ifndef ROCKSDB_LITE
CompressedSecondaryCacheOptions sec_cache_opts;
status = OptionTypeInfo::ParseStruct(config_options, "",
&comp_sec_cache_options_type_info, "",
@ -84,19 +83,13 @@ Status SecondaryCache::CreateFromString(
sec_cache = NewCompressedSecondaryCache(sec_cache_opts);
}
#else
(void)config_options;
status = Status::NotSupported(
"Cannot load compressed secondary cache in LITE mode ", args);
#endif //! ROCKSDB_LITE
if (status.ok()) {
result->swap(sec_cache);
}
return status;
} else {
return LoadSharedObject<SecondaryCache>(config_options, value, nullptr,
result);
return LoadSharedObject<SecondaryCache>(config_options, value, result);
}
}
@ -108,7 +101,6 @@ Status Cache::CreateFromString(const ConfigOptions& config_options,
if (value.find('=') == std::string::npos) {
cache = NewLRUCache(ParseSizeT(value));
} else {
#ifndef ROCKSDB_LITE
LRUCacheOptions cache_opts;
status = OptionTypeInfo::ParseStruct(config_options, "",
&lru_cache_options_type_info, "",
@ -116,14 +108,51 @@ Status Cache::CreateFromString(const ConfigOptions& config_options,
if (status.ok()) {
cache = NewLRUCache(cache_opts);
}
#else
(void)config_options;
status = Status::NotSupported("Cannot load cache in LITE mode ", value);
#endif //! ROCKSDB_LITE
}
if (status.ok()) {
result->swap(cache);
}
return status;
}
bool Cache::AsyncLookupHandle::IsReady() {
return pending_handle == nullptr || pending_handle->IsReady();
}
bool Cache::AsyncLookupHandle::IsPending() { return pending_handle != nullptr; }
Cache::Handle* Cache::AsyncLookupHandle::Result() {
assert(!IsPending());
return result_handle;
}
void Cache::StartAsyncLookup(AsyncLookupHandle& async_handle) {
async_handle.found_dummy_entry = false; // in case re-used
assert(!async_handle.IsPending());
async_handle.result_handle =
Lookup(async_handle.key, async_handle.helper, async_handle.create_context,
async_handle.priority, async_handle.stats);
}
Cache::Handle* Cache::Wait(AsyncLookupHandle& async_handle) {
WaitAll(&async_handle, 1);
return async_handle.Result();
}
void Cache::WaitAll(AsyncLookupHandle* async_handles, size_t count) {
for (size_t i = 0; i < count; ++i) {
if (async_handles[i].IsPending()) {
// If a pending handle gets here, it should be marked as "to be handled
// by a caller" by that caller erasing the pending_cache on it.
assert(async_handles[i].pending_cache == nullptr);
}
}
}
void Cache::SetEvictionCallback(EvictionCallback&& fn) {
// Overwriting non-empty with non-empty could indicate a bug
assert(!eviction_callback_ || !fn);
eviction_callback_ = std::move(fn);
}
} // namespace ROCKSDB_NAMESPACE
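For orientation, the async lookup API added above is used by filling an AsyncLookupHandle, starting the lookup, and waiting on the result. A hedged sketch, with field and method names taken from the code above; the cache, key, and helper are assumed to exist:

// Sketch: one asynchronous lookup (the base-class implementation above
// actually completes it synchronously inside StartAsyncLookup).
Cache::AsyncLookupHandle async;
async.key = key;                        // Slice to look up
async.helper = &helper;                 // CacheItemHelper for this entry type
async.create_context = nullptr;
async.priority = Cache::Priority::LOW;
async.stats = nullptr;
cache->StartAsyncLookup(async);
Cache::Handle* h = cache->Wait(async);  // WaitAll(&async, 1) + Result()
if (h != nullptr) {
  // ... use cache->Value(h) ...
  cache->Release(h);
}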

@ -16,7 +16,7 @@
#include "db/db_impl/db_impl.h"
#include "monitoring/histogram.h"
#include "port/port.h"
#include "rocksdb/cache.h"
#include "rocksdb/advanced_cache.h"
#include "rocksdb/convenience.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
@ -50,7 +50,7 @@ DEFINE_double(resident_ratio, 0.25,
DEFINE_uint64(ops_per_thread, 2000000U, "Number of operations per thread.");
DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added.");
DEFINE_uint32(skew, 5, "Degree of skew in key selection");
DEFINE_uint32(skew, 5, "Degree of skew in key selection. 0 = no skew");
DEFINE_bool(populate_cache, true, "Populate cache before operations");
DEFINE_uint32(lookup_insert_percent, 87,
@ -71,17 +71,23 @@ DEFINE_uint32(
DEFINE_uint32(gather_stats_entries_per_lock, 256,
"For Cache::ApplyToAllEntries");
DEFINE_bool(skewed, false, "If true, skew the key access distribution");
DEFINE_bool(lean, false,
"If true, no additional computation is performed besides cache "
"operations.");
#ifndef ROCKSDB_LITE
DEFINE_bool(early_exit, false,
"Exit before deallocating most memory. Good for malloc stats, e.g."
"MALLOC_CONF=\"stats_print:true\"");
DEFINE_bool(histograms, true,
"Whether to track and print histogram statistics.");
DEFINE_uint32(seed, 0, "Hashing/random seed to use. 0 = choose at random");
DEFINE_string(secondary_cache_uri, "",
"Full URI for creating a custom secondary cache object");
static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
#endif // ROCKSDB_LITE
DEFINE_string(cache_type, "lru_cache", "Type of block cache.");
@ -147,9 +153,6 @@ class SharedState {
public:
explicit SharedState(CacheBench* cache_bench)
: cv_(&mu_),
num_initialized_(0),
start_(false),
num_done_(0),
cache_bench_(cache_bench) {}
~SharedState() {}
@ -172,15 +175,27 @@ class SharedState {
bool Started() const { return start_; }
void AddLookupStats(uint64_t hits, uint64_t misses) {
MutexLock l(&mu_);
lookup_count_ += hits + misses;
lookup_hits_ += hits;
}
double GetLookupHitRatio() const {
return 1.0 * lookup_hits_ / lookup_count_;
}
private:
port::Mutex mu_;
port::CondVar cv_;
uint64_t num_initialized_;
bool start_;
uint64_t num_done_;
CacheBench* cache_bench_;
uint64_t num_initialized_ = 0;
bool start_ = false;
uint64_t num_done_ = 0;
uint64_t lookup_count_ = 0;
uint64_t lookup_hits_ = 0;
};
// Per-thread state for concurrent executions of the same benchmark.
@ -192,27 +207,19 @@ struct ThreadState {
uint64_t duration_us = 0;
ThreadState(uint32_t index, SharedState* _shared)
: tid(index), rnd(1000 + index), shared(_shared) {}
: tid(index), rnd(FLAGS_seed + 1 + index), shared(_shared) {}
};
struct KeyGen {
char key_data[27];
Slice GetRand(Random64& rnd, uint64_t max_key, int max_log) {
uint64_t key = 0;
if (!FLAGS_skewed) {
uint64_t raw = rnd.Next();
// Skew according to setting
for (uint32_t i = 0; i < FLAGS_skew; ++i) {
raw = std::min(raw, rnd.Next());
}
key = FastRange64(raw, max_key);
} else {
key = rnd.Skewed(max_log);
if (key > max_key) {
key -= max_key;
}
Slice GetRand(Random64& rnd, uint64_t max_key, uint32_t skew) {
uint64_t raw = rnd.Next();
// Skew according to setting
for (uint32_t i = 0; i < skew; ++i) {
raw = std::min(raw, rnd.Next());
}
uint64_t key = FastRange64(raw, max_key);
// Variable size and alignment
size_t off = key % 8;
key_data[0] = char{42};
@ -226,7 +233,7 @@ struct KeyGen {
}
};
char* createValue(Random64& rnd) {
Cache::ObjectPtr createValue(Random64& rnd) {
char* rv = new char[FLAGS_value_bytes];
// Fill with some filler data, and take some CPU time
for (uint32_t i = 0; i < FLAGS_value_bytes; i += 8) {
@ -236,28 +243,36 @@ char* createValue(Random64& rnd) {
}
// Callbacks for secondary cache
size_t SizeFn(void* /*obj*/) { return FLAGS_value_bytes; }
size_t SizeFn(Cache::ObjectPtr /*obj*/) { return FLAGS_value_bytes; }
Status SaveToFn(void* obj, size_t /*offset*/, size_t size, void* out) {
memcpy(out, obj, size);
Status SaveToFn(Cache::ObjectPtr from_obj, size_t /*from_offset*/,
size_t length, char* out) {
memcpy(out, from_obj, length);
return Status::OK();
}
// Different deleters to simulate using deleter to gather
// stats on the code origin and kind of cache entries.
void deleter1(const Slice& /*key*/, void* value) {
delete[] static_cast<char*>(value);
}
void deleter2(const Slice& /*key*/, void* value) {
delete[] static_cast<char*>(value);
}
void deleter3(const Slice& /*key*/, void* value) {
Status CreateFn(const Slice& data, Cache::CreateContext* /*context*/,
MemoryAllocator* /*allocator*/, Cache::ObjectPtr* out_obj,
size_t* out_charge) {
*out_obj = new char[data.size()];
memcpy(*out_obj, data.data(), data.size());
*out_charge = data.size();
return Status::OK();
};
void DeleteFn(Cache::ObjectPtr value, MemoryAllocator* /*alloc*/) {
delete[] static_cast<char*>(value);
}
Cache::CacheItemHelper helper1(SizeFn, SaveToFn, deleter1);
Cache::CacheItemHelper helper2(SizeFn, SaveToFn, deleter2);
Cache::CacheItemHelper helper3(SizeFn, SaveToFn, deleter3);
Cache::CacheItemHelper helper1_wos(CacheEntryRole::kDataBlock, DeleteFn);
Cache::CacheItemHelper helper1(CacheEntryRole::kDataBlock, DeleteFn, SizeFn,
SaveToFn, CreateFn, &helper1_wos);
Cache::CacheItemHelper helper2_wos(CacheEntryRole::kIndexBlock, DeleteFn);
Cache::CacheItemHelper helper2(CacheEntryRole::kIndexBlock, DeleteFn, SizeFn,
SaveToFn, CreateFn, &helper2_wos);
Cache::CacheItemHelper helper3_wos(CacheEntryRole::kFilterBlock, DeleteFn);
Cache::CacheItemHelper helper3(CacheEntryRole::kFilterBlock, DeleteFn, SizeFn,
SaveToFn, CreateFn, &helper3_wos);
} // namespace
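The helperN/helperN_wos pairs above illustrate the new pattern: a full helper carries size/save/create callbacks for secondary-cache support and points at a reduced variant (apparently "without secondary") holding only the role and deleter. A hedged sketch of defining and using such a pair, with constructor argument order and call signatures mirrored from this file (the role is illustrative):

// Sketch: a full helper plus its reduced fallback.
Cache::CacheItemHelper my_helper_wos(CacheEntryRole::kMisc, DeleteFn);
Cache::CacheItemHelper my_helper(CacheEntryRole::kMisc, DeleteFn, SizeFn,
                                 SaveToFn, CreateFn, &my_helper_wos);

// Insert and look up with the same helper, as the benchmark does below:
// Status s = cache_->Insert(key, createValue(rnd), &my_helper, FLAGS_value_bytes);
// Cache::Handle* h = cache_->Lookup(key, &my_helper, /*context*/ nullptr,
//                                   Cache::Priority::LOW);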
class CacheBench {
@ -275,32 +290,25 @@ class CacheBench {
lookup_threshold_(insert_threshold_ +
kHundredthUint64 * FLAGS_lookup_percent),
erase_threshold_(lookup_threshold_ +
kHundredthUint64 * FLAGS_erase_percent),
skewed_(FLAGS_skewed) {
kHundredthUint64 * FLAGS_erase_percent) {
if (erase_threshold_ != 100U * kHundredthUint64) {
fprintf(stderr, "Percentages must add to 100.\n");
exit(1);
}
max_log_ = 0;
if (skewed_) {
uint64_t max_key = max_key_;
while (max_key >>= 1) max_log_++;
if (max_key > (static_cast<uint64_t>(1) << max_log_)) max_log_++;
}
if (FLAGS_cache_type == "clock_cache") {
fprintf(stderr, "Old clock cache implementation has been removed.\n");
exit(1);
} else if (FLAGS_cache_type == "hyper_clock_cache") {
cache_ = HyperClockCacheOptions(FLAGS_cache_size, FLAGS_value_bytes,
FLAGS_num_shard_bits)
.MakeSharedCache();
HyperClockCacheOptions opts(FLAGS_cache_size, FLAGS_value_bytes,
FLAGS_num_shard_bits);
opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX);
cache_ = opts.MakeSharedCache();
} else if (FLAGS_cache_type == "lru_cache") {
LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits,
false /* strict_capacity_limit */,
0.5 /* high_pri_pool_ratio */);
#ifndef ROCKSDB_LITE
opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX);
if (!FLAGS_secondary_cache_uri.empty()) {
Status s = SecondaryCache::CreateFromString(
ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache);
@ -313,7 +321,6 @@ class CacheBench {
}
opts.secondary_cache = secondary_cache;
}
#endif // ROCKSDB_LITE
cache_ = NewLRUCache(opts);
} else {
@ -325,13 +332,50 @@ class CacheBench {
~CacheBench() {}
void PopulateCache() {
Random64 rnd(1);
Random64 rnd(FLAGS_seed);
KeyGen keygen;
for (uint64_t i = 0; i < 2 * FLAGS_cache_size; i += FLAGS_value_bytes) {
Status s = cache_->Insert(keygen.GetRand(rnd, max_key_, max_log_),
createValue(rnd), &helper1, FLAGS_value_bytes);
size_t max_occ = 0;
size_t inserts_since_max_occ_increase = 0;
size_t keys_since_last_not_found = 0;
// Avoid redundant insertions by checking Lookup before Insert.
// Loop until insertions consistently fail to increase max occupancy or
// it becomes difficult to find keys not already inserted.
while (inserts_since_max_occ_increase < 100 &&
keys_since_last_not_found < 100) {
Slice key = keygen.GetRand(rnd, max_key_, FLAGS_skew);
Cache::Handle* handle = cache_->Lookup(key);
if (handle != nullptr) {
cache_->Release(handle);
++keys_since_last_not_found;
continue;
}
keys_since_last_not_found = 0;
Status s =
cache_->Insert(key, createValue(rnd), &helper1, FLAGS_value_bytes);
assert(s.ok());
handle = cache_->Lookup(key);
if (!handle) {
fprintf(stderr, "Failed to lookup key just inserted.\n");
assert(false);
exit(42);
} else {
cache_->Release(handle);
}
size_t occ = cache_->GetOccupancyCount();
if (occ > max_occ) {
max_occ = occ;
inserts_since_max_occ_increase = 0;
} else {
++inserts_since_max_occ_increase;
}
}
printf("Population complete (%zu entries, %g average charge)\n", max_occ,
1.0 * FLAGS_cache_size / max_occ);
}
bool Run() {
@ -390,18 +434,21 @@ class CacheBench {
FLAGS_ops_per_thread / elapsed_secs);
printf("Thread ops/sec = %u\n", ops_per_sec);
printf("\nOperation latency (ns):\n");
HistogramImpl combined;
for (uint32_t i = 0; i < FLAGS_threads; i++) {
combined.Merge(threads[i]->latency_ns_hist);
}
printf("%s", combined.ToString().c_str());
printf("Lookup hit ratio: %g\n", shared.GetLookupHitRatio());
if (FLAGS_gather_stats) {
printf("\nGather stats latency (us):\n");
printf("%s", stats_hist.ToString().c_str());
}
if (FLAGS_histograms) {
printf("\nOperation latency (ns):\n");
HistogramImpl combined;
for (uint32_t i = 0; i < FLAGS_threads; i++) {
combined.Merge(threads[i]->latency_ns_hist);
}
printf("%s", combined.ToString().c_str());
if (FLAGS_gather_stats) {
printf("\nGather stats latency (us):\n");
printf("%s", stats_hist.ToString().c_str());
}
}
printf("\n%s", stats_report.c_str());
return true;
@ -415,8 +462,6 @@ class CacheBench {
const uint64_t insert_threshold_;
const uint64_t lookup_threshold_;
const uint64_t erase_threshold_;
const bool skewed_;
int max_log_;
// A benchmark version of gathering stats on an active block cache by
// iterating over it. The primary purpose is to measure the impact of
@ -436,7 +481,7 @@ class CacheBench {
uint64_t total_entry_count = 0;
uint64_t table_occupancy = 0;
uint64_t table_size = 0;
std::set<Cache::DeleterFn> deleters;
std::set<const Cache::CacheItemHelper*> helpers;
StopWatchNano timer(clock);
for (;;) {
@ -461,7 +506,7 @@ class CacheBench {
<< BytesToHumanString(static_cast<uint64_t>(
1.0 * total_charge / total_entry_count))
<< "\n"
<< "Unique deleters: " << deleters.size() << "\n";
<< "Unique helpers: " << helpers.size() << "\n";
*stats_report = ostr.str();
return;
}
@ -477,22 +522,26 @@ class CacheBench {
total_key_size = 0;
total_charge = 0;
total_entry_count = 0;
deleters.clear();
auto fn = [&](const Slice& key, void* /*value*/, size_t charge,
Cache::DeleterFn deleter) {
helpers.clear();
auto fn = [&](const Slice& key, Cache::ObjectPtr /*value*/, size_t charge,
const Cache::CacheItemHelper* helper) {
total_key_size += key.size();
total_charge += charge;
++total_entry_count;
// Something slightly more expensive as in (future) stats by category
deleters.insert(deleter);
// Something slightly more expensive as in stats by category
helpers.insert(helper);
};
timer.Start();
if (FLAGS_histograms) {
timer.Start();
}
Cache::ApplyToAllEntriesOptions opts;
opts.average_entries_per_lock = FLAGS_gather_stats_entries_per_lock;
shared->GetCacheBench()->cache_->ApplyToAllEntries(fn, opts);
table_occupancy = shared->GetCacheBench()->cache_->GetOccupancyCount();
table_size = shared->GetCacheBench()->cache_->GetTableAddressCount();
stats_hist->Add(timer.ElapsedNanos() / 1000);
if (FLAGS_histograms) {
stats_hist->Add(timer.ElapsedNanos() / 1000);
}
}
}
@ -523,6 +572,8 @@ class CacheBench {
void OperateCache(ThreadState* thread) {
// To use looked-up values
uint64_t result = 0;
uint64_t lookup_misses = 0;
uint64_t lookup_hits = 0;
// To hold handles for a non-trivial amount of time
Cache::Handle* handle = nullptr;
KeyGen gen;
@ -531,18 +582,12 @@ class CacheBench {
StopWatchNano timer(clock);
for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
Slice key = gen.GetRand(thread->rnd, max_key_, max_log_);
Slice key = gen.GetRand(thread->rnd, max_key_, FLAGS_skew);
uint64_t random_op = thread->rnd.Next();
Cache::CreateCallback create_cb = [](const void* buf, size_t size,
void** out_obj,
size_t* charge) -> Status {
*out_obj = reinterpret_cast<void*>(new char[size]);
memcpy(*out_obj, buf, size);
*charge = size;
return Status::OK();
};
timer.Start();
if (FLAGS_histograms) {
timer.Start();
}
if (random_op < lookup_insert_threshold_) {
if (handle) {
@ -550,15 +595,17 @@ class CacheBench {
handle = nullptr;
}
// do lookup
handle = cache_->Lookup(key, &helper2, create_cb, Cache::Priority::LOW,
true);
handle = cache_->Lookup(key, &helper2, /*context*/ nullptr,
Cache::Priority::LOW);
if (handle) {
++lookup_hits;
if (!FLAGS_lean) {
// do something with the data
result += NPHash64(static_cast<char*>(cache_->Value(handle)),
FLAGS_value_bytes);
}
} else {
++lookup_misses;
// do insert
Status s = cache_->Insert(key, createValue(thread->rnd), &helper2,
FLAGS_value_bytes, &handle);
@ -579,14 +626,17 @@ class CacheBench {
handle = nullptr;
}
// do lookup
handle = cache_->Lookup(key, &helper2, create_cb, Cache::Priority::LOW,
true);
handle = cache_->Lookup(key, &helper2, /*context*/ nullptr,
Cache::Priority::LOW);
if (handle) {
++lookup_hits;
if (!FLAGS_lean) {
// do something with the data
result += NPHash64(static_cast<char*>(cache_->Value(handle)),
FLAGS_value_bytes);
}
} else {
++lookup_misses;
}
} else if (random_op < erase_threshold_) {
// do erase
@ -595,7 +645,14 @@ class CacheBench {
// Should be extremely unlikely (noop)
assert(random_op >= kHundredthUint64 * 100U);
}
thread->latency_ns_hist.Add(timer.ElapsedNanos());
if (FLAGS_histograms) {
thread->latency_ns_hist.Add(timer.ElapsedNanos());
}
thread->shared->AddLookupStats(lookup_hits, lookup_misses);
}
if (FLAGS_early_exit) {
MutexLock l(thread->shared->GetMutex());
exit(0);
}
if (handle) {
cache_->Release(handle);
@ -617,6 +674,7 @@ class CacheBench {
#ifndef NDEBUG
printf("WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
#endif
printf("----------------------------\n");
printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion);
printf("DMutex impl name : %s\n", DMutex::kName());
printf("Number of threads : %u\n", FLAGS_threads);
@ -956,11 +1014,14 @@ int cache_bench_tool(int argc, char** argv) {
exit(1);
}
if (FLAGS_seed == 0) {
FLAGS_seed = static_cast<uint32_t>(port::GetProcessID());
printf("Using seed = %" PRIu32 "\n", FLAGS_seed);
}
ROCKSDB_NAMESPACE::CacheBench bench;
if (FLAGS_populate_cache) {
bench.PopulateCache();
printf("Population complete\n");
printf("----------------------------\n");
}
if (bench.Run()) {
return 0;

@ -101,34 +101,4 @@ std::string BlockCacheEntryStatsMapKeys::UsedPercent(CacheEntryRole role) {
return GetPrefixedCacheEntryRoleName(kPrefix, role);
}
namespace {
struct Registry {
std::mutex mutex;
UnorderedMap<Cache::DeleterFn, CacheEntryRole> role_map;
void Register(Cache::DeleterFn fn, CacheEntryRole role) {
std::lock_guard<std::mutex> lock(mutex);
role_map[fn] = role;
}
UnorderedMap<Cache::DeleterFn, CacheEntryRole> Copy() {
std::lock_guard<std::mutex> lock(mutex);
return role_map;
}
};
Registry& GetRegistry() {
STATIC_AVOID_DESTRUCTION(Registry, registry);
return registry;
}
} // namespace
void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role) {
GetRegistry().Register(fn, role);
}
UnorderedMap<Cache::DeleterFn, CacheEntryRole> CopyCacheDeleterRoleMap() {
return GetRegistry().Copy();
}
} // namespace ROCKSDB_NAMESPACE

@ -7,11 +7,8 @@
#include <array>
#include <cstdint>
#include <memory>
#include <type_traits>
#include "rocksdb/cache.h"
#include "util/hash_containers.h"
namespace ROCKSDB_NAMESPACE {
@ -20,84 +17,4 @@ extern std::array<std::string, kNumCacheEntryRoles>
extern std::array<std::string, kNumCacheEntryRoles>
kCacheEntryRoleToHyphenString;
// To associate cache entries with their role, we use a hack on the
// existing Cache interface. Because the deleter of an entry can authenticate
// the code origin of an entry, we can elaborate the choice of deleter to
// also encode role information, without inferring false role information
// from entries not choosing to encode a role.
//
// The rest of this file is for handling mappings between deleters and
// roles.
// To infer a role from a deleter, the deleter must be registered. This
// can be done "manually" with this function. This function is thread-safe,
// and the registration mappings go into private but static storage. (Note
// that DeleterFn is a function pointer, not std::function. Registrations
// should not be too many.)
void RegisterCacheDeleterRole(Cache::DeleterFn fn, CacheEntryRole role);
// Gets a copy of the registered deleter -> role mappings. This is the only
// function for reading the mappings made with RegisterCacheDeleterRole.
// Why only this interface for reading?
// * This function has to be thread safe, which could incur substantial
// overhead. We should not pay this overhead for every deleter look-up.
// * This is suitable for preparing for batch operations, like with
// CacheEntryStatsCollector.
// * The number of mappings should be sufficiently small (dozens).
UnorderedMap<Cache::DeleterFn, CacheEntryRole> CopyCacheDeleterRoleMap();
// ************************************************************** //
// An automatic registration infrastructure. This enables code
// to simply ask for a deleter associated with a particular type
// and role, and registration is automatic. In a sense, this is
// a small dependency injection infrastructure, because linking
// in new deleter instantiations is essentially sufficient for
// making stats collection (using CopyCacheDeleterRoleMap) aware
// of them.
namespace cache_entry_roles_detail {
template <typename T, CacheEntryRole R>
struct RegisteredDeleter {
RegisteredDeleter() { RegisterCacheDeleterRole(Delete, R); }
// These have global linkage to help ensure compiler optimizations do not
// break uniqueness for each <T,R>
static void Delete(const Slice& /* key */, void* value) {
// Supports T == Something[], unlike delete operator
std::default_delete<T>()(
static_cast<typename std::remove_extent<T>::type*>(value));
}
};
template <CacheEntryRole R>
struct RegisteredNoopDeleter {
RegisteredNoopDeleter() { RegisterCacheDeleterRole(Delete, R); }
static void Delete(const Slice& /* key */, void* /* value */) {
// Here was `assert(value == nullptr);` but we can also put pointers
// to static data in Cache, for testing at least.
}
};
} // namespace cache_entry_roles_detail
// Get an automatically registered deleter for value type T and role R.
// Based on C++ semantics, registration is invoked exactly once in a
// thread-safe way on first call to this function, for each <T, R>.
template <typename T, CacheEntryRole R>
Cache::DeleterFn GetCacheEntryDeleterForRole() {
static cache_entry_roles_detail::RegisteredDeleter<T, R> reg;
return reg.Delete;
}
// Get an automatically registered no-op deleter (value should be nullptr)
// and associated with role R. This is used for Cache "reservation" entries
// such as for WriteBufferManager.
template <CacheEntryRole R>
Cache::DeleterFn GetNoopDeleterForRole() {
static cache_entry_roles_detail::RegisteredNoopDeleter<R> reg;
return reg.Delete;
}
} // namespace ROCKSDB_NAMESPACE
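Everything deleted above existed to map deleter function pointers back to roles. With CacheItemHelper the role is a plain member of the helper itself, so the registry, the copy function, and the template registration machinery all become unnecessary. A minimal sketch of the replacement pattern (names are illustrative):

// The role now travels with the helper instead of a registered side table.
static void DeleteValue(Cache::ObjectPtr value, MemoryAllocator* /*alloc*/) {
  delete[] static_cast<char*>(value);
}
static const Cache::CacheItemHelper kMyHelper{CacheEntryRole::kFilterBlock,
                                              &DeleteValue};
// Stats collection can read the role directly off the helper seen during
// ApplyToAllEntries, with no lookup map involved.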

@ -10,8 +10,8 @@
#include <memory>
#include <mutex>
#include "cache/cache_helpers.h"
#include "cache/cache_key.h"
#include "cache/typed_cache.h"
#include "port/lang.h"
#include "rocksdb/cache.h"
#include "rocksdb/status.h"
@ -111,11 +111,14 @@ class CacheEntryStatsCollector {
// Gets or creates a shared instance of CacheEntryStatsCollector in the
// cache itself, and saves into `ptr`. This shared_ptr will hold the
// entry in cache until all refs are destroyed.
static Status GetShared(Cache *cache, SystemClock *clock,
static Status GetShared(Cache *raw_cache, SystemClock *clock,
std::shared_ptr<CacheEntryStatsCollector> *ptr) {
const Slice &cache_key = GetCacheKey();
assert(raw_cache);
BasicTypedCacheInterface<CacheEntryStatsCollector, CacheEntryRole::kMisc>
cache{raw_cache};
Cache::Handle *h = cache->Lookup(cache_key);
const Slice &cache_key = GetCacheKey();
auto h = cache.Lookup(cache_key);
if (h == nullptr) {
// Not yet in cache, but Cache doesn't provide a built-in way to
// avoid racing insert. So we double-check under a shared mutex,
@ -123,15 +126,15 @@ class CacheEntryStatsCollector {
STATIC_AVOID_DESTRUCTION(std::mutex, static_mutex);
std::lock_guard<std::mutex> lock(static_mutex);
h = cache->Lookup(cache_key);
h = cache.Lookup(cache_key);
if (h == nullptr) {
auto new_ptr = new CacheEntryStatsCollector(cache, clock);
auto new_ptr = new CacheEntryStatsCollector(cache.get(), clock);
// TODO: non-zero charge causes some tests that count block cache
// usage to go flaky. Fix the problem somehow so we can use an
// accurate charge.
size_t charge = 0;
Status s = cache->Insert(cache_key, new_ptr, charge, Deleter, &h,
Cache::Priority::HIGH);
Status s =
cache.Insert(cache_key, new_ptr, charge, &h, Cache::Priority::HIGH);
if (!s.ok()) {
assert(h == nullptr);
delete new_ptr;
@ -140,11 +143,11 @@ class CacheEntryStatsCollector {
}
}
// If we reach here, shared entry is in cache with handle `h`.
assert(cache->GetDeleter(h) == Deleter);
assert(cache.get()->GetCacheItemHelper(h) == cache.GetBasicHelper());
// Build an aliasing shared_ptr that keeps `ptr` in cache while there
// are references.
*ptr = MakeSharedCacheHandleGuard<CacheEntryStatsCollector>(cache, h);
*ptr = cache.SharedGuard(h);
return Status::OK();
}
@ -157,10 +160,6 @@ class CacheEntryStatsCollector {
cache_(cache),
clock_(clock) {}
static void Deleter(const Slice &, void *value) {
delete static_cast<CacheEntryStatsCollector *>(value);
}
static const Slice &GetCacheKey() {
// For each template instantiation
static CacheKey ckey = CacheKey::CreateUniqueForProcessLifetime();

@ -0,0 +1,40 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "cache/cache_helpers.h"
namespace ROCKSDB_NAMESPACE {
void ReleaseCacheHandleCleanup(void* arg1, void* arg2) {
Cache* const cache = static_cast<Cache*>(arg1);
assert(cache);
Cache::Handle* const cache_handle = static_cast<Cache::Handle*>(arg2);
assert(cache_handle);
cache->Release(cache_handle);
}
Status WarmInCache(Cache* cache, const Slice& key, const Slice& saved,
Cache::CreateContext* create_context,
const Cache::CacheItemHelper* helper,
Cache::Priority priority, size_t* out_charge) {
assert(helper);
assert(helper->create_cb);
Cache::ObjectPtr value;
size_t charge;
Status st = helper->create_cb(saved, create_context,
cache->memory_allocator(), &value, &charge);
if (st.ok()) {
st =
cache->Insert(key, value, helper, charge, /*handle*/ nullptr, priority);
if (out_charge) {
*out_charge = charge;
}
}
return st;
}
} // namespace ROCKSDB_NAMESPACE
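WarmInCache is the inverse of a save: it runs the helper's create_cb over previously serialized bytes and inserts the resulting object. A hedged usage sketch (cache, key, saved_data, and kMyHelper are assumptions; the helper must supply a create_cb, as the asserts require):

// Sketch: re-insert a serialized entry, e.g. bytes recovered from a
// secondary cache or a dump file.
size_t charge = 0;
Status s = WarmInCache(cache, key, saved_data, /*create_context=*/nullptr,
                       &kMyHelper, Cache::Priority::LOW, &charge);
if (s.ok()) {
  // Entry is resident; `charge` reports what the cache was billed for it.
}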

@ -7,7 +7,7 @@
#include <cassert>
#include "rocksdb/cache.h"
#include "rocksdb/advanced_cache.h"
#include "rocksdb/rocksdb_namespace.h"
namespace ROCKSDB_NAMESPACE {
@ -17,22 +17,17 @@ template <typename T>
T* GetFromCacheHandle(Cache* cache, Cache::Handle* handle) {
assert(cache);
assert(handle);
return static_cast<T*>(cache->Value(handle));
}
// Simple generic deleter for Cache (to be used with Cache::Insert).
template <typename T>
void DeleteCacheEntry(const Slice& /* key */, void* value) {
delete static_cast<T*>(value);
}
// Turns a T* into a Slice so it can be used as a key with Cache.
template <typename T>
Slice GetSlice(const T* t) {
Slice GetSliceForKey(const T* t) {
return Slice(reinterpret_cast<const char*>(t), sizeof(T));
}
void ReleaseCacheHandleCleanup(void* arg1, void* arg2);
// Generic resource management object for cache handles that releases the handle
// when destroyed. Has unique ownership of the handle, so copying it is not
// allowed, while moving it transfers ownership.
@ -88,7 +83,7 @@ class CacheHandleGuard {
if (cleanable) {
if (handle_ != nullptr) {
assert(cache_);
cleanable->RegisterCleanup(&ReleaseCacheHandle, cache_, handle_);
cleanable->RegisterCleanup(&ReleaseCacheHandleCleanup, cache_, handle_);
}
}
ResetFields();
@ -115,16 +110,6 @@ class CacheHandleGuard {
value_ = nullptr;
}
static void ReleaseCacheHandle(void* arg1, void* arg2) {
Cache* const cache = static_cast<Cache*>(arg1);
assert(cache);
Cache::Handle* const cache_handle = static_cast<Cache::Handle*>(arg2);
assert(cache_handle);
cache->Release(cache_handle);
}
private:
Cache* cache_ = nullptr;
Cache::Handle* handle_ = nullptr;
@ -139,7 +124,16 @@ template <typename T>
std::shared_ptr<T> MakeSharedCacheHandleGuard(Cache* cache,
Cache::Handle* handle) {
auto wrapper = std::make_shared<CacheHandleGuard<T>>(cache, handle);
return std::shared_ptr<T>(wrapper, static_cast<T*>(cache->Value(handle)));
return std::shared_ptr<T>(wrapper, GetFromCacheHandle<T>(cache, handle));
}
// Given the persistable data (saved) for a block cache entry, parse that
// into a cache entry object and insert it into the given cache. The charge
// of the new entry can be returned to the caller through `out_charge`.
Status WarmInCache(Cache* cache, const Slice& key, const Slice& saved,
Cache::CreateContext* create_context,
const Cache::CacheItemHelper* helper,
Cache::Priority priority = Cache::Priority::LOW,
size_t* out_charge = nullptr);
} // namespace ROCKSDB_NAMESPACE
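The guard types above tie a cache handle's pin to a C++ object lifetime. A hedged sketch of the shared variant (Value, cache, and key are illustrative):

// Sketch: keep the entry pinned for as long as any shared_ptr copy lives;
// the guard calls cache->Release(handle) when the last copy is destroyed.
Cache::Handle* h = cache->Lookup(key);
if (h != nullptr) {
  std::shared_ptr<Value> v = MakeSharedCacheHandleGuard<Value>(cache, h);
  // ... hand `v` around freely; no manual Release needed ...
}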

@ -8,7 +8,7 @@
#include <algorithm>
#include <atomic>
#include "rocksdb/cache.h"
#include "rocksdb/advanced_cache.h"
#include "table/unique_id_impl.h"
#include "util/hash.h"
#include "util/math.h"

@ -13,7 +13,6 @@
#include <cstring>
#include <memory>
#include "cache/cache_entry_roles.h"
#include "rocksdb/cache.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
@ -41,17 +40,17 @@ CacheReservationManagerImpl<
template <CacheEntryRole R>
CacheReservationManagerImpl<R>::CacheReservationManagerImpl(
std::shared_ptr<Cache> cache, bool delayed_decrease)
: delayed_decrease_(delayed_decrease),
: cache_(cache),
delayed_decrease_(delayed_decrease),
cache_allocated_size_(0),
memory_used_(0) {
assert(cache != nullptr);
cache_ = cache;
}
template <CacheEntryRole R>
CacheReservationManagerImpl<R>::~CacheReservationManagerImpl() {
for (auto* handle : dummy_handles_) {
cache_->Release(handle, true);
cache_.ReleaseAndEraseIfLastRef(handle);
}
}
@ -115,8 +114,7 @@ Status CacheReservationManagerImpl<R>::IncreaseCacheReservation(
Status return_status = Status::OK();
while (new_mem_used > cache_allocated_size_.load(std::memory_order_relaxed)) {
Cache::Handle* handle = nullptr;
return_status = cache_->Insert(GetNextCacheKey(), nullptr, kSizeDummyEntry,
GetNoopDeleterForRole<R>(), &handle);
return_status = cache_.Insert(GetNextCacheKey(), kSizeDummyEntry, &handle);
if (return_status != Status::OK()) {
return return_status;
@ -141,7 +139,7 @@ Status CacheReservationManagerImpl<R>::DecreaseCacheReservation(
cache_allocated_size_.load(std::memory_order_relaxed)) {
assert(!dummy_handles_.empty());
auto* handle = dummy_handles_.back();
cache_->Release(handle, true);
cache_.ReleaseAndEraseIfLastRef(handle);
dummy_handles_.pop_back();
cache_allocated_size_ -= kSizeDummyEntry;
}
@ -169,8 +167,9 @@ Slice CacheReservationManagerImpl<R>::GetNextCacheKey() {
}
template <CacheEntryRole R>
Cache::DeleterFn CacheReservationManagerImpl<R>::TEST_GetNoopDeleterForRole() {
return GetNoopDeleterForRole<R>();
const Cache::CacheItemHelper*
CacheReservationManagerImpl<R>::TEST_GetCacheItemHelperForRole() {
return CacheInterface::GetHelper();
}
template class CacheReservationManagerImpl<

@ -18,7 +18,7 @@
#include "cache/cache_entry_roles.h"
#include "cache/cache_key.h"
#include "rocksdb/cache.h"
#include "cache/typed_cache.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "util/coding.h"
@ -197,10 +197,10 @@ class CacheReservationManagerImpl
static constexpr std::size_t GetDummyEntrySize() { return kSizeDummyEntry; }
// For testing only - it is to help ensure the NoopDeleterForRole<R>
// For testing only - it is to help ensure the CacheItemHelperForRole<R>
// accessed from CacheReservationManagerImpl and the one accessed from the
// test are from the same translation units
static Cache::DeleterFn TEST_GetNoopDeleterForRole();
static const Cache::CacheItemHelper *TEST_GetCacheItemHelperForRole();
private:
static constexpr std::size_t kSizeDummyEntry = 256 * 1024;
@ -211,7 +211,8 @@ class CacheReservationManagerImpl
Status IncreaseCacheReservation(std::size_t new_mem_used);
Status DecreaseCacheReservation(std::size_t new_mem_used);
std::shared_ptr<Cache> cache_;
using CacheInterface = PlaceholderSharedCacheInterface<R>;
CacheInterface cache_;
bool delayed_decrease_;
std::atomic<std::size_t> cache_allocated_size_;
std::size_t memory_used_;
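For context, the reservation manager above accounts for non-cache memory by inserting fixed-size placeholder ("dummy") entries through the typed PlaceholderSharedCacheInterface. A hedged sketch of that pattern, with the Insert and ReleaseAndEraseIfLastRef signatures mirrored from the calls above (the interface construction and key generator are assumptions):

// Sketch: reserve ~1 MiB as four 256 KiB placeholders, then release it.
PlaceholderSharedCacheInterface<CacheEntryRole::kMisc> cache{shared_cache};
std::vector<Cache::Handle*> dummies;
for (int i = 0; i < 4; ++i) {
  Cache::Handle* h = nullptr;
  if (cache.Insert(MakeDummyKey(i), 256 * 1024, &h).ok()) {  // hypothetical key fn
    dummies.push_back(h);
  }
}
for (auto* h : dummies) {
  cache.ReleaseAndEraseIfLastRef(h);  // give the reserved memory back
}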

cache/cache_test.cc

@ -16,9 +16,12 @@
#include <vector>
#include "cache/lru_cache.h"
#include "cache/typed_cache.h"
#include "port/stack_trace.h"
#include "test_util/secondary_cache_test_util.h"
#include "test_util/testharness.h"
#include "util/coding.h"
#include "util/hash_containers.h"
#include "util/string_util.h"
// HyperClockCache only supports 16-byte keys, so some of the tests
@ -55,42 +58,43 @@ int DecodeKey32Bits(const Slice& k) {
return DecodeFixed32(k.data());
}
void* EncodeValue(uintptr_t v) { return reinterpret_cast<void*>(v); }
Cache::ObjectPtr EncodeValue(uintptr_t v) {
return reinterpret_cast<Cache::ObjectPtr>(v);
}
int DecodeValue(void* v) {
return static_cast<int>(reinterpret_cast<uintptr_t>(v));
}
void DumbDeleter(const Slice& /*key*/, void* /*value*/) {}
void EraseDeleter1(const Slice& /*key*/, void* value) {
Cache* cache = reinterpret_cast<Cache*>(value);
cache->Erase("foo");
}
void EraseDeleter2(const Slice& /*key*/, void* value) {
Cache* cache = reinterpret_cast<Cache*>(value);
cache->Erase(EncodeKey16Bytes(1234));
}
const std::string kLRU = "lru";
const std::string kHyperClock = "hyper_clock";
const Cache::CacheItemHelper kDumbHelper{
CacheEntryRole::kMisc,
[](Cache::ObjectPtr /*value*/, MemoryAllocator* /*alloc*/) {}};
const Cache::CacheItemHelper kEraseOnDeleteHelper1{
CacheEntryRole::kMisc,
[](Cache::ObjectPtr value, MemoryAllocator* /*alloc*/) {
Cache* cache = static_cast<Cache*>(value);
cache->Erase("foo");
}};
const Cache::CacheItemHelper kEraseOnDeleteHelper2{
CacheEntryRole::kMisc,
[](Cache::ObjectPtr value, MemoryAllocator* /*alloc*/) {
Cache* cache = static_cast<Cache*>(value);
cache->Erase(EncodeKey16Bytes(1234));
}};
} // anonymous namespace
class CacheTest : public testing::TestWithParam<std::string> {
class CacheTest : public testing::Test,
public secondary_cache_test_util::WithCacheTypeParam {
public:
static CacheTest* current_;
static std::string type_;
static void Deleter(const Slice& key, void* v) {
if (type_ == kHyperClock) {
current_->deleted_keys_.push_back(DecodeKey16Bytes(key));
} else {
current_->deleted_keys_.push_back(DecodeKey32Bits(key));
}
static void Deleter(Cache::ObjectPtr v, MemoryAllocator*) {
current_->deleted_values_.push_back(DecodeValue(v));
}
static const Cache::CacheItemHelper kHelper;
static const int kCacheSize = 1000;
static const int kNumShardBits = 4;
@ -98,13 +102,10 @@ class CacheTest : public testing::TestWithParam<std::string> {
static const int kCacheSize2 = 100;
static const int kNumShardBits2 = 2;
std::vector<int> deleted_keys_;
std::vector<int> deleted_values_;
std::shared_ptr<Cache> cache_;
std::shared_ptr<Cache> cache2_;
size_t estimated_value_size_ = 1;
CacheTest()
: cache_(NewCache(kCacheSize, kNumShardBits, false)),
cache2_(NewCache(kCacheSize2, kNumShardBits2, false)) {
@ -114,41 +115,6 @@ class CacheTest : public testing::TestWithParam<std::string> {
~CacheTest() override {}
std::shared_ptr<Cache> NewCache(size_t capacity) {
auto type = GetParam();
if (type == kLRU) {
return NewLRUCache(capacity);
}
if (type == kHyperClock) {
return HyperClockCacheOptions(
capacity, estimated_value_size_ /*estimated_value_size*/)
.MakeSharedCache();
}
return nullptr;
}
std::shared_ptr<Cache> NewCache(
size_t capacity, int num_shard_bits, bool strict_capacity_limit,
CacheMetadataChargePolicy charge_policy = kDontChargeCacheMetadata) {
auto type = GetParam();
if (type == kLRU) {
LRUCacheOptions co;
co.capacity = capacity;
co.num_shard_bits = num_shard_bits;
co.strict_capacity_limit = strict_capacity_limit;
co.high_pri_pool_ratio = 0;
co.metadata_charge_policy = charge_policy;
return NewLRUCache(co);
}
if (type == kHyperClock) {
return HyperClockCacheOptions(capacity, 1 /*estimated_value_size*/,
num_shard_bits, strict_capacity_limit,
nullptr /*allocator*/, charge_policy)
.MakeSharedCache();
}
return nullptr;
}
// These functions encode/decode keys in tests cases that use
// int keys.
// Currently, HyperClockCache requires keys to be 16B long, whereas
@ -182,8 +148,8 @@ class CacheTest : public testing::TestWithParam<std::string> {
void Insert(std::shared_ptr<Cache> cache, int key, int value,
int charge = 1) {
EXPECT_OK(cache->Insert(EncodeKey(key), EncodeValue(value), charge,
&CacheTest::Deleter));
EXPECT_OK(cache->Insert(EncodeKey(key), EncodeValue(value), &kHelper,
charge, /*handle*/ nullptr, Cache::Priority::HIGH));
}
void Erase(std::shared_ptr<Cache> cache, int key) {
@ -207,6 +173,9 @@ class CacheTest : public testing::TestWithParam<std::string> {
void Erase2(int key) { Erase(cache2_, key); }
};
const Cache::CacheItemHelper CacheTest::kHelper{CacheEntryRole::kMisc,
&CacheTest::Deleter};
CacheTest* CacheTest::current_;
std::string CacheTest::type_;
@ -236,10 +205,8 @@ TEST_P(CacheTest, UsageTest) {
key = EncodeKey(i);
}
auto kv_size = key.size() + 5;
ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), kv_size,
DumbDeleter));
ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
kv_size, DumbDeleter));
ASSERT_OK(cache->Insert(key, value, &kDumbHelper, kv_size));
ASSERT_OK(precise_cache->Insert(key, value, &kDumbHelper, kv_size));
usage += kv_size;
ASSERT_EQ(usage, cache->GetUsage());
if (type == kHyperClock) {
@ -262,10 +229,8 @@ TEST_P(CacheTest, UsageTest) {
} else {
key = EncodeKey(static_cast<int>(1000 + i));
}
ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5,
DumbDeleter));
ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
key.size() + 5, DumbDeleter));
ASSERT_OK(cache->Insert(key, value, &kDumbHelper, key.size() + 5));
ASSERT_OK(precise_cache->Insert(key, value, &kDumbHelper, key.size() + 5));
}
// the usage should be close to the capacity
@ -320,11 +285,9 @@ TEST_P(CacheTest, PinnedUsageTest) {
auto kv_size = key.size() + 5;
Cache::Handle* handle;
Cache::Handle* handle_in_precise_cache;
ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), kv_size,
DumbDeleter, &handle));
ASSERT_OK(cache->Insert(key, value, &kDumbHelper, kv_size, &handle));
assert(handle);
ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
kv_size, DumbDeleter,
ASSERT_OK(precise_cache->Insert(key, value, &kDumbHelper, kv_size,
&handle_in_precise_cache));
assert(handle_in_precise_cache);
pinned_usage += kv_size;
@ -365,10 +328,8 @@ TEST_P(CacheTest, PinnedUsageTest) {
} else {
key = EncodeKey(static_cast<int>(1000 + i));
}
ASSERT_OK(cache->Insert(key, reinterpret_cast<void*>(value), key.size() + 5,
DumbDeleter));
ASSERT_OK(precise_cache->Insert(key, reinterpret_cast<void*>(value),
key.size() + 5, DumbDeleter));
ASSERT_OK(cache->Insert(key, value, &kDumbHelper, key.size() + 5));
ASSERT_OK(precise_cache->Insert(key, value, &kDumbHelper, key.size() + 5));
}
ASSERT_EQ(pinned_usage, cache->GetPinnedUsage());
ASSERT_EQ(precise_cache_pinned_usage, precise_cache->GetPinnedUsage());
@ -416,8 +377,7 @@ TEST_P(CacheTest, HitAndMiss) {
ASSERT_EQ(201, Lookup(200));
ASSERT_EQ(-1, Lookup(300));
ASSERT_EQ(1U, deleted_keys_.size());
ASSERT_EQ(100, deleted_keys_[0]);
ASSERT_EQ(1U, deleted_values_.size());
if (GetParam() == kHyperClock) {
ASSERT_EQ(102, deleted_values_[0]);
} else {
@ -438,21 +398,20 @@ TEST_P(CacheTest, InsertSameKey) {
TEST_P(CacheTest, Erase) {
Erase(200);
ASSERT_EQ(0U, deleted_keys_.size());
ASSERT_EQ(0U, deleted_values_.size());
Insert(100, 101);
Insert(200, 201);
Erase(100);
ASSERT_EQ(-1, Lookup(100));
ASSERT_EQ(201, Lookup(200));
ASSERT_EQ(1U, deleted_keys_.size());
ASSERT_EQ(100, deleted_keys_[0]);
ASSERT_EQ(1U, deleted_values_.size());
ASSERT_EQ(101, deleted_values_[0]);
Erase(100);
ASSERT_EQ(-1, Lookup(100));
ASSERT_EQ(201, Lookup(200));
ASSERT_EQ(1U, deleted_keys_.size());
ASSERT_EQ(1U, deleted_values_.size());
}
TEST_P(CacheTest, EntriesArePinned) {
@ -469,23 +428,21 @@ TEST_P(CacheTest, EntriesArePinned) {
Insert(100, 102);
Cache::Handle* h2 = cache_->Lookup(EncodeKey(100));
ASSERT_EQ(102, DecodeValue(cache_->Value(h2)));
ASSERT_EQ(0U, deleted_keys_.size());
ASSERT_EQ(0U, deleted_values_.size());
ASSERT_EQ(2U, cache_->GetUsage());
cache_->Release(h1);
ASSERT_EQ(1U, deleted_keys_.size());
ASSERT_EQ(100, deleted_keys_[0]);
ASSERT_EQ(1U, deleted_values_.size());
ASSERT_EQ(101, deleted_values_[0]);
ASSERT_EQ(1U, cache_->GetUsage());
Erase(100);
ASSERT_EQ(-1, Lookup(100));
ASSERT_EQ(1U, deleted_keys_.size());
ASSERT_EQ(1U, deleted_values_.size());
ASSERT_EQ(1U, cache_->GetUsage());
cache_->Release(h2);
ASSERT_EQ(2U, deleted_keys_.size());
ASSERT_EQ(100, deleted_keys_[1]);
ASSERT_EQ(2U, deleted_values_.size());
ASSERT_EQ(102, deleted_values_[1]);
ASSERT_EQ(0U, cache_->GetUsage());
}
@ -588,9 +545,9 @@ TEST_P(CacheTest, EvictEmptyCache) {
// Insert item larger than capacity to trigger eviction on empty cache.
auto cache = NewCache(1, 0, false);
if (type == kLRU) {
ASSERT_OK(cache->Insert("foo", nullptr, 10, DumbDeleter));
ASSERT_OK(cache->Insert("foo", nullptr, &kDumbHelper, 10));
} else {
ASSERT_OK(cache->Insert(EncodeKey(1000), nullptr, 10, DumbDeleter));
ASSERT_OK(cache->Insert(EncodeKey(1000), nullptr, &kDumbHelper, 10));
}
}
@ -601,19 +558,19 @@ TEST_P(CacheTest, EraseFromDeleter) {
// the cache at that point.
std::shared_ptr<Cache> cache = NewCache(10, 0, false);
std::string foo, bar;
Cache::DeleterFn erase_deleter;
const Cache::CacheItemHelper* erase_helper;
if (type == kLRU) {
foo = "foo";
bar = "bar";
erase_deleter = EraseDeleter1;
erase_helper = &kEraseOnDeleteHelper1;
} else {
foo = EncodeKey(1234);
bar = EncodeKey(5678);
erase_deleter = EraseDeleter2;
erase_helper = &kEraseOnDeleteHelper2;
}
ASSERT_OK(cache->Insert(foo, nullptr, 1, DumbDeleter));
ASSERT_OK(cache->Insert(bar, cache.get(), 1, erase_deleter));
ASSERT_OK(cache->Insert(foo, nullptr, &kDumbHelper, 1));
ASSERT_OK(cache->Insert(bar, cache.get(), erase_helper, 1));
cache->Erase(bar);
ASSERT_EQ(nullptr, cache->Lookup(foo));
@ -675,50 +632,51 @@ TEST_P(CacheTest, NewId) {
ASSERT_NE(a, b);
}
class Value {
public:
explicit Value(int v) : v_(v) {}
int v_;
};
namespace {
void deleter(const Slice& /*key*/, void* value) {
delete static_cast<Value*>(value);
}
} // namespace
TEST_P(CacheTest, ReleaseAndErase) {
std::shared_ptr<Cache> cache = NewCache(5, 0, false);
Cache::Handle* handle;
Status s = cache->Insert(EncodeKey(100), EncodeValue(100), 1,
&CacheTest::Deleter, &handle);
Status s =
cache->Insert(EncodeKey(100), EncodeValue(100), &kHelper, 1, &handle);
ASSERT_TRUE(s.ok());
ASSERT_EQ(5U, cache->GetCapacity());
ASSERT_EQ(1U, cache->GetUsage());
ASSERT_EQ(0U, deleted_keys_.size());
ASSERT_EQ(0U, deleted_values_.size());
auto erased = cache->Release(handle, true);
ASSERT_TRUE(erased);
// This tests that the deleter has been called
ASSERT_EQ(1U, deleted_keys_.size());
ASSERT_EQ(1U, deleted_values_.size());
}
TEST_P(CacheTest, ReleaseWithoutErase) {
std::shared_ptr<Cache> cache = NewCache(5, 0, false);
Cache::Handle* handle;
Status s = cache->Insert(EncodeKey(100), EncodeValue(100), 1,
&CacheTest::Deleter, &handle);
Status s =
cache->Insert(EncodeKey(100), EncodeValue(100), &kHelper, 1, &handle);
ASSERT_TRUE(s.ok());
ASSERT_EQ(5U, cache->GetCapacity());
ASSERT_EQ(1U, cache->GetUsage());
ASSERT_EQ(0U, deleted_keys_.size());
ASSERT_EQ(0U, deleted_values_.size());
auto erased = cache->Release(handle);
ASSERT_FALSE(erased);
// This tests that the deleter is not called. When the cache has free capacity,
// it is not expected to immediately erase released items.
ASSERT_EQ(0U, deleted_keys_.size());
ASSERT_EQ(0U, deleted_values_.size());
}
namespace {
class Value {
public:
explicit Value(int v) : v_(v) {}
int v_;
static constexpr auto kCacheEntryRole = CacheEntryRole::kMisc;
};
using SharedCache = BasicTypedSharedCacheInterface<Value>;
using TypedHandle = SharedCache::TypedHandle;
} // namespace
TEST_P(CacheTest, SetCapacity) {
auto type = GetParam();
if (type == kHyperClock) {
@ -731,19 +689,19 @@ TEST_P(CacheTest, SetCapacity) {
// let's create a cache with capacity 5,
// then insert 5 elements, then increase capacity
// to 10; returned capacity should be 10, usage=5
std::shared_ptr<Cache> cache = NewCache(5, 0, false);
std::vector<Cache::Handle*> handles(10);
SharedCache cache{NewCache(5, 0, false)};
std::vector<TypedHandle*> handles(10);
// Insert 5 entries, but not releasing.
for (int i = 0; i < 5; i++) {
std::string key = EncodeKey(i + 1);
Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
Status s = cache.Insert(key, new Value(i + 1), 1, &handles[i]);
ASSERT_TRUE(s.ok());
}
ASSERT_EQ(5U, cache->GetCapacity());
ASSERT_EQ(5U, cache->GetUsage());
cache->SetCapacity(10);
ASSERT_EQ(10U, cache->GetCapacity());
ASSERT_EQ(5U, cache->GetUsage());
ASSERT_EQ(5U, cache.get()->GetCapacity());
ASSERT_EQ(5U, cache.get()->GetUsage());
cache.get()->SetCapacity(10);
ASSERT_EQ(10U, cache.get()->GetCapacity());
ASSERT_EQ(5U, cache.get()->GetUsage());
// test2: decrease capacity
// insert 5 more elements to cache, then release 5,
@ -751,77 +709,77 @@ TEST_P(CacheTest, SetCapacity) {
// and usage should be 7
for (int i = 5; i < 10; i++) {
std::string key = EncodeKey(i + 1);
Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
Status s = cache.Insert(key, new Value(i + 1), 1, &handles[i]);
ASSERT_TRUE(s.ok());
}
ASSERT_EQ(10U, cache->GetCapacity());
ASSERT_EQ(10U, cache->GetUsage());
ASSERT_EQ(10U, cache.get()->GetCapacity());
ASSERT_EQ(10U, cache.get()->GetUsage());
for (int i = 0; i < 5; i++) {
cache->Release(handles[i]);
cache.Release(handles[i]);
}
ASSERT_EQ(10U, cache->GetCapacity());
ASSERT_EQ(10U, cache->GetUsage());
cache->SetCapacity(7);
ASSERT_EQ(7, cache->GetCapacity());
ASSERT_EQ(7, cache->GetUsage());
ASSERT_EQ(10U, cache.get()->GetCapacity());
ASSERT_EQ(10U, cache.get()->GetUsage());
cache.get()->SetCapacity(7);
ASSERT_EQ(7, cache.get()->GetCapacity());
ASSERT_EQ(7, cache.get()->GetUsage());
// release remaining 5 to keep valgrind happy
for (int i = 5; i < 10; i++) {
cache->Release(handles[i]);
cache.Release(handles[i]);
}
// Make sure this doesn't crash or upset ASAN/valgrind
cache->DisownData();
cache.get()->DisownData();
}
TEST_P(LRUCacheTest, SetStrictCapacityLimit) {
// test1: set the flag to false. Insert more keys than capacity. See if they
// all go through.
std::shared_ptr<Cache> cache = NewCache(5, 0, false);
std::vector<Cache::Handle*> handles(10);
SharedCache cache{NewCache(5, 0, false)};
std::vector<TypedHandle*> handles(10);
Status s;
for (int i = 0; i < 10; i++) {
std::string key = EncodeKey(i + 1);
s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
s = cache.Insert(key, new Value(i + 1), 1, &handles[i]);
ASSERT_OK(s);
ASSERT_NE(nullptr, handles[i]);
}
ASSERT_EQ(10, cache->GetUsage());
ASSERT_EQ(10, cache.get()->GetUsage());
// test2: set the flag to true. Insert and check if it fails.
std::string extra_key = EncodeKey(100);
Value* extra_value = new Value(0);
cache->SetStrictCapacityLimit(true);
Cache::Handle* handle;
s = cache->Insert(extra_key, extra_value, 1, &deleter, &handle);
cache.get()->SetStrictCapacityLimit(true);
TypedHandle* handle;
s = cache.Insert(extra_key, extra_value, 1, &handle);
ASSERT_TRUE(s.IsMemoryLimit());
ASSERT_EQ(nullptr, handle);
ASSERT_EQ(10, cache->GetUsage());
ASSERT_EQ(10, cache.get()->GetUsage());
for (int i = 0; i < 10; i++) {
cache->Release(handles[i]);
cache.Release(handles[i]);
}
// test3: init with flag being true.
std::shared_ptr<Cache> cache2 = NewCache(5, 0, true);
SharedCache cache2{NewCache(5, 0, true)};
for (int i = 0; i < 5; i++) {
std::string key = EncodeKey(i + 1);
s = cache2->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
s = cache2.Insert(key, new Value(i + 1), 1, &handles[i]);
ASSERT_OK(s);
ASSERT_NE(nullptr, handles[i]);
}
s = cache2->Insert(extra_key, extra_value, 1, &deleter, &handle);
s = cache2.Insert(extra_key, extra_value, 1, &handle);
ASSERT_TRUE(s.IsMemoryLimit());
ASSERT_EQ(nullptr, handle);
// test insert without handle
s = cache2->Insert(extra_key, extra_value, 1, &deleter);
s = cache2.Insert(extra_key, extra_value, 1);
// As if the key had been inserted into the cache but evicted immediately.
ASSERT_OK(s);
ASSERT_EQ(5, cache2->GetUsage());
ASSERT_EQ(nullptr, cache2->Lookup(extra_key));
ASSERT_EQ(5, cache2.get()->GetUsage());
ASSERT_EQ(nullptr, cache2.Lookup(extra_key));
for (int i = 0; i < 5; i++) {
cache2->Release(handles[i]);
cache2.Release(handles[i]);
}
}
@ -829,55 +787,54 @@ TEST_P(CacheTest, OverCapacity) {
size_t n = 10;
// a LRUCache with n entries and one shard only
std::shared_ptr<Cache> cache = NewCache(n, 0, false);
std::vector<Cache::Handle*> handles(n + 1);
SharedCache cache{NewCache(n, 0, false)};
std::vector<TypedHandle*> handles(n + 1);
// Insert n+1 entries, but not releasing.
for (int i = 0; i < static_cast<int>(n + 1); i++) {
std::string key = EncodeKey(i + 1);
Status s = cache->Insert(key, new Value(i + 1), 1, &deleter, &handles[i]);
Status s = cache.Insert(key, new Value(i + 1), 1, &handles[i]);
ASSERT_TRUE(s.ok());
}
// Guess what's in the cache now?
for (int i = 0; i < static_cast<int>(n + 1); i++) {
std::string key = EncodeKey(i + 1);
auto h = cache->Lookup(key);
auto h = cache.Lookup(key);
ASSERT_TRUE(h != nullptr);
if (h) cache->Release(h);
if (h) cache.Release(h);
}
// the cache is over capacity since nothing could be evicted
ASSERT_EQ(n + 1U, cache->GetUsage());
ASSERT_EQ(n + 1U, cache.get()->GetUsage());
for (int i = 0; i < static_cast<int>(n + 1); i++) {
cache->Release(handles[i]);
cache.Release(handles[i]);
}
if (GetParam() == kHyperClock) {
// Make sure eviction is triggered.
ASSERT_OK(cache->Insert(EncodeKey(-1), nullptr, 1, &deleter, &handles[0]));
ASSERT_OK(cache.Insert(EncodeKey(-1), nullptr, 1, &handles[0]));
// cache is under capacity now since elements were released
ASSERT_GE(n, cache->GetUsage());
ASSERT_GE(n, cache.get()->GetUsage());
// clean up
cache->Release(handles[0]);
cache.Release(handles[0]);
} else {
// LRUCache checks for over-capacity in Release.
// cache is exactly at capacity now with minimal eviction
ASSERT_EQ(n, cache->GetUsage());
ASSERT_EQ(n, cache.get()->GetUsage());
// element 0 is evicted and the rest are there.
// This is consistent with the LRU policy since element 0
// was released first
for (int i = 0; i < static_cast<int>(n + 1); i++) {
std::string key = EncodeKey(i + 1);
auto h = cache->Lookup(key);
auto h = cache.Lookup(key);
if (h) {
ASSERT_NE(static_cast<size_t>(i), 0U);
cache->Release(h);
cache.Release(h);
} else {
ASSERT_EQ(static_cast<size_t>(i), 0U);
}
@ -885,40 +842,15 @@ TEST_P(CacheTest, OverCapacity) {
}
}
namespace {
std::vector<std::pair<int, int>> legacy_callback_state;
void legacy_callback(void* value, size_t charge) {
legacy_callback_state.push_back(
{DecodeValue(value), static_cast<int>(charge)});
}
}; // namespace
TEST_P(CacheTest, ApplyToAllCacheEntriesTest) {
std::vector<std::pair<int, int>> inserted;
legacy_callback_state.clear();
for (int i = 0; i < 10; ++i) {
Insert(i, i * 2, i + 1);
inserted.push_back({i * 2, i + 1});
}
cache_->ApplyToAllCacheEntries(legacy_callback, true);
std::sort(inserted.begin(), inserted.end());
std::sort(legacy_callback_state.begin(), legacy_callback_state.end());
ASSERT_EQ(inserted.size(), legacy_callback_state.size());
for (int i = 0; i < static_cast<int>(inserted.size()); ++i) {
EXPECT_EQ(inserted[i], legacy_callback_state[i]);
}
}
TEST_P(CacheTest, ApplyToAllEntriesTest) {
std::vector<std::string> callback_state;
const auto callback = [&](const Slice& key, void* value, size_t charge,
Cache::DeleterFn deleter) {
const auto callback = [&](const Slice& key, Cache::ObjectPtr value,
size_t charge,
const Cache::CacheItemHelper* helper) {
callback_state.push_back(std::to_string(DecodeKey(key)) + "," +
std::to_string(DecodeValue(value)) + "," +
std::to_string(charge));
assert(deleter == &CacheTest::Deleter);
assert(helper == &CacheTest::kHelper);
};
std::vector<std::string> inserted;
@ -957,8 +889,8 @@ TEST_P(CacheTest, ApplyToAllEntriesDuringResize) {
// For callback
int special_count = 0;
const auto callback = [&](const Slice&, void*, size_t charge,
Cache::DeleterFn) {
const auto callback = [&](const Slice&, Cache::ObjectPtr, size_t charge,
const Cache::CacheItemHelper*) {
if (charge == static_cast<size_t>(kSpecialCharge)) {
++special_count;
}
@ -1020,13 +952,105 @@ TEST_P(CacheTest, GetChargeAndDeleter) {
Cache::Handle* h1 = cache_->Lookup(EncodeKey(1));
ASSERT_EQ(2, DecodeValue(cache_->Value(h1)));
ASSERT_EQ(1, cache_->GetCharge(h1));
ASSERT_EQ(&CacheTest::Deleter, cache_->GetDeleter(h1));
ASSERT_EQ(&CacheTest::kHelper, cache_->GetCacheItemHelper(h1));
cache_->Release(h1);
}
namespace {
bool AreTwoCacheKeysOrdered(Cache* cache) {
std::vector<std::string> keys;
const auto callback = [&](const Slice& key, Cache::ObjectPtr /*value*/,
size_t /*charge*/,
const Cache::CacheItemHelper* /*helper*/) {
keys.push_back(key.ToString());
};
cache->ApplyToAllEntries(callback, /*opts*/ {});
EXPECT_EQ(keys.size(), 2U);
EXPECT_NE(keys[0], keys[1]);
return keys[0] < keys[1];
}
} // namespace
TEST_P(CacheTest, CacheUniqueSeeds) {
// kQuasiRandomHashSeed should generate unique seeds (up to 2 billion before
// repeating)
UnorderedSet<uint32_t> seeds_seen;
// Roughly sqrt(number of possible values) for a decent chance at detecting
// a random collision if it's possible (shouldn't be)
uint16_t kSamples = 20000;
seeds_seen.reserve(kSamples);
// Hash seed should affect ordering of entries in the table, so we should
// have extremely high chance of seeing two entries ordered both ways.
bool seen_forward_order = false;
bool seen_reverse_order = false;
for (int i = 0; i < kSamples; ++i) {
auto cache = NewCache(2, [=](ShardedCacheOptions& opts) {
opts.hash_seed = LRUCacheOptions::kQuasiRandomHashSeed;
opts.num_shard_bits = 0;
opts.metadata_charge_policy = kDontChargeCacheMetadata;
});
auto val = cache->GetHashSeed();
ASSERT_TRUE(seeds_seen.insert(val).second);
ASSERT_OK(cache->Insert(EncodeKey(1), nullptr, &kHelper, /*charge*/ 1));
ASSERT_OK(cache->Insert(EncodeKey(2), nullptr, &kHelper, /*charge*/ 1));
if (AreTwoCacheKeysOrdered(cache.get())) {
seen_forward_order = true;
} else {
seen_reverse_order = true;
}
}
ASSERT_TRUE(seen_forward_order);
ASSERT_TRUE(seen_reverse_order);
}
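For reference, the birthday-bound arithmetic behind the sqrt comment in this test, assuming roughly 2^31 distinct seed values (an assumption for illustration): if seeds were drawn uniformly at random instead of from the quasi-random sequence, the chance of at least one collision among n = 20000 samples would be about 1 - exp(-n(n-1) / 2^32) ≈ 1 - exp(-0.093) ≈ 9%, so a regression to random (collidable) seeds would likely be caught within a handful of runs, while the quasi-random sequence must never collide.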
TEST_P(CacheTest, CacheHostSeed) {
// kHostHashSeed should generate a consistent seed within this process
// (and other processes on the same host, but not unit testing that).
// And we should be able to use that chosen seed as an explicit option
// (for debugging).
// And we should verify consistent ordering of entries.
uint32_t expected_seed = 0;
bool expected_order = false;
// 10 iterations -> chance of a random seed falsely appearing consistent
// should be low, just 1 in 2^9.
for (int i = 0; i < 10; ++i) {
auto cache = NewCache(2, [=](ShardedCacheOptions& opts) {
if (i != 5) {
opts.hash_seed = LRUCacheOptions::kHostHashSeed;
} else {
// Can be used as explicit seed
opts.hash_seed = static_cast<int32_t>(expected_seed);
ASSERT_GE(opts.hash_seed, 0);
}
opts.num_shard_bits = 0;
opts.metadata_charge_policy = kDontChargeCacheMetadata;
});
ASSERT_OK(cache->Insert(EncodeKey(1), nullptr, &kHelper, /*charge*/ 1));
ASSERT_OK(cache->Insert(EncodeKey(2), nullptr, &kHelper, /*charge*/ 1));
uint32_t val = cache->GetHashSeed();
bool order = AreTwoCacheKeysOrdered(cache.get());
if (i != 0) {
ASSERT_EQ(val, expected_seed);
ASSERT_EQ(order, expected_order);
} else {
expected_seed = val;
expected_order = order;
}
}
// Printed for reference in case it's needed to reproduce other unit test
// failures on another host
fprintf(stderr, "kHostHashSeed -> %u\n", (unsigned)expected_seed);
}
INSTANTIATE_TEST_CASE_P(CacheTestInstance, CacheTest,
testing::Values(kLRU, kHyperClock));
INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest, testing::Values(kLRU));
secondary_cache_test_util::GetTestingCacheTypes());
INSTANTIATE_TEST_CASE_P(CacheTestInstance, LRUCacheTest,
testing::Values(secondary_cache_test_util::kLRU));
} // namespace ROCKSDB_NAMESPACE

@ -11,65 +11,57 @@ namespace ROCKSDB_NAMESPACE {
ChargedCache::ChargedCache(std::shared_ptr<Cache> cache,
std::shared_ptr<Cache> block_cache)
: cache_(cache),
: CacheWrapper(cache),
cache_res_mgr_(std::make_shared<ConcurrentCacheReservationManager>(
std::make_shared<
CacheReservationManagerImpl<CacheEntryRole::kBlobCache>>(
block_cache))) {}
Status ChargedCache::Insert(const Slice& key, void* value, size_t charge,
DeleterFn deleter, Handle** handle,
Priority priority) {
Status s = cache_->Insert(key, value, charge, deleter, handle, priority);
if (s.ok()) {
// Insert may cause the cache entry eviction if the cache is full. So we
// directly call the reservation manager to update the total memory used
// in the cache.
assert(cache_res_mgr_);
cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
.PermitUncheckedError();
}
return s;
}
Status ChargedCache::Insert(const Slice& key, void* value,
Status ChargedCache::Insert(const Slice& key, ObjectPtr obj,
const CacheItemHelper* helper, size_t charge,
Handle** handle, Priority priority) {
Status s = cache_->Insert(key, value, helper, charge, handle, priority);
Status s = target_->Insert(key, obj, helper, charge, handle, priority);
if (s.ok()) {
// Insert may cause the cache entry eviction if the cache is full. So we
// directly call the reservation manager to update the total memory used
// in the cache.
assert(cache_res_mgr_);
cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
.PermitUncheckedError();
}
return s;
}
Cache::Handle* ChargedCache::Lookup(const Slice& key, Statistics* stats) {
return cache_->Lookup(key, stats);
}
Cache::Handle* ChargedCache::Lookup(const Slice& key,
const CacheItemHelper* helper,
const CreateCallback& create_cb,
Priority priority, bool wait,
Statistics* stats) {
auto handle = cache_->Lookup(key, helper, create_cb, priority, wait, stats);
CreateContext* create_context,
Priority priority, Statistics* stats) {
auto handle = target_->Lookup(key, helper, create_context, priority, stats);
// Lookup may promote the KV pair from the secondary cache to the primary
// cache. So we directly call the reservation manager to update the total
// memory used in the cache.
if (helper && helper->create_cb) {
assert(cache_res_mgr_);
cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
.PermitUncheckedError();
}
return handle;
}
void ChargedCache::WaitAll(AsyncLookupHandle* async_handles, size_t count) {
target_->WaitAll(async_handles, count);
// Update in case of any promotions. Although some could finish by the return
// of StartAsyncLookup, Wait/WaitAll will generally be used, so it is simpler
// to update here.
assert(cache_res_mgr_);
cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
.PermitUncheckedError();
return handle;
}
bool ChargedCache::Release(Cache::Handle* handle, bool useful,
bool erase_if_last_ref) {
size_t memory_used_delta = cache_->GetUsage(handle);
bool erased = cache_->Release(handle, useful, erase_if_last_ref);
size_t memory_used_delta = target_->GetUsage(handle);
bool erased = target_->Release(handle, useful, erase_if_last_ref);
if (erased) {
assert(cache_res_mgr_);
cache_res_mgr_
@ -80,8 +72,8 @@ bool ChargedCache::Release(Cache::Handle* handle, bool useful,
}
bool ChargedCache::Release(Cache::Handle* handle, bool erase_if_last_ref) {
size_t memory_used_delta = cache_->GetUsage(handle);
bool erased = cache_->Release(handle, erase_if_last_ref);
size_t memory_used_delta = target_->GetUsage(handle);
bool erased = target_->Release(handle, erase_if_last_ref);
if (erased) {
assert(cache_res_mgr_);
cache_res_mgr_
@ -92,25 +84,25 @@ bool ChargedCache::Release(Cache::Handle* handle, bool erase_if_last_ref) {
}
void ChargedCache::Erase(const Slice& key) {
cache_->Erase(key);
target_->Erase(key);
assert(cache_res_mgr_);
cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
.PermitUncheckedError();
}
void ChargedCache::EraseUnRefEntries() {
cache_->EraseUnRefEntries();
target_->EraseUnRefEntries();
assert(cache_res_mgr_);
cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
.PermitUncheckedError();
}
void ChargedCache::SetCapacity(size_t capacity) {
cache_->SetCapacity(capacity);
target_->SetCapacity(capacity);
// SetCapacity can result in evictions when the cache capacity is decreased,
// so we would want to update the cache reservation here as well.
assert(cache_res_mgr_);
cache_res_mgr_->UpdateCacheReservation(cache_->GetUsage())
cache_res_mgr_->UpdateCacheReservation(target_->GetUsage())
.PermitUncheckedError();
}

@ -8,7 +8,7 @@
#include <string>
#include "port/port.h"
#include "rocksdb/cache.h"
#include "rocksdb/advanced_cache.h"
namespace ROCKSDB_NAMESPACE {
@ -17,22 +17,21 @@ class ConcurrentCacheReservationManager;
// A cache interface that wraps another cache, takes care of reserving space
// in the block cache toward a single global memory limit, and forwards all
// calls to the underlying cache.
class ChargedCache : public Cache {
class ChargedCache : public CacheWrapper {
public:
ChargedCache(std::shared_ptr<Cache> cache,
std::shared_ptr<Cache> block_cache);
~ChargedCache() override = default;
Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter,
Handle** handle, Priority priority) override;
Status Insert(const Slice& key, void* value, const CacheItemHelper* helper,
Status Insert(const Slice& key, ObjectPtr obj, const CacheItemHelper* helper,
size_t charge, Handle** handle = nullptr,
Priority priority = Priority::LOW) override;
Cache::Handle* Lookup(const Slice& key, Statistics* stats) override;
Cache::Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
const CreateCallback& create_cb, Priority priority,
bool wait, Statistics* stats = nullptr) override;
CreateContext* create_context,
Priority priority = Priority::LOW,
Statistics* stats = nullptr) override;
void WaitAll(AsyncLookupHandle* async_handles, size_t count) override;
bool Release(Cache::Handle* handle, bool useful,
bool erase_if_last_ref = false) override;
@ -44,69 +43,9 @@ class ChargedCache : public Cache {
static const char* kClassName() { return "ChargedCache"; }
const char* Name() const override { return kClassName(); }
uint64_t NewId() override { return cache_->NewId(); }
void SetCapacity(size_t capacity) override;
void SetStrictCapacityLimit(bool strict_capacity_limit) override {
cache_->SetStrictCapacityLimit(strict_capacity_limit);
}
bool HasStrictCapacityLimit() const override {
return cache_->HasStrictCapacityLimit();
}
void* Value(Cache::Handle* handle) override { return cache_->Value(handle); }
bool IsReady(Cache::Handle* handle) override {
return cache_->IsReady(handle);
}
void Wait(Cache::Handle* handle) override { cache_->Wait(handle); }
void WaitAll(std::vector<Handle*>& handles) override {
cache_->WaitAll(handles);
}
bool Ref(Cache::Handle* handle) override { return cache_->Ref(handle); }
size_t GetCapacity() const override { return cache_->GetCapacity(); }
size_t GetUsage() const override { return cache_->GetUsage(); }
size_t GetUsage(Cache::Handle* handle) const override {
return cache_->GetUsage(handle);
}
size_t GetPinnedUsage() const override { return cache_->GetPinnedUsage(); }
size_t GetCharge(Cache::Handle* handle) const override {
return cache_->GetCharge(handle);
}
Cache::DeleterFn GetDeleter(Cache::Handle* handle) const override {
return cache_->GetDeleter(handle);
}
void ApplyToAllEntries(
const std::function<void(const Slice& key, void* value, size_t charge,
Cache::DeleterFn deleter)>& callback,
const Cache::ApplyToAllEntriesOptions& opts) override {
cache_->ApplyToAllEntries(callback, opts);
}
void ApplyToAllCacheEntries(void (*callback)(void* value, size_t charge),
bool thread_safe) override {
cache_->ApplyToAllCacheEntries(callback, thread_safe);
}
std::string GetPrintableOptions() const override {
return cache_->GetPrintableOptions();
}
void DisownData() override { return cache_->DisownData(); }
inline Cache* GetCache() const { return cache_.get(); }
inline Cache* GetCache() const { return target_.get(); }
inline ConcurrentCacheReservationManager* TEST_GetCacheReservationManager()
const {
@ -114,7 +53,6 @@ class ChargedCache : public Cache {
}
private:
std::shared_ptr<Cache> cache_;
std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr_;
};
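A hedged usage sketch of the wrapper above (the sizes and the make_shared construction are illustrative assumptions, not taken from this diff): every call forwarded through ChargedCache also refreshes a reservation against the block cache, so both caches draw on one memory budget.

  std::shared_ptr<Cache> block_cache = NewLRUCache(64 << 20);  // holds the global budget
  std::shared_ptr<Cache> blob_cache = NewLRUCache(16 << 20);   // the cache being charged
  auto charged = std::make_shared<ChargedCache>(blob_cache, block_cache);
  // Insert/Erase/SetCapacity on `charged` internally call
  // cache_res_mgr_->UpdateCacheReservation(target_->GetUsage()),
  // charging blob usage to `block_cache` under CacheEntryRole::kBlobCache.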

cache/clock_cache.cc vendored

File diff suppressed because it is too large

cache/clock_cache.h vendored

@ -24,6 +24,7 @@
#include "rocksdb/cache.h"
#include "rocksdb/secondary_cache.h"
#include "util/autovector.h"
#include "util/math.h"
namespace ROCKSDB_NAMESPACE {
@ -145,7 +146,7 @@ class ClockCacheTest;
// (erased by user) but can be read by existing references, and ref count
// changed by Ref and Release.
//
// A special case is "detached" entries, which are heap-allocated handles
// A special case is "standalone" entries, which are heap-allocated handles
// not in the table. They are always Invisible and freed on zero refs.
//
// State transitions:
@ -200,8 +201,8 @@ class ClockCacheTest;
// table occupancy limit has been reached. If strict_capacity_limit=false,
// we must never fail Insert, and if a Handle* is provided, we have to return
// a usable Cache handle on success. The solution to this (typically rare)
// problem is "detached" handles, which are usable by the caller but not
// actually available for Lookup in the Cache. Detached handles are allocated
// problem is "standalone" handles, which are usable by the caller but not
// actually available for Lookup in the Cache. Standalone handles are allocated
// independently on the heap and specially marked so that they are freed on
// the heap when their last reference is released.
//
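A minimal sketch of the standalone fallback described above (SketchInsert and its parameters are hypothetical names for illustration; the real logic lives in BaseClockTable::CreateStandalone and StandaloneInsert below):

  // Never fail Insert when strict_capacity_limit=false: if the table cannot
  // admit the entry but the caller needs a Handle*, fall back to the heap.
  ClockHandle* SketchInsert(ClockHandle* slot_from_table, bool caller_wants_handle) {
    if (slot_from_table != nullptr) {
      return slot_from_table;  // normal path: entry visible to Lookup
    }
    if (!caller_wants_handle) {
      return nullptr;  // entry simply not admitted
    }
    auto* h = new ClockHandle();  // usable by the caller, invisible to Lookup
    h->SetStandalone();           // freed with `delete` on last release
    return h;
  }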
@ -305,23 +306,17 @@ constexpr double kLoadFactor = 0.7;
constexpr double kStrictLoadFactor = 0.84;
struct ClockHandleBasicData {
void* value = nullptr;
Cache::DeleterFn deleter = nullptr;
Cache::ObjectPtr value = nullptr;
const Cache::CacheItemHelper* helper = nullptr;
// A lossless, reversible hash of the fixed-size (16 byte) cache key. This
// eliminates the need to store a hash separately.
UniqueId64x2 hashed_key = kNullUniqueId64x2;
size_t total_charge = 0;
// For total_charge_and_flags
// "Detached" means the handle is allocated separately from hash table.
static constexpr uint64_t kFlagDetached = uint64_t{1} << 63;
// Extract just the total charge
static constexpr uint64_t kTotalChargeMask = kFlagDetached - 1;
inline size_t GetTotalCharge() const { return total_charge; }
// Calls deleter (if non-null) on cache key and value
void FreeData() const;
void FreeData(MemoryAllocator* allocator) const;
// Required by concept HandleImpl
const UniqueId64x2& GetHash() const { return hashed_key; }
@ -377,14 +372,134 @@ struct ClockHandle : public ClockHandleBasicData {
static constexpr uint8_t kMaxCountdown = kHighCountdown;
// TODO: make these countdown values tuning parameters for eviction?
// See above
std::atomic<uint64_t> meta{};
// See above. Mutable for read reference counting.
mutable std::atomic<uint64_t> meta{};
// Whether this is a "deteched" handle that is independently allocated
// with `new` (so must be deleted with `delete`).
// TODO: ideally this would be packed into some other data field, such
// as upper bits of total_charge, but that incurs a measurable performance
// regression.
bool standalone = false;
// Anticipating use for SecondaryCache support
void* reserved_for_future_use = nullptr;
inline bool IsStandalone() const { return standalone; }
inline void SetStandalone() { standalone = true; }
}; // struct ClockHandle
class HyperClockTable {
class BaseClockTable {
public:
BaseClockTable(CacheMetadataChargePolicy metadata_charge_policy,
MemoryAllocator* allocator,
const Cache::EvictionCallback* eviction_callback,
const uint32_t* hash_seed)
: metadata_charge_policy_(metadata_charge_policy),
allocator_(allocator),
eviction_callback_(*eviction_callback),
hash_seed_(*hash_seed) {}
template <class Table>
typename Table::HandleImpl* CreateStandalone(ClockHandleBasicData& proto,
size_t capacity,
bool strict_capacity_limit,
bool allow_uncharged);
template <class Table>
Status Insert(const ClockHandleBasicData& proto,
typename Table::HandleImpl** handle, Cache::Priority priority,
size_t capacity, bool strict_capacity_limit);
void Ref(ClockHandle& handle);
size_t GetOccupancy() const {
return occupancy_.load(std::memory_order_relaxed);
}
size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); }
size_t GetStandaloneUsage() const {
return standalone_usage_.load(std::memory_order_relaxed);
}
uint32_t GetHashSeed() const { return hash_seed_; }
struct EvictionData {
size_t freed_charge = 0;
size_t freed_count = 0;
};
void TrackAndReleaseEvictedEntry(ClockHandle* h, EvictionData* data);
#ifndef NDEBUG
// Acquire N references
void TEST_RefN(ClockHandle& handle, size_t n);
// Helper for TEST_ReleaseN
void TEST_ReleaseNMinus1(ClockHandle* handle, size_t n);
#endif
private: // fns
// Creates a "standalone" handle for returning from an Insert operation that
// cannot be completed by actually inserting into the table.
// Updates `standalone_usage_` but not `usage_` nor `occupancy_`.
template <class HandleImpl>
HandleImpl* StandaloneInsert(const ClockHandleBasicData& proto);
// Helper for updating `usage_` for new entry with given `total_charge`
// and evicting if needed under strict_capacity_limit=true rules. This
// means the operation might fail with Status::MemoryLimit. If
// `need_evict_for_occupancy`, then eviction of at least one entry is
// required, and the operation should fail if not possible.
// NOTE: Otherwise, occupancy_ is not managed in this function
template <class Table>
Status ChargeUsageMaybeEvictStrict(size_t total_charge, size_t capacity,
bool need_evict_for_occupancy,
typename Table::InsertState& state);
// Helper for updating `usage_` for new entry with given `total_charge`
// and evicting if needed under strict_capacity_limit=false rules. This
// means that updating `usage_` always succeeds even if forced to exceed
// capacity. If `need_evict_for_occupancy`, then eviction of at least one
// entry is required, and the operation should return false if such eviction
// is not possible. `usage_` is not updated in that case. Otherwise, returns
// true, indicating success.
// NOTE: occupancy_ is not managed in this function
template <class Table>
bool ChargeUsageMaybeEvictNonStrict(size_t total_charge, size_t capacity,
bool need_evict_for_occupancy,
typename Table::InsertState& state);
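A simplified, single-threaded sketch of the strict_capacity_limit=true rule the two helpers above implement (hypothetical code; the real versions are lock-free and operate on atomics):

  template <typename EvictSomeFn>
  Status ChargeStrictSketch(size_t& usage, size_t capacity, size_t total_charge,
                            const EvictSomeFn& evict_some) {
    while (usage + total_charge > capacity) {
      size_t freed = evict_some();  // clock sweep; returns freed charge
      if (freed == 0) {
        // Nothing evictable without blocking: fail, leaving usage unchanged.
        return Status::MemoryLimit();
      }
      usage -= freed;
    }
    usage += total_charge;  // charge only once the new entry fits
    return Status::OK();
  }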
protected: // data
// We partition the following members into different cache lines
// to avoid false sharing among Lookup, Release, Erase and Insert
// operations in ClockCacheShard.
// Clock algorithm sweep pointer.
std::atomic<uint64_t> clock_pointer_{};
ALIGN_AS(CACHE_LINE_SIZE)
// Number of elements in the table.
std::atomic<size_t> occupancy_{};
// Memory usage by entries tracked by the cache (including standalone)
std::atomic<size_t> usage_{};
// Part of usage by standalone entries (not in table)
std::atomic<size_t> standalone_usage_{};
ALIGN_AS(CACHE_LINE_SIZE)
const CacheMetadataChargePolicy metadata_charge_policy_;
// From Cache, for deleter
MemoryAllocator* const allocator_;
// A reference to Cache::eviction_callback_
const Cache::EvictionCallback& eviction_callback_;
// A reference to ShardedCacheBase::hash_seed_
const uint32_t& hash_seed_;
};
class HyperClockTable : public BaseClockTable {
public:
// Target size to be exactly a common cache line size (see static_assert in
// clock_cache.cc)
@ -393,16 +508,6 @@ class HyperClockTable {
// up in this slot or a higher one.
std::atomic<uint32_t> displacements{};
// Whether this is a "deteched" handle that is independently allocated
// with `new` (so must be deleted with `delete`).
// TODO: ideally this would be packed into some other data field, such
// as upper bits of total_charge, but that incurs a measurable performance
// regression.
bool detached = false;
inline bool IsDetached() const { return detached; }
inline void SetDetached() { detached = true; }
}; // struct HandleImpl
struct Opts {
@ -411,75 +516,70 @@ class HyperClockTable {
HyperClockTable(size_t capacity, bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy,
const Opts& opts);
MemoryAllocator* allocator,
const Cache::EvictionCallback* eviction_callback,
const uint32_t* hash_seed, const Opts& opts);
~HyperClockTable();
Status Insert(const ClockHandleBasicData& proto, HandleImpl** handle,
Cache::Priority priority, size_t capacity,
bool strict_capacity_limit);
// For BaseClockTable::Insert
struct InsertState {};
void StartInsert(InsertState& state);
// Returns true iff there is room for the proposed number of entries.
bool GrowIfNeeded(size_t new_occupancy, InsertState& state);
HandleImpl* DoInsert(const ClockHandleBasicData& proto,
uint64_t initial_countdown, bool take_ref,
InsertState& state);
// Runs the clock eviction algorithm trying to reclaim at least
// requested_charge. Returns how much is evicted, which could be less
// if it appears impossible to evict the requested amount without blocking.
void Evict(size_t requested_charge, InsertState& state, EvictionData* data);
HandleImpl* Lookup(const UniqueId64x2& hashed_key);
bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref);
void Ref(HandleImpl& handle);
void Erase(const UniqueId64x2& hashed_key);
void ConstApplyToEntriesRange(std::function<void(const HandleImpl&)> func,
size_t index_begin, size_t index_end,
bool apply_if_will_be_deleted) const;
void EraseUnRefEntries();
size_t GetTableSize() const { return size_t{1} << length_bits_; }
int GetLengthBits() const { return length_bits_; }
size_t GetOccupancy() const {
return occupancy_.load(std::memory_order_relaxed);
}
size_t GetOccupancyLimit() const { return occupancy_limit_; }
size_t GetUsage() const { return usage_.load(std::memory_order_relaxed); }
const HandleImpl* HandlePtr(size_t idx) const { return &array_[idx]; }
size_t GetDetachedUsage() const {
return detached_usage_.load(std::memory_order_relaxed);
#ifndef NDEBUG
size_t& TEST_MutableOccupancyLimit() const {
return const_cast<size_t&>(occupancy_limit_);
}
// Acquire/release N references
void TEST_RefN(HandleImpl& handle, size_t n);
// Release N references
void TEST_ReleaseN(HandleImpl* handle, size_t n);
#endif
private: // functions
// Returns x mod 2^{length_bits_}.
inline size_t ModTableSize(uint64_t x) {
return static_cast<size_t>(x) & length_bits_mask_;
return BitwiseAnd(x, length_bits_mask_);
}
// Runs the clock eviction algorithm trying to reclaim at least
// requested_charge. Returns how much is evicted, which could be less
// if it appears impossible to evict the requested amount without blocking.
inline void Evict(size_t requested_charge, size_t* freed_charge,
size_t* freed_count);
// Returns the first slot in the probe sequence, starting from the given
// probe number, with a handle e such that match(e) is true. At every
// step, the function first tests whether match(e) holds. If this is false,
// it evaluates abort(e) to decide whether the search should be aborted,
// and in the affirmative returns -1. For every handle e probed except
// the last one, the function runs update(e).
// The probe parameter is modified as follows. We say a probe to a handle
// e is aborting if match(e) is false and abort(e) is true. Then the final
// value of probe is one more than the last non-aborting probe during the
// call. This is so that that the variable can be used to keep track of
// progress across consecutive calls to FindSlot.
// Returns the first slot in the probe sequence with a handle e such that
// match_fn(e) is true. At every step, the function first tests whether
// match_fn(e) holds. If this is false, it evaluates abort_fn(e) to decide
// whether the search should be aborted, and if so, FindSlot immediately
// returns nullptr. For every handle e that is not a match and not aborted,
// FindSlot runs update_fn(e, is_last) where is_last is set to true iff that
// slot will be the last probed because the next would cycle back to the first
// slot probed. This function uses templates instead of std::function to
// minimize the risk of heap-allocated closures being created.
template <typename MatchFn, typename AbortFn, typename UpdateFn>
inline HandleImpl* FindSlot(const UniqueId64x2& hashed_key,
std::function<bool(HandleImpl*)> match,
std::function<bool(HandleImpl*)> stop,
std::function<void(HandleImpl*)> update,
size_t& probe);
const MatchFn& match_fn, const AbortFn& abort_fn,
const UpdateFn& update_fn);
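A minimal sketch of the design choice in FindSlot's signature (FindFirst is a hypothetical free function, not part of this class): a templated callable lets the compiler inline the lambda at each probe, whereas a std::function parameter may heap-allocate its closure and adds an indirect call.

  template <typename MatchFn>
  const int* FindFirst(const int* begin, const int* end, const MatchFn& match_fn) {
    for (const int* p = begin; p != end; ++p) {
      if (match_fn(*p)) {  // call inlined; no std::function indirection
        return p;
      }
    }
    return nullptr;
  }
  // Usage: FindFirst(a, a + 8, [](int v) { return v == 42; });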
// Re-decrement all displacements in probe path starting from beginning
// until (not including) the given handle
@ -492,32 +592,7 @@ class HyperClockTable {
// before releasing it so that it can be provided to this function.
inline void ReclaimEntryUsage(size_t total_charge);
// Helper for updating `usage_` for new entry with given `total_charge`
// and evicting if needed under strict_capacity_limit=true rules. This
// means the operation might fail with Status::MemoryLimit. If
// `need_evict_for_occupancy`, then eviction of at least one entry is
// required, and the operation should fail if not possible.
// NOTE: Otherwise, occupancy_ is not managed in this function
inline Status ChargeUsageMaybeEvictStrict(size_t total_charge,
size_t capacity,
bool need_evict_for_occupancy);
// Helper for updating `usage_` for new entry with given `total_charge`
// and evicting if needed under strict_capacity_limit=false rules. This
// means that updating `usage_` always succeeds even if forced to exceed
// capacity. If `need_evict_for_occupancy`, then eviction of at least one
// entry is required, and the operation should return false if such eviction
// is not possible. `usage_` is not updated in that case. Otherwise, returns
// true, indicating success.
// NOTE: occupancy_ is not managed in this function
inline bool ChargeUsageMaybeEvictNonStrict(size_t total_charge,
size_t capacity,
bool need_evict_for_occupancy);
// Creates a "detached" handle for returning from an Insert operation that
// cannot be completed by actually inserting into the table.
// Updates `detached_usage_` but not `usage_` nor `occupancy_`.
inline HandleImpl* DetachedInsert(const ClockHandleBasicData& proto);
MemoryAllocator* GetAllocator() const { return allocator_; }
// Returns the number of bits used to hash an element in the hash
// table.
@ -537,24 +612,6 @@ class HyperClockTable {
// Array of slots comprising the hash table.
const std::unique_ptr<HandleImpl[]> array_;
// We partition the following members into different cache lines
// to avoid false sharing among Lookup, Release, Erase and Insert
// operations in ClockCacheShard.
ALIGN_AS(CACHE_LINE_SIZE)
// Clock algorithm sweep pointer.
std::atomic<uint64_t> clock_pointer_{};
ALIGN_AS(CACHE_LINE_SIZE)
// Number of elements in the table.
std::atomic<size_t> occupancy_{};
// Memory usage by entries tracked by the cache (including detached)
std::atomic<size_t> usage_{};
// Part of usage by detached entries (not in table)
std::atomic<size_t> detached_usage_{};
}; // class HyperClockTable
// A single shard of sharded cache.
@ -563,7 +620,9 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
public:
ClockCacheShard(size_t capacity, bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy,
const typename Table::Opts& opts);
MemoryAllocator* allocator,
const Cache::EvictionCallback* eviction_callback,
const uint32_t* hash_seed, const typename Table::Opts& opts);
// For CacheShard concept
using HandleImpl = typename Table::HandleImpl;
@ -573,22 +632,23 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
static inline uint32_t HashPieceForSharding(HashCref hash) {
return Upper32of64(hash[0]);
}
static inline HashVal ComputeHash(const Slice& key) {
static inline HashVal ComputeHash(const Slice& key, uint32_t seed) {
assert(key.size() == kCacheKeySize);
HashVal in;
HashVal out;
// NOTE: endian dependence
// TODO: use GetUnaligned?
std::memcpy(&in, key.data(), kCacheKeySize);
BijectiveHash2x64(in[1], in[0], &out[1], &out[0]);
BijectiveHash2x64(in[1], in[0] ^ seed, &out[1], &out[0]);
return out;
}
// For reconstructing key from hashed_key. Requires the caller to provide
// backing storage for the Slice in `unhashed`
static inline Slice ReverseHash(const UniqueId64x2& hashed,
UniqueId64x2* unhashed) {
UniqueId64x2* unhashed, uint32_t seed) {
BijectiveUnhash2x64(hashed[1], hashed[0], &(*unhashed)[1], &(*unhashed)[0]);
(*unhashed)[0] ^= seed;
// NOTE: endian dependence
return Slice(reinterpret_cast<const char*>(unhashed), kCacheKeySize);
}
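A small sketch of the property ComputeHash and ReverseHash preserve (a hypothetical check using names from this header): the seeded hash is a bijection on the 16 key bytes, so unhashing and re-applying the seed recovers the key exactly.

  void CheckHashRoundTrip(const Slice& key, uint32_t seed) {
    assert(key.size() == kCacheKeySize);
    using Shard = ClockCacheShard<HyperClockTable>;
    UniqueId64x2 hashed = Shard::ComputeHash(key, seed);
    UniqueId64x2 storage;
    Slice recovered = Shard::ReverseHash(hashed, &storage, seed);
    assert(memcmp(recovered.data(), key.data(), kCacheKeySize) == 0);
  }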
@ -600,9 +660,14 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
void SetStrictCapacityLimit(bool strict_capacity_limit);
Status Insert(const Slice& key, const UniqueId64x2& hashed_key, void* value,
size_t charge, Cache::DeleterFn deleter, HandleImpl** handle,
Cache::Priority priority);
Status Insert(const Slice& key, const UniqueId64x2& hashed_key,
Cache::ObjectPtr value, const Cache::CacheItemHelper* helper,
size_t charge, HandleImpl** handle, Cache::Priority priority);
HandleImpl* CreateStandalone(const Slice& key, const UniqueId64x2& hashed_key,
Cache::ObjectPtr obj,
const Cache::CacheItemHelper* helper,
size_t charge, bool allow_uncharged);
HandleImpl* Lookup(const Slice& key, const UniqueId64x2& hashed_key);
@ -618,7 +683,7 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
size_t GetUsage() const;
size_t GetDetachedUsage() const;
size_t GetStandaloneUsage() const;
size_t GetPinnedUsage() const;
@ -629,37 +694,30 @@ class ALIGN_AS(CACHE_LINE_SIZE) ClockCacheShard final : public CacheShardBase {
size_t GetTableAddressCount() const;
void ApplyToSomeEntries(
const std::function<void(const Slice& key, void* value, size_t charge,
DeleterFn deleter)>& callback,
const std::function<void(const Slice& key, Cache::ObjectPtr obj,
size_t charge,
const Cache::CacheItemHelper* helper)>& callback,
size_t average_entries_per_lock, size_t* state);
void EraseUnRefEntries();
std::string GetPrintableOptions() const { return std::string{}; }
// SecondaryCache not yet supported
Status Insert(const Slice& key, const UniqueId64x2& hashed_key, void* value,
const Cache::CacheItemHelper* helper, size_t charge,
HandleImpl** handle, Cache::Priority priority) {
return Insert(key, hashed_key, value, charge, helper->del_cb, handle,
priority);
}
HandleImpl* Lookup(const Slice& key, const UniqueId64x2& hashed_key,
const Cache::CacheItemHelper* /*helper*/,
const Cache::CreateCallback& /*create_cb*/,
Cache::Priority /*priority*/, bool /*wait*/,
Statistics* /*stats*/) {
Cache::CreateContext* /*create_context*/,
Cache::Priority /*priority*/, Statistics* /*stats*/) {
return Lookup(key, hashed_key);
}
bool IsReady(HandleImpl* /*handle*/) { return true; }
void Wait(HandleImpl* /*handle*/) {}
#ifndef NDEBUG
size_t& TEST_MutableOccupancyLimit() const {
return table_.TEST_MutableOccupancyLimit();
}
// Acquire/release N references
void TEST_RefN(HandleImpl* handle, size_t n);
void TEST_ReleaseN(HandleImpl* handle, size_t n);
#endif
private: // data
Table table_;
@ -679,18 +737,15 @@ class HyperClockCache
public:
using Shard = ClockCacheShard<HyperClockTable>;
HyperClockCache(size_t capacity, size_t estimated_value_size,
int num_shard_bits, bool strict_capacity_limit,
CacheMetadataChargePolicy metadata_charge_policy,
std::shared_ptr<MemoryAllocator> memory_allocator);
explicit HyperClockCache(const HyperClockCacheOptions& opts);
const char* Name() const override { return "HyperClockCache"; }
void* Value(Handle* handle) override;
Cache::ObjectPtr Value(Handle* handle) override;
size_t GetCharge(Handle* handle) const override;
DeleterFn GetDeleter(Handle* handle) const override;
const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override;
void ReportProblems(
const std::shared_ptr<Logger>& /*info_log*/) const override;

@ -9,7 +9,7 @@
#include <cstdint>
#include <memory>
#include "memory/memory_allocator.h"
#include "memory/memory_allocator_impl.h"
#include "monitoring/perf_context_imp.h"
#include "util/compression.h"
#include "util/string_util.h"
@ -17,30 +17,24 @@
namespace ROCKSDB_NAMESPACE {
CompressedSecondaryCache::CompressedSecondaryCache(
size_t capacity, int num_shard_bits, bool strict_capacity_limit,
double high_pri_pool_ratio, double low_pri_pool_ratio,
std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy,
CompressionType compression_type, uint32_t compress_format_version,
bool enable_custom_split_merge)
: cache_options_(capacity, num_shard_bits, strict_capacity_limit,
high_pri_pool_ratio, low_pri_pool_ratio, memory_allocator,
use_adaptive_mutex, metadata_charge_policy,
compression_type, compress_format_version,
enable_custom_split_merge) {
cache_ =
NewLRUCache(capacity, num_shard_bits, strict_capacity_limit,
high_pri_pool_ratio, memory_allocator, use_adaptive_mutex,
metadata_charge_policy, low_pri_pool_ratio);
const CompressedSecondaryCacheOptions& opts)
: cache_(opts.LRUCacheOptions::MakeSharedCache()),
cache_options_(opts),
cache_res_mgr_(std::make_shared<ConcurrentCacheReservationManager>(
std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
cache_))) {}
CompressedSecondaryCache::~CompressedSecondaryCache() {
assert(cache_res_mgr_->GetTotalReservedCacheSize() == 0);
}
CompressedSecondaryCache::~CompressedSecondaryCache() { cache_.reset(); }
std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
const Slice& key, const Cache::CreateCallback& create_cb, bool /*wait*/,
bool advise_erase, bool& is_in_sec_cache) {
const Slice& key, const Cache::CacheItemHelper* helper,
Cache::CreateContext* create_context, bool /*wait*/, bool advise_erase,
bool& kept_in_sec_cache) {
assert(helper);
std::unique_ptr<SecondaryCacheResultHandle> handle;
is_in_sec_cache = false;
kept_in_sec_cache = false;
Cache::Handle* lru_handle = cache_->Lookup(key);
if (lru_handle == nullptr) {
return nullptr;
@ -64,12 +58,15 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
ptr = reinterpret_cast<CacheAllocationPtr*>(handle_value);
handle_value_charge = cache_->GetCharge(lru_handle);
}
MemoryAllocator* allocator = cache_options_.memory_allocator.get();
Status s;
void* value{nullptr};
Cache::ObjectPtr value{nullptr};
size_t charge{0};
if (cache_options_.compression_type == kNoCompression) {
s = create_cb(ptr->get(), handle_value_charge, &value, &charge);
if (cache_options_.compression_type == kNoCompression ||
cache_options_.do_not_compress_roles.Contains(helper->role)) {
s = helper->create_cb(Slice(ptr->get(), handle_value_charge),
create_context, allocator, &value, &charge);
} else {
UncompressionContext uncompression_context(cache_options_.compression_type);
UncompressionInfo uncompression_info(uncompression_context,
@ -79,14 +76,14 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
size_t uncompressed_size{0};
CacheAllocationPtr uncompressed = UncompressData(
uncompression_info, (char*)ptr->get(), handle_value_charge,
&uncompressed_size, cache_options_.compress_format_version,
cache_options_.memory_allocator.get());
&uncompressed_size, cache_options_.compress_format_version, allocator);
if (!uncompressed) {
cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
return nullptr;
}
s = create_cb(uncompressed.get(), uncompressed_size, &value, &charge);
s = helper->create_cb(Slice(uncompressed.get(), uncompressed_size),
create_context, allocator, &value, &charge);
}
if (!s.ok()) {
@ -98,30 +95,32 @@ std::unique_ptr<SecondaryCacheResultHandle> CompressedSecondaryCache::Lookup(
cache_->Release(lru_handle, /*erase_if_last_ref=*/true);
// Insert a dummy handle.
cache_
->Insert(key, /*value=*/nullptr, /*charge=*/0,
GetDeletionCallback(cache_options_.enable_custom_split_merge))
->Insert(key, /*obj=*/nullptr,
GetHelper(cache_options_.enable_custom_split_merge),
/*charge=*/0)
.PermitUncheckedError();
} else {
is_in_sec_cache = true;
kept_in_sec_cache = true;
cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
}
handle.reset(new CompressedSecondaryCacheResultHandle(value, charge));
return handle;
}
Status CompressedSecondaryCache::Insert(const Slice& key, void* value,
Status CompressedSecondaryCache::Insert(const Slice& key,
Cache::ObjectPtr value,
const Cache::CacheItemHelper* helper) {
if (value == nullptr) {
return Status::InvalidArgument();
}
Cache::Handle* lru_handle = cache_->Lookup(key);
Cache::DeleterFn del_cb =
GetDeletionCallback(cache_options_.enable_custom_split_merge);
auto internal_helper = GetHelper(cache_options_.enable_custom_split_merge);
if (lru_handle == nullptr) {
PERF_COUNTER_ADD(compressed_sec_cache_insert_dummy_count, 1);
// Insert a dummy handle if the handle is evicted for the first time.
return cache_->Insert(key, /*value=*/nullptr, /*charge=*/0, del_cb);
return cache_->Insert(key, /*obj=*/nullptr, internal_helper,
/*charge=*/0);
} else {
cache_->Release(lru_handle, /*erase_if_last_ref=*/false);
}
@ -137,7 +136,8 @@ Status CompressedSecondaryCache::Insert(const Slice& key, void* value,
Slice val(ptr.get(), size);
std::string compressed_val;
if (cache_options_.compression_type != kNoCompression) {
if (cache_options_.compression_type != kNoCompression &&
!cache_options_.do_not_compress_roles.Contains(helper->role)) {
PERF_COUNTER_ADD(compressed_sec_cache_uncompressed_bytes, size);
CompressionOptions compression_opts;
CompressionContext compression_context(cache_options_.compression_type);
@ -169,10 +169,10 @@ Status CompressedSecondaryCache::Insert(const Slice& key, void* value,
size_t charge{0};
CacheValueChunk* value_chunks_head =
SplitValueIntoChunks(val, cache_options_.compression_type, charge);
return cache_->Insert(key, value_chunks_head, charge, del_cb);
return cache_->Insert(key, value_chunks_head, internal_helper, charge);
} else {
CacheAllocationPtr* buf = new CacheAllocationPtr(std::move(ptr));
return cache_->Insert(key, buf, size, del_cb);
return cache_->Insert(key, buf, internal_helper, size);
}
}
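A condensed sketch of the two-phase admission implemented by the Lookup and Insert paths above (AdmitSketch is a hypothetical helper, not part of the class): the first Insert for a key stores only a zero-charge dummy; a key seen again while its dummy is still resident gets its payload stored for real.

  Status AdmitSketch(Cache& lru, const Slice& key, Cache::ObjectPtr payload,
                     const Cache::CacheItemHelper* helper, size_t charge) {
    Cache::Handle* h = lru.Lookup(key);
    if (h == nullptr) {
      // First sighting: remember the key cheaply; drop the payload for now.
      return lru.Insert(key, /*obj=*/nullptr, helper, /*charge=*/0);
    }
    lru.Release(h, /*erase_if_last_ref=*/false);
    // Second sighting: the entry has earned its space; store it for real.
    return lru.Insert(key, payload, helper, charge);
  }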
@ -276,50 +276,43 @@ CacheAllocationPtr CompressedSecondaryCache::MergeChunksIntoValue(
return ptr;
}
Cache::DeleterFn CompressedSecondaryCache::GetDeletionCallback(
bool enable_custom_split_merge) {
const Cache::CacheItemHelper* CompressedSecondaryCache::GetHelper(
bool enable_custom_split_merge) const {
if (enable_custom_split_merge) {
return [](const Slice& /*key*/, void* obj) {
CacheValueChunk* chunks_head = reinterpret_cast<CacheValueChunk*>(obj);
while (chunks_head != nullptr) {
CacheValueChunk* tmp_chunk = chunks_head;
chunks_head = chunks_head->next;
tmp_chunk->Free();
obj = nullptr;
};
};
static const Cache::CacheItemHelper kHelper{
CacheEntryRole::kMisc,
[](Cache::ObjectPtr obj, MemoryAllocator* /*alloc*/) {
CacheValueChunk* chunks_head = static_cast<CacheValueChunk*>(obj);
while (chunks_head != nullptr) {
CacheValueChunk* tmp_chunk = chunks_head;
chunks_head = chunks_head->next;
tmp_chunk->Free();
obj = nullptr;
};
}};
return &kHelper;
} else {
return [](const Slice& /*key*/, void* obj) {
delete reinterpret_cast<CacheAllocationPtr*>(obj);
obj = nullptr;
};
static const Cache::CacheItemHelper kHelper{
CacheEntryRole::kMisc,
[](Cache::ObjectPtr obj, MemoryAllocator* /*alloc*/) {
delete static_cast<CacheAllocationPtr*>(obj);
obj = nullptr;
}};
return &kHelper;
}
}
std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
size_t capacity, int num_shard_bits, bool strict_capacity_limit,
double high_pri_pool_ratio, double low_pri_pool_ratio,
std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy,
CompressionType compression_type, uint32_t compress_format_version,
bool enable_custom_split_merge) {
return std::make_shared<CompressedSecondaryCache>(
capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio,
low_pri_pool_ratio, memory_allocator, use_adaptive_mutex,
metadata_charge_policy, compression_type, compress_format_version,
enable_custom_split_merge);
std::shared_ptr<SecondaryCache>
CompressedSecondaryCacheOptions::MakeSharedSecondaryCache() const {
return std::make_shared<CompressedSecondaryCache>(*this);
}
Status CompressedSecondaryCache::Deflate(size_t decrease) {
return cache_res_mgr_->UpdateCacheReservation(decrease, /*increase=*/true);
}
std::shared_ptr<SecondaryCache> NewCompressedSecondaryCache(
const CompressedSecondaryCacheOptions& opts) {
// The secondary_cache is disabled for this LRUCache instance.
assert(opts.secondary_cache == nullptr);
return NewCompressedSecondaryCache(
opts.capacity, opts.num_shard_bits, opts.strict_capacity_limit,
opts.high_pri_pool_ratio, opts.low_pri_pool_ratio, opts.memory_allocator,
opts.use_adaptive_mutex, opts.metadata_charge_policy,
opts.compression_type, opts.compress_format_version,
opts.enable_custom_split_merge);
Status CompressedSecondaryCache::Inflate(size_t increase) {
return cache_res_mgr_->UpdateCacheReservation(increase, /*increase=*/false);
}
} // namespace ROCKSDB_NAMESPACE

@ -9,8 +9,9 @@
#include <cstddef>
#include <memory>
#include "cache/cache_reservation_manager.h"
#include "cache/lru_cache.h"
#include "memory/memory_allocator.h"
#include "memory/memory_allocator_impl.h"
#include "rocksdb/secondary_cache.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
@ -21,7 +22,7 @@ namespace ROCKSDB_NAMESPACE {
class CompressedSecondaryCacheResultHandle : public SecondaryCacheResultHandle {
public:
CompressedSecondaryCacheResultHandle(void* value, size_t size)
CompressedSecondaryCacheResultHandle(Cache::ObjectPtr value, size_t size)
: value_(value), size_(size) {}
~CompressedSecondaryCacheResultHandle() override = default;
@ -34,12 +35,12 @@ class CompressedSecondaryCacheResultHandle : public SecondaryCacheResultHandle {
void Wait() override {}
void* Value() override { return value_; }
Cache::ObjectPtr Value() override { return value_; }
size_t Size() override { return size_; }
private:
void* value_;
Cache::ObjectPtr value_;
size_t size_;
};
@ -69,26 +70,19 @@ class CompressedSecondaryCacheResultHandle : public SecondaryCacheResultHandle {
class CompressedSecondaryCache : public SecondaryCache {
public:
CompressedSecondaryCache(
size_t capacity, int num_shard_bits, bool strict_capacity_limit,
double high_pri_pool_ratio, double low_pri_pool_ratio,
std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
CacheMetadataChargePolicy metadata_charge_policy =
kDefaultCacheMetadataChargePolicy,
CompressionType compression_type = CompressionType::kLZ4Compression,
uint32_t compress_format_version = 2,
bool enable_custom_split_merge = false);
explicit CompressedSecondaryCache(
const CompressedSecondaryCacheOptions& opts);
~CompressedSecondaryCache() override;
const char* Name() const override { return "CompressedSecondaryCache"; }
Status Insert(const Slice& key, void* value,
Status Insert(const Slice& key, Cache::ObjectPtr value,
const Cache::CacheItemHelper* helper) override;
std::unique_ptr<SecondaryCacheResultHandle> Lookup(
const Slice& key, const Cache::CreateCallback& create_cb, bool /*wait*/,
bool advise_erase, bool& is_in_sec_cache) override;
const Slice& key, const Cache::CacheItemHelper* helper,
Cache::CreateContext* create_context, bool /*wait*/, bool advise_erase,
bool& kept_in_sec_cache) override;
bool SupportForceErase() const override { return true; }
@ -100,10 +94,16 @@ class CompressedSecondaryCache : public SecondaryCache {
Status GetCapacity(size_t& capacity) override;
Status Deflate(size_t decrease) override;
Status Inflate(size_t increase) override;
std::string GetPrintableOptions() const override;
size_t TEST_GetUsage() { return cache_->GetUsage(); }
private:
friend class CompressedSecondaryCacheTest;
friend class CompressedSecondaryCacheTestBase;
static constexpr std::array<uint16_t, 8> malloc_bin_sizes_{
128, 256, 512, 1024, 2048, 4096, 8192, 16384};
@ -129,11 +129,12 @@ class CompressedSecondaryCache : public SecondaryCache {
CacheAllocationPtr MergeChunksIntoValue(const void* chunks_head,
size_t& charge);
// An implementation of Cache::DeleterFn.
static Cache::DeleterFn GetDeletionCallback(bool enable_custom_split_merge);
// TODO: clean up to use cleaner interfaces in typed_cache.h
const Cache::CacheItemHelper* GetHelper(bool enable_custom_split_merge) const;
std::shared_ptr<Cache> cache_;
CompressedSecondaryCacheOptions cache_options_;
mutable port::Mutex capacity_mutex_;
std::shared_ptr<ConcurrentCacheReservationManager> cache_res_mgr_;
};
} // namespace ROCKSDB_NAMESPACE

@ -5,86 +5,45 @@
#include "cache/compressed_secondary_cache.h"
#include <array>
#include <iterator>
#include <memory>
#include <tuple>
#include "cache/secondary_cache_adapter.h"
#include "memory/jemalloc_nodump_allocator.h"
#include "rocksdb/convenience.h"
#include "test_util/secondary_cache_test_util.h"
#include "test_util/testharness.h"
#include "test_util/testutil.h"
#include "util/cast_util.h"
namespace ROCKSDB_NAMESPACE {
class CompressedSecondaryCacheTest : public testing::Test {
public:
CompressedSecondaryCacheTest() : fail_create_(false) {}
~CompressedSecondaryCacheTest() override = default;
protected:
class TestItem {
public:
TestItem(const char* buf, size_t size) : buf_(new char[size]), size_(size) {
memcpy(buf_.get(), buf, size);
}
~TestItem() = default;
char* Buf() { return buf_.get(); }
[[nodiscard]] size_t Size() const { return size_; }
private:
std::unique_ptr<char[]> buf_;
size_t size_;
};
using secondary_cache_test_util::GetTestingCacheTypes;
using secondary_cache_test_util::WithCacheType;
static size_t SizeCallback(void* obj) {
return reinterpret_cast<TestItem*>(obj)->Size();
}
static Status SaveToCallback(void* from_obj, size_t from_offset,
size_t length, void* out) {
auto item = reinterpret_cast<TestItem*>(from_obj);
const char* buf = item->Buf();
EXPECT_EQ(length, item->Size());
EXPECT_EQ(from_offset, 0);
memcpy(out, buf, length);
return Status::OK();
}
static void DeletionCallback(const Slice& /*key*/, void* obj) {
delete reinterpret_cast<TestItem*>(obj);
obj = nullptr;
}
// 16 bytes for HCC compatibility
const std::string key0 = "____ ____key0";
const std::string key1 = "____ ____key1";
const std::string key2 = "____ ____key2";
const std::string key3 = "____ ____key3";
static Cache::CacheItemHelper helper_;
static Status SaveToCallbackFail(void* /*obj*/, size_t /*offset*/,
size_t /*size*/, void* /*out*/) {
return Status::NotSupported();
}
static Cache::CacheItemHelper helper_fail_;
Cache::CreateCallback test_item_creator = [&](const void* buf, size_t size,
void** out_obj,
size_t* charge) -> Status {
if (fail_create_) {
return Status::NotSupported();
}
*out_obj = reinterpret_cast<void*>(new TestItem((char*)buf, size));
*charge = size;
return Status::OK();
};
void SetFailCreate(bool fail) { fail_create_ = fail; }
class CompressedSecondaryCacheTestBase : public testing::Test,
public WithCacheType {
public:
CompressedSecondaryCacheTestBase() {}
~CompressedSecondaryCacheTestBase() override = default;
protected:
void BasicTestHelper(std::shared_ptr<SecondaryCache> sec_cache,
bool sec_cache_is_compressed) {
get_perf_context()->Reset();
bool is_in_sec_cache{true};
bool kept_in_sec_cache{true};
// Lookup a non-existent key.
std::unique_ptr<SecondaryCacheResultHandle> handle0 = sec_cache->Lookup(
"k0", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
std::unique_ptr<SecondaryCacheResultHandle> handle0 =
sec_cache->Lookup(key0, GetHelper(), this, true, /*advise_erase=*/true,
kept_in_sec_cache);
ASSERT_EQ(handle0, nullptr);
Random rnd(301);
@ -92,25 +51,25 @@ class CompressedSecondaryCacheTest : public testing::Test {
std::string str1(rnd.RandomString(1000));
TestItem item1(str1.data(), str1.length());
// A dummy handle is inserted if the item is inserted for the first time.
ASSERT_OK(sec_cache->Insert("k1", &item1,
&CompressedSecondaryCacheTest::helper_));
ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 1);
ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
std::unique_ptr<SecondaryCacheResultHandle> handle1_1 = sec_cache->Lookup(
"k1", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
std::unique_ptr<SecondaryCacheResultHandle> handle1_1 =
sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/false,
kept_in_sec_cache);
ASSERT_EQ(handle1_1, nullptr);
// Insert and Lookup the item k1 for the second time and advise erasing it.
ASSERT_OK(sec_cache->Insert("k1", &item1,
&CompressedSecondaryCacheTest::helper_));
ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1);
std::unique_ptr<SecondaryCacheResultHandle> handle1_2 = sec_cache->Lookup(
"k1", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
std::unique_ptr<SecondaryCacheResultHandle> handle1_2 =
sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/true,
kept_in_sec_cache);
ASSERT_NE(handle1_2, nullptr);
ASSERT_FALSE(is_in_sec_cache);
ASSERT_FALSE(kept_in_sec_cache);
if (sec_cache_is_compressed) {
ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
1000);
@ -127,22 +86,22 @@ class CompressedSecondaryCacheTest : public testing::Test {
ASSERT_EQ(memcmp(val1->Buf(), item1.Buf(), item1.Size()), 0);
// Lookup the item k1 again.
std::unique_ptr<SecondaryCacheResultHandle> handle1_3 = sec_cache->Lookup(
"k1", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
std::unique_ptr<SecondaryCacheResultHandle> handle1_3 =
sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/true,
kept_in_sec_cache);
ASSERT_EQ(handle1_3, nullptr);
// Insert and Lookup the item k2.
std::string str2(rnd.RandomString(1000));
TestItem item2(str2.data(), str2.length());
ASSERT_OK(sec_cache->Insert("k2", &item2,
&CompressedSecondaryCacheTest::helper_));
ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 2);
std::unique_ptr<SecondaryCacheResultHandle> handle2_1 = sec_cache->Lookup(
"k2", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
std::unique_ptr<SecondaryCacheResultHandle> handle2_1 =
sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/false,
kept_in_sec_cache);
ASSERT_EQ(handle2_1, nullptr);
ASSERT_OK(sec_cache->Insert("k2", &item2,
&CompressedSecondaryCacheTest::helper_));
ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2);
if (sec_cache_is_compressed) {
ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
@ -153,8 +112,9 @@ class CompressedSecondaryCacheTest : public testing::Test {
ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
}
std::unique_ptr<SecondaryCacheResultHandle> handle2_2 = sec_cache->Lookup(
"k2", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
std::unique_ptr<SecondaryCacheResultHandle> handle2_2 =
sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/false,
kept_in_sec_cache);
ASSERT_NE(handle2_2, nullptr);
std::unique_ptr<TestItem> val2 =
std::unique_ptr<TestItem>(static_cast<TestItem*>(handle2_2->Value()));
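BasicTestHelper pins down the two-phase admission policy of CompressedSecondaryCache: the first Insert of a key only records a dummy entry, and only the second Insert stores the value. A condensed sketch, using just the calls exercised above:
ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper()));  // 1st: dummy only
// A Lookup here returns nullptr: the value is not resident yet.
ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper()));  // 2nd: real insert
// A Lookup now succeeds; with advise_erase=true the entry is handed over to
// the caller and kept_in_sec_cache comes back false.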
@ -223,28 +183,26 @@ class CompressedSecondaryCacheTest : public testing::Test {
std::string str1(rnd.RandomString(1000));
TestItem item1(str1.data(), str1.length());
// Insert a dummy handle.
ASSERT_OK(sec_cache->Insert("k1", &item1,
&CompressedSecondaryCacheTest::helper_));
ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper()));
// Insert k1.
ASSERT_OK(sec_cache->Insert("k1", &item1,
&CompressedSecondaryCacheTest::helper_));
ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper()));
// Insert and Lookup the second item.
std::string str2(rnd.RandomString(200));
TestItem item2(str2.data(), str2.length());
// Insert a dummy handle, k1 is not evicted.
ASSERT_OK(sec_cache->Insert("k2", &item2,
&CompressedSecondaryCacheTest::helper_));
bool is_in_sec_cache{false};
std::unique_ptr<SecondaryCacheResultHandle> handle1 = sec_cache->Lookup(
"k1", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper()));
bool kept_in_sec_cache{false};
std::unique_ptr<SecondaryCacheResultHandle> handle1 =
sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/false,
kept_in_sec_cache);
ASSERT_EQ(handle1, nullptr);
// Insert k2 and k1 is evicted.
ASSERT_OK(sec_cache->Insert("k2", &item2,
&CompressedSecondaryCacheTest::helper_));
std::unique_ptr<SecondaryCacheResultHandle> handle2 = sec_cache->Lookup(
"k2", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
ASSERT_OK(sec_cache->Insert(key2, &item2, GetHelper()));
std::unique_ptr<SecondaryCacheResultHandle> handle2 =
sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/false,
kept_in_sec_cache);
ASSERT_NE(handle2, nullptr);
std::unique_ptr<TestItem> val2 =
std::unique_ptr<TestItem>(static_cast<TestItem*>(handle2->Value()));
@ -252,27 +210,26 @@ class CompressedSecondaryCacheTest : public testing::Test {
ASSERT_EQ(memcmp(val2->Buf(), item2.Buf(), item2.Size()), 0);
// Insert k1 again and a dummy handle is inserted.
ASSERT_OK(sec_cache->Insert("k1", &item1,
&CompressedSecondaryCacheTest::helper_));
ASSERT_OK(sec_cache->Insert(key1, &item1, GetHelper()));
std::unique_ptr<SecondaryCacheResultHandle> handle1_1 = sec_cache->Lookup(
"k1", test_item_creator, true, /*advise_erase=*/false, is_in_sec_cache);
std::unique_ptr<SecondaryCacheResultHandle> handle1_1 =
sec_cache->Lookup(key1, GetHelper(), this, true, /*advise_erase=*/false,
kept_in_sec_cache);
ASSERT_EQ(handle1_1, nullptr);
// Create Fails.
SetFailCreate(true);
std::unique_ptr<SecondaryCacheResultHandle> handle2_1 = sec_cache->Lookup(
"k2", test_item_creator, true, /*advise_erase=*/true, is_in_sec_cache);
std::unique_ptr<SecondaryCacheResultHandle> handle2_1 =
sec_cache->Lookup(key2, GetHelper(), this, true, /*advise_erase=*/true,
kept_in_sec_cache);
ASSERT_EQ(handle2_1, nullptr);
// Save Fails.
std::string str3 = rnd.RandomString(10);
TestItem item3(str3.data(), str3.length());
// The Status is OK because a dummy handle is inserted.
ASSERT_OK(sec_cache->Insert("k3", &item3,
&CompressedSecondaryCacheTest::helper_fail_));
ASSERT_NOK(sec_cache->Insert("k3", &item3,
&CompressedSecondaryCacheTest::helper_fail_));
ASSERT_OK(sec_cache->Insert(key3, &item3, GetHelperFail()));
ASSERT_NOK(sec_cache->Insert(key3, &item3, GetHelperFail()));
sec_cache.reset();
}
@ -296,28 +253,22 @@ class CompressedSecondaryCacheTest : public testing::Test {
secondary_cache_opts.enable_custom_split_merge = enable_custom_split_merge;
std::shared_ptr<SecondaryCache> secondary_cache =
NewCompressedSecondaryCache(secondary_cache_opts);
LRUCacheOptions lru_cache_opts(
std::shared_ptr<Cache> cache = NewCache(
/*_capacity =*/1300, /*_num_shard_bits =*/0,
/*_strict_capacity_limit =*/false, /*_high_pri_pool_ratio =*/0.5,
/*_memory_allocator =*/nullptr, kDefaultToAdaptiveMutex,
kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio =*/0.0);
lru_cache_opts.secondary_cache = secondary_cache;
std::shared_ptr<Cache> cache = NewLRUCache(lru_cache_opts);
/*_strict_capacity_limit =*/true, secondary_cache);
std::shared_ptr<Statistics> stats = CreateDBStatistics();
get_perf_context()->Reset();
Random rnd(301);
std::string str1 = rnd.RandomString(1001);
auto item1_1 = new TestItem(str1.data(), str1.length());
ASSERT_OK(cache->Insert(
"k1", item1_1, &CompressedSecondaryCacheTest::helper_, str1.length()));
ASSERT_OK(cache->Insert(key1, item1_1, GetHelper(), str1.length()));
std::string str2 = rnd.RandomString(1012);
auto item2_1 = new TestItem(str2.data(), str2.length());
// After this Insert, primary cache contains k2 and secondary cache contains
// k1's dummy item.
ASSERT_OK(cache->Insert(
"k2", item2_1, &CompressedSecondaryCacheTest::helper_, str2.length()));
ASSERT_OK(cache->Insert(key2, item2_1, GetHelper(), str2.length()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 1);
ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
@ -326,22 +277,19 @@ class CompressedSecondaryCacheTest : public testing::Test {
auto item3_1 = new TestItem(str3.data(), str3.length());
// After this Insert, primary cache contains k3 and secondary cache contains
// k1's dummy item and k2's dummy item.
ASSERT_OK(cache->Insert(
"k3", item3_1, &CompressedSecondaryCacheTest::helper_, str3.length()));
ASSERT_OK(cache->Insert(key3, item3_1, GetHelper(), str3.length()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 2);
// After this Insert, primary cache contains k1 and secondary cache contains
// k1's dummy item, k2's dummy item, and k3's dummy item.
auto item1_2 = new TestItem(str1.data(), str1.length());
ASSERT_OK(cache->Insert(
"k1", item1_2, &CompressedSecondaryCacheTest::helper_, str1.length()));
ASSERT_OK(cache->Insert(key1, item1_2, GetHelper(), str1.length()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 3);
// After this Insert, primary cache contains k2 and secondary cache contains
// k1's item, k2's dummy item, and k3's dummy item.
auto item2_2 = new TestItem(str2.data(), str2.length());
ASSERT_OK(cache->Insert(
"k2", item2_2, &CompressedSecondaryCacheTest::helper_, str2.length()));
ASSERT_OK(cache->Insert(key2, item2_2, GetHelper(), str2.length()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1);
if (sec_cache_is_compressed) {
ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
@ -356,8 +304,7 @@ class CompressedSecondaryCacheTest : public testing::Test {
// After this Insert, primary cache contains k3 and secondary cache contains
// k1's item and k2's item.
auto item3_2 = new TestItem(str3.data(), str3.length());
ASSERT_OK(cache->Insert(
"k3", item3_2, &CompressedSecondaryCacheTest::helper_, str3.length()));
ASSERT_OK(cache->Insert(key3, item3_2, GetHelper(), str3.length()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 2);
if (sec_cache_is_compressed) {
ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
@ -370,8 +317,7 @@ class CompressedSecondaryCacheTest : public testing::Test {
}
Cache::Handle* handle;
handle = cache->Lookup("k3", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true,
handle = cache->Lookup(key3, GetHelper(), this, Cache::Priority::LOW,
stats.get());
ASSERT_NE(handle, nullptr);
auto val3 = static_cast<TestItem*>(cache->Value(handle));
@ -380,15 +326,13 @@ class CompressedSecondaryCacheTest : public testing::Test {
cache->Release(handle);
// Lookup a non-existent key.
handle = cache->Lookup("k0", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true,
handle = cache->Lookup(key0, GetHelper(), this, Cache::Priority::LOW,
stats.get());
ASSERT_EQ(handle, nullptr);
// This Lookup should just insert a dummy handle in the primary cache
// while k1 stays in the secondary cache.
handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true,
handle = cache->Lookup(key1, GetHelper(), this, Cache::Priority::LOW,
stats.get());
ASSERT_NE(handle, nullptr);
ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 1);
@ -400,8 +344,7 @@ class CompressedSecondaryCacheTest : public testing::Test {
// This Lookup should erase k1 from the secondary cache and insert
// it into primary cache; then k3 is demoted.
// k2 and k3 are in secondary cache.
handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true,
handle = cache->Lookup(key1, GetHelper(), this, Cache::Priority::LOW,
stats.get());
ASSERT_NE(handle, nullptr);
ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 1);
@ -409,8 +352,7 @@ class CompressedSecondaryCacheTest : public testing::Test {
cache->Release(handle);
// k2 is still in secondary cache.
handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true,
handle = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW,
stats.get());
ASSERT_NE(handle, nullptr);
ASSERT_EQ(get_perf_context()->block_cache_standalone_handle_count, 2);
@ -418,8 +360,7 @@ class CompressedSecondaryCacheTest : public testing::Test {
// Testing SetCapacity().
ASSERT_OK(secondary_cache->SetCapacity(0));
handle = cache->Lookup("k3", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true,
handle = cache->Lookup(key3, GetHelper(), this, Cache::Priority::LOW,
stats.get());
ASSERT_EQ(handle, nullptr);
@ -429,35 +370,30 @@ class CompressedSecondaryCacheTest : public testing::Test {
ASSERT_EQ(capacity, 7000);
auto item1_3 = new TestItem(str1.data(), str1.length());
// After this Insert, primary cache contains k1.
ASSERT_OK(cache->Insert(
"k1", item1_3, &CompressedSecondaryCacheTest::helper_, str2.length()));
ASSERT_OK(cache->Insert(key1, item1_3, GetHelper(), str2.length()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 3);
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 4);
auto item2_3 = new TestItem(str2.data(), str2.length());
// After this Insert, primary cache contains k2 and secondary cache contains
// k1's dummy item.
ASSERT_OK(cache->Insert(
"k2", item2_3, &CompressedSecondaryCacheTest::helper_, str1.length()));
ASSERT_OK(cache->Insert(key2, item2_3, GetHelper(), str1.length()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 4);
auto item1_4 = new TestItem(str1.data(), str1.length());
// After this Insert, primary cache contains k1 and secondary cache contains
// k1's dummy item and k2's dummy item.
ASSERT_OK(cache->Insert(
"k1", item1_4, &CompressedSecondaryCacheTest::helper_, str2.length()));
ASSERT_OK(cache->Insert(key1, item1_4, GetHelper(), str2.length()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 5);
auto item2_4 = new TestItem(str2.data(), str2.length());
// After this Insert, primary cache contains k2 and secondary cache contains
// k1's real item and k2's dummy item.
ASSERT_OK(cache->Insert(
"k2", item2_4, &CompressedSecondaryCacheTest::helper_, str2.length()));
ASSERT_OK(cache->Insert(key2, item2_4, GetHelper(), str2.length()));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 5);
// This Lookup should just insert a dummy handle in the primary cache
// while k1 stays in the secondary cache.
handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true,
handle = cache->Lookup(key1, GetHelper(), this, Cache::Priority::LOW,
stats.get());
ASSERT_NE(handle, nullptr);
@ -485,31 +421,31 @@ class CompressedSecondaryCacheTest : public testing::Test {
std::shared_ptr<SecondaryCache> secondary_cache =
NewCompressedSecondaryCache(secondary_cache_opts);
LRUCacheOptions opts(
std::shared_ptr<Cache> cache = NewCache(
/*_capacity=*/1300, /*_num_shard_bits=*/0,
/*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5,
/*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex,
kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0);
opts.secondary_cache = secondary_cache;
std::shared_ptr<Cache> cache = NewLRUCache(opts);
/*_strict_capacity_limit=*/false, secondary_cache);
Random rnd(301);
std::string str1 = rnd.RandomString(1001);
auto item1 = std::make_unique<TestItem>(str1.data(), str1.length());
ASSERT_NOK(cache->Insert("k1", item1.get(), nullptr, str1.length()));
ASSERT_OK(cache->Insert("k1", item1.get(),
&CompressedSecondaryCacheTest::helper_,
str1.length()));
ASSERT_OK(cache->Insert(key1, item1.get(), GetHelper(), str1.length()));
item1.release(); // Appease clang-analyze "potential memory leak"
Cache::Handle* handle;
handle = cache->Lookup("k2", nullptr, test_item_creator,
Cache::Priority::LOW, true);
handle = cache->Lookup(key2, nullptr, this, Cache::Priority::LOW);
ASSERT_EQ(handle, nullptr);
handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, false);
handle = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW);
ASSERT_EQ(handle, nullptr);
Cache::AsyncLookupHandle ah;
ah.key = key2;
ah.helper = GetHelper();
ah.create_context = this;
ah.priority = Cache::Priority::LOW;
cache->StartAsyncLookup(ah);
cache->Wait(ah);
ASSERT_EQ(ah.Result(), nullptr);
cache.reset();
secondary_cache.reset();
}
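The tail of this test exercises the new asynchronous lookup flow that replaces the old wait=false Lookup plus IsReady()/WaitAll(): the caller fills a Cache::AsyncLookupHandle, kicks off the lookup, and blocks in Wait() only when it needs the result. The pattern, as a sketch (field set as shown above, not necessarily exhaustive):
Cache::AsyncLookupHandle ah;
ah.key = key;
ah.helper = GetHelper();
ah.create_context = this;
ah.priority = Cache::Priority::LOW;
cache->StartAsyncLookup(ah);     // may probe the secondary cache
// ... overlap other work here ...
cache->Wait(ah);                 // ensures the lookup has completed
Cache::Handle* h = ah.Result();  // nullptr on miss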
@ -532,40 +468,29 @@ class CompressedSecondaryCacheTest : public testing::Test {
std::shared_ptr<SecondaryCache> secondary_cache =
NewCompressedSecondaryCache(secondary_cache_opts);
LRUCacheOptions opts(
std::shared_ptr<Cache> cache = NewCache(
/*_capacity=*/1300, /*_num_shard_bits=*/0,
/*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5,
/*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex,
kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0);
opts.secondary_cache = secondary_cache;
std::shared_ptr<Cache> cache = NewLRUCache(opts);
/*_strict_capacity_limit=*/true, secondary_cache);
Random rnd(301);
std::string str1 = rnd.RandomString(1001);
auto item1 = new TestItem(str1.data(), str1.length());
ASSERT_OK(cache->Insert("k1", item1,
&CompressedSecondaryCacheTest::helper_fail_,
str1.length()));
ASSERT_OK(cache->Insert(key1, item1, GetHelperFail(), str1.length()));
std::string str2 = rnd.RandomString(1002);
auto item2 = new TestItem(str2.data(), str2.length());
// k1 should be demoted to the secondary cache.
ASSERT_OK(cache->Insert("k2", item2,
&CompressedSecondaryCacheTest::helper_fail_,
str2.length()));
ASSERT_OK(cache->Insert(key2, item2, GetHelperFail(), str2.length()));
Cache::Handle* handle;
handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_fail_,
test_item_creator, Cache::Priority::LOW, true);
handle = cache->Lookup(key2, GetHelperFail(), this, Cache::Priority::LOW);
ASSERT_NE(handle, nullptr);
cache->Release(handle);
// This lookup should fail, since k1 demotion would have failed.
handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_fail_,
test_item_creator, Cache::Priority::LOW, true);
handle = cache->Lookup(key1, GetHelperFail(), this, Cache::Priority::LOW);
ASSERT_EQ(handle, nullptr);
// Since k1 was not promoted, k2 should still be in cache.
handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_fail_,
test_item_creator, Cache::Priority::LOW, true);
handle = cache->Lookup(key2, GetHelperFail(), this, Cache::Priority::LOW);
ASSERT_NE(handle, nullptr);
cache->Release(handle);
@ -591,39 +516,30 @@ class CompressedSecondaryCacheTest : public testing::Test {
std::shared_ptr<SecondaryCache> secondary_cache =
NewCompressedSecondaryCache(secondary_cache_opts);
LRUCacheOptions opts(
std::shared_ptr<Cache> cache = NewCache(
/*_capacity=*/1300, /*_num_shard_bits=*/0,
/*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5,
/*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex,
kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0);
opts.secondary_cache = secondary_cache;
std::shared_ptr<Cache> cache = NewLRUCache(opts);
/*_strict_capacity_limit=*/true, secondary_cache);
Random rnd(301);
std::string str1 = rnd.RandomString(1001);
auto item1 = new TestItem(str1.data(), str1.length());
ASSERT_OK(cache->Insert("k1", item1, &CompressedSecondaryCacheTest::helper_,
str1.length()));
ASSERT_OK(cache->Insert(key1, item1, GetHelper(), str1.length()));
std::string str2 = rnd.RandomString(1002);
auto item2 = new TestItem(str2.data(), str2.length());
// k1 should be demoted to the secondary cache.
ASSERT_OK(cache->Insert("k2", item2, &CompressedSecondaryCacheTest::helper_,
str2.length()));
ASSERT_OK(cache->Insert(key2, item2, GetHelper(), str2.length()));
Cache::Handle* handle;
SetFailCreate(true);
handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true);
handle = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW);
ASSERT_NE(handle, nullptr);
cache->Release(handle);
// This lookup should fail, since k1 creation would have failed
handle = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true);
handle = cache->Lookup(key1, GetHelper(), this, Cache::Priority::LOW);
ASSERT_EQ(handle, nullptr);
// Since k1 didn't get promoted, k2 should still be in cache
handle = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true);
handle = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW);
ASSERT_NE(handle, nullptr);
cache->Release(handle);
@ -649,43 +565,34 @@ class CompressedSecondaryCacheTest : public testing::Test {
std::shared_ptr<SecondaryCache> secondary_cache =
NewCompressedSecondaryCache(secondary_cache_opts);
LRUCacheOptions opts(
std::shared_ptr<Cache> cache = NewCache(
/*_capacity=*/1300, /*_num_shard_bits=*/0,
/*_strict_capacity_limit=*/false, /*_high_pri_pool_ratio=*/0.5,
/*_memory_allocator=*/nullptr, kDefaultToAdaptiveMutex,
kDefaultCacheMetadataChargePolicy, /*_low_pri_pool_ratio=*/0.0);
opts.secondary_cache = secondary_cache;
std::shared_ptr<Cache> cache = NewLRUCache(opts);
/*_strict_capacity_limit=*/false, secondary_cache);
Random rnd(301);
std::string str1 = rnd.RandomString(1001);
auto item1_1 = new TestItem(str1.data(), str1.length());
ASSERT_OK(cache->Insert(
"k1", item1_1, &CompressedSecondaryCacheTest::helper_, str1.length()));
ASSERT_OK(cache->Insert(key1, item1_1, GetHelper(), str1.length()));
std::string str2 = rnd.RandomString(1002);
std::string str2_clone{str2};
auto item2 = new TestItem(str2.data(), str2.length());
// After this Insert, primary cache contains k2 and secondary cache contains
// k1's dummy item.
ASSERT_OK(cache->Insert("k2", item2, &CompressedSecondaryCacheTest::helper_,
str2.length()));
ASSERT_OK(cache->Insert(key2, item2, GetHelper(), str2.length()));
// After this Insert, primary cache contains k1 and secondary cache contains
// k1's dummy item and k2's dummy item.
auto item1_2 = new TestItem(str1.data(), str1.length());
ASSERT_OK(cache->Insert(
"k1", item1_2, &CompressedSecondaryCacheTest::helper_, str1.length()));
ASSERT_OK(cache->Insert(key1, item1_2, GetHelper(), str1.length()));
auto item2_2 = new TestItem(str2.data(), str2.length());
// After this Insert, primary cache contains k2 and secondary cache contains
// k1's item and k2's dummy item.
ASSERT_OK(cache->Insert(
"k2", item2_2, &CompressedSecondaryCacheTest::helper_, str2.length()));
ASSERT_OK(cache->Insert(key2, item2_2, GetHelper(), str2.length()));
Cache::Handle* handle2;
handle2 = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true);
handle2 = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW);
ASSERT_NE(handle2, nullptr);
cache->Release(handle2);
@ -693,14 +600,12 @@ class CompressedSecondaryCacheTest : public testing::Test {
// strict_capacity_limit is true, but the lookup should still succeed.
// A dummy item for k1 is inserted into the primary cache.
Cache::Handle* handle1;
handle1 = cache->Lookup("k1", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true);
handle1 = cache->Lookup(key1, GetHelper(), this, Cache::Priority::LOW);
ASSERT_NE(handle1, nullptr);
cache->Release(handle1);
// Since k1 didn't get inserted, k2 should still be in cache
handle2 = cache->Lookup("k2", &CompressedSecondaryCacheTest::helper_,
test_item_creator, Cache::Priority::LOW, true);
handle2 = cache->Lookup(key2, GetHelper(), this, Cache::Priority::LOW);
ASSERT_NE(handle2, nullptr);
cache->Release(handle2);
@ -723,8 +628,9 @@ class CompressedSecondaryCacheTest : public testing::Test {
using CacheValueChunk = CompressedSecondaryCache::CacheValueChunk;
std::unique_ptr<CompressedSecondaryCache> sec_cache =
std::make_unique<CompressedSecondaryCache>(1000, 0, true, 0.5, 0.0,
allocator);
std::make_unique<CompressedSecondaryCache>(
CompressedSecondaryCacheOptions(1000, 0, true, 0.5, 0.0,
allocator));
Random rnd(301);
// 8500 = 8169 + 233 + 98, so there should be 3 chunks after split.
size_t str_size{8500};
@ -741,7 +647,7 @@ class CompressedSecondaryCacheTest : public testing::Test {
current_chunk = current_chunk->next;
ASSERT_EQ(current_chunk->size, 98);
sec_cache->GetDeletionCallback(true)("dummy", chunks_head);
sec_cache->GetHelper(true)->del_cb(chunks_head, /*alloc*/ nullptr);
}
void MergeChunksIntoValueTest() {
@ -775,7 +681,8 @@ class CompressedSecondaryCacheTest : public testing::Test {
std::string str = str1 + str2 + str3;
std::unique_ptr<CompressedSecondaryCache> sec_cache =
std::make_unique<CompressedSecondaryCache>(1000, 0, true, 0.5, 0.0);
std::make_unique<CompressedSecondaryCache>(
CompressedSecondaryCacheOptions(1000, 0, true, 0.5, 0.0));
size_t charge{0};
CacheAllocationPtr value =
sec_cache->MergeChunksIntoValue(chunks_head, charge);
@ -805,8 +712,9 @@ class CompressedSecondaryCacheTest : public testing::Test {
using CacheValueChunk = CompressedSecondaryCache::CacheValueChunk;
std::unique_ptr<CompressedSecondaryCache> sec_cache =
std::make_unique<CompressedSecondaryCache>(1000, 0, true, 0.5, 0.0,
allocator);
std::make_unique<CompressedSecondaryCache>(
CompressedSecondaryCacheOptions(1000, 0, true, 0.5, 0.0,
allocator));
Random rnd(301);
// 8500 = 8169 + 233 + 98, so there should be 3 chunks after split.
size_t str_size{8500};
@ -822,31 +730,29 @@ class CompressedSecondaryCacheTest : public testing::Test {
std::string value_str{value.get(), charge};
ASSERT_EQ(strcmp(value_str.data(), str.data()), 0);
sec_cache->GetDeletionCallback(true)("dummy", chunks_head);
sec_cache->GetHelper(true)->del_cb(chunks_head, /*alloc*/ nullptr);
}
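Together these helpers check the split/merge round trip: an 8500-byte value splits into three chunks (8169 + 233 + 98 bytes, per the comment above), MergeChunksIntoValue reassembles them, and cleanup now goes through GetHelper(true)->del_cb rather than the removed GetDeletionCallback. The round trip in miniature (sketch; SplitValueIntoChunks is the counterpart that builds chunks_head):
size_t charge{0};
CacheAllocationPtr value =
    sec_cache->MergeChunksIntoValue(chunks_head, charge);
// charge equals the original value size and the bytes compare equal.
sec_cache->GetHelper(true)->del_cb(chunks_head, /*alloc=*/nullptr);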
private:
bool fail_create_;
};
Cache::CacheItemHelper CompressedSecondaryCacheTest::helper_(
CompressedSecondaryCacheTest::SizeCallback,
CompressedSecondaryCacheTest::SaveToCallback,
CompressedSecondaryCacheTest::DeletionCallback);
class CompressedSecondaryCacheTest
: public CompressedSecondaryCacheTestBase,
public testing::WithParamInterface<std::string> {
const std::string& Type() override { return GetParam(); }
};
Cache::CacheItemHelper CompressedSecondaryCacheTest::helper_fail_(
CompressedSecondaryCacheTest::SizeCallback,
CompressedSecondaryCacheTest::SaveToCallbackFail,
CompressedSecondaryCacheTest::DeletionCallback);
INSTANTIATE_TEST_CASE_P(CompressedSecondaryCacheTest,
CompressedSecondaryCacheTest, GetTestingCacheTypes());
class CompressedSecCacheTestWithCompressAndAllocatorParam
: public CompressedSecondaryCacheTest,
public ::testing::WithParamInterface<std::tuple<bool, bool>> {
: public CompressedSecondaryCacheTestBase,
public ::testing::WithParamInterface<
std::tuple<bool, bool, std::string>> {
public:
CompressedSecCacheTestWithCompressAndAllocatorParam() {
sec_cache_is_compressed_ = std::get<0>(GetParam());
use_jemalloc_ = std::get<1>(GetParam());
}
const std::string& Type() override { return std::get<2>(GetParam()); }
bool sec_cache_is_compressed_;
bool use_jemalloc_;
};
@ -857,20 +763,20 @@ TEST_P(CompressedSecCacheTestWithCompressAndAllocatorParam, BasicTes) {
INSTANTIATE_TEST_CASE_P(CompressedSecCacheTests,
CompressedSecCacheTestWithCompressAndAllocatorParam,
::testing::Combine(testing::Bool(), testing::Bool()));
::testing::Combine(testing::Bool(), testing::Bool(),
GetTestingCacheTypes()));
class CompressedSecondaryCacheTestWithCompressionParam
: public CompressedSecondaryCacheTest,
public ::testing::WithParamInterface<bool> {
: public CompressedSecondaryCacheTestBase,
public ::testing::WithParamInterface<std::tuple<bool, std::string>> {
public:
CompressedSecondaryCacheTestWithCompressionParam() {
sec_cache_is_compressed_ = GetParam();
sec_cache_is_compressed_ = std::get<0>(GetParam());
}
const std::string& Type() override { return std::get<1>(GetParam()); }
bool sec_cache_is_compressed_;
};
#ifndef ROCKSDB_LITE
TEST_P(CompressedSecondaryCacheTestWithCompressionParam, BasicTestFromString) {
std::shared_ptr<SecondaryCache> sec_cache{nullptr};
std::string sec_cache_uri;
@ -934,7 +840,6 @@ TEST_P(CompressedSecondaryCacheTestWithCompressionParam,
BasicTestHelper(sec_cache, sec_cache_is_compressed_);
}
#endif // ROCKSDB_LITE
TEST_P(CompressedSecondaryCacheTestWithCompressionParam, FailsTest) {
FailsTest(sec_cache_is_compressed_);
@ -960,18 +865,92 @@ TEST_P(CompressedSecondaryCacheTestWithCompressionParam,
IntegrationFullCapacityTest(sec_cache_is_compressed_);
}
TEST_P(CompressedSecondaryCacheTestWithCompressionParam, EntryRoles) {
CompressedSecondaryCacheOptions opts;
opts.capacity = 2048;
opts.num_shard_bits = 0;
if (sec_cache_is_compressed_) {
if (!LZ4_Supported()) {
ROCKSDB_GTEST_SKIP("This test requires LZ4 support.");
return;
}
} else {
opts.compression_type = CompressionType::kNoCompression;
}
// Select a random subset to include, to keep the test fast
Random& r = *Random::GetTLSInstance();
CacheEntryRoleSet do_not_compress;
for (uint32_t i = 0; i < kNumCacheEntryRoles; ++i) {
// A few included on average, but decent chance of zero
if (r.OneIn(5)) {
do_not_compress.Add(static_cast<CacheEntryRole>(i));
}
}
opts.do_not_compress_roles = do_not_compress;
std::shared_ptr<SecondaryCache> sec_cache = NewCompressedSecondaryCache(opts);
// Fixed seed to ensure consistent compressibility (doesn't compress)
std::string junk(Random(301).RandomString(1000));
for (uint32_t i = 0; i < kNumCacheEntryRoles; ++i) {
CacheEntryRole role = static_cast<CacheEntryRole>(i);
// Uniquify `junk`
junk[0] = static_cast<char>(i);
TestItem item{junk.data(), junk.length()};
Slice ith_key = Slice(junk.data(), 16);
get_perf_context()->Reset();
ASSERT_OK(sec_cache->Insert(ith_key, &item, GetHelper(role)));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_dummy_count, 1U);
ASSERT_OK(sec_cache->Insert(ith_key, &item, GetHelper(role)));
ASSERT_EQ(get_perf_context()->compressed_sec_cache_insert_real_count, 1U);
bool kept_in_sec_cache{true};
std::unique_ptr<SecondaryCacheResultHandle> handle =
sec_cache->Lookup(ith_key, GetHelper(role), this, true,
/*advise_erase=*/true, kept_in_sec_cache);
ASSERT_NE(handle, nullptr);
// Lookup returns the right data
std::unique_ptr<TestItem> val =
std::unique_ptr<TestItem>(static_cast<TestItem*>(handle->Value()));
ASSERT_NE(val, nullptr);
ASSERT_EQ(memcmp(val->Buf(), item.Buf(), item.Size()), 0);
bool compressed =
sec_cache_is_compressed_ && !do_not_compress.Contains(role);
if (compressed) {
ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes,
1000);
ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes,
1007);
} else {
ASSERT_EQ(get_perf_context()->compressed_sec_cache_uncompressed_bytes, 0);
ASSERT_EQ(get_perf_context()->compressed_sec_cache_compressed_bytes, 0);
}
}
}
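The EntryRoles test covers the new per-role compression opt-out: any CacheEntryRole added to do_not_compress_roles is stored uncompressed even when the cache otherwise compresses. A minimal configuration sketch (kFilterBlock is just an example role, not the one the test uses):
CompressedSecondaryCacheOptions opts;
opts.capacity = 2048;
opts.num_shard_bits = 0;
opts.do_not_compress_roles.Add(CacheEntryRole::kFilterBlock);
std::shared_ptr<SecondaryCache> sec_cache = NewCompressedSecondaryCache(opts);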
INSTANTIATE_TEST_CASE_P(CompressedSecCacheTests,
CompressedSecondaryCacheTestWithCompressionParam,
testing::Bool());
testing::Combine(testing::Bool(),
GetTestingCacheTypes()));
class CompressedSecCacheTestWithCompressAndSplitParam
: public CompressedSecondaryCacheTest,
public ::testing::WithParamInterface<std::tuple<bool, bool>> {
: public CompressedSecondaryCacheTestBase,
public ::testing::WithParamInterface<
std::tuple<bool, bool, std::string>> {
public:
CompressedSecCacheTestWithCompressAndSplitParam() {
sec_cache_is_compressed_ = std::get<0>(GetParam());
enable_custom_split_merge_ = std::get<1>(GetParam());
}
const std::string& Type() override { return std::get<2>(GetParam()); }
bool sec_cache_is_compressed_;
bool enable_custom_split_merge_;
};
@ -982,20 +961,112 @@ TEST_P(CompressedSecCacheTestWithCompressAndSplitParam, BasicIntegrationTest) {
INSTANTIATE_TEST_CASE_P(CompressedSecCacheTests,
CompressedSecCacheTestWithCompressAndSplitParam,
::testing::Combine(testing::Bool(), testing::Bool()));
::testing::Combine(testing::Bool(), testing::Bool(),
GetTestingCacheTypes()));
TEST_F(CompressedSecondaryCacheTest, SplitValueIntoChunksTest) {
TEST_P(CompressedSecondaryCacheTest, SplitValueIntoChunksTest) {
SplitValueIntoChunksTest();
}
TEST_F(CompressedSecondaryCacheTest, MergeChunksIntoValueTest) {
TEST_P(CompressedSecondaryCacheTest, MergeChunksIntoValueTest) {
MergeChunksIntoValueTest();
}
TEST_F(CompressedSecondaryCacheTest, SplictValueAndMergeChunksTest) {
TEST_P(CompressedSecondaryCacheTest, SplictValueAndMergeChunksTest) {
SplictValueAndMergeChunksTest();
}
class CompressedSecCacheTestWithTiered : public ::testing::Test {
public:
CompressedSecCacheTestWithTiered() {
LRUCacheOptions lru_opts;
TieredVolatileCacheOptions opts;
lru_opts.capacity = 70 << 20;
opts.cache_opts = &lru_opts;
opts.cache_type = PrimaryCacheType::kCacheTypeLRU;
opts.comp_cache_opts.capacity = 30 << 20;
cache_ = NewTieredVolatileCache(opts);
cache_res_mgr_ =
std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
cache_);
}
protected:
CacheReservationManager* cache_res_mgr() { return cache_res_mgr_.get(); }
Cache* GetCache() {
return static_cast_with_check<CacheWithSecondaryAdapter, Cache>(
cache_.get())
->TEST_GetCache();
}
SecondaryCache* GetSecondaryCache() {
return static_cast_with_check<CacheWithSecondaryAdapter, Cache>(
cache_.get())
->TEST_GetSecondaryCache();
}
size_t GetPercent(size_t val, unsigned int percent) {
return static_cast<size_t>(val * percent / 100);
}
private:
std::shared_ptr<Cache> cache_;
std::shared_ptr<CacheReservationManager> cache_res_mgr_;
};
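The fixture above builds the whole tier stack in one call: a 70MB LRU primary and a 30MB compressed secondary, glued together by NewTieredVolatileCache, with a CacheReservationManager layered on top. Isolated, the construction looks like this (sketch, same values as the fixture):
LRUCacheOptions lru_opts;
lru_opts.capacity = 70 << 20;              // primary tier
TieredVolatileCacheOptions opts;
opts.cache_opts = &lru_opts;
opts.cache_type = PrimaryCacheType::kCacheTypeLRU;
opts.comp_cache_opts.capacity = 30 << 20;  // compressed secondary tier
std::shared_ptr<Cache> cache = NewTieredVolatileCache(opts);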
bool CacheUsageWithinBounds(size_t val1, size_t val2, size_t error) {
return ((val1 < (val2 + error)) && (val1 > (val2 - error)));
}
TEST_F(CompressedSecCacheTestWithTiered, CacheReservationManager) {
CompressedSecondaryCache* sec_cache =
reinterpret_cast<CompressedSecondaryCache*>(GetSecondaryCache());
// Use EXPECT_PRED3 instead of EXPECT_NEAR to avoid too many size_t to
// double explicit casts
EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (30 << 20),
GetPercent(30 << 20, 1));
EXPECT_EQ(sec_cache->TEST_GetUsage(), 0);
ASSERT_OK(cache_res_mgr()->UpdateCacheReservation(10 << 20));
EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (37 << 20),
GetPercent(37 << 20, 1));
EXPECT_PRED3(CacheUsageWithinBounds, sec_cache->TEST_GetUsage(), (3 << 20),
GetPercent(3 << 20, 1));
ASSERT_OK(cache_res_mgr()->UpdateCacheReservation(0));
EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (30 << 20),
GetPercent(30 << 20, 1));
EXPECT_EQ(sec_cache->TEST_GetUsage(), 0);
}
TEST_F(CompressedSecCacheTestWithTiered,
CacheReservationManagerMultipleUpdate) {
CompressedSecondaryCache* sec_cache =
reinterpret_cast<CompressedSecondaryCache*>(GetSecondaryCache());
EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (30 << 20),
GetPercent(30 << 20, 1));
EXPECT_EQ(sec_cache->TEST_GetUsage(), 0);
int i;
for (i = 0; i < 10; ++i) {
ASSERT_OK(cache_res_mgr()->UpdateCacheReservation((1 + i) << 20));
}
EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (37 << 20),
GetPercent(37 << 20, 1));
EXPECT_PRED3(CacheUsageWithinBounds, sec_cache->TEST_GetUsage(), (3 << 20),
GetPercent(3 << 20, 1));
for (i = 10; i > 0; --i) {
ASSERT_OK(cache_res_mgr()->UpdateCacheReservation(((i - 1) << 20)));
}
EXPECT_PRED3(CacheUsageWithinBounds, GetCache()->GetUsage(), (30 << 20),
GetPercent(30 << 20, 1));
EXPECT_EQ(sec_cache->TEST_GetUsage(), 0);
}
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {

cache/lru_cache.cc (510 changed lines)

@ -14,28 +14,29 @@
#include <cstdio>
#include <cstdlib>
#include "cache/secondary_cache_adapter.h"
#include "monitoring/perf_context_imp.h"
#include "monitoring/statistics.h"
#include "monitoring/statistics_impl.h"
#include "port/lang.h"
#include "util/distributed_mutex.h"
namespace ROCKSDB_NAMESPACE {
namespace lru_cache {
// A distinct pointer value for marking "dummy" cache entries
void* const kDummyValueMarker = const_cast<char*>("kDummyValueMarker");
LRUHandleTable::LRUHandleTable(int max_upper_hash_bits)
LRUHandleTable::LRUHandleTable(int max_upper_hash_bits,
MemoryAllocator* allocator)
: length_bits_(/* historical starting size*/ 4),
list_(new LRUHandle* [size_t{1} << length_bits_] {}),
elems_(0),
max_length_bits_(max_upper_hash_bits) {}
max_length_bits_(max_upper_hash_bits),
allocator_(allocator) {}
LRUHandleTable::~LRUHandleTable() {
auto alloc = allocator_;
ApplyToEntriesRange(
[](LRUHandle* h) {
[alloc](LRUHandle* h) {
if (!h->HasRefs()) {
h->Free();
h->Free(alloc);
}
},
0, size_t{1} << length_bits_);
@ -95,7 +96,7 @@ void LRUHandleTable::Resize() {
std::unique_ptr<LRUHandle* []> new_list {
new LRUHandle* [size_t{1} << new_length_bits] {}
};
uint32_t count = 0;
[[maybe_unused]] uint32_t count = 0;
for (uint32_t i = 0; i < old_length; i++) {
LRUHandle* h = list_[i];
while (h != nullptr) {
@ -118,7 +119,8 @@ LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit,
double low_pri_pool_ratio, bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy,
int max_upper_hash_bits,
SecondaryCache* secondary_cache)
MemoryAllocator* allocator,
const Cache::EvictionCallback* eviction_callback)
: CacheShardBase(metadata_charge_policy),
capacity_(0),
high_pri_pool_usage_(0),
@ -128,11 +130,11 @@ LRUCacheShard::LRUCacheShard(size_t capacity, bool strict_capacity_limit,
high_pri_pool_capacity_(0),
low_pri_pool_ratio_(low_pri_pool_ratio),
low_pri_pool_capacity_(0),
table_(max_upper_hash_bits),
table_(max_upper_hash_bits, allocator),
usage_(0),
lru_usage_(0),
mutex_(use_adaptive_mutex),
secondary_cache_(secondary_cache) {
eviction_callback_(*eviction_callback) {
// Make empty circular linked list.
lru_.next = &lru_;
lru_.prev = &lru_;
@ -159,13 +161,14 @@ void LRUCacheShard::EraseUnRefEntries() {
}
for (auto entry : last_reference_list) {
entry->Free();
entry->Free(table_.GetAllocator());
}
}
void LRUCacheShard::ApplyToSomeEntries(
const std::function<void(const Slice& key, void* value, size_t charge,
DeleterFn deleter)>& callback,
const std::function<void(const Slice& key, Cache::ObjectPtr value,
size_t charge,
const Cache::CacheItemHelper* helper)>& callback,
size_t average_entries_per_lock, size_t* state) {
// The state is essentially going to be the starting hash, which works
// nicely even if we resize between calls because we use upper-most
@ -192,11 +195,8 @@ void LRUCacheShard::ApplyToSomeEntries(
table_.ApplyToEntriesRange(
[callback,
metadata_charge_policy = metadata_charge_policy_](LRUHandle* h) {
DeleterFn deleter = h->IsSecondaryCacheCompatible()
? h->info_.helper->del_cb
: h->info_.deleter;
callback(h->key(), h->value, h->GetCharge(metadata_charge_policy),
deleter);
h->helper);
},
index_begin, index_end);
}
@ -334,16 +334,19 @@ void LRUCacheShard::EvictFromLRU(size_t charge,
}
}
void LRUCacheShard::TryInsertIntoSecondaryCache(
autovector<LRUHandle*> evicted_handles) {
for (auto entry : evicted_handles) {
if (secondary_cache_ && entry->IsSecondaryCacheCompatible() &&
!entry->IsInSecondaryCache()) {
secondary_cache_->Insert(entry->key(), entry->value, entry->info_.helper)
.PermitUncheckedError();
void LRUCacheShard::NotifyEvicted(
const autovector<LRUHandle*>& evicted_handles) {
MemoryAllocator* alloc = table_.GetAllocator();
for (LRUHandle* entry : evicted_handles) {
if (eviction_callback_ &&
eviction_callback_(entry->key(),
reinterpret_cast<Cache::Handle*>(entry))) {
// Callback took ownership of obj; just free handle
free(entry);
} else {
// Free the entries here outside of mutex for performance reasons.
entry->Free(alloc);
}
// Free the entries here outside of mutex for performance reasons.
entry->Free();
}
}
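NotifyEvicted is the generic replacement for TryInsertIntoSecondaryCache: the shard no longer talks to a secondary cache directly but invokes an installed eviction callback, and a true return means the callback took ownership of the object, so only the handle memory is freed. A hypothetical callback, assuming Cache::EvictionCallback is std::function-compatible and TryDemoteToSecondary is a caller-supplied helper:
Cache::EvictionCallback demote_cb =
    [](const Slice& key, Cache::Handle* h) -> bool {
      // Forward the evicted entry to a lower tier; returning true tells the
      // shard that ownership of the object was taken.
      return TryDemoteToSecondary(key, h);  // assumed helper
    };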
@ -357,7 +360,7 @@ void LRUCacheShard::SetCapacity(size_t capacity) {
EvictFromLRU(0, &last_reference_list);
}
TryInsertIntoSecondaryCache(last_reference_list);
NotifyEvicted(last_reference_list);
}
void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
@ -365,8 +368,7 @@ void LRUCacheShard::SetStrictCapacityLimit(bool strict_capacity_limit) {
strict_capacity_limit_ = strict_capacity_limit;
}
Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle,
bool free_handle_on_fail) {
Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle) {
Status s = Status::OK();
autovector<LRUHandle*> last_reference_list;
@ -385,10 +387,9 @@ Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle,
// into cache and get evicted immediately.
last_reference_list.push_back(e);
} else {
if (free_handle_on_fail) {
free(e);
*handle = nullptr;
}
free(e);
e = nullptr;
*handle = nullptr;
s = Status::MemoryLimit("Insert failed due to LRU cache being full.");
}
} else {
@ -420,192 +421,27 @@ Status LRUCacheShard::InsertItem(LRUHandle* e, LRUHandle** handle,
}
}
TryInsertIntoSecondaryCache(last_reference_list);
NotifyEvicted(last_reference_list);
return s;
}
void LRUCacheShard::Promote(LRUHandle* e) {
SecondaryCacheResultHandle* secondary_handle = e->sec_handle;
assert(secondary_handle->IsReady());
// e is not thread-shared here; OK to modify "immutable" fields as well as
// "mutable" (normally requiring mutex)
e->SetIsPending(false);
e->value = secondary_handle->Value();
assert(e->total_charge == 0);
size_t value_size = secondary_handle->Size();
delete secondary_handle;
if (e->value) {
e->CalcTotalCharge(value_size, metadata_charge_policy_);
Status s;
if (e->IsStandalone()) {
assert(secondary_cache_ && secondary_cache_->SupportForceErase());
// Insert a dummy handle and return a standalone handle to caller.
// Charge the standalone handle.
autovector<LRUHandle*> last_reference_list;
bool free_standalone_handle{false};
{
DMutexLock l(mutex_);
// Free the space following strict LRU policy until enough space
// is freed or the lru list is empty.
EvictFromLRU(e->total_charge, &last_reference_list);
if ((usage_ + e->total_charge) > capacity_ && strict_capacity_limit_) {
free_standalone_handle = true;
} else {
usage_ += e->total_charge;
}
}
TryInsertIntoSecondaryCache(last_reference_list);
if (free_standalone_handle) {
e->Unref();
e->Free();
e = nullptr;
} else {
PERF_COUNTER_ADD(block_cache_standalone_handle_count, 1);
}
// Insert a dummy handle into the primary cache. This dummy handle is
// not IsSecondaryCacheCompatible().
// FIXME? This should not overwrite an existing non-dummy entry in the
// rare case that one exists
Cache::Priority priority =
e->IsHighPri() ? Cache::Priority::HIGH : Cache::Priority::LOW;
s = Insert(e->key(), e->hash, kDummyValueMarker, /*charge=*/0,
/*deleter=*/nullptr, /*helper=*/nullptr, /*handle=*/nullptr,
priority);
} else {
e->SetInCache(true);
LRUHandle* handle = e;
// This InsertItem() could fail if the cache is over capacity and
// strict_capacity_limit_ is true. In such a case, we don't want
// InsertItem() to free the handle, since the item is already in memory
// and the caller will most likely just read it from disk if we erase it
// here.
s = InsertItem(e, &handle, /*free_handle_on_fail=*/false);
if (s.ok()) {
PERF_COUNTER_ADD(block_cache_real_handle_count, 1);
}
}
if (!s.ok()) {
// Item is in memory, but not accounted against the cache capacity.
// When the handle is released, the item should get deleted.
assert(!e->InCache());
}
} else {
// Secondary cache lookup failed. The caller will take care of detecting
// this and eventually releasing e.
assert(!e->value);
assert(!e->InCache());
}
}
LRUHandle* LRUCacheShard::Lookup(const Slice& key, uint32_t hash,
const Cache::CacheItemHelper* helper,
const Cache::CreateCallback& create_cb,
Cache::Priority priority, bool wait,
Statistics* stats) {
LRUHandle* e = nullptr;
bool found_dummy_entry{false};
{
DMutexLock l(mutex_);
e = table_.Lookup(key, hash);
if (e != nullptr) {
assert(e->InCache());
if (e->value == kDummyValueMarker) {
// For a dummy handle, if it was retrieved from secondary cache,
// it may still exist in secondary cache.
// If the handle exists in secondary cache, the value should be
// erased from sec cache and be inserted into primary cache.
found_dummy_entry = true;
// Let the dummy entry be overwritten
e = nullptr;
} else {
if (!e->HasRefs()) {
// The entry is in LRU since it's in hash and has no external
// references.
LRU_Remove(e);
}
e->Ref();
e->SetHit();
}
}
}
// If the handle table lookup failed or the handle is a dummy one, allocate
// a handle outside the mutex if we're going to look up in the secondary cache.
//
// When a block is first looked up from CompressedSecondaryCache, we just
// insert a dummy block into the primary cache (charging the actual size of
// the block) and don't erase the block from CompressedSecondaryCache. A
// standalone handle is returned to the caller. Only if the block is hit
// again do we erase it from CompressedSecondaryCache and add it to the
// primary cache.
if (!e && secondary_cache_ && helper && helper->saveto_cb) {
// For objects from the secondary cache, we expect the caller to provide
// a way to create/delete the primary cache object. The only case where
// a deleter would not be required is for dummy entries inserted for
// accounting purposes, which we won't demote to the secondary cache
// anyway.
assert(create_cb && helper->del_cb);
bool is_in_sec_cache{false};
std::unique_ptr<SecondaryCacheResultHandle> secondary_handle =
secondary_cache_->Lookup(key, create_cb, wait, found_dummy_entry,
is_in_sec_cache);
if (secondary_handle != nullptr) {
e = static_cast<LRUHandle*>(malloc(sizeof(LRUHandle) - 1 + key.size()));
e->m_flags = 0;
e->im_flags = 0;
e->SetSecondaryCacheCompatible(true);
e->info_.helper = helper;
e->key_length = key.size();
e->hash = hash;
e->refs = 0;
e->next = e->prev = nullptr;
e->SetPriority(priority);
memcpy(e->key_data, key.data(), key.size());
e->value = nullptr;
e->sec_handle = secondary_handle.release();
e->total_charge = 0;
e->Ref();
e->SetIsInSecondaryCache(is_in_sec_cache);
e->SetIsStandalone(secondary_cache_->SupportForceErase() &&
!found_dummy_entry);
if (wait) {
Promote(e);
if (e) {
if (!e->value) {
// The secondary cache returned a handle, but the lookup failed.
e->Unref();
e->Free();
e = nullptr;
} else {
PERF_COUNTER_ADD(secondary_cache_hit_count, 1);
RecordTick(stats, SECONDARY_CACHE_HITS);
}
}
} else {
// If wait is false, we always return a handle and let the caller
// release the handle after checking for success or failure.
e->SetIsPending(true);
// This may be slightly inaccurate, if the lookup eventually fails.
// But the probability is very low.
PERF_COUNTER_ADD(secondary_cache_hit_count, 1);
RecordTick(stats, SECONDARY_CACHE_HITS);
}
} else {
// Caller will most likely overwrite the dummy entry with an Insert
// after this Lookup fails
assert(e == nullptr);
const Cache::CacheItemHelper* /*helper*/,
Cache::CreateContext* /*create_context*/,
Cache::Priority /*priority*/,
Statistics* /*stats*/) {
DMutexLock l(mutex_);
LRUHandle* e = table_.Lookup(key, hash);
if (e != nullptr) {
assert(e->InCache());
if (!e->HasRefs()) {
// The entry is in LRU since it's in hash and has no external
// references.
LRU_Remove(e);
}
e->Ref();
e->SetHit();
}
return e;
}
@ -614,8 +450,6 @@ bool LRUCacheShard::Ref(LRUHandle* e) {
DMutexLock l(mutex_);
// To create another reference - entry must be already externally referenced.
assert(e->HasRefs());
// Pending handles are not for sharing
assert(!e->IsPending());
e->Ref();
return true;
}
@ -639,14 +473,13 @@ bool LRUCacheShard::Release(LRUHandle* e, bool /*useful*/,
if (e == nullptr) {
return false;
}
bool last_reference = false;
// Must Wait or WaitAll first on pending handles. Otherwise, would leak
// a secondary cache handle.
assert(!e->IsPending());
bool must_free;
bool was_in_cache;
{
DMutexLock l(mutex_);
last_reference = e->Unref();
if (last_reference && e->InCache()) {
must_free = e->Unref();
was_in_cache = e->InCache();
if (must_free && was_in_cache) {
// The item is still in cache, and nobody else holds a reference to it.
if (usage_ > capacity_ || erase_if_last_ref) {
// The LRU list must be empty since the cache is full.
@ -657,28 +490,39 @@ bool LRUCacheShard::Release(LRUHandle* e, bool /*useful*/,
} else {
// Put the item back on the LRU list, and don't free it.
LRU_Insert(e);
last_reference = false;
must_free = false;
}
}
// If it was the last reference, then decrement the cache usage.
if (last_reference) {
// If about to be freed, then decrement the cache usage.
if (must_free) {
assert(usage_ >= e->total_charge);
usage_ -= e->total_charge;
}
}
// Free the entry here outside of mutex for performance reasons.
if (last_reference) {
e->Free();
if (must_free) {
// Only call eviction callback if we're sure no one requested erasure
// FIXME: disabled because of test churn
if (false && was_in_cache && !erase_if_last_ref && eviction_callback_ &&
eviction_callback_(e->key(), reinterpret_cast<Cache::Handle*>(e))) {
// Callback took ownership of obj; just free handle
free(e);
} else {
e->Free(table_.GetAllocator());
}
}
return last_reference;
return must_free;
}
Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
size_t charge,
void (*deleter)(const Slice& key, void* value),
const Cache::CacheItemHelper* helper,
LRUHandle** handle, Cache::Priority priority) {
LRUHandle* LRUCacheShard::CreateHandle(const Slice& key, uint32_t hash,
Cache::ObjectPtr value,
const Cache::CacheItemHelper* helper,
size_t charge) {
assert(helper);
// value == nullptr is reserved for indicating failure in SecondaryCache
assert(!(helper->IsSecondaryCacheCompatible() && value == nullptr));
// Allocate the memory here outside of the mutex.
// If the cache is full, we'll have to release it.
// It shouldn't happen very often though.
@ -688,27 +532,58 @@ Status LRUCacheShard::Insert(const Slice& key, uint32_t hash, void* value,
e->value = value;
e->m_flags = 0;
e->im_flags = 0;
if (helper) {
// Use only one of the two parameters
assert(deleter == nullptr);
// value == nullptr is reserved for indicating failure for when secondary
// cache compatible
assert(value != nullptr);
e->SetSecondaryCacheCompatible(true);
e->info_.helper = helper;
} else {
e->info_.deleter = deleter;
}
e->helper = helper;
e->key_length = key.size();
e->hash = hash;
e->refs = 0;
e->next = e->prev = nullptr;
e->SetInCache(true);
e->SetPriority(priority);
memcpy(e->key_data, key.data(), key.size());
e->CalcTotalCharge(charge, metadata_charge_policy_);
return InsertItem(e, handle, /* free_handle_on_fail */ true);
return e;
}
Status LRUCacheShard::Insert(const Slice& key, uint32_t hash,
Cache::ObjectPtr value,
const Cache::CacheItemHelper* helper,
size_t charge, LRUHandle** handle,
Cache::Priority priority) {
LRUHandle* e = CreateHandle(key, hash, value, helper, charge);
e->SetPriority(priority);
e->SetInCache(true);
return InsertItem(e, handle);
}
LRUHandle* LRUCacheShard::CreateStandalone(const Slice& key, uint32_t hash,
Cache::ObjectPtr value,
const Cache::CacheItemHelper* helper,
size_t charge,
bool allow_uncharged) {
LRUHandle* e = CreateHandle(key, hash, value, helper, charge);
e->SetIsStandalone(true);
e->Ref();
autovector<LRUHandle*> last_reference_list;
{
DMutexLock l(mutex_);
EvictFromLRU(e->total_charge, &last_reference_list);
if (strict_capacity_limit_ && (usage_ + e->total_charge) > capacity_) {
if (allow_uncharged) {
e->total_charge = 0;
} else {
free(e);
e = nullptr;
}
} else {
usage_ += e->total_charge;
}
}
NotifyEvicted(last_reference_list);
return e;
}
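CreateStandalone charges the shard for a handle that is never published in the hash table; under strict_capacity_limit a full shard either hands the handle back uncharged (allow_uncharged) or fails. A sketch of the assumed caller side (e.g. the secondary-cache adapter):
LRUHandle* h = shard.CreateStandalone(key, hash, obj, helper, charge,
                                      /*allow_uncharged=*/true);
if (h != nullptr) {
  // ... consume h->value, then release like any other handle ...
  shard.Release(h, /*useful=*/true, /*erase_if_last_ref=*/false);
}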
void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {
@ -733,18 +608,8 @@ void LRUCacheShard::Erase(const Slice& key, uint32_t hash) {
// Free the entry here outside of mutex for performance reasons.
// last_reference will only be true if e != nullptr.
if (last_reference) {
e->Free();
}
}
bool LRUCacheShard::IsReady(LRUHandle* e) {
bool ready = true;
if (e->IsPending()) {
assert(secondary_cache_);
assert(e->sec_handle);
ready = e->sec_handle->IsReady();
e->Free(table_.GetAllocator());
}
return ready;
}
size_t LRUCacheShard::GetUsage() const {
@ -781,30 +646,20 @@ void LRUCacheShard::AppendPrintableOptions(std::string& str) const {
str.append(buffer);
}
LRUCache::LRUCache(size_t capacity, int num_shard_bits,
bool strict_capacity_limit, double high_pri_pool_ratio,
double low_pri_pool_ratio,
std::shared_ptr<MemoryAllocator> allocator,
bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy,
std::shared_ptr<SecondaryCache> _secondary_cache)
: ShardedCache(capacity, num_shard_bits, strict_capacity_limit,
std::move(allocator)),
secondary_cache_(std::move(_secondary_cache)) {
LRUCache::LRUCache(const LRUCacheOptions& opts) : ShardedCache(opts) {
size_t per_shard = GetPerShardCapacity();
SecondaryCache* secondary_cache = secondary_cache_.get();
InitShards([=](LRUCacheShard* cs) {
new (cs) LRUCacheShard(
per_shard, strict_capacity_limit, high_pri_pool_ratio,
low_pri_pool_ratio, use_adaptive_mutex, metadata_charge_policy,
/* max_upper_hash_bits */ 32 - num_shard_bits, secondary_cache);
MemoryAllocator* alloc = memory_allocator();
InitShards([&](LRUCacheShard* cs) {
new (cs) LRUCacheShard(per_shard, opts.strict_capacity_limit,
opts.high_pri_pool_ratio, opts.low_pri_pool_ratio,
opts.use_adaptive_mutex, opts.metadata_charge_policy,
/* max_upper_hash_bits */ 32 - opts.num_shard_bits,
alloc, &eviction_callback_);
});
}
void* LRUCache::Value(Handle* handle) {
Cache::ObjectPtr LRUCache::Value(Handle* handle) {
auto h = reinterpret_cast<const LRUHandle*>(handle);
assert(!h->IsPending() || h->value == nullptr);
assert(h->value != kDummyValueMarker);
return h->value;
}
@ -813,13 +668,10 @@ size_t LRUCache::GetCharge(Handle* handle) const {
GetShard(0).metadata_charge_policy_);
}
Cache::DeleterFn LRUCache::GetDeleter(Handle* handle) const {
const Cache::CacheItemHelper* LRUCache::GetCacheItemHelper(
Handle* handle) const {
auto h = reinterpret_cast<const LRUHandle*>(handle);
if (h->IsSecondaryCacheCompatible()) {
return h->info_.helper->del_cb;
} else {
return h->info_.deleter;
}
return h->helper;
}
size_t LRUCache::TEST_GetLRUSize() {
@ -830,51 +682,9 @@ double LRUCache::GetHighPriPoolRatio() {
return GetShard(0).GetHighPriPoolRatio();
}
void LRUCache::WaitAll(std::vector<Handle*>& handles) {
if (secondary_cache_) {
std::vector<SecondaryCacheResultHandle*> sec_handles;
sec_handles.reserve(handles.size());
for (Handle* handle : handles) {
if (!handle) {
continue;
}
LRUHandle* lru_handle = reinterpret_cast<LRUHandle*>(handle);
if (!lru_handle->IsPending()) {
continue;
}
sec_handles.emplace_back(lru_handle->sec_handle);
}
secondary_cache_->WaitAll(sec_handles);
for (Handle* handle : handles) {
if (!handle) {
continue;
}
LRUHandle* lru_handle = reinterpret_cast<LRUHandle*>(handle);
if (!lru_handle->IsPending()) {
continue;
}
GetShard(lru_handle->hash).Promote(lru_handle);
}
}
}
void LRUCache::AppendPrintableOptions(std::string& str) const {
ShardedCache::AppendPrintableOptions(str); // options from shard
if (secondary_cache_) {
str.append(" secondary_cache:\n");
str.append(secondary_cache_->GetPrintableOptions());
}
}
} // namespace lru_cache
std::shared_ptr<Cache> NewLRUCache(
size_t capacity, int num_shard_bits, bool strict_capacity_limit,
double high_pri_pool_ratio,
std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy,
const std::shared_ptr<SecondaryCache>& secondary_cache,
double low_pri_pool_ratio) {
std::shared_ptr<Cache> LRUCacheOptions::MakeSharedCache() const {
if (num_shard_bits >= 20) {
return nullptr; // The cache cannot be sharded into too many fine pieces.
}
@ -890,32 +700,24 @@ std::shared_ptr<Cache> NewLRUCache(
// Invalid high_pri_pool_ratio and low_pri_pool_ratio combination
return nullptr;
}
if (num_shard_bits < 0) {
num_shard_bits = GetDefaultCacheShardBits(capacity);
}
return std::make_shared<LRUCache>(
capacity, num_shard_bits, strict_capacity_limit, high_pri_pool_ratio,
low_pri_pool_ratio, std::move(memory_allocator), use_adaptive_mutex,
metadata_charge_policy, secondary_cache);
}
std::shared_ptr<Cache> NewLRUCache(const LRUCacheOptions& cache_opts) {
return NewLRUCache(cache_opts.capacity, cache_opts.num_shard_bits,
cache_opts.strict_capacity_limit,
cache_opts.high_pri_pool_ratio,
cache_opts.memory_allocator, cache_opts.use_adaptive_mutex,
cache_opts.metadata_charge_policy,
cache_opts.secondary_cache, cache_opts.low_pri_pool_ratio);
}
std::shared_ptr<Cache> NewLRUCache(
size_t capacity, int num_shard_bits, bool strict_capacity_limit,
double high_pri_pool_ratio,
std::shared_ptr<MemoryAllocator> memory_allocator, bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy,
double low_pri_pool_ratio) {
return NewLRUCache(capacity, num_shard_bits, strict_capacity_limit,
high_pri_pool_ratio, memory_allocator, use_adaptive_mutex,
metadata_charge_policy, nullptr, low_pri_pool_ratio);
// For sanitized options
LRUCacheOptions opts = *this;
if (opts.num_shard_bits < 0) {
opts.num_shard_bits = GetDefaultCacheShardBits(capacity);
}
std::shared_ptr<Cache> cache = std::make_shared<LRUCache>(opts);
if (secondary_cache) {
cache = std::make_shared<CacheWithSecondaryAdapter>(cache, secondary_cache);
}
return cache;
}
std::shared_ptr<RowCache> LRUCacheOptions::MakeSharedRowCache() const {
if (secondary_cache) {
// Not allowed for a RowCache
return nullptr;
}
// Works while RowCache is an alias for Cache
return MakeSharedCache();
}
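// A minimal usage sketch of the options-based construction path above.
// Illustrative only: ExampleMakeCache is a hypothetical helper, and
// NewCompressedSecondaryCache() / CompressedSecondaryCacheOptions are
// assumed from RocksDB's public secondary cache API.
std::shared_ptr<Cache> ExampleMakeCache() {
  LRUCacheOptions opts;
  opts.capacity = 64 << 20;  // 64 MiB
  opts.num_shard_bits = -1;  // sanitized via GetDefaultCacheShardBits()
  // Optional: attach a compressed secondary cache; MakeSharedCache() then
  // transparently wraps the LRUCache in a CacheWithSecondaryAdapter.
  opts.secondary_cache =
      NewCompressedSecondaryCache(CompressedSecondaryCacheOptions());
  return opts.MakeSharedCache();
}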
} // namespace ROCKSDB_NAMESPACE

cache/lru_cache.h

@@ -13,9 +13,9 @@
#include "cache/sharded_cache.h"
#include "port/lang.h"
#include "port/likely.h"
#include "port/malloc.h"
#include "port/port.h"
#include "rocksdb/secondary_cache.h"
#include "util/autovector.h"
#include "util/distributed_mutex.h"
@@ -48,19 +48,9 @@ namespace lru_cache {
// While refs > 0, public properties like value and deleter must not change.
struct LRUHandle {
void* value;
union Info {
Info() {}
~Info() {}
Cache::DeleterFn deleter;
const Cache::CacheItemHelper* helper;
} info_;
// An entry is not added to the LRUHandleTable until the secondary cache
// lookup is complete, so it's safe to have this union.
union {
LRUHandle* next_hash;
SecondaryCacheResultHandle* sec_handle;
};
Cache::ObjectPtr value;
const Cache::CacheItemHelper* helper;
LRUHandle* next_hash;
LRUHandle* next;
LRUHandle* prev;
size_t total_charge; // TODO(opt): Only allow uint32_t?
@@ -93,14 +83,8 @@ struct LRUHandle {
IM_IS_HIGH_PRI = (1 << 0),
// Whether this entry is low priority entry.
IM_IS_LOW_PRI = (1 << 1),
// Can this be inserted into the secondary cache.
IM_IS_SECONDARY_CACHE_COMPATIBLE = (1 << 2),
// Is the handle still being read from a lower tier.
IM_IS_PENDING = (1 << 3),
// Whether this handle is still in a lower tier
IM_IS_IN_SECONDARY_CACHE = (1 << 4),
// Marks result handles that should not be inserted into cache
IM_IS_STANDALONE = (1 << 5),
IM_IS_STANDALONE = (1 << 2),
};
// Beginning of the key (MUST BE THE LAST FIELD IN THIS STRUCT!)
@@ -130,13 +114,6 @@ struct LRUHandle {
bool IsLowPri() const { return im_flags & IM_IS_LOW_PRI; }
bool InLowPriPool() const { return m_flags & M_IN_LOW_PRI_POOL; }
bool HasHit() const { return m_flags & M_HAS_HIT; }
bool IsSecondaryCacheCompatible() const {
return im_flags & IM_IS_SECONDARY_CACHE_COMPATIBLE;
}
bool IsPending() const { return im_flags & IM_IS_PENDING; }
bool IsInSecondaryCache() const {
return im_flags & IM_IS_IN_SECONDARY_CACHE;
}
bool IsStandalone() const { return im_flags & IM_IS_STANDALONE; }
void SetInCache(bool in_cache) {
@@ -178,30 +155,6 @@ struct LRUHandle {
void SetHit() { m_flags |= M_HAS_HIT; }
void SetSecondaryCacheCompatible(bool compat) {
if (compat) {
im_flags |= IM_IS_SECONDARY_CACHE_COMPATIBLE;
} else {
im_flags &= ~IM_IS_SECONDARY_CACHE_COMPATIBLE;
}
}
void SetIsPending(bool pending) {
if (pending) {
im_flags |= IM_IS_PENDING;
} else {
im_flags &= ~IM_IS_PENDING;
}
}
void SetIsInSecondaryCache(bool is_in_secondary_cache) {
if (is_in_secondary_cache) {
im_flags |= IM_IS_IN_SECONDARY_CACHE;
} else {
im_flags &= ~IM_IS_IN_SECONDARY_CACHE;
}
}
void SetIsStandalone(bool is_standalone) {
if (is_standalone) {
im_flags |= IM_IS_STANDALONE;
@@ -210,22 +163,11 @@ struct LRUHandle {
}
}
void Free() {
void Free(MemoryAllocator* allocator) {
assert(refs == 0);
if (!IsSecondaryCacheCompatible() && info_.deleter) {
(*info_.deleter)(key(), value);
} else if (IsSecondaryCacheCompatible()) {
if (IsPending()) {
assert(sec_handle != nullptr);
SecondaryCacheResultHandle* tmp_sec_handle = sec_handle;
tmp_sec_handle->Wait();
value = tmp_sec_handle->Value();
delete tmp_sec_handle;
}
if (value) {
(*info_.helper->del_cb)(key(), value);
}
assert(helper);
if (helper->del_cb) {
helper->del_cb(value, allocator);
}
free(this);
@@ -267,7 +209,7 @@ struct LRUHandle {
// 4.4.3's builtin hashtable.
class LRUHandleTable {
public:
explicit LRUHandleTable(int max_upper_hash_bits);
explicit LRUHandleTable(int max_upper_hash_bits, MemoryAllocator* allocator);
~LRUHandleTable();
LRUHandle* Lookup(const Slice& key, uint32_t hash);
@@ -291,6 +233,8 @@ class LRUHandleTable {
size_t GetOccupancyCount() const { return elems_; }
MemoryAllocator* GetAllocator() const { return allocator_; }
private:
// Return a pointer to slot that points to a cache entry that
// matches key/hash. If there is no such cache entry, return a
@@ -312,16 +256,22 @@ class LRUHandleTable {
// Set from max_upper_hash_bits (see constructor).
const int max_length_bits_;
// From Cache, needed for delete
MemoryAllocator* const allocator_;
};
// A single shard of sharded cache.
class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
public:
// NOTE: the eviction_callback ptr is saved, as it is assumed to be kept
// alive in Cache.
LRUCacheShard(size_t capacity, bool strict_capacity_limit,
double high_pri_pool_ratio, double low_pri_pool_ratio,
bool use_adaptive_mutex,
CacheMetadataChargePolicy metadata_charge_policy,
int max_upper_hash_bits, SecondaryCache* secondary_cache);
int max_upper_hash_bits, MemoryAllocator* allocator,
const Cache::EvictionCallback* eviction_callback);
public: // Type definitions expected as parameter to ShardedCache
using HandleImpl = LRUHandle;
@@ -329,8 +279,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
using HashCref = uint32_t;
public: // Function definitions expected as parameter to ShardedCache
static inline HashVal ComputeHash(const Slice& key) {
return Lower32of64(GetSliceNPHash64(key));
static inline HashVal ComputeHash(const Slice& key, uint32_t seed) {
return Lower32of64(GetSliceNPHash64(key, seed));
}
// Separate from constructor so caller can easily make an array of LRUCache
@@ -348,29 +298,21 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
void SetLowPriorityPoolRatio(double low_pri_pool_ratio);
// Like Cache methods, but with an extra "hash" parameter.
inline Status Insert(const Slice& key, uint32_t hash, void* value,
size_t charge, Cache::DeleterFn deleter,
LRUHandle** handle, Cache::Priority priority) {
return Insert(key, hash, value, charge, deleter, nullptr, handle, priority);
}
inline Status Insert(const Slice& key, uint32_t hash, void* value,
const Cache::CacheItemHelper* helper, size_t charge,
LRUHandle** handle, Cache::Priority priority) {
assert(helper);
return Insert(key, hash, value, charge, nullptr, helper, handle, priority);
}
// If helper_cb is null, the values of the following arguments don't matter.
Status Insert(const Slice& key, uint32_t hash, Cache::ObjectPtr value,
const Cache::CacheItemHelper* helper, size_t charge,
LRUHandle** handle, Cache::Priority priority);
LRUHandle* CreateStandalone(const Slice& key, uint32_t hash,
Cache::ObjectPtr obj,
const Cache::CacheItemHelper* helper,
size_t charge, bool allow_uncharged);
LRUHandle* Lookup(const Slice& key, uint32_t hash,
const Cache::CacheItemHelper* helper,
const Cache::CreateCallback& create_cb,
Cache::Priority priority, bool wait, Statistics* stats);
inline LRUHandle* Lookup(const Slice& key, uint32_t hash) {
return Lookup(key, hash, nullptr, nullptr, Cache::Priority::LOW, true,
nullptr);
}
Cache::CreateContext* create_context,
Cache::Priority priority, Statistics* stats);
bool Release(LRUHandle* handle, bool useful, bool erase_if_last_ref);
bool IsReady(LRUHandle* /*handle*/);
void Wait(LRUHandle* /*handle*/) {}
bool Ref(LRUHandle* handle);
void Erase(const Slice& key, uint32_t hash);
@@ -384,8 +326,9 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
size_t GetTableAddressCount() const;
void ApplyToSomeEntries(
const std::function<void(const Slice& key, void* value, size_t charge,
DeleterFn deleter)>& callback,
const std::function<void(const Slice& key, Cache::ObjectPtr value,
size_t charge,
const Cache::CacheItemHelper* helper)>& callback,
size_t average_entries_per_lock, size_t* state);
void EraseUnRefEntries();
@@ -409,23 +352,10 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
private:
friend class LRUCache;
// Insert an item into the hash table and, if handle is null, insert into
// the LRU list. Older items are evicted as necessary. If the cache is full
// and free_handle_on_fail is true, the item is deleted and handle is set to
// nullptr.
Status InsertItem(LRUHandle* item, LRUHandle** handle,
bool free_handle_on_fail);
Status Insert(const Slice& key, uint32_t hash, void* value, size_t charge,
DeleterFn deleter, const Cache::CacheItemHelper* helper,
LRUHandle** handle, Cache::Priority priority);
// Promote an item looked up from the secondary cache to the LRU cache.
// The item may be still in the secondary cache.
// It is only inserted into the hash table and not the LRU list, and only
// if the cache is not at full capacity, as is the case during Insert. The
// caller should hold a reference on the LRUHandle. When the caller releases
// the last reference, the item is added to the LRU list.
// The item is promoted to the high pri or low pri pool as specified by the
// caller in Lookup.
void Promote(LRUHandle* e);
// the LRU list. Older items are evicted as necessary. Frees `item` on
// non-OK status.
Status InsertItem(LRUHandle* item, LRUHandle** handle);
void LRU_Remove(LRUHandle* e);
void LRU_Insert(LRUHandle* e);
@@ -439,8 +369,11 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
// holding the mutex_.
void EvictFromLRU(size_t charge, autovector<LRUHandle*>* deleted);
// Try to insert the evicted handles into the secondary cache.
void TryInsertIntoSecondaryCache(autovector<LRUHandle*> evicted_handles);
void NotifyEvicted(const autovector<LRUHandle*>& evicted_handles);
LRUHandle* CreateHandle(const Slice& key, uint32_t hash,
Cache::ObjectPtr value,
const Cache::CacheItemHelper* helper, size_t charge);
// Initialized before use.
size_t capacity_;
@@ -503,8 +436,8 @@ class ALIGN_AS(CACHE_LINE_SIZE) LRUCacheShard final : public CacheShardBase {
// don't mind mutex_ invoking the non-const actions.
mutable DMutex mutex_;
// Owned by LRUCache
SecondaryCache* secondary_cache_;
// A reference to Cache::eviction_callback_
const Cache::EvictionCallback& eviction_callback_;
};
class LRUCache
@@ -513,28 +446,16 @@
#endif
: public ShardedCache<LRUCacheShard> {
public:
LRUCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
double high_pri_pool_ratio, double low_pri_pool_ratio,
std::shared_ptr<MemoryAllocator> memory_allocator = nullptr,
bool use_adaptive_mutex = kDefaultToAdaptiveMutex,
CacheMetadataChargePolicy metadata_charge_policy =
kDontChargeCacheMetadata,
std::shared_ptr<SecondaryCache> secondary_cache = nullptr);
explicit LRUCache(const LRUCacheOptions& opts);
const char* Name() const override { return "LRUCache"; }
void* Value(Handle* handle) override;
ObjectPtr Value(Handle* handle) override;
size_t GetCharge(Handle* handle) const override;
DeleterFn GetDeleter(Handle* handle) const override;
void WaitAll(std::vector<Handle*>& handles) override;
const CacheItemHelper* GetCacheItemHelper(Handle* handle) const override;
// Retrieves number of elements in LRU, for unit test purpose only.
size_t TEST_GetLRUSize();
// Retrieves high pri pool ratio.
double GetHighPriPoolRatio();
void AppendPrintableOptions(std::string& str) const override;
private:
std::shared_ptr<SecondaryCache> secondary_cache_;
};
} // namespace lru_cache

File diff suppressed because it is too large

@@ -11,20 +11,32 @@ namespace ROCKSDB_NAMESPACE {
namespace {
size_t SliceSize(void* obj) { return static_cast<Slice*>(obj)->size(); }
void NoopDelete(Cache::ObjectPtr, MemoryAllocator*) {}
Status SliceSaveTo(void* from_obj, size_t from_offset, size_t length,
void* out) {
size_t SliceSize(Cache::ObjectPtr obj) {
return static_cast<Slice*>(obj)->size();
}
Status SliceSaveTo(Cache::ObjectPtr from_obj, size_t from_offset, size_t length,
char* out) {
const Slice& slice = *static_cast<Slice*>(from_obj);
std::memcpy(out, slice.data() + from_offset, length);
return Status::OK();
}
Status FailCreate(const Slice&, Cache::CreateContext*, MemoryAllocator*,
Cache::ObjectPtr*, size_t*) {
return Status::NotSupported("Only for dumping data into SecondaryCache");
}
} // namespace
Status SecondaryCache::InsertSaved(const Slice& key, const Slice& saved) {
static Cache::CacheItemHelper helper_no_secondary{CacheEntryRole::kMisc,
&NoopDelete};
static Cache::CacheItemHelper helper{
&SliceSize, &SliceSaveTo, GetNoopDeleterForRole<CacheEntryRole::kMisc>()};
CacheEntryRole::kMisc, &NoopDelete, &SliceSize,
&SliceSaveTo, &FailCreate, &helper_no_secondary};
// NOTE: depends on Insert() being synchronous, not keeping pointer `&saved`
return Insert(key, const_cast<Slice*>(&saved), &helper);
}

@@ -0,0 +1,433 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "cache/secondary_cache_adapter.h"
#include "monitoring/perf_context_imp.h"
#include "util/cast_util.h"
namespace ROCKSDB_NAMESPACE {
namespace {
// A distinct pointer value for marking "dummy" cache entries
struct Dummy {
char val[7] = "kDummy";
};
const Dummy kDummy{};
Cache::ObjectPtr const kDummyObj = const_cast<Dummy*>(&kDummy);
} // namespace
// When CacheWithSecondaryAdapter is constructed with the distribute_cache_res
// parameter set to true, it manages the entire memory budget across the
// primary and secondary cache. The secondary cache is assumed to be in
// memory, such as the CompressedSecondaryCache. When a placeholder entry
// is inserted by a CacheReservationManager instance to reserve memory,
// the CacheWithSecondaryAdapter ensures that the reservation is distributed
// proportionally across the primary/secondary caches.
//
// The primary block cache is initially sized to the sum of the primary cache
// budget + the secondary cache budget, as follows -
// |--------- Primary Cache Configured Capacity -----------|
// |---Secondary Cache Budget----|----Primary Cache Budget-----|
//
// A ConcurrentCacheReservationManager member in the CacheWithSecondaryAdapter,
// pri_cache_res_,
// is used to help with tracking the distribution of memory reservations.
// Initially, it accounts for the entire secondary cache budget as a
// reservation against the primary cache. This shrinks the usable capacity of
// the primary cache to the budget that the user originally desired.
//
// |--Reservation for Sec Cache--|-Pri Cache Usable Capacity---|
//
// When a reservation placeholder is inserted into the adapter, it is inserted
// directly into the primary cache. This means the entire charge of the
// placeholder is counted against the primary cache. To compensate and count
// a portion of it against the secondary cache, the secondary cache Deflate()
// method is called to shrink it. Since Deflate() causes the secondary cache's
// actual usage to shrink, it is reflected here by releasing an equal amount
// from the pri_cache_res_ reservation. The Deflate() in the secondary cache
// can be, but is not required to be, implemented using its own cache
// reservation manager.
//
// For example, if the pri/sec ratio is 70/30, and the combined capacity is
// 100MB, the intermediate and final state after inserting a reservation
// placeholder for 10MB would be as follows -
//
// |-Reservation for Sec Cache-|-Pri Cache Usable Capacity-|---R---|
// 1. After inserting the placeholder in primary
// |------- 30MB -------------|------- 60MB -------------|-10MB--|
// 2. After deflating the secondary and adjusting the reservation for
// secondary against the primary
// |------- 27MB -------------|------- 63MB -------------|-10MB--|
//
// Likewise, when the user inserted placeholder is released, the secondary
// cache Inflate() method is called to grow it, and the pri_cache_res_
// reservation is increased by an equal amount.
//
// Another way of implementing this would have been to simply split the user
// reservation into primary and secondary components. However, this would
// require allocating a structure to track the associated secondary cache
// reservation, which adds some complexity and overhead.
//
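// Worked arithmetic for the 70/30 example above:
//   sec_cache_res_ratio_ = 30MB / 100MB = 0.3
//   placeholder charge   = 10MB
//   sec_charge           = 10MB * 0.3 = 3MB -> secondary_cache_->Deflate(3MB)
//   pri_cache_res_       = 30MB - 3MB = 27MB -> primary usable grows to 63MB
// Releasing the placeholder reverses this: Inflate(3MB) plus a matching 3MB
// increase of pri_cache_res_ restores the original 30MB/70MB split.
//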
CacheWithSecondaryAdapter::CacheWithSecondaryAdapter(
std::shared_ptr<Cache> target,
std::shared_ptr<SecondaryCache> secondary_cache, bool distribute_cache_res)
: CacheWrapper(std::move(target)),
secondary_cache_(std::move(secondary_cache)),
distribute_cache_res_(distribute_cache_res) {
target_->SetEvictionCallback([this](const Slice& key, Handle* handle) {
return EvictionHandler(key, handle);
});
if (distribute_cache_res_) {
size_t sec_capacity = 0;
pri_cache_res_ = std::make_shared<ConcurrentCacheReservationManager>(
std::make_shared<CacheReservationManagerImpl<CacheEntryRole::kMisc>>(
target_));
Status s = secondary_cache_->GetCapacity(sec_capacity);
assert(s.ok());
// Initially, the primary cache is sized to the uncompressed cache budget plus
// compressed secondary cache budget. The secondary cache budget is then
// taken away from the primary cache through cache reservations. Later,
// when a placeholder entry is inserted by the caller, it's inserted
// into the primary cache and the portion that should be assigned to the
// secondary cache is freed from the reservation.
s = pri_cache_res_->UpdateCacheReservation(sec_capacity);
assert(s.ok());
sec_cache_res_ratio_ = (double)sec_capacity / target_->GetCapacity();
}
}
CacheWithSecondaryAdapter::~CacheWithSecondaryAdapter() {
// `*this` will be destroyed before `*target_`, so we have to prevent
// use after free
target_->SetEvictionCallback({});
#ifndef NDEBUG
if (distribute_cache_res_) {
size_t sec_capacity = 0;
Status s = secondary_cache_->GetCapacity(sec_capacity);
assert(s.ok());
assert(pri_cache_res_->GetTotalReservedCacheSize() == sec_capacity);
}
#endif // NDEBUG
}
bool CacheWithSecondaryAdapter::EvictionHandler(const Slice& key,
Handle* handle) {
auto helper = GetCacheItemHelper(handle);
if (helper->IsSecondaryCacheCompatible()) {
auto obj = target_->Value(handle);
// Ignore dummy entry
if (obj != kDummyObj) {
// Spill into secondary cache.
secondary_cache_->Insert(key, obj, helper).PermitUncheckedError();
}
}
// Never takes ownership of obj
return false;
}
bool CacheWithSecondaryAdapter::ProcessDummyResult(Cache::Handle** handle,
bool erase) {
if (*handle && target_->Value(*handle) == kDummyObj) {
target_->Release(*handle, erase);
*handle = nullptr;
return true;
} else {
return false;
}
}
void CacheWithSecondaryAdapter::CleanupCacheObject(
ObjectPtr obj, const CacheItemHelper* helper) {
if (helper->del_cb) {
helper->del_cb(obj, memory_allocator());
}
}
Cache::Handle* CacheWithSecondaryAdapter::Promote(
std::unique_ptr<SecondaryCacheResultHandle>&& secondary_handle,
const Slice& key, const CacheItemHelper* helper, Priority priority,
Statistics* stats, bool found_dummy_entry, bool kept_in_sec_cache) {
assert(secondary_handle->IsReady());
ObjectPtr obj = secondary_handle->Value();
if (!obj) {
// Nothing found.
return nullptr;
}
// Found something.
switch (helper->role) {
case CacheEntryRole::kFilterBlock:
RecordTick(stats, SECONDARY_CACHE_FILTER_HITS);
break;
case CacheEntryRole::kIndexBlock:
RecordTick(stats, SECONDARY_CACHE_INDEX_HITS);
break;
case CacheEntryRole::kDataBlock:
RecordTick(stats, SECONDARY_CACHE_DATA_HITS);
break;
default:
break;
}
PERF_COUNTER_ADD(secondary_cache_hit_count, 1);
RecordTick(stats, SECONDARY_CACHE_HITS);
// Note: SecondaryCache::Size() is really charge (from the CreateCallback)
size_t charge = secondary_handle->Size();
Handle* result = nullptr;
// Insert into primary cache, possibly as standalone+dummy entries.
if (secondary_cache_->SupportForceErase() && !found_dummy_entry) {
// Create standalone and insert dummy
// Allow standalone to be created even if cache is full, to avoid
// reading the entry from storage.
result =
CreateStandalone(key, obj, helper, charge, /*allow_uncharged*/ true);
assert(result);
PERF_COUNTER_ADD(block_cache_standalone_handle_count, 1);
// Insert dummy to record recent use
// TODO: try to avoid case where inserting this dummy could overwrite a
// regular entry
Status s = Insert(key, kDummyObj, &kNoopCacheItemHelper, /*charge=*/0,
/*handle=*/nullptr, priority);
s.PermitUncheckedError();
// Nothing to do or clean up on dummy insertion failure
} else {
// Insert regular entry into primary cache.
// Don't allow it to spill into secondary cache again if it was kept there.
Status s = Insert(
key, obj, kept_in_sec_cache ? helper->without_secondary_compat : helper,
charge, &result, priority);
if (s.ok()) {
assert(result);
PERF_COUNTER_ADD(block_cache_real_handle_count, 1);
} else {
// Create standalone result instead, even if cache is full, to avoid
// reading the entry from storage.
result =
CreateStandalone(key, obj, helper, charge, /*allow_uncharged*/ true);
assert(result);
PERF_COUNTER_ADD(block_cache_standalone_handle_count, 1);
}
}
return result;
}
Status CacheWithSecondaryAdapter::Insert(const Slice& key, ObjectPtr value,
const CacheItemHelper* helper,
size_t charge, Handle** handle,
Priority priority) {
Status s = target_->Insert(key, value, helper, charge, handle, priority);
if (s.ok() && value == nullptr && distribute_cache_res_) {
size_t sec_charge = static_cast<size_t>(charge * (sec_cache_res_ratio_));
s = secondary_cache_->Deflate(sec_charge);
assert(s.ok());
s = pri_cache_res_->UpdateCacheReservation(sec_charge, /*increase=*/false);
assert(s.ok());
}
return s;
}
Cache::Handle* CacheWithSecondaryAdapter::Lookup(const Slice& key,
const CacheItemHelper* helper,
CreateContext* create_context,
Priority priority,
Statistics* stats) {
// NOTE: we could just StartAsyncLookup() and Wait(), but this should be a bit
// more efficient
Handle* result =
target_->Lookup(key, helper, create_context, priority, stats);
bool secondary_compatible = helper && helper->IsSecondaryCacheCompatible();
bool found_dummy_entry =
ProcessDummyResult(&result, /*erase=*/secondary_compatible);
if (!result && secondary_compatible) {
// Try our secondary cache
bool kept_in_sec_cache = false;
std::unique_ptr<SecondaryCacheResultHandle> secondary_handle =
secondary_cache_->Lookup(key, helper, create_context, /*wait*/ true,
found_dummy_entry, /*out*/ kept_in_sec_cache);
if (secondary_handle) {
result = Promote(std::move(secondary_handle), key, helper, priority,
stats, found_dummy_entry, kept_in_sec_cache);
}
}
return result;
}
bool CacheWithSecondaryAdapter::Release(Handle* handle,
bool erase_if_last_ref) {
if (erase_if_last_ref) {
ObjectPtr v = target_->Value(handle);
if (v == nullptr && distribute_cache_res_) {
size_t charge = target_->GetCharge(handle);
size_t sec_charge = static_cast<size_t>(charge * (sec_cache_res_ratio_));
Status s = secondary_cache_->Inflate(sec_charge);
assert(s.ok());
s = pri_cache_res_->UpdateCacheReservation(sec_charge, /*increase=*/true);
assert(s.ok());
}
}
return target_->Release(handle, erase_if_last_ref);
}
Cache::ObjectPtr CacheWithSecondaryAdapter::Value(Handle* handle) {
ObjectPtr v = target_->Value(handle);
// TODO with stacked secondaries: might fail in EvictionHandler
assert(v != kDummyObj);
return v;
}
void CacheWithSecondaryAdapter::StartAsyncLookupOnMySecondary(
AsyncLookupHandle& async_handle) {
assert(!async_handle.IsPending());
assert(async_handle.result_handle == nullptr);
std::unique_ptr<SecondaryCacheResultHandle> secondary_handle =
secondary_cache_->Lookup(async_handle.key, async_handle.helper,
async_handle.create_context, /*wait*/ false,
async_handle.found_dummy_entry,
/*out*/ async_handle.kept_in_sec_cache);
if (secondary_handle) {
// TODO with stacked secondaries: Check & process if already ready?
async_handle.pending_handle = secondary_handle.release();
async_handle.pending_cache = secondary_cache_.get();
}
}
void CacheWithSecondaryAdapter::StartAsyncLookup(
AsyncLookupHandle& async_handle) {
target_->StartAsyncLookup(async_handle);
if (!async_handle.IsPending()) {
bool secondary_compatible =
async_handle.helper &&
async_handle.helper->IsSecondaryCacheCompatible();
async_handle.found_dummy_entry |= ProcessDummyResult(
&async_handle.result_handle, /*erase=*/secondary_compatible);
if (async_handle.Result() == nullptr && secondary_compatible) {
// Not found and not pending on another secondary cache
StartAsyncLookupOnMySecondary(async_handle);
}
}
}
void CacheWithSecondaryAdapter::WaitAll(AsyncLookupHandle* async_handles,
size_t count) {
if (count == 0) {
// Nothing to do
return;
}
// Requests that are pending on *my* secondary cache, at the start of this
// function
std::vector<AsyncLookupHandle*> my_pending;
// Requests that are pending on an "inner" secondary cache (managed somewhere
// under target_), as of the start of this function
std::vector<AsyncLookupHandle*> inner_pending;
// Initial accounting of pending handles, excluding those already handled
// by "outer" secondary caches. (See cur->pending_cache = nullptr.)
for (size_t i = 0; i < count; ++i) {
AsyncLookupHandle* cur = async_handles + i;
if (cur->pending_cache) {
assert(cur->IsPending());
assert(cur->helper);
assert(cur->helper->IsSecondaryCacheCompatible());
if (cur->pending_cache == secondary_cache_.get()) {
my_pending.push_back(cur);
// Mark as "to be handled by this caller"
cur->pending_cache = nullptr;
} else {
// Remember as potentially needing a lookup in my secondary
inner_pending.push_back(cur);
}
}
}
// Wait on inner-most cache lookups first
// TODO with stacked secondaries: because we are not using proper
// async/await constructs here yet, there is a false synchronization point
// here where all the results at one level are needed before initiating
// any lookups at the next level. Probably not a big deal, but worth noting.
if (!inner_pending.empty()) {
target_->WaitAll(async_handles, count);
}
// For those that failed to find something, convert to lookup in my
// secondary cache.
for (AsyncLookupHandle* cur : inner_pending) {
if (cur->Result() == nullptr) {
// Not found, try my secondary
StartAsyncLookupOnMySecondary(*cur);
if (cur->IsPending()) {
assert(cur->pending_cache == secondary_cache_.get());
my_pending.push_back(cur);
// Mark as "to be handled by this caller"
cur->pending_cache = nullptr;
}
}
}
// Wait on all lookups on my secondary cache
{
std::vector<SecondaryCacheResultHandle*> my_secondary_handles;
for (AsyncLookupHandle* cur : my_pending) {
my_secondary_handles.push_back(cur->pending_handle);
}
secondary_cache_->WaitAll(std::move(my_secondary_handles));
}
// Process results
for (AsyncLookupHandle* cur : my_pending) {
std::unique_ptr<SecondaryCacheResultHandle> secondary_handle(
cur->pending_handle);
cur->pending_handle = nullptr;
cur->result_handle = Promote(
std::move(secondary_handle), cur->key, cur->helper, cur->priority,
cur->stats, cur->found_dummy_entry, cur->kept_in_sec_cache);
assert(cur->pending_cache == nullptr);
}
}
std::string CacheWithSecondaryAdapter::GetPrintableOptions() const {
std::string str = target_->GetPrintableOptions();
str.append(" secondary_cache:\n");
str.append(secondary_cache_->GetPrintableOptions());
return str;
}
const char* CacheWithSecondaryAdapter::Name() const {
// To the user, at least for now, this looks like the underlying cache
// configured with a secondary cache, so we pretend to be that cache.
return target_->Name();
}
std::shared_ptr<Cache> NewTieredVolatileCache(
TieredVolatileCacheOptions& opts) {
if (!opts.cache_opts) {
return nullptr;
}
std::shared_ptr<Cache> cache;
if (opts.cache_type == PrimaryCacheType::kCacheTypeLRU) {
LRUCacheOptions cache_opts =
*(static_cast_with_check<LRUCacheOptions, ShardedCacheOptions>(
opts.cache_opts));
cache_opts.capacity += opts.comp_cache_opts.capacity;
cache = cache_opts.MakeSharedCache();
} else if (opts.cache_type == PrimaryCacheType::kCacheTypeHCC) {
HyperClockCacheOptions cache_opts =
*(static_cast_with_check<HyperClockCacheOptions, ShardedCacheOptions>(
opts.cache_opts));
cache = cache_opts.MakeSharedCache();
} else {
return nullptr;
}
std::shared_ptr<SecondaryCache> sec_cache;
sec_cache = NewCompressedSecondaryCache(opts.comp_cache_opts);
return std::make_shared<CacheWithSecondaryAdapter>(cache, sec_cache, true);
}
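// A hypothetical usage sketch of NewTieredVolatileCache() (field and type
// names as they appear above; ExampleTieredCache and the capacities are
// illustrative only):
std::shared_ptr<Cache> ExampleTieredCache() {
  LRUCacheOptions pri_opts;
  pri_opts.capacity = 70 << 20;  // primary (uncompressed) budget
  TieredVolatileCacheOptions opts;
  opts.cache_type = PrimaryCacheType::kCacheTypeLRU;
  opts.cache_opts = &pri_opts;
  opts.comp_cache_opts.capacity = 30 << 20;  // compressed secondary budget
  // The adapter then manages the combined budget, distributing reservations
  // across the two tiers as described at the top of this file.
  return NewTieredVolatileCache(opts);
}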
} // namespace ROCKSDB_NAMESPACE

@@ -0,0 +1,76 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include "cache/cache_reservation_manager.h"
#include "rocksdb/secondary_cache.h"
namespace ROCKSDB_NAMESPACE {
class CacheWithSecondaryAdapter : public CacheWrapper {
public:
explicit CacheWithSecondaryAdapter(
std::shared_ptr<Cache> target,
std::shared_ptr<SecondaryCache> secondary_cache,
bool distribute_cache_res = false);
~CacheWithSecondaryAdapter() override;
Status Insert(const Slice& key, ObjectPtr value,
const CacheItemHelper* helper, size_t charge,
Handle** handle = nullptr,
Priority priority = Priority::LOW) override;
Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
CreateContext* create_context,
Priority priority = Priority::LOW,
Statistics* stats = nullptr) override;
using Cache::Release;
bool Release(Handle* handle, bool erase_if_last_ref = false) override;
ObjectPtr Value(Handle* handle) override;
void StartAsyncLookup(AsyncLookupHandle& async_handle) override;
void WaitAll(AsyncLookupHandle* async_handles, size_t count) override;
std::string GetPrintableOptions() const override;
const char* Name() const override;
Cache* TEST_GetCache() { return target_.get(); }
SecondaryCache* TEST_GetSecondaryCache() { return secondary_cache_.get(); }
private:
bool EvictionHandler(const Slice& key, Handle* handle);
void StartAsyncLookupOnMySecondary(AsyncLookupHandle& async_handle);
Handle* Promote(
std::unique_ptr<SecondaryCacheResultHandle>&& secondary_handle,
const Slice& key, const CacheItemHelper* helper, Priority priority,
Statistics* stats, bool found_dummy_entry, bool kept_in_sec_cache);
bool ProcessDummyResult(Cache::Handle** handle, bool erase);
void CleanupCacheObject(ObjectPtr obj, const CacheItemHelper* helper);
std::shared_ptr<SecondaryCache> secondary_cache_;
// Whether to proportionally distribute cache memory reservations, i.e.
// placeholder entries with null value and a non-zero charge, across
// the primary and secondary caches.
bool distribute_cache_res_;
// A cache reservation manager to keep track of secondary cache memory
// usage by reserving equivalent capacity against the primary cache
std::shared_ptr<ConcurrentCacheReservationManager> pri_cache_res_;
// Fraction of a cache memory reservation to be assigned to the secondary
// cache
double sec_cache_res_ratio_;
};
} // namespace ROCKSDB_NAMESPACE

@@ -13,20 +13,57 @@
#include <cstdint>
#include <memory>
#include "env/unique_id_gen.h"
#include "rocksdb/env.h"
#include "util/hash.h"
#include "util/math.h"
#include "util/mutexlock.h"
namespace ROCKSDB_NAMESPACE {
namespace {
// The generated seeds must fit in 31 bits so that
// ShardedCacheOptions::hash_seed can be set to them explicitly, for
// diagnostic/debugging purposes.
constexpr uint32_t kSeedMask = 0x7fffffff;
uint32_t DetermineSeed(int32_t hash_seed_option) {
if (hash_seed_option >= 0) {
// User-specified exact seed
return static_cast<uint32_t>(hash_seed_option);
}
static SemiStructuredUniqueIdGen gen;
if (hash_seed_option == ShardedCacheOptions::kHostHashSeed) {
std::string hostname;
Status s = Env::Default()->GetHostNameString(&hostname);
if (s.ok()) {
return GetSliceHash(hostname) & kSeedMask;
} else {
// Fall back on something stable within the process.
return BitwiseAnd(gen.GetBaseUpper(), kSeedMask);
}
} else {
// for kQuasiRandomHashSeed and fallback
uint32_t val = gen.GenerateNext<uint32_t>() & kSeedMask;
// Perform some 31-bit bijective transformations so that we get
// quasirandom, not just incrementing. (An incrementing seed from a
// random starting point would be fine, but hard to describe in a name.)
// See https://en.wikipedia.org/wiki/Quasirandom. We use a murmur-like
// transformation here for our bijection in the lower 31 bits.
// See https://en.wikipedia.org/wiki/MurmurHash
val *= /*31-bit prime*/ 1150630961;
val ^= (val & kSeedMask) >> 17;
val *= /*31-bit prime*/ 1320603883;
return val & kSeedMask;
}
}
} // namespace
ShardedCacheBase::ShardedCacheBase(size_t capacity, int num_shard_bits,
bool strict_capacity_limit,
std::shared_ptr<MemoryAllocator> allocator)
: Cache(std::move(allocator)),
ShardedCacheBase::ShardedCacheBase(const ShardedCacheOptions& opts)
: Cache(opts.memory_allocator),
last_id_(1),
shard_mask_((uint32_t{1} << num_shard_bits) - 1),
strict_capacity_limit_(strict_capacity_limit),
capacity_(capacity) {}
shard_mask_((uint32_t{1} << opts.num_shard_bits) - 1),
hash_seed_(DetermineSeed(opts.hash_seed)),
strict_capacity_limit_(opts.strict_capacity_limit),
capacity_(opts.capacity) {}
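// A sketch of the three seeding modes consumed by DetermineSeed() above.
// ExampleSeedOptions is a hypothetical illustration; the constant names are
// the ones referenced in the code (kHostHashSeed, kQuasiRandomHashSeed).
void ExampleSeedOptions() {
  LRUCacheOptions opts;
  opts.hash_seed = 42;  // >= 0: use exactly this seed (diagnostics/debugging)
  opts.hash_seed = ShardedCacheOptions::kHostHashSeed;  // derived from hostname
  opts.hash_seed =
      ShardedCacheOptions::kQuasiRandomHashSeed;  // per-process quasirandom
}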
size_t ShardedCacheBase::ComputePerShardCapacity(size_t capacity) const {
uint32_t num_shards = GetNumShards();

@@ -15,7 +15,7 @@
#include "port/lang.h"
#include "port/port.h"
#include "rocksdb/cache.h"
#include "rocksdb/advanced_cache.h"
#include "util/hash.h"
#include "util/mutexlock.h"
@@ -34,8 +34,8 @@ class CacheShardBase {
std::string GetPrintableOptions() const { return ""; }
using HashVal = uint64_t;
using HashCref = uint64_t;
static inline HashVal ComputeHash(const Slice& key) {
return GetSliceNPHash64(key);
static inline HashVal ComputeHash(const Slice& key, uint32_t seed) {
return GetSliceNPHash64(key, seed);
}
static inline uint32_t HashPieceForSharding(HashCref hash) {
return Lower32of64(hash);
@@ -49,21 +49,19 @@ class CacheShardBase {
HashCref GetHash() const;
...
};
Status Insert(const Slice& key, HashCref hash, void* value, size_t charge,
DeleterFn deleter, HandleImpl** handle,
Cache::Priority priority) = 0;
Status Insert(const Slice& key, HashCref hash, void* value,
Status Insert(const Slice& key, HashCref hash, Cache::ObjectPtr value,
const Cache::CacheItemHelper* helper, size_t charge,
HandleImpl** handle, Cache::Priority priority) = 0;
HandleImpl* Lookup(const Slice& key, HashCref hash) = 0;
HandleImpl** handle, Cache::Priority priority,
bool standalone) = 0;
Handle* CreateStandalone(const Slice& key, HashCref hash, ObjectPtr obj,
const CacheItemHelper* helper,
size_t charge, bool allow_uncharged) = 0;
HandleImpl* Lookup(const Slice& key, HashCref hash,
const Cache::CacheItemHelper* helper,
const Cache::CreateCallback& create_cb,
Cache::Priority priority, bool wait,
Cache::CreateContext* create_context,
Cache::Priority priority,
Statistics* stats) = 0;
bool Release(HandleImpl* handle, bool useful, bool erase_if_last_ref) = 0;
bool IsReady(HandleImpl* handle) = 0;
void Wait(HandleImpl* handle) = 0;
bool Ref(HandleImpl* handle) = 0;
void Erase(const Slice& key, HashCref hash) = 0;
void SetCapacity(size_t capacity) = 0;
@@ -77,8 +75,9 @@ class CacheShardBase {
// *state == 0 and implementation sets *state = SIZE_MAX to indicate
// completion.
void ApplyToSomeEntries(
const std::function<void(const Slice& key, void* value, size_t charge,
DeleterFn deleter)>& callback,
const std::function<void(const Slice& key, ObjectPtr value,
size_t charge,
const Cache::CacheItemHelper* helper)>& callback,
size_t average_entries_per_lock, size_t* state) = 0;
void EraseUnRefEntries() = 0;
*/
@@ -90,9 +89,7 @@ class CacheShardBase {
// Portions of ShardedCache that do not depend on the template parameter
class ShardedCacheBase : public Cache {
public:
ShardedCacheBase(size_t capacity, int num_shard_bits,
bool strict_capacity_limit,
std::shared_ptr<MemoryAllocator> memory_allocator);
explicit ShardedCacheBase(const ShardedCacheOptions& opts);
virtual ~ShardedCacheBase() = default;
int GetNumShardBits() const;
@@ -107,6 +104,8 @@ class ShardedCacheBase : public Cache {
size_t GetUsage(Handle* handle) const override;
std::string GetPrintableOptions() const override;
uint32_t GetHashSeed() const override { return hash_seed_; }
protected: // fns
virtual void AppendPrintableOptions(std::string& str) const = 0;
size_t GetPerShardCapacity() const;
@@ -115,6 +114,7 @@ class ShardedCacheBase : public Cache {
protected: // data
std::atomic<uint64_t> last_id_; // For NewId
const uint32_t shard_mask_;
const uint32_t hash_seed_;
// Dynamic configuration parameters, guarded by config_mutex_
bool strict_capacity_limit_;
@@ -135,10 +135,8 @@ class ShardedCache : public ShardedCacheBase {
using HashCref = typename CacheShard::HashCref;
using HandleImpl = typename CacheShard::HandleImpl;
ShardedCache(size_t capacity, int num_shard_bits, bool strict_capacity_limit,
std::shared_ptr<MemoryAllocator> allocator)
: ShardedCacheBase(capacity, num_shard_bits, strict_capacity_limit,
allocator),
explicit ShardedCache(const ShardedCacheOptions& opts)
: ShardedCacheBase(opts),
shards_(reinterpret_cast<CacheShard*>(port::cacheline_aligned_alloc(
sizeof(CacheShard) * GetNumShards()))),
destroy_shards_in_dtor_(false) {}
@@ -172,41 +170,38 @@ class ShardedCache : public ShardedCacheBase {
[s_c_l](CacheShard* cs) { cs->SetStrictCapacityLimit(s_c_l); });
}
Status Insert(const Slice& key, void* value, size_t charge, DeleterFn deleter,
Handle** handle, Priority priority) override {
HashVal hash = CacheShard::ComputeHash(key);
auto h_out = reinterpret_cast<HandleImpl**>(handle);
return GetShard(hash).Insert(key, hash, value, charge, deleter, h_out,
priority);
}
Status Insert(const Slice& key, void* value, const CacheItemHelper* helper,
Status Insert(const Slice& key, ObjectPtr obj, const CacheItemHelper* helper,
size_t charge, Handle** handle = nullptr,
Priority priority = Priority::LOW) override {
if (!helper) {
return Status::InvalidArgument();
}
HashVal hash = CacheShard::ComputeHash(key);
assert(helper);
HashVal hash = CacheShard::ComputeHash(key, hash_seed_);
auto h_out = reinterpret_cast<HandleImpl**>(handle);
return GetShard(hash).Insert(key, hash, value, helper, charge, h_out,
return GetShard(hash).Insert(key, hash, obj, helper, charge, h_out,
priority);
}
Handle* Lookup(const Slice& key, Statistics* /*stats*/) override {
HashVal hash = CacheShard::ComputeHash(key);
HandleImpl* result = GetShard(hash).Lookup(key, hash);
Handle* CreateStandalone(const Slice& key, ObjectPtr obj,
const CacheItemHelper* helper, size_t charge,
bool allow_uncharged) override {
assert(helper);
HashVal hash = CacheShard::ComputeHash(key, hash_seed_);
HandleImpl* result = GetShard(hash).CreateStandalone(
key, hash, obj, helper, charge, allow_uncharged);
return reinterpret_cast<Handle*>(result);
}
Handle* Lookup(const Slice& key, const CacheItemHelper* helper,
const CreateCallback& create_cb, Priority priority, bool wait,
Handle* Lookup(const Slice& key, const CacheItemHelper* helper = nullptr,
CreateContext* create_context = nullptr,
Priority priority = Priority::LOW,
Statistics* stats = nullptr) override {
HashVal hash = CacheShard::ComputeHash(key);
HandleImpl* result = GetShard(hash).Lookup(key, hash, helper, create_cb,
priority, wait, stats);
HashVal hash = CacheShard::ComputeHash(key, hash_seed_);
HandleImpl* result = GetShard(hash).Lookup(key, hash, helper,
create_context, priority, stats);
return reinterpret_cast<Handle*>(result);
}
void Erase(const Slice& key) override {
HashVal hash = CacheShard::ComputeHash(key);
HashVal hash = CacheShard::ComputeHash(key, hash_seed_);
GetShard(hash).Erase(key, hash);
}
@@ -215,14 +210,6 @@ class ShardedCache : public ShardedCacheBase {
auto h = reinterpret_cast<HandleImpl*>(handle);
return GetShard(h->GetHash()).Release(h, useful, erase_if_last_ref);
}
bool IsReady(Handle* handle) override {
auto h = reinterpret_cast<HandleImpl*>(handle);
return GetShard(h->GetHash()).IsReady(h);
}
void Wait(Handle* handle) override {
auto h = reinterpret_cast<HandleImpl*>(handle);
GetShard(h->GetHash()).Wait(h);
}
bool Ref(Handle* handle) override {
auto h = reinterpret_cast<HandleImpl*>(handle);
return GetShard(h->GetHash()).Ref(h);
@@ -238,14 +225,14 @@ class ShardedCache : public ShardedCacheBase {
return SumOverShards2(&CacheShard::GetPinnedUsage);
}
size_t GetOccupancyCount() const override {
return SumOverShards2(&CacheShard::GetPinnedUsage);
return SumOverShards2(&CacheShard::GetOccupancyCount);
}
size_t GetTableAddressCount() const override {
return SumOverShards2(&CacheShard::GetTableAddressCount);
}
void ApplyToAllEntries(
const std::function<void(const Slice& key, void* value, size_t charge,
DeleterFn deleter)>& callback,
const std::function<void(const Slice& key, ObjectPtr value, size_t charge,
const CacheItemHelper* helper)>& callback,
const ApplyToAllEntriesOptions& opts) override {
uint32_t num_shards = GetNumShards();
// Iterate over part of each shard, rotating between shards, to

cache/typed_cache.h

@@ -0,0 +1,375 @@
// Copyright (c) Meta Platforms, Inc. and affiliates.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
// APIs for accessing Cache in a type-safe and convenient way. Cache is kept
// at a low, thin level of abstraction so that different implementations can
// be plugged in, but these wrappers provide clean, convenient access to the
// most common operations.
//
// A number of template classes are needed for sharing common structure. The
// key classes are these:
//
// * PlaceholderCacheInterface - Used for making cache reservations, with
// entries that have a charge but no value.
// * BasicTypedCacheInterface<TValue> - Used for primary cache storage of
// objects of type TValue.
// * FullTypedCacheHelper<TValue, TCreateContext> - Used for secondary cache
// compatible storage of objects of type TValue.
// * For each of these, there's a "Shared" version
// (e.g. FullTypedSharedCacheInterface) that holds a shared_ptr to the Cache,
// rather than assuming external ownership by holding only a raw `Cache*`.
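//
// A hypothetical end-to-end sketch of these wrappers (MyBlob and
// MyBlobCreator are illustrative stand-ins, not RocksDB types):
//
//   struct MyBlob {
//     static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kMisc;
//     std::string payload;
//     // Persistable view of the object, for secondary cache compatibility.
//     Slice ContentSlice() const { return Slice(payload); }
//   };
//   struct MyBlobCreator : public Cache::CreateContext {
//     // Rebuilds a MyBlob from persisted bytes (signature expected by
//     // FullTypedCacheHelperFns::Create below).
//     void Create(std::unique_ptr<MyBlob>* out, size_t* out_charge,
//                 const Slice& data, MemoryAllocator* /*allocator*/) {
//       *out = std::make_unique<MyBlob>();
//       (*out)->payload.assign(data.data(), data.size());
//       *out_charge = (*out)->payload.size();
//     }
//   };
//   void Example(Cache* cache) {
//     FullTypedCacheInterface<MyBlob, MyBlobCreator> typed(cache);
//     auto blob = std::make_unique<MyBlob>();
//     blob->payload = "hello";
//     size_t charge = sizeof(MyBlob) + blob->payload.size();
//     // The cache takes ownership (helper's Delete cleans up on failure).
//     Status s = typed.InsertFull("key", blob.release(), charge);
//     MyBlobCreator ctx;
//     if (auto* h = typed.LookupFull("key", &ctx)) {
//       MyBlob* v = typed.Value(h);  // typed access to the cached object
//       typed.Release(h);
//     }
//   }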
#pragma once
#include <algorithm>
#include <cstdint>
#include <memory>
#include <type_traits>
#include "cache/cache_helpers.h"
#include "rocksdb/advanced_cache.h"
#include "rocksdb/advanced_options.h"
namespace ROCKSDB_NAMESPACE {
// For future consideration:
// * Pass in value to Insert with std::unique_ptr& to simplify ownership
// transfer logic in callers
// * Make key type a template parameter (e.g. useful for table cache)
// * Closer integration with CacheHandleGuard (opt-in, so not always
// paying the extra overhead)
#define CACHE_TYPE_DEFS() \
using Priority = Cache::Priority; \
using Handle = Cache::Handle; \
using ObjectPtr = Cache::ObjectPtr; \
using CreateContext = Cache::CreateContext; \
using CacheItemHelper = Cache::CacheItemHelper /* caller ; */
template <typename CachePtr>
class BaseCacheInterface {
public:
CACHE_TYPE_DEFS();
/*implicit*/ BaseCacheInterface(CachePtr cache) : cache_(std::move(cache)) {}
inline void Release(Handle* handle) { cache_->Release(handle); }
inline void ReleaseAndEraseIfLastRef(Handle* handle) {
cache_->Release(handle, /*erase_if_last_ref*/ true);
}
inline void RegisterReleaseAsCleanup(Handle* handle, Cleanable& cleanable) {
cleanable.RegisterCleanup(&ReleaseCacheHandleCleanup, get(), handle);
}
inline Cache* get() const { return &*cache_; }
explicit inline operator bool() const noexcept { return cache_ != nullptr; }
protected:
CachePtr cache_;
};
// PlaceholderCacheInterface - Used for making cache reservations, with
// entries that have a charge but no value. CacheEntryRole is required as
// a template parameter.
template <CacheEntryRole kRole, typename CachePtr = Cache*>
class PlaceholderCacheInterface : public BaseCacheInterface<CachePtr> {
public:
CACHE_TYPE_DEFS();
using BaseCacheInterface<CachePtr>::BaseCacheInterface;
inline Status Insert(const Slice& key, size_t charge, Handle** handle) {
return this->cache_->Insert(key, /*value=*/nullptr, GetHelper(), charge,
handle);
}
static const Cache::CacheItemHelper* GetHelper() {
static const Cache::CacheItemHelper kHelper{kRole};
return &kHelper;
}
};
template <CacheEntryRole kRole>
using PlaceholderSharedCacheInterface =
PlaceholderCacheInterface<kRole, std::shared_ptr<Cache>>;
template <class TValue>
class BasicTypedCacheHelperFns {
public:
CACHE_TYPE_DEFS();
// E.g. char* for char[]
using TValuePtr = std::remove_extent_t<TValue>*;
protected:
inline static ObjectPtr UpCastValue(TValuePtr value) { return value; }
inline static TValuePtr DownCastValue(ObjectPtr value) {
return static_cast<TValuePtr>(value);
}
static void Delete(ObjectPtr value, MemoryAllocator* allocator) {
// FIXME: Currently, no callers actually allocate the ObjectPtr objects
// using the custom allocator, just subobjects that keep a reference to
// the allocator themselves (with CacheAllocationPtr).
if (/*DISABLED*/ false && allocator) {
if constexpr (std::is_destructible_v<TValue>) {
DownCastValue(value)->~TValue();
}
allocator->Deallocate(value);
} else {
// Like delete but properly handles TValue=char[] etc.
std::default_delete<TValue>{}(DownCastValue(value));
}
}
};
// In its own class to try to minimize the number of distinct CacheItemHelper
// instances (e.g. don't vary by CachePtr)
template <class TValue, CacheEntryRole kRole>
class BasicTypedCacheHelper : public BasicTypedCacheHelperFns<TValue> {
public:
static const Cache::CacheItemHelper* GetBasicHelper() {
static const Cache::CacheItemHelper kHelper{kRole,
&BasicTypedCacheHelper::Delete};
return &kHelper;
}
};
// BasicTypedCacheInterface - Used for primary cache storage of objects of
// type TValue, which can be cleaned up with std::default_delete<TValue>. The
// role is provided by TValue::kCacheEntryRole or given in an optional
// template parameter.
template <class TValue, CacheEntryRole kRole = TValue::kCacheEntryRole,
typename CachePtr = Cache*>
class BasicTypedCacheInterface : public BaseCacheInterface<CachePtr>,
public BasicTypedCacheHelper<TValue, kRole> {
public:
CACHE_TYPE_DEFS();
using typename BasicTypedCacheHelperFns<TValue>::TValuePtr;
struct TypedHandle : public Handle {};
using BasicTypedCacheHelper<TValue, kRole>::GetBasicHelper;
// ctor
using BaseCacheInterface<CachePtr>::BaseCacheInterface;
struct TypedAsyncLookupHandle : public Cache::AsyncLookupHandle {
TypedHandle* Result() {
return reinterpret_cast<TypedHandle*>(Cache::AsyncLookupHandle::Result());
}
};
inline Status Insert(const Slice& key, TValuePtr value, size_t charge,
TypedHandle** handle = nullptr,
Priority priority = Priority::LOW) {
auto untyped_handle = reinterpret_cast<Handle**>(handle);
return this->cache_->Insert(
key, BasicTypedCacheHelperFns<TValue>::UpCastValue(value),
GetBasicHelper(), charge, untyped_handle, priority);
}
inline TypedHandle* Lookup(const Slice& key, Statistics* stats = nullptr) {
return reinterpret_cast<TypedHandle*>(
this->cache_->BasicLookup(key, stats));
}
inline void StartAsyncLookup(TypedAsyncLookupHandle& async_handle) {
assert(async_handle.helper == nullptr);
this->cache_->StartAsyncLookup(async_handle);
}
inline CacheHandleGuard<TValue> Guard(TypedHandle* handle) {
if (handle) {
return CacheHandleGuard<TValue>(&*this->cache_, handle);
} else {
return {};
}
}
inline std::shared_ptr<TValue> SharedGuard(TypedHandle* handle) {
if (handle) {
return MakeSharedCacheHandleGuard<TValue>(&*this->cache_, handle);
} else {
return {};
}
}
inline TValuePtr Value(TypedHandle* handle) {
return BasicTypedCacheHelperFns<TValue>::DownCastValue(
this->cache_->Value(handle));
}
};
// BasicTypedSharedCacheInterface - Like BasicTypedCacheInterface but with a
// shared_ptr<Cache> for keeping Cache alive.
template <class TValue, CacheEntryRole kRole = TValue::kCacheEntryRole>
using BasicTypedSharedCacheInterface =
BasicTypedCacheInterface<TValue, kRole, std::shared_ptr<Cache>>;
// TValue must implement ContentSlice() and ~TValue
// TCreateContext must implement Create(std::unique_ptr<TValue>*, ...)
template <class TValue, class TCreateContext>
class FullTypedCacheHelperFns : public BasicTypedCacheHelperFns<TValue> {
public:
CACHE_TYPE_DEFS();
protected:
using typename BasicTypedCacheHelperFns<TValue>::TValuePtr;
using BasicTypedCacheHelperFns<TValue>::DownCastValue;
using BasicTypedCacheHelperFns<TValue>::UpCastValue;
static size_t Size(ObjectPtr v) {
TValuePtr value = DownCastValue(v);
auto slice = value->ContentSlice();
return slice.size();
}
static Status SaveTo(ObjectPtr v, size_t from_offset, size_t length,
char* out) {
TValuePtr value = DownCastValue(v);
auto slice = value->ContentSlice();
assert(from_offset < slice.size());
assert(from_offset + length <= slice.size());
std::copy_n(slice.data() + from_offset, length, out);
return Status::OK();
}
static Status Create(const Slice& data, CreateContext* context,
MemoryAllocator* allocator, ObjectPtr* out_obj,
size_t* out_charge) {
std::unique_ptr<TValue> value = nullptr;
if constexpr (sizeof(TCreateContext) > 0) {
TCreateContext* tcontext = static_cast<TCreateContext*>(context);
tcontext->Create(&value, out_charge, data, allocator);
} else {
TCreateContext::Create(&value, out_charge, data, allocator);
}
*out_obj = UpCastValue(value.release());
return Status::OK();
}
};
// In its own class to try to minimize the number of distinct CacheItemHelper
// instances (e.g. don't vary by CachePtr)
template <class TValue, class TCreateContext, CacheEntryRole kRole>
class FullTypedCacheHelper
: public FullTypedCacheHelperFns<TValue, TCreateContext> {
public:
static const Cache::CacheItemHelper* GetFullHelper() {
static const Cache::CacheItemHelper kHelper{
kRole,
&FullTypedCacheHelper::Delete,
&FullTypedCacheHelper::Size,
&FullTypedCacheHelper::SaveTo,
&FullTypedCacheHelper::Create,
BasicTypedCacheHelper<TValue, kRole>::GetBasicHelper()};
return &kHelper;
}
};
// FullTypedCacheHelper - Used for secondary cache compatible storage of
// objects of type TValue. In addition to BasicTypedCacheInterface constraints,
// we require TValue::ContentSlice() to return persistable data. This
// simplifies usage for the normal case of simple secondary cache compatibility
// (can give you a Slice to the data already in memory). In addition to
// TCreateContext performing the role of Cache::CreateContext, it is also
// expected to provide a function Create(std::unique_ptr<TValue>* value,
// size_t* out_charge, const Slice& data, MemoryAllocator* allocator) for
// creating new TValue.
template <class TValue, class TCreateContext,
CacheEntryRole kRole = TValue::kCacheEntryRole,
typename CachePtr = Cache*>
class FullTypedCacheInterface
: public BasicTypedCacheInterface<TValue, kRole, CachePtr>,
public FullTypedCacheHelper<TValue, TCreateContext, kRole> {
public:
CACHE_TYPE_DEFS();
using typename BasicTypedCacheInterface<TValue, kRole, CachePtr>::TypedHandle;
using typename BasicTypedCacheInterface<TValue, kRole,
CachePtr>::TypedAsyncLookupHandle;
using typename BasicTypedCacheHelperFns<TValue>::TValuePtr;
using BasicTypedCacheHelper<TValue, kRole>::GetBasicHelper;
using FullTypedCacheHelper<TValue, TCreateContext, kRole>::GetFullHelper;
using BasicTypedCacheHelperFns<TValue>::UpCastValue;
using BasicTypedCacheHelperFns<TValue>::DownCastValue;
// ctor
using BasicTypedCacheInterface<TValue, kRole,
CachePtr>::BasicTypedCacheInterface;
// Insert with SecondaryCache compatibility (subject to CacheTier).
// (Basic Insert() also inherited.)
inline Status InsertFull(
const Slice& key, TValuePtr value, size_t charge,
TypedHandle** handle = nullptr, Priority priority = Priority::LOW,
CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier) {
auto untyped_handle = reinterpret_cast<Handle**>(handle);
auto helper = lowest_used_cache_tier == CacheTier::kNonVolatileBlockTier
? GetFullHelper()
: GetBasicHelper();
return this->cache_->Insert(key, UpCastValue(value), helper, charge,
untyped_handle, priority);
}
// Like SecondaryCache::InsertSaved, with SecondaryCache compatibility
// (subject to CacheTier).
inline Status InsertSaved(
const Slice& key, const Slice& data, TCreateContext* create_context,
Priority priority = Priority::LOW,
CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier,
size_t* out_charge = nullptr) {
ObjectPtr value;
size_t charge;
Status st = GetFullHelper()->create_cb(data, create_context,
this->cache_->memory_allocator(),
&value, &charge);
if (out_charge) {
*out_charge = charge;
}
if (st.ok()) {
st = InsertFull(key, DownCastValue(value), charge, nullptr /*handle*/,
priority, lowest_used_cache_tier);
} else {
GetFullHelper()->del_cb(value, this->cache_->memory_allocator());
}
return st;
}
// Lookup with SecondaryCache support (subject to CacheTier).
// (Basic Lookup() also inherited.)
inline TypedHandle* LookupFull(
const Slice& key, TCreateContext* create_context = nullptr,
Priority priority = Priority::LOW, Statistics* stats = nullptr,
CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier) {
if (lowest_used_cache_tier == CacheTier::kNonVolatileBlockTier) {
return reinterpret_cast<TypedHandle*>(this->cache_->Lookup(
key, GetFullHelper(), create_context, priority, stats));
} else {
return BasicTypedCacheInterface<TValue, kRole, CachePtr>::Lookup(key,
stats);
}
}
inline void StartAsyncLookupFull(
TypedAsyncLookupHandle& async_handle,
CacheTier lowest_used_cache_tier = CacheTier::kNonVolatileBlockTier) {
if (lowest_used_cache_tier == CacheTier::kNonVolatileBlockTier) {
async_handle.helper = GetFullHelper();
this->cache_->StartAsyncLookup(async_handle);
} else {
BasicTypedCacheInterface<TValue, kRole, CachePtr>::StartAsyncLookup(
async_handle);
}
}
};
// FullTypedSharedCacheInterface - Like FullTypedCacheInterface but with a
// shared_ptr<Cache> for keeping Cache alive.
template <class TValue, class TCreateContext,
CacheEntryRole kRole = TValue::kCacheEntryRole>
using FullTypedSharedCacheInterface =
FullTypedCacheInterface<TValue, TCreateContext, kRole,
std::shared_ptr<Cache>>;
#undef CACHE_TYPE_DEFS
} // namespace ROCKSDB_NAMESPACE

@@ -12,7 +12,7 @@ fi
ROOT=".."
# Fetch right version of gcov
if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then
source $ROOT/build_tools/fbcode_config_platform009.sh
source $ROOT/build_tools/fbcode_config_platform010.sh
GCOV=$GCC_BASE/bin/gcov
else
GCOV=$(which gcov)

@@ -21,6 +21,8 @@ CRASHTEST_PY=$(PYTHON) -u tools/db_crashtest.py --stress_cmd=$(DB_STRESS_CMD) --
blackbox_crash_test_with_multiops_wp_txn \
crash_test_with_tiered_storage blackbox_crash_test_with_tiered_storage \
whitebox_crash_test_with_tiered_storage \
whitebox_crash_test_with_optimistic_txn \
blackbox_crash_test_with_optimistic_txn \
crash_test: $(DB_STRESS_CMD)
# Do not parallelize
@@ -37,6 +39,11 @@ crash_test_with_txn: $(DB_STRESS_CMD)
$(CRASHTEST_MAKE) whitebox_crash_test_with_txn
$(CRASHTEST_MAKE) blackbox_crash_test_with_txn
crash_test_with_optimistic_txn: $(DB_STRESS_CMD)
# Do not parallelize
$(CRASHTEST_MAKE) whitebox_crash_test_with_optimistic_txn
$(CRASHTEST_MAKE) blackbox_crash_test_with_optimistic_txn
crash_test_with_best_efforts_recovery: blackbox_crash_test_with_best_efforts_recovery
crash_test_with_ts: $(DB_STRESS_CMD)
@@ -80,6 +87,9 @@ blackbox_crash_test_with_multiops_wp_txn: $(DB_STRESS_CMD)
blackbox_crash_test_with_tiered_storage: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --test_tiered_storage blackbox $(CRASH_TEST_EXT_ARGS)
blackbox_crash_test_with_optimistic_txn: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --optimistic_txn blackbox $(CRASH_TEST_EXT_ARGS)
ifeq ($(CRASH_TEST_KILL_ODD),)
CRASH_TEST_KILL_ODD=888887
endif
@@ -105,3 +115,7 @@ whitebox_crash_test_with_ts: $(DB_STRESS_CMD)
whitebox_crash_test_with_tiered_storage: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --test_tiered_storage whitebox --random_kill_odd \
$(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
whitebox_crash_test_with_optimistic_txn: $(DB_STRESS_CMD)
$(CRASHTEST_PY) --optimistic_txn whitebox --random_kill_odd \
$(CRASH_TEST_KILL_ODD) $(CRASH_TEST_EXT_ARGS)
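For context (not part of the diff): the new targets are driven exactly like the existing crash-test targets, e.g. `make crash_test_with_optimistic_txn`, which runs the whitebox pass followed by the blackbox pass serially (hence the "Do not parallelize" comment), each invoking tools/db_crashtest.py with the --optimistic_txn flag through $(CRASHTEST_PY).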

@@ -47,6 +47,11 @@ void ArenaWrappedDBIter::Init(
read_options_ = read_options;
allow_refresh_ = allow_refresh;
memtable_range_tombstone_iter_ = nullptr;
if (!CheckFSFeatureSupport(env->GetFileSystem().get(),
FSSupportedOps::kAsyncIO)) {
read_options_.async_io = false;
}
}
Status ArenaWrappedDBIter::Refresh() {
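For context (not part of the diff): the guard added to Init() can be mirrored in application code before handing ReadOptions to an iterator. A minimal hedged sketch, assuming the CheckFSFeatureSupport()/FSSupportedOps helpers from rocksdb/file_system.h used above; the helper name is invented.
#include "rocksdb/env.h"
#include "rocksdb/file_system.h"
#include "rocksdb/options.h"
// Enable async_io only when the underlying FileSystem advertises kAsyncIO;
// otherwise fall back to synchronous reads, as Init() now does.
ROCKSDB_NAMESPACE::ReadOptions MakeIterReadOptions(
    ROCKSDB_NAMESPACE::Env* env) {
  ROCKSDB_NAMESPACE::ReadOptions read_options;
  read_options.async_io = ROCKSDB_NAMESPACE::CheckFSFeatureSupport(
      env->GetFileSystem().get(),
      ROCKSDB_NAMESPACE::FSSupportedOps::kAsyncIO);
  return read_options;
}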

@@ -13,12 +13,6 @@
namespace ROCKSDB_NAMESPACE {
std::unique_ptr<BlobContents> BlobContents::Create(
CacheAllocationPtr&& allocation, size_t size) {
return std::unique_ptr<BlobContents>(
new BlobContents(std::move(allocation), size));
}
size_t BlobContents::ApproximateMemoryUsage() const {
size_t usage = 0;
@@ -45,46 +39,4 @@ size_t BlobContents::ApproximateMemoryUsage() const {
return usage;
}
size_t BlobContents::SizeCallback(void* obj) {
assert(obj);
return static_cast<const BlobContents*>(obj)->size();
}
Status BlobContents::SaveToCallback(void* from_obj, size_t from_offset,
size_t length, void* out) {
assert(from_obj);
const BlobContents* buf = static_cast<const BlobContents*>(from_obj);
assert(buf->size() >= from_offset + length);
memcpy(out, buf->data().data() + from_offset, length);
return Status::OK();
}
Cache::CacheItemHelper* BlobContents::GetCacheItemHelper() {
static Cache::CacheItemHelper cache_helper(
&SizeCallback, &SaveToCallback,
GetCacheEntryDeleterForRole<BlobContents, CacheEntryRole::kBlobValue>());
return &cache_helper;
}
Status BlobContents::CreateCallback(CacheAllocationPtr&& allocation,
const void* buf, size_t size,
void** out_obj, size_t* charge) {
assert(allocation);
memcpy(allocation.get(), buf, size);
std::unique_ptr<BlobContents> obj = Create(std::move(allocation), size);
BlobContents* const contents = obj.release();
*out_obj = contents;
*charge = contents->ApproximateMemoryUsage();
return Status::OK();
}
} // namespace ROCKSDB_NAMESPACE

@@ -7,8 +7,8 @@
#include <memory>
#include "memory/memory_allocator.h"
#include "rocksdb/cache.h"
#include "memory/memory_allocator_impl.h"
#include "rocksdb/advanced_cache.h"
#include "rocksdb/rocksdb_namespace.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
@@ -18,8 +18,8 @@ namespace ROCKSDB_NAMESPACE {
// A class representing a single uncompressed value read from a blob file.
class BlobContents {
public:
static std::unique_ptr<BlobContents> Create(CacheAllocationPtr&& allocation,
size_t size);
BlobContents(CacheAllocationPtr&& allocation, size_t size)
: allocation_(std::move(allocation)), data_(allocation_.get(), size) {}
BlobContents(const BlobContents&) = delete;
BlobContents& operator=(const BlobContents&) = delete;
@@ -34,23 +34,26 @@ class BlobContents {
size_t ApproximateMemoryUsage() const;
// Callbacks for secondary cache
static size_t SizeCallback(void* obj);
static Status SaveToCallback(void* from_obj, size_t from_offset,
size_t length, void* out);
static Cache::CacheItemHelper* GetCacheItemHelper();
static Status CreateCallback(CacheAllocationPtr&& allocation, const void* buf,
size_t size, void** out_obj, size_t* charge);
// For TypedCacheInterface
const Slice& ContentSlice() const { return data_; }
static constexpr CacheEntryRole kCacheEntryRole = CacheEntryRole::kBlobValue;
private:
BlobContents(CacheAllocationPtr&& allocation, size_t size)
: allocation_(std::move(allocation)), data_(allocation_.get(), size) {}
CacheAllocationPtr allocation_;
Slice data_;
};
class BlobContentsCreator : public Cache::CreateContext {
public:
static void Create(std::unique_ptr<BlobContents>* out, size_t* out_charge,
const Slice& contents, MemoryAllocator* alloc) {
auto raw = new BlobContents(AllocateAndCopyBlock(contents, alloc),
contents.size());
out->reset(raw);
if (out_charge) {
*out_charge = raw->ApproximateMemoryUsage();
}
}
};
} // namespace ROCKSDB_NAMESPACE
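A hedged usage sketch of the creator above (internal API; the helper name CopyBlobForCache is invented for illustration, and AllocateAndCopyBlock comes from memory/memory_allocator_impl.h per the include change):
#include <cassert>
#include <memory>
#include "db/blob/blob_contents.h"
namespace ROCKSDB_NAMESPACE {
// Make a heap-allocated, self-contained copy of `value` plus the charge the
// cache would account for it. `alloc` may be nullptr, in which case the
// block is allocated with plain new[].
std::unique_ptr<BlobContents> CopyBlobForCache(const Slice& value,
                                               MemoryAllocator* alloc,
                                               size_t* charge) {
  std::unique_ptr<BlobContents> contents;
  BlobContentsCreator::Create(&contents, charge, value, alloc);
  assert(contents != nullptr && contents->size() == value.size());
  return contents;
}
}  // namespace ROCKSDB_NAMESPACE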

@@ -134,13 +134,6 @@ class BlobCountingIterator : public InternalIterator {
if (!iter_->Valid()) {
status_ = iter_->status();
return;
} else if (iter_->IsDeleteRangeSentinelKey()) {
// CompactionMergingIterator emits range tombstones, and range tombstone
// keys can be truncated at file boundaries. This means the range
// tombstone keys can have op_type kTypeBlobIndex.
// This could crash the ProcessInFlow() call below since
// value is empty for these keys.
return;
}
TEST_SYNC_POINT(

@@ -13,6 +13,7 @@
#include "db/blob/blob_index.h"
#include "db/blob/blob_log_format.h"
#include "db/blob/blob_log_writer.h"
#include "db/blob/blob_source.h"
#include "db/event_helpers.h"
#include "db/version_set.h"
#include "file/filename.h"
@@ -258,6 +259,7 @@ Status BlobFileBuilder::CompressBlobIfNeeded(
return Status::OK();
}
// TODO: allow user CompressionOptions, including max_compressed_bytes_per_kb
CompressionOptions opts;
CompressionContext context(blob_compression_type_);
constexpr uint64_t sample_for_compression = 0;
@@ -393,7 +395,7 @@ Status BlobFileBuilder::PutBlobIntoCacheIfNeeded(const Slice& blob,
uint64_t blob_offset) const {
Status s = Status::OK();
auto blob_cache = immutable_options_->blob_cache;
BlobSource::SharedCacheInterface blob_cache{immutable_options_->blob_cache};
auto statistics = immutable_options_->statistics.get();
bool warm_cache =
prepopulate_blob_cache_ == PrepopulateBlobCache::kFlushOnly &&
@@ -407,34 +409,12 @@ Status BlobFileBuilder::PutBlobIntoCacheIfNeeded(const Slice& blob,
const Cache::Priority priority = Cache::Priority::BOTTOM;
// Objects to be put into the cache have to be heap-allocated and
// self-contained, i.e. own their contents. The Cache has to be able to
// take unique ownership of them.
CacheAllocationPtr allocation =
AllocateBlock(blob.size(), blob_cache->memory_allocator());
memcpy(allocation.get(), blob.data(), blob.size());
std::unique_ptr<BlobContents> buf =
BlobContents::Create(std::move(allocation), blob.size());
Cache::CacheItemHelper* const cache_item_helper =
BlobContents::GetCacheItemHelper();
assert(cache_item_helper);
if (immutable_options_->lowest_used_cache_tier ==
CacheTier::kNonVolatileBlockTier) {
s = blob_cache->Insert(key, buf.get(), cache_item_helper,
buf->ApproximateMemoryUsage(),
nullptr /* cache_handle */, priority);
} else {
s = blob_cache->Insert(key, buf.get(), buf->ApproximateMemoryUsage(),
cache_item_helper->del_cb,
nullptr /* cache_handle */, priority);
}
s = blob_cache.InsertSaved(key, blob, nullptr /*context*/, priority,
immutable_options_->lowest_used_cache_tier);
if (s.ok()) {
RecordTick(statistics, BLOB_DB_CACHE_ADD);
RecordTick(statistics, BLOB_DB_CACHE_BYTES_WRITE, buf->size());
buf.release();
RecordTick(statistics, BLOB_DB_CACHE_BYTES_WRITE, blob.size());
} else {
RecordTick(statistics, BLOB_DB_CACHE_ADD_FAILURES);
}
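For context (not part of the diff): PutBlobIntoCacheIfNeeded() only fires when a blob cache is configured and prepopulation on flush is enabled. A hedged sketch of the public options involved; the cache capacity is illustrative only.
#include "rocksdb/cache.h"
#include "rocksdb/options.h"
// Warm the blob cache at flush time so freshly written blobs can be read
// back without an extra blob file I/O.
ROCKSDB_NAMESPACE::Options MakeBlobWarmingOptions() {
  ROCKSDB_NAMESPACE::Options options;
  options.enable_blob_files = true;
  options.blob_cache = ROCKSDB_NAMESPACE::NewLRUCache(64 << 20 /* 64MB */);
  options.prepopulate_blob_cache =
      ROCKSDB_NAMESPACE::PrepopulateBlobCache::kFlushOnly;
  return options;
}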

@@ -25,7 +25,7 @@ BlobFileCache::BlobFileCache(Cache* cache,
HistogramImpl* blob_file_read_hist,
const std::shared_ptr<IOTracer>& io_tracer)
: cache_(cache),
mutex_(kNumberOfMutexStripes, kGetSliceNPHash64UnseededFnPtr),
mutex_(kNumberOfMutexStripes),
immutable_options_(immutable_options),
file_options_(file_options),
column_family_id_(column_family_id),
@@ -37,29 +37,29 @@ BlobFileCache::BlobFileCache(Cache* cache,
}
Status BlobFileCache::GetBlobFileReader(
uint64_t blob_file_number,
const ReadOptions& read_options, uint64_t blob_file_number,
CacheHandleGuard<BlobFileReader>* blob_file_reader) {
assert(blob_file_reader);
assert(blob_file_reader->IsEmpty());
const Slice key = GetSlice(&blob_file_number);
const Slice key = GetSliceForKey(&blob_file_number);
assert(cache_);
Cache::Handle* handle = cache_->Lookup(key);
TypedHandle* handle = cache_.Lookup(key);
if (handle) {
*blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
*blob_file_reader = cache_.Guard(handle);
return Status::OK();
}
TEST_SYNC_POINT("BlobFileCache::GetBlobFileReader:DoubleCheck");
// Check again while holding mutex
MutexLock lock(mutex_.get(key));
MutexLock lock(&mutex_.Get(key));
handle = cache_->Lookup(key);
handle = cache_.Lookup(key);
if (handle) {
*blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
*blob_file_reader = cache_.Guard(handle);
return Status::OK();
}
@@ -73,7 +73,7 @@ Status BlobFileCache::GetBlobFileReader(
{
assert(file_options_);
const Status s = BlobFileReader::Create(
*immutable_options_, *file_options_, column_family_id_,
*immutable_options_, read_options, *file_options_, column_family_id_,
blob_file_read_hist_, blob_file_number, io_tracer_, &reader);
if (!s.ok()) {
RecordTick(statistics, NO_FILE_ERRORS);
@@ -84,8 +84,7 @@ Status BlobFileCache::GetBlobFileReader(
{
constexpr size_t charge = 1;
const Status s = cache_->Insert(key, reader.get(), charge,
&DeleteCacheEntry<BlobFileReader>, &handle);
const Status s = cache_.Insert(key, reader.get(), charge, &handle);
if (!s.ok()) {
RecordTick(statistics, NO_FILE_ERRORS);
return s;
@@ -94,7 +93,7 @@ Status BlobFileCache::GetBlobFileReader(
reader.release();
*blob_file_reader = CacheHandleGuard<BlobFileReader>(cache_, handle);
*blob_file_reader = cache_.Guard(handle);
return Status::OK();
}
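For context (not part of the diff): GetBlobFileReader() above is a double-checked pattern: an optimistic cache lookup, then a per-key striped mutex, then a second lookup before the expensive file open. A hedged, self-contained sketch of the same idea using standard C++ types in place of Striped<...> and the typed cache interface:
#include <array>
#include <functional>
#include <map>
#include <memory>
#include <mutex>
template <typename K, typename V, size_t kStripes = 16>
class OpenOnce {
 public:
  std::shared_ptr<V> GetOrOpen(const K& key) {
    if (auto v = Find(key)) return v;  // fast path: no stripe lock taken
    std::lock_guard<std::mutex> open_lock(stripes_[Hash(key) % kStripes]);
    if (auto v = Find(key)) return v;  // re-check while holding the stripe
    auto v = std::make_shared<V>();    // the expensive "open"
    std::lock_guard<std::mutex> map_lock(map_mutex_);
    map_[key] = v;
    return v;
  }

 private:
  std::shared_ptr<V> Find(const K& key) {
    std::lock_guard<std::mutex> map_lock(map_mutex_);
    auto it = map_.find(key);
    return it == map_.end() ? nullptr : it->second;
  }
  static size_t Hash(const K& key) { return std::hash<K>{}(key); }
  std::mutex map_mutex_;
  std::map<K, std::shared_ptr<V>> map_;
  std::array<std::mutex, kStripes> stripes_;
};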

@@ -7,7 +7,8 @@
#include <cinttypes>
#include "cache/cache_helpers.h"
#include "cache/typed_cache.h"
#include "db/blob/blob_file_reader.h"
#include "rocksdb/rocksdb_namespace.h"
#include "util/mutexlock.h"
@@ -18,7 +19,6 @@ struct ImmutableOptions;
struct FileOptions;
class HistogramImpl;
class Status;
class BlobFileReader;
class Slice;
class IOTracer;
@@ -32,14 +32,18 @@ class BlobFileCache {
BlobFileCache(const BlobFileCache&) = delete;
BlobFileCache& operator=(const BlobFileCache&) = delete;
Status GetBlobFileReader(uint64_t blob_file_number,
Status GetBlobFileReader(const ReadOptions& read_options,
uint64_t blob_file_number,
CacheHandleGuard<BlobFileReader>* blob_file_reader);
private:
Cache* cache_;
using CacheInterface =
BasicTypedCacheInterface<BlobFileReader, CacheEntryRole::kMisc>;
using TypedHandle = CacheInterface::TypedHandle;
CacheInterface cache_;
// Note: mutex_ below is used to guard against multiple threads racing to open
// the same file.
Striped<port::Mutex, Slice> mutex_;
Striped<CacheAlignedWrapper<port::Mutex>> mutex_;
const ImmutableOptions* immutable_options_;
const FileOptions* file_options_;
uint32_t column_family_id_;

@@ -118,7 +118,9 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader) {
// First try: reader should be opened and put in cache
CacheHandleGuard<BlobFileReader> first;
ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first));
const ReadOptions read_options;
ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, blob_file_number,
&first));
ASSERT_NE(first.GetValue(), nullptr);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
@@ -126,7 +128,8 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader) {
// Second try: reader should be served from cache
CacheHandleGuard<BlobFileReader> second;
ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second));
ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, blob_file_number,
&second));
ASSERT_NE(second.GetValue(), nullptr);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
@@ -163,19 +166,21 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader_Race) {
CacheHandleGuard<BlobFileReader> first;
CacheHandleGuard<BlobFileReader> second;
const ReadOptions read_options;
SyncPoint::GetInstance()->SetCallBack(
"BlobFileCache::GetBlobFileReader:DoubleCheck", [&](void* /* arg */) {
// Disabling sync points to prevent infinite recursion
SyncPoint::GetInstance()->DisableProcessing();
ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second));
ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options,
blob_file_number, &second));
ASSERT_NE(second.GetValue(), nullptr);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
});
SyncPoint::GetInstance()->EnableProcessing();
ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first));
ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, blob_file_number,
&first));
ASSERT_NE(first.GetValue(), nullptr);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0);
@@ -213,8 +218,10 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader_IOError) {
CacheHandleGuard<BlobFileReader> reader;
const ReadOptions read_options;
ASSERT_TRUE(
blob_file_cache.GetBlobFileReader(blob_file_number, &reader).IsIOError());
blob_file_cache.GetBlobFileReader(read_options, blob_file_number, &reader)
.IsIOError());
ASSERT_EQ(reader.GetValue(), nullptr);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1);
@@ -253,8 +260,10 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader_CacheFull) {
// strict_capacity_limit is set
CacheHandleGuard<BlobFileReader> reader;
ASSERT_TRUE(blob_file_cache.GetBlobFileReader(blob_file_number, &reader)
.IsMemoryLimit());
const ReadOptions read_options;
ASSERT_TRUE(
blob_file_cache.GetBlobFileReader(read_options, blob_file_number, &reader)
.IsMemoryLimit());
ASSERT_EQ(reader.GetValue(), nullptr);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1);
ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1);

@@ -23,32 +23,19 @@ class BlobFileCompletionCallback {
const std::vector<std::shared_ptr<EventListener>>& listeners,
const std::string& dbname)
: event_logger_(event_logger), listeners_(listeners), dbname_(dbname) {
#ifndef ROCKSDB_LITE
sst_file_manager_ = sst_file_manager;
mutex_ = mutex;
error_handler_ = error_handler;
#else
(void)sst_file_manager;
(void)mutex;
(void)error_handler;
#endif // ROCKSDB_LITE
}
void OnBlobFileCreationStarted(const std::string& file_name,
const std::string& column_family_name,
int job_id,
BlobFileCreationReason creation_reason) {
#ifndef ROCKSDB_LITE
// Notify the listeners.
EventHelpers::NotifyBlobFileCreationStarted(listeners_, dbname_,
column_family_name, file_name,
job_id, creation_reason);
#else
(void)file_name;
(void)column_family_name;
(void)job_id;
(void)creation_reason;
#endif
}
Status OnBlobFileCompleted(const std::string& file_name,
@@ -61,7 +48,6 @@ class BlobFileCompletionCallback {
uint64_t blob_count, uint64_t blob_bytes) {
Status s;
#ifndef ROCKSDB_LITE
auto sfm = static_cast<SstFileManagerImpl*>(sst_file_manager_);
if (sfm) {
// Report new blob files to SstFileManagerImpl
@@ -74,7 +60,6 @@ class BlobFileCompletionCallback {
error_handler_->SetBGError(s, BackgroundErrorReason::kFlush);
}
}
#endif // !ROCKSDB_LITE
// Notify the listeners.
EventHelpers::LogAndNotifyBlobFileCreationFinished(
@@ -89,11 +74,9 @@ class BlobFileCompletionCallback {
}
private:
#ifndef ROCKSDB_LITE
SstFileManager* sst_file_manager_;
InstrumentedMutex* mutex_;
ErrorHandler* error_handler_;
#endif // ROCKSDB_LITE
EventLogger* event_logger_;
std::vector<std::shared_ptr<EventListener>> listeners_;
std::string dbname_;

@@ -12,7 +12,7 @@
#include "db/blob/blob_log_format.h"
#include "file/file_prefetch_buffer.h"
#include "file/filename.h"
#include "monitoring/statistics.h"
#include "monitoring/statistics_impl.h"
#include "options/cf_options.h"
#include "rocksdb/file_system.h"
#include "rocksdb/slice.h"
@@ -26,9 +26,10 @@
namespace ROCKSDB_NAMESPACE {
Status BlobFileReader::Create(
const ImmutableOptions& immutable_options, const FileOptions& file_options,
uint32_t column_family_id, HistogramImpl* blob_file_read_hist,
uint64_t blob_file_number, const std::shared_ptr<IOTracer>& io_tracer,
const ImmutableOptions& immutable_options, const ReadOptions& read_options,
const FileOptions& file_options, uint32_t column_family_id,
HistogramImpl* blob_file_read_hist, uint64_t blob_file_number,
const std::shared_ptr<IOTracer>& io_tracer,
std::unique_ptr<BlobFileReader>* blob_file_reader) {
assert(blob_file_reader);
assert(!*blob_file_reader);
@@ -52,15 +53,17 @@ Status BlobFileReader::Create(
CompressionType compression_type = kNoCompression;
{
const Status s = ReadHeader(file_reader.get(), column_family_id, statistics,
&compression_type);
const Status s =
ReadHeader(file_reader.get(), read_options, column_family_id,
statistics, &compression_type);
if (!s.ok()) {
return s;
}
}
{
const Status s = ReadFooter(file_reader.get(), file_size, statistics);
const Status s =
ReadFooter(file_reader.get(), read_options, file_size, statistics);
if (!s.ok()) {
return s;
}
@@ -134,6 +137,7 @@ }
}
Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader,
const ReadOptions& read_options,
uint32_t column_family_id,
Statistics* statistics,
CompressionType* compression_type) {
@@ -151,9 +155,10 @@ Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader,
constexpr size_t read_size = BlobLogHeader::kSize;
// TODO: rate limit reading headers from blob files.
const Status s = ReadFromFile(file_reader, read_offset, read_size,
statistics, &header_slice, &buf, &aligned_buf,
Env::IO_TOTAL /* rate_limiter_priority */);
const Status s =
ReadFromFile(file_reader, read_options, read_offset, read_size,
statistics, &header_slice, &buf, &aligned_buf,
Env::IO_TOTAL /* rate_limiter_priority */);
if (!s.ok()) {
return s;
}
@@ -187,6 +192,7 @@ Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader,
}
Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader,
const ReadOptions& read_options,
uint64_t file_size, Statistics* statistics) {
assert(file_size >= BlobLogHeader::kSize + BlobLogFooter::kSize);
assert(file_reader);
@@ -202,9 +208,10 @@ Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader,
constexpr size_t read_size = BlobLogFooter::kSize;
// TODO: rate limit reading footers from blob files.
const Status s = ReadFromFile(file_reader, read_offset, read_size,
statistics, &footer_slice, &buf, &aligned_buf,
Env::IO_TOTAL /* rate_limiter_priority */);
const Status s =
ReadFromFile(file_reader, read_options, read_offset, read_size,
statistics, &footer_slice, &buf, &aligned_buf,
Env::IO_TOTAL /* rate_limiter_priority */);
if (!s.ok()) {
return s;
}
@@ -232,6 +239,7 @@ Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader,
}
Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader,
const ReadOptions& read_options,
uint64_t read_offset, size_t read_size,
Statistics* statistics, Slice* slice,
Buffer* buf, AlignedBuf* aligned_buf,
@@ -246,17 +254,23 @@ Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader,
Status s;
IOOptions io_options;
s = file_reader->PrepareIOOptions(read_options, io_options);
if (!s.ok()) {
return s;
}
if (file_reader->use_direct_io()) {
constexpr char* scratch = nullptr;
s = file_reader->Read(IOOptions(), read_offset, read_size, slice, scratch,
s = file_reader->Read(io_options, read_offset, read_size, slice, scratch,
aligned_buf, rate_limiter_priority);
} else {
buf->reset(new char[read_size]);
constexpr AlignedBuf* aligned_scratch = nullptr;
s = file_reader->Read(IOOptions(), read_offset, read_size, slice,
buf->get(), aligned_scratch, rate_limiter_priority);
s = file_reader->Read(io_options, read_offset, read_size, slice, buf->get(),
aligned_scratch, rate_limiter_priority);
}
if (!s.ok()) {
@@ -324,8 +338,13 @@ Status BlobFileReader::GetBlob(
Status s;
constexpr bool for_compaction = true;
IOOptions io_options;
s = file_reader_->PrepareIOOptions(read_options, io_options);
if (!s.ok()) {
return s;
}
prefetched = prefetch_buffer->TryReadFromCache(
IOOptions(), file_reader_.get(), record_offset,
io_options, file_reader_.get(), record_offset,
static_cast<size_t>(record_size), &record_slice, &s,
read_options.rate_limiter_priority, for_compaction);
if (!s.ok()) {
@@ -338,10 +357,10 @@ Status BlobFileReader::GetBlob(
PERF_COUNTER_ADD(blob_read_count, 1);
PERF_COUNTER_ADD(blob_read_byte, record_size);
PERF_TIMER_GUARD(blob_read_time);
const Status s = ReadFromFile(file_reader_.get(), record_offset,
static_cast<size_t>(record_size), statistics_,
&record_slice, &buf, &aligned_buf,
read_options.rate_limiter_priority);
const Status s = ReadFromFile(
file_reader_.get(), read_options, record_offset,
static_cast<size_t>(record_size), statistics_, &record_slice, &buf,
&aligned_buf, read_options.rate_limiter_priority);
if (!s.ok()) {
return s;
}
@@ -420,11 +439,11 @@ void BlobFileReader::MultiGetBlob(
assert(req->offset >= adjustment);
adjustments.push_back(adjustment);
FSReadRequest read_req = {};
FSReadRequest read_req;
read_req.offset = req->offset - adjustment;
read_req.len = req->len + adjustment;
read_reqs.emplace_back(read_req);
total_len += read_req.len;
read_reqs.emplace_back(std::move(read_req));
}
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, total_len);
@@ -569,12 +588,7 @@ Status BlobFileReader::UncompressBlobIfNeeded(
assert(result);
if (compression_type == kNoCompression) {
CacheAllocationPtr allocation =
AllocateBlock(value_slice.size(), allocator);
memcpy(allocation.get(), value_slice.data(), value_slice.size());
*result = BlobContents::Create(std::move(allocation), value_slice.size());
BlobContentsCreator::Create(result, nullptr, value_slice, allocator);
return Status::OK();
}
@@ -602,7 +616,7 @@ Status BlobFileReader::UncompressBlobIfNeeded(
return Status::Corruption("Unable to uncompress blob");
}
*result = BlobContents::Create(std::move(output), uncompressed_size);
result->reset(new BlobContents(std::move(output), uncompressed_size));
return Status::OK();
}
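For context (not part of the diff): the recurring change in this file threads ReadOptions through PrepareIOOptions() so deadline and timeout information reaches the FileSystem layer instead of a default-constructed IOOptions(). A hedged sketch of the call shape against the internal reader (buffered reads only; direct I/O would need an AlignedBuf), with an invented helper name:
#include "file/random_access_file_reader.h"
#include "rocksdb/options.h"
namespace ROCKSDB_NAMESPACE {
// Derive IOOptions from ReadOptions once, bail out on failure (e.g. an
// already-expired deadline), then issue the read with those options.
Status ReadWithReadOptions(RandomAccessFileReader* file_reader,
                           const ReadOptions& read_options, uint64_t offset,
                           size_t n, Slice* result, char* scratch) {
  IOOptions io_options;
  Status s = file_reader->PrepareIOOptions(read_options, io_options);
  if (!s.ok()) {
    return s;
  }
  return file_reader->Read(io_options, offset, n, result, scratch,
                           nullptr /* aligned_buf */,
                           read_options.rate_limiter_priority);
}
}  // namespace ROCKSDB_NAMESPACE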

@@ -29,6 +29,7 @@ class Statistics;
class BlobFileReader {
public:
static Status Create(const ImmutableOptions& immutable_options,
const ReadOptions& read_options,
const FileOptions& file_options,
uint32_t column_family_id,
HistogramImpl* blob_file_read_hist,
@@ -74,15 +75,18 @@ class BlobFileReader {
std::unique_ptr<RandomAccessFileReader>* file_reader);
static Status ReadHeader(const RandomAccessFileReader* file_reader,
const ReadOptions& read_options,
uint32_t column_family_id, Statistics* statistics,
CompressionType* compression_type);
static Status ReadFooter(const RandomAccessFileReader* file_reader,
uint64_t file_size, Statistics* statistics);
const ReadOptions& read_options, uint64_t file_size,
Statistics* statistics);
using Buffer = std::unique_ptr<char[]>;
static Status ReadFromFile(const RandomAccessFileReader* file_reader,
const ReadOptions& read_options,
uint64_t read_offset, size_t read_size,
Statistics* statistics, Slice* slice, Buffer* buf,
AlignedBuf* aligned_buf,

@@ -172,12 +172,12 @@ TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) {
std::unique_ptr<BlobFileReader> reader;
ReadOptions read_options;
ASSERT_OK(BlobFileReader::Create(
immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
blob_file_number, nullptr /*IOTracer*/, &reader));
immutable_options, read_options, FileOptions(), column_family_id,
blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader));
// Make sure the blob can be retrieved with and without checksum verification
ReadOptions read_options;
read_options.verify_checksums = false;
constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
@@ -479,11 +479,11 @@ TEST_F(BlobFileReaderTest, Malformed) {
constexpr HistogramImpl* blob_file_read_hist = nullptr;
std::unique_ptr<BlobFileReader> reader;
ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
column_family_id, blob_file_read_hist,
blob_file_number, nullptr /*IOTracer*/,
&reader)
const ReadOptions read_options;
ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options,
FileOptions(), column_family_id,
blob_file_read_hist, blob_file_number,
nullptr /*IOTracer*/, &reader)
.IsCorruption());
}
@@ -513,11 +513,11 @@ TEST_F(BlobFileReaderTest, TTL) {
constexpr HistogramImpl* blob_file_read_hist = nullptr;
std::unique_ptr<BlobFileReader> reader;
ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
column_family_id, blob_file_read_hist,
blob_file_number, nullptr /*IOTracer*/,
&reader)
const ReadOptions read_options;
ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options,
FileOptions(), column_family_id,
blob_file_read_hist, blob_file_number,
nullptr /*IOTracer*/, &reader)
.IsCorruption());
}
@@ -552,11 +552,11 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInHeader) {
constexpr HistogramImpl* blob_file_read_hist = nullptr;
std::unique_ptr<BlobFileReader> reader;
ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
column_family_id, blob_file_read_hist,
blob_file_number, nullptr /*IOTracer*/,
&reader)
const ReadOptions read_options;
ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options,
FileOptions(), column_family_id,
blob_file_read_hist, blob_file_number,
nullptr /*IOTracer*/, &reader)
.IsCorruption());
}
@@ -591,11 +591,11 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInFooter) {
constexpr HistogramImpl* blob_file_read_hist = nullptr;
std::unique_ptr<BlobFileReader> reader;
ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
column_family_id, blob_file_read_hist,
blob_file_number, nullptr /*IOTracer*/,
&reader)
const ReadOptions read_options;
ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options,
FileOptions(), column_family_id,
blob_file_read_hist, blob_file_number,
nullptr /*IOTracer*/, &reader)
.IsCorruption());
}
@@ -629,9 +629,9 @@ TEST_F(BlobFileReaderTest, IncorrectColumnFamily) {
std::unique_ptr<BlobFileReader> reader;
constexpr uint32_t incorrect_column_family_id = 2;
ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(),
incorrect_column_family_id,
const ReadOptions read_options;
ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options,
FileOptions(), incorrect_column_family_id,
blob_file_read_hist, blob_file_number,
nullptr /*IOTracer*/, &reader)
.IsCorruption());
@@ -664,10 +664,10 @@ TEST_F(BlobFileReaderTest, BlobCRCError) {
constexpr HistogramImpl* blob_file_read_hist = nullptr;
std::unique_ptr<BlobFileReader> reader;
const ReadOptions read_options;
ASSERT_OK(BlobFileReader::Create(
immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
blob_file_number, nullptr /*IOTracer*/, &reader));
immutable_options, read_options, FileOptions(), column_family_id,
blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader));
SyncPoint::GetInstance()->SetCallBack(
"BlobFileReader::VerifyBlob:CheckBlobCRC", [](void* arg) {
@@ -728,13 +728,12 @@ TEST_F(BlobFileReaderTest, Compression) {
constexpr HistogramImpl* blob_file_read_hist = nullptr;
std::unique_ptr<BlobFileReader> reader;
ReadOptions read_options;
ASSERT_OK(BlobFileReader::Create(
immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
blob_file_number, nullptr /*IOTracer*/, &reader));
immutable_options, read_options, FileOptions(), column_family_id,
blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader));
// Make sure the blob can be retrieved with and without checksum verification
ReadOptions read_options;
read_options.verify_checksums = false;
constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
@@ -803,10 +802,10 @@ TEST_F(BlobFileReaderTest, UncompressionError) {
constexpr HistogramImpl* blob_file_read_hist = nullptr;
std::unique_ptr<BlobFileReader> reader;
const ReadOptions read_options;
ASSERT_OK(BlobFileReader::Create(
immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
blob_file_number, nullptr /*IOTracer*/, &reader));
immutable_options, read_options, FileOptions(), column_family_id,
blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader));
SyncPoint::GetInstance()->SetCallBack(
"BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", [](void* arg) {
@@ -895,10 +894,10 @@ TEST_P(BlobFileReaderIOErrorTest, IOError) {
constexpr HistogramImpl* blob_file_read_hist = nullptr;
std::unique_ptr<BlobFileReader> reader;
const ReadOptions read_options;
const Status s = BlobFileReader::Create(
immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
blob_file_number, nullptr /*IOTracer*/, &reader);
immutable_options, read_options, FileOptions(), column_family_id,
blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader);
const bool fail_during_create =
(sync_point_ != "BlobFileReader::GetBlob:ReadFromFile");
@@ -983,10 +982,10 @@ TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) {
constexpr HistogramImpl* blob_file_read_hist = nullptr;
std::unique_ptr<BlobFileReader> reader;
const ReadOptions read_options;
const Status s = BlobFileReader::Create(
immutable_options, FileOptions(), column_family_id, blob_file_read_hist,
blob_file_number, nullptr /*IOTracer*/, &reader);
immutable_options, read_options, FileOptions(), column_family_id,
blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader);
const bool fail_during_create =
sync_point_ != "BlobFileReader::GetBlob:TamperWithResult";

@@ -7,7 +7,7 @@
#include "db/blob/blob_log_sequential_reader.h"
#include "file/random_access_file_reader.h"
#include "monitoring/statistics.h"
#include "monitoring/statistics_impl.h"
#include "util/stop_watch.h"
namespace ROCKSDB_NAMESPACE {

@@ -10,7 +10,7 @@
#include "db/blob/blob_log_format.h"
#include "file/writable_file_writer.h"
#include "monitoring/statistics.h"
#include "monitoring/statistics_impl.h"
#include "rocksdb/system_clock.h"
#include "test_util/sync_point.h"
#include "util/coding.h"

@@ -13,7 +13,7 @@
#include "db/blob/blob_contents.h"
#include "db/blob/blob_file_reader.h"
#include "db/blob/blob_log_format.h"
#include "monitoring/statistics.h"
#include "monitoring/statistics_impl.h"
#include "options/cf_options.h"
#include "table/get_context.h"
#include "table/multiget_context.h"
@@ -30,16 +30,14 @@ BlobSource::BlobSource(const ImmutableOptions* immutable_options,
blob_file_cache_(blob_file_cache),
blob_cache_(immutable_options->blob_cache),
lowest_used_cache_tier_(immutable_options->lowest_used_cache_tier) {
#ifndef ROCKSDB_LITE
auto bbto =
immutable_options->table_factory->GetOptions<BlockBasedTableOptions>();
if (bbto &&
bbto->cache_usage_options.options_overrides.at(CacheEntryRole::kBlobCache)
.charged == CacheEntryRoleOptions::Decision::kEnabled) {
blob_cache_ = std::make_shared<ChargedCache>(immutable_options->blob_cache,
bbto->block_cache);
blob_cache_ = SharedCacheInterface{std::make_shared<ChargedCache>(
immutable_options->blob_cache, bbto->block_cache)};
}
#endif // ROCKSDB_LITE
}
BlobSource::~BlobSource() = default;
@@ -82,9 +80,8 @@ Status BlobSource::PutBlobIntoCache(
assert(cached_blob);
assert(cached_blob->IsEmpty());
Cache::Handle* cache_handle = nullptr;
TypedHandle* cache_handle = nullptr;
const Status s = InsertEntryIntoCache(cache_key, blob->get(),
(*blob)->ApproximateMemoryUsage(),
&cache_handle, Cache::Priority::BOTTOM);
if (s.ok()) {
blob->release();
@@ -106,26 +103,10 @@ Status BlobSource::PutBlobIntoCache(
return s;
}
Cache::Handle* BlobSource::GetEntryFromCache(const Slice& key) const {
Cache::Handle* cache_handle = nullptr;
if (lowest_used_cache_tier_ == CacheTier::kNonVolatileBlockTier) {
Cache::CreateCallback create_cb =
[allocator = blob_cache_->memory_allocator()](
const void* buf, size_t size, void** out_obj,
size_t* charge) -> Status {
return BlobContents::CreateCallback(AllocateBlock(size, allocator), buf,
size, out_obj, charge);
};
cache_handle = blob_cache_->Lookup(key, BlobContents::GetCacheItemHelper(),
create_cb, Cache::Priority::BOTTOM,
true /* wait_for_cache */, statistics_);
} else {
cache_handle = blob_cache_->Lookup(key, statistics_);
}
return cache_handle;
BlobSource::TypedHandle* BlobSource::GetEntryFromCache(const Slice& key) const {
return blob_cache_.LookupFull(key, nullptr /* context */,
Cache::Priority::BOTTOM, statistics_,
lowest_used_cache_tier_);
}
void BlobSource::PinCachedBlob(CacheHandleGuard<BlobContents>* cached_blob,
@@ -166,24 +147,11 @@ void BlobSource::PinOwnedBlob(std::unique_ptr<BlobContents>* owned_blob,
}
Status BlobSource::InsertEntryIntoCache(const Slice& key, BlobContents* value,
size_t charge,
Cache::Handle** cache_handle,
TypedHandle** cache_handle,
Cache::Priority priority) const {
Status s;
Cache::CacheItemHelper* const cache_item_helper =
BlobContents::GetCacheItemHelper();
assert(cache_item_helper);
if (lowest_used_cache_tier_ == CacheTier::kNonVolatileBlockTier) {
s = blob_cache_->Insert(key, value, cache_item_helper, charge, cache_handle,
priority);
} else {
s = blob_cache_->Insert(key, value, charge, cache_item_helper->del_cb,
cache_handle, priority);
}
return s;
return blob_cache_.InsertFull(key, value, value->ApproximateMemoryUsage(),
cache_handle, priority,
lowest_used_cache_tier_);
}
Status BlobSource::GetBlob(const ReadOptions& read_options,
@@ -241,7 +209,8 @@ Status BlobSource::GetBlob(const ReadOptions& read_options,
{
CacheHandleGuard<BlobFileReader> blob_file_reader;
s = blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader);
s = blob_file_cache_->GetBlobFileReader(read_options, file_number,
&blob_file_reader);
if (!s.ok()) {
return s;
}
@@ -252,9 +221,10 @@ Status BlobSource::GetBlob(const ReadOptions& read_options,
return Status::Corruption("Compression type mismatch when reading blob");
}
MemoryAllocator* const allocator = (blob_cache_ && read_options.fill_cache)
? blob_cache_->memory_allocator()
: nullptr;
MemoryAllocator* const allocator =
(blob_cache_ && read_options.fill_cache)
? blob_cache_.get()->memory_allocator()
: nullptr;
uint64_t read_size = 0;
s = blob_file_reader.GetValue()->GetBlob(
@@ -403,8 +373,8 @@ void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options,
}
CacheHandleGuard<BlobFileReader> blob_file_reader;
Status s =
blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader);
Status s = blob_file_cache_->GetBlobFileReader(read_options, file_number,
&blob_file_reader);
if (!s.ok()) {
for (size_t i = 0; i < _blob_reqs.size(); ++i) {
BlobReadRequest* const req = _blob_reqs[i].first;
@@ -418,9 +388,10 @@ void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options,
assert(blob_file_reader.GetValue());
MemoryAllocator* const allocator = (blob_cache_ && read_options.fill_cache)
? blob_cache_->memory_allocator()
: nullptr;
MemoryAllocator* const allocator =
(blob_cache_ && read_options.fill_cache)
? blob_cache_.get()->memory_allocator()
: nullptr;
blob_file_reader.GetValue()->MultiGetBlob(read_options, allocator,
_blob_reqs, &_bytes_read);

@@ -8,8 +8,9 @@
#include <cinttypes>
#include <memory>
#include "cache/cache_helpers.h"
#include "cache/cache_key.h"
#include "cache/typed_cache.h"
#include "db/blob/blob_contents.h"
#include "db/blob/blob_file_cache.h"
#include "db/blob/blob_read_request.h"
#include "rocksdb/cache.h"
@@ -23,7 +24,6 @@ struct ImmutableOptions;
class Status;
class FilePrefetchBuffer;
class Slice;
class BlobContents;
// BlobSource is a class that provides universal access to blobs, regardless of
// whether they are in the blob cache, secondary cache, or (remote) storage.
@@ -95,9 +95,9 @@ class BlobSource {
uint64_t* bytes_read);
inline Status GetBlobFileReader(
uint64_t blob_file_number,
const ReadOptions& read_options, uint64_t blob_file_number,
CacheHandleGuard<BlobFileReader>* blob_file_reader) {
return blob_file_cache_->GetBlobFileReader(blob_file_number,
return blob_file_cache_->GetBlobFileReader(read_options, blob_file_number,
blob_file_reader);
}
@@ -106,6 +106,14 @@ class BlobSource {
bool TEST_BlobInCache(uint64_t file_number, uint64_t file_size,
uint64_t offset, size_t* charge = nullptr) const;
// For TypedSharedCacheInterface
void Create(BlobContents** out, const char* buf, size_t size,
MemoryAllocator* alloc);
using SharedCacheInterface =
FullTypedSharedCacheInterface<BlobContents, BlobContentsCreator>;
using TypedHandle = SharedCacheInterface::TypedHandle;
private:
Status GetBlobFromCache(const Slice& cache_key,
CacheHandleGuard<BlobContents>* cached_blob) const;
@@ -120,10 +128,10 @@ class BlobSource {
static void PinOwnedBlob(std::unique_ptr<BlobContents>* owned_blob,
PinnableSlice* value);
Cache::Handle* GetEntryFromCache(const Slice& key) const;
TypedHandle* GetEntryFromCache(const Slice& key) const;
Status InsertEntryIntoCache(const Slice& key, BlobContents* value,
size_t charge, Cache::Handle** cache_handle,
TypedHandle** cache_handle,
Cache::Priority priority) const;
inline CacheKey GetCacheKey(uint64_t file_number, uint64_t /*file_size*/,
@@ -141,7 +149,7 @@ class BlobSource {
BlobFileCache* blob_file_cache_;
// A cache to store uncompressed blobs.
std::shared_ptr<Cache> blob_cache_;
mutable SharedCacheInterface blob_cache_;
// The control option of how the cache tiers will be used. Currently rocksdb
// support block/blob cache (volatile tier) and secondary cache (this tier
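For context (not part of the diff): an end-to-end sketch of the machinery in this file, using only public API. The path and cache size are illustrative; min_blob_size = 0 forces every value through the blob/BlobSource path.
#include <cassert>
#include <string>
#include "rocksdb/cache.h"
#include "rocksdb/db.h"
int main() {
  ROCKSDB_NAMESPACE::Options options;
  options.create_if_missing = true;
  options.enable_blob_files = true;
  options.min_blob_size = 0;
  options.blob_cache = ROCKSDB_NAMESPACE::NewLRUCache(8 << 20);
  ROCKSDB_NAMESPACE::DB* db = nullptr;
  ROCKSDB_NAMESPACE::Status s =
      ROCKSDB_NAMESPACE::DB::Open(options, "/tmp/blob_cache_demo", &db);
  assert(s.ok());
  s = db->Put(ROCKSDB_NAMESPACE::WriteOptions(), "key", "blob_value");
  assert(s.ok());
  s = db->Flush(ROCKSDB_NAMESPACE::FlushOptions());
  assert(s.ok());
  std::string value;
  // The first read fills the blob cache via BlobSource; repeated reads are
  // then served from the cache without touching the blob file.
  s = db->Get(ROCKSDB_NAMESPACE::ReadOptions(), "key", &value);
  assert(s.ok() && value == "blob_value");
  delete db;
  return 0;
}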

@@ -517,7 +517,8 @@ TEST_F(BlobSourceTest, GetCompressedBlobs) {
compression, blob_offsets, blob_sizes);
CacheHandleGuard<BlobFileReader> blob_file_reader;
ASSERT_OK(blob_source.GetBlobFileReader(file_number, &blob_file_reader));
ASSERT_OK(blob_source.GetBlobFileReader(read_options, file_number,
&blob_file_reader));
ASSERT_NE(blob_file_reader.GetValue(), nullptr);
const uint64_t file_size = blob_file_reader.GetValue()->GetFileSize();
@@ -1139,26 +1140,18 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
blob_file_cache.get());
CacheHandleGuard<BlobFileReader> file_reader;
ASSERT_OK(blob_source.GetBlobFileReader(file_number, &file_reader));
ReadOptions read_options;
ASSERT_OK(
blob_source.GetBlobFileReader(read_options, file_number, &file_reader));
ASSERT_NE(file_reader.GetValue(), nullptr);
const uint64_t file_size = file_reader.GetValue()->GetFileSize();
ASSERT_EQ(file_reader.GetValue()->GetCompressionType(), kNoCompression);
ReadOptions read_options;
read_options.verify_checksums = true;
auto blob_cache = options_.blob_cache;
auto secondary_cache = lru_cache_opts_.secondary_cache;
Cache::CreateCallback create_cb = [](const void* buf, size_t size,
void** out_obj,
size_t* charge) -> Status {
CacheAllocationPtr allocation(new char[size]);
return BlobContents::CreateCallback(std::move(allocation), buf, size,
out_obj, charge);
};
{
// GetBlob
std::vector<PinnableSlice> values(keys.size());
@@ -1219,15 +1212,16 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
{
CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[0]);
const Slice key0 = cache_key.AsSlice();
auto handle0 = blob_cache->Lookup(key0, statistics);
auto handle0 = blob_cache->BasicLookup(key0, statistics);
ASSERT_EQ(handle0, nullptr);
// key0's item should be in the secondary cache.
bool is_in_sec_cache = false;
auto sec_handle0 =
secondary_cache->Lookup(key0, create_cb, true,
/*advise_erase=*/true, is_in_sec_cache);
ASSERT_FALSE(is_in_sec_cache);
bool kept_in_sec_cache = false;
auto sec_handle0 = secondary_cache->Lookup(
key0, BlobSource::SharedCacheInterface::GetFullHelper(),
/*context*/ nullptr, true,
/*advise_erase=*/true, kept_in_sec_cache);
ASSERT_FALSE(kept_in_sec_cache);
ASSERT_NE(sec_handle0, nullptr);
ASSERT_TRUE(sec_handle0->IsReady());
auto value = static_cast<BlobContents*>(sec_handle0->Value());
@@ -1246,15 +1240,16 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
{
CacheKey cache_key = base_cache_key.WithOffset(blob_offsets[1]);
const Slice key1 = cache_key.AsSlice();
auto handle1 = blob_cache->Lookup(key1, statistics);
auto handle1 = blob_cache->BasicLookup(key1, statistics);
ASSERT_NE(handle1, nullptr);
blob_cache->Release(handle1);
bool is_in_sec_cache = false;
auto sec_handle1 =
secondary_cache->Lookup(key1, create_cb, true,
/*advise_erase=*/true, is_in_sec_cache);
ASSERT_FALSE(is_in_sec_cache);
bool kept_in_sec_cache = false;
auto sec_handle1 = secondary_cache->Lookup(
key1, BlobSource::SharedCacheInterface::GetFullHelper(),
/*context*/ nullptr, true,
/*advise_erase=*/true, kept_in_sec_cache);
ASSERT_FALSE(kept_in_sec_cache);
ASSERT_EQ(sec_handle1, nullptr);
ASSERT_TRUE(blob_source.TEST_BlobInCache(file_number, file_size,
@@ -1276,7 +1271,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
// key0 should be in the primary cache.
CacheKey cache_key0 = base_cache_key.WithOffset(blob_offsets[0]);
const Slice key0 = cache_key0.AsSlice();
auto handle0 = blob_cache->Lookup(key0, statistics);
auto handle0 = blob_cache->BasicLookup(key0, statistics);
ASSERT_NE(handle0, nullptr);
auto value = static_cast<BlobContents*>(blob_cache->Value(handle0));
ASSERT_NE(value, nullptr);
@@ -1286,12 +1281,12 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
// key1 is not in the primary cache and is in the secondary cache.
CacheKey cache_key1 = base_cache_key.WithOffset(blob_offsets[1]);
const Slice key1 = cache_key1.AsSlice();
auto handle1 = blob_cache->Lookup(key1, statistics);
auto handle1 = blob_cache->BasicLookup(key1, statistics);
ASSERT_EQ(handle1, nullptr);
// erase key0 from the primary cache.
blob_cache->Erase(key0);
handle0 = blob_cache->Lookup(key0, statistics);
handle0 = blob_cache->BasicLookup(key0, statistics);
ASSERT_EQ(handle0, nullptr);
// key1 promotion should succeed due to the primary cache being empty. we
@@ -1307,7 +1302,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
// in the secondary cache. So, the primary cache's Lookup() without
// secondary cache support cannot see it. (NOTE: The dummy handle used
// to be a leaky abstraction but not anymore.)
handle1 = blob_cache->Lookup(key1, statistics);
handle1 = blob_cache->BasicLookup(key1, statistics);
ASSERT_EQ(handle1, nullptr);
// But after another access, it is promoted to primary cache
@@ -1315,7 +1310,7 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) {
blob_offsets[1]));
// And Lookup() can find it (without secondary cache support)
handle1 = blob_cache->Lookup(key1, statistics);
handle1 = blob_cache->BasicLookup(key1, statistics);
ASSERT_NE(handle1, nullptr);
ASSERT_NE(blob_cache->Value(handle1), nullptr);
blob_cache->Release(handle1);
@@ -1379,7 +1374,7 @@ class BlobSourceCacheReservationTest : public DBTestBase {
static constexpr std::size_t kSizeDummyEntry = CacheReservationManagerImpl<
CacheEntryRole::kBlobCache>::GetDummyEntrySize();
static constexpr std::size_t kCacheCapacity = 1 * kSizeDummyEntry;
static constexpr std::size_t kCacheCapacity = 2 * kSizeDummyEntry;
static constexpr int kNumShardBits = 0; // 2^0 shard
static constexpr uint32_t kColumnFamilyId = 1;
@@ -1398,7 +1393,6 @@ class BlobSourceCacheReservationTest : public DBTestBase {
std::string db_session_id_;
};
#ifndef ROCKSDB_LITE
TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) {
options_.cf_paths.emplace_back(
test::PerThreadDBPath(
@@ -1513,11 +1507,10 @@ TEST_F(BlobSourceCacheReservationTest, SimpleCacheReservation) {
}
}
TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservationOnFullCache) {
TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservation) {
options_.cf_paths.emplace_back(
test::PerThreadDBPath(
env_,
"BlobSourceCacheReservationTest_IncreaseCacheReservationOnFullCache"),
env_, "BlobSourceCacheReservationTest_IncreaseCacheReservation"),
0);
GenerateKeysAndBlobs();
@@ -1525,7 +1518,7 @@ TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservationOnFullCache) {
DestroyAndReopen(options_);
ImmutableOptions immutable_options(options_);
constexpr size_t blob_size = kSizeDummyEntry / (kNumBlobs / 2);
constexpr size_t blob_size = 24 << 10; // 24KB
for (size_t i = 0; i < kNumBlobs; ++i) {
blob_file_size_ -= blobs_[i].size(); // old blob size
blob_strs_[i].resize(blob_size, '@');
@@ -1583,11 +1576,6 @@ TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservationOnFullCache) {
std::vector<PinnableSlice> values(keys_.size());
// Since we resized each blob to be kSizeDummyEntry / (num_blobs / 2), we
// can't fit all the blobs in the cache at the same time, which means we
// should observe cache evictions once we reach the cache's capacity.
// Due to the overhead of the cache and the BlobContents objects, as well as
// jemalloc bin sizes, this happens after inserting seven blobs.
uint64_t blob_bytes = 0;
for (size_t i = 0; i < kNumBlobs; ++i) {
ASSERT_OK(blob_source.GetBlob(
@@ -1598,22 +1586,21 @@ TEST_F(BlobSourceCacheReservationTest, IncreaseCacheReservationOnFullCache) {
// Release cache handle
values[i].Reset();
if (i < kNumBlobs / 2 - 1) {
size_t charge = 0;
ASSERT_TRUE(blob_source.TEST_BlobInCache(
kBlobFileNumber, blob_file_size_, blob_offsets[i], &charge));
size_t charge = 0;
ASSERT_TRUE(blob_source.TEST_BlobInCache(kBlobFileNumber, blob_file_size_,
blob_offsets[i], &charge));
blob_bytes += charge;
}
blob_bytes += charge;
ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(), kSizeDummyEntry);
ASSERT_EQ(cache_res_mgr->GetTotalReservedCacheSize(),
(blob_bytes <= kSizeDummyEntry) ? kSizeDummyEntry
: (2 * kSizeDummyEntry));
ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(), blob_bytes);
ASSERT_EQ(cache_res_mgr->GetTotalMemoryUsed(),
options_.blob_cache->GetUsage());
}
}
}
#endif // ROCKSDB_LITE
} // namespace ROCKSDB_NAMESPACE
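For context (not part of the diff): the rewritten test asserts that the reserved cache size tracks the actual memory used in dummy-entry increments. A hedged illustration of that rounding, with an assumed 256KB dummy entry size (the real value comes from CacheReservationManagerImpl<...>::GetDummyEntrySize()):
#include <cassert>
#include <cstddef>
// reserved = ceil(used / dummy_entry_size) * dummy_entry_size
constexpr std::size_t RoundUpToDummy(std::size_t used,
                                     std::size_t dummy_entry_size) {
  return ((used + dummy_entry_size - 1) / dummy_entry_size) *
         dummy_entry_size;
}
int main() {
  constexpr std::size_t kSizeDummyEntry = 256 << 10;  // assumed 256KB
  // While cumulative blob bytes fit in one dummy entry, one entry is
  // reserved; past that, a second one is, matching the test's expectation
  // of kSizeDummyEntry vs 2 * kSizeDummyEntry.
  assert(RoundUpToDummy(100 << 10, kSizeDummyEntry) == kSizeDummyEntry);
  assert(RoundUpToDummy(300 << 10, kSizeDummyEntry) == 2 * kSizeDummyEntry);
  return 0;
}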

@@ -11,6 +11,7 @@
#include "db/blob/blob_index.h"
#include "db/blob/blob_log_format.h"
#include "db/db_test_util.h"
#include "db/db_with_timestamp_test_util.h"
#include "port/stack_trace.h"
#include "test_util/sync_point.h"
#include "utilities/fault_injection_env.h"
@@ -584,7 +585,6 @@ TEST_F(DBBlobBasicTest, MultiGetBlobsFromCache) {
}
}
#ifndef ROCKSDB_LITE
TEST_F(DBBlobBasicTest, MultiGetWithDirectIO) {
Options options = GetDefaultOptions();
@@ -773,7 +773,6 @@ TEST_F(DBBlobBasicTest, MultiGetWithDirectIO) {
ASSERT_EQ(values[2], second_blob);
}
}
#endif // !ROCKSDB_LITE
TEST_F(DBBlobBasicTest, MultiGetBlobsFromMultipleFiles) {
Options options = GetDefaultOptions();
@@ -1062,7 +1061,6 @@ TEST_F(DBBlobBasicTest, GetBlob_IndexWithInvalidFileNumber) {
.IsCorruption());
}
#ifndef ROCKSDB_LITE
TEST_F(DBBlobBasicTest, GenerateIOTracing) {
Options options = GetDefaultOptions();
options.enable_blob_files = true;
@@ -1117,7 +1115,6 @@ TEST_F(DBBlobBasicTest, GenerateIOTracing) {
ASSERT_GT(blob_files_op_count, 2);
}
}
#endif // !ROCKSDB_LITE
TEST_F(DBBlobBasicTest, BestEffortsRecovery_MissingNewestBlobFile) {
Options options = GetDefaultOptions();
@@ -1219,7 +1216,6 @@ TEST_F(DBBlobBasicTest, MultiGetMergeBlobWithPut) {
ASSERT_EQ(values[2], "v2_0");
}
#ifndef ROCKSDB_LITE
TEST_F(DBBlobBasicTest, Properties) {
Options options = GetDefaultOptions();
options.enable_blob_files = true;
@@ -1382,7 +1378,6 @@ TEST_F(DBBlobBasicTest, PropertiesMultiVersion) {
BlobLogRecord::CalculateAdjustmentForRecordHeader(key_size) +
blob_size + BlobLogFooter::kSize));
}
#endif // !ROCKSDB_LITE
class DBBlobBasicIOErrorTest : public DBBlobBasicTest,
public testing::WithParamInterface<std::string> {
@@ -1632,7 +1627,6 @@ TEST_F(DBBlobBasicTest, WarmCacheWithBlobsDuringFlush) {
options.statistics->getTickerCount(BLOB_DB_CACHE_ADD));
}
#ifndef ROCKSDB_LITE
TEST_F(DBBlobBasicTest, DynamicallyWarmCacheDuringFlush) {
Options options = GetDefaultOptions();
@@ -1700,7 +1694,6 @@ TEST_F(DBBlobBasicTest, DynamicallyWarmCacheDuringFlush) {
/*end=*/nullptr));
EXPECT_EQ(0, options.statistics->getTickerCount(BLOB_DB_CACHE_ADD));
}
#endif // !ROCKSDB_LITE
TEST_F(DBBlobBasicTest, WarmCacheWithBlobsSecondary) {
CompressedSecondaryCacheOptions secondary_cache_opts;
@@ -1779,6 +1772,461 @@ TEST_F(DBBlobBasicTest, WarmCacheWithBlobsSecondary) {
1);
}
TEST_F(DBBlobBasicTest, GetEntityBlob) {
Options options = GetDefaultOptions();
options.enable_blob_files = true;
options.min_blob_size = 0;
Reopen(options);
constexpr char key[] = "key";
constexpr char blob_value[] = "blob_value";
constexpr char other_key[] = "other_key";
constexpr char other_blob_value[] = "other_blob_value";
ASSERT_OK(Put(key, blob_value));
ASSERT_OK(Put(other_key, other_blob_value));
ASSERT_OK(Flush());
WideColumns expected_columns{{kDefaultWideColumnName, blob_value}};
WideColumns other_expected_columns{
{kDefaultWideColumnName, other_blob_value}};
{
PinnableWideColumns result;
ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(), key,
&result));
ASSERT_EQ(result.columns(), expected_columns);
}
{
PinnableWideColumns result;
ASSERT_OK(db_->GetEntity(ReadOptions(), db_->DefaultColumnFamily(),
other_key, &result));
ASSERT_EQ(result.columns(), other_expected_columns);
}
{
constexpr size_t num_keys = 2;
std::array<Slice, num_keys> keys{{key, other_key}};
std::array<PinnableWideColumns, num_keys> results;
std::array<Status, num_keys> statuses;
db_->MultiGetEntity(ReadOptions(), db_->DefaultColumnFamily(), num_keys,
&keys[0], &results[0], &statuses[0]);
ASSERT_OK(statuses[0]);
ASSERT_EQ(results[0].columns(), expected_columns);
ASSERT_OK(statuses[1]);
ASSERT_EQ(results[1].columns(), other_expected_columns);
}
}
class DBBlobWithTimestampTest : public DBBasicTestWithTimestampBase {
protected:
DBBlobWithTimestampTest()
: DBBasicTestWithTimestampBase("db_blob_with_timestamp_test") {}
};
TEST_F(DBBlobWithTimestampTest, GetBlob) {
Options options = GetDefaultOptions();
options.create_if_missing = true;
options.enable_blob_files = true;
options.min_blob_size = 0;
const size_t kTimestampSize = Timestamp(0, 0).size();
TestComparator test_cmp(kTimestampSize);
options.comparator = &test_cmp;
DestroyAndReopen(options);
WriteOptions write_opts;
const std::string ts = Timestamp(1, 0);
constexpr char key[] = "key";
constexpr char blob_value[] = "blob_value";
ASSERT_OK(db_->Put(write_opts, key, ts, blob_value));
ASSERT_OK(Flush());
const std::string read_ts = Timestamp(2, 0);
Slice read_ts_slice(read_ts);
ReadOptions read_opts;
read_opts.timestamp = &read_ts_slice;
std::string value;
ASSERT_OK(db_->Get(read_opts, key, &value));
ASSERT_EQ(value, blob_value);
}
TEST_F(DBBlobWithTimestampTest, MultiGetBlobs) {
constexpr size_t min_blob_size = 6;
Options options = GetDefaultOptions();
options.enable_blob_files = true;
options.min_blob_size = min_blob_size;
options.create_if_missing = true;
const size_t kTimestampSize = Timestamp(0, 0).size();
TestComparator test_cmp(kTimestampSize);
options.comparator = &test_cmp;
DestroyAndReopen(options);
// Put then retrieve three key-values. The first value is below the size limit
// and is thus stored inline; the other two are stored separately as blobs.
constexpr size_t num_keys = 3;
constexpr char first_key[] = "first_key";
constexpr char first_value[] = "short";
static_assert(sizeof(first_value) - 1 < min_blob_size,
"first_value too long to be inlined");
DestroyAndReopen(options);
WriteOptions write_opts;
const std::string ts = Timestamp(1, 0);
ASSERT_OK(db_->Put(write_opts, first_key, ts, first_value));
constexpr char second_key[] = "second_key";
constexpr char second_value[] = "long_value";
static_assert(sizeof(second_value) - 1 >= min_blob_size,
"second_value too short to be stored as blob");
ASSERT_OK(db_->Put(write_opts, second_key, ts, second_value));
constexpr char third_key[] = "third_key";
constexpr char third_value[] = "other_long_value";
static_assert(sizeof(third_value) - 1 >= min_blob_size,
"third_value too short to be stored as blob");
ASSERT_OK(db_->Put(write_opts, third_key, ts, third_value));
ASSERT_OK(Flush());
ReadOptions read_options;
const std::string read_ts = Timestamp(2, 0);
Slice read_ts_slice(read_ts);
read_options.timestamp = &read_ts_slice;
std::array<Slice, num_keys> keys{{first_key, second_key, third_key}};
{
std::array<PinnableSlice, num_keys> values;
std::array<Status, num_keys> statuses;
db_->MultiGet(read_options, db_->DefaultColumnFamily(), num_keys, &keys[0],
&values[0], &statuses[0]);
ASSERT_OK(statuses[0]);
ASSERT_EQ(values[0], first_value);
ASSERT_OK(statuses[1]);
ASSERT_EQ(values[1], second_value);
ASSERT_OK(statuses[2]);
ASSERT_EQ(values[2], third_value);
}
}
TEST_F(DBBlobWithTimestampTest, GetMergeBlobWithPut) {
Options options = GetDefaultOptions();
options.merge_operator = MergeOperators::CreateStringAppendOperator();
options.enable_blob_files = true;
options.min_blob_size = 0;
options.create_if_missing = true;
const size_t kTimestampSize = Timestamp(0, 0).size();
TestComparator test_cmp(kTimestampSize);
options.comparator = &test_cmp;
DestroyAndReopen(options);
WriteOptions write_opts;
const std::string ts = Timestamp(1, 0);
ASSERT_OK(db_->Put(write_opts, "Key1", ts, "v1"));
ASSERT_OK(Flush());
ASSERT_OK(
db_->Merge(write_opts, db_->DefaultColumnFamily(), "Key1", ts, "v2"));
ASSERT_OK(Flush());
ASSERT_OK(
db_->Merge(write_opts, db_->DefaultColumnFamily(), "Key1", ts, "v3"));
ASSERT_OK(Flush());
std::string value;
const std::string read_ts = Timestamp(2, 0);
Slice read_ts_slice(read_ts);
ReadOptions read_opts;
read_opts.timestamp = &read_ts_slice;
ASSERT_OK(db_->Get(read_opts, "Key1", &value));
ASSERT_EQ(value, "v1,v2,v3");
}
TEST_F(DBBlobWithTimestampTest, MultiGetMergeBlobWithPut) {
constexpr size_t num_keys = 3;
Options options = GetDefaultOptions();
options.merge_operator = MergeOperators::CreateStringAppendOperator();
options.enable_blob_files = true;
options.min_blob_size = 0;
options.create_if_missing = true;
const size_t kTimestampSize = Timestamp(0, 0).size();
TestComparator test_cmp(kTimestampSize);
options.comparator = &test_cmp;
DestroyAndReopen(options);
WriteOptions write_opts;
const std::string ts = Timestamp(1, 0);
ASSERT_OK(db_->Put(write_opts, "Key0", ts, "v0_0"));
ASSERT_OK(db_->Put(write_opts, "Key1", ts, "v1_0"));
ASSERT_OK(db_->Put(write_opts, "Key2", ts, "v2_0"));
ASSERT_OK(Flush());
ASSERT_OK(
db_->Merge(write_opts, db_->DefaultColumnFamily(), "Key0", ts, "v0_1"));
ASSERT_OK(
db_->Merge(write_opts, db_->DefaultColumnFamily(), "Key1", ts, "v1_1"));
ASSERT_OK(Flush());
ASSERT_OK(
db_->Merge(write_opts, db_->DefaultColumnFamily(), "Key0", ts, "v0_2"));
ASSERT_OK(Flush());
const std::string read_ts = Timestamp(2, 0);
Slice read_ts_slice(read_ts);
ReadOptions read_opts;
read_opts.timestamp = &read_ts_slice;
std::array<Slice, num_keys> keys{{"Key0", "Key1", "Key2"}};
std::array<PinnableSlice, num_keys> values;
std::array<Status, num_keys> statuses;
db_->MultiGet(read_opts, db_->DefaultColumnFamily(), num_keys, &keys[0],
&values[0], &statuses[0]);
ASSERT_OK(statuses[0]);
ASSERT_EQ(values[0], "v0_0,v0_1,v0_2");
ASSERT_OK(statuses[1]);
ASSERT_EQ(values[1], "v1_0,v1_1");
ASSERT_OK(statuses[2]);
ASSERT_EQ(values[2], "v2_0");
}
TEST_F(DBBlobWithTimestampTest, IterateBlobs) {
Options options = GetDefaultOptions();
options.enable_blob_files = true;
options.create_if_missing = true;
const size_t kTimestampSize = Timestamp(0, 0).size();
TestComparator test_cmp(kTimestampSize);
options.comparator = &test_cmp;
DestroyAndReopen(options);
int num_blobs = 5;
std::vector<std::string> keys;
std::vector<std::string> blobs;
WriteOptions write_opts;
std::vector<std::string> write_timestamps = {Timestamp(1, 0),
Timestamp(2, 0)};
// For each key in ["key0", ... "keyi", ...], write two versions:
// Timestamp(1, 0), "blobi0"
// Timestamp(2, 0), "blobi1"
for (int i = 0; i < num_blobs; i++) {
keys.push_back("key" + std::to_string(i));
blobs.push_back("blob" + std::to_string(i));
for (size_t j = 0; j < write_timestamps.size(); j++) {
ASSERT_OK(db_->Put(write_opts, keys[i], write_timestamps[j],
blobs[i] + std::to_string(j)));
}
}
ASSERT_OK(Flush());
ReadOptions read_options;
std::vector<std::string> read_timestamps = {Timestamp(0, 0), Timestamp(3, 0)};
Slice ts_upper_bound(read_timestamps[1]);
read_options.timestamp = &ts_upper_bound;
auto check_iter_entry =
[](const Iterator* iter, const std::string& expected_key,
const std::string& expected_ts, const std::string& expected_value,
bool key_is_internal = true) {
ASSERT_OK(iter->status());
if (key_is_internal) {
std::string expected_ukey_and_ts;
expected_ukey_and_ts.assign(expected_key.data(), expected_key.size());
expected_ukey_and_ts.append(expected_ts.data(), expected_ts.size());
ParsedInternalKey parsed_ikey;
ASSERT_OK(ParseInternalKey(iter->key(), &parsed_ikey,
true /* log_err_key */));
ASSERT_EQ(parsed_ikey.user_key, expected_ukey_and_ts);
} else {
ASSERT_EQ(iter->key(), expected_key);
}
ASSERT_EQ(iter->timestamp(), expected_ts);
ASSERT_EQ(iter->value(), expected_value);
};
// Forward iterating one version of each key, get in this order:
// [("key0", Timestamp(2, 0), "blob01"),
// ("key1", Timestamp(2, 0), "blob11")...]
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_OK(iter->status());
iter->SeekToFirst();
for (int i = 0; i < num_blobs; i++) {
check_iter_entry(iter.get(), keys[i], write_timestamps[1],
blobs[i] + std::to_string(1), /*key_is_internal*/ false);
iter->Next();
}
}
// Forward iteration, then reverse to backward.
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_OK(iter->status());
iter->SeekToFirst();
for (int i = 0; i < num_blobs * 2 - 1; i++) {
if (i < num_blobs) {
check_iter_entry(iter.get(), keys[i], write_timestamps[1],
blobs[i] + std::to_string(1),
/*key_is_internal*/ false);
if (i != num_blobs - 1) {
iter->Next();
}
} else {
if (i != num_blobs) {
check_iter_entry(iter.get(), keys[num_blobs * 2 - 1 - i],
write_timestamps[1],
blobs[num_blobs * 2 - 1 - i] + std::to_string(1),
/*key_is_internal*/ false);
}
iter->Prev();
}
}
}
// Backward iterating one version of each key, get in this order:
// [("key4", Timestamp(2, 0), "blob41"),
// ("key3", Timestamp(2, 0), "blob31")...]
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_OK(iter->status());
iter->SeekToLast();
for (int i = 0; i < num_blobs; i++) {
check_iter_entry(iter.get(), keys[num_blobs - 1 - i], write_timestamps[1],
blobs[num_blobs - 1 - i] + std::to_string(1),
/*key_is_internal*/ false);
iter->Prev();
}
}
// Backward iteration, then reverse to forward.
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_OK(iter->status());
iter->SeekToLast();
for (int i = 0; i < num_blobs * 2 - 1; i++) {
if (i < num_blobs) {
check_iter_entry(iter.get(), keys[num_blobs - 1 - i],
write_timestamps[1],
blobs[num_blobs - 1 - i] + std::to_string(1),
/*key_is_internal*/ false);
if (i != num_blobs - 1) {
iter->Prev();
}
} else {
if (i != num_blobs) {
check_iter_entry(iter.get(), keys[i - num_blobs], write_timestamps[1],
blobs[i - num_blobs] + std::to_string(1),
/*key_is_internal*/ false);
}
iter->Next();
}
}
}
Slice ts_lower_bound(read_timestamps[0]);
read_options.iter_start_ts = &ts_lower_bound;
// Forward iterating multiple versions of the same key, get in this order:
// [("key0", Timestamp(2, 0), "blob01"),
// ("key0", Timestamp(1, 0), "blob00"),
// ("key1", Timestamp(2, 0), "blob11")...]
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_OK(iter->status());
iter->SeekToFirst();
for (int i = 0; i < num_blobs; i++) {
for (size_t j = write_timestamps.size(); j > 0; --j) {
check_iter_entry(iter.get(), keys[i], write_timestamps[j - 1],
blobs[i] + std::to_string(j - 1));
iter->Next();
}
}
}
// Backward iterating multiple versions of the same key, get in this order:
// [("key4", Timestamp(1, 0), "blob00"),
// ("key4", Timestamp(2, 0), "blob01"),
// ("key3", Timestamp(1, 0), "blob10")...]
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_OK(iter->status());
iter->SeekToLast();
for (int i = num_blobs; i > 0; i--) {
for (size_t j = 0; j < write_timestamps.size(); j++) {
check_iter_entry(iter.get(), keys[i - 1], write_timestamps[j],
blobs[i - 1] + std::to_string(j));
iter->Prev();
}
}
}
int upper_bound_idx = num_blobs - 2;
int lower_bound_idx = 1;
Slice upper_bound_slice(keys[upper_bound_idx]);
Slice lower_bound_slice(keys[lower_bound_idx]);
read_options.iterate_upper_bound = &upper_bound_slice;
read_options.iterate_lower_bound = &lower_bound_slice;
// Forward iteration with upper and lower bound.
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_OK(iter->status());
iter->SeekToFirst();
for (int i = lower_bound_idx; i < upper_bound_idx; i++) {
for (size_t j = write_timestamps.size(); j > 0; --j) {
check_iter_entry(iter.get(), keys[i], write_timestamps[j - 1],
blobs[i] + std::to_string(j - 1));
iter->Next();
}
}
}
// Backward iteration with upper and lower bound.
{
std::unique_ptr<Iterator> iter(db_->NewIterator(read_options));
ASSERT_OK(iter->status());
iter->SeekToLast();
for (int i = upper_bound_idx; i > lower_bound_idx; i--) {
for (size_t j = 0; j < write_timestamps.size(); j++) {
check_iter_entry(iter.get(), keys[i - 1], write_timestamps[j],
blobs[i - 1] + std::to_string(j));
iter->Prev();
}
}
}
}
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {

@ -16,7 +16,6 @@ class DBBlobCompactionTest : public DBTestBase {
explicit DBBlobCompactionTest()
: DBTestBase("db_blob_compaction_test", /*env_do_fsync=*/false) {}
#ifndef ROCKSDB_LITE
const std::vector<InternalStats::CompactionStats>& GetCompactionStats() {
VersionSet* const versions = dbfull()->GetVersionSet();
assert(versions);
@ -30,7 +29,6 @@ class DBBlobCompactionTest : public DBTestBase {
return internal_stats->TEST_GetCompactionStats();
}
#endif // ROCKSDB_LITE
};
namespace {
@ -250,7 +248,6 @@ TEST_F(DBBlobCompactionTest, FilterByKeyLength) {
ASSERT_OK(db_->Get(ReadOptions(), long_key, &value));
ASSERT_EQ("value", value);
#ifndef ROCKSDB_LITE
const auto& compaction_stats = GetCompactionStats();
ASSERT_GE(compaction_stats.size(), 2);
@ -258,7 +255,6 @@ TEST_F(DBBlobCompactionTest, FilterByKeyLength) {
// this involves neither reading nor writing blobs
ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
#endif // ROCKSDB_LITE
Close();
}
@ -299,7 +295,6 @@ TEST_F(DBBlobCompactionTest, FilterByValueLength) {
ASSERT_EQ(long_value, value);
}
#ifndef ROCKSDB_LITE
const auto& compaction_stats = GetCompactionStats();
ASSERT_GE(compaction_stats.size(), 2);
@ -307,12 +302,10 @@ TEST_F(DBBlobCompactionTest, FilterByValueLength) {
// this involves reading but not writing blobs
ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
#endif // ROCKSDB_LITE
Close();
}
#ifndef ROCKSDB_LITE
TEST_F(DBBlobCompactionTest, BlobCompactWithStartingLevel) {
Options options = GetDefaultOptions();
@ -388,7 +381,6 @@ TEST_F(DBBlobCompactionTest, BlobCompactWithStartingLevel) {
Close();
}
#endif
TEST_F(DBBlobCompactionTest, BlindWriteFilter) {
Options options = GetDefaultOptions();
@ -413,7 +405,6 @@ TEST_F(DBBlobCompactionTest, BlindWriteFilter) {
ASSERT_EQ(new_blob_value, Get(key));
}
#ifndef ROCKSDB_LITE
const auto& compaction_stats = GetCompactionStats();
ASSERT_GE(compaction_stats.size(), 2);
@ -421,7 +412,6 @@ TEST_F(DBBlobCompactionTest, BlindWriteFilter) {
// this involves writing but not reading blobs
ASSERT_EQ(compaction_stats[1].bytes_read_blob, 0);
ASSERT_GT(compaction_stats[1].bytes_written_blob, 0);
#endif // ROCKSDB_LITE
Close();
}
@ -540,7 +530,6 @@ TEST_F(DBBlobCompactionTest, CompactionFilter) {
ASSERT_EQ(kv.second + std::string(padding), Get(kv.first));
}
#ifndef ROCKSDB_LITE
const auto& compaction_stats = GetCompactionStats();
ASSERT_GE(compaction_stats.size(), 2);
@ -548,7 +537,6 @@ TEST_F(DBBlobCompactionTest, CompactionFilter) {
// this involves reading and writing blobs
ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
ASSERT_GT(compaction_stats[1].bytes_written_blob, 0);
#endif // ROCKSDB_LITE
Close();
}
@ -606,7 +594,6 @@ TEST_F(DBBlobCompactionTest, CompactionFilterReadBlobAndKeep) {
/*end=*/nullptr));
ASSERT_EQ(blob_files, GetBlobFileNumbers());
#ifndef ROCKSDB_LITE
const auto& compaction_stats = GetCompactionStats();
ASSERT_GE(compaction_stats.size(), 2);
@ -614,7 +601,6 @@ TEST_F(DBBlobCompactionTest, CompactionFilterReadBlobAndKeep) {
// this involves reading but not writing blobs
ASSERT_GT(compaction_stats[1].bytes_read_blob, 0);
ASSERT_EQ(compaction_stats[1].bytes_written_blob, 0);
#endif // ROCKSDB_LITE
Close();
}

@ -34,7 +34,6 @@ class DBBlobCorruptionTest : public DBTestBase {
}
};
#ifndef ROCKSDB_LITE
TEST_F(DBBlobCorruptionTest, VerifyWholeBlobFileChecksum) {
Options options = GetDefaultOptions();
options.enable_blob_files = true;
@ -71,7 +70,6 @@ TEST_F(DBBlobCorruptionTest, VerifyWholeBlobFileChecksum) {
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
}
#endif // !ROCKSDB_LITE
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {

@ -131,9 +131,7 @@ class DBBlobIndexTest : public DBTestBase {
ASSERT_OK(Flush());
ASSERT_OK(
dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
#ifndef ROCKSDB_LITE
ASSERT_EQ("0,1", FilesPerLevel());
#endif // !ROCKSDB_LITE
break;
}
}
@ -459,7 +457,6 @@ TEST_F(DBBlobIndexTest, Iterate) {
verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
create_blob_iterator, check_is_blob(false));
#ifndef ROCKSDB_LITE
// Iterator with blob support and using seek.
ASSERT_OK(dbfull()->SetOptions(
cfh(), {{"max_sequential_skip_in_iterations", "0"}}));
@ -484,7 +481,6 @@ TEST_F(DBBlobIndexTest, Iterate) {
create_blob_iterator, check_is_blob(false));
verify(15, Status::kOk, get_value(16, 0), get_value(14, 0),
create_blob_iterator, check_is_blob(false));
#endif // !ROCKSDB_LITE
for (auto* snapshot : snapshots) {
dbfull()->ReleaseSnapshot(snapshot);
@ -584,12 +580,10 @@ TEST_F(DBBlobIndexTest, IntegratedBlobIterate) {
Status expected_status;
verify(1, expected_status, expected_value);
#ifndef ROCKSDB_LITE
// Test DBIter::FindValueForCurrentKeyUsingSeek flow.
ASSERT_OK(dbfull()->SetOptions(cfh(),
{{"max_sequential_skip_in_iterations", "0"}}));
verify(1, expected_status, expected_value);
#endif // !ROCKSDB_LITE
}
} // namespace ROCKSDB_NAMESPACE

@ -15,6 +15,7 @@
#include "db/blob/blob_file_builder.h"
#include "db/compaction/compaction_iterator.h"
#include "db/dbformat.h"
#include "db/event_helpers.h"
#include "db/internal_stats.h"
#include "db/merge_helper.h"
@ -56,8 +57,8 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
Status BuildTable(
const std::string& dbname, VersionSet* versions,
const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
const FileOptions& file_options, TableCache* table_cache,
InternalIterator* iter,
const FileOptions& file_options, const ReadOptions& read_options,
TableCache* table_cache, InternalIterator* iter,
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
range_del_iters,
FileMetaData* meta, std::vector<BlobFileAddition>* blob_file_additions,
@ -107,11 +108,9 @@ Status BuildTable(
std::vector<std::string> blob_file_paths;
std::string file_checksum = kUnknownFileChecksum;
std::string file_checksum_func_name = kUnknownFileChecksumFuncName;
#ifndef ROCKSDB_LITE
EventHelpers::NotifyTableFileCreationStarted(ioptions.listeners, dbname,
tboptions.column_family_name,
fname, job_id, tboptions.reason);
#endif // !ROCKSDB_LITE
Env* env = db_options.env;
assert(env);
FileSystem* fs = db_options.fs.get();
@ -176,10 +175,10 @@ Status BuildTable(
builder = NewTableBuilder(tboptions, file_writer.get());
}
auto ucmp = tboptions.internal_comparator.user_comparator();
MergeHelper merge(
env, tboptions.internal_comparator.user_comparator(),
ioptions.merge_operator.get(), compaction_filter.get(), ioptions.logger,
true /* internal key corruption is not ok */,
env, ucmp, ioptions.merge_operator.get(), compaction_filter.get(),
ioptions.logger, true /* internal key corruption is not ok */,
snapshots.empty() ? 0 : snapshots.back(), snapshot_checker);
std::unique_ptr<BlobFileBuilder> blob_file_builder(
@ -197,32 +196,49 @@ Status BuildTable(
const std::atomic<bool> kManualCompactionCanceledFalse{false};
CompactionIterator c_iter(
iter, tboptions.internal_comparator.user_comparator(), &merge,
kMaxSequenceNumber, &snapshots, earliest_write_conflict_snapshot,
job_snapshot, snapshot_checker, env,
iter, ucmp, &merge, kMaxSequenceNumber, &snapshots,
earliest_write_conflict_snapshot, job_snapshot, snapshot_checker, env,
ShouldReportDetailedTime(env, ioptions.stats),
true /* internal key corruption is not ok */, range_del_agg.get(),
blob_file_builder.get(), ioptions.allow_data_in_errors,
ioptions.enforce_single_del_contracts,
/*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
true /* must_count_input_entries */,
/*compaction=*/nullptr, compaction_filter.get(),
/*shutting_down=*/nullptr, db_options.info_log, full_history_ts_low);
const size_t ts_sz = ucmp->timestamp_size();
const bool strip_timestamp =
ts_sz > 0 && !ioptions.persist_user_defined_timestamps;
std::string key_after_flush_buf;
c_iter.SeekToFirst();
for (; c_iter.Valid(); c_iter.Next()) {
const Slice& key = c_iter.key();
const Slice& value = c_iter.value();
const ParsedInternalKey& ikey = c_iter.ikey();
// Generate a rolling 64-bit hash of the key and values
// Note:
// Here "key" integrates 'sequence_number'+'kType'+'user key'.
s = output_validator.Add(key, value);
Slice key_after_flush = key;
// If user-defined timestamps will be stripped from the user key after
// flush, the in-memory version of the key acts logically the same as one
// with a minimum timestamp. We update the timestamp here so the file
// boundary, output validator, and block builder all see the effect of the
// stripping.
if (strip_timestamp) {
key_after_flush_buf.clear();
ReplaceInternalKeyWithMinTimestamp(&key_after_flush_buf, key, ts_sz);
key_after_flush = key_after_flush_buf;
}
// Generate a rolling 64-bit hash of the key and values
// Note:
// Here "key" integrates 'sequence_number'+'kType'+'user key'.
s = output_validator.Add(key_after_flush, value);
if (!s.ok()) {
break;
}
builder->Add(key, value);
builder->Add(key_after_flush, value);
s = meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type);
s = meta->UpdateBoundaries(key_after_flush, value, ikey.sequence,
ikey.type);
if (!s.ok()) {
break;
}
@ -242,21 +258,28 @@ Status BuildTable(
if (s.ok()) {
auto range_del_it = range_del_agg->NewIterator();
Slice last_tombstone_start_user_key{};
for (range_del_it->SeekToFirst(); range_del_it->Valid();
range_del_it->Next()) {
auto tombstone = range_del_it->Tombstone();
auto kv = tombstone.Serialize();
// TODO(yuzhangyu): handle range deletion for UDT in memtables only.
builder->Add(kv.first.Encode(), kv.second);
InternalKey tombstone_end = tombstone.SerializeEndKey();
meta->UpdateBoundariesForRange(kv.first, tombstone_end, tombstone.seq_,
tboptions.internal_comparator);
if (version) {
SizeApproximationOptions approx_opts;
approx_opts.files_size_error_margin = 0.1;
meta->compensated_range_deletion_size += versions->ApproximateSize(
approx_opts, version, kv.first.Encode(), tombstone_end.Encode(),
0 /* start_level */, -1 /* end_level */,
TableReaderCaller::kFlush);
if (last_tombstone_start_user_key.empty() ||
ucmp->CompareWithoutTimestamp(last_tombstone_start_user_key,
range_del_it->start_key()) < 0) {
SizeApproximationOptions approx_opts;
approx_opts.files_size_error_margin = 0.1;
meta->compensated_range_deletion_size += versions->ApproximateSize(
approx_opts, read_options, version, kv.first.Encode(),
tombstone_end.Encode(), 0 /* start_level */, -1 /* end_level */,
TableReaderCaller::kFlush);
}
last_tombstone_start_user_key = range_del_it->start_key();
}
}
}
@ -264,8 +287,9 @@ Status BuildTable(
TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable");
const bool empty = builder->IsEmpty();
if (num_input_entries != nullptr) {
assert(c_iter.HasNumInputEntryScanned());
*num_input_entries =
c_iter.num_input_entry_scanned() + num_unfragmented_tombstones;
c_iter.NumInputEntryScanned() + num_unfragmented_tombstones;
}
if (!s.ok() || empty) {
builder->Abandon();
@ -288,7 +312,10 @@ Status BuildTable(
if (s.ok() && !empty) {
uint64_t file_size = builder->FileSize();
meta->fd.file_size = file_size;
meta->tail_size = builder->GetTailSize();
meta->marked_for_compaction = builder->NeedCompact();
meta->user_defined_timestamps_persisted =
ioptions.persist_user_defined_timestamps;
assert(meta->fd.GetFileSize() > 0);
tp = builder
->GetTableProperties(); // refresh now that builder is finished
@ -348,6 +375,8 @@ Status BuildTable(
s = *io_status;
}
// TODO(yuzhangyu): handle the key copy in the blob when ts should be
// stripped.
if (blob_file_builder) {
if (s.ok()) {
s = blob_file_builder->Finish();
@ -366,7 +395,6 @@ Status BuildTable(
// here because this is a special case after we finish the table building.
// No matter whether use_direct_io_for_flush_and_compaction is true,
// the goal is to cache it here for further user reads.
ReadOptions read_options;
std::unique_ptr<InternalIterator> it(table_cache->NewIterator(
read_options, file_options, tboptions.internal_comparator, *meta,
nullptr /* range_del_agg */, mutable_cf_options.prefix_extractor,
@ -378,7 +406,8 @@ Status BuildTable(
MaxFileSizeForL0MetaPin(mutable_cf_options),
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key*/ nullptr,
/*allow_unprepared_value*/ false));
/*allow_unprepared_value*/ false,
mutable_cf_options.block_protection_bytes_per_key));
s = it->status();
if (s.ok() && paranoid_file_checks) {
OutputValidator file_validator(tboptions.internal_comparator,

@ -53,8 +53,8 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions,
extern Status BuildTable(
const std::string& dbname, VersionSet* versions,
const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
const FileOptions& file_options, TableCache* table_cache,
InternalIterator* iter,
const FileOptions& file_options, const ReadOptions& read_options,
TableCache* table_cache, InternalIterator* iter,
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
range_del_iters,
FileMetaData* meta, std::vector<BlobFileAddition>* blob_file_additions,

@ -7,8 +7,6 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef ROCKSDB_LITE
#include "rocksdb/c.h"
#include <cstdlib>
@ -17,7 +15,7 @@
#include <vector>
#include "port/port.h"
#include "rocksdb/cache.h"
#include "rocksdb/advanced_cache.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/comparator.h"
#include "rocksdb/convenience.h"
@ -48,6 +46,7 @@
#include "rocksdb/utilities/write_batch_with_index.h"
#include "rocksdb/write_batch.h"
#include "utilities/merge_operators.h"
#include "rocksdb/env_encryption.h"
using ROCKSDB_NAMESPACE::BackupEngine;
using ROCKSDB_NAMESPACE::BackupEngineOptions;
@ -69,6 +68,7 @@ using ROCKSDB_NAMESPACE::CompactionOptionsFIFO;
using ROCKSDB_NAMESPACE::CompactRangeOptions;
using ROCKSDB_NAMESPACE::Comparator;
using ROCKSDB_NAMESPACE::CompressionType;
using ROCKSDB_NAMESPACE::ConfigOptions;
using ROCKSDB_NAMESPACE::CuckooTableOptions;
using ROCKSDB_NAMESPACE::DB;
using ROCKSDB_NAMESPACE::DBOptions;
@ -78,6 +78,8 @@ using ROCKSDB_NAMESPACE::EnvOptions;
using ROCKSDB_NAMESPACE::FileLock;
using ROCKSDB_NAMESPACE::FilterPolicy;
using ROCKSDB_NAMESPACE::FlushOptions;
using ROCKSDB_NAMESPACE::HistogramData;
using ROCKSDB_NAMESPACE::HyperClockCacheOptions;
using ROCKSDB_NAMESPACE::InfoLogLevel;
using ROCKSDB_NAMESPACE::IngestExternalFileOptions;
using ROCKSDB_NAMESPACE::Iterator;
@ -208,6 +210,9 @@ struct rocksdb_logger_t {
struct rocksdb_lru_cache_options_t {
LRUCacheOptions rep;
};
struct rocksdb_hyper_clock_cache_options_t {
HyperClockCacheOptions rep;
};
struct rocksdb_memory_allocator_t {
std::shared_ptr<MemoryAllocator> rep;
};
@ -276,6 +281,11 @@ struct rocksdb_compactionfiltercontext_t {
CompactionFilter::Context rep;
};
struct rocksdb_statistics_histogram_data_t {
rocksdb_statistics_histogram_data_t() : rep() {}
HistogramData rep;
};
struct rocksdb_compactionfilter_t : public CompactionFilter {
void* state_;
void (*destructor_)(void*);
@ -1054,6 +1064,36 @@ rocksdb_column_family_handle_t* rocksdb_create_column_family(
return handle;
}
rocksdb_column_family_handle_t** rocksdb_create_column_families(
rocksdb_t* db, const rocksdb_options_t* column_family_options,
int num_column_families, const char* const* column_family_names,
size_t* lencfs, char** errptr) {
std::vector<ColumnFamilyHandle*> handles;
std::vector<std::string> names;
for (int i = 0; i != num_column_families; ++i) {
names.push_back(std::string(column_family_names[i]));
}
SaveError(errptr, db->rep->CreateColumnFamilies(
ColumnFamilyOptions(column_family_options->rep), names,
&handles));
*lencfs = handles.size();
rocksdb_column_family_handle_t** c_handles =
static_cast<rocksdb_column_family_handle_t**>(
malloc(sizeof(rocksdb_column_family_handle_t*) * handles.size()));
for (size_t i = 0; i != handles.size(); ++i) {
c_handles[i] = new rocksdb_column_family_handle_t;
c_handles[i]->rep = handles[i];
}
return c_handles;
}
void rocksdb_create_column_families_destroy(
rocksdb_column_family_handle_t** list) {
free(list);
}
rocksdb_column_family_handle_t* rocksdb_create_column_family_with_ttl(
rocksdb_t* db, const rocksdb_options_t* column_family_options,
const char* column_family_name, int ttl, char** errptr) {
@ -1805,6 +1845,17 @@ void rocksdb_flush_cf(rocksdb_t* db, const rocksdb_flushoptions_t* options,
SaveError(errptr, db->rep->Flush(options->rep, column_family->rep));
}
void rocksdb_flush_cfs(rocksdb_t* db, const rocksdb_flushoptions_t* options,
rocksdb_column_family_handle_t** column_families,
int num_column_families, char** errptr) {
std::vector<ColumnFamilyHandle*> column_family_handles;
for (int i = 0; i < num_column_families; i++) {
column_family_handles.push_back(column_families[i]->rep);
}
SaveError(errptr, db->rep->Flush(options->rep, column_family_handles));
}
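// Hedged usage sketch, not part of the diff: flushing several column
// families in one call through the new rocksdb_flush_cfs entry point. The
// db handle, column family handles, and error pointer are assumed to come
// from the caller.
static void example_flush_two_cfs(rocksdb_t* db,
                                  rocksdb_column_family_handle_t* cf1,
                                  rocksdb_column_family_handle_t* cf2,
                                  char** errptr) {
  rocksdb_column_family_handle_t* cfs[2] = {cf1, cf2};
  rocksdb_flushoptions_t* fo = rocksdb_flushoptions_create();
  rocksdb_flushoptions_set_wait(fo, 1);  // block until the flush completes
  rocksdb_flush_cfs(db, fo, cfs, 2, errptr);
  rocksdb_flushoptions_destroy(fo);
}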
void rocksdb_flush_wal(rocksdb_t* db, unsigned char sync, char** errptr) {
SaveError(errptr, db->rep->FlushWAL(sync));
}
@ -2498,8 +2549,12 @@ void rocksdb_load_latest_options(
rocksdb_options_t*** list_column_family_options, char** errptr) {
DBOptions db_opt;
std::vector<ColumnFamilyDescriptor> cf_descs;
Status s = LoadLatestOptions(std::string(db_path), env->rep, &db_opt,
&cf_descs, ignore_unknown_options, &cache->rep);
ConfigOptions config_opts;
config_opts.ignore_unknown_options = ignore_unknown_options;
config_opts.input_strings_escaped = true;
config_opts.env = env->rep;
Status s = LoadLatestOptions(config_opts, std::string(db_path), &db_opt,
&cf_descs, &cache->rep);
if (s.ok()) {
char** cf_names = (char**)malloc(cf_descs.size() * sizeof(char*));
rocksdb_options_t** cf_options = (rocksdb_options_t**)malloc(
@ -2620,14 +2675,6 @@ void rocksdb_block_based_options_set_block_cache(
}
}
void rocksdb_block_based_options_set_block_cache_compressed(
rocksdb_block_based_table_options_t* options,
rocksdb_cache_t* block_cache_compressed) {
if (block_cache_compressed) {
options->rep.block_cache_compressed = block_cache_compressed->rep;
}
}
void rocksdb_block_based_options_set_whole_key_filtering(
rocksdb_block_based_table_options_t* options, unsigned char v) {
options->rep.whole_key_filtering = v;
@ -2983,6 +3030,29 @@ void rocksdb_options_enable_statistics(rocksdb_options_t* opt) {
opt->rep.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
}
void rocksdb_options_set_statistics_level(rocksdb_options_t* opt, int level) {
if (!opt->rep.statistics) {
return;
}
if (level < rocksdb_statistics_level_disable_all) {
level = rocksdb_statistics_level_disable_all;
}
if (level > rocksdb_statistics_level_all) {
level = rocksdb_statistics_level_all;
}
opt->rep.statistics->set_stats_level(
static_cast<ROCKSDB_NAMESPACE::StatsLevel>(level));
}
int rocksdb_options_get_statistics_level(rocksdb_options_t* opt) {
if (!opt->rep.statistics) {
return ROCKSDB_NAMESPACE::StatsLevel::kDisableAll;
}
return static_cast<int>(opt->rep.statistics->get_stats_level());
}
void rocksdb_options_set_skip_stats_update_on_db_open(rocksdb_options_t* opt,
unsigned char val) {
opt->rep.skip_stats_update_on_db_open = val;
@ -3730,16 +3800,21 @@ void rocksdb_options_set_hash_link_list_rep(rocksdb_options_t* opt,
ROCKSDB_NAMESPACE::NewHashLinkListRepFactory(bucket_count));
}
void rocksdb_options_set_plain_table_factory(rocksdb_options_t* opt,
uint32_t user_key_len,
int bloom_bits_per_key,
double hash_table_ratio,
size_t index_sparseness) {
void rocksdb_options_set_plain_table_factory(
rocksdb_options_t* opt, uint32_t user_key_len, int bloom_bits_per_key,
double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size,
char encoding_type, unsigned char full_scan_mode,
unsigned char store_index_in_file) {
ROCKSDB_NAMESPACE::PlainTableOptions options;
options.user_key_len = user_key_len;
options.bloom_bits_per_key = bloom_bits_per_key;
options.hash_table_ratio = hash_table_ratio;
options.index_sparseness = index_sparseness;
options.huge_page_tlb_size = huge_page_tlb_size;
options.encoding_type =
static_cast<ROCKSDB_NAMESPACE::EncodingType>(encoding_type);
options.full_scan_mode = full_scan_mode;
options.store_index_in_file = store_index_in_file;
ROCKSDB_NAMESPACE::TableFactory* factory =
ROCKSDB_NAMESPACE::NewPlainTableFactory(options);
@ -3817,6 +3892,26 @@ char* rocksdb_options_statistics_get_string(rocksdb_options_t* opt) {
return nullptr;
}
uint64_t rocksdb_options_statistics_get_ticker_count(rocksdb_options_t* opt,
uint32_t ticker_type) {
ROCKSDB_NAMESPACE::Statistics* statistics = opt->rep.statistics.get();
if (statistics) {
return statistics->getTickerCount(ticker_type);
}
return 0;
}
void rocksdb_options_statistics_get_histogram_data(
rocksdb_options_t* opt, uint32_t type,
rocksdb_statistics_histogram_data_t* const data) {
ROCKSDB_NAMESPACE::Statistics* statistics = opt->rep.statistics.get();
if (statistics) {
statistics->histogramData(type, &data->rep);
} else {
*data = rocksdb_statistics_histogram_data_t{};
}
}
void rocksdb_options_set_ratelimiter(rocksdb_options_t* opt,
rocksdb_ratelimiter_t* limiter) {
if (limiter) {
@ -3878,6 +3973,15 @@ void rocksdb_options_add_compact_on_deletion_collector_factory(
opt->rep.table_properties_collector_factories.emplace_back(compact_on_del);
}
void rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio(
rocksdb_options_t* opt, size_t window_size, size_t num_dels_trigger,
double deletion_ratio) {
std::shared_ptr<ROCKSDB_NAMESPACE::TablePropertiesCollectorFactory>
compact_on_del = NewCompactOnDeletionCollectorFactory(
window_size, num_dels_trigger, deletion_ratio);
opt->rep.table_properties_collector_factories.emplace_back(compact_on_del);
}
void rocksdb_set_perf_level(int v) {
PerfLevel level = static_cast<PerfLevel>(v);
SetPerfLevel(level);
@ -4054,6 +4158,8 @@ uint64_t rocksdb_perfcontext_metric(rocksdb_perfcontext_t* context,
return rep->blob_decompress_time;
case rocksdb_internal_range_del_reseek_count:
return rep->internal_range_del_reseek_count;
case rocksdb_block_read_cpu_time:
return rep->block_read_cpu_time;
default:
break;
}
@ -4449,6 +4555,15 @@ rocksdb_readoptions_get_io_timeout(rocksdb_readoptions_t* opt) {
return opt->rep.io_timeout.count();
}
void rocksdb_readoptions_set_async_io(rocksdb_readoptions_t* opt,
unsigned char v) {
opt->rep.async_io = v;
}
unsigned char rocksdb_readoptions_get_async_io(rocksdb_readoptions_t* opt) {
return opt->rep.async_io;
}
void rocksdb_readoptions_set_timestamp(rocksdb_readoptions_t* opt,
const char* ts, size_t tslen) {
if (ts == nullptr) {
@ -4660,12 +4775,59 @@ rocksdb_cache_t* rocksdb_cache_create_lru_with_strict_capacity_limit(
}
rocksdb_cache_t* rocksdb_cache_create_lru_opts(
rocksdb_lru_cache_options_t* opt) {
const rocksdb_lru_cache_options_t* opt) {
rocksdb_cache_t* c = new rocksdb_cache_t;
c->rep = NewLRUCache(opt->rep);
return c;
}
rocksdb_hyper_clock_cache_options_t* rocksdb_hyper_clock_cache_options_create(
size_t capacity, size_t estimated_entry_charge) {
return new rocksdb_hyper_clock_cache_options_t{
HyperClockCacheOptions(capacity, estimated_entry_charge)};
}
void rocksdb_hyper_clock_cache_options_destroy(
rocksdb_hyper_clock_cache_options_t* opt) {
delete opt;
}
void rocksdb_hyper_clock_cache_options_set_capacity(
rocksdb_hyper_clock_cache_options_t* opts, size_t capacity) {
opts->rep.capacity = capacity;
}
void rocksdb_hyper_clock_cache_options_set_estimated_entry_charge(
rocksdb_hyper_clock_cache_options_t* opts, size_t estimated_entry_charge) {
opts->rep.estimated_entry_charge = estimated_entry_charge;
}
void rocksdb_hyper_clock_cache_options_set_num_shard_bits(
rocksdb_hyper_clock_cache_options_t* opts, int num_shard_bits) {
opts->rep.num_shard_bits = num_shard_bits;
}
void rocksdb_hyper_clock_cache_options_set_memory_allocator(
rocksdb_hyper_clock_cache_options_t* opts,
rocksdb_memory_allocator_t* memory_allocator) {
opts->rep.memory_allocator = memory_allocator->rep;
}
rocksdb_cache_t* rocksdb_cache_create_hyper_clock(
size_t capacity, size_t estimated_entry_charge) {
HyperClockCacheOptions opts(capacity, estimated_entry_charge);
rocksdb_cache_t* c = new rocksdb_cache_t;
c->rep = opts.MakeSharedCache();
return c;
}
rocksdb_cache_t* rocksdb_cache_create_hyper_clock_opts(
const rocksdb_hyper_clock_cache_options_t* opts) {
rocksdb_cache_t* c = new rocksdb_cache_t;
c->rep = opts->rep.MakeSharedCache();
return c;
}
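// Hedged usage sketch, not part of the diff: building a HyperClockCache via
// the new C API above and plugging it into block-based table options. The
// 32 MiB capacity and 8 KiB estimated entry charge are illustrative only.
static rocksdb_block_based_table_options_t* example_hyper_clock_table_opts() {
  rocksdb_hyper_clock_cache_options_t* cc_opts =
      rocksdb_hyper_clock_cache_options_create(
          32 * 1024 * 1024 /* capacity */, 8 * 1024 /* entry charge */);
  rocksdb_hyper_clock_cache_options_set_num_shard_bits(cc_opts, 6);
  rocksdb_cache_t* cache = rocksdb_cache_create_hyper_clock_opts(cc_opts);
  rocksdb_hyper_clock_cache_options_destroy(cc_opts);  // options were copied
  rocksdb_block_based_table_options_t* table_opts =
      rocksdb_block_based_options_create();
  rocksdb_block_based_options_set_block_cache(table_opts, cache);
  rocksdb_cache_destroy(cache);  // table options hold their own reference
  return table_opts;
}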
void rocksdb_cache_destroy(rocksdb_cache_t* cache) { delete cache; }
void rocksdb_cache_disown_data(rocksdb_cache_t* cache) {
@ -4676,18 +4838,26 @@ void rocksdb_cache_set_capacity(rocksdb_cache_t* cache, size_t capacity) {
cache->rep->SetCapacity(capacity);
}
size_t rocksdb_cache_get_capacity(rocksdb_cache_t* cache) {
size_t rocksdb_cache_get_capacity(const rocksdb_cache_t* cache) {
return cache->rep->GetCapacity();
}
size_t rocksdb_cache_get_usage(rocksdb_cache_t* cache) {
size_t rocksdb_cache_get_usage(const rocksdb_cache_t* cache) {
return cache->rep->GetUsage();
}
size_t rocksdb_cache_get_pinned_usage(rocksdb_cache_t* cache) {
size_t rocksdb_cache_get_pinned_usage(const rocksdb_cache_t* cache) {
return cache->rep->GetPinnedUsage();
}
size_t rocksdb_cache_get_table_address_count(const rocksdb_cache_t* cache) {
return cache->rep->GetTableAddressCount();
}
size_t rocksdb_cache_get_occupancy_count(const rocksdb_cache_t* cache) {
return cache->rep->GetOccupancyCount();
}
rocksdb_dbpath_t* rocksdb_dbpath_create(const char* path,
uint64_t target_size) {
rocksdb_dbpath_t* result = new rocksdb_dbpath_t;
@ -4705,6 +4875,20 @@ rocksdb_env_t* rocksdb_create_default_env() {
return result;
}
rocksdb_env_t* rocksdb_create_encrypted_env(const char* key) {
rocksdb_env_t* result = new rocksdb_env_t;
std::shared_ptr<rocksdb::EncryptionProvider> provider;
Status status = rocksdb::EncryptionProvider::CreateFromString(
ConfigOptions(), "ippcp", &provider);
assert(status.ok());
status =
provider->AddCipher("", key, 32, false);
assert(status.ok());
result->rep = NewEncryptedEnv(Env::Default(), provider);
result->is_default = true;
return result;
}
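// Hedged usage sketch, not part of the diff: opening a database on top of
// the encrypted Env created above. `key32` must point at 32 bytes of key
// material, matching the AddCipher("", key, 32, false) call; freeing the
// env (which must outlive the db) is left to the caller.
static rocksdb_t* example_open_encrypted(const char* path, const char* key32,
                                         char** errptr) {
  rocksdb_env_t* env = rocksdb_create_encrypted_env(key32);
  rocksdb_options_t* opts = rocksdb_options_create();
  rocksdb_options_set_create_if_missing(opts, 1);
  rocksdb_options_set_env(opts, env);
  rocksdb_t* db = rocksdb_open(opts, path, errptr);
  rocksdb_options_destroy(opts);
  return db;
}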
rocksdb_env_t* rocksdb_create_mem_env() {
rocksdb_env_t* result = new rocksdb_env_t;
result->rep = ROCKSDB_NAMESPACE::NewMemEnv(Env::Default());
@ -4906,6 +5090,12 @@ void rocksdb_ingestexternalfileoptions_set_ingest_behind(
opt->rep.ingest_behind = ingest_behind;
}
void rocksdb_ingestexternalfileoptions_set_fail_if_not_bottommost_level(
rocksdb_ingestexternalfileoptions_t* opt,
unsigned char fail_if_not_bottommost_level) {
opt->rep.fail_if_not_bottommost_level = fail_if_not_bottommost_level;
}
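// Hedged usage sketch, not part of the diff: requiring that an ingested
// file land in the bottommost level, via the new setter above.
static void example_ingest_bottommost(rocksdb_t* db, const char* sst_path,
                                      char** errptr) {
  rocksdb_ingestexternalfileoptions_t* ing =
      rocksdb_ingestexternalfileoptions_create();
  rocksdb_ingestexternalfileoptions_set_fail_if_not_bottommost_level(ing, 1);
  const char* files[1] = {sst_path};
  rocksdb_ingest_external_file(db, files, 1, ing, errptr);
  rocksdb_ingestexternalfileoptions_destroy(ing);
}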
void rocksdb_ingestexternalfileoptions_destroy(
rocksdb_ingestexternalfileoptions_t* opt) {
delete opt;
@ -5067,6 +5257,17 @@ rocksdb_fifo_compaction_options_t* rocksdb_fifo_compaction_options_create() {
return result;
}
void rocksdb_fifo_compaction_options_set_allow_compaction(
rocksdb_fifo_compaction_options_t* fifo_opts,
unsigned char allow_compaction) {
fifo_opts->rep.allow_compaction = allow_compaction;
}
unsigned char rocksdb_fifo_compaction_options_get_allow_compaction(
rocksdb_fifo_compaction_options_t* fifo_opts) {
return fifo_opts->rep.allow_compaction;
}
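// Hedged usage sketch, not part of the diff: enabling intra-FIFO compaction
// through the new allow_compaction accessors above.
static void example_fifo_allow_compaction(rocksdb_options_t* opts) {
  rocksdb_fifo_compaction_options_t* fifo =
      rocksdb_fifo_compaction_options_create();
  rocksdb_fifo_compaction_options_set_allow_compaction(fifo, 1);
  // The getter mirrors the setter, mainly useful for tests.
  if (rocksdb_fifo_compaction_options_get_allow_compaction(fifo)) {
    rocksdb_options_set_fifo_compaction_options(opts, fifo);
  }
  rocksdb_fifo_compaction_options_destroy(fifo);
}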
void rocksdb_fifo_compaction_options_set_max_table_files_size(
rocksdb_fifo_compaction_options_t* fifo_opts, uint64_t size) {
fifo_opts->rep.max_table_files_size = size;
@ -5487,6 +5688,20 @@ int rocksdb_transactiondb_property_int(rocksdb_transactiondb_t* db,
}
}
rocksdb_t* rocksdb_transactiondb_get_base_db(rocksdb_transactiondb_t* txn_db) {
DB* base_db = txn_db->rep->GetBaseDB();
if (base_db != nullptr) {
rocksdb_t* result = new rocksdb_t;
result->rep = base_db;
return result;
}
return nullptr;
}
void rocksdb_transactiondb_close_base_db(rocksdb_t* base_db) { delete base_db; }
rocksdb_transaction_t* rocksdb_transaction_begin(
rocksdb_transactiondb_t* txn_db,
const rocksdb_writeoptions_t* write_options,
@ -5771,6 +5986,35 @@ void rocksdb_transaction_multi_get(rocksdb_transaction_t* txn,
}
}
void rocksdb_transaction_multi_get_for_update(
rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
size_t num_keys, const char* const* keys_list,
const size_t* keys_list_sizes, char** values_list,
size_t* values_list_sizes, char** errs) {
std::vector<Slice> keys(num_keys);
for (size_t i = 0; i < num_keys; i++) {
keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
}
std::vector<std::string> values(num_keys);
std::vector<Status> statuses =
txn->rep->MultiGetForUpdate(options->rep, keys, &values);
for (size_t i = 0; i < num_keys; i++) {
if (statuses[i].ok()) {
values_list[i] = CopyString(values[i]);
values_list_sizes[i] = values[i].size();
errs[i] = nullptr;
} else {
values_list[i] = nullptr;
values_list_sizes[i] = 0;
if (!statuses[i].IsNotFound()) {
errs[i] = strdup(statuses[i].ToString().c_str());
} else {
errs[i] = nullptr;
}
}
}
}
void rocksdb_transaction_multi_get_cf(
rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
const rocksdb_column_family_handle_t* const* column_families,
@ -5803,6 +6047,38 @@ void rocksdb_transaction_multi_get_cf(
}
}
void rocksdb_transaction_multi_get_for_update_cf(
rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
const rocksdb_column_family_handle_t* const* column_families,
size_t num_keys, const char* const* keys_list,
const size_t* keys_list_sizes, char** values_list,
size_t* values_list_sizes, char** errs) {
std::vector<Slice> keys(num_keys);
std::vector<ColumnFamilyHandle*> cfs(num_keys);
for (size_t i = 0; i < num_keys; i++) {
keys[i] = Slice(keys_list[i], keys_list_sizes[i]);
cfs[i] = column_families[i]->rep;
}
std::vector<std::string> values(num_keys);
std::vector<Status> statuses =
txn->rep->MultiGetForUpdate(options->rep, cfs, keys, &values);
for (size_t i = 0; i < num_keys; i++) {
if (statuses[i].ok()) {
values_list[i] = CopyString(values[i]);
values_list_sizes[i] = values[i].size();
errs[i] = nullptr;
} else {
values_list[i] = nullptr;
values_list_sizes[i] = 0;
if (!statuses[i].IsNotFound()) {
errs[i] = strdup(statuses[i].ToString().c_str());
} else {
errs[i] = nullptr;
}
}
}
}
// Read a key outside a transaction
char* rocksdb_transactiondb_get(rocksdb_transactiondb_t* txn_db,
const rocksdb_readoptions_t* options,
@ -6104,6 +6380,18 @@ void rocksdb_transactiondb_flush_cf(
SaveError(errptr, txn_db->rep->Flush(options->rep, column_family->rep));
}
void rocksdb_transactiondb_flush_cfs(
rocksdb_transactiondb_t* txn_db, const rocksdb_flushoptions_t* options,
rocksdb_column_family_handle_t** column_families, int num_column_families,
char** errptr) {
std::vector<ColumnFamilyHandle*> column_family_handles;
for (int i = 0; i < num_column_families; i++) {
column_family_handles.push_back(column_families[i]->rep);
}
SaveError(errptr, txn_db->rep->Flush(options->rep, column_family_handles));
}
rocksdb_checkpoint_t* rocksdb_transactiondb_checkpoint_object_create(
rocksdb_transactiondb_t* txn_db, char** errptr) {
Checkpoint* checkpoint;
@ -6391,6 +6679,59 @@ void rocksdb_enable_manual_compaction(rocksdb_t* db) {
db->rep->EnableManualCompaction();
}
} // end extern "C"
rocksdb_statistics_histogram_data_t*
rocksdb_statistics_histogram_data_create() {
return new rocksdb_statistics_histogram_data_t{};
}
void rocksdb_statistics_histogram_data_destroy(
rocksdb_statistics_histogram_data_t* data) {
delete data;
}
double rocksdb_statistics_histogram_data_get_median(
rocksdb_statistics_histogram_data_t* data) {
return data->rep.median;
}
double rocksdb_statistics_histogram_data_get_p95(
rocksdb_statistics_histogram_data_t* data) {
return data->rep.percentile95;
}
double rocksdb_statistics_histogram_data_get_p99(
rocksdb_statistics_histogram_data_t* data) {
return data->rep.percentile99;
}
double rocksdb_statistics_histogram_data_get_average(
rocksdb_statistics_histogram_data_t* data) {
return data->rep.average;
}
double rocksdb_statistics_histogram_data_get_std_dev(
rocksdb_statistics_histogram_data_t* data) {
return data->rep.standard_deviation;
}
double rocksdb_statistics_histogram_data_get_max(
rocksdb_statistics_histogram_data_t* data) {
return data->rep.max;
}
uint64_t rocksdb_statistics_histogram_data_get_count(
rocksdb_statistics_histogram_data_t* data) {
return data->rep.count;
}
uint64_t rocksdb_statistics_histogram_data_get_sum(
rocksdb_statistics_histogram_data_t* data) {
return data->rep.sum;
}
#endif // !ROCKSDB_LITE
double rocksdb_statistics_histogram_data_get_min(
rocksdb_statistics_histogram_data_t* data) {
return data->rep.min;
}
} // end extern "C"

@ -3,17 +3,14 @@
found in the LICENSE file. See the AUTHORS file for names of contributors. */
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include <stdio.h>
#ifndef ROCKSDB_LITE // Lite does not support C API
#include "rocksdb/c.h"
#include <assert.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include "rocksdb/c.h"
#ifndef OS_WIN
#include <unistd.h>
#endif
@ -490,6 +487,19 @@ static void CheckTxnPinGetCF(rocksdb_transaction_t* txn,
rocksdb_pinnableslice_destroy(p);
}
static void CheckTxnGetForUpdate(rocksdb_transaction_t* txn,
const rocksdb_readoptions_t* options,
const char* key, const char* expected) {
char* err = NULL;
size_t val_len;
char* val;
val = rocksdb_transaction_get_for_update(txn, options, key, strlen(key),
&val_len, true, &err);
CheckNoError(err);
CheckEqual(expected, val, val_len);
Free(&val);
}
static void CheckTxnDBGet(rocksdb_transactiondb_t* txn_db,
const rocksdb_readoptions_t* options, const char* key,
const char* expected) {
@ -517,6 +527,20 @@ static void CheckTxnDBGetCF(rocksdb_transactiondb_t* txn_db,
Free(&val);
}
static void CheckTxnGetForUpdateCF(
rocksdb_transaction_t* txn, const rocksdb_readoptions_t* options,
rocksdb_column_family_handle_t* column_family, const char* key,
const char* expected) {
char* err = NULL;
size_t val_len;
char* val;
val = rocksdb_transaction_get_for_update_cf(
txn, options, column_family, key, strlen(key), &val_len, true, &err);
CheckNoError(err);
CheckEqual(expected, val, val_len);
Free(&val);
}
static void CheckTxnDBPinGet(rocksdb_transactiondb_t* txn_db,
const rocksdb_readoptions_t* options,
const char* key, const char* expected) {
@ -696,6 +720,8 @@ int main(int argc, char** argv) {
rocksdb_options_add_compact_on_deletion_collector_factory(options, 10000,
10001);
rocksdb_options_add_compact_on_deletion_collector_factory_del_ratio(
options, 10000, 10001, 0.0);
StartPhase("destroy");
rocksdb_destroy_db(options, dbname, &err);
@ -1647,7 +1673,8 @@ int main(int argc, char** argv) {
rocksdb_options_set_prefix_extractor(
options, rocksdb_slicetransform_create_fixed_prefix(3));
rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4);
rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16);
rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16, 0, 0, 0,
0);
rocksdb_options_set_allow_concurrent_memtable_write(options, 0);
db = rocksdb_open(options, dbname, &err);
@ -2033,6 +2060,15 @@ int main(int argc, char** argv) {
CheckCondition(29.0 ==
rocksdb_options_get_experimental_mempurge_threshold(o));
CheckCondition(rocksdb_statistics_level_disable_all ==
rocksdb_options_get_statistics_level(o));
rocksdb_options_enable_statistics(o);
CheckCondition(rocksdb_statistics_level_disable_all !=
rocksdb_options_get_statistics_level(o));
rocksdb_options_set_statistics_level(o, rocksdb_statistics_level_all);
CheckCondition(rocksdb_statistics_level_all ==
rocksdb_options_get_statistics_level(o));
/* Blob Options */
rocksdb_options_set_enable_blob_files(o, 1);
CheckCondition(1 == rocksdb_options_get_enable_blob_files(o));
@ -2572,6 +2608,9 @@ int main(int argc, char** argv) {
rocksdb_readoptions_set_io_timeout(ro, 400);
CheckCondition(400 == rocksdb_readoptions_get_io_timeout(ro));
rocksdb_readoptions_set_async_io(ro, 1);
CheckCondition(1 == rocksdb_readoptions_get_async_io(ro));
rocksdb_readoptions_destroy(ro);
}
@ -3091,6 +3130,17 @@ int main(int argc, char** argv) {
CheckTxnDBGetCF(txn_db, roptions, cfh, "cf_foo", NULL);
CheckTxnDBPinGetCF(txn_db, roptions, cfh, "cf_foo", NULL);
// memory usage
rocksdb_t* base_db = rocksdb_transactiondb_get_base_db(txn_db);
rocksdb_memory_consumers_t* consumers = rocksdb_memory_consumers_create();
rocksdb_memory_consumers_add_db(consumers, base_db);
rocksdb_memory_usage_t* usage =
rocksdb_approximate_memory_usage_create(consumers, &err);
CheckNoError(err);
rocksdb_approximate_memory_usage_destroy(usage);
rocksdb_memory_consumers_destroy(consumers);
rocksdb_transactiondb_close_base_db(base_db);
// flush
rocksdb_flushoptions_t* flush_options = rocksdb_flushoptions_create();
rocksdb_flushoptions_set_wait(flush_options, 1);
@ -3203,6 +3253,120 @@ int main(int argc, char** argv) {
rocksdb_transactiondb_options_destroy(txn_db_options);
}
StartPhase("transactions_multi_get_for_update");
{
// open a TransactionDB
txn_db_options = rocksdb_transactiondb_options_create();
rocksdb_transactiondb_options_set_transaction_lock_timeout(txn_db_options,
0);
txn_options = rocksdb_transaction_options_create();
rocksdb_options_set_create_if_missing(options, 1);
txn_db = rocksdb_transactiondb_open(options, txn_db_options, dbname, &err);
CheckNoError(err);
rocksdb_transactiondb_put(txn_db, woptions, "foo", 3, "hey", 3, &err);
CheckNoError(err);
rocksdb_transactiondb_put(txn_db, woptions, "bar", 3, "hello", 5, &err);
CheckNoError(err);
// begin transactions
txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
rocksdb_transaction_t* txn2 =
rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
// multi get
{
const char* keys[2] = {"foo", "bar"};
const size_t keys_sizes[2] = {3, 3};
char* vals[2];
size_t vals_sizes[2];
char* errs[2];
const char* expected[2] = {"hey", "hello"};
rocksdb_transaction_multi_get_for_update(
txn, roptions, 2, keys, keys_sizes, vals, vals_sizes, errs);
CheckMultiGetValues(2, vals, vals_sizes, errs, expected);
}
char* conflict_err = NULL;
size_t val_len;
rocksdb_transaction_get_for_update(txn2, roptions, "foo", 3, &val_len, true,
&conflict_err);
// get-for-update conflict
CheckCondition(conflict_err != NULL);
Free(&conflict_err);
// commit
rocksdb_transaction_commit(txn, &err);
CheckNoError(err);
// should work after the first txn is committed
CheckTxnGetForUpdate(txn2, roptions, "foo", "hey");
// commit the second one
rocksdb_transaction_commit(txn2, &err);
CheckNoError(err);
// destroy txns
rocksdb_transaction_destroy(txn);
rocksdb_transaction_destroy(txn2);
// same for column families
rocksdb_column_family_handle_t* cfh;
cfh = rocksdb_transactiondb_create_column_family(txn_db, options,
"txn_db_cf", &err);
CheckNoError(err);
rocksdb_transactiondb_put_cf(txn_db, woptions, cfh, "cf_foo", 6, "cf_hello",
8, &err);
CheckNoError(err);
rocksdb_transactiondb_put_cf(txn_db, woptions, cfh, "cf_bar", 6, "cf_hey",
6, &err);
CheckNoError(err);
txn = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
txn2 = rocksdb_transaction_begin(txn_db, woptions, txn_options, NULL);
{
const rocksdb_column_family_handle_t* get_handles[2] = {cfh, cfh};
const char* keys[2] = {"cf_foo", "cf_bar"};
const size_t keys_sizes[2] = {6, 6};
char* vals[2];
size_t vals_sizes[2];
char* errs[2];
const char* expected[2] = {"cf_hello", "cf_hey"};
rocksdb_transaction_multi_get_for_update_cf(txn, roptions, get_handles, 2,
keys, keys_sizes, vals,
vals_sizes, errs);
CheckMultiGetValues(2, vals, vals_sizes, errs, expected);
}
char* conflict_err_cf = NULL;
size_t val_len_cf;
rocksdb_transaction_get_for_update_cf(txn2, roptions, cfh, "cf_foo", 6,
&val_len_cf, true, &conflict_err_cf);
CheckCondition(conflict_err_cf != NULL);
Free(&conflict_err_cf);
rocksdb_transaction_commit(txn, &err);
CheckNoError(err);
CheckTxnGetForUpdateCF(txn2, roptions, cfh, "cf_foo", "cf_hello");
rocksdb_transaction_commit(txn2, &err);
CheckNoError(err);
// close and destroy
rocksdb_column_family_handle_destroy(cfh);
rocksdb_transaction_destroy(txn);
rocksdb_transaction_destroy(txn2);
rocksdb_transactiondb_close(txn_db);
rocksdb_destroy_db(options, dbname, &err);
CheckNoError(err);
rocksdb_transaction_options_destroy(txn_options);
rocksdb_transactiondb_options_destroy(txn_db_options);
}
StartPhase("optimistic_transactions");
{
rocksdb_options_t* db_options = rocksdb_options_create();
@ -3232,8 +3396,19 @@ int main(int argc, char** argv) {
rocksdb_put(db, woptions, "key", 3, "value", 5, &err);
CheckNoError(err);
rocksdb_column_family_handle_t *cfh1, *cfh2;
cfh1 = rocksdb_create_column_family(db, db_options, "txn_db_cf1", &err);
cfh2 = rocksdb_create_column_family(db, db_options, "txn_db_cf2", &err);
char** list_const_cf_names = (char**)malloc(2 * sizeof(char*));
list_const_cf_names[0] = "txn_db_cf1";
list_const_cf_names[1] = "txn_db_cf2";
size_t cflen;
rocksdb_column_family_handle_t** list_cfh = rocksdb_create_column_families(
db, db_options, 2, (const char* const*)list_const_cf_names, &cflen,
&err);
free(list_const_cf_names);
CheckNoError(err);
assert(cflen == 2);
cfh1 = list_cfh[0];
cfh2 = list_cfh[1];
rocksdb_create_column_families_destroy(list_cfh);
txn = rocksdb_optimistictransaction_begin(otxn_db, woptions, otxn_options,
NULL);
rocksdb_transaction_put_cf(txn, cfh1, "key_cf1", 7, "val_cf1", 7, &err);
@ -3447,6 +3622,71 @@ int main(int argc, char** argv) {
rocksdb_readoptions_destroy(ropts);
}
StartPhase("statistics");
{
const uint32_t BYTES_WRITTEN_TICKER = 40;
const uint32_t DB_WRITE_HIST = 1;
rocksdb_statistics_histogram_data_t* hist =
rocksdb_statistics_histogram_data_create();
{
// zero by default
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_median(hist));
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_p95(hist));
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_p99(hist));
CheckCondition(0.0 ==
rocksdb_statistics_histogram_data_get_average(hist));
CheckCondition(0.0 ==
rocksdb_statistics_histogram_data_get_std_dev(hist));
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_max(hist));
CheckCondition(0 == rocksdb_statistics_histogram_data_get_count(hist));
CheckCondition(0 == rocksdb_statistics_histogram_data_get_sum(hist));
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_min(hist));
}
rocksdb_close(db);
rocksdb_destroy_db(options, dbname, &err);
CheckNoError(err);
rocksdb_options_enable_statistics(options);
rocksdb_options_set_statistics_level(options, rocksdb_statistics_level_all);
db = rocksdb_open(options, dbname, &err);
CheckNoError(err);
CheckCondition(0 == rocksdb_options_statistics_get_ticker_count(
options, BYTES_WRITTEN_TICKER));
rocksdb_options_statistics_get_histogram_data(options, DB_WRITE_HIST, hist);
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_median(hist));
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_p95(hist));
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_p99(hist));
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_average(hist));
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_std_dev(hist));
CheckCondition(0.0 == rocksdb_statistics_histogram_data_get_max(hist));
CheckCondition(0 == rocksdb_statistics_histogram_data_get_count(hist));
CheckCondition(0 == rocksdb_statistics_histogram_data_get_sum(hist));
int i;
for (i = 0; i < 10; ++i) {
char key = '0' + (char)i;
rocksdb_put(db, woptions, &key, 1, "", 1, &err);
CheckNoError(err);
}
CheckCondition(0 != rocksdb_options_statistics_get_ticker_count(
options, BYTES_WRITTEN_TICKER));
rocksdb_options_statistics_get_histogram_data(options, DB_WRITE_HIST, hist);
CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_median(hist));
CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_p95(hist));
CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_p99(hist));
CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_average(hist));
CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_std_dev(hist));
CheckCondition(0.0 != rocksdb_statistics_histogram_data_get_max(hist));
CheckCondition(0 != rocksdb_statistics_histogram_data_get_count(hist));
CheckCondition(0 != rocksdb_statistics_histogram_data_get_sum(hist));
rocksdb_statistics_histogram_data_destroy(hist);
}
StartPhase("cancel_all_background_work");
rocksdb_cancel_all_background_work(db, 1);
@ -3465,12 +3705,3 @@ int main(int argc, char** argv) {
fprintf(stderr, "PASS\n");
return 0;
}
#else
int main(void) {
fprintf(stderr, "SKIPPED\n");
return 0;
}
#endif // !ROCKSDB_LITE

@ -53,11 +53,9 @@ ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(
ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
if (cfd_ != nullptr) {
#ifndef ROCKSDB_LITE
for (auto& listener : cfd_->ioptions()->listeners) {
listener->OnColumnFamilyHandleDeletionStarted(this);
}
#endif // ROCKSDB_LITE
// Job id == 0 means that this is not our background process, but rather
// user thread
// Need to hold some shared pointers owned by the initial_cf_options
@ -88,15 +86,10 @@ const std::string& ColumnFamilyHandleImpl::GetName() const {
}
Status ColumnFamilyHandleImpl::GetDescriptor(ColumnFamilyDescriptor* desc) {
#ifndef ROCKSDB_LITE
// accessing mutable cf-options requires db mutex.
InstrumentedMutexLock l(mutex_);
*desc = ColumnFamilyDescriptor(cfd()->GetName(), cfd()->GetLatestCFOptions());
return Status::OK();
#else
(void)desc;
return Status::NotSupported();
#endif // !ROCKSDB_LITE
}
const Comparator* ColumnFamilyHandleImpl::GetComparator() const {
@ -347,7 +340,6 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
result.hard_pending_compaction_bytes_limit;
}
#ifndef ROCKSDB_LITE
// When the DB is stopped, it's possible that there are some .trash files that
// were not deleted yet; when we open the DB we will find these .trash files
// and schedule them to be deleted (or delete immediately if SstFileManager
@ -359,7 +351,6 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
result.cf_paths[i].path)
.PermitUncheckedError();
}
#endif
if (result.cf_paths.empty()) {
result.cf_paths = db_options.db_paths;
@ -391,8 +382,9 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
const uint64_t kAdjustedTtl = 30 * 24 * 60 * 60;
if (result.ttl == kDefaultTtl) {
if (is_block_based_table &&
result.compaction_style != kCompactionStyleFIFO) {
if (is_block_based_table) {
// FIFO also requires max_open_files=-1, which is checked in
// ValidateOptions().
result.ttl = kAdjustedTtl;
} else {
result.ttl = 0;
@ -400,40 +392,35 @@ ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
}
const uint64_t kAdjustedPeriodicCompSecs = 30 * 24 * 60 * 60;
// Turn on periodic compactions and set them to occur once every 30 days if
// compaction filters are used and periodic_compaction_seconds is set to the
// default value.
if (result.compaction_style != kCompactionStyleFIFO) {
if (result.compaction_style == kCompactionStyleLevel) {
if ((result.compaction_filter != nullptr ||
result.compaction_filter_factory != nullptr) &&
result.periodic_compaction_seconds == kDefaultPeriodicCompSecs &&
is_block_based_table) {
result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
}
} else {
// result.compaction_style == kCompactionStyleFIFO
if (result.ttl == 0) {
if (is_block_based_table) {
if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) {
result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
}
result.ttl = result.periodic_compaction_seconds;
}
} else if (result.periodic_compaction_seconds != 0) {
result.ttl = std::min(result.ttl, result.periodic_compaction_seconds);
} else if (result.compaction_style == kCompactionStyleUniversal) {
if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs &&
is_block_based_table) {
result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
}
} else if (result.compaction_style == kCompactionStyleFIFO) {
if (result.periodic_compaction_seconds != kDefaultPeriodicCompSecs) {
ROCKS_LOG_WARN(
db_options.info_log.get(),
"periodic_compaction_seconds does not support FIFO compaction. You"
"may want to set option TTL instead.");
}
}
// TTL compactions would work similar to Periodic Compactions in Universal in
// most of the cases. So, if ttl is set, execute the periodic compaction
// codepath.
if (result.compaction_style == kCompactionStyleUniversal && result.ttl != 0) {
if (result.periodic_compaction_seconds != 0) {
// For universal compaction, `ttl` and `periodic_compaction_seconds` mean the
// same thing, so take the stricter value.
if (result.compaction_style == kCompactionStyleUniversal) {
if (result.periodic_compaction_seconds == 0) {
result.periodic_compaction_seconds = result.ttl;
} else if (result.ttl != 0) {
result.periodic_compaction_seconds =
std::min(result.ttl, result.periodic_compaction_seconds);
} else {
result.periodic_compaction_seconds = result.ttl;
}
}
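// Illustrative sketch, not part of the diff (numbers arbitrary): under
// universal compaction the two knobs collapse to the stricter one, e.g.
//   result.ttl = 10 * 24 * 60 * 60;                          // 10 days
//   result.periodic_compaction_seconds = 30 * 24 * 60 * 60;  // 30 days
// sanitizes to periodic_compaction_seconds == 10 days, whereas
//   result.periodic_compaction_seconds = 0;
// would make it inherit ttl (10 days here as well).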
@ -557,7 +544,6 @@ ColumnFamilyData::ColumnFamilyData(
next_(nullptr),
prev_(nullptr),
log_number_(0),
flush_reason_(FlushReason::kOthers),
column_family_set_(column_family_set),
queued_for_flush_(false),
queued_for_compaction_(false),
@ -603,7 +589,6 @@ ColumnFamilyData::ColumnFamilyData(
if (ioptions_.compaction_style == kCompactionStyleLevel) {
compaction_picker_.reset(
new LevelCompactionPicker(ioptions_, &internal_comparator_));
#ifndef ROCKSDB_LITE
} else if (ioptions_.compaction_style == kCompactionStyleUniversal) {
compaction_picker_.reset(
new UniversalCompactionPicker(ioptions_, &internal_comparator_));
@ -617,7 +602,6 @@ ColumnFamilyData::ColumnFamilyData(
"Column family %s does not use any background compaction. "
"Compactions can only be done via CompactFiles\n",
GetName().c_str());
#endif // !ROCKSDB_LITE
} else {
ROCKS_LOG_ERROR(ioptions_.logger,
"Unable to recognize the specified compaction style %d. "
@ -881,7 +865,7 @@ int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger,
}
} // anonymous namespace
std::pair<WriteStallCondition, ColumnFamilyData::WriteStallCause>
std::pair<WriteStallCondition, WriteStallCause>
ColumnFamilyData::GetWriteStallConditionAndCause(
int num_unflushed_memtables, int num_l0_files,
uint64_t num_compaction_needed_bytes,
@ -954,7 +938,8 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions(
internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_STOPS, 1);
if (compaction_picker_->IsLevel0CompactionInProgress()) {
internal_stats_->AddCFStats(
InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_STOPS, 1);
InternalStats::L0_FILE_COUNT_LIMIT_STOPS_WITH_ONGOING_COMPACTION,
1);
}
ROCKS_LOG_WARN(ioptions_.logger,
"[%s] Stopping writes because we have %d level-0 files",
@ -975,7 +960,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions(
SetupDelay(write_controller, compaction_needed_bytes,
prev_compaction_needed_bytes_, was_stopped,
mutable_cf_options.disable_auto_compactions);
internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_SLOWDOWNS, 1);
internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_DELAYS, 1);
ROCKS_LOG_WARN(
ioptions_.logger,
"[%s] Stalling writes because we have %d immutable memtables "
@ -993,11 +978,11 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions(
SetupDelay(write_controller, compaction_needed_bytes,
prev_compaction_needed_bytes_, was_stopped || near_stop,
mutable_cf_options.disable_auto_compactions);
internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_SLOWDOWNS,
1);
internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_DELAYS, 1);
if (compaction_picker_->IsLevel0CompactionInProgress()) {
internal_stats_->AddCFStats(
InternalStats::LOCKED_L0_FILE_COUNT_LIMIT_SLOWDOWNS, 1);
InternalStats::L0_FILE_COUNT_LIMIT_DELAYS_WITH_ONGOING_COMPACTION,
1);
}
ROCKS_LOG_WARN(ioptions_.logger,
"[%s] Stalling writes because we have %d level-0 files "
@ -1023,7 +1008,7 @@ WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions(
prev_compaction_needed_bytes_, was_stopped || near_stop,
mutable_cf_options.disable_auto_compactions);
internal_stats_->AddCFStats(
InternalStats::PENDING_COMPACTION_BYTES_LIMIT_SLOWDOWNS, 1);
InternalStats::PENDING_COMPACTION_BYTES_LIMIT_DELAYS, 1);
ROCKS_LOG_WARN(
ioptions_.logger,
"[%s] Stalling writes because of estimated pending compaction "
@ -1152,6 +1137,7 @@ Status ColumnFamilyData::RangesOverlapWithMemtables(
*overlap = false;
// Create an InternalIterator over all unflushed memtables
Arena arena;
// TODO: plumb Env::IOActivity
ReadOptions read_opts;
read_opts.total_order_seek = true;
MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena);
@ -1255,30 +1241,11 @@ SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(DBImpl* db) {
// (if no Scrape happens).
assert(ptr != SuperVersion::kSVInUse);
SuperVersion* sv = static_cast<SuperVersion*>(ptr);
if (sv == SuperVersion::kSVObsolete ||
sv->version_number != super_version_number_.load()) {
if (sv == SuperVersion::kSVObsolete) {
RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_ACQUIRES);
SuperVersion* sv_to_delete = nullptr;
if (sv && sv->Unref()) {
RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_CLEANUPS);
db->mutex()->Lock();
// NOTE: underlying resources held by superversion (sst files) might
// not be released until the next background job.
sv->Cleanup();
if (db->immutable_db_options().avoid_unnecessary_blocking_io) {
db->AddSuperVersionsToFreeQueue(sv);
db->SchedulePurge();
} else {
sv_to_delete = sv;
}
} else {
db->mutex()->Lock();
}
db->mutex()->Lock();
sv = super_version_->Ref();
db->mutex()->Unlock();
delete sv_to_delete;
}
assert(sv != nullptr);
return sv;
@ -1409,6 +1376,33 @@ Status ColumnFamilyData::ValidateOptions(
}
}
const auto* ucmp = cf_options.comparator;
assert(ucmp);
if (ucmp->timestamp_size() > 0 &&
!cf_options.persist_user_defined_timestamps) {
if (db_options.atomic_flush) {
return Status::NotSupported(
"Not persisting user-defined timestamps feature is not supported"
"in combination with atomic flush.");
}
if (db_options.allow_concurrent_memtable_write) {
return Status::NotSupported(
"Not persisting user-defined timestamps feature is not supported"
" in combination with concurrent memtable write.");
}
const char* comparator_name = cf_options.comparator->Name();
size_t name_size = strlen(comparator_name);
const char* suffix = ".u64ts";
size_t suffix_size = strlen(suffix);
if (name_size <= suffix_size ||
strcmp(comparator_name + name_size - suffix_size, suffix) != 0) {
return Status::NotSupported(
"Not persisting user-defined timestamps"
"feature only support user-defined timestamps formatted as "
"uint64_t.");
}
}
if (cf_options.enable_blob_garbage_collection) {
if (cf_options.blob_garbage_collection_age_cutoff < 0.0 ||
cf_options.blob_garbage_collection_age_cutoff > 1.0) {
@ -1438,10 +1432,40 @@ Status ColumnFamilyData::ValidateOptions(
"Memtable per key-value checksum protection only supports 0, 1, 2, 4 "
"or 8 bytes per key.");
}
if (std::find(supported.begin(), supported.end(),
cf_options.block_protection_bytes_per_key) == supported.end()) {
return Status::NotSupported(
"Block per key-value checksum protection only supports 0, 1, 2, 4 "
"or 8 bytes per key.");
}
if (!cf_options.compaction_options_fifo.file_temperature_age_thresholds
.empty()) {
if (cf_options.compaction_style != kCompactionStyleFIFO) {
return Status::NotSupported(
"Option file_temperature_age_thresholds only supports FIFO "
"compaction.");
} else if (cf_options.num_levels > 1) {
return Status::NotSupported(
"Option file_temperature_age_thresholds is only supported when "
"num_levels = 1.");
} else {
const auto& ages =
cf_options.compaction_options_fifo.file_temperature_age_thresholds;
assert(ages.size() >= 1);
// Check that the entries are sorted in strictly increasing order of `age`.
for (size_t i = 0; i < ages.size() - 1; ++i) {
if (ages[i].age >= ages[i + 1].age) {
return Status::NotSupported(
"Option file_temperature_age_thresholds requires elements to be "
"sorted in increasing order with respect to `age` field.");
}
}
}
}
return s;
}
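For reference, a column family configuration that would pass these FIFO checks might look like the sketch below; the `temperature` field name and the seconds unit for `age` are assumptions based on the public options headers, not anything shown in this diff.

#include "rocksdb/advanced_options.h"
#include "rocksdb/options.h"

rocksdb::ColumnFamilyOptions MakeFifoTieredOptions() {
  rocksdb::ColumnFamilyOptions cf_opts;
  cf_opts.compaction_style = rocksdb::kCompactionStyleFIFO;
  cf_opts.num_levels = 1;  // the option is only supported with num_levels = 1
  // Entries must be sorted in strictly increasing order of `age`.
  cf_opts.compaction_options_fifo.file_temperature_age_thresholds = {
      {rocksdb::Temperature::kWarm, /*age=*/60 * 60},        // older than 1h
      {rocksdb::Temperature::kCold, /*age=*/24 * 60 * 60}};  // older than 1d
  return cf_opts;
}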
#ifndef ROCKSDB_LITE
Status ColumnFamilyData::SetOptions(
const DBOptions& db_opts,
const std::unordered_map<std::string, std::string>& options_map) {
@ -1460,7 +1484,6 @@ Status ColumnFamilyData::SetOptions(
}
return s;
}
#endif // ROCKSDB_LITE
// REQUIRES: DB mutex held
Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) {
@ -1519,6 +1542,43 @@ FSDirectory* ColumnFamilyData::GetDataDir(size_t path_id) const {
return data_dirs_[path_id].get();
}
bool ColumnFamilyData::ShouldPostponeFlushToRetainUDT(
uint64_t max_memtable_id) {
const Comparator* ucmp = user_comparator();
const size_t ts_sz = ucmp->timestamp_size();
if (ts_sz == 0 || ioptions_.persist_user_defined_timestamps) {
return false;
}
// If users set the `persist_user_defined_timestamps` flag to false, they
// should also set `full_history_ts_low` to indicate the range of
// user-defined timestamps to retain in memory. Otherwise, we do not
// explicitly postpone flushes to retain UDTs.
const std::string& full_history_ts_low = GetFullHistoryTsLow();
if (full_history_ts_low.empty()) {
return false;
}
#ifndef NDEBUG
Slice last_table_newest_udt;
#endif /* !NDEBUG */
for (const Slice& table_newest_udt :
imm()->GetTablesNewestUDT(max_memtable_id)) {
assert(table_newest_udt.size() == full_history_ts_low.size());
assert(last_table_newest_udt.empty() ||
ucmp->CompareTimestamp(table_newest_udt, last_table_newest_udt) >=
0);
// Check the newest UDT contained in each MemTable, iterating in ascending
// ID order up to `max_memtable_id`. A MemTable with a bigger ID has newer
// UDTs, so return immediately upon finding the first MemTable that needs
// postponing.
if (ucmp->CompareTimestamp(table_newest_udt, full_history_ts_low) >= 0) {
return true;
}
#ifndef NDEBUG
last_table_newest_udt = table_newest_udt;
#endif /* !NDEBUG */
}
return false;
}
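A minimal sketch of the user-side setup this check reacts to, assuming the public `BytewiseComparatorWithU64Ts()` comparator and the `DB::IncreaseFullHistoryTsLow()` API; the encoding helper mirrors the little-endian `PutFixed64()` used by the tests and is written out by hand to stay self-contained.

#include <cstdint>
#include <string>

#include "rocksdb/comparator.h"
#include "rocksdb/db.h"

std::string EncodeU64Ts(uint64_t ts) {
  std::string out(8, '\0');
  for (int i = 0; i < 8; ++i) {
    out[i] = static_cast<char>(ts >> (8 * i));  // little-endian fixed64
  }
  return out;
}

void SetupUdtWithoutPersistence() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.comparator = rocksdb::BytewiseComparatorWithU64Ts();
  options.persist_user_defined_timestamps = false;
  // Required by ValidateOptions() when UDTs are not persisted.
  options.allow_concurrent_memtable_write = false;
  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/udt_demo", &db);
  if (!s.ok()) {
    return;
  }
  // Declare history below timestamp 100 as no longer needed; flushes of
  // memtables whose newest UDT is at or above this cutoff may be postponed.
  s = db->IncreaseFullHistoryTsLow(db->DefaultColumnFamily(), EncodeU64Ts(100));
  delete db;
}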
void ColumnFamilyData::RecoverEpochNumbers() {
assert(current_);
auto* vstorage = current_->storage_info();
@ -1621,6 +1681,13 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
db_id_, db_session_id_);
column_families_.insert({name, id});
column_family_data_.insert({id, new_cfd});
auto ucmp = new_cfd->user_comparator();
assert(ucmp);
size_t ts_sz = ucmp->timestamp_size();
running_ts_sz_.insert({id, ts_sz});
if (ts_sz > 0) {
ts_sz_for_record_.insert({id, ts_sz});
}
max_column_family_ = std::max(max_column_family_, id);
// add to linked list
new_cfd->next_ = dummy_cfd_;
@ -1636,10 +1703,13 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
// under a DB mutex AND from a write thread
void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
auto cfd_iter = column_family_data_.find(cfd->GetID());
uint32_t cf_id = cfd->GetID();
auto cfd_iter = column_family_data_.find(cf_id);
assert(cfd_iter != column_family_data_.end());
column_family_data_.erase(cfd_iter);
column_families_.erase(cfd->GetName());
running_ts_sz_.erase(cf_id);
ts_sz_for_record_.erase(cf_id);
}
// under a DB mutex OR from a write thread

@ -310,10 +310,6 @@ class ColumnFamilyData {
void SetLogNumber(uint64_t log_number) { log_number_ = log_number; }
uint64_t GetLogNumber() const { return log_number_; }
void SetFlushReason(FlushReason flush_reason) {
flush_reason_ = flush_reason;
}
FlushReason GetFlushReason() const { return flush_reason_; }
// thread-safe
const FileOptions* soptions() const;
const ImmutableOptions* ioptions() const { return &ioptions_; }
@ -339,12 +335,10 @@ class ColumnFamilyData {
// Validate CF options against DB options
static Status ValidateOptions(const DBOptions& db_options,
const ColumnFamilyOptions& cf_options);
#ifndef ROCKSDB_LITE
// REQUIRES: DB mutex held
Status SetOptions(
const DBOptions& db_options,
const std::unordered_map<std::string, std::string>& options_map);
#endif // ROCKSDB_LITE
InternalStats* internal_stats() { return internal_stats_.get(); }
@ -468,12 +462,6 @@ class ColumnFamilyData {
bool queued_for_flush() { return queued_for_flush_; }
bool queued_for_compaction() { return queued_for_compaction_; }
enum class WriteStallCause {
kNone,
kMemtableLimit,
kL0FileCountLimit,
kPendingCompactionBytes,
};
static std::pair<WriteStallCondition, WriteStallCause>
GetWriteStallConditionAndCause(
int num_unflushed_memtables, int num_l0_files,
@ -518,6 +506,12 @@ class ColumnFamilyData {
return full_history_ts_low_;
}
// REQUIRES: DB mutex held.
// Return true if flushing MemTables up to ID `max_memtable_id` should be
// postponed to retain user-defined timestamps according to the user's
// setting. Called by the background flush job.
bool ShouldPostponeFlushToRetainUDT(uint64_t max_memtable_id);
ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; }
std::shared_ptr<CacheReservationManager>
@ -616,8 +610,6 @@ class ColumnFamilyData {
// recovered from
uint64_t log_number_;
std::atomic<FlushReason> flush_reason_;
// An object that keeps all the compaction stats
// and picks the next compaction
std::unique_ptr<CompactionPicker> compaction_picker_;
@ -719,6 +711,16 @@ class ColumnFamilySet {
Version* dummy_version,
const ColumnFamilyOptions& options);
const UnorderedMap<uint32_t, size_t>& GetRunningColumnFamiliesTimestampSize()
const {
return running_ts_sz_;
}
const UnorderedMap<uint32_t, size_t>&
GetColumnFamiliesTimestampSizeForRecord() const {
return ts_sz_for_record_;
}
iterator begin() { return iterator(dummy_cfd_->next_); }
iterator end() { return iterator(dummy_cfd_); }
@ -744,6 +746,15 @@ class ColumnFamilySet {
UnorderedMap<std::string, uint32_t> column_families_;
UnorderedMap<uint32_t, ColumnFamilyData*> column_family_data_;
// Mutating / reading `running_ts_sz_` and `ts_sz_for_record_` follow
// the same requirements as `column_families_` and `column_family_data_`.
// Mapping from column family id to user-defined timestamp size for all
// running column families.
UnorderedMap<uint32_t, size_t> running_ts_sz_;
// Mapping from column family id to user-defined timestamp size for
// column families with non-zero user-defined timestamp size.
UnorderedMap<uint32_t, size_t> ts_sz_for_record_;
uint32_t max_column_family_;
const FileOptions file_options_;

@ -17,6 +17,7 @@
#include "options/options_parser.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/comparator.h"
#include "rocksdb/convenience.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
@ -63,6 +64,9 @@ class ColumnFamilyTestBase : public testing::Test {
db_options_.create_if_missing = true;
db_options_.fail_if_options_file_error = true;
db_options_.env = env_;
}
void SetUp() override {
EXPECT_OK(DestroyDB(dbname_, Options(db_options_, column_family_options_)));
}
@ -71,11 +75,7 @@ class ColumnFamilyTestBase : public testing::Test {
for (auto h : handles_) {
ColumnFamilyDescriptor cfdescriptor;
Status s = h->GetDescriptor(&cfdescriptor);
#ifdef ROCKSDB_LITE
EXPECT_TRUE(s.IsNotSupported());
#else
EXPECT_OK(s);
#endif // ROCKSDB_LITE
column_families.push_back(cfdescriptor);
}
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
@ -197,12 +197,10 @@ class ColumnFamilyTestBase : public testing::Test {
&db_);
}
#ifndef ROCKSDB_LITE // ReadOnlyDB is not supported
void AssertOpenReadOnly(std::vector<std::string> cf,
std::vector<ColumnFamilyOptions> options = {}) {
ASSERT_OK(OpenReadOnly(cf, options));
}
#endif // !ROCKSDB_LITE
void Open(std::vector<std::string> cf,
std::vector<ColumnFamilyOptions> options = {}) {
@ -224,27 +222,16 @@ class ColumnFamilyTestBase : public testing::Test {
}
bool IsDbWriteStopped() {
#ifndef ROCKSDB_LITE
uint64_t v;
EXPECT_TRUE(dbfull()->GetIntProperty("rocksdb.is-write-stopped", &v));
return (v == 1);
#else
return dbfull()->TEST_write_controler().IsStopped();
#endif // !ROCKSDB_LITE
}
uint64_t GetDbDelayedWriteRate() {
#ifndef ROCKSDB_LITE
uint64_t v;
EXPECT_TRUE(
dbfull()->GetIntProperty("rocksdb.actual-delayed-write-rate", &v));
return v;
#else
if (!dbfull()->TEST_write_controler().NeedsDelay()) {
return 0;
}
return dbfull()->TEST_write_controler().delayed_write_rate();
#endif // !ROCKSDB_LITE
}
void Destroy(const std::vector<ColumnFamilyDescriptor>& column_families =
@ -267,7 +254,6 @@ class ColumnFamilyTestBase : public testing::Test {
db_->CreateColumnFamily(current_cf_opt, cfs[i], &handles_[cfi]));
names_[cfi] = cfs[i];
#ifndef ROCKSDB_LITE // RocksDBLite does not support GetDescriptor
// Verify the CF options of the returned CF handle.
ColumnFamilyDescriptor desc;
ASSERT_OK(handles_[cfi]->GetDescriptor(&desc));
@ -276,7 +262,6 @@ class ColumnFamilyTestBase : public testing::Test {
ASSERT_OK(RocksDBOptionsParser::VerifyCFOptions(
ConfigOptions(), desc.options,
SanitizeOptions(dbfull()->immutable_db_options(), current_cf_opt)));
#endif // !ROCKSDB_LITE
cfi++;
}
}
@ -325,7 +310,6 @@ class ColumnFamilyTestBase : public testing::Test {
ASSERT_OK(db_->FlushWAL(/*sync=*/false));
}
#ifndef ROCKSDB_LITE // TEST functions in DB are not supported in lite
void WaitForFlush(int cf) {
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable(handles_[cf]));
}
@ -339,7 +323,6 @@ class ColumnFamilyTestBase : public testing::Test {
void AssertMaxTotalInMemoryState(uint64_t value) {
ASSERT_EQ(value, MaxTotalInMemoryState());
}
#endif // !ROCKSDB_LITE
Status Put(int cf, const std::string& key, const std::string& value) {
return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(value));
@ -377,7 +360,6 @@ class ColumnFamilyTestBase : public testing::Test {
"rocksdb.num-files-at-level" + std::to_string(level));
}
#ifndef ROCKSDB_LITE
// Return spread of files per level
std::string FilesPerLevel(int cf) {
std::string result;
@ -394,31 +376,19 @@ class ColumnFamilyTestBase : public testing::Test {
result.resize(last_non_zero_offset);
return result;
}
#endif
void AssertFilesPerLevel(const std::string& value, int cf) {
#ifndef ROCKSDB_LITE
ASSERT_EQ(value, FilesPerLevel(cf));
#else
(void)value;
(void)cf;
#endif
}
#ifndef ROCKSDB_LITE // GetLiveFilesMetaData is not supported
int CountLiveFiles() {
std::vector<LiveFileMetaData> metadata;
db_->GetLiveFilesMetaData(&metadata);
return static_cast<int>(metadata.size());
}
#endif // !ROCKSDB_LITE
void AssertCountLiveFiles(int expected_value) {
#ifndef ROCKSDB_LITE
ASSERT_EQ(expected_value, CountLiveFiles());
#else
(void)expected_value;
#endif
}
// Do n memtable flushes, each of which produces an sstable
@ -432,7 +402,6 @@ class ColumnFamilyTestBase : public testing::Test {
}
}
#ifndef ROCKSDB_LITE // GetSortedWalFiles is not supported
int CountLiveLogFiles() {
int micros_wait_for_log_deletion = 20000;
env_->SleepForMicroseconds(micros_wait_for_log_deletion);
@ -461,25 +430,18 @@ class ColumnFamilyTestBase : public testing::Test {
return ret;
return 0;
}
#endif // !ROCKSDB_LITE
void AssertCountLiveLogFiles(int value) {
#ifndef ROCKSDB_LITE // GetSortedWalFiles is not supported
ASSERT_EQ(value, CountLiveLogFiles());
#else
(void)value;
#endif // !ROCKSDB_LITE
}
void AssertNumberOfImmutableMemtables(std::vector<int> num_per_cf) {
assert(num_per_cf.size() == handles_.size());
#ifndef ROCKSDB_LITE // GetProperty is not supported in lite
for (size_t i = 0; i < num_per_cf.size(); ++i) {
ASSERT_EQ(num_per_cf[i], GetProperty(static_cast<int>(i),
"rocksdb.num-immutable-mem-table"));
}
#endif // !ROCKSDB_LITE
}
void CopyFile(const std::string& source, const std::string& destination,
@ -575,7 +537,6 @@ TEST_P(ColumnFamilyTest, DontReuseColumnFamilyID) {
}
}
#ifndef ROCKSDB_LITE
TEST_P(ColumnFamilyTest, CreateCFRaceWithGetAggProperty) {
Open();
@ -598,7 +559,6 @@ TEST_P(ColumnFamilyTest, CreateCFRaceWithGetAggProperty) {
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}
#endif // !ROCKSDB_LITE
class FlushEmptyCFTestWithParam
: public ColumnFamilyTestBase,
@ -942,7 +902,6 @@ TEST_P(ColumnFamilyTest, IgnoreRecoveredLog) {
}
}
#ifndef ROCKSDB_LITE // TEST functions used are not supported
TEST_P(ColumnFamilyTest, FlushTest) {
Open();
CreateColumnFamiliesAndReopen({"one", "two"});
@ -1057,7 +1016,6 @@ TEST_P(ColumnFamilyTest, LogDeletionTest) {
AssertCountLiveLogFiles(4);
Close();
}
#endif // !ROCKSDB_LITE
TEST_P(ColumnFamilyTest, CrashAfterFlush) {
std::unique_ptr<FaultInjectionTestEnv> fault_env(
@ -1097,7 +1055,6 @@ TEST_P(ColumnFamilyTest, OpenNonexistentColumnFamily) {
ASSERT_TRUE(TryOpen({"default", "dne"}).IsInvalidArgument());
}
#ifndef ROCKSDB_LITE // WaitForFlush() is not supported
// Makes sure that obsolete log files get deleted
TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) {
// disable flushing stale column families
@ -1205,14 +1162,12 @@ TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) {
AssertCountLiveLogFiles(7);
Close();
}
#endif // !ROCKSDB_LITE
// The test is commented out because we want to test that a snapshot is
// not created for memtables that do not support it, but there isn't a
// memtable that doesn't support snapshots right now. If we add one later,
// we can re-enable the test.
//
// #ifndef ROCKSDB_LITE // Cuckoo is not supported in lite
// TEST_P(ColumnFamilyTest, MemtableNotSupportSnapshot) {
// db_options_.allow_concurrent_memtable_write = false;
// Open();
@ -1232,7 +1187,6 @@ TEST_P(ColumnFamilyTest, DifferentWriteBufferSizes) {
// {second}); auto* s3 = dbfull()->GetSnapshot(); ASSERT_TRUE(s3 == nullptr);
// Close();
// }
// #endif // !ROCKSDB_LITE
class TestComparator : public Comparator {
int Compare(const ROCKSDB_NAMESPACE::Slice& /*a*/,
@ -1299,13 +1253,13 @@ TEST_P(ColumnFamilyTest, DifferentMergeOperators) {
Close();
}
#ifndef ROCKSDB_LITE // WaitForFlush() is not supported
TEST_P(ColumnFamilyTest, DifferentCompactionStyles) {
Open();
CreateColumnFamilies({"one", "two"});
ColumnFamilyOptions default_cf, one, two;
db_options_.max_open_files = 20; // only 10 files in file cache
default_cf.level_compaction_dynamic_level_bytes = false;
default_cf.compaction_style = kCompactionStyleLevel;
default_cf.num_levels = 3;
default_cf.write_buffer_size = 64 << 10; // 64KB
@ -1323,6 +1277,7 @@ TEST_P(ColumnFamilyTest, DifferentCompactionStyles) {
one.level0_file_num_compaction_trigger = 4;
one.write_buffer_size = 120000;
two.level_compaction_dynamic_level_bytes = false;
two.compaction_style = kCompactionStyleLevel;
two.num_levels = 4;
two.level0_file_num_compaction_trigger = 3;
@ -1367,9 +1322,7 @@ TEST_P(ColumnFamilyTest, DifferentCompactionStyles) {
Close();
}
#endif // !ROCKSDB_LITE
#ifndef ROCKSDB_LITE
// Sync points not supported in RocksDB Lite
TEST_P(ColumnFamilyTest, MultipleManualCompactions) {
@ -1379,6 +1332,7 @@ TEST_P(ColumnFamilyTest, MultipleManualCompactions) {
db_options_.max_open_files = 20; // only 10 files in file cache
db_options_.max_background_compactions = 3;
default_cf.level_compaction_dynamic_level_bytes = false;
default_cf.compaction_style = kCompactionStyleLevel;
default_cf.num_levels = 3;
default_cf.write_buffer_size = 64 << 10; // 64KB
@ -1395,6 +1349,7 @@ TEST_P(ColumnFamilyTest, MultipleManualCompactions) {
one.level0_file_num_compaction_trigger = 4;
one.write_buffer_size = 120000;
two.level_compaction_dynamic_level_bytes = false;
two.compaction_style = kCompactionStyleLevel;
two.num_levels = 4;
two.level0_file_num_compaction_trigger = 3;
@ -1477,13 +1432,14 @@ TEST_P(ColumnFamilyTest, AutomaticAndManualCompactions) {
db_options_.max_open_files = 20; // only 10 files in file cache
db_options_.max_background_compactions = 3;
default_cf.level_compaction_dynamic_level_bytes = false;
default_cf.compaction_style = kCompactionStyleLevel;
default_cf.num_levels = 3;
default_cf.write_buffer_size = 64 << 10; // 64KB
default_cf.target_file_size_base = 30 << 10;
default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
table_options.no_block_cache = true;
default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
@ -1494,6 +1450,7 @@ TEST_P(ColumnFamilyTest, AutomaticAndManualCompactions) {
one.level0_file_num_compaction_trigger = 4;
one.write_buffer_size = 120000;
two.level_compaction_dynamic_level_bytes = false;
two.compaction_style = kCompactionStyleLevel;
two.num_levels = 4;
two.level0_file_num_compaction_trigger = 3;
@ -1572,13 +1529,14 @@ TEST_P(ColumnFamilyTest, ManualAndAutomaticCompactions) {
db_options_.max_open_files = 20; // only 10 files in file cache
db_options_.max_background_compactions = 3;
default_cf.level_compaction_dynamic_level_bytes = false;
default_cf.compaction_style = kCompactionStyleLevel;
default_cf.num_levels = 3;
default_cf.write_buffer_size = 64 << 10; // 64KB
default_cf.target_file_size_base = 30 << 10;
default_cf.max_compaction_bytes = default_cf.target_file_size_base * 1100;
BlockBasedTableOptions table_options = GetBlockBasedTableOptions();
table_options.no_block_cache = true;
default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
@ -1589,6 +1547,7 @@ TEST_P(ColumnFamilyTest, ManualAndAutomaticCompactions) {
one.level0_file_num_compaction_trigger = 4;
one.write_buffer_size = 120000;
two.level_compaction_dynamic_level_bytes = false;
two.compaction_style = kCompactionStyleLevel;
two.num_levels = 4;
two.level0_file_num_compaction_trigger = 3;
@ -2033,9 +1992,7 @@ TEST_P(ColumnFamilyTest, SameCFAutomaticManualCompactions) {
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
}
#endif // !ROCKSDB_LITE
#ifndef ROCKSDB_LITE // Tailing iterator not supported
namespace {
std::string IterStatus(Iterator* iter) {
std::string result;
@ -2093,9 +2050,7 @@ TEST_P(ColumnFamilyTest, NewIteratorsTest) {
Destroy();
}
}
#endif // !ROCKSDB_LITE
#ifndef ROCKSDB_LITE // ReadOnlyDB is not supported
TEST_P(ColumnFamilyTest, ReadOnlyDBTest) {
Open();
CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
@ -2144,9 +2099,7 @@ TEST_P(ColumnFamilyTest, ReadOnlyDBTest) {
s = OpenReadOnly({"one", "four"});
ASSERT_TRUE(!s.ok());
}
#endif // !ROCKSDB_LITE
#ifndef ROCKSDB_LITE // WaitForFlush() is not supported in lite
TEST_P(ColumnFamilyTest, DontRollEmptyLogs) {
Open();
CreateColumnFamiliesAndReopen({"one", "two", "three", "four"});
@ -2168,9 +2121,7 @@ TEST_P(ColumnFamilyTest, DontRollEmptyLogs) {
ASSERT_EQ(static_cast<size_t>(total_new_writable_files), handles_.size() + 1);
Close();
}
#endif // !ROCKSDB_LITE
#ifndef ROCKSDB_LITE // WaitForCompaction() is not supported in lite
TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) {
Open();
CreateColumnFamilies({"one", "two"});
@ -2217,7 +2168,6 @@ TEST_P(ColumnFamilyTest, FlushStaleColumnFamilies) {
ASSERT_EQ(0, dbfull()->TEST_total_log_size());
Close();
}
#endif // !ROCKSDB_LITE
TEST_P(ColumnFamilyTest, CreateMissingColumnFamilies) {
Status s = TryOpen({"one", "two"});
@ -2457,8 +2407,6 @@ TEST_P(ColumnFamilyTest, FlushAndDropRaceCondition) {
Destroy();
}
#ifndef ROCKSDB_LITE
// skipped as persisting options is not supported in ROCKSDB_LITE
namespace {
std::atomic<int> test_stage(0);
std::atomic<bool> ordered_by_writethread(false);
@ -2540,7 +2488,6 @@ TEST_P(ColumnFamilyTest, CreateAndDropRace) {
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks();
}
#endif // !ROCKSDB_LITE
TEST_P(ColumnFamilyTest, WriteStallSingleColumnFamily) {
const uint64_t kBaseRate = 800000u;
@ -2950,7 +2897,6 @@ TEST_P(ColumnFamilyTest, CreateDropAndDestroy) {
ASSERT_OK(db_->DestroyColumnFamilyHandle(cfh));
}
#ifndef ROCKSDB_LITE
TEST_P(ColumnFamilyTest, CreateDropAndDestroyWithoutFileDeletion) {
ColumnFamilyHandle* cfh;
Open();
@ -3005,9 +2951,7 @@ TEST_P(ColumnFamilyTest, FlushCloseWALFiles) {
db_options_.env = env_;
Close();
}
#endif // !ROCKSDB_LITE
#ifndef ROCKSDB_LITE // WaitForFlush() is not supported
TEST_P(ColumnFamilyTest, IteratorCloseWALFile1) {
SpecialEnv env(Env::Default());
db_options_.env = &env;
@ -3114,9 +3058,7 @@ TEST_P(ColumnFamilyTest, IteratorCloseWALFile2) {
db_options_.env = env_;
Close();
}
#endif // !ROCKSDB_LITE
#ifndef ROCKSDB_LITE // TEST functions are not supported in lite
TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) {
SpecialEnv env(Env::Default());
// Allow both of flush and purge job to schedule.
@ -3192,7 +3134,6 @@ TEST_P(ColumnFamilyTest, ForwardIteratorCloseWALFile) {
db_options_.env = env_;
Close();
}
#endif // !ROCKSDB_LITE
// Disable on windows because SyncWAL requires env->IsSyncThreadSafe()
// to return true, which is not the case in unbuffered mode.
@ -3443,6 +3384,205 @@ TEST(ColumnFamilyTest, ValidateMemtableKVChecksumOption) {
ASSERT_OK(ColumnFamilyData::ValidateOptions(db_options, cf_options));
}
// Tests the flushing behavior of a column family to retain user-defined
// timestamps when `persist_user_defined_timestamps` is false.
class ColumnFamilyRetainUDTTest : public ColumnFamilyTestBase {
public:
ColumnFamilyRetainUDTTest() : ColumnFamilyTestBase(kLatestFormatVersion) {}
void SetUp() override {
db_options_.allow_concurrent_memtable_write = false;
column_family_options_.comparator =
test::BytewiseComparatorWithU64TsWrapper();
column_family_options_.persist_user_defined_timestamps = false;
ColumnFamilyTestBase::SetUp();
}
Status Put(int cf, const std::string& key, const std::string& ts,
const std::string& value) {
return db_->Put(WriteOptions(), handles_[cf], Slice(key), Slice(ts),
Slice(value));
}
};
class TestTsComparator : public Comparator {
public:
TestTsComparator() : Comparator(8 /*ts_sz*/) {}
int Compare(const ROCKSDB_NAMESPACE::Slice& /*a*/,
const ROCKSDB_NAMESPACE::Slice& /*b*/) const override {
return 0;
}
const char* Name() const override { return "TestTs"; }
void FindShortestSeparator(
std::string* /*start*/,
const ROCKSDB_NAMESPACE::Slice& /*limit*/) const override {}
void FindShortSuccessor(std::string* /*key*/) const override {}
};
TEST_F(ColumnFamilyRetainUDTTest, SanityCheck) {
Open();
ColumnFamilyOptions cf_options;
cf_options.persist_user_defined_timestamps = false;
TestTsComparator test_comparator;
cf_options.comparator = &test_comparator;
ColumnFamilyHandle* handle;
// Not persisting user-defined timestamps feature only supports user-defined
// timestamps formatted as uint64_t.
ASSERT_TRUE(
db_->CreateColumnFamily(cf_options, "pikachu", &handle).IsNotSupported());
Destroy();
// Not persisting user-defined timestamps feature doesn't work in combination
// with atomic flush.
db_options_.atomic_flush = true;
ASSERT_TRUE(TryOpen({"default"}).IsNotSupported());
// Not persisting user-defined timestamps feature doesn't work in combination
// with concurrent memtable write.
db_options_.atomic_flush = false;
db_options_.allow_concurrent_memtable_write = true;
ASSERT_TRUE(TryOpen({"default"}).IsNotSupported());
Close();
}
TEST_F(ColumnFamilyRetainUDTTest, FullHistoryTsLowNotSet) {
SyncPoint::GetInstance()->SetCallBack(
"DBImpl::BackgroundFlush:CheckFlushRequest:cb", [&](void* arg) {
ASSERT_NE(nullptr, arg);
auto reschedule_count = *static_cast<int*>(arg);
ASSERT_EQ(1, reschedule_count);
});
SyncPoint::GetInstance()->EnableProcessing();
Open();
std::string write_ts;
PutFixed64(&write_ts, 1);
ASSERT_OK(Put(0, "foo", write_ts, "v1"));
// With no `full_history_ts_low` explicitly set by the user, the flush
// proceeds without checking whether its UDTs have expired.
ASSERT_OK(Flush(0));
// After flush, `full_history_ts_low` should be automatically advanced to
// the effective cutoff timestamp: write_ts + 1
std::string cutoff_ts;
PutFixed64(&cutoff_ts, 2);
std::string effective_full_history_ts_low;
ASSERT_OK(
db_->GetFullHistoryTsLow(handles_[0], &effective_full_history_ts_low));
ASSERT_EQ(cutoff_ts, effective_full_history_ts_low);
Close();
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
}
TEST_F(ColumnFamilyRetainUDTTest, AllKeysExpired) {
SyncPoint::GetInstance()->SetCallBack(
"DBImpl::BackgroundFlush:CheckFlushRequest:cb", [&](void* arg) {
ASSERT_NE(nullptr, arg);
auto reschedule_count = *static_cast<int*>(arg);
ASSERT_EQ(1, reschedule_count);
});
SyncPoint::GetInstance()->EnableProcessing();
Open();
std::string write_ts;
PutFixed64(&write_ts, 1);
ASSERT_OK(Put(0, "foo", write_ts, "v1"));
std::string cutoff_ts;
PutFixed64(&cutoff_ts, 3);
ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], cutoff_ts));
// All keys have expired w.r.t. the configured `full_history_ts_low`, so the
// flush continues without the need for a reschedule.
ASSERT_OK(Flush(0));
// `full_history_ts_low` stays unchanged after flush.
std::string effective_full_history_ts_low;
ASSERT_OK(
db_->GetFullHistoryTsLow(handles_[0], &effective_full_history_ts_low));
ASSERT_EQ(cutoff_ts, effective_full_history_ts_low);
Close();
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
}
TEST_F(ColumnFamilyRetainUDTTest, NotAllKeysExpiredFlushToAvoidWriteStall) {
SyncPoint::GetInstance()->SetCallBack(
"DBImpl::BackgroundFlush:CheckFlushRequest:cb", [&](void* arg) {
ASSERT_NE(nullptr, arg);
auto reschedule_count = *static_cast<int*>(arg);
ASSERT_EQ(1, reschedule_count);
});
SyncPoint::GetInstance()->EnableProcessing();
Open();
std::string cutoff_ts;
std::string write_ts;
PutFixed64(&write_ts, 1);
ASSERT_OK(Put(0, "foo", write_ts, "v1"));
PutFixed64(&cutoff_ts, 1);
ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], cutoff_ts));
ASSERT_OK(db_->SetOptions(handles_[0], {{"max_write_buffer_number", "1"}}));
// Not all keys have expired, but the flush continues without a reschedule
// because of the risk of a write stall.
ASSERT_OK(Flush(0));
// After flush, `full_history_ts_low` should be automatically advanced to
// the effective cutoff timestamp: write_ts + 1
std::string effective_full_history_ts_low;
ASSERT_OK(
db_->GetFullHistoryTsLow(handles_[0], &effective_full_history_ts_low));
cutoff_ts.clear();
PutFixed64(&cutoff_ts, 2);
ASSERT_EQ(cutoff_ts, effective_full_history_ts_low);
Close();
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
}
TEST_F(ColumnFamilyRetainUDTTest, NotAllKeysExpiredFlushRescheduled) {
std::string cutoff_ts;
SyncPoint::GetInstance()->SetCallBack(
"DBImpl::AfterRetainUDTReschedule:cb", [&](void* /*arg*/) {
// Increase full_history_ts_low so that all keys have expired after the
// initial FlushRequest is rescheduled.
cutoff_ts.clear();
PutFixed64(&cutoff_ts, 3);
ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], cutoff_ts));
});
SyncPoint::GetInstance()->SetCallBack(
"DBImpl::BackgroundFlush:CheckFlushRequest:cb", [&](void* arg) {
ASSERT_NE(nullptr, arg);
auto reschedule_count = *static_cast<int*>(arg);
ASSERT_EQ(2, reschedule_count);
});
SyncPoint::GetInstance()->EnableProcessing();
Open();
std::string write_ts;
PutFixed64(&write_ts, 1);
ASSERT_OK(Put(0, "foo", write_ts, "v1"));
PutFixed64(&cutoff_ts, 1);
ASSERT_OK(db_->IncreaseFullHistoryTsLow(handles_[0], cutoff_ts));
// Not all keys have expired, and there is no risk of a write stall, so the
// flush is rescheduled. The actual flush happens after `full_history_ts_low`
// is increased to mark all keys expired.
ASSERT_OK(Flush(0));
std::string effective_full_history_ts_low;
ASSERT_OK(
db_->GetFullHistoryTsLow(handles_[0], &effective_full_history_ts_low));
// `full_history_ts_low` stays unchanged.
ASSERT_EQ(cutoff_ts, effective_full_history_ts_low);
Close();
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
}
} // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) {

@ -3,7 +3,6 @@
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#ifndef ROCKSDB_LITE
#include <mutex>
#include <string>
@ -67,6 +66,7 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) {
const int kWriteBufferSize = 10000;
const int kLevel0Trigger = 2;
options.create_if_missing = true;
options.level_compaction_dynamic_level_bytes = false;
options.compaction_style = kCompactionStyleLevel;
// Small slowdown and stop trigger for experimental purpose.
options.level0_slowdown_writes_trigger = 20;
@ -121,7 +121,9 @@ TEST_F(CompactFilesTest, L0ConflictsFiles) {
TEST_F(CompactFilesTest, MultipleLevel) {
Options options;
options.create_if_missing = true;
options.level_compaction_dynamic_level_bytes = true;
// Otherwise a background compaction can happen and
// drain an unnecessary level
options.level_compaction_dynamic_level_bytes = false;
options.num_levels = 6;
// Add listener
FlushedFileCollector* collector = new FlushedFileCollector();
@ -182,7 +184,6 @@ TEST_F(CompactFilesTest, MultipleLevel) {
for (int invalid_output_level = 0; invalid_output_level < 5;
invalid_output_level++) {
s = db->CompactFiles(CompactionOptions(), files, invalid_output_level);
std::cout << s.ToString() << std::endl;
ASSERT_TRUE(s.IsInvalidArgument());
}
@ -359,6 +360,7 @@ TEST_F(CompactFilesTest, CompactionFilterWithGetSv) {
std::shared_ptr<FilterWithGet> cf(new FilterWithGet());
Options options;
options.level_compaction_dynamic_level_bytes = false;
options.create_if_missing = true;
options.compaction_filter = cf.get();
@ -401,6 +403,7 @@ TEST_F(CompactFilesTest, SentinelCompressionType) {
CompactionStyle::kCompactionStyleNone}) {
ASSERT_OK(DestroyDB(db_name_, Options()));
Options options;
options.level_compaction_dynamic_level_bytes = false;
options.compaction_style = compaction_style;
// L0: Snappy, L1: ZSTD, L2: Snappy
options.compression_per_level = {CompressionType::kSnappyCompression,
@ -490,13 +493,3 @@ int main(int argc, char** argv) {
return RUN_ALL_TESTS();
}
#else
#include <stdio.h>
int main(int /*argc*/, char** /*argv*/) {
fprintf(stderr,
"SKIPPED as DBImpl::CompactFiles is not supported in ROCKSDB_LITE\n");
return 0;
}
#endif // !ROCKSDB_LITE

@ -13,6 +13,7 @@
#include <vector>
#include "db/column_family.h"
#include "logging/logging.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/sst_partitioner.h"
#include "test_util/sync_point.h"
@ -20,14 +21,16 @@
namespace ROCKSDB_NAMESPACE {
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
const InternalKey& b) {
auto c = user_cmp->CompareWithoutTimestamp(a.user_key(), b.user_key());
const uint64_t kRangeTombstoneSentinel =
PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
int sstableKeyCompare(const Comparator* uc, const Slice& a, const Slice& b) {
auto c = uc->CompareWithoutTimestamp(ExtractUserKey(a), ExtractUserKey(b));
if (c != 0) {
return c;
}
auto a_footer = ExtractInternalKeyFooter(a.Encode());
auto b_footer = ExtractInternalKeyFooter(b.Encode());
auto a_footer = ExtractInternalKeyFooter(a);
auto b_footer = ExtractInternalKeyFooter(b);
if (a_footer == kRangeTombstoneSentinel) {
if (b_footer != kRangeTombstoneSentinel) {
return -1;
@ -201,6 +204,34 @@ bool Compaction::IsFullCompaction(
return num_files_in_compaction == total_num_files;
}
const TablePropertiesCollection& Compaction::GetTableProperties() {
if (!input_table_properties_initialized_) {
const ReadOptions read_options(Env::IOActivity::kCompaction);
for (size_t i = 0; i < num_input_levels(); ++i) {
for (const FileMetaData* fmd : *(this->inputs(i))) {
std::shared_ptr<const TableProperties> tp;
std::string file_name =
TableFileName(immutable_options_.cf_paths, fmd->fd.GetNumber(),
fmd->fd.GetPathId());
Status s = input_version_->GetTableProperties(read_options, &tp, fmd,
&file_name);
if (s.ok()) {
table_properties_[file_name] = tp;
} else {
ROCKS_LOG_ERROR(immutable_options_.info_log,
"Unable to load table properties for file %" PRIu64
" --- %s\n",
fmd->fd.GetNumber(), s.ToString().c_str());
}
}
}
input_table_properties_initialized_ = true;
}
return table_properties_;
}
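Since `TablePropertiesCollection` maps file names to `std::shared_ptr<const TableProperties>`, a hypothetical caller could summarize the loaded input properties along these lines (illustrative only, not part of this change):

uint64_t total_input_entries = 0;
for (const auto& [file_name, props] : compaction->GetTableProperties()) {
  if (props != nullptr) {
    total_input_entries += props->num_entries;  // entry count of `file_name`
  }
}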
Compaction::Compaction(
VersionStorageInfo* vstorage, const ImmutableOptions& _immutable_options,
const MutableCFOptions& _mutable_cf_options,
@ -462,6 +493,11 @@ bool Compaction::IsTrivialMove() const {
return false;
}
if (compaction_reason_ == CompactionReason::kChangeTemperature) {
// Changing temperature usually requires rewriting the file.
return false;
}
// Used in universal compaction, where a trivial move can be done if the
// input files are non-overlapping
if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) &&
@ -478,26 +514,25 @@ bool Compaction::IsTrivialMove() const {
// assert inputs_.size() == 1
std::unique_ptr<SstPartitioner> partitioner = CreateSstPartitioner();
for (const auto& file : inputs_.front().files) {
std::vector<FileMetaData*> file_grand_parents;
if (output_level_ + 1 >= number_levels_) {
continue;
}
input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest,
&file->largest, &file_grand_parents);
const auto compaction_size =
file->fd.GetFileSize() + TotalFileSize(file_grand_parents);
if (compaction_size > max_compaction_bytes_) {
return false;
}
if (partitioner.get() != nullptr) {
if (!partitioner->CanDoTrivialMove(file->smallest.user_key(),
file->largest.user_key())) {
if (output_level_ + 1 < number_levels_) {
std::unique_ptr<SstPartitioner> partitioner = CreateSstPartitioner();
for (const auto& file : inputs_.front().files) {
std::vector<FileMetaData*> file_grand_parents;
input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest,
&file->largest,
&file_grand_parents);
const auto compaction_size =
file->fd.GetFileSize() + TotalFileSize(file_grand_parents);
if (compaction_size > max_compaction_bytes_) {
return false;
}
if (partitioner.get() != nullptr) {
if (!partitioner->CanDoTrivialMove(file->smallest.user_key(),
file->largest.user_key())) {
return false;
}
}
}
}
@ -555,6 +590,49 @@ bool Compaction::KeyNotExistsBeyondOutputLevel(
return false;
}
bool Compaction::KeyRangeNotExistsBeyondOutputLevel(
const Slice& begin_key, const Slice& end_key,
std::vector<size_t>* level_ptrs) const {
assert(input_version_ != nullptr);
assert(level_ptrs != nullptr);
assert(level_ptrs->size() == static_cast<size_t>(number_levels_));
assert(cfd_->user_comparator()->CompareWithoutTimestamp(begin_key, end_key) <
0);
if (bottommost_level_) {
return true /* does not overlap */;
} else if (output_level_ != 0 &&
cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
const Comparator* user_cmp = cfd_->user_comparator();
for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
const std::vector<FileMetaData*>& files =
input_vstorage_->LevelFiles(lvl);
for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) {
auto* f = files[level_ptrs->at(lvl)];
// Advance until the first file with begin_key <= f->largest.user_key()
if (user_cmp->CompareWithoutTimestamp(begin_key,
f->largest.user_key()) > 0) {
continue;
}
// We know that the previous file prev_f, if it exists, has
// prev_f->largest.user_key() < begin_key.
if (user_cmp->CompareWithoutTimestamp(end_key,
f->smallest.user_key()) <= 0) {
// not overlapping with this level
break;
} else {
// We have:
// - begin_key < end_key,
// - begin_key <= f->largest.user_key(), and
// - end_key > f->smallest.user_key()
return false /* overlap */;
}
}
}
return true /* does not overlap */;
}
return false /* overlaps */;
}
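// Illustrative walk-through with invented files and keys: suppose a level
// beyond the output holds files [a..b], [d..g], [i..k] and the queried range
// is [c, e). [a..b] is skipped because begin_key "c" is greater than its
// largest key "b". For [d..g], end_key "e" is greater than its smallest key
// "d", so the range overlaps and the function returns false. For the query
// [c, d) instead, end_key "d" <= smallest key "d" breaks out of that level
// with no overlap found.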
// Mark (or clear) each file that is being compacted
void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
for (size_t i = 0; i < num_input_levels(); i++) {

@ -18,8 +18,6 @@ namespace ROCKSDB_NAMESPACE {
// The file contains class Compaction, as well as some helper functions
// and data structures used by the class.
const uint64_t kRangeTombstoneSentinel =
PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
// Utility for comparing sstable boundary keys. Returns -1 if either a or b is
// null which provides the property that a==null indicates a key that is less
// than any key and b==null indicates a key that is greater than any key. Note
@ -33,8 +31,19 @@ const uint64_t kRangeTombstoneSentinel =
// that key never appears in the database. We don't want adjacent sstables to
// be considered overlapping if they are separated by the range tombstone
// sentinel.
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
const InternalKey& b);
int sstableKeyCompare(const Comparator* user_cmp, const Slice&, const Slice&);
inline int sstableKeyCompare(const Comparator* user_cmp, const Slice& a,
const InternalKey& b) {
return sstableKeyCompare(user_cmp, a, b.Encode());
}
inline int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
const Slice& b) {
return sstableKeyCompare(user_cmp, a.Encode(), b);
}
inline int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
const InternalKey& b) {
return sstableKeyCompare(user_cmp, a.Encode(), b.Encode());
}
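// Illustrative example of the sentinel property (keys invented): if
// file1.largest is user key "c" with the kRangeTombstoneSentinel footer and
// file2.smallest is "c" at an ordinary sequence number, the user keys tie,
// but only file1's footer equals the sentinel, so sstableKeyCompare() orders
// file1.largest strictly before file2.smallest and the two files are not
// considered overlapping.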
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
const InternalKey& b);
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
@ -205,10 +214,18 @@ class Compaction {
void AddInputDeletions(VersionEdit* edit);
// Returns true if the available information we have guarantees that
// the input "user_key" does not exist in any level beyond "output_level()".
// the input "user_key" does not exist in any level beyond `output_level()`.
bool KeyNotExistsBeyondOutputLevel(const Slice& user_key,
std::vector<size_t>* level_ptrs) const;
// Returns true if the user key range [begin_key, end_key) does not exist
// in any level beyond `output_level()`.
// Used for checking range tombstones, so we assume begin_key < end_key.
// begin_key and end_key should include timestamp if enabled.
bool KeyRangeNotExistsBeyondOutputLevel(
const Slice& begin_key, const Slice& end_key,
std::vector<size_t>* level_ptrs) const;
// Clear all files to indicate that they are not being compacted
// Delete this compaction from the list of running compactions.
//
@ -309,12 +326,16 @@ class Compaction {
int output_level, VersionStorageInfo* vstorage,
const std::vector<CompactionInputFiles>& inputs);
TablePropertiesCollection GetOutputTableProperties() const {
return output_table_properties_;
}
// If called before the compaction finishes, returns the table properties
// of all compaction input files.
// If called after the compaction finishes, returns the table properties of
// all compaction input and output files.
const TablePropertiesCollection& GetTableProperties();
void SetOutputTableProperties(TablePropertiesCollection tp) {
output_table_properties_ = std::move(tp);
void SetOutputTableProperties(
const std::string& file_name,
const std::shared_ptr<const TableProperties>& tp) {
table_properties_[file_name] = tp;
}
Slice GetSmallestUserKey() const { return smallest_user_key_; }
@ -501,8 +522,9 @@ class Compaction {
// Does input compression match the output compression?
bool InputCompressionMatchesOutput() const;
bool input_table_properties_initialized_ = false;
// table properties of output files
TablePropertiesCollection output_table_properties_;
TablePropertiesCollection table_properties_;
// smallest user keys in compaction
// includes timestamp if user-defined timestamp is enabled.
@ -548,13 +570,16 @@ struct PerKeyPlacementContext {
const Slice value;
const SequenceNumber seq_num;
bool output_to_penultimate_level;
bool& output_to_penultimate_level;
PerKeyPlacementContext(int _level, Slice _key, Slice _value,
SequenceNumber _seq_num)
: level(_level), key(_key), value(_value), seq_num(_seq_num) {
output_to_penultimate_level = false;
}
SequenceNumber _seq_num,
bool& _output_to_penultimate_level)
: level(_level),
key(_key),
value(_value),
seq_num(_seq_num),
output_to_penultimate_level(_output_to_penultimate_level) {}
};
#endif /* !NDEBUG */

@ -13,6 +13,7 @@
#include "db/blob/blob_index.h"
#include "db/blob/prefetch_buffer_collection.h"
#include "db/snapshot_checker.h"
#include "db/wide/wide_column_serialization.h"
#include "logging/logging.h"
#include "port/likely.h"
#include "rocksdb/listener.h"
@ -30,7 +31,8 @@ CompactionIterator::CompactionIterator(
BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
bool enforce_single_del_contracts,
const std::atomic<bool>& manual_compaction_canceled,
const Compaction* compaction, const CompactionFilter* compaction_filter,
bool must_count_input_entries, const Compaction* compaction,
const CompactionFilter* compaction_filter,
const std::atomic<bool>* shutting_down,
const std::shared_ptr<Logger> info_log,
const std::string* full_history_ts_low,
@ -44,8 +46,9 @@ CompactionIterator::CompactionIterator(
manual_compaction_canceled,
std::unique_ptr<CompactionProxy>(
compaction ? new RealCompaction(compaction) : nullptr),
compaction_filter, shutting_down, info_log, full_history_ts_low,
preserve_time_min_seqno, preclude_last_level_min_seqno) {}
must_count_input_entries, compaction_filter, shutting_down, info_log,
full_history_ts_low, preserve_time_min_seqno,
preclude_last_level_min_seqno) {}
CompactionIterator::CompactionIterator(
InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
@ -57,15 +60,14 @@ CompactionIterator::CompactionIterator(
BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
bool enforce_single_del_contracts,
const std::atomic<bool>& manual_compaction_canceled,
std::unique_ptr<CompactionProxy> compaction,
std::unique_ptr<CompactionProxy> compaction, bool must_count_input_entries,
const CompactionFilter* compaction_filter,
const std::atomic<bool>* shutting_down,
const std::shared_ptr<Logger> info_log,
const std::string* full_history_ts_low,
const SequenceNumber preserve_time_min_seqno,
const SequenceNumber preclude_last_level_min_seqno)
: input_(input, cmp,
!compaction || compaction->DoesInputReferenceBlobFiles()),
: input_(input, cmp, must_count_input_entries),
cmp_(cmp),
merge_helper_(merge_helper),
snapshots_(snapshots),
@ -124,6 +126,9 @@ CompactionIterator::CompactionIterator(
timestamp_size_ == full_history_ts_low_->size());
#endif
input_.SetPinnedItersMgr(&pinned_iters_mgr_);
// The default `merge_until_status_` does not need to be checked since it is
// overwritten as soon as `MergeUntil()` is called
merge_until_status_.PermitUncheckedError();
TEST_SYNC_POINT_CALLBACK("CompactionIterator:AfterInit", compaction_.get());
}
@ -178,6 +183,20 @@ void CompactionIterator::Next() {
ikey_.user_key = current_key_.GetUserKey();
validity_info_.SetValid(ValidContext::kMerge1);
} else {
if (merge_until_status_.IsMergeInProgress()) {
// `Status::MergeInProgress()` tells us that the previous `MergeUntil()`
// produced only merge operands. Those merge operands were accessed and
// written out using `merge_out_iter_`. Since `merge_out_iter_` is
// exhausted at this point, all merge operands have been written out.
//
// Still, there may be a base value (PUT, DELETE, SINGLEDEL, etc.) that
// needs to be written out. Normally, `CompactionIterator` would skip it
// on the basis that it has already output something in the same
// snapshot stripe. To prevent this, we reset `has_current_user_key_` so
// that later iterations do not treat the snapshot stripe as unchanged.
has_current_user_key_ = false;
}
// We consumed all pinned merge operands, release pinned iterators
pinned_iters_mgr_.ReleasePinnedData();
// MergeHelper moves the iterator to the first record after the merged
@ -204,39 +223,47 @@ void CompactionIterator::Next() {
bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
Slice* skip_until) {
// TODO: support compaction filter for wide-column entities
if (!compaction_filter_ ||
(ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex)) {
if (!compaction_filter_) {
return true;
}
bool error = false;
// If the user has specified a compaction filter and the sequence
// number is greater than any external snapshot, then invoke the
// filter. If the return value of the compaction filter is true,
// replace the entry with a deletion marker.
CompactionFilter::Decision filter = CompactionFilter::Decision::kUndetermined;
compaction_filter_value_.clear();
compaction_filter_skip_until_.Clear();
if (ikey_.type != kTypeValue && ikey_.type != kTypeBlobIndex &&
ikey_.type != kTypeWideColumnEntity) {
return true;
}
CompactionFilter::Decision decision =
CompactionFilter::Decision::kUndetermined;
CompactionFilter::ValueType value_type =
ikey_.type == kTypeValue ? CompactionFilter::ValueType::kValue
: CompactionFilter::ValueType::kBlobIndex;
: ikey_.type == kTypeBlobIndex
? CompactionFilter::ValueType::kBlobIndex
: CompactionFilter::ValueType::kWideColumnEntity;
// Hack: pass internal key to BlobIndexCompactionFilter since it needs
// to get sequence number.
assert(compaction_filter_);
Slice& filter_key =
(ikey_.type == kTypeValue ||
const Slice& filter_key =
(ikey_.type != kTypeBlobIndex ||
!compaction_filter_->IsStackedBlobDbInternalCompactionFilter())
? ikey_.user_key
: key_;
compaction_filter_value_.clear();
compaction_filter_skip_until_.Clear();
std::vector<std::pair<std::string, std::string>> new_columns;
{
StopWatchNano timer(clock_, report_detailed_time_);
if (kTypeBlobIndex == ikey_.type) {
filter = compaction_filter_->FilterBlobByKey(
if (ikey_.type == kTypeBlobIndex) {
decision = compaction_filter_->FilterBlobByKey(
level_, filter_key, &compaction_filter_value_,
compaction_filter_skip_until_.rep());
if (CompactionFilter::Decision::kUndetermined == filter &&
if (decision == CompactionFilter::Decision::kUndetermined &&
!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
if (compaction_ == nullptr) {
if (!compaction_) {
status_ =
Status::Corruption("Unexpected blob index outside of compaction");
validity_info_.Invalidate();
@ -282,33 +309,61 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
value_type = CompactionFilter::ValueType::kValue;
}
}
if (CompactionFilter::Decision::kUndetermined == filter) {
filter = compaction_filter_->FilterV2(
level_, filter_key, value_type,
blob_value_.empty() ? value_ : blob_value_, &compaction_filter_value_,
if (decision == CompactionFilter::Decision::kUndetermined) {
const Slice* existing_val = nullptr;
const WideColumns* existing_col = nullptr;
WideColumns existing_columns;
if (ikey_.type != kTypeWideColumnEntity) {
if (!blob_value_.empty()) {
existing_val = &blob_value_;
} else {
existing_val = &value_;
}
} else {
Slice value_copy = value_;
const Status s =
WideColumnSerialization::Deserialize(value_copy, existing_columns);
if (!s.ok()) {
status_ = s;
validity_info_.Invalidate();
return false;
}
existing_col = &existing_columns;
}
decision = compaction_filter_->FilterV3(
level_, filter_key, value_type, existing_val, existing_col,
&compaction_filter_value_, &new_columns,
compaction_filter_skip_until_.rep());
}
iter_stats_.total_filter_time +=
env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0;
}
if (CompactionFilter::Decision::kUndetermined == filter) {
// Should not reach here, since FilterV2 should never return kUndetermined.
status_ =
Status::NotSupported("FilterV2() should never return kUndetermined");
if (decision == CompactionFilter::Decision::kUndetermined) {
// Should not reach here, since FilterV2/FilterV3 should never return
// kUndetermined.
status_ = Status::NotSupported(
"FilterV2/FilterV3 should never return kUndetermined");
validity_info_.Invalidate();
return false;
}
if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil &&
if (decision == CompactionFilter::Decision::kRemoveAndSkipUntil &&
cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <=
0) {
// Can't skip to a key smaller than the current one.
// Keep the key as per FilterV2 documentation.
filter = CompactionFilter::Decision::kKeep;
// Keep the key as per FilterV2/FilterV3 documentation.
decision = CompactionFilter::Decision::kKeep;
}
if (filter == CompactionFilter::Decision::kRemove) {
if (decision == CompactionFilter::Decision::kRemove) {
// convert the current key to a delete; key_ is pointing into
// current_key_ at this point, so updating current_key_ updates key()
ikey_.type = kTypeDeletion;
@ -316,7 +371,7 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
// no value associated with delete
value_.clear();
iter_stats_.num_record_drop_user++;
} else if (filter == CompactionFilter::Decision::kPurge) {
} else if (decision == CompactionFilter::Decision::kPurge) {
// convert the current key to a single delete; key_ is pointing into
// current_key_ at this point, so updating current_key_ updates key()
ikey_.type = kTypeSingleDeletion;
@ -324,19 +379,19 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
// no value associated with single delete
value_.clear();
iter_stats_.num_record_drop_user++;
} else if (filter == CompactionFilter::Decision::kChangeValue) {
if (ikey_.type == kTypeBlobIndex) {
// value transfer from blob file to inlined data
} else if (decision == CompactionFilter::Decision::kChangeValue) {
if (ikey_.type != kTypeValue) {
ikey_.type = kTypeValue;
current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
current_key_.UpdateInternalKey(ikey_.sequence, kTypeValue);
}
value_ = compaction_filter_value_;
} else if (filter == CompactionFilter::Decision::kRemoveAndSkipUntil) {
} else if (decision == CompactionFilter::Decision::kRemoveAndSkipUntil) {
*need_skip = true;
compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber,
kValueTypeForSeek);
*skip_until = compaction_filter_skip_until_.Encode();
} else if (filter == CompactionFilter::Decision::kChangeBlobIndex) {
} else if (decision == CompactionFilter::Decision::kChangeBlobIndex) {
// Only the StackableDB-based BlobDB impl's compaction filter should return
// kChangeBlobIndex. Decision about rewriting blob and changing blob index
// in the integrated BlobDB impl is made in subsequent call to
@ -348,23 +403,56 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
validity_info_.Invalidate();
return false;
}
if (ikey_.type == kTypeValue) {
// value transfer from inlined data to blob file
if (ikey_.type != kTypeBlobIndex) {
ikey_.type = kTypeBlobIndex;
current_key_.UpdateInternalKey(ikey_.sequence, ikey_.type);
current_key_.UpdateInternalKey(ikey_.sequence, kTypeBlobIndex);
}
value_ = compaction_filter_value_;
} else if (filter == CompactionFilter::Decision::kIOError) {
} else if (decision == CompactionFilter::Decision::kIOError) {
if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
status_ = Status::NotSupported(
"CompactionFilter for integrated BlobDB should not return kIOError");
validity_info_.Invalidate();
return false;
}
status_ = Status::IOError("Failed to access blob during compaction filter");
error = true;
validity_info_.Invalidate();
return false;
} else if (decision == CompactionFilter::Decision::kChangeWideColumnEntity) {
WideColumns sorted_columns;
sorted_columns.reserve(new_columns.size());
for (const auto& column : new_columns) {
sorted_columns.emplace_back(column.first, column.second);
}
std::sort(sorted_columns.begin(), sorted_columns.end(),
[](const WideColumn& lhs, const WideColumn& rhs) {
return lhs.name().compare(rhs.name()) < 0;
});
{
const Status s = WideColumnSerialization::Serialize(
sorted_columns, compaction_filter_value_);
if (!s.ok()) {
status_ = s;
validity_info_.Invalidate();
return false;
}
}
if (ikey_.type != kTypeWideColumnEntity) {
ikey_.type = kTypeWideColumnEntity;
current_key_.UpdateInternalKey(ikey_.sequence, kTypeWideColumnEntity);
}
value_ = compaction_filter_value_;
}
return !error;
return true;
}
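A sketch of a compaction filter exercising the new wide-column path; the `FilterV3` signature is mirrored from the call site above, while the filter itself is hypothetical and assumes the `rocksdb/compaction_filter.h` and wide-column headers already included in this change:

class DropEmptyColumnsFilter : public CompactionFilter {
 public:
  const char* Name() const override { return "DropEmptyColumnsFilter"; }

  Decision FilterV3(
      int /*level*/, const Slice& /*key*/, ValueType value_type,
      const Slice* /*existing_value*/, const WideColumns* existing_columns,
      std::string* /*new_value*/,
      std::vector<std::pair<std::string, std::string>>* new_columns,
      std::string* /*skip_until*/) const override {
    if (value_type != ValueType::kWideColumnEntity ||
        existing_columns == nullptr) {
      return Decision::kKeep;
    }
    // Rewrite the entity without its empty columns. As seen above, the
    // compaction iterator sorts and re-serializes `new_columns` itself.
    for (const WideColumn& column : *existing_columns) {
      if (!column.value().empty()) {
        new_columns->emplace_back(column.name().ToString(),
                                  column.value().ToString());
      }
    }
    return Decision::kChangeWideColumnEntity;
  }
};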
void CompactionIterator::NextFromInput() {
@ -895,14 +983,15 @@ void CompactionIterator::NextFromInput() {
// have hit (A)
// We encapsulate the merge related state machine in a different
// object to minimize change to the existing flow.
Status s = merge_helper_->MergeUntil(
merge_until_status_ = merge_helper_->MergeUntil(
&input_, range_del_agg_, prev_snapshot, bottommost_level_,
allow_data_in_errors_, blob_fetcher_.get(), full_history_ts_low_,
prefetch_buffers_.get(), &iter_stats_);
merge_out_iter_.SeekToFirst();
if (!s.ok() && !s.IsMergeInProgress()) {
status_ = s;
if (!merge_until_status_.ok() &&
!merge_until_status_.IsMergeInProgress()) {
status_ = merge_until_status_;
return;
} else if (merge_out_iter_.Valid()) {
// NOTE: key, value, and ikey_ refer to old entries.
@ -1113,17 +1202,7 @@ void CompactionIterator::GarbageCollectBlobIfNeeded() {
void CompactionIterator::DecideOutputLevel() {
assert(compaction_->SupportsPerKeyPlacement());
#ifndef NDEBUG
// Could be overridden by unittest
PerKeyPlacementContext context(level_, ikey_.user_key, value_,
ikey_.sequence);
TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
&context);
output_to_penultimate_level_ = context.output_to_penultimate_level;
#else
output_to_penultimate_level_ = false;
#endif // NDEBUG
// If the key is newer than the cutoff sequence or within the earliest
// snapshot, it should be output to the penultimate level.
if (ikey_.sequence > preclude_last_level_min_seqno_ ||
@ -1131,6 +1210,17 @@ void CompactionIterator::DecideOutputLevel() {
output_to_penultimate_level_ = true;
}
#ifndef NDEBUG
// Could be overridden by unittest
PerKeyPlacementContext context(level_, ikey_.user_key, value_, ikey_.sequence,
output_to_penultimate_level_);
TEST_SYNC_POINT_CALLBACK("CompactionIterator::PrepareOutput.context",
&context);
if (ikey_.sequence > earliest_snapshot_) {
output_to_penultimate_level_ = true;
}
#endif // NDEBUG
if (output_to_penultimate_level_) {
// If it's decided to output to the penultimate level, but unsafe to do so,
// still output to the last level. For example, moving the data from a lower
@ -1323,6 +1413,7 @@ std::unique_ptr<BlobFetcher> CompactionIterator::CreateBlobFetcherIfNeeded(
}
ReadOptions read_options;
read_options.io_activity = Env::IOActivity::kCompaction;
read_options.fill_cache = false;
return std::unique_ptr<BlobFetcher>(new BlobFetcher(version, read_options));

@ -38,15 +38,18 @@ class SequenceIterWrapper : public InternalIterator {
bool Valid() const override { return inner_iter_->Valid(); }
Status status() const override { return inner_iter_->status(); }
void Next() override {
num_itered_++;
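// Range tombstone sentinel keys mark file boundaries in the input
// iterator; they are not real input entries, so do not count them.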
if (!inner_iter_->IsDeleteRangeSentinelKey()) {
num_itered_++;
}
inner_iter_->Next();
}
void Seek(const Slice& target) override {
if (!need_count_entries_) {
has_num_itered_ = false;
inner_iter_->Seek(target);
} else {
// For flush cases, we need to count total number of entries, so we
// do Next() rather than Seek().
// Need to count total number of entries,
// so we do Next() rather than Seek().
while (inner_iter_->Valid() &&
icmp_.Compare(inner_iter_->key(), target) < 0) {
Next();
@ -62,7 +65,8 @@ class SequenceIterWrapper : public InternalIterator {
void SeekForPrev(const Slice& /* target */) override { assert(false); }
void SeekToLast() override { assert(false); }
uint64_t num_itered() const { return num_itered_; }
uint64_t NumItered() const { return num_itered_; }
bool HasNumItered() const { return has_num_itered_; }
bool IsDeleteRangeSentinelKey() const override {
assert(Valid());
return inner_iter_->IsDeleteRangeSentinelKey();
@ -73,6 +77,7 @@ class SequenceIterWrapper : public InternalIterator {
InternalIterator* inner_iter_; // not owned
uint64_t num_itered_ = 0;
bool need_count_entries_;
bool has_num_itered_ = true;
};
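The counting contract of this wrapper, summarized: num_itered_ excludes range-deletion sentinel keys, and a Seek() in non-counting mode skips an unknown number of entries, so the count is only trustworthy while has_num_itered_ remains true. A hypothetical harness (constructor arguments assumed from the members above):

SequenceIterWrapper iter(inner_iter, icmp, /*need_count_entries=*/false);
iter.Seek(target);  // non-counting mode: sets has_num_itered_ = false
while (iter.Valid()) {
  iter.Next();
}
if (iter.HasNumItered()) {
  uint64_t n = iter.NumItered();  // full-scan count, sentinels excluded
} else {
  // A Seek() skipped entries; the count must not be used for verification.
}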
class CompactionIterator {
@ -189,6 +194,10 @@ class CompactionIterator {
const Compaction* compaction_;
};
// @param must_count_input_entries if true, `NumInputEntryScanned()` will
// return the number of input keys scanned. If false, `NumInputEntryScanned()`
// returns that number only if no Seek() was called on `input`; callers
// should check `HasNumInputEntryScanned()` first in that case.
CompactionIterator(
InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
@ -199,7 +208,7 @@ class CompactionIterator {
BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
bool enforce_single_del_contracts,
const std::atomic<bool>& manual_compaction_canceled,
bool must_count_input_entries, const Compaction* compaction = nullptr,
const CompactionFilter* compaction_filter = nullptr,
const std::atomic<bool>* shutting_down = nullptr,
const std::shared_ptr<Logger> info_log = nullptr,
@ -219,6 +228,7 @@ class CompactionIterator {
bool enforce_single_del_contracts,
const std::atomic<bool>& manual_compaction_canceled,
std::unique_ptr<CompactionProxy> compaction,
bool must_count_input_entries,
const CompactionFilter* compaction_filter = nullptr,
const std::atomic<bool>* shutting_down = nullptr,
const std::shared_ptr<Logger> info_log = nullptr,
@ -253,7 +263,8 @@ class CompactionIterator {
return current_user_key_;
}
const CompactionIterationStats& iter_stats() const { return iter_stats_; }
bool HasNumInputEntryScanned() const { return input_.HasNumItered(); }
uint64_t NumInputEntryScanned() const { return input_.NumItered(); }
// If the current key should be placed on penultimate level, only valid if
// per_key_placement is supported
bool output_to_penultimate_level() const {
@ -444,6 +455,7 @@ class CompactionIterator {
bool clear_and_output_next_key_ = false;
MergeOutputIterator merge_out_iter_;
Status merge_until_status_;
// PinnedIteratorsManager used to pin input_ Iterator blocks while reading
// merge operands and then releasing them after consuming them.
PinnedIteratorsManager pinned_iters_mgr_;
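The intended caller-side pattern for the two accessors above, mirroring their use in CompactionJob::ProcessKeyValueCompaction() (sketch; `job_stats` is a CompactionJobStats):

if (c_iter->HasNumInputEntryScanned()) {
  job_stats.has_num_input_records = true;
  job_stats.num_input_records = c_iter->NumInputEntryScanned();
} else {
  // The scanned-entry count is unknown; skip record-count verification.
  job_stats.has_num_input_records = false;
}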

@ -293,8 +293,8 @@ class CompactionIteratorTest : public testing::TestWithParam<bool> {
nullptr /* blob_file_builder */, true /*allow_data_in_errors*/,
true /*enforce_single_del_contracts*/,
/*manual_compaction_canceled=*/kManualCompactionCanceledFalse_,
std::move(compaction), /*must_count_input_entries=*/false, filter,
&shutting_down_, /*info_log=*/nullptr, full_history_ts_low));
}
void AddSnapshot(SequenceNumber snapshot,

@ -192,8 +192,8 @@ CompactionJob::CompactionJob(
assert(log_buffer_ != nullptr);
const auto* cfd = compact_->compaction->column_family_data();
ThreadStatusUtil::SetEnableTracking(db_options_.enable_thread_tracking);
ThreadStatusUtil::SetColumnFamily(cfd);
ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
ReportStartedCompaction(compaction);
}
@ -204,10 +204,6 @@ CompactionJob::~CompactionJob() {
}
void CompactionJob::ReportStartedCompaction(Compaction* compaction) {
ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID,
job_id_);
@ -264,7 +260,7 @@ void CompactionJob::Prepare() {
StopWatch sw(db_options_.clock, stats_, SUBCOMPACTION_SETUP_TIME);
GenSubcompactionBoundaries();
}
if (boundaries_.size() >= 1) {
for (size_t i = 0; i <= boundaries_.size(); i++) {
compact_->sub_compact_states.emplace_back(
c, (i != 0) ? std::optional<Slice>(boundaries_[i - 1]) : std::nullopt,
@ -291,12 +287,14 @@ void CompactionJob::Prepare() {
c->immutable_options()->preclude_last_level_data_seconds);
if (preserve_time_duration > 0) {
const ReadOptions read_options(Env::IOActivity::kCompaction);
// setup seqno_time_mapping_
seqno_time_mapping_.SetMaxTimeDuration(preserve_time_duration);
for (const auto& each_level : *c->inputs()) {
for (const auto& fmd : each_level.files) {
std::shared_ptr<const TableProperties> tp;
Status s =
cfd->current()->GetTableProperties(read_options, &tp, fmd, nullptr);
if (s.ok()) {
seqno_time_mapping_.Add(tp->seqno_to_time_mapping)
.PermitUncheckedError();
@ -472,7 +470,7 @@ void CompactionJob::GenSubcompactionBoundaries() {
// overlap with N-1 other ranges. Since we requested a relatively large number
// (128) of ranges from each input file, even N ranges overlapping would
// cause only a relatively small inaccuracy.
const ReadOptions read_options(Env::IOActivity::kCompaction);
auto* c = compact_->compaction;
if (c->max_subcompactions() <= 1 &&
!(c->immutable_options()->compaction_pri == kRoundRobin &&
@ -506,7 +504,9 @@ void CompactionJob::GenSubcompactionBoundaries() {
FileMetaData* f = flevel->files[i].file_metadata;
std::vector<TableReader::Anchor> my_anchors;
Status s = cfd->table_cache()->ApproximateKeyAnchors(
read_options, icomp, *f,
c->mutable_cf_options()->block_protection_bytes_per_key,
my_anchors);
if (!s.ok() || my_anchors.empty()) {
my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize());
}
@ -722,11 +722,12 @@ Status CompactionJob::Run() {
// use_direct_io_for_flush_and_compaction is true, we will regard this
// verification as user reads since the goal is to cache it here for
// further user reads
const ReadOptions verify_table_read_options(
Env::IOActivity::kCompaction);
InternalIterator* iter = cfd->table_cache()->NewIterator(
verify_table_read_options, file_options_,
cfd->internal_comparator(), files_output[file_idx]->meta,
/*range_del_agg=*/nullptr, prefix_extractor,
/*table_reader_ptr=*/nullptr,
cfd->internal_stats()->GetFileReadHist(
compact_->compaction->output_level()),
@ -736,7 +737,9 @@ Status CompactionJob::Run() {
*compact_->compaction->mutable_cf_options()),
/*smallest_compaction_key=*/nullptr,
/*largest_compaction_key=*/nullptr,
/*allow_unprepared_value=*/false,
compact_->compaction->mutable_cf_options()
->block_protection_bytes_per_key);
auto s = iter->status();
if (s.ok() && paranoid_file_checks_) {
@ -793,20 +796,51 @@ Status CompactionJob::Run() {
auto fn =
TableFileName(state.compaction->immutable_options()->cf_paths,
output.meta.fd.GetNumber(), output.meta.fd.GetPathId());
compact_->compaction->SetOutputTableProperties(fn,
output.table_properties);
}
}
// Finish up all bookkeeping to unify the subcompaction results.
compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
uint64_t num_input_range_del = 0;
bool ok = UpdateCompactionStats(&num_input_range_del);
// (Sub)compactions returned OK; sanity-check the number of input keys.
if (status.ok() && ok && compaction_job_stats_->has_num_input_records) {
size_t ts_sz = compact_->compaction->column_family_data()
->user_comparator()
->timestamp_size();
// When trim_ts_ is non-empty, CompactionIterator takes
// HistoryTrimmingIterator as input iterator and sees a trimmed view of
// input keys. So the number of keys it processed is not suitable for
// verification here.
// TODO: support verification when trim_ts_ is non-empty.
if (!(ts_sz > 0 && !trim_ts_.empty()) &&
db_options_.compaction_verify_record_count) {
assert(compaction_stats_.stats.num_input_records > 0);
// TODO: verify the number of range deletion entries.
uint64_t expected =
compaction_stats_.stats.num_input_records - num_input_range_del;
uint64_t actual = compaction_job_stats_->num_input_records;
if (expected != actual) {
std::string msg =
"Total number of input records: " + std::to_string(expected) +
", but processed " + std::to_string(actual) + " records.";
ROCKS_LOG_WARN(
db_options_.info_log, "[%s] [JOB %d] Compaction %s",
compact_->compaction->column_family_data()->GetName().c_str(),
job_context_->job_id, msg.c_str());
status = Status::Corruption(
"Compaction number of input keys does not match number of keys "
"processed.");
}
}
}
RecordCompactionIOStats();
LogFlush(db_options_.info_log);
TEST_SYNC_POINT("CompactionJob::Run():End");
compact_->status = status;
TEST_SYNC_POINT_CALLBACK("CompactionJob::Run():EndStatusSet", &status);
return status;
}
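The check above is gated by DBOptions::compaction_verify_record_count (the option name is taken from the diff; its default is not asserted here). A sketch of enabling it, with the arithmetic being verified:

rocksdb::Options options;
options.compaction_verify_record_count = true;
// On mismatch the compaction fails with Status::Corruption():
//   expected = num_input_records (from file metadata / table properties)
//              - num_input_range_del
//   actual   = entries the CompactionIterator reported scanning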
@ -978,7 +1012,6 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) {
void CompactionJob::NotifyOnSubcompactionBegin(
SubcompactionState* sub_compact) {
Compaction* c = compact_->compaction;
if (db_options_.listeners.empty()) {
@ -1004,14 +1037,10 @@ void CompactionJob::NotifyOnSubcompactionBegin(
}
info.status.PermitUncheckedError();
}
void CompactionJob::NotifyOnSubcompactionCompleted(
SubcompactionState* sub_compact) {
if (db_options_.listeners.empty()) {
return;
@ -1032,16 +1061,11 @@ void CompactionJob::NotifyOnSubcompactionCompleted(
for (const auto& listener : db_options_.listeners) {
listener->OnSubcompactionCompleted(info);
}
}
void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
assert(sub_compact);
assert(sub_compact->compaction);
if (db_options_.compaction_service) {
CompactionServiceJobStatus comp_status =
ProcessKeyValueCompactionWithCompactionService(sub_compact);
@ -1052,7 +1076,6 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
// fallback to local compaction
assert(comp_status == CompactionServiceJobStatus::kUseLocal);
}
uint64_t prev_cpu_micros = db_options_.clock->CPUMicros();
@ -1093,6 +1116,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
read_options.verify_checksums = true;
read_options.fill_cache = false;
read_options.rate_limiter_priority = GetRateLimiterPriority();
read_options.io_activity = Env::IOActivity::kCompaction;
// Compaction iterators shouldn't be confined to a single prefix.
// Compactions use Seek() for
// (a) concurrent compactions,
@ -1103,17 +1127,17 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
// GenSubcompactionBoundaries doesn't strip away the timestamp.
size_t ts_sz = cfd->user_comparator()->timestamp_size();
if (start.has_value()) {
read_options.iterate_lower_bound = &(*start);
if (ts_sz > 0) {
start_without_ts = StripTimestampFromUserKey(*start, ts_sz);
read_options.iterate_lower_bound = &(*start_without_ts);
}
}
if (end.has_value()) {
read_options.iterate_upper_bound = &(*end);
if (ts_sz > 0) {
end_without_ts = StripTimestampFromUserKey(*end, ts_sz);
read_options.iterate_upper_bound = &(*end_without_ts);
}
}
@ -1128,6 +1152,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
IterKey end_ikey;
Slice start_slice;
Slice end_slice;
Slice start_user_key{};
Slice end_user_key{};
static constexpr char kMaxTs[] =
"\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff";
@ -1143,21 +1169,22 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
}
if (start.has_value()) {
start_ikey.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek);
if (ts_sz > 0) {
start_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
&ts_slice);
}
start_slice = start_ikey.GetInternalKey();
start_user_key = start_ikey.GetUserKey();
}
if (end.has_value()) {
end_ikey.SetInternalKey(*end, kMaxSequenceNumber, kValueTypeForSeek);
if (ts_sz > 0) {
end_ikey.UpdateInternalKey(kMaxSequenceNumber, kValueTypeForSeek,
&ts_slice);
}
end_slice = end_ikey.GetInternalKey();
end_user_key = end_ikey.GetUserKey();
}
std::unique_ptr<InternalIterator> clip;
@ -1256,6 +1283,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
/*expect_valid_internal_key=*/true, range_del_agg.get(),
blob_file_builder.get(), db_options_.allow_data_in_errors,
db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
sub_compact->compaction
->DoesInputReferenceBlobFiles() /* must_count_input_entries */,
sub_compact->compaction, compaction_filter, shutting_down_,
db_options_.info_log, full_history_ts_low, preserve_time_min_seqno_,
preclude_last_level_min_seqno_);
@ -1273,11 +1302,15 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
[this, sub_compact](CompactionOutputs& outputs) {
return this->OpenCompactionOutputFile(sub_compact, outputs);
};
const CompactionFileCloseFunc close_file_func =
[this, sub_compact, start_user_key, end_user_key](
CompactionOutputs& outputs, const Status& status,
const Slice& next_table_min_key) {
return this->FinishCompactionOutputFile(
status, sub_compact, outputs, next_table_min_key,
sub_compact->start.has_value() ? &start_user_key : nullptr,
sub_compact->end.has_value() ? &end_user_key : nullptr);
};
Status status;
@ -1288,8 +1321,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) {
// Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
// returns true.
assert(!end.has_value() ||
cfd->user_comparator()->Compare(c_iter->user_key(), *end) < 0);
if (c_iter_stats.num_input_records % kRecordStatsEvery ==
kRecordStatsEvery - 1) {
@ -1316,8 +1349,25 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
if (c_iter->status().IsManualCompactionPaused()) {
break;
}
#ifndef NDEBUG
bool stop = false;
TEST_SYNC_POINT_CALLBACK("CompactionJob::ProcessKeyValueCompaction()::stop",
static_cast<void*>(&stop));
if (stop) {
break;
}
#endif // NDEBUG
}
// This number may not be accurate when CompactionIterator was created
// with `must_count_input_entries=false`.
assert(!sub_compact->compaction->DoesInputReferenceBlobFiles() ||
c_iter->HasNumInputEntryScanned());
sub_compact->compaction_job_stats.has_num_input_records =
c_iter->HasNumInputEntryScanned();
sub_compact->compaction_job_stats.num_input_records =
c_iter->NumInputEntryScanned();
sub_compact->compaction_job_stats.num_blobs_read =
c_iter_stats.num_blobs_read;
sub_compact->compaction_job_stats.total_blob_bytes_read =
@ -1467,7 +1517,8 @@ void CompactionJob::RecordDroppedKeys(
Status CompactionJob::FinishCompactionOutputFile(
const Status& input_status, SubcompactionState* sub_compact,
CompactionOutputs& outputs, const Slice& next_table_min_key,
const Slice* comp_start_user_key, const Slice* comp_end_user_key) {
AutoThreadOperationStageUpdater stage_updater(
ThreadStatus::STAGE_COMPACTION_SYNC_FILE);
assert(sub_compact != nullptr);
@ -1497,12 +1548,10 @@ Status CompactionJob::FinishCompactionOutputFile(
// output_to_penultimate_level compaction here, as it's only used to decide
// if range dels could be dropped.
if (outputs.HasRangeDel()) {
s = outputs.AddRangeDels(comp_start_user_key, comp_end_user_key,
range_del_out_stats, bottommost_level_,
cfd->internal_comparator(), earliest_snapshot,
next_table_min_key, full_history_ts_low_);
}
RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats);
TEST_SYNC_POINT("CompactionJob::FinishCompactionOutputFile1");
@ -1614,7 +1663,6 @@ Status CompactionJob::FinishCompactionOutputFile(
TableFileCreationReason::kCompaction, status_for_listener, file_checksum,
file_checksum_func_name);
// Report new file to SstFileManagerImpl
auto sfm =
static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
@ -1633,7 +1681,6 @@ Status CompactionJob::FinishCompactionOutputFile(
db_error_handler_->SetBGError(s, BackgroundErrorReason::kCompaction);
}
}
outputs.ResetBuilder();
return s;
@ -1645,6 +1692,7 @@ Status CompactionJob::InstallCompactionResults(
db_mutex_->AssertHeld();
const ReadOptions read_options(Env::IOActivity::kCompaction);
auto* compaction = compact_->compaction;
assert(compaction);
@ -1722,8 +1770,8 @@ Status CompactionJob::InstallCompactionResults(
}
return versions_->LogAndApply(compaction->column_family_data(),
mutable_cf_options, edit, db_mutex_,
db_directory_);
mutable_cf_options, read_options, edit,
db_mutex_, db_directory_);
}
void CompactionJob::RecordCompactionIOStats() {
@ -1758,11 +1806,9 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
std::string fname = GetTableFileName(file_number);
// Fire events.
ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
EventHelpers::NotifyTableFileCreationStarted(
cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_,
TableFileCreationReason::kCompaction);
// Make the output file
std::unique_ptr<FSWritableFile> writable_file;
#ifndef NDEBUG
@ -1821,10 +1867,10 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact,
uint64_t current_time = static_cast<uint64_t>(temp_current_time);
InternalKey tmp_start, tmp_end;
if (sub_compact->start.has_value()) {
tmp_start.SetMinPossibleForUserKey(*(sub_compact->start));
}
if (sub_compact->end.has_value()) {
tmp_end.SetMinPossibleForUserKey(*(sub_compact->end));
}
uint64_t oldest_ancester_time =
sub_compact->compaction->MinInputFileOldestAncesterTime(
@ -1899,7 +1945,6 @@ void CompactionJob::CleanupCompaction() {
compact_ = nullptr;
}
namespace {
void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
assert(prefix_length > 0);
@ -1908,25 +1953,53 @@ void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
}
} // namespace
bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) {
assert(compact_);
Compaction* compaction = compact_->compaction;
compaction_stats_.stats.num_input_files_in_non_output_levels = 0;
compaction_stats_.stats.num_input_files_in_output_level = 0;
bool has_error = false;
const ReadOptions read_options(Env::IOActivity::kCompaction);
const auto& input_table_properties = compaction->GetTableProperties();
for (int input_level = 0;
input_level < static_cast<int>(compaction->num_input_levels());
++input_level) {
size_t num_input_files = compaction->num_input_files(input_level);
uint64_t* bytes_read;
if (compaction->level(input_level) != compaction->output_level()) {
compaction_stats_.stats.num_input_files_in_non_output_levels +=
static_cast<int>(num_input_files);
bytes_read = &compaction_stats_.stats.bytes_read_non_output_levels;
} else {
compaction_stats_.stats.num_input_files_in_output_level +=
static_cast<int>(num_input_files);
bytes_read = &compaction_stats_.stats.bytes_read_output_level;
}
for (size_t i = 0; i < num_input_files; ++i) {
const FileMetaData* file_meta = compaction->input(input_level, i);
*bytes_read += file_meta->fd.GetFileSize();
uint64_t file_input_entries = file_meta->num_entries;
uint64_t file_num_range_del = file_meta->num_range_deletions;
if (file_input_entries == 0) {
uint64_t file_number = file_meta->fd.GetNumber();
// Try getting info from table property
std::string fn =
TableFileName(compaction->immutable_options()->cf_paths,
file_number, file_meta->fd.GetPathId());
const auto& tp = input_table_properties.find(fn);
if (tp != input_table_properties.end()) {
file_input_entries = tp->second->num_entries;
file_num_range_del = tp->second->num_range_deletions;
} else {
has_error = true;
}
}
compaction_stats_.stats.num_input_records += file_input_entries;
if (num_input_range_del) {
*num_input_range_del += file_num_range_del;
}
}
}
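A worked example of the bookkeeping above, with hypothetical numbers: two input files with num_entries = 100 and 50, containing 10 range deletions in total, yield

//   compaction_stats_.stats.num_input_records = 150
//   *num_input_range_del                      = 10
//   expected point entries to scan            = 150 - 10 = 140
// Any other scan count surfaces as Status::Corruption() in Run() when
// compaction_verify_record_count is set.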
@ -1936,26 +2009,11 @@ void CompactionJob::UpdateCompactionStats() {
compaction_stats_.stats.num_dropped_records =
compaction_stats_.DroppedRecords();
}
return !has_error;
}
void CompactionJob::UpdateCompactionJobStats(
const InternalStats::CompactionStats& stats) const {
compaction_job_stats_->elapsed_micros = stats.micros;
// input information
@ -1982,9 +2040,6 @@ void CompactionJob::UpdateCompactionJobStats(
CopyPrefix(compact_->LargestUserKey(), CompactionJobStats::kMaxPrefixLength,
&compaction_job_stats_->largest_output_key_prefix);
}
}
void CompactionJob::LogCompaction() {

@ -192,7 +192,21 @@ class CompactionJob {
IOStatus io_status() const { return io_status_; }
protected:
// Update the following stats in compaction_stats_.stats
// - num_input_files_in_non_output_levels
// - num_input_files_in_output_level
// - bytes_read_non_output_levels
// - bytes_read_output_level
// - num_input_records
// - bytes_read_blob
// - num_dropped_records
//
// @param num_input_range_del if non-null, will be set to the number of range
// deletion entries in this compaction input.
//
// Returns true iff compaction_stats_.stats.num_input_records and
// num_input_range_del are calculated successfully.
bool UpdateCompactionStats(uint64_t* num_input_range_del = nullptr);
void LogCompaction();
virtual void RecordCompactionIOStats();
void CleanupCompaction();
@ -256,7 +270,9 @@ class CompactionJob {
Status FinishCompactionOutputFile(const Status& input_status,
SubcompactionState* sub_compact,
CompactionOutputs& outputs,
const Slice& next_table_min_key,
const Slice* comp_start_user_key,
const Slice* comp_end_user_key);
Status InstallCompactionResults(const MutableCFOptions& mutable_cf_options);
Status OpenCompactionOutputFile(SubcompactionState* sub_compact,
CompactionOutputs& outputs);
@ -265,9 +281,6 @@ class CompactionJob {
void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats,
CompactionJobStats* compaction_job_stats = nullptr);
void NotifyOnSubcompactionBegin(SubcompactionState* sub_compact);
void NotifyOnSubcompactionCompleted(SubcompactionState* sub_compact);

@ -24,7 +24,7 @@
#include "db/write_batch_internal.h"
#include "env/mock_env.h"
#include "file/filename.h"
#include "monitoring/statistics.h"
#include "monitoring/statistics_impl.h"
#include "monitoring/thread_status_util.h"
#include "port/stack_trace.h"
#include "rocksdb/cache.h"
@ -54,12 +54,11 @@
#include "util/compression.h"
#include "util/hash.h"
#include "util/mutexlock.h"
#include "util/rate_limiter.h"
#include "util/rate_limiter_impl.h"
#include "util/string_util.h"
#include "utilities/merge_operators.h"
#if !defined(IOS_CROSS_COMPILE)
namespace ROCKSDB_NAMESPACE {
static std::string RandomString(Random* rnd, int len, double ratio) {
@ -617,6 +616,7 @@ TEST_P(CompactionJobStatsTest, CompactionJobStatsTest) {
// via AddExpectedStats().
auto* stats_checker = new CompactionJobStatsChecker();
Options options;
options.level_compaction_dynamic_level_bytes = false;
options.listeners.emplace_back(stats_checker);
options.create_if_missing = true;
// just enough setting to hold off auto-compaction.
@ -816,6 +816,7 @@ TEST_P(CompactionJobStatsTest, DeletionStatsTest) {
// what we expect.
auto* stats_checker = new CompactionJobDeletionStatsChecker();
Options options;
options.level_compaction_dynamic_level_bytes = false;
options.listeners.emplace_back(stats_checker);
options.create_if_missing = true;
options.level0_file_num_compaction_trigger = kTestScale + 1;
@ -959,15 +960,6 @@ int main(int argc, char** argv) {
return RUN_ALL_TESTS();
}
#else

@ -3,7 +3,6 @@
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#include "db/compaction/compaction_job.h"
@ -374,7 +373,7 @@ class CompactionJobTestBase : public testing::Test {
} else if (table_type_ == TableTypeForTest::kMockTable) {
file_size = 10;
EXPECT_OK(mock_table_factory_->CreateMockTable(
env_, GenerateFileName(file_number), contents));
} else {
assert(false);
}
@ -387,12 +386,13 @@ class CompactionJobTestBase : public testing::Test {
kUnknownFileCreationTime,
versions_->GetColumnFamilySet()->GetDefault()->NewEpochNumber(),
kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2,
/*compensated_range_deletion_size=*/0, /*tail_size=*/0,
/*user_defined_timestamps_persisted=*/true);
mutex_.Lock();
EXPECT_OK(versions_->LogAndApply(
versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_,
read_options_, &edit, &mutex_, nullptr));
mutex_.Unlock();
}
@ -455,7 +455,8 @@ class CompactionJobTestBase : public testing::Test {
Status s = cf_options_.table_factory->NewTableReader(
read_opts,
TableReaderOptions(*cfd->ioptions(), nullptr, FileOptions(),
cfd_->internal_comparator(),
0 /* block_protection_bytes_per_key */),
std::move(freader), file_size, &table_reader, false);
ASSERT_OK(s);
assert(table_reader);
@ -654,11 +655,12 @@ class CompactionJobTestBase : public testing::Test {
ASSERT_TRUE(full_history_ts_low_.empty() ||
ucmp_->timestamp_size() == full_history_ts_low_.size());
const std::atomic<bool> kManualCompactionCanceledFalse{false};
JobContext job_context(1, false /* create_superversion */);
CompactionJob compaction_job(
0, &compaction, db_options_, mutable_db_options_, env_options_,
versions_.get(), &shutting_down_, &log_buffer, nullptr, nullptr,
nullptr, nullptr, &mutex_, &error_handler_, snapshots,
earliest_write_conflict_snapshot, snapshot_checker, &job_context,
table_cache_, &event_logger, false, false, dbname_,
&compaction_job_stats_, Env::Priority::USER, nullptr /* IOTracer */,
/*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
@ -728,6 +730,7 @@ class CompactionJobTestBase : public testing::Test {
ColumnFamilyOptions cf_options_;
MutableCFOptions mutable_cf_options_;
MutableDBOptions mutable_db_options_;
const ReadOptions read_options_;
std::shared_ptr<Cache> table_cache_;
WriteController write_controller_;
WriteBufferManager write_buffer_manager_;
@ -2441,14 +2444,3 @@ int main(int argc, char** argv) {
RegisterCustomObjects(argc, argv);
return RUN_ALL_TESTS();
}

@ -43,7 +43,10 @@ Status CompactionOutputs::Finish(const Status& intput_status,
const uint64_t current_bytes = builder_->FileSize();
if (s.ok()) {
meta->fd.file_size = current_bytes;
meta->tail_size = builder_->GetTailSize();
meta->marked_for_compaction = builder_->NeedCompact();
meta->user_defined_timestamps_persisted = static_cast<bool>(
builder_->GetTableProperties().user_defined_timestamps_persisted);
}
current_output().finished = true;
stats_.bytes_written += current_bytes;
@ -76,6 +79,46 @@ IOStatus CompactionOutputs::WriterSyncClose(const Status& input_status,
return io_s;
}
bool CompactionOutputs::UpdateFilesToCutForTTLStates(
const Slice& internal_key) {
if (!files_to_cut_for_ttl_.empty()) {
const InternalKeyComparator* icmp =
&compaction_->column_family_data()->internal_comparator();
if (cur_files_to_cut_for_ttl_ != -1) {
// Previous key is inside the range of a file
if (icmp->Compare(internal_key,
files_to_cut_for_ttl_[cur_files_to_cut_for_ttl_]
->largest.Encode()) > 0) {
next_files_to_cut_for_ttl_ = cur_files_to_cut_for_ttl_ + 1;
cur_files_to_cut_for_ttl_ = -1;
return true;
}
} else {
// Look for the key position
while (next_files_to_cut_for_ttl_ <
static_cast<int>(files_to_cut_for_ttl_.size())) {
if (icmp->Compare(internal_key,
files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
->smallest.Encode()) >= 0) {
if (icmp->Compare(internal_key,
files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
->largest.Encode()) <= 0) {
// Within the current file
cur_files_to_cut_for_ttl_ = next_files_to_cut_for_ttl_;
return true;
}
// Beyond the current file
next_files_to_cut_for_ttl_++;
} else {
// Still falls in the gap
break;
}
}
}
}
return false;
}
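A worked trace of the state machine above, assuming two TTL-expired grandparent files F0 = [a, c] and F1 = [f, h] (hypothetical user keys):

//   key "b": inside F0            -> cur_files_to_cut_for_ttl_ = 0, true
//   key "d": past F0's largest    -> next = 1, cur = -1,            true
//   key "e": in the gap before F1 ->                                false
//   key "g": inside F1            -> cur_files_to_cut_for_ttl_ = 1, true
// A true return tells ShouldStopBefore() to cut the output file here.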
size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(
const Slice& internal_key) {
size_t curr_key_boundary_switched_num = 0;
@ -84,11 +127,6 @@ size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(
if (grandparents.empty()) {
return curr_key_boundary_switched_num;
}
const Comparator* ucmp = compaction_->column_family_data()->user_comparator();
// Move the grandparent_index_ to the file containing the current user_key.
@ -96,7 +134,7 @@ size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(
// index points to the last file containing the key.
while (grandparent_index_ < grandparents.size()) {
if (being_grandparent_gap_) {
if (sstableKeyCompare(ucmp, internal_key,
grandparents[grandparent_index_]->smallest) < 0) {
break;
}
@ -109,13 +147,13 @@ size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(
being_grandparent_gap_ = false;
} else {
int cmp_result = sstableKeyCompare(
ucmp, internal_key, grandparents[grandparent_index_]->largest);
// If it's same key, make sure grandparent_index_ is pointing to the last
// one.
if (cmp_result < 0 ||
(cmp_result == 0 &&
(grandparent_index_ == grandparents.size() - 1 ||
sstableKeyCompare(ucmp, internal_key,
grandparents[grandparent_index_ + 1]->smallest) <
0))) {
break;
@ -185,18 +223,39 @@ uint64_t CompactionOutputs::GetCurrentKeyGrandparentOverlappedBytes(
bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
assert(c_iter.Valid());
const Slice& internal_key = c_iter.key();
#ifndef NDEBUG
bool should_stop = false;
std::pair<bool*, const Slice> p{&should_stop, internal_key};
TEST_SYNC_POINT_CALLBACK(
"CompactionOutputs::ShouldStopBefore::manual_decision", (void*)&p);
if (should_stop) {
return true;
}
#endif // NDEBUG
const uint64_t previous_overlapped_bytes = grandparent_overlapped_bytes_;
const InternalKeyComparator* icmp =
&compaction_->column_family_data()->internal_comparator();
size_t num_grandparent_boundaries_crossed = 0;
bool should_stop_for_ttl = false;
// Always update grandparent information like overlapped file number, size
// etc., and TTL states.
// If compaction_->output_level() == 0, there is no need to update grandparent
// info, and `grandparents` should be empty.
if (compaction_->output_level() > 0) {
num_grandparent_boundaries_crossed =
UpdateGrandparentBoundaryInfo(internal_key);
should_stop_for_ttl = UpdateFilesToCutForTTLStates(internal_key);
}
if (!HasBuilder()) {
return false;
}
if (should_stop_for_ttl) {
return true;
}
// If there's user defined partitioner, check that first
if (partitioner_ && partitioner_->ShouldPartition(PartitionerRequest(
last_key_for_partitioner_, c_iter.user_key(),
@ -214,9 +273,6 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
return true;
}
// Check if it needs to split for RoundRobin
// Invalid local_output_split_key indicates that we do not need to split
if (local_output_split_key_ != nullptr && !is_split_) {
@ -290,41 +346,6 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
}
}
return false;
}
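Besides grandparent and TTL boundaries, the logic above defers to a user-supplied partitioner. Wiring one up goes through the public SstPartitionerFactory hook; a minimal sketch using the built-in fixed-prefix factory (prefix length 4 chosen arbitrarily):

#include "rocksdb/sst_partitioner.h"

rocksdb::Options options;
// Cut compaction output files whenever the first 4 bytes of the user key
// change, so no output file spans two prefixes.
options.sst_partitioner_factory =
    rocksdb::NewSstPartitionerFixedPrefixFactory(4);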
@ -404,114 +425,182 @@ Status CompactionOutputs::AddToOutput(
return s;
}
namespace {
void SetMaxSeqAndTs(InternalKey& internal_key, const Slice& user_key,
const size_t ts_sz) {
if (ts_sz) {
static constexpr char kTsMax[] = "\xff\xff\xff\xff\xff\xff\xff\xff\xff";
if (ts_sz <= strlen(kTsMax)) {
internal_key = InternalKey(user_key, kMaxSequenceNumber,
kTypeRangeDeletion, Slice(kTsMax, ts_sz));
} else {
internal_key =
InternalKey(user_key, kMaxSequenceNumber, kTypeRangeDeletion,
std::string(ts_sz, '\xff'));
}
} else {
internal_key.Set(user_key, kMaxSequenceNumber, kTypeRangeDeletion);
}
}
} // namespace
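Concretely, since internal keys sort by user key ascending and then by sequence number descending, the key SetMaxSeqAndTs() builds is the smallest internal key carrying its user key, i.e. it sorts before every real version of that key. For example (sketch):

//   ts_sz == 0: InternalKey("foo", kMaxSequenceNumber, kTypeRangeDeletion)
//   ts_sz == 8: InternalKey("foo", kMaxSequenceNumber, kTypeRangeDeletion,
//                           "\xff\xff\xff\xff\xff\xff\xff\xff")
// Either form sorts before "foo"@100, "foo"@7, ... in internal key order.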
Status CompactionOutputs::AddRangeDels(
const Slice* comp_start_user_key, const Slice* comp_end_user_key,
CompactionIterationStats& range_del_out_stats, bool bottommost_level,
const InternalKeyComparator& icmp, SequenceNumber earliest_snapshot,
const Slice& next_table_min_key, const std::string& full_history_ts_low) {
// The following example does not happen since
// CompactionOutputs::ShouldStopBefore() always returns false for the first
// point key. But we should consider removing this dependency. Suppose for the
// first compaction output file,
// - next_table_min_key.user_key == comp_start_user_key
// - no point key is in the output file
// - there is a range tombstone @seqno to be added that covers
// comp_start_user_key
// Then meta.smallest will be set to comp_start_user_key@seqno
// and meta.largest will be set to comp_start_user_key@kMaxSequenceNumber
// which violates the assumption that meta.smallest should be <= meta.largest.
assert(HasRangeDel());
FileMetaData& meta = current_output().meta;
const Comparator* ucmp = icmp.user_comparator();
InternalKey lower_bound_buf, upper_bound_buf;
Slice lower_bound_guard, upper_bound_guard;
const Slice *lower_bound, *upper_bound;
// We first determine the internal key lower_bound and upper_bound for
// this output file. All and only range tombstones that overlap with
// [lower_bound, upper_bound] should be added to this file. File
// boundaries (meta.smallest/largest) should be updated accordingly when
// extended by range tombstones.
size_t output_size = outputs_.size();
if (output_size == 1) {
// This is the first file in the subcompaction.
//
// When outputting a range tombstone that spans a subcompaction boundary,
// the files on either side of that boundary need to include that
// boundary's user key. Otherwise, the spanning range tombstone would lose
// coverage.
//
// To achieve this while preventing files from overlapping in internal key
// (an LSM invariant violation), we allow the earlier file to include the
// boundary user key up to `kMaxSequenceNumber,kTypeRangeDeletion`. The
// later file can begin at the boundary user key at the newest key version
// it contains. At this point that version number is unknown since we have
// not processed the range tombstones yet, so permit any version. Same story
// applies to timestamp, and a non-nullptr `comp_start_user_key` should have
// `kMaxTs` here, which similarly permits any timestamp.
if (comp_start_user_key) {
lower_bound_buf.Set(*comp_start_user_key, kMaxSequenceNumber,
kTypeRangeDeletion);
lower_bound_guard = lower_bound_buf.Encode();
lower_bound = &lower_bound_guard;
} else {
lower_bound = nullptr;
}
} else {
// For subsequent output tables, only include range tombstones from min
// key onwards since the previous file was extended to contain range
// tombstones falling before min key.
if (range_tombstone_lower_bound_.size() > 0) {
assert(meta.smallest.size() == 0 ||
icmp.Compare(range_tombstone_lower_bound_, meta.smallest) < 0);
lower_bound_guard = range_tombstone_lower_bound_.Encode();
} else {
assert(meta.smallest.size() > 0);
lower_bound_guard = meta.smallest.Encode();
}
lower_bound = &lower_bound_guard;
}
const size_t ts_sz = ucmp->timestamp_size();
if (next_table_min_key.empty()) {
// Last file of the subcompaction.
if (comp_end_user_key) {
upper_bound_buf.Set(*comp_end_user_key, kMaxSequenceNumber,
kTypeRangeDeletion);
upper_bound_guard = upper_bound_buf.Encode();
upper_bound = &upper_bound_guard;
} else {
upper_bound = nullptr;
}
} else {
// There is another file coming whose coverage will begin at
// `next_table_min_key`. The current file needs to extend range tombstone
// coverage through its own keys (through `meta.largest`) and through user
// keys preceding `next_table_min_key`'s user key.
ParsedInternalKey next_table_min_key_parsed;
ParseInternalKey(next_table_min_key, &next_table_min_key_parsed,
false /* log_err_key */)
.PermitUncheckedError();
assert(next_table_min_key_parsed.sequence < kMaxSequenceNumber);
assert(meta.largest.size() == 0 ||
icmp.Compare(meta.largest.Encode(), next_table_min_key) < 0);
assert(!lower_bound || icmp.Compare(*lower_bound, next_table_min_key) <= 0);
if (meta.largest.size() > 0 &&
ucmp->EqualWithoutTimestamp(meta.largest.user_key(),
next_table_min_key_parsed.user_key)) {
// Caution: this assumes meta.largest.Encode() lives longer than
// upper_bound, which is only true if meta.largest is never updated.
// This just happens to be the case here since meta.largest serves
// as the upper_bound.
upper_bound_guard = meta.largest.Encode();
} else {
SetMaxSeqAndTs(upper_bound_buf, next_table_min_key_parsed.user_key,
ts_sz);
upper_bound_guard = upper_bound_buf.Encode();
}
upper_bound = &upper_bound_guard;
}
if (lower_bound && upper_bound &&
icmp.Compare(*lower_bound, *upper_bound) > 0) {
assert(meta.smallest.size() == 0 &&
ucmp->EqualWithoutTimestamp(ExtractUserKey(*lower_bound),
ExtractUserKey(*upper_bound)));
// This can only happen when lower_bound has the same user key as
// next_table_min_key and there is no point key in the current
// compaction output file.
return Status::OK();
}
// The end key of the subcompaction must be greater than or equal to the
// upper bound. If the end of the subcompaction or the upper bound is null,
// this file is the last file in the compaction, so there will be no
// overlap between this file and others.
assert(comp_end_user_key == nullptr || upper_bound == nullptr ||
ucmp->CompareWithoutTimestamp(ExtractUserKey(*upper_bound),
*comp_end_user_key) <= 0);
auto it = range_del_agg_->NewIterator(lower_bound, upper_bound);
Slice last_tombstone_start_user_key{};
bool reached_lower_bound = false;
const ReadOptions read_options(Env::IOActivity::kCompaction);
for (it->SeekToFirst(); it->Valid(); it->Next()) {
auto tombstone = it->Tombstone();
auto kv = tombstone.Serialize();
InternalKey tombstone_end = tombstone.SerializeEndKey();
// TODO: the underlying iterator should support clamping the bounds.
// tombstone_end.Encode is of form user_key@kMaxSeqno
// if it is equal to lower_bound, there is no need to include
// such range tombstone.
if (!reached_lower_bound && lower_bound &&
icmp.Compare(tombstone_end.Encode(), *lower_bound) <= 0) {
continue;
}
assert(!lower_bound ||
icmp.Compare(*lower_bound, tombstone_end.Encode()) <= 0);
reached_lower_bound = true;
// Garbage collection for range tombstones.
// If user-defined timestamp is enabled, range tombstones are dropped if
// they are at bottommost_level, below full_history_ts_low and not visible
// in any snapshot. trim_ts_ is passed to the constructor for
// range_del_agg_, and range_del_agg_ internally drops tombstones above
// trim_ts_.
bool consider_drop =
tombstone.seq_ <= earliest_snapshot &&
(ts_sz == 0 ||
(!full_history_ts_low.empty() &&
ucmp->CompareTimestamp(tombstone.ts_, full_history_ts_low) < 0));
if (consider_drop && bottommost_level) {
// TODO(andrewkr): tombstones that span multiple output files are
// counted for each compaction output file, so lots of double
// counting.
@ -520,149 +609,126 @@ Status CompactionOutputs::AddRangeDels(
continue;
}
assert(lower_bound == nullptr ||
ucmp->CompareWithoutTimestamp(ExtractUserKey(*lower_bound),
kv.second) < 0);
InternalKey tombstone_start = kv.first;
if (lower_bound &&
ucmp->CompareWithoutTimestamp(tombstone_start.user_key(),
ExtractUserKey(*lower_bound)) < 0) {
// This just updates the non-timestamp portion of `tombstone_start`'s user
// key. Ideally there would be a simpler API for this.
ParsedInternalKey tombstone_start_parsed;
ParseInternalKey(tombstone_start.Encode(), &tombstone_start_parsed,
false /* log_err_key */)
.PermitUncheckedError();
// timestamp should be from where sequence number is from, which is from
// tombstone in this case
std::string ts =
tombstone_start_parsed.GetTimestamp(ucmp->timestamp_size())
.ToString();
tombstone_start_parsed.user_key = ExtractUserKey(*lower_bound);
tombstone_start.SetFrom(tombstone_start_parsed, ts);
}
if (upper_bound != nullptr &&
icmp.Compare(*upper_bound, tombstone_start.Encode()) < 0) {
break;
}
if (lower_bound &&
icmp.Compare(tombstone_start.Encode(), *lower_bound) < 0) {
tombstone_start.DecodeFrom(*lower_bound);
}
if (upper_bound && icmp.Compare(*upper_bound, tombstone_end.Encode()) < 0) {
tombstone_end.DecodeFrom(*upper_bound);
}
if (consider_drop && compaction_->KeyRangeNotExistsBeyondOutputLevel(
tombstone_start.user_key(),
tombstone_end.user_key(), &level_ptrs_)) {
range_del_out_stats.num_range_del_drop_obsolete++;
range_del_out_stats.num_record_drop_obsolete++;
continue;
}
// Here we show that *only* range tombstones that overlap with
// [lower_bound, upper_bound] are added to the current file, and
// sanity checking invariants that should hold:
// - [tombstone_start, tombstone_end] overlaps with [lower_bound,
// upper_bound]
// - meta.smallest <= meta.largest
// Corresponding assertions are made; the proof is broken if any of them
// fails.
// TODO: show that *all* range tombstones that overlap with
// [lower_bound, upper_bound] are added.
// TODO: some invariant about boundaries are correctly updated.
//
// Note that `tombstone_start` is updated in the if condition above, we use
// tombstone_start to refer to its initial value, i.e.,
// it->Tombstone().first, and use tombstone_start* to refer to its value
// after the update.
//
// To show [lower_bound, upper_bound] overlaps with [tombstone_start,
// tombstone_end]:
// lower_bound <= upper_bound from the if condition right after all
// bounds are initialized. We assume each tombstone fragment has
// start_key.user_key < end_key.user_key, so
// tombstone_start < tombstone_end by
// FragmentedTombstoneIterator::Tombstone(). So these two ranges are both
// non-empty. The flag `reached_lower_bound` and the if logic before it
// ensures lower_bound <= tombstone_end. tombstone_start is only updated
// if it has a smaller user_key than lower_bound user_key, so
// tombstone_start <= tombstone_start*. The above if condition implies
// tombstone_start* <= upper_bound. So we have
// tombstone_start <= upper_bound and lower_bound <= tombstone_end
// and the two ranges overlap.
//
// To show meta.smallest <= meta.largest:
// From the implementation of UpdateBoundariesForRange(), it suffices to
// prove that when it is first called in this function, its parameters
// satisfy `start <= end`, where start = max(tombstone_start*, lower_bound)
// and end = min(tombstone_end, upper_bound). From the above proof we have
// lower_bound <= tombstone_end and lower_bound <= upper_bound. We only need
// to show that tombstone_start* <= min(tombstone_end, upper_bound).
// Note that tombstone_start*.user_key = max(tombstone_start.user_key,
// lower_bound.user_key). Assuming tombstone_end always has
// kMaxSequenceNumber and lower_bound.seqno < kMaxSequenceNumber.
// Since lower_bound <= tombstone_end and lower_bound.seqno <
// tombstone_end.seqno (in absolute number order, not internal key order),
// lower_bound.user_key < tombstone_end.user_key.
// Since lower_bound.user_key < tombstone_end.user_key and
// tombstone_start.user_key < tombstone_end.user_key, tombstone_start* <
// tombstone_end. Since tombstone_start* <= upper_bound from the above proof
// and tombstone_start* < tombstone_end, tombstone_start* <=
// min(tombstone_end, upper_bound), so the two ranges overlap.
// Range tombstone is not supported by output validator yet.
builder_->Add(kv.first.Encode(), kv.second);
assert(icmp.Compare(tombstone_start, tombstone_end) <= 0);
meta.UpdateBoundariesForRange(tombstone_start, tombstone_end,
tombstone.seq_, icmp);
if (!bottommost_level) {
bool start_user_key_changed =
last_tombstone_start_user_key.empty() ||
ucmp->CompareWithoutTimestamp(last_tombstone_start_user_key,
it->start_key()) < 0;
last_tombstone_start_user_key = it->start_key();
if (start_user_key_changed) {
// If tombstone_start >= tombstone_end, then either no key range is
// covered, or that they have the same user key. If they have the same
// user key, then the internal key range should only be within this
// level, and no keys from older levels is covered.
if (ucmp->CompareWithoutTimestamp(tombstone_start.user_key(),
tombstone_end.user_key()) < 0) {
SizeApproximationOptions approx_opts;
approx_opts.files_size_error_margin = 0.1;
auto approximate_covered_size =
compaction_->input_version()->version_set()->ApproximateSize(
approx_opts, read_options, compaction_->input_version(),
tombstone_start.Encode(), tombstone_end.Encode(),
compaction_->output_level() + 1 /* start_level */,
-1 /* end_level */, kCompaction);
meta.compensated_range_deletion_size += approximate_covered_size;
}
}
SizeApproximationOptions approx_opts;
approx_opts.files_size_error_margin = 0.1;
auto approximate_covered_size =
compaction_->input_version()->version_set()->ApproximateSize(
approx_opts, compaction_->input_version(),
tombstone_start.Encode(), tombstone_end.Encode(),
compaction_->output_level() + 1 /* start_level */,
-1 /* end_level */, kCompaction);
meta.compensated_range_deletion_size += approximate_covered_size;
}
// The smallest key in a file is used for range tombstone truncation, so
// it cannot have a seqnum of 0 (unless the smallest data key in a file
// has a seqnum of 0). Otherwise, the truncated tombstone may expose
// deleted keys at lower levels.
assert(smallest_ikey_seqnum == 0 || lower_bound_from_range_tombstone ||
ExtractInternalKeyFooter(meta.smallest.Encode()) !=
PackSequenceAndType(0, kTypeRangeDeletion));
}
return Status::OK();
}
@ -719,6 +785,8 @@ CompactionOutputs::CompactionOutputs(const Compaction* compaction,
if (compaction->output_level() != 0) {
FillFilesToCutForTtl();
}
level_ptrs_ = std::vector<size_t>(compaction_->number_levels(), 0);
}
} // namespace ROCKSDB_NAMESPACE

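The proof above justifies a simple clamp: the tombstone is clipped to the
output file's bounds with start = max(tombstone_start*, lower_bound) and
end = min(tombstone_end, upper_bound), and overlap guarantees start <= end.
A minimal self-contained sketch of that clamp, with a hypothetical byte-wise
comparator standing in for InternalKeyComparator:

#include <cassert>
#include <string>

// Hypothetical stand-in for InternalKeyComparator: plain byte-wise order.
struct KeyComparator {
  int Compare(const std::string& a, const std::string& b) const {
    return a.compare(b);
  }
};

int main() {
  KeyComparator icmp;
  // Tombstone [b, g) and output-file bounds [a, e]: the ranges overlap.
  std::string tombstone_start = "b", tombstone_end = "g";
  std::string lower_bound = "a", upper_bound = "e";
  // start = max(tombstone_start, lower_bound)
  std::string start = icmp.Compare(tombstone_start, lower_bound) < 0
                          ? lower_bound
                          : tombstone_start;
  // end = min(tombstone_end, upper_bound)
  std::string end = icmp.Compare(tombstone_end, upper_bound) > 0
                        ? upper_bound
                        : tombstone_end;
  // Holds whenever the two ranges overlap, as proven above.
  assert(icmp.Compare(start, end) <= 0);
  return 0;
}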
@ -167,9 +167,15 @@ class CompactionOutputs {
current_output_file_size_ = 0;
}
// Add range-dels from the aggregator to the current output file
// Add range deletions from the range_del_agg_ to the current output file.
// Input parameters, `range_tombstone_lower_bound_`, and the current
// output's metadata determine the bounds of the range deletions to add.
// Updates the output file's metadata boundaries if they are extended by
// range tombstones.
//
// @param comp_start_user_key and comp_end_user_key include timestamp if
// user-defined timestamp is enabled.
// user-defined timestamp is enabled. Their timestamp should be max timestamp.
// @param next_table_min_key internal key lower bound for the next compaction
// output.
// @param full_history_ts_low used for range tombstone garbage collection.
Status AddRangeDels(const Slice* comp_start_user_key,
const Slice* comp_end_user_key,
@ -200,10 +206,10 @@ class CompactionOutputs {
// We may only split the output when the cursor is in the range. Split
if ((!end.has_value() ||
icmp->user_comparator()->Compare(
ExtractUserKey(output_split_key->Encode()), end.value()) < 0) &&
(!start.has_value() || icmp->user_comparator()->Compare(
ExtractUserKey(output_split_key->Encode()),
start.value()) > 0)) {
ExtractUserKey(output_split_key->Encode()), *end) < 0) &&
(!start.has_value() ||
icmp->user_comparator()->Compare(
ExtractUserKey(output_split_key->Encode()), *start) > 0)) {
local_output_split_key_ = output_split_key;
}
}
@ -221,6 +227,13 @@ class CompactionOutputs {
}
}
// Updates states related to file cutting for TTL.
// Returns a boolean value indicating whether the current
// compaction output file should be cut before `internal_key`.
//
// @param internal_key the current key to be added to output.
bool UpdateFilesToCutForTTLStates(const Slice& internal_key);
// Updates tracked grandparent information, such as the grandparent index,
// whether the current key is in the gap between two grandparent files, the
// accumulated grandparent files' size, etc.
@ -343,6 +356,15 @@ class CompactionOutputs {
// The smallest key of the current output file, this is set when current
// output file's smallest key is a range tombstone start key.
InternalKey range_tombstone_lower_bound_;
// Used for calls to compaction->KeyRangeNotExistsBeyondOutputLevel() in
// CompactionOutputs::AddRangeDels().
// level_ptrs_[i] holds the index of the file in level i that was checked
// during the last call to compaction->KeyRangeNotExistsBeyondOutputLevel().
// This allows future calls to pick up where the last one left off, since
// range tombstones are added to the output file in increasing key order
// within each subcompaction.
std::vector<size_t> level_ptrs_;
};
// helper struct to concatenate the last level and penultimate level outputs

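The `level_ptrs_` member added above implements a resumable linear scan:
because the range tombstones handed to AddRangeDels() arrive in increasing
key order within a subcompaction, each per-level search can start where the
previous one stopped instead of from the beginning. A small sketch of the
pattern with hypothetical types (not RocksDB's own):

#include <cstddef>
#include <string>
#include <vector>

// Hypothetical per-level metadata: files sorted by key range, no overlap.
struct FileRange {
  std::string smallest, largest;
};

// `ptr` persists across calls, like level_ptrs_[i]. Correct only while
// successive queries use non-decreasing keys.
bool LevelCoversKey(const std::vector<FileRange>& files, size_t& ptr,
                    const std::string& key) {
  while (ptr < files.size() && files[ptr].largest < key) {
    ++ptr;  // files entirely before `key` are never revisited
  }
  return ptr < files.size() && files[ptr].smallest <= key;
}

With monotone queries, a whole subcompaction costs one pass over each
level's file list rather than one search per tombstone.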
@ -20,7 +20,7 @@
#include "file/filename.h"
#include "logging/log_buffer.h"
#include "logging/logging.h"
#include "monitoring/statistics.h"
#include "monitoring/statistics_impl.h"
#include "test_util/sync_point.h"
#include "util/random.h"
#include "util/string_util.h"
@ -611,23 +611,21 @@ Compaction* CompactionPicker::CompactRange(
// Universal compaction with more than one level always compacts all the
// files together to the last level.
assert(vstorage->num_levels() > 1);
int max_output_level =
vstorage->MaxOutputLevel(ioptions_.allow_ingest_behind);
// DBImpl::CompactRange() set output level to be the last level
if (ioptions_.allow_ingest_behind) {
assert(output_level == vstorage->num_levels() - 2);
} else {
assert(output_level == vstorage->num_levels() - 1);
}
assert(output_level == max_output_level);
// DBImpl::RunManualCompaction will make full range for universal compaction
assert(begin == nullptr);
assert(end == nullptr);
*compaction_end = nullptr;
int start_level = 0;
for (; start_level < vstorage->num_levels() &&
for (; start_level <= max_output_level &&
vstorage->NumLevelFiles(start_level) == 0;
start_level++) {
}
if (start_level == vstorage->num_levels()) {
if (start_level > max_output_level) {
return nullptr;
}
@ -637,9 +635,9 @@ Compaction* CompactionPicker::CompactRange(
return nullptr;
}
std::vector<CompactionInputFiles> inputs(vstorage->num_levels() -
std::vector<CompactionInputFiles> inputs(max_output_level + 1 -
start_level);
for (int level = start_level; level < vstorage->num_levels(); level++) {
for (int level = start_level; level <= max_output_level; level++) {
inputs[level - start_level].level = level;
auto& files = inputs[level - start_level].files;
for (FileMetaData* f : vstorage->LevelFiles(level)) {
@ -753,8 +751,10 @@ Compaction* CompactionPicker::CompactRange(
// for BOTTOM LEVEL compaction only, use max_file_num_to_ignore to filter out
// files that are created during the current compaction.
if (compact_range_options.bottommost_level_compaction ==
BottommostLevelCompaction::kForceOptimized &&
if ((compact_range_options.bottommost_level_compaction ==
BottommostLevelCompaction::kForceOptimized ||
compact_range_options.bottommost_level_compaction ==
BottommostLevelCompaction::kIfHaveCompactionFilter) &&
max_file_num_to_ignore != std::numeric_limits<uint64_t>::max()) {
assert(input_level == output_level);
// inputs_shrunk holds a continuous subset of input files which were all
@ -877,7 +877,6 @@ Compaction* CompactionPicker::CompactRange(
return compaction;
}
#ifndef ROCKSDB_LITE
namespace {
// Test whether two files have overlapping key-ranges.
bool HaveOverlappingKeyRanges(const Comparator* c, const SstFileMetaData& a,
@ -1116,7 +1115,6 @@ Status CompactionPicker::SanitizeCompactionInputFiles(
return Status::OK();
}
#endif // !ROCKSDB_LITE
void CompactionPicker::RegisterCompaction(Compaction* c) {
if (c == nullptr) {

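The hunks above replace hand-rolled `num_levels() - 1` arithmetic with
vstorage->MaxOutputLevel(ioptions_.allow_ingest_behind). A sketch of the
rule those call sites rely on, assuming the helper simply reserves the
bottommost level when ingest-behind is enabled (assumed behavior, not the
literal RocksDB implementation):

// With allow_ingest_behind, the last level is reserved for ingested files,
// so compactions may output at most to the level directly above it.
int MaxOutputLevel(int num_levels, bool allow_ingest_behind) {
  return allow_ingest_behind ? num_levels - 2 : num_levels - 1;
}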
@ -93,11 +93,9 @@ class CompactionPicker {
// into a valid one by adding more files, the function will return a
// non-ok status with specific reason.
//
#ifndef ROCKSDB_LITE
Status SanitizeCompactionInputFiles(std::unordered_set<uint64_t>* input_files,
const ColumnFamilyMetaData& cf_meta,
const int output_level) const;
#endif // ROCKSDB_LITE
// Free up the files that participated in a compaction
//
@ -229,11 +227,9 @@ class CompactionPicker {
// A helper function to SanitizeCompactionInputFiles() that
// sanitizes "input_files" by adding necessary files.
#ifndef ROCKSDB_LITE
virtual Status SanitizeCompactionInputFilesForAllLevels(
std::unordered_set<uint64_t>* input_files,
const ColumnFamilyMetaData& cf_meta, const int output_level) const;
#endif // ROCKSDB_LITE
// Keeps track of all compactions that are running on Level0.
// Protected by DB mutex
@ -246,7 +242,6 @@ class CompactionPicker {
const InternalKeyComparator* const icmp_;
};
#ifndef ROCKSDB_LITE
// A dummy compaction that never triggers any automatic
// compaction.
class NullCompactionPicker : public CompactionPicker {
@ -287,7 +282,6 @@ class NullCompactionPicker : public CompactionPicker {
return false;
}
};
#endif // !ROCKSDB_LITE
// Attempts to find an intra L0 compaction conforming to the given parameters.
//

@ -8,7 +8,6 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/compaction/compaction_picker_fifo.h"
#ifndef ROCKSDB_LITE
#include <cinttypes>
#include <string>
@ -17,6 +16,7 @@
#include "db/column_family.h"
#include "logging/log_buffer.h"
#include "logging/logging.h"
#include "options/options_helper.h"
#include "util/string_util.h"
namespace ROCKSDB_NAMESPACE {
@ -285,31 +285,36 @@ Compaction* FIFOCompactionPicker::PickSizeCompaction(
return c;
}
Compaction* FIFOCompactionPicker::PickCompactionToWarm(
Compaction* FIFOCompactionPicker::PickTemperatureChangeCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
LogBuffer* log_buffer) {
if (mutable_cf_options.compaction_options_fifo.age_for_warm == 0) {
const std::vector<FileTemperatureAge>& ages =
mutable_cf_options.compaction_options_fifo
.file_temperature_age_thresholds;
if (ages.empty()) {
return nullptr;
}
// PickCompactionToWarm is only triggered if there is no non-L0 files.
for (int level = 1; level < vstorage->num_levels(); ++level) {
if (GetTotalFilesSize(vstorage->LevelFiles(level)) > 0) {
return nullptr;
}
// Does not apply to multi-level FIFO.
if (vstorage->num_levels() > 1) {
return nullptr;
}
const int kLevel0 = 0;
const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
if (level_files.empty()) {
return nullptr;
}
int64_t _current_time;
auto status = ioptions_.clock->GetCurrentTime(&_current_time);
if (!status.ok()) {
ROCKS_LOG_BUFFER(log_buffer,
"[%s] FIFO compaction: Couldn't get current time: %s. "
"Not doing compactions based on warm threshold. ",
cf_name.c_str(), status.ToString().c_str());
ROCKS_LOG_BUFFER(
log_buffer,
"[%s] FIFO compaction: Couldn't get current time: %s. "
"Not doing compactions based on file temperature-age threshold. ",
cf_name.c_str(), status.ToString().c_str());
return nullptr;
}
const uint64_t current_time = static_cast<uint64_t>(_current_time);
@ -328,56 +333,77 @@ Compaction* FIFOCompactionPicker::PickCompactionToWarm(
inputs[0].level = 0;
// avoid underflow
if (current_time > mutable_cf_options.compaction_options_fifo.age_for_warm) {
uint64_t create_time_threshold =
current_time - mutable_cf_options.compaction_options_fifo.age_for_warm;
uint64_t min_age = ages[0].age;
// kLastTemperature means target temperature is to be determined.
Temperature compaction_target_temp = Temperature::kLastTemperature;
if (current_time > min_age) {
uint64_t create_time_threshold = current_time - min_age;
uint64_t compaction_size = 0;
// We will ideally identify a file qualifying for warm tier by knowing
// the timestamp for the youngest entry in the file. However, right now
// we don't have the information. We infer it by looking at timestamp
// of the next file's (which is just younger) oldest entry's timestamp.
FileMetaData* prev_file = nullptr;
for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
FileMetaData* f = *ritr;
assert(f);
if (f->being_compacted) {
// Right now this probably won't happen as we never try to schedule
// two compactions in parallel, so here we just simply don't schedule
// anything.
// We will ideally identify a file qualifying for temperature change by
// knowing the timestamp for the youngest entry in the file. However, right
// now we don't have the information. We infer it by looking at timestamp of
// the previous file's (which is just younger) oldest entry's timestamp.
Temperature cur_target_temp;
// avoid index underflow
assert(level_files.size() >= 1);
for (size_t index = level_files.size() - 1; index >= 1; --index) {
// Try to add cur_file to compaction inputs.
FileMetaData* cur_file = level_files[index];
// prev_file is just younger than cur_file
FileMetaData* prev_file = level_files[index - 1];
if (cur_file->being_compacted) {
// Should not happen since we check for
// `level0_compactions_in_progress_` above. Here we simply don't
// schedule anything.
return nullptr;
}
uint64_t oldest_ancester_time = f->TryGetOldestAncesterTime();
if (oldest_ancester_time == kUnknownOldestAncesterTime) {
uint64_t oldest_ancestor_time = prev_file->TryGetOldestAncesterTime();
if (oldest_ancestor_time == kUnknownOldestAncesterTime) {
// Older files might not have enough information. It is possible to
// handle these files by looking at newer files, but maintaining the
// logic isn't worth it.
break;
}
if (oldest_ancester_time > create_time_threshold) {
// The previous file (which has slightly older data) doesn't qualify
// for warm tier.
if (oldest_ancestor_time > create_time_threshold) {
// cur_file is too fresh
break;
}
if (prev_file != nullptr) {
compaction_size += prev_file->fd.GetFileSize();
if (compaction_size > mutable_cf_options.max_compaction_bytes) {
cur_target_temp = ages[0].temperature;
for (size_t i = 1; i < ages.size(); ++i) {
if (current_time >= ages[i].age &&
oldest_ancestor_time <= current_time - ages[i].age) {
cur_target_temp = ages[i].temperature;
}
}
if (cur_file->temperature == cur_target_temp) {
if (inputs[0].empty()) {
continue;
} else {
break;
}
inputs[0].files.push_back(prev_file);
ROCKS_LOG_BUFFER(log_buffer,
"[%s] FIFO compaction: picking file %" PRIu64
" with next file's oldest time %" PRIu64 " for warm",
cf_name.c_str(), prev_file->fd.GetNumber(),
oldest_ancester_time);
}
if (f->temperature == Temperature::kUnknown ||
f->temperature == Temperature::kHot) {
prev_file = f;
} else if (!inputs[0].files.empty()) {
// A warm file newer than files picked.
// cur_file needs to change temperature
if (compaction_target_temp == Temperature::kLastTemperature) {
assert(inputs[0].empty());
compaction_target_temp = cur_target_temp;
} else if (cur_target_temp != compaction_target_temp) {
assert(!inputs[0].empty());
break;
}
if (inputs[0].empty() || compaction_size + cur_file->fd.GetFileSize() <=
mutable_cf_options.max_compaction_bytes) {
inputs[0].files.push_back(cur_file);
compaction_size += cur_file->fd.GetFileSize();
ROCKS_LOG_BUFFER(
log_buffer,
"[%s] FIFO compaction: picking file %" PRIu64
" with next file's oldest time %" PRIu64 " for temperature %s.",
cf_name.c_str(), cur_file->fd.GetNumber(), oldest_ancestor_time,
temperature_to_string[cur_target_temp].c_str());
}
if (compaction_size > mutable_cf_options.max_compaction_bytes) {
break;
} else {
assert(prev_file == nullptr);
}
}
}
@ -391,7 +417,7 @@ Compaction* FIFOCompactionPicker::PickCompactionToWarm(
std::move(inputs), 0, 0 /* output file size limit */,
0 /* max compaction bytes, not applicable */, 0 /* output path ID */,
mutable_cf_options.compression, mutable_cf_options.compression_opts,
Temperature::kWarm,
compaction_target_temp,
/* max_subcompactions */ 0, {}, /* is manual */ false, /* trim_ts */ "",
vstorage->CompactionScore(0),
/* is deletion compaction */ false, /* l0_files_might_overlap */ true,
@ -413,8 +439,8 @@ Compaction* FIFOCompactionPicker::PickCompaction(
vstorage, log_buffer);
}
if (c == nullptr) {
c = PickCompactionToWarm(cf_name, mutable_cf_options, mutable_db_options,
vstorage, log_buffer);
c = PickTemperatureChangeCompaction(
cf_name, mutable_cf_options, mutable_db_options, vstorage, log_buffer);
}
RegisterCompaction(c);
return c;
@ -443,4 +469,3 @@ Compaction* FIFOCompactionPicker::CompactRange(
}
} // namespace ROCKSDB_NAMESPACE
#endif // !ROCKSDB_LITE

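PickTemperatureChangeCompaction generalizes the old single age_for_warm
cutoff to a list of {temperature, age} thresholds: a file's target
temperature is the last threshold whose age the file's inferred age has
passed. A condensed sketch of that selection, mirroring the loop over
`ages` above (types redefined here for self-containment):

#include <cstddef>
#include <cstdint>
#include <vector>

enum class Temperature { kUnknown, kHot, kWarm, kCold };

struct FileTemperatureAge {
  Temperature temperature;
  uint64_t age;  // seconds
};

// Assumes `ages` is non-empty and sorted by ascending age, as the picker
// checks before this point. Later (larger-age) thresholds win when the
// file is old enough for them.
Temperature TargetTemperature(const std::vector<FileTemperatureAge>& ages,
                              uint64_t current_time,
                              uint64_t oldest_ancestor_time) {
  Temperature target = ages[0].temperature;
  for (size_t i = 1; i < ages.size(); ++i) {
    if (current_time >= ages[i].age &&
        oldest_ancestor_time <= current_time - ages[i].age) {
      target = ages[i].temperature;
    }
  }
  return target;
}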
@ -8,7 +8,6 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#ifndef ROCKSDB_LITE
#include "db/compaction/compaction_picker.h"
@ -53,11 +52,9 @@ class FIFOCompactionPicker : public CompactionPicker {
VersionStorageInfo* version,
LogBuffer* log_buffer);
Compaction* PickCompactionToWarm(const std::string& cf_name,
const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options,
VersionStorageInfo* version,
LogBuffer* log_buffer);
Compaction* PickTemperatureChangeCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
LogBuffer* log_buffer);
};
} // namespace ROCKSDB_NAMESPACE
#endif // !ROCKSDB_LITE

@ -83,7 +83,7 @@ class LevelCompactionBuilder {
Compaction* GetCompaction();
// For the specfied level, pick a file that we want to compact.
// From `start_level_`, pick files to compact to `output_level_`.
// Returns false if there is no file to compact.
// If it returns true, inputs->files.size() will be exactly one for
// all compaction priorities except round-robin. For round-robin,
@ -107,8 +107,9 @@ class LevelCompactionBuilder {
bool PickIntraL0Compaction();
// Return true if TrivialMove is extended. `start_index` is the index of
// the intiial file picked, which should already be in `start_level_inputs_`.
bool TryExtendNonL0TrivialMove(int start_index);
// the initial file picked, which should already be in `start_level_inputs_`.
bool TryExtendNonL0TrivialMove(int start_index,
bool only_expand_right = false);
// Picks a file from level_files to compact.
// level_files is a vector of (level, file metadata) in ascending order of
@ -355,7 +356,8 @@ void LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion() {
vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
&output_level_inputs.files);
if (output_level_inputs.empty()) {
if (TryExtendNonL0TrivialMove((int)start_index)) {
if (TryExtendNonL0TrivialMove((int)start_index,
true /* only_expand_right */)) {
return;
}
}
@ -501,6 +503,16 @@ Compaction* LevelCompactionBuilder::PickCompaction() {
}
Compaction* LevelCompactionBuilder::GetCompaction() {
// TryPickL0TrivialMove() does not apply to the case when compacting L0 to an
// empty output level. So L0 files are picked in PickFileToCompact() by
// compaction score. We may still be able to do trivial move when this file
// does not overlap with other L0s. This happens when
// compaction_inputs_[0].size() == 1 since SetupOtherL0FilesIfNeeded() did not
// pull in more L0s.
assert(!compaction_inputs_.empty());
bool l0_files_might_overlap =
start_level_ == 0 && !is_l0_trivial_move_ &&
(compaction_inputs_.size() > 1 || compaction_inputs_[0].size() > 1);
auto c = new Compaction(
vstorage_, ioptions_, mutable_cf_options_, mutable_db_options_,
std::move(compaction_inputs_), output_level_,
@ -515,8 +527,7 @@ Compaction* LevelCompactionBuilder::GetCompaction() {
Temperature::kUnknown,
/* max_subcompactions */ 0, std::move(grandparents_), is_manual_,
/* trim_ts */ "", start_level_score_, false /* deletion_compaction */,
/* l0_files_might_overlap */ start_level_ == 0 && !is_l0_trivial_move_,
compaction_reason_);
l0_files_might_overlap, compaction_reason_);
// If it's level 0 compaction, make sure we don't execute any other level 0
// compactions in parallel
@ -653,7 +664,8 @@ bool LevelCompactionBuilder::TryPickL0TrivialMove() {
return false;
}
bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index) {
bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index,
bool only_expand_right) {
if (start_level_inputs_.size() == 1 &&
(ioptions_.db_paths.empty() || ioptions_.db_paths.size() == 1) &&
(mutable_cf_options_.compression_per_level.empty())) {
@ -670,6 +682,7 @@ bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index) {
size_t total_size = initial_file->fd.GetFileSize();
CompactionInputFiles output_level_inputs;
output_level_inputs.level = output_level_;
// Expand towards right
for (int i = start_index + 1;
i < static_cast<int>(level_files.size()) &&
start_level_inputs_.size() < kMaxMultiTrivialMove;
@ -702,6 +715,37 @@ bool LevelCompactionBuilder::TryExtendNonL0TrivialMove(int start_index) {
}
start_level_inputs_.files.push_back(next_file);
}
// Expand towards left
if (!only_expand_right) {
for (int i = start_index - 1;
i >= 0 && start_level_inputs_.size() < kMaxMultiTrivialMove; i--) {
FileMetaData* next_file = level_files[i];
if (next_file->being_compacted) {
break;
}
vstorage_->GetOverlappingInputs(output_level_, &(next_file->smallest),
&(initial_file->largest),
&output_level_inputs.files);
if (!output_level_inputs.empty()) {
break;
}
if (i > 0 && compaction_picker_->icmp()
->user_comparator()
->CompareWithoutTimestamp(
next_file->smallest.user_key(),
level_files[i - 1]->largest.user_key()) == 0) {
// next_file shares its boundary user key with the file to its left, so
// adding it would not leave a clean cut. Skip.
break;
}
total_size += next_file->fd.GetFileSize();
if (total_size > mutable_cf_options_.max_compaction_bytes) {
break;
}
// keep `files` sorted in increasing order by key range
start_level_inputs_.files.insert(start_level_inputs_.files.begin(),
next_file);
}
}
return start_level_inputs_.size() > 1;
}
return false;
@ -785,7 +829,10 @@ bool LevelCompactionBuilder::PickFileToCompact() {
vstorage_->GetOverlappingInputs(output_level_, &smallest, &largest,
&output_level_inputs.files);
if (output_level_inputs.empty()) {
if (TryExtendNonL0TrivialMove(index)) {
if (start_level_ > 0 &&
TryExtendNonL0TrivialMove(index,
ioptions_.compaction_pri ==
kRoundRobin /* only_expand_right */)) {
break;
}
} else {

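TryExtendNonL0TrivialMove now grows the picked file set in both directions
along the sorted level, stopping at a file that is being compacted, at
output-level overlap, at an unclean user-key cut, or when
max_compaction_bytes is exceeded. A condensed sketch of the both-ended
growth with those checks elided (hypothetical types; the real code applies
the checks shown in the hunk above):

#include <cstddef>
#include <cstdint>
#include <deque>
#include <vector>

// Hypothetical file record. A std::deque keeps growth at both ends cheap
// while preserving the "sorted by key range" invariant the picker needs.
struct PickedFile {
  int number;
  uint64_t size;
};

std::deque<PickedFile> ExtendBothWays(const std::vector<PickedFile>& level,
                                      size_t start_index,
                                      uint64_t max_bytes) {
  std::deque<PickedFile> picked{level[start_index]};
  uint64_t total = level[start_index].size;
  // Expand towards right.
  for (size_t i = start_index + 1; i < level.size(); ++i) {
    if (total + level[i].size > max_bytes) break;
    total += level[i].size;
    picked.push_back(level[i]);
  }
  // Expand towards left; push_front keeps files in increasing key order.
  for (size_t i = start_index; i-- > 0;) {
    if (total + level[i].size > max_bytes) break;
    total += level[i].size;
    picked.push_front(level[i]);
  }
  return picked;
}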
@ -70,6 +70,11 @@ class CompactionPickerTestBase : public testing::Test {
mutable_cf_options_.RefreshDerivedOptions(ioptions_);
ioptions_.cf_paths.emplace_back("dummy",
std::numeric_limits<uint64_t>::max());
// When the default value of this option is true, universal compaction
// tests can encounter assertion failure since SanitizeOption() is
// not run to set this option to false. So we do the sanitization
// here. Tests that test this option set this option to true explicitly.
ioptions_.level_compaction_dynamic_level_bytes = false;
}
~CompactionPickerTestBase() override {}
@ -148,7 +153,8 @@ class CompactionPickerTestBase : public testing::Test {
smallest_seq, largest_seq, marked_for_compact, temperature,
kInvalidBlobFileNumber, kUnknownOldestAncesterTime,
kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum,
kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0);
kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0,
true /* user_defined_timestamps_persisted */);
f->compensated_file_size =
(compensated_file_size != 0) ? compensated_file_size : file_size;
f->oldest_ancester_time = oldest_ancestor_time;
@ -482,8 +488,6 @@ TEST_F(CompactionPickerTest, LevelTriggerDynamic4) {
ASSERT_EQ(num_levels - 1, compaction->output_level());
}
// Universal and FIFO Compactions are not supported in ROCKSDB_LITE
#ifndef ROCKSDB_LITE
TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
NewVersionStorage(1, kCompactionStyleUniversal);
UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
@ -507,7 +511,7 @@ TEST_F(CompactionPickerTest, NeedsCompactionUniversal) {
TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) {
const uint64_t kFileSize = 100000;
NewVersionStorage(1, kCompactionStyleUniversal);
NewVersionStorage(3 /* num_levels */, kCompactionStyleUniversal);
ioptions_.allow_ingest_behind = true;
ioptions_.num_levels = 3;
UniversalCompactionPicker universal_compaction_picker(ioptions_, &icmp_);
@ -534,6 +538,14 @@ TEST_F(CompactionPickerTest, CompactionUniversalIngestBehindReservedLevel) {
// output level should be the one above the bottom-most
ASSERT_EQ(1, compaction->output_level());
// input should not include the reserved level
const std::vector<CompactionInputFiles>* inputs = compaction->inputs();
for (const auto& compaction_input : *inputs) {
if (!compaction_input.empty()) {
ASSERT_LT(compaction_input.level, 2);
}
}
}
// Tests if the files can be trivially moved in multi level
// universal compaction when allow_trivial_move option is set
@ -1007,29 +1019,28 @@ TEST_F(CompactionPickerTest, NeedsCompactionFIFO) {
}
}
TEST_F(CompactionPickerTest, FIFOToWarm1) {
TEST_F(CompactionPickerTest, FIFOToCold1) {
NewVersionStorage(1, kCompactionStyleFIFO);
const uint64_t kFileSize = 100000;
const uint64_t kMaxSize = kFileSize * 100000;
uint64_t kWarmThreshold = 2000;
uint64_t kColdThreshold = 2000;
fifo_options_.max_table_files_size = kMaxSize;
fifo_options_.age_for_warm = kWarmThreshold;
fifo_options_.file_temperature_age_thresholds = {
{Temperature::kCold, kColdThreshold}};
mutable_cf_options_.compaction_options_fifo = fifo_options_;
mutable_cf_options_.level0_file_num_compaction_trigger = 2;
mutable_cf_options_.level0_file_num_compaction_trigger = 100;
mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
int64_t current_time = 0;
ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
uint64_t threshold_time =
static_cast<uint64_t>(current_time) - kWarmThreshold;
Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
Temperature::kUnknown, threshold_time + 100);
Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
Temperature::kUnknown, threshold_time - 2000);
static_cast<uint64_t>(current_time) - kColdThreshold;
Add(0 /* level */, 4U /* file_number */, "260", "300", 1 * kFileSize, 0, 2500,
2600, 0, true, Temperature::kUnknown,
threshold_time - 2000 /* oldest_ancestor_time */);
// Qualifies for compaction to kCold.
Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
Temperature::kUnknown, threshold_time - 3000);
UpdateVersionStorageInfo();
@ -1039,33 +1050,36 @@ TEST_F(CompactionPickerTest, FIFOToWarm1) {
cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
&log_buffer_));
ASSERT_TRUE(compaction.get() != nullptr);
ASSERT_EQ(compaction->compaction_reason(),
CompactionReason::kChangeTemperature);
ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
ASSERT_EQ(1U, compaction->num_input_files(0));
ASSERT_EQ(3U, compaction->input(0, 0)->fd.GetNumber());
}
TEST_F(CompactionPickerTest, FIFOToWarm2) {
TEST_F(CompactionPickerTest, FIFOToCold2) {
NewVersionStorage(1, kCompactionStyleFIFO);
const uint64_t kFileSize = 100000;
const uint64_t kMaxSize = kFileSize * 100000;
uint64_t kWarmThreshold = 2000;
uint64_t kColdThreshold = 2000;
fifo_options_.max_table_files_size = kMaxSize;
fifo_options_.age_for_warm = kWarmThreshold;
fifo_options_.file_temperature_age_thresholds = {
{Temperature::kCold, kColdThreshold}};
mutable_cf_options_.compaction_options_fifo = fifo_options_;
mutable_cf_options_.level0_file_num_compaction_trigger = 2;
mutable_cf_options_.level0_file_num_compaction_trigger = 100;
mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
int64_t current_time = 0;
ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
uint64_t threshold_time =
static_cast<uint64_t>(current_time) - kWarmThreshold;
static_cast<uint64_t>(current_time) - kColdThreshold;
Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
Temperature::kUnknown, threshold_time + 100);
Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
Temperature::kUnknown, threshold_time - 2000);
Temperature::kUnknown, threshold_time);
// The following two files qualify for compaction to kCold.
Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
Temperature::kUnknown, threshold_time - 3000);
Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
@ -1077,34 +1091,40 @@ TEST_F(CompactionPickerTest, FIFOToWarm2) {
cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
&log_buffer_));
ASSERT_TRUE(compaction.get() != nullptr);
ASSERT_EQ(compaction->compaction_reason(),
CompactionReason::kChangeTemperature);
ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
ASSERT_EQ(2U, compaction->num_input_files(0));
ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
}
TEST_F(CompactionPickerTest, FIFOToWarmMaxSize) {
TEST_F(CompactionPickerTest, FIFOToColdMaxCompactionSize) {
NewVersionStorage(1, kCompactionStyleFIFO);
const uint64_t kFileSize = 100000;
const uint64_t kMaxSize = kFileSize * 100000;
uint64_t kWarmThreshold = 2000;
uint64_t kColdThreshold = 2000;
fifo_options_.max_table_files_size = kMaxSize;
fifo_options_.age_for_warm = kWarmThreshold;
fifo_options_.file_temperature_age_thresholds = {
{Temperature::kCold, kColdThreshold}};
mutable_cf_options_.compaction_options_fifo = fifo_options_;
mutable_cf_options_.level0_file_num_compaction_trigger = 2;
mutable_cf_options_.level0_file_num_compaction_trigger = 100;
mutable_cf_options_.max_compaction_bytes = kFileSize * 9;
FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
int64_t current_time = 0;
ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
uint64_t threshold_time =
static_cast<uint64_t>(current_time) - kWarmThreshold;
static_cast<uint64_t>(current_time) - kColdThreshold;
Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
Temperature::kUnknown, threshold_time + 100);
Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
Temperature::kUnknown, threshold_time - 2000);
// The following two files qualify for compaction to kCold.
// But only the last two should be included to respect `max_compaction_bytes`.
Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
Temperature::kUnknown, threshold_time - 3000);
Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
@ -1118,40 +1138,45 @@ TEST_F(CompactionPickerTest, FIFOToWarmMaxSize) {
cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
&log_buffer_));
ASSERT_TRUE(compaction.get() != nullptr);
ASSERT_EQ(compaction->compaction_reason(),
CompactionReason::kChangeTemperature);
ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
ASSERT_EQ(2U, compaction->num_input_files(0));
ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
}
TEST_F(CompactionPickerTest, FIFOToWarmWithExistingWarm) {
TEST_F(CompactionPickerTest, FIFOToColdWithExistingCold) {
NewVersionStorage(1, kCompactionStyleFIFO);
const uint64_t kFileSize = 100000;
const uint64_t kMaxSize = kFileSize * 100000;
uint64_t kWarmThreshold = 2000;
uint64_t kColdThreshold = 2000;
fifo_options_.max_table_files_size = kMaxSize;
fifo_options_.age_for_warm = kWarmThreshold;
fifo_options_.file_temperature_age_thresholds = {
{Temperature::kCold, kColdThreshold}};
mutable_cf_options_.compaction_options_fifo = fifo_options_;
mutable_cf_options_.level0_file_num_compaction_trigger = 2;
mutable_cf_options_.level0_file_num_compaction_trigger = 100;
mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
int64_t current_time = 0;
ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
uint64_t threshold_time =
static_cast<uint64_t>(current_time) - kWarmThreshold;
static_cast<uint64_t>(current_time) - kColdThreshold;
Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
Temperature::kUnknown, threshold_time + 100);
Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
Temperature::kUnknown, threshold_time - 2000);
// The following two files qualify for compaction to kCold.
Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
Temperature::kUnknown, threshold_time - 3000);
Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
Temperature::kUnknown, threshold_time - 4000);
Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
Temperature::kWarm, threshold_time - 5000);
Temperature::kCold, threshold_time - 5000);
UpdateVersionStorageInfo();
ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
@ -1159,28 +1184,32 @@ TEST_F(CompactionPickerTest, FIFOToWarmWithExistingWarm) {
cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
&log_buffer_));
ASSERT_TRUE(compaction.get() != nullptr);
ASSERT_EQ(2U, compaction->num_input_files(0));
ASSERT_EQ(compaction->compaction_reason(),
CompactionReason::kChangeTemperature);
ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
ASSERT_EQ(2U, compaction->num_input_files(0));
ASSERT_EQ(3U, compaction->input(0, 1)->fd.GetNumber());
}
TEST_F(CompactionPickerTest, FIFOToWarmWithOngoing) {
TEST_F(CompactionPickerTest, FIFOToColdWithHotBetweenCold) {
NewVersionStorage(1, kCompactionStyleFIFO);
const uint64_t kFileSize = 100000;
const uint64_t kMaxSize = kFileSize * 100000;
uint64_t kWarmThreshold = 2000;
uint64_t kColdThreshold = 2000;
fifo_options_.max_table_files_size = kMaxSize;
fifo_options_.age_for_warm = kWarmThreshold;
fifo_options_.file_temperature_age_thresholds = {
{Temperature::kCold, kColdThreshold}};
mutable_cf_options_.compaction_options_fifo = fifo_options_;
mutable_cf_options_.level0_file_num_compaction_trigger = 2;
mutable_cf_options_.level0_file_num_compaction_trigger = 100;
mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
int64_t current_time = 0;
ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
uint64_t threshold_time =
static_cast<uint64_t>(current_time) - kWarmThreshold;
static_cast<uint64_t>(current_time) - kColdThreshold;
Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
@ -1188,65 +1217,78 @@ TEST_F(CompactionPickerTest, FIFOToWarmWithOngoing) {
Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
Temperature::kUnknown, threshold_time - 2000);
Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
Temperature::kUnknown, threshold_time - 3000);
Temperature::kCold, threshold_time - 3000);
// Qualifies for compaction to kCold.
Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
Temperature::kUnknown, threshold_time - 4000);
Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
Temperature::kWarm, threshold_time - 5000);
file_map_[2].first->being_compacted = true;
Temperature::kCold, threshold_time - 5000);
UpdateVersionStorageInfo();
ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
&log_buffer_));
// Stop if a file is being compacted
ASSERT_TRUE(compaction.get() == nullptr);
ASSERT_TRUE(compaction.get() != nullptr);
ASSERT_EQ(compaction->compaction_reason(),
CompactionReason::kChangeTemperature);
ASSERT_EQ(compaction->output_temperature(), Temperature::kCold);
ASSERT_EQ(1U, compaction->num_input_files(0));
ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
}
TEST_F(CompactionPickerTest, FIFOToWarmWithHotBetweenWarms) {
TEST_F(CompactionPickerTest, FIFOToColdAndWarm) {
NewVersionStorage(1, kCompactionStyleFIFO);
const uint64_t kFileSize = 100000;
const uint64_t kMaxSize = kFileSize * 100000;
uint64_t kWarmThreshold = 2000;
uint64_t kWarmThreshold = 10000;
uint64_t kHotThreshold = 2000;
fifo_options_.max_table_files_size = kMaxSize;
fifo_options_.age_for_warm = kWarmThreshold;
// Test that multiple thresholds work.
fifo_options_.file_temperature_age_thresholds = {
{Temperature::kHot, kHotThreshold}, {Temperature::kWarm, kWarmThreshold}};
mutable_cf_options_.compaction_options_fifo = fifo_options_;
mutable_cf_options_.level0_file_num_compaction_trigger = 2;
mutable_cf_options_.level0_file_num_compaction_trigger = 100;
mutable_cf_options_.max_compaction_bytes = kFileSize * 100;
FIFOCompactionPicker fifo_compaction_picker(ioptions_, &icmp_);
int64_t current_time = 0;
ASSERT_OK(Env::Default()->GetCurrentTime(&current_time));
uint64_t threshold_time =
uint64_t hot_threshold_time =
static_cast<uint64_t>(current_time) - kHotThreshold;
uint64_t warm_threshold_time =
static_cast<uint64_t>(current_time) - kWarmThreshold;
Add(0, 6U, "240", "290", 2 * kFileSize, 0, 2900, 3000, 0, true,
Temperature::kUnknown, static_cast<uint64_t>(current_time) - 100);
Add(0, 5U, "240", "290", 2 * kFileSize, 0, 2700, 2800, 0, true,
Temperature::kUnknown, threshold_time + 100);
Temperature::kUnknown, hot_threshold_time + 100);
Add(0, 4U, "260", "300", 1 * kFileSize, 0, 2500, 2600, 0, true,
Temperature::kUnknown, threshold_time - 2000);
Temperature::kUnknown, hot_threshold_time - 200);
// Qualifies for Hot
Add(0, 3U, "200", "300", 4 * kFileSize, 0, 2300, 2400, 0, true,
Temperature::kWarm, threshold_time - 3000);
Temperature::kUnknown, warm_threshold_time - 100);
// Qualifies for Warm
Add(0, 2U, "200", "300", 4 * kFileSize, 0, 2100, 2200, 0, true,
Temperature::kUnknown, threshold_time - 4000);
Temperature::kUnknown, warm_threshold_time - 4000);
Add(0, 1U, "200", "300", 4 * kFileSize, 0, 2000, 2100, 0, true,
Temperature::kWarm, threshold_time - 5000);
Temperature::kUnknown, warm_threshold_time - 5000);
UpdateVersionStorageInfo();
ASSERT_EQ(fifo_compaction_picker.NeedsCompaction(vstorage_.get()), true);
std::unique_ptr<Compaction> compaction(fifo_compaction_picker.PickCompaction(
cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
&log_buffer_));
// Stop if a file is being compacted
ASSERT_TRUE(compaction.get() != nullptr);
ASSERT_EQ(1U, compaction->num_input_files(0));
ASSERT_EQ(2U, compaction->input(0, 0)->fd.GetNumber());
ASSERT_EQ(compaction->compaction_reason(),
CompactionReason::kChangeTemperature);
// Assumes compaction picker picks older files first.
ASSERT_EQ(compaction->output_temperature(), Temperature::kWarm);
ASSERT_EQ(2U, compaction->num_input_files(0));
ASSERT_EQ(1U, compaction->input(0, 0)->fd.GetNumber());
ASSERT_EQ(2U, compaction->input(0, 1)->fd.GetNumber());
}
#endif // ROCKSDB_LITE
TEST_F(CompactionPickerTest, CompactionPriMinOverlapping1) {
NewVersionStorage(6, kCompactionStyleLevel);
ioptions_.compaction_pri = kMinOverlappingRatio;
@ -2523,6 +2565,61 @@ TEST_F(CompactionPickerTest, L0TrivialMoveWholeL0) {
ASSERT_TRUE(compaction->IsTrivialMove());
}
TEST_F(CompactionPickerTest, NonL0TrivialMoveExtendBothDirection) {
mutable_cf_options_.max_bytes_for_level_base = 5000;
mutable_cf_options_.level0_file_num_compaction_trigger = 4;
mutable_cf_options_.max_compaction_bytes = 10000000u;
ioptions_.level_compaction_dynamic_level_bytes = false;
NewVersionStorage(6, kCompactionStyleLevel);
Add(1, 1U, "300", "350", 3000U, 0, 710, 800, 3000U);
Add(1, 2U, "600", "651", 3001U, 0, 610, 700, 3001U);
Add(1, 3U, "700", "750", 3000U, 0, 500, 550, 3000U);
Add(2, 4U, "800", "850", 4000U, 0, 150, 200, 4000U);
UpdateVersionStorageInfo();
// File #2 should be picked first, and the compaction should expand in both
// directions to include files #1 and #3.
std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
&log_buffer_));
ASSERT_TRUE(compaction.get() != nullptr);
ASSERT_EQ(1, compaction->num_input_levels());
ASSERT_EQ(3, compaction->num_input_files(0));
ASSERT_EQ(1, compaction->input(0, 0)->fd.GetNumber());
ASSERT_EQ(2, compaction->input(0, 1)->fd.GetNumber());
ASSERT_EQ(3, compaction->input(0, 2)->fd.GetNumber());
ASSERT_TRUE(compaction->IsTrivialMove());
}
TEST_F(CompactionPickerTest, L0TrivialMoveToEmptyLevel) {
mutable_cf_options_.max_bytes_for_level_base = 5000;
mutable_cf_options_.level0_file_num_compaction_trigger = 4;
mutable_cf_options_.max_compaction_bytes = 10000000u;
ioptions_.level_compaction_dynamic_level_bytes = false;
NewVersionStorage(6, kCompactionStyleLevel);
// File 2 will be picked first; by itself it is trivially movable.
// There was a bug before where compaction would also pick files 3 and 4
// (and then file 1, since it overlaps with their key range),
// which made the compaction no longer a trivial move.
Add(0, 1U, "450", "599", 3000U, 0, 710, 800, 3000U);
Add(0, 2U, "600", "651", 3001U, 0, 610, 700, 3001U);
Add(0, 3U, "300", "350", 3000U, 0, 500, 550, 3000U);
Add(0, 4U, "500", "550", 2999U, 0, 300, 350, 2999U);
UpdateVersionStorageInfo();
std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction(
cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(),
&log_buffer_));
ASSERT_TRUE(compaction.get() != nullptr);
ASSERT_EQ(1, compaction->num_input_levels());
ASSERT_EQ(1, compaction->num_input_files(0));
ASSERT_EQ(2, compaction->input(0, 0)->fd.GetNumber());
ASSERT_TRUE(compaction->IsTrivialMove());
}
TEST_F(CompactionPickerTest, IsTrivialMoveOffSstPartitioned) {
mutable_cf_options_.max_bytes_for_level_base = 10000u;
mutable_cf_options_.max_compaction_bytes = 10001u;
@ -2873,7 +2970,6 @@ TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) {
ASSERT_EQ(0, compaction->output_level());
}
#ifndef ROCKSDB_LITE
TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) {
const uint64_t kFileSize = 100000;
@ -3982,7 +4078,6 @@ TEST_P(PerKeyPlacementCompactionPickerTest,
INSTANTIATE_TEST_CASE_P(PerKeyPlacementCompactionPickerTest,
PerKeyPlacementCompactionPickerTest, ::testing::Bool());
#endif // ROCKSDB_LITE
} // namespace ROCKSDB_NAMESPACE

@ -8,7 +8,6 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/compaction/compaction_picker_universal.h"
#ifndef ROCKSDB_LITE
#include <cinttypes>
#include <limits>
@ -20,7 +19,7 @@
#include "file/filename.h"
#include "logging/log_buffer.h"
#include "logging/logging.h"
#include "monitoring/statistics.h"
#include "monitoring/statistics_impl.h"
#include "test_util/sync_point.h"
#include "util/random.h"
#include "util/string_util.h"
@ -134,8 +133,8 @@ class UniversalCompactionBuilder {
UniversalCompactionPicker* picker_;
LogBuffer* log_buffer_;
static std::vector<SortedRun> CalculateSortedRuns(
const VersionStorageInfo& vstorage);
static std::vector<UniversalCompactionBuilder::SortedRun> CalculateSortedRuns(
const VersionStorageInfo& vstorage, int last_level);
// Pick a path ID to place a newly generated file, with its estimated file
// size.
@ -340,13 +339,13 @@ void UniversalCompactionBuilder::SortedRun::DumpSizeInfo(
std::vector<UniversalCompactionBuilder::SortedRun>
UniversalCompactionBuilder::CalculateSortedRuns(
const VersionStorageInfo& vstorage) {
const VersionStorageInfo& vstorage, int last_level) {
std::vector<UniversalCompactionBuilder::SortedRun> ret;
for (FileMetaData* f : vstorage.LevelFiles(0)) {
ret.emplace_back(0, f, f->fd.GetFileSize(), f->compensated_file_size,
f->being_compacted);
}
for (int level = 1; level < vstorage.num_levels(); level++) {
for (int level = 1; level <= last_level; level++) {
uint64_t total_compensated_size = 0U;
uint64_t total_size = 0U;
bool being_compacted = false;
@ -375,7 +374,9 @@ UniversalCompactionBuilder::CalculateSortedRuns(
Compaction* UniversalCompactionBuilder::PickCompaction() {
const int kLevel0 = 0;
score_ = vstorage_->CompactionScore(kLevel0);
sorted_runs_ = CalculateSortedRuns(*vstorage_);
int max_output_level =
vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
sorted_runs_ = CalculateSortedRuns(*vstorage_, max_output_level);
if (sorted_runs_.size() == 0 ||
(vstorage_->FilesMarkedForPeriodicCompaction().empty() &&
@ -472,6 +473,8 @@ Compaction* UniversalCompactionBuilder::PickCompaction() {
"UniversalCompactionBuilder::PickCompaction:Return", nullptr);
return nullptr;
}
assert(c->output_level() <=
vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind));
if (mutable_cf_options_.compaction_options_universal.allow_trivial_move ==
true &&
@ -699,22 +702,18 @@ Compaction* UniversalCompactionBuilder::PickCompactionToReduceSortedRuns(
GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
int start_level = sorted_runs_[start_index].level;
int output_level;
// last level is reserved for the files ingested behind
int max_output_level =
vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
if (first_index_after == sorted_runs_.size()) {
output_level = vstorage_->num_levels() - 1;
output_level = max_output_level;
} else if (sorted_runs_[first_index_after].level == 0) {
output_level = 0;
} else {
output_level = sorted_runs_[first_index_after].level - 1;
}
// last level is reserved for the files ingested behind
if (ioptions_.allow_ingest_behind &&
(output_level == vstorage_->num_levels() - 1)) {
assert(output_level > 1);
output_level--;
}
std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
std::vector<CompactionInputFiles> inputs(max_output_level + 1);
for (size_t i = 0; i < inputs.size(); ++i) {
inputs[i].level = start_level + static_cast<int>(i);
}
@ -1193,8 +1192,10 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
return nullptr;
}
int max_output_level =
vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
// Pick the first non-empty level after the start_level
for (output_level = start_level + 1; output_level < vstorage_->num_levels();
for (output_level = start_level + 1; output_level <= max_output_level;
output_level++) {
if (vstorage_->NumLevelFiles(output_level) != 0) {
break;
@ -1202,9 +1203,9 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
}
// If all higher levels are empty, pick the highest level as output level
if (output_level == vstorage_->num_levels()) {
if (output_level > max_output_level) {
if (start_level == 0) {
output_level = vstorage_->num_levels() - 1;
output_level = max_output_level;
} else {
// If start level is non-zero and all higher levels are empty, this
// compaction will translate into a trivial move. Since the idea is
@ -1213,11 +1214,7 @@ Compaction* UniversalCompactionBuilder::PickDeleteTriggeredCompaction() {
return nullptr;
}
}
if (ioptions_.allow_ingest_behind &&
output_level == vstorage_->num_levels() - 1) {
assert(output_level > 1);
output_level--;
}
assert(output_level <= max_output_level);
if (output_level != 0) {
if (start_level == 0) {
@ -1294,8 +1291,9 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
uint32_t path_id =
GetPathId(ioptions_, mutable_cf_options_, estimated_total_size);
int start_level = sorted_runs_[start_index].level;
std::vector<CompactionInputFiles> inputs(vstorage_->num_levels());
int max_output_level =
vstorage_->MaxOutputLevel(ioptions_.allow_ingest_behind);
std::vector<CompactionInputFiles> inputs(max_output_level + 1);
for (size_t i = 0; i < inputs.size(); ++i) {
inputs[i].level = start_level + static_cast<int>(i);
}
@ -1332,13 +1330,7 @@ Compaction* UniversalCompactionBuilder::PickCompactionWithSortedRunRange(
int output_level;
if (end_index == sorted_runs_.size() - 1) {
// output files at the last level, unless it's reserved
output_level = vstorage_->num_levels() - 1;
// last level is reserved for the files ingested behind
if (ioptions_.allow_ingest_behind) {
assert(output_level > 1);
output_level--;
}
output_level = max_output_level;
} else {
// if it's not including all sorted_runs, it can only output to the level
// above the `end_index + 1` sorted_run.
@ -1451,4 +1443,3 @@ uint64_t UniversalCompactionBuilder::GetMaxOverlappingBytes() const {
}
} // namespace ROCKSDB_NAMESPACE
#endif // !ROCKSDB_LITE

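With sorted runs now computed only up to max_output_level, the output level
in PickCompactionToReduceSortedRuns follows a single rule: compacting
through the final sorted run targets max_output_level; otherwise output
goes to the level just above the next remaining run (or to L0 if that run
is an L0 file). A condensed sketch of the decision, with sorted runs
reduced to their levels:

#include <cstddef>
#include <vector>

// Sketch of the output-level rule; sorted_run_levels[i] is the level of
// the i-th sorted run (0 for individual L0 files).
int PickOutputLevel(const std::vector<int>& sorted_run_levels,
                    size_t first_index_after, int max_output_level) {
  if (first_index_after == sorted_run_levels.size()) {
    // Compacting everything: output to the last non-reserved level.
    return max_output_level;
  }
  if (sorted_run_levels[first_index_after] == 0) {
    return 0;  // the next run is an L0 file: stay in L0
  }
  // Otherwise output just above the next remaining sorted run.
  return sorted_run_levels[first_index_after] - 1;
}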
@ -8,7 +8,6 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#ifndef ROCKSDB_LITE
#include "db/compaction/compaction_picker.h"
@ -29,4 +28,3 @@ class UniversalCompactionPicker : public CompactionPicker {
const VersionStorageInfo* vstorage) const override;
};
} // namespace ROCKSDB_NAMESPACE
#endif // !ROCKSDB_LITE

@ -16,7 +16,6 @@
#include "options/options_helper.h"
#include "rocksdb/utilities/options_type.h"
#ifndef ROCKSDB_LITE
namespace ROCKSDB_NAMESPACE {
class SubcompactionState;
@ -832,4 +831,3 @@ bool CompactionServiceInput::TEST_Equals(CompactionServiceInput* other,
#endif // NDEBUG
} // namespace ROCKSDB_NAMESPACE
#endif // !ROCKSDB_LITE

@ -3,7 +3,6 @@
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#ifndef ROCKSDB_LITE
#include "db/db_test_util.h"
#include "port/stack_trace.h"
@ -929,7 +928,7 @@ TEST_F(CompactionServiceTest, TablePropertiesCollector) {
}
ASSERT_OK(Flush());
}
ASSERT_OK(dbfull()->TEST_WaitForCompact(true));
ASSERT_OK(dbfull()->TEST_WaitForCompact());
ASSERT_OK(db_->GetPropertiesOfAllTables(&fname_to_props));
@ -953,14 +952,3 @@ int main(int argc, char** argv) {
RegisterCustomObjects(argc, argv);
return RUN_ALL_TESTS();
}
#else
#include <stdio.h>
int main(int /*argc*/, char** /*argv*/) {
fprintf(stderr,
"SKIPPED as CompactionService is not supported in ROCKSDB_LITE\n");
return 0;
}
#endif // ROCKSDB_LITE

@ -15,11 +15,9 @@
namespace ROCKSDB_NAMESPACE {
static std::unordered_map<std::string, OptionTypeInfo>
sst_fixed_prefix_type_info = {
#ifndef ROCKSDB_LITE
{"length",
{0, OptionType::kSizeT, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
#endif // ROCKSDB_LITE
};
SstPartitionerFixedPrefixFactory::SstPartitionerFixedPrefixFactory(size_t len)
@ -58,7 +56,6 @@ std::shared_ptr<SstPartitionerFactory> NewSstPartitionerFixedPrefixFactory(
return std::make_shared<SstPartitionerFixedPrefixFactory>(prefix_len);
}
#ifndef ROCKSDB_LITE
namespace {
static int RegisterSstPartitionerFactories(ObjectLibrary& library,
const std::string& /*arg*/) {
@ -73,18 +70,14 @@ static int RegisterSstPartitionerFactories(ObjectLibrary& library,
return 1;
}
} // namespace
#endif // ROCKSDB_LITE
Status SstPartitionerFactory::CreateFromString(
const ConfigOptions& options, const std::string& value,
std::shared_ptr<SstPartitionerFactory>* result) {
#ifndef ROCKSDB_LITE
static std::once_flag once;
std::call_once(once, [&]() {
RegisterSstPartitionerFactories(*(ObjectLibrary::Default().get()), "");
});
#endif // ROCKSDB_LITE
return LoadSharedObject<SstPartitionerFactory>(options, value, nullptr,
result);
return LoadSharedObject<SstPartitionerFactory>(options, value, result);
}
} // namespace ROCKSDB_NAMESPACE

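CreateFromString above performs a one-time ObjectLibrary registration
guarded by std::call_once before delegating to LoadSharedObject. The same
pattern in isolation, with a hypothetical registry standing in for
ObjectLibrary:

#include <functional>
#include <iostream>
#include <map>
#include <mutex>
#include <string>

// Hypothetical registry; RocksDB's ObjectLibrary plays this role.
std::map<std::string, std::function<int()>>& Registry() {
  static std::map<std::string, std::function<int()>> r;
  return r;
}

int CreateFromString(const std::string& name) {
  // Register built-in factories exactly once, however many threads call in.
  static std::once_flag once;
  std::call_once(once,
                 [] { Registry()["fixed_prefix"] = [] { return 42; }; });
  auto it = Registry().find(name);
  return it == Registry().end() ? -1 : it->second();
}

int main() {
  std::cout << CreateFromString("fixed_prefix") << "\n";  // prints 42
  return 0;
}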