Resolving merge conflict

main
Naveen 10 years ago
commit ba6d660f6d
  1. 3
      .travis.yml
  2. 26
      HISTORY.md
  3. 56
      Makefile
  4. 2
      README.md
  5. 2
      build_tools/build_detect_platform
  6. 34
      build_tools/regression_build_test.sh
  7. 40
      db/builder.cc
  8. 11
      db/builder.h
  9. 133
      db/c.cc
  10. 22
      db/c_test.c
  11. 214
      db/column_family.cc
  12. 103
      db/column_family.h
  13. 11
      db/column_family_test.cc
  14. 13
      db/compaction.cc
  15. 16
      db/compaction_picker.cc
  16. 14
      db/corruption_test.cc
  17. 60
      db/cuckoo_table_db_test.cc
  18. 244
      db/db_bench.cc
  19. 27
      db/db_filesnapshot.cc
  20. 1297
      db/db_impl.cc
  21. 90
      db/db_impl.h
  22. 27
      db/db_impl_debug.cc
  23. 44
      db/db_impl_readonly.cc
  24. 4
      db/db_impl_readonly.h
  25. 98
      db/db_iter.cc
  26. 12
      db/db_iter.h
  27. 168
      db/db_iter_test.cc
  28. 1000
      db/db_test.cc
  29. 20
      db/dbformat.cc
  30. 21
      db/dbformat.h
  31. 1
      db/deletefile_test.cc
  32. 3
      db/filename.cc
  33. 62
      db/flush_scheduler.cc
  34. 39
      db/flush_scheduler.h
  35. 89
      db/forward_iterator.cc
  36. 3
      db/forward_iterator.h
  37. 14
      db/internal_stats.cc
  38. 5
      db/internal_stats.h
  39. 5
      db/log_and_apply_bench.cc
  40. 110
      db/memtable.cc
  41. 55
      db/memtable.h
  42. 10
      db/memtable_list.cc
  43. 4
      db/memtable_list.h
  44. 22
      db/plain_table_db_test.cc
  45. 41
      db/repair.cc
  46. 73
      db/simple_table_db_test.cc
  47. 2
      db/snapshot.h
  48. 43
      db/table_cache.cc
  49. 11
      db/table_cache.h
  50. 15
      db/table_properties_collector_test.cc
  51. 6
      db/version_edit.h
  52. 174
      db/version_set.cc
  53. 30
      db/version_set.h
  54. 189
      db/write_batch.cc
  55. 21
      db/write_batch_internal.h
  56. 94
      db/write_batch_test.cc
  57. 37
      db/write_controller.cc
  58. 78
      db/write_controller.h
  59. 40
      db/write_controller_test.cc
  60. 147
      db/write_thread.cc
  61. 80
      db/write_thread.h
  62. 47
      include/rocksdb/c.h
  63. 3
      include/rocksdb/cache.h
  64. 1
      include/rocksdb/compaction_filter.h
  65. 10
      include/rocksdb/db.h
  66. 64
      include/rocksdb/filter_policy.h
  67. 6
      include/rocksdb/flush_block_policy.h
  68. 84
      include/rocksdb/immutable_options.h
  69. 2
      include/rocksdb/iostats_context.h
  70. 164
      include/rocksdb/options.h
  71. 2
      include/rocksdb/statistics.h
  72. 2
      include/rocksdb/status.h
  73. 209
      include/rocksdb/table.h
  74. 39
      include/rocksdb/utilities/backupable_db.h
  75. 105
      include/rocksdb/utilities/write_batch_with_index.h
  76. 1
      include/rocksdb/write_batch.h
  77. 2
      java/Makefile
  78. 47
      java/RocksDBSample.java
  79. 210
      java/org/rocksdb/BlockBasedTableConfig.java
  80. 36
      java/org/rocksdb/GenericRateLimiterConfig.java
  81. 273
      java/org/rocksdb/Options.java
  82. 20
      java/org/rocksdb/RateLimiterConfig.java
  83. 6
      java/org/rocksdb/RocksDB.java
  84. 39
      java/org/rocksdb/benchmark/DbBenchmark.java
  85. 42
      java/org/rocksdb/test/OptionsTest.java
  86. 7
      java/rocksjni/memtablejni.cc
  87. 192
      java/rocksjni/options.cc
  88. 7
      java/rocksjni/portal.h
  89. 24
      java/rocksjni/ratelimiterjni.cc
  90. 15
      java/rocksjni/rocksjni.cc
  91. 34
      java/rocksjni/table.cc
  92. 13
      java/rocksjni/write_batch.cc
  93. 2
      port/stack_trace.cc
  94. 51
      table/adaptive_table_factory.cc
  95. 26
      table/adaptive_table_factory.h
  96. 28
      table/block.cc
  97. 28
      table/block.h
  98. 144
      table/block_based_filter_block.cc
  99. 101
      table/block_based_filter_block.h
  100. 242
      table/block_based_filter_block_test.cc
  101. Some files were not shown because too many files have changed in this diff Show More

@ -14,7 +14,6 @@ before_install:
- sudo dpkg -i libgflags-dev_2.0-1_amd64.deb - sudo dpkg -i libgflags-dev_2.0-1_amd64.deb
# Lousy hack to disable use and testing of fallocate, which doesn't behave quite # Lousy hack to disable use and testing of fallocate, which doesn't behave quite
# as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment. # as EnvPosixTest::AllocateTest expects within the Travis OpenVZ environment.
- sed -i "s/fallocate(/HACK_NO_fallocate(/" build_tools/build_detect_platform script: OPT=-DTRAVIS make check -j8
script: make check -j8
notifications: notifications:
email: false email: false

@ -1,10 +1,34 @@
# Rocksdb Change Log # Rocksdb Change Log
### Unreleased ## Unreleased (will be released with 3.6)
### Disk format changes
* If you're using RocksDB on ARM platforms with the default bloom filter, there is a disk format change you need to be aware of. There are three steps to take when converting to the new release: 1. turn off the filter policy, 2. compact the whole database, 3. turn the filter policy back on.
### Behavior changes
* We have refactored our system of stalling writes. Any stall-related statistics' meanings are changed. Instead of per-write stall counts, we now count stalls per-epoch, where epochs are periods between flushes and compactions. You'll find more information in our Tuning Perf Guide once we release RocksDB 3.6.
* When disableDataSync=true, we no longer sync the MANIFEST file.
* Add identity_as_first_hash property to CuckooTable. SST file needs to be rebuilt to be opened by reader properly.
* Change target_file_size_base type to uint64_t from int.
----- Past Releases -----
## 3.5.0 (9/3/2014)
### New Features
* Add include/utilities/write_batch_with_index.h, providing a utility class to query data out of WriteBatch when building it.
* Move BlockBasedTable related options to BlockBasedTableOptions from Options. Change corresponding JNI interface. Options affected include:
no_block_cache, block_cache, block_cache_compressed, block_size, block_size_deviation, block_restart_interval, filter_policy, whole_key_filtering. filter_policy is changed to shared_ptr from a raw pointer.
* Remove deprecated options: disable_seek_compaction and db_stats_log_interval
* OptimizeForPointLookup() takes one parameter for block cache size. It now builds hash index, bloom filter, and block cache.
### Public API changes
* The Prefix Extractor used with V2 compaction filters is now passed user key to SliceTransform::Transform instead of unparsed RocksDB key.
## 3.4.0 (8/18/2014)
### New Features ### New Features
* Support Multiple DB paths in universal style compactions * Support Multiple DB paths in universal style compactions
* Add feature of storing plain table index and bloom filter in SST file. * Add feature of storing plain table index and bloom filter in SST file.
* CompactRange() will never output compacted files to level 0. This used to be the case when all the compaction input files were at level 0. * CompactRange() will never output compacted files to level 0. This used to be the case when all the compaction input files were at level 0.
* Added iterate_upper_bound to define the extent up to which the forward iterator will return entries. This prevents iterating over delete markers and overwritten entries in edge cases where you want to break out of the iterator anyway. This may improve performance when there are a large number of delete markers or overwritten entries.
### Public API changes ### Public API changes
* DBOptions.db_paths now is a vector of a DBPath structure which indicates both of path and target size * DBOptions.db_paths now is a vector of a DBPath structure which indicates both of path and target size

@ -3,7 +3,6 @@
# found in the LICENSE file. See the AUTHORS file for names of contributors. # found in the LICENSE file. See the AUTHORS file for names of contributors.
# Inherit some settings from environment variables, if available # Inherit some settings from environment variables, if available
INSTALL_PATH ?= $(CURDIR)
#----------------------------------------------- #-----------------------------------------------
@ -49,6 +48,27 @@ else
PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC PLATFORM_CCFLAGS += $(JEMALLOC_INCLUDE) -DHAVE_JEMALLOC
endif endif
#-------------------------------------------------
# make install related stuff
INSTALL_PATH ?= /usr/local
# Remove everything the install target places under $(INSTALL_PATH): the
# include/rocksdb header tree and the $(LIBRARY) and $(SHARED) files in lib.
# The leading '@' keeps make from echoing each command.
uninstall:
@rm -rf $(INSTALL_PATH)/include/rocksdb
@rm -rf $(INSTALL_PATH)/lib/$(LIBRARY)
@rm -rf $(INSTALL_PATH)/lib/$(SHARED)
# Install the public headers found under include/rocksdb, plus the static
# ($(LIBRARY)) and shared ($(SHARED)) libraries when they exist, into
# $(INSTALL_PATH). The directory layout under include/rocksdb is recreated
# first so that each header lands at the same relative path.
#
# The "*.h" pattern is quoted so that the shell hands it to find verbatim.
# Unquoted, the shell expands it against the current directory whenever a
# .h file is present there, and find then matches the wrong names or fails.
install:
	@install -d $(INSTALL_PATH)/lib
	@for header_dir in `find "include/rocksdb" -type d`; do \
		install -d $(INSTALL_PATH)/$$header_dir; \
	done
	@for header in `find "include/rocksdb" -type f -name "*.h"`; do \
		install -C -m 644 $$header $(INSTALL_PATH)/$$header; \
	done
	@[ ! -e $(LIBRARY) ] || install -C -m 644 $(LIBRARY) $(INSTALL_PATH)/lib
	@[ ! -e $(SHARED) ] || install -C -m 644 $(SHARED) $(INSTALL_PATH)/lib
#-------------------------------------------------
WARNING_FLAGS = -Wall -Werror -Wsign-compare WARNING_FLAGS = -Wall -Werror -Wsign-compare
CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT) CFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual
@ -90,12 +110,14 @@ TESTS = \
blob_store_test \ blob_store_test \
filelock_test \ filelock_test \
filename_test \ filename_test \
filter_block_test \ block_based_filter_block_test \
full_filter_block_test \
histogram_test \ histogram_test \
log_test \ log_test \
manual_compaction_test \ manual_compaction_test \
memenv_test \ memenv_test \
merge_test \ merge_test \
merger_test \
redis_test \ redis_test \
reduce_levels_test \ reduce_levels_test \
plain_table_db_test \ plain_table_db_test \
@ -111,17 +133,18 @@ TESTS = \
version_edit_test \ version_edit_test \
version_set_test \ version_set_test \
file_indexer_test \ file_indexer_test \
write_batch_test\ write_batch_test \
write_controller_test\
deletefile_test \ deletefile_test \
table_test \ table_test \
thread_local_test \ thread_local_test \
geodb_test \ geodb_test \
rate_limiter_test \ rate_limiter_test \
cuckoo_table_builder_test \
options_test \ options_test \
cuckoo_table_builder_test \ cuckoo_table_builder_test \
cuckoo_table_reader_test \ cuckoo_table_reader_test \
cuckoo_table_db_test cuckoo_table_db_test \
write_batch_with_index_test
TOOLS = \ TOOLS = \
sst_dump \ sst_dump \
@ -132,7 +155,7 @@ TOOLS = \
options_test \ options_test \
blob_store_bench blob_store_bench
PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench $(TOOLS) PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench cache_bench $(TOOLS)
# The library name is configurable since we are maintaining libraries of both # The library name is configurable since we are maintaining libraries of both
# debug/release mode. # debug/release mode.
@ -175,7 +198,7 @@ endif # PLATFORM_SHARED_EXT
.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \
release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \ release tags valgrind_check whitebox_crash_test format static_lib shared_lib all \
dbg rocksdbjavastatic rocksdbjava dbg rocksdbjavastatic rocksdbjava install uninstall
all: $(LIBRARY) $(PROGRAMS) $(TESTS) all: $(LIBRARY) $(PROGRAMS) $(TESTS)
@ -264,6 +287,9 @@ $(LIBRARY): $(LIBOBJECTS)
db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) db_bench: db/db_bench.o $(LIBOBJECTS) $(TESTUTIL)
$(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) $(CXX) db/db_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
cache_bench: util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL)
$(CXX) util/cache_bench.o $(LIBOBJECTS) $(TESTUTIL) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
block_hash_index_test: table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) block_hash_index_test: table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) $(CXX) table/block_hash_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
@ -375,6 +401,9 @@ spatial_db_test: utilities/spatialdb/spatial_db_test.o $(LIBOBJECTS) $(TESTHARNE
ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) ttl_test: utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) $(CXX) utilities/ttl/ttl_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
write_batch_with_index_test: utilities/write_batch_with_index/write_batch_with_index_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) utilities/write_batch_with_index/write_batch_with_index_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) dbformat_test: db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) $(CXX) db/dbformat_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
@ -387,8 +416,11 @@ rate_limiter_test: util/rate_limiter_test.o $(LIBOBJECTS) $(TESTHARNESS)
filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) $(CXX) db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
filter_block_test: table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) block_based_filter_block_test: table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) table/filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) $(CXX) table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
full_filter_block_test: table/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) table/full_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) log_test: db/log_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) $(CXX) db/log_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
@ -417,9 +449,15 @@ reduce_levels_test: tools/reduce_levels_test.o $(LIBOBJECTS) $(TESTHARNESS)
write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) write_batch_test: db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) $(CXX) db/write_batch_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
write_controller_test: db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/write_controller_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) merge_test: db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS) $(CXX) db/merge_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
merger_test: table/merger_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) table/merger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) deletefile_test: db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(CXX) db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(CXX) db/deletefile_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS)

@ -3,7 +3,7 @@
[![Build Status](https://travis-ci.org/facebook/rocksdb.svg?branch=master)](https://travis-ci.org/facebook/rocksdb) [![Build Status](https://travis-ci.org/facebook/rocksdb.svg?branch=master)](https://travis-ci.org/facebook/rocksdb)
RocksDB is developed and maintained by Facebook Database Engineering Team. RocksDB is developed and maintained by Facebook Database Engineering Team.
It is built on on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com) It is built on earlier work on LevelDB by Sanjay Ghemawat (sanjay@google.com)
and Jeff Dean (jeff@google.com) and Jeff Dean (jeff@google.com)
This code is a library that forms the core building block for a fast This code is a library that forms the core building block for a fast

@ -46,7 +46,7 @@ PLATFORM_CXXFLAGS="-std=c++11"
COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX" COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX"
# Default to fbcode gcc on internal fb machines # Default to fbcode gcc on internal fb machines
if [ -d /mnt/gvfs/third-party -a -z "$CXX" ]; then if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then
FBCODE_BUILD="true" FBCODE_BUILD="true"
if [ -z "$USE_CLANG" ]; then if [ -z "$USE_CLANG" ]; then
CENTOS_VERSION=`rpm -q --qf "%{VERSION}" \ CENTOS_VERSION=`rpm -q --qf "%{VERSION}" \

@ -344,6 +344,38 @@ common_in_mem_args="--db=/dev/shm/rocksdb \
--threads=32 \ --threads=32 \
--writes_per_second=81920 > ${STAT_FILE}.seekwhilewriting_in_ram --writes_per_second=81920 > ${STAT_FILE}.seekwhilewriting_in_ram
# Measure fillseq throughput when the database has many column families
# (--num_column_families=500). The run starts from a fresh database
# (--use_existing_db=0) with the WAL and data sync disabled, and its output
# is captured in ${STAT_FILE}.fillseq_lots_column_families.
./db_bench \
--benchmarks=fillseq \
--num_column_families=500 \
--write_buffer_size=1048576 \
--db=$DATA_DIR \
--use_existing_db=0 \
--num=$NUM \
--writes=$NUM \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_data_sync=1 \
--disable_wal=1 \
--sync=0 > ${STAT_FILE}.fillseq_lots_column_families
# Measure overwrite performance when the database has many column families
# (--num_column_families=500). The run reuses the database already present in
# $DATA_DIR (--use_existing_db=1), issues NUM / 10 writes from 8 threads with
# the WAL and data sync disabled, and captures its output in
# ${STAT_FILE}.overwrite_lots_column_families.
./db_bench \
--benchmarks=overwrite \
--num_column_families=500 \
--write_buffer_size=1048576 \
--db=$DATA_DIR \
--use_existing_db=1 \
--num=$NUM \
--writes=$((NUM / 10)) \
--open_files=55000 \
--statistics=1 \
--histogram=1 \
--disable_data_sync=1 \
--disable_wal=1 \
--sync=0 \
--threads=8 > ${STAT_FILE}.overwrite_lots_column_families
# send data to ods # send data to ods
function send_to_ods { function send_to_ods {
@ -392,3 +424,5 @@ send_benchmark_to_ods readrandom memtablereadrandom $STAT_FILE.memtablefillreadr
send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting send_benchmark_to_ods readwhilewriting readwhilewriting $STAT_FILE.readwhilewriting
send_benchmark_to_ods readwhilewriting readwhilewriting_in_ram ${STAT_FILE}.readwhilewriting_in_ram send_benchmark_to_ods readwhilewriting readwhilewriting_in_ram ${STAT_FILE}.readwhilewriting_in_ram
send_benchmark_to_ods seekrandomwhilewriting seekwhilewriting_in_ram ${STAT_FILE}.seekwhilewriting_in_ram send_benchmark_to_ods seekrandomwhilewriting seekwhilewriting_in_ram ${STAT_FILE}.seekwhilewriting_in_ram
send_benchmark_to_ods fillseq fillseq_lots_column_families ${STAT_FILE}.fillseq_lots_column_families
send_benchmark_to_ods overwrite overwrite_lots_column_families ${STAT_FILE}.overwrite_lots_column_families

@ -26,21 +26,24 @@ namespace rocksdb {
class TableFactory; class TableFactory;
TableBuilder* NewTableBuilder(const Options& options, TableBuilder* NewTableBuilder(const ImmutableCFOptions& ioptions,
const InternalKeyComparator& internal_comparator, const InternalKeyComparator& internal_comparator,
WritableFile* file, WritableFile* file,
CompressionType compression_type) { const CompressionType compression_type,
return options.table_factory->NewTableBuilder(options, internal_comparator, const CompressionOptions& compression_opts) {
file, compression_type); return ioptions.table_factory->NewTableBuilder(
ioptions, internal_comparator, file, compression_type, compression_opts);
} }
Status BuildTable(const std::string& dbname, Env* env, const Options& options, Status BuildTable(const std::string& dbname, Env* env,
const EnvOptions& soptions, TableCache* table_cache, const ImmutableCFOptions& ioptions,
const EnvOptions& env_options, TableCache* table_cache,
Iterator* iter, FileMetaData* meta, Iterator* iter, FileMetaData* meta,
const InternalKeyComparator& internal_comparator, const InternalKeyComparator& internal_comparator,
const SequenceNumber newest_snapshot, const SequenceNumber newest_snapshot,
const SequenceNumber earliest_seqno_in_memtable, const SequenceNumber earliest_seqno_in_memtable,
const CompressionType compression, const CompressionType compression,
const CompressionOptions& compression_opts,
const Env::IOPriority io_priority) { const Env::IOPriority io_priority) {
Status s; Status s;
meta->fd.file_size = 0; meta->fd.file_size = 0;
@ -50,23 +53,24 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options,
// If the sequence number of the smallest entry in the memtable is // If the sequence number of the smallest entry in the memtable is
// smaller than the most recent snapshot, then we do not trigger // smaller than the most recent snapshot, then we do not trigger
// removal of duplicate/deleted keys as part of this builder. // removal of duplicate/deleted keys as part of this builder.
bool purge = options.purge_redundant_kvs_while_flush; bool purge = ioptions.purge_redundant_kvs_while_flush;
if (earliest_seqno_in_memtable <= newest_snapshot) { if (earliest_seqno_in_memtable <= newest_snapshot) {
purge = false; purge = false;
} }
std::string fname = TableFileName(options.db_paths, meta->fd.GetNumber(), std::string fname = TableFileName(ioptions.db_paths, meta->fd.GetNumber(),
meta->fd.GetPathId()); meta->fd.GetPathId());
if (iter->Valid()) { if (iter->Valid()) {
unique_ptr<WritableFile> file; unique_ptr<WritableFile> file;
s = env->NewWritableFile(fname, &file, soptions); s = env->NewWritableFile(fname, &file, env_options);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
file->SetIOPriority(io_priority); file->SetIOPriority(io_priority);
TableBuilder* builder = TableBuilder* builder = NewTableBuilder(
NewTableBuilder(options, internal_comparator, file.get(), compression); ioptions, internal_comparator, file.get(),
compression, compression_opts);
// the first key is the smallest key // the first key is the smallest key
Slice key = iter->key(); Slice key = iter->key();
@ -75,8 +79,8 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options,
meta->largest_seqno = meta->smallest_seqno; meta->largest_seqno = meta->smallest_seqno;
MergeHelper merge(internal_comparator.user_comparator(), MergeHelper merge(internal_comparator.user_comparator(),
options.merge_operator.get(), options.info_log.get(), ioptions.merge_operator, ioptions.info_log,
options.min_partial_merge_operands, ioptions.min_partial_merge_operands,
true /* internal key corruption is not ok */); true /* internal key corruption is not ok */);
if (purge) { if (purge) {
@ -196,12 +200,12 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options,
delete builder; delete builder;
// Finish and check for file errors // Finish and check for file errors
if (s.ok() && !options.disableDataSync) { if (s.ok() && !ioptions.disable_data_sync) {
if (options.use_fsync) { if (ioptions.use_fsync) {
StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS); StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS);
s = file->Fsync(); s = file->Fsync();
} else { } else {
StopWatch sw(env, options.statistics.get(), TABLE_SYNC_MICROS); StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS);
s = file->Sync(); s = file->Sync();
} }
} }
@ -211,7 +215,7 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options,
if (s.ok()) { if (s.ok()) {
// Verify that the table is usable // Verify that the table is usable
Iterator* it = table_cache->NewIterator(ReadOptions(), soptions, Iterator* it = table_cache->NewIterator(ReadOptions(), env_options,
internal_comparator, meta->fd); internal_comparator, meta->fd);
s = it->status(); s = it->status();
delete it; delete it;

@ -11,6 +11,7 @@
#include "rocksdb/status.h" #include "rocksdb/status.h"
#include "rocksdb/types.h" #include "rocksdb/types.h"
#include "rocksdb/options.h" #include "rocksdb/options.h"
#include "rocksdb/immutable_options.h"
namespace rocksdb { namespace rocksdb {
@ -26,8 +27,10 @@ class TableBuilder;
class WritableFile; class WritableFile;
extern TableBuilder* NewTableBuilder( extern TableBuilder* NewTableBuilder(
const Options& options, const InternalKeyComparator& internal_comparator, const ImmutableCFOptions& options,
WritableFile* file, CompressionType compression_type); const InternalKeyComparator& internal_comparator,
WritableFile* file, const CompressionType compression_type,
const CompressionOptions& compression_opts);
// Build a Table file from the contents of *iter. The generated file // Build a Table file from the contents of *iter. The generated file
// will be named according to number specified in meta. On success, the rest of // will be named according to number specified in meta. On success, the rest of
@ -35,13 +38,15 @@ extern TableBuilder* NewTableBuilder(
// If no data is present in *iter, meta->file_size will be set to // If no data is present in *iter, meta->file_size will be set to
// zero, and no Table file will be produced. // zero, and no Table file will be produced.
extern Status BuildTable(const std::string& dbname, Env* env, extern Status BuildTable(const std::string& dbname, Env* env,
const Options& options, const EnvOptions& soptions, const ImmutableCFOptions& options,
const EnvOptions& env_options,
TableCache* table_cache, Iterator* iter, TableCache* table_cache, Iterator* iter,
FileMetaData* meta, FileMetaData* meta,
const InternalKeyComparator& internal_comparator, const InternalKeyComparator& internal_comparator,
const SequenceNumber newest_snapshot, const SequenceNumber newest_snapshot,
const SequenceNumber earliest_seqno_in_memtable, const SequenceNumber earliest_seqno_in_memtable,
const CompressionType compression, const CompressionType compression,
const CompressionOptions& compression_opts,
const Env::IOPriority io_priority = Env::IO_HIGH); const Env::IOPriority io_priority = Env::IO_HIGH);
} // namespace rocksdb } // namespace rocksdb

@ -55,6 +55,7 @@ using rocksdb::MergeOperator;
using rocksdb::NewBloomFilterPolicy; using rocksdb::NewBloomFilterPolicy;
using rocksdb::NewLRUCache; using rocksdb::NewLRUCache;
using rocksdb::Options; using rocksdb::Options;
using rocksdb::BlockBasedTableOptions;
using rocksdb::RandomAccessFile; using rocksdb::RandomAccessFile;
using rocksdb::Range; using rocksdb::Range;
using rocksdb::ReadOptions; using rocksdb::ReadOptions;
@ -81,6 +82,7 @@ struct rocksdb_fifo_compaction_options_t { CompactionOptionsFIFO rep; };
struct rocksdb_readoptions_t { ReadOptions rep; }; struct rocksdb_readoptions_t { ReadOptions rep; };
struct rocksdb_writeoptions_t { WriteOptions rep; }; struct rocksdb_writeoptions_t { WriteOptions rep; };
struct rocksdb_options_t { Options rep; }; struct rocksdb_options_t { Options rep; };
struct rocksdb_block_based_table_options_t { BlockBasedTableOptions rep; };
struct rocksdb_seqfile_t { SequentialFile* rep; }; struct rocksdb_seqfile_t { SequentialFile* rep; };
struct rocksdb_randomfile_t { RandomAccessFile* rep; }; struct rocksdb_randomfile_t { RandomAccessFile* rep; };
struct rocksdb_writablefile_t { WritableFile* rep; }; struct rocksdb_writablefile_t { WritableFile* rep; };
@ -116,7 +118,7 @@ struct rocksdb_compactionfilter_t : public CompactionFilter {
const Slice& existing_value, const Slice& existing_value,
std::string* new_value, std::string* new_value,
bool* value_changed) const { bool* value_changed) const {
char* c_new_value = NULL; char* c_new_value = nullptr;
size_t new_value_length = 0; size_t new_value_length = 0;
unsigned char c_value_changed = 0; unsigned char c_value_changed = 0;
unsigned char result = (*filter_)( unsigned char result = (*filter_)(
@ -1053,6 +1055,74 @@ const char* rocksdb_writebatch_data(rocksdb_writebatch_t* b, size_t* size) {
return b->rep.Data().c_str(); return b->rep.Data().c_str();
} }
// Allocates a new block-based table options wrapper on the heap and returns
// it. The matching release call is rocksdb_block_based_options_destroy().
rocksdb_block_based_table_options_t*
rocksdb_block_based_options_create() {
  rocksdb_block_based_table_options_t* created =
      new rocksdb_block_based_table_options_t;
  return created;
}
// Deletes a block-based table options wrapper. Passing a null pointer is
// harmless because deleting a null pointer is a no-op.
void rocksdb_block_based_options_destroy(
rocksdb_block_based_table_options_t* options) {
delete options;
}
// Copies block_size into the block_size field of the wrapped table options.
// `options` is dereferenced unconditionally and must not be null.
void rocksdb_block_based_options_set_block_size(
    rocksdb_block_based_table_options_t* options, size_t block_size) {
  auto& table_opts = options->rep;
  table_opts.block_size = block_size;
}
// Copies block_size_deviation into the matching field of the wrapped table
// options. `options` is dereferenced unconditionally and must not be null.
void rocksdb_block_based_options_set_block_size_deviation(
    rocksdb_block_based_table_options_t* options, int block_size_deviation) {
  auto& table_opts = options->rep;
  table_opts.block_size_deviation = block_size_deviation;
}
// Copies block_restart_interval into the matching field of the wrapped table
// options. `options` is dereferenced unconditionally and must not be null.
void rocksdb_block_based_options_set_block_restart_interval(
    rocksdb_block_based_table_options_t* options, int block_restart_interval) {
  auto& table_opts = options->rep;
  table_opts.block_restart_interval = block_restart_interval;
}
// Hands filter_policy to the wrapped options' filter_policy smart pointer via
// reset(), replacing whatever policy was held before.
// NOTE(review): after this call the smart pointer manages the policy's
// lifetime, so the caller presumably must not destroy it separately - confirm
// against the C API header.
void rocksdb_block_based_options_set_filter_policy(
    rocksdb_block_based_table_options_t* options,
    rocksdb_filterpolicy_t* filter_policy) {
  auto& policy_holder = options->rep.filter_policy;
  policy_holder.reset(filter_policy);
}
// Copies the no_block_cache flag (passed as an unsigned char across the C
// boundary) into the wrapped table options. `options` must not be null.
void rocksdb_block_based_options_set_no_block_cache(
    rocksdb_block_based_table_options_t* options,
    unsigned char no_block_cache) {
  auto& table_opts = options->rep;
  table_opts.no_block_cache = no_block_cache;
}
// Points the wrapped table options at the cache held by block_cache.
// A null block_cache leaves the current setting untouched.
void rocksdb_block_based_options_set_block_cache(
    rocksdb_block_based_table_options_t* options,
    rocksdb_cache_t* block_cache) {
  if (!block_cache) {
    return;  // nothing to install
  }
  options->rep.block_cache = block_cache->rep;
}
// Points the wrapped table options' block_cache_compressed at the cache held
// by block_cache_compressed. A null argument leaves the current setting
// untouched.
void rocksdb_block_based_options_set_block_cache_compressed(
    rocksdb_block_based_table_options_t* options,
    rocksdb_cache_t* block_cache_compressed) {
  if (!block_cache_compressed) {
    return;  // nothing to install
  }
  options->rep.block_cache_compressed = block_cache_compressed->rep;
}
// Copies the flag v (passed as an unsigned char across the C boundary) into
// the whole_key_filtering field of the wrapped table options.
// `options` is dereferenced unconditionally and must not be null.
void rocksdb_block_based_options_set_whole_key_filtering(
    rocksdb_block_based_table_options_t* options, unsigned char v) {
  auto& table_opts = options->rep;
  table_opts.whole_key_filtering = v;
}
// Replaces opt's table_factory with a block-based table factory created from
// the table options currently stored in table_options->rep.
// A null table_options leaves opt unchanged; `opt` itself must not be null.
void rocksdb_options_set_block_based_table_factory(
    rocksdb_options_t *opt,
    rocksdb_block_based_table_options_t* table_options) {
  if (!table_options) {
    return;  // no options supplied, keep the existing factory
  }
  opt->rep.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options->rep));
}
rocksdb_options_t* rocksdb_options_create() { rocksdb_options_t* rocksdb_options_create() {
return new rocksdb_options_t; return new rocksdb_options_t;
} }
@ -1067,8 +1137,8 @@ void rocksdb_options_increase_parallelism(
} }
void rocksdb_options_optimize_for_point_lookup( void rocksdb_options_optimize_for_point_lookup(
rocksdb_options_t* opt) { rocksdb_options_t* opt, uint64_t block_cache_size_mb) {
opt->rep.OptimizeForPointLookup(); opt->rep.OptimizeForPointLookup(block_cache_size_mb);
} }
void rocksdb_options_optimize_level_style_compaction( void rocksdb_options_optimize_level_style_compaction(
@ -1111,12 +1181,6 @@ void rocksdb_options_set_compaction_filter_factory_v2(
opt->rep.compaction_filter_factory_v2 = std::shared_ptr<CompactionFilterFactoryV2>(compaction_filter_factory_v2); opt->rep.compaction_filter_factory_v2 = std::shared_ptr<CompactionFilterFactoryV2>(compaction_filter_factory_v2);
} }
void rocksdb_options_set_filter_policy(
rocksdb_options_t* opt,
rocksdb_filterpolicy_t* policy) {
opt->rep.filter_policy = policy;
}
void rocksdb_options_set_create_if_missing( void rocksdb_options_set_create_if_missing(
rocksdb_options_t* opt, unsigned char v) { rocksdb_options_t* opt, unsigned char v) {
opt->rep.create_if_missing = v; opt->rep.create_if_missing = v;
@ -1160,26 +1224,6 @@ void rocksdb_options_set_max_open_files(rocksdb_options_t* opt, int n) {
opt->rep.max_open_files = n; opt->rep.max_open_files = n;
} }
void rocksdb_options_set_cache(rocksdb_options_t* opt, rocksdb_cache_t* c) {
if (c) {
opt->rep.block_cache = c->rep;
}
}
void rocksdb_options_set_cache_compressed(rocksdb_options_t* opt, rocksdb_cache_t* c) {
if (c) {
opt->rep.block_cache_compressed = c->rep;
}
}
void rocksdb_options_set_block_size(rocksdb_options_t* opt, size_t s) {
opt->rep.block_size = s;
}
void rocksdb_options_set_block_restart_interval(rocksdb_options_t* opt, int n) {
opt->rep.block_restart_interval = n;
}
void rocksdb_options_set_target_file_size_base( void rocksdb_options_set_target_file_size_base(
rocksdb_options_t* opt, uint64_t n) { rocksdb_options_t* opt, uint64_t n) {
opt->rep.target_file_size_base = n; opt->rep.target_file_size_base = n;
@ -1272,11 +1316,6 @@ void rocksdb_options_set_prefix_extractor(
opt->rep.prefix_extractor.reset(prefix_extractor); opt->rep.prefix_extractor.reset(prefix_extractor);
} }
void rocksdb_options_set_whole_key_filtering(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.whole_key_filtering = v;
}
void rocksdb_options_set_disable_data_sync( void rocksdb_options_set_disable_data_sync(
rocksdb_options_t* opt, int disable_data_sync) { rocksdb_options_t* opt, int disable_data_sync) {
opt->rep.disableDataSync = disable_data_sync; opt->rep.disableDataSync = disable_data_sync;
@ -1287,11 +1326,6 @@ void rocksdb_options_set_use_fsync(
opt->rep.use_fsync = use_fsync; opt->rep.use_fsync = use_fsync;
} }
void rocksdb_options_set_db_stats_log_interval(
rocksdb_options_t* opt, int db_stats_log_interval) {
opt->rep.db_stats_log_interval = db_stats_log_interval;
}
void rocksdb_options_set_db_log_dir( void rocksdb_options_set_db_log_dir(
rocksdb_options_t* opt, const char* db_log_dir) { rocksdb_options_t* opt, const char* db_log_dir) {
opt->rep.db_log_dir = db_log_dir; opt->rep.db_log_dir = db_log_dir;
@ -1351,11 +1385,6 @@ void rocksdb_options_set_stats_dump_period_sec(
opt->rep.stats_dump_period_sec = v; opt->rep.stats_dump_period_sec = v;
} }
void rocksdb_options_set_block_size_deviation(
rocksdb_options_t* opt, int v) {
opt->rep.block_size_deviation = v;
}
void rocksdb_options_set_advise_random_on_open( void rocksdb_options_set_advise_random_on_open(
rocksdb_options_t* opt, unsigned char v) { rocksdb_options_t* opt, unsigned char v) {
opt->rep.advise_random_on_open = v; opt->rep.advise_random_on_open = v;
@ -1450,11 +1479,6 @@ void rocksdb_options_set_max_manifest_file_size(
opt->rep.max_manifest_file_size = v; opt->rep.max_manifest_file_size = v;
} }
void rocksdb_options_set_no_block_cache(
rocksdb_options_t* opt, unsigned char v) {
opt->rep.no_block_cache = v;
}
void rocksdb_options_set_table_cache_numshardbits( void rocksdb_options_set_table_cache_numshardbits(
rocksdb_options_t* opt, int v) { rocksdb_options_t* opt, int v) {
opt->rep.table_cache_numshardbits = v; opt->rep.table_cache_numshardbits = v;
@ -1474,10 +1498,6 @@ void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t* opt, int di
opt->rep.disable_auto_compactions = disable; opt->rep.disable_auto_compactions = disable;
} }
void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t* opt, int disable) {
opt->rep.disable_seek_compaction = disable;
}
void rocksdb_options_set_delete_obsolete_files_period_micros( void rocksdb_options_set_delete_obsolete_files_period_micros(
rocksdb_options_t* opt, uint64_t v) { rocksdb_options_t* opt, uint64_t v) {
opt->rep.delete_obsolete_files_period_micros = v; opt->rep.delete_obsolete_files_period_micros = v;
@ -1824,6 +1844,13 @@ void rocksdb_readoptions_set_snapshot(
opt->rep.snapshot = (snap ? snap->rep : nullptr); opt->rep.snapshot = (snap ? snap->rep : nullptr);
} }
// C API setter: wraps (key, keylen) in a Slice and points
// `opt->rep.iterate_upper_bound` at it.
//
// FIXME(review): `prefix` below is a function-local object, and its address is
// what gets stored in `opt->rep.iterate_upper_bound`. That local is destroyed
// when this function returns, so the stored pointer is dangling as soon as the
// caller regains control, and any later dereference of it is undefined
// behavior. The Slice needs storage that outlives this call (for example a
// member owned by rocksdb_readoptions_t) -- that struct is declared outside
// this block, so the fix cannot be made here in isolation.
void rocksdb_readoptions_set_iterate_upper_bound(
rocksdb_readoptions_t* opt,
const char* key, size_t keylen) {
// Local Slice viewing the caller's buffer; nothing is copied here.
Slice prefix = Slice(key, keylen);
// Stores the address of the local above -- see FIXME at the top.
opt->rep.iterate_upper_bound = &prefix;
}
void rocksdb_readoptions_set_read_tier( void rocksdb_readoptions_set_read_tier(
rocksdb_readoptions_t* opt, int v) { rocksdb_readoptions_t* opt, int v) {
opt->rep.read_tier = static_cast<rocksdb::ReadTier>(v); opt->rep.read_tier = static_cast<rocksdb::ReadTier>(v);

@ -335,6 +335,7 @@ int main(int argc, char** argv) {
rocksdb_cache_t* cache; rocksdb_cache_t* cache;
rocksdb_env_t* env; rocksdb_env_t* env;
rocksdb_options_t* options; rocksdb_options_t* options;
rocksdb_block_based_table_options_t* table_options;
rocksdb_readoptions_t* roptions; rocksdb_readoptions_t* roptions;
rocksdb_writeoptions_t* woptions; rocksdb_writeoptions_t* woptions;
char* err = NULL; char* err = NULL;
@ -353,14 +354,15 @@ int main(int argc, char** argv) {
options = rocksdb_options_create(); options = rocksdb_options_create();
rocksdb_options_set_comparator(options, cmp); rocksdb_options_set_comparator(options, cmp);
rocksdb_options_set_error_if_exists(options, 1); rocksdb_options_set_error_if_exists(options, 1);
rocksdb_options_set_cache(options, cache);
rocksdb_options_set_env(options, env); rocksdb_options_set_env(options, env);
rocksdb_options_set_info_log(options, NULL); rocksdb_options_set_info_log(options, NULL);
rocksdb_options_set_write_buffer_size(options, 100000); rocksdb_options_set_write_buffer_size(options, 100000);
rocksdb_options_set_paranoid_checks(options, 1); rocksdb_options_set_paranoid_checks(options, 1);
rocksdb_options_set_max_open_files(options, 10); rocksdb_options_set_max_open_files(options, 10);
rocksdb_options_set_block_size(options, 1024); table_options = rocksdb_block_based_options_create();
rocksdb_options_set_block_restart_interval(options, 8); rocksdb_block_based_options_set_block_cache(table_options, cache);
rocksdb_options_set_block_based_table_factory(options, table_options);
rocksdb_options_set_compression(options, rocksdb_no_compression); rocksdb_options_set_compression(options, rocksdb_no_compression);
rocksdb_options_set_compression_options(options, -14, -1, 0); rocksdb_options_set_compression_options(options, -14, -1, 0);
int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression, int compression_levels[] = {rocksdb_no_compression, rocksdb_no_compression,
@ -540,10 +542,12 @@ int main(int argc, char** argv) {
policy = rocksdb_filterpolicy_create_bloom(10); policy = rocksdb_filterpolicy_create_bloom(10);
} }
rocksdb_block_based_options_set_filter_policy(table_options, policy);
// Create new database // Create new database
rocksdb_close(db); rocksdb_close(db);
rocksdb_destroy_db(options, dbname, &err); rocksdb_destroy_db(options, dbname, &err);
rocksdb_options_set_filter_policy(options, policy); rocksdb_options_set_block_based_table_factory(options, table_options);
db = rocksdb_open(options, dbname, &err); db = rocksdb_open(options, dbname, &err);
CheckNoError(err); CheckNoError(err);
rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err); rocksdb_put(db, woptions, "foo", 3, "foovalue", 8, &err);
@ -565,8 +569,9 @@ int main(int argc, char** argv) {
CheckGet(db, roptions, "foo", "foovalue"); CheckGet(db, roptions, "foo", "foovalue");
CheckGet(db, roptions, "bar", "barvalue"); CheckGet(db, roptions, "bar", "barvalue");
} }
rocksdb_options_set_filter_policy(options, NULL); // Reset the policy
rocksdb_filterpolicy_destroy(policy); rocksdb_block_based_options_set_filter_policy(table_options, NULL);
rocksdb_options_set_block_based_table_factory(options, table_options);
} }
StartPhase("compaction_filter"); StartPhase("compaction_filter");
@ -757,8 +762,7 @@ int main(int argc, char** argv) {
StartPhase("prefix"); StartPhase("prefix");
{ {
// Create new database // Create new database
rocksdb_filterpolicy_t* policy = rocksdb_filterpolicy_create_bloom(10); rocksdb_options_set_allow_mmap_reads(options, 1);
rocksdb_options_set_filter_policy(options, policy);
rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3)); rocksdb_options_set_prefix_extractor(options, rocksdb_slicetransform_create_fixed_prefix(3));
rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4); rocksdb_options_set_hash_skip_list_rep(options, 5000, 4, 4);
rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16); rocksdb_options_set_plain_table_factory(options, 4, 10, 0.75, 16);
@ -795,13 +799,13 @@ int main(int argc, char** argv) {
rocksdb_iter_get_error(iter, &err); rocksdb_iter_get_error(iter, &err);
CheckNoError(err); CheckNoError(err);
rocksdb_iter_destroy(iter); rocksdb_iter_destroy(iter);
rocksdb_filterpolicy_destroy(policy);
} }
StartPhase("cleanup"); StartPhase("cleanup");
rocksdb_close(db); rocksdb_close(db);
rocksdb_options_destroy(options); rocksdb_options_destroy(options);
rocksdb_block_based_options_destroy(table_options);
rocksdb_readoptions_destroy(roptions); rocksdb_readoptions_destroy(roptions);
rocksdb_writeoptions_destroy(woptions); rocksdb_writeoptions_destroy(woptions);
rocksdb_cache_destroy(cache); rocksdb_cache_destroy(cache);

@ -9,6 +9,11 @@
#include "db/column_family.h" #include "db/column_family.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h>
#include <vector> #include <vector>
#include <string> #include <string>
#include <algorithm> #include <algorithm>
@ -19,11 +24,43 @@
#include "db/internal_stats.h" #include "db/internal_stats.h"
#include "db/compaction_picker.h" #include "db/compaction_picker.h"
#include "db/table_properties_collector.h" #include "db/table_properties_collector.h"
#include "db/write_controller.h"
#include "util/autovector.h" #include "util/autovector.h"
#include "util/hash_skiplist_rep.h" #include "util/hash_skiplist_rep.h"
#include "util/options_helper.h"
namespace rocksdb { namespace rocksdb {
namespace {
// This function computes the amount of time in microseconds by which a write
// should be delayed, given a load measure n (a level-0 file count or a
// compaction score) and the [bottom, top) window over which writes are
// progressively slowed, according to the following formula:
//    if n < bottom, return 0;
//    if n >= top, return 1000;
//    otherwise, let r = (n - bottom) /
//                       (top - bottom)
//     and return max(r^2 * 1000, 100), truncated to a whole microsecond.
// The goal of this formula is to gradually increase the rate at which writes
// are slowed. We also tried linear delay (r * 1000), but it seemed to do
// slightly worse. There is no other particular reason for choosing quadratic.
//
// n is taken as a double (not an int) so that fractional compaction scores
// are not truncated before the ratio is computed; integer file counts convert
// implicitly and produce the same result as before.
uint64_t SlowdownAmount(double n, double bottom, double top) {
  uint64_t delay;
  if (n >= top) {
    delay = 1000;
  } else if (n < bottom) {
    delay = 0;
  } else {
    // If we are here, we know that:
    //   bottom <= n < top
    // since the previous two conditions are false. In particular
    // top > bottom, so the division below cannot be by zero.
    const double how_much = (n - bottom) / (top - bottom);
    // Inside the window the delay never drops below 100us.
    delay = static_cast<uint64_t>(std::max(how_much * how_much * 1000, 100.0));
  }
  assert(delay <= 1000);
  return delay;
}
} // namespace
ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd, ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(ColumnFamilyData* cfd,
DBImpl* db, port::Mutex* mutex) DBImpl* db, port::Mutex* mutex)
: cfd_(cfd), db_(db), mutex_(mutex) { : cfd_(cfd), db_(db), mutex_(mutex) {
@ -49,12 +86,14 @@ ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); } uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }
// Returns the user comparator of the column family this handle refers to,
// by forwarding to cfd()->user_comparator().
const Comparator* ColumnFamilyHandleImpl::user_comparator() const {
return cfd()->user_comparator();
}
ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp, ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
const InternalFilterPolicy* ipolicy,
const ColumnFamilyOptions& src) { const ColumnFamilyOptions& src) {
ColumnFamilyOptions result = src; ColumnFamilyOptions result = src;
result.comparator = icmp; result.comparator = icmp;
result.filter_policy = (src.filter_policy != nullptr) ? ipolicy : nullptr;
#ifdef OS_MACOSX #ifdef OS_MACOSX
// TODO(icanadi) make write_buffer_size uint64_t instead of size_t // TODO(icanadi) make write_buffer_size uint64_t instead of size_t
ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, ((size_t)1) << 30); ClipToRange(&result.write_buffer_size, ((size_t)64) << 10, ((size_t)1) << 30);
@ -70,13 +109,7 @@ ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
result.min_write_buffer_number_to_merge = result.min_write_buffer_number_to_merge =
std::min(result.min_write_buffer_number_to_merge, std::min(result.min_write_buffer_number_to_merge,
result.max_write_buffer_number - 1); result.max_write_buffer_number - 1);
if (result.block_cache == nullptr && !result.no_block_cache) {
result.block_cache = NewLRUCache(8 << 20);
}
result.compression_per_level = src.compression_per_level; result.compression_per_level = src.compression_per_level;
if (result.block_size_deviation < 0 || result.block_size_deviation > 100) {
result.block_size_deviation = 0;
}
if (result.max_mem_compaction_level >= result.num_levels) { if (result.max_mem_compaction_level >= result.num_levels) {
result.max_mem_compaction_level = result.num_levels - 1; result.max_mem_compaction_level = result.num_levels - 1;
} }
@ -184,9 +217,9 @@ void SuperVersionUnrefHandle(void* ptr) {
ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name, ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name,
Version* dummy_versions, Cache* table_cache, Version* dummy_versions, Cache* table_cache,
const ColumnFamilyOptions& options, const ColumnFamilyOptions& cf_options,
const DBOptions* db_options, const DBOptions* db_options,
const EnvOptions& storage_options, const EnvOptions& env_options,
ColumnFamilySet* column_family_set) ColumnFamilySet* column_family_set)
: id_(id), : id_(id),
name_(name), name_(name),
@ -194,10 +227,10 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name,
current_(nullptr), current_(nullptr),
refs_(0), refs_(0),
dropped_(false), dropped_(false),
internal_comparator_(options.comparator), internal_comparator_(cf_options.comparator),
internal_filter_policy_(options.filter_policy), options_(*db_options, SanitizeOptions(&internal_comparator_, cf_options)),
options_(*db_options, SanitizeOptions(&internal_comparator_, ioptions_(options_),
&internal_filter_policy_, options)), mutable_cf_options_(options_),
mem_(nullptr), mem_(nullptr),
imm_(options_.min_write_buffer_number_to_merge), imm_(options_.min_write_buffer_number_to_merge),
super_version_(nullptr), super_version_(nullptr),
@ -206,7 +239,6 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name,
next_(nullptr), next_(nullptr),
prev_(nullptr), prev_(nullptr),
log_number_(0), log_number_(0),
need_slowdown_for_num_level0_files_(false),
column_family_set_(column_family_set) { column_family_set_(column_family_set) {
Ref(); Ref();
@ -214,7 +246,7 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name,
if (dummy_versions != nullptr) { if (dummy_versions != nullptr) {
internal_stats_.reset( internal_stats_.reset(
new InternalStats(options_.num_levels, db_options->env, this)); new InternalStats(options_.num_levels, db_options->env, this));
table_cache_.reset(new TableCache(&options_, storage_options, table_cache)); table_cache_.reset(new TableCache(ioptions_, env_options, table_cache));
if (options_.compaction_style == kCompactionStyleUniversal) { if (options_.compaction_style == kCompactionStyleUniversal) {
compaction_picker_.reset( compaction_picker_.reset(
new UniversalCompactionPicker(&options_, &internal_comparator_)); new UniversalCompactionPicker(&options_, &internal_comparator_));
@ -287,57 +319,82 @@ ColumnFamilyData::~ColumnFamilyData() {
} }
void ColumnFamilyData::RecalculateWriteStallConditions() { void ColumnFamilyData::RecalculateWriteStallConditions() {
need_wait_for_num_memtables_ =
(imm()->size() == options()->max_write_buffer_number - 1);
if (current_ != nullptr) { if (current_ != nullptr) {
need_wait_for_num_level0_files_ = const double score = current_->MaxCompactionScore();
(current_->NumLevelFiles(0) >= options()->level0_stop_writes_trigger); const int max_level = current_->MaxCompactionScoreLevel();
auto write_controller = column_family_set_->write_controller_;
if (imm()->size() == options_.max_write_buffer_number) {
write_controller_token_ = write_controller->GetStopToken();
internal_stats_->AddCFStats(InternalStats::MEMTABLE_COMPACTION, 1);
Log(options_.info_log,
"[%s] Stopping writes because we have %d immutable memtables "
"(waiting for flush)",
name_.c_str(), imm()->size());
} else if (current_->NumLevelFiles(0) >=
options_.level0_stop_writes_trigger) {
write_controller_token_ = write_controller->GetStopToken();
internal_stats_->AddCFStats(InternalStats::LEVEL0_NUM_FILES, 1);
Log(options_.info_log,
"[%s] Stopping writes because we have %d level-0 files",
name_.c_str(), current_->NumLevelFiles(0));
} else if (options_.level0_slowdown_writes_trigger >= 0 &&
current_->NumLevelFiles(0) >=
options_.level0_slowdown_writes_trigger) {
uint64_t slowdown = SlowdownAmount(
current_->NumLevelFiles(0), options_.level0_slowdown_writes_trigger,
options_.level0_stop_writes_trigger);
write_controller_token_ = write_controller->GetDelayToken(slowdown);
internal_stats_->AddCFStats(InternalStats::LEVEL0_SLOWDOWN, slowdown);
Log(options_.info_log,
"[%s] Stalling writes because we have %d level-0 files (%" PRIu64
"us)",
name_.c_str(), current_->NumLevelFiles(0), slowdown);
} else if (options_.hard_rate_limit > 1.0 &&
score > options_.hard_rate_limit) {
uint64_t kHardLimitSlowdown = 1000;
write_controller_token_ =
write_controller->GetDelayToken(kHardLimitSlowdown);
internal_stats_->RecordLevelNSlowdown(max_level, kHardLimitSlowdown,
false);
Log(options_.info_log,
"[%s] Stalling writes because we hit hard limit on level %d. "
"(%" PRIu64 "us)",
name_.c_str(), max_level, kHardLimitSlowdown);
} else if (options_.soft_rate_limit > 0.0 &&
score > options_.soft_rate_limit) {
uint64_t slowdown = SlowdownAmount(score, options_.soft_rate_limit,
options_.hard_rate_limit);
write_controller_token_ = write_controller->GetDelayToken(slowdown);
internal_stats_->RecordLevelNSlowdown(max_level, slowdown, true);
Log(options_.info_log,
"[%s] Stalling writes because we hit soft limit on level %d (%" PRIu64
"us)",
name_.c_str(), max_level, slowdown);
} else { } else {
need_wait_for_num_level0_files_ = false; write_controller_token_.reset();
} }
RecalculateWriteStallRateLimitsConditions();
}
void ColumnFamilyData::RecalculateWriteStallRateLimitsConditions() {
if (current_ != nullptr) {
exceeds_hard_rate_limit_ =
(options()->hard_rate_limit > 1.0 &&
current_->MaxCompactionScore() > options()->hard_rate_limit);
exceeds_soft_rate_limit_ =
(options()->soft_rate_limit > 0.0 &&
current_->MaxCompactionScore() > options()->soft_rate_limit);
} else {
exceeds_hard_rate_limit_ = false;
exceeds_soft_rate_limit_ = false;
} }
} }
const EnvOptions* ColumnFamilyData::soptions() const { const EnvOptions* ColumnFamilyData::soptions() const {
return &(column_family_set_->storage_options_); return &(column_family_set_->env_options_);
} }
void ColumnFamilyData::SetCurrent(Version* current) { void ColumnFamilyData::SetCurrent(Version* current) { current_ = current; }
current_ = current;
need_slowdown_for_num_level0_files_ =
(options_.level0_slowdown_writes_trigger >= 0 &&
current_->NumLevelFiles(0) >= options_.level0_slowdown_writes_trigger);
}
void ColumnFamilyData::CreateNewMemtable() { void ColumnFamilyData::CreateNewMemtable(const MemTableOptions& moptions) {
assert(current_ != nullptr); assert(current_ != nullptr);
if (mem_ != nullptr) { if (mem_ != nullptr) {
delete mem_->Unref(); delete mem_->Unref();
} }
mem_ = new MemTable(internal_comparator_, options_); mem_ = new MemTable(internal_comparator_, ioptions_, moptions);
mem_->Ref(); mem_->Ref();
} }
Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) { Compaction* ColumnFamilyData::PickCompaction(LogBuffer* log_buffer) {
auto result = compaction_picker_->PickCompaction(current_, log_buffer); auto result = compaction_picker_->PickCompaction(current_, log_buffer);
RecalculateWriteStallRateLimitsConditions();
return result; return result;
} }
@ -434,7 +491,15 @@ bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
SuperVersion* ColumnFamilyData::InstallSuperVersion( SuperVersion* ColumnFamilyData::InstallSuperVersion(
SuperVersion* new_superversion, port::Mutex* db_mutex) { SuperVersion* new_superversion, port::Mutex* db_mutex) {
db_mutex->AssertHeld();
return InstallSuperVersion(new_superversion, db_mutex, mutable_cf_options_);
}
SuperVersion* ColumnFamilyData::InstallSuperVersion(
SuperVersion* new_superversion, port::Mutex* db_mutex,
const MutableCFOptions& mutable_cf_options) {
new_superversion->db_mutex = db_mutex; new_superversion->db_mutex = db_mutex;
new_superversion->mutable_cf_options = mutable_cf_options;
new_superversion->Init(mem_, imm_.current(), current_); new_superversion->Init(mem_, imm_.current(), current_);
SuperVersion* old_superversion = super_version_; SuperVersion* old_superversion = super_version_;
super_version_ = new_superversion; super_version_ = new_superversion;
@ -470,19 +535,32 @@ void ColumnFamilyData::ResetThreadLocalSuperVersions() {
} }
} }
bool ColumnFamilyData::SetOptions(
const std::unordered_map<std::string, std::string>& options_map) {
MutableCFOptions new_mutable_cf_options;
if (GetMutableOptionsFromStrings(mutable_cf_options_, options_map,
&new_mutable_cf_options)) {
mutable_cf_options_ = new_mutable_cf_options;
return true;
}
return false;
}
ColumnFamilySet::ColumnFamilySet(const std::string& dbname, ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
const DBOptions* db_options, const DBOptions* db_options,
const EnvOptions& storage_options, const EnvOptions& env_options,
Cache* table_cache) Cache* table_cache,
WriteController* write_controller)
: max_column_family_(0), : max_column_family_(0),
dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr, dummy_cfd_(new ColumnFamilyData(0, "", nullptr, nullptr,
ColumnFamilyOptions(), db_options, ColumnFamilyOptions(), db_options,
storage_options_, nullptr)), env_options, nullptr)),
default_cfd_cache_(nullptr), default_cfd_cache_(nullptr),
db_name_(dbname), db_name_(dbname),
db_options_(db_options), db_options_(db_options),
storage_options_(storage_options), env_options_(env_options),
table_cache_(table_cache), table_cache_(table_cache),
write_controller_(write_controller),
spin_lock_(ATOMIC_FLAG_INIT) { spin_lock_(ATOMIC_FLAG_INIT) {
// initialize linked list // initialize linked list
dummy_cfd_->prev_ = dummy_cfd_; dummy_cfd_->prev_ = dummy_cfd_;
@ -547,7 +625,7 @@ ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
assert(column_families_.find(name) == column_families_.end()); assert(column_families_.find(name) == column_families_.end());
ColumnFamilyData* new_cfd = ColumnFamilyData* new_cfd =
new ColumnFamilyData(id, name, dummy_versions, table_cache_, options, new ColumnFamilyData(id, name, dummy_versions, table_cache_, options,
db_options_, storage_options_, this); db_options_, env_options_, this);
Lock(); Lock();
column_families_.insert({name, id}); column_families_.insert({name, id});
column_family_data_.insert({id, new_cfd}); column_family_data_.insert({id, new_cfd});
@ -606,6 +684,11 @@ bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
column_family_set_->Lock(); column_family_set_->Lock();
current_ = column_family_set_->GetColumnFamily(column_family_id); current_ = column_family_set_->GetColumnFamily(column_family_id);
column_family_set_->Unlock(); column_family_set_->Unlock();
// TODO(icanadi) Maybe remove column family from the hash table when it's
// dropped?
if (current_ != nullptr && current_->IsDropped()) {
current_ = nullptr;
}
} }
handle_.SetCFD(current_); handle_.SetCFD(current_);
return current_ != nullptr; return current_ != nullptr;
@ -631,4 +714,29 @@ ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
return &handle_; return &handle_;
} }
// If a column family is currently selected and its memtable reports that a
// flush should be scheduled, hands the column family to the flush scheduler
// and then marks the memtable as having a flush scheduled.
void ColumnFamilyMemTablesImpl::CheckMemtableFull() {
  if (current_ == nullptr) {
    return;
  }
  if (!current_->mem()->ShouldScheduleFlush()) {
    return;
  }
  flush_scheduler_->ScheduleFlush(current_);
  current_->mem()->MarkFlushScheduled();
}
// Returns the ID of the column family behind `column_family`, or 0 when
// `column_family` is null.
uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
  if (column_family == nullptr) {
    return 0;
  }
  // Downcast from the public handle interface to its implementation class.
  // static_cast is the correct named cast for a base-to-derived conversion;
  // reinterpret_cast would skip any pointer adjustment the hierarchy needs.
  auto cfh = static_cast<ColumnFamilyHandleImpl*>(column_family);
  return cfh->GetID();
}
// Returns the user comparator of the column family behind `column_family`,
// or nullptr when `column_family` is null.
const Comparator* GetColumnFamilyUserComparator(
    ColumnFamilyHandle* column_family) {
  if (column_family == nullptr) {
    return nullptr;
  }
  // Downcast from the public handle interface to its implementation class.
  // static_cast is the correct named cast for a base-to-derived conversion;
  // reinterpret_cast would skip any pointer adjustment the hierarchy needs.
  auto cfh = static_cast<ColumnFamilyHandleImpl*>(column_family);
  return cfh->user_comparator();
}
} // namespace rocksdb } // namespace rocksdb

@ -19,8 +19,11 @@
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "db/memtable_list.h" #include "db/memtable_list.h"
#include "db/write_batch_internal.h" #include "db/write_batch_internal.h"
#include "db/write_controller.h"
#include "db/table_cache.h" #include "db/table_cache.h"
#include "util/thread_local.h" #include "util/thread_local.h"
#include "db/flush_scheduler.h"
#include "util/mutable_cf_options.h"
namespace rocksdb { namespace rocksdb {
@ -46,6 +49,7 @@ class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
// destroy without mutex // destroy without mutex
virtual ~ColumnFamilyHandleImpl(); virtual ~ColumnFamilyHandleImpl();
virtual ColumnFamilyData* cfd() const { return cfd_; } virtual ColumnFamilyData* cfd() const { return cfd_; }
virtual const Comparator* user_comparator() const;
virtual uint32_t GetID() const; virtual uint32_t GetID() const;
@ -78,6 +82,7 @@ struct SuperVersion {
MemTable* mem; MemTable* mem;
MemTableListVersion* imm; MemTableListVersion* imm;
Version* current; Version* current;
MutableCFOptions mutable_cf_options;
std::atomic<uint32_t> refs; std::atomic<uint32_t> refs;
// We need to_delete because during Cleanup(), imm->Unref() returns // We need to_delete because during Cleanup(), imm->Unref() returns
// all memtables that we need to free through this vector. We then // all memtables that we need to free through this vector. We then
@ -113,7 +118,6 @@ struct SuperVersion {
}; };
extern ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp, extern ColumnFamilyOptions SanitizeOptions(const InternalKeyComparator* icmp,
const InternalFilterPolicy* ipolicy,
const ColumnFamilyOptions& src); const ColumnFamilyOptions& src);
class ColumnFamilySet; class ColumnFamilySet;
@ -133,7 +137,7 @@ class ColumnFamilyData {
void Ref() { ++refs_; } void Ref() { ++refs_; }
// will just decrease reference count to 0, but will not delete it. returns // will just decrease reference count to 0, but will not delete it. returns
// true if the ref count was decreased to zero. in that case, it can be // true if the ref count was decreased to zero. in that case, it can be
// deleted by the caller immediatelly, or later, by calling // deleted by the caller immediately, or later, by calling
// FreeDeadColumnFamilies() // FreeDeadColumnFamilies()
bool Unref() { bool Unref() {
assert(refs_ > 0); assert(refs_ > 0);
@ -157,6 +161,7 @@ class ColumnFamilyData {
// can't drop default CF // can't drop default CF
assert(id_ != 0); assert(id_ != 0);
dropped_ = true; dropped_ = true;
write_controller_token_.reset();
} }
bool IsDropped() const { return dropped_; } bool IsDropped() const { return dropped_; }
@ -169,6 +174,21 @@ class ColumnFamilyData {
// thread-safe // thread-safe
const Options* options() const { return &options_; } const Options* options() const { return &options_; }
const EnvOptions* soptions() const; const EnvOptions* soptions() const;
const ImmutableCFOptions* ioptions() const { return &ioptions_; }
// REQUIRES: DB mutex held
// This returns the MutableCFOptions used by current SuperVersion
// You should use this API to reference MutableCFOptions most of the time.
const MutableCFOptions* mutable_cf_options() const {
return &(super_version_->mutable_cf_options);
}
// REQUIRES: DB mutex held
// This returns the latest MutableCFOptions, which may be not in effect yet.
const MutableCFOptions* GetLatestMutableCFOptions() const {
return &mutable_cf_options_;
}
// REQUIRES: DB mutex held
bool SetOptions(
const std::unordered_map<std::string, std::string>& options_map);
InternalStats* internal_stats() { return internal_stats_.get(); } InternalStats* internal_stats() { return internal_stats_.get(); }
@ -178,7 +198,7 @@ class ColumnFamilyData {
Version* dummy_versions() { return dummy_versions_; } Version* dummy_versions() { return dummy_versions_; }
void SetMemtable(MemTable* new_mem) { mem_ = new_mem; } void SetMemtable(MemTable* new_mem) { mem_ = new_mem; }
void SetCurrent(Version* current); void SetCurrent(Version* current);
void CreateNewMemtable(); void CreateNewMemtable(const MemTableOptions& moptions);
TableCache* table_cache() const { return table_cache_.get(); } TableCache* table_cache() const { return table_cache_.get(); }
@ -219,40 +239,20 @@ class ColumnFamilyData {
// if its reference count is zero and needs deletion or nullptr if not // if its reference count is zero and needs deletion or nullptr if not
// As argument takes a pointer to allocated SuperVersion to enable // As argument takes a pointer to allocated SuperVersion to enable
// the clients to allocate SuperVersion outside of mutex. // the clients to allocate SuperVersion outside of mutex.
SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
port::Mutex* db_mutex,
const MutableCFOptions& mutable_cf_options);
SuperVersion* InstallSuperVersion(SuperVersion* new_superversion, SuperVersion* InstallSuperVersion(SuperVersion* new_superversion,
port::Mutex* db_mutex); port::Mutex* db_mutex);
void ResetThreadLocalSuperVersions(); void ResetThreadLocalSuperVersions();
// A Flag indicating whether write needs to slowdown because of there are
// too many number of level0 files.
bool NeedSlowdownForNumLevel0Files() const {
return need_slowdown_for_num_level0_files_;
}
bool NeedWaitForNumLevel0Files() const {
return need_wait_for_num_level0_files_;
}
bool NeedWaitForNumMemtables() const {
return need_wait_for_num_memtables_;
}
bool ExceedsSoftRateLimit() const {
return exceeds_soft_rate_limit_;
}
bool ExceedsHardRateLimit() const {
return exceeds_hard_rate_limit_;
}
private: private:
friend class ColumnFamilySet; friend class ColumnFamilySet;
ColumnFamilyData(uint32_t id, const std::string& name, ColumnFamilyData(uint32_t id, const std::string& name,
Version* dummy_versions, Cache* table_cache, Version* dummy_versions, Cache* table_cache,
const ColumnFamilyOptions& options, const ColumnFamilyOptions& options,
const DBOptions* db_options, const DBOptions* db_options, const EnvOptions& env_options,
const EnvOptions& storage_options,
ColumnFamilySet* column_family_set); ColumnFamilySet* column_family_set);
// Recalculate some small conditions, which are changed only during // Recalculate some small conditions, which are changed only during
@ -261,7 +261,6 @@ class ColumnFamilyData {
// DBImpl::MakeRoomForWrite function to decide, if it need to make // DBImpl::MakeRoomForWrite function to decide, if it need to make
// a write stall // a write stall
void RecalculateWriteStallConditions(); void RecalculateWriteStallConditions();
void RecalculateWriteStallRateLimitsConditions();
uint32_t id_; uint32_t id_;
const std::string name_; const std::string name_;
@ -272,9 +271,10 @@ class ColumnFamilyData {
bool dropped_; // true if client dropped it bool dropped_; // true if client dropped it
const InternalKeyComparator internal_comparator_; const InternalKeyComparator internal_comparator_;
const InternalFilterPolicy internal_filter_policy_;
Options const options_; const Options options_;
const ImmutableCFOptions ioptions_;
MutableCFOptions mutable_cf_options_;
std::unique_ptr<TableCache> table_cache_; std::unique_ptr<TableCache> table_cache_;
@ -303,31 +303,13 @@ class ColumnFamilyData {
// recovered from // recovered from
uint64_t log_number_; uint64_t log_number_;
// A flag indicating whether we should delay writes because
// we have too many level 0 files
bool need_slowdown_for_num_level0_files_;
// These 4 variables are updated only after compaction,
// adding new memtable, flushing memtables to files
// and/or add recalculation of compaction score.
// That's why theirs values are cached in ColumnFamilyData.
// Recalculation is made by RecalculateWriteStallConditions and
// RecalculateWriteStallRateLimitsConditions function. They are used
// in DBImpl::MakeRoomForWrite function to decide, if it need
// to sleep during write operation
bool need_wait_for_num_memtables_;
bool need_wait_for_num_level0_files_;
bool exceeds_hard_rate_limit_;
bool exceeds_soft_rate_limit_;
// An object that keeps all the compaction stats // An object that keeps all the compaction stats
// and picks the next compaction // and picks the next compaction
std::unique_ptr<CompactionPicker> compaction_picker_; std::unique_ptr<CompactionPicker> compaction_picker_;
ColumnFamilySet* column_family_set_; ColumnFamilySet* column_family_set_;
std::unique_ptr<WriteControllerToken> write_controller_token_;
}; };
// ColumnFamilySet has interesting thread-safety requirements // ColumnFamilySet has interesting thread-safety requirements
@ -369,7 +351,8 @@ class ColumnFamilySet {
}; };
ColumnFamilySet(const std::string& dbname, const DBOptions* db_options, ColumnFamilySet(const std::string& dbname, const DBOptions* db_options,
const EnvOptions& storage_options, Cache* table_cache); const EnvOptions& env_options, Cache* table_cache,
WriteController* write_controller);
~ColumnFamilySet(); ~ColumnFamilySet();
ColumnFamilyData* GetDefault() const; ColumnFamilyData* GetDefault() const;
@ -422,8 +405,9 @@ class ColumnFamilySet {
const std::string db_name_; const std::string db_name_;
const DBOptions* const db_options_; const DBOptions* const db_options_;
const EnvOptions storage_options_; const EnvOptions env_options_;
Cache* table_cache_; Cache* table_cache_;
WriteController* write_controller_;
std::atomic_flag spin_lock_; std::atomic_flag spin_lock_;
}; };
@ -431,8 +415,11 @@ class ColumnFamilySet {
// memtables of different column families (specified by ID in the write batch) // memtables of different column families (specified by ID in the write batch)
class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables { class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
public: public:
explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set) explicit ColumnFamilyMemTablesImpl(ColumnFamilySet* column_family_set,
: column_family_set_(column_family_set), current_(nullptr) {} FlushScheduler* flush_scheduler)
: column_family_set_(column_family_set),
current_(nullptr),
flush_scheduler_(flush_scheduler) {}
// sets current_ to ColumnFamilyData with column_family_id // sets current_ to ColumnFamilyData with column_family_id
// returns false if column family doesn't exist // returns false if column family doesn't exist
@ -451,10 +438,18 @@ class ColumnFamilyMemTablesImpl : public ColumnFamilyMemTables {
// Returns column family handle for the selected column family // Returns column family handle for the selected column family
virtual ColumnFamilyHandle* GetColumnFamilyHandle() override; virtual ColumnFamilyHandle* GetColumnFamilyHandle() override;
virtual void CheckMemtableFull() override;
private: private:
ColumnFamilySet* column_family_set_; ColumnFamilySet* column_family_set_;
ColumnFamilyData* current_; ColumnFamilyData* current_;
FlushScheduler* flush_scheduler_;
ColumnFamilyHandleInternal handle_; ColumnFamilyHandleInternal handle_;
}; };
extern uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family);
extern const Comparator* GetColumnFamilyUserComparator(
ColumnFamilyHandle* column_family);
} // namespace rocksdb } // namespace rocksdb

@ -408,9 +408,15 @@ TEST(ColumnFamilyTest, WriteBatchFailure) {
Open(); Open();
CreateColumnFamiliesAndReopen({"one", "two"}); CreateColumnFamiliesAndReopen({"one", "two"});
WriteBatch batch; WriteBatch batch;
batch.Put(handles_[0], Slice("existing"), Slice("column-family"));
batch.Put(handles_[1], Slice("non-existing"), Slice("column-family")); batch.Put(handles_[1], Slice("non-existing"), Slice("column-family"));
ASSERT_OK(db_->Write(WriteOptions(), &batch)); ASSERT_OK(db_->Write(WriteOptions(), &batch));
DropColumnFamilies({1}); DropColumnFamilies({1});
WriteOptions woptions_ignore_missing_cf;
woptions_ignore_missing_cf.ignore_missing_column_families = true;
batch.Put(handles_[0], Slice("still here"), Slice("column-family"));
ASSERT_OK(db_->Write(woptions_ignore_missing_cf, &batch));
ASSERT_EQ("column-family", Get(0, "still here"));
Status s = db_->Write(WriteOptions(), &batch); Status s = db_->Write(WriteOptions(), &batch);
ASSERT_TRUE(s.IsInvalidArgument()); ASSERT_TRUE(s.IsInvalidArgument());
Close(); Close();
@ -746,9 +752,10 @@ TEST(ColumnFamilyTest, DifferentCompactionStyles) {
default_cf.num_levels = 3; default_cf.num_levels = 3;
default_cf.write_buffer_size = 64 << 10; // 64KB default_cf.write_buffer_size = 64 << 10; // 64KB
default_cf.target_file_size_base = 30 << 10; default_cf.target_file_size_base = 30 << 10;
default_cf.filter_policy = nullptr;
default_cf.no_block_cache = true;
default_cf.source_compaction_factor = 100; default_cf.source_compaction_factor = 100;
BlockBasedTableOptions table_options;
table_options.no_block_cache = true;
default_cf.table_factory.reset(NewBlockBasedTableFactory(table_options));
one.compaction_style = kCompactionStyleUniversal; one.compaction_style = kCompactionStyleUniversal;
// trigger compaction if there are >= 4 files // trigger compaction if there are >= 4 files

@ -9,7 +9,10 @@
#include "db/compaction.h" #include "db/compaction.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h> #include <inttypes.h>
#include <vector> #include <vector>
@ -110,8 +113,8 @@ void Compaction::AddInputDeletions(VersionEdit* edit) {
} }
bool Compaction::KeyNotExistsBeyondOutputLevel(const Slice& user_key) { bool Compaction::KeyNotExistsBeyondOutputLevel(const Slice& user_key) {
assert(cfd_->options()->compaction_style != kCompactionStyleFIFO); assert(cfd_->ioptions()->compaction_style != kCompactionStyleFIFO);
if (cfd_->options()->compaction_style == kCompactionStyleUniversal) { if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
return bottommost_level_; return bottommost_level_;
} }
// Maybe use binary search to find right entry instead of linear search? // Maybe use binary search to find right entry instead of linear search?
@ -174,8 +177,8 @@ void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
// Is this compaction producing files at the bottommost level? // Is this compaction producing files at the bottommost level?
void Compaction::SetupBottomMostLevel(bool is_manual) { void Compaction::SetupBottomMostLevel(bool is_manual) {
assert(cfd_->options()->compaction_style != kCompactionStyleFIFO); assert(cfd_->ioptions()->compaction_style != kCompactionStyleFIFO);
if (cfd_->options()->compaction_style == kCompactionStyleUniversal) { if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
// If universal compaction style is used and manual // If universal compaction style is used and manual
// compaction is occurring, then we are guaranteed that // compaction is occurring, then we are guaranteed that
// all files will be picked in a single compaction // all files will be picked in a single compaction
@ -267,7 +270,7 @@ void Compaction::Summary(char* output, int len) {
uint64_t Compaction::OutputFilePreallocationSize() { uint64_t Compaction::OutputFilePreallocationSize() {
uint64_t preallocation_size = 0; uint64_t preallocation_size = 0;
if (cfd_->options()->compaction_style == kCompactionStyleLevel) { if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
preallocation_size = preallocation_size =
cfd_->compaction_picker()->MaxFileSizeForLevel(output_level()); cfd_->compaction_picker()->MaxFileSizeForLevel(output_level());
} else { } else {

@ -9,7 +9,10 @@
#include "db/compaction_picker.h" #include "db/compaction_picker.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h> #include <inttypes.h>
#include <limits> #include <limits>
#include "db/filename.h" #include "db/filename.h"
@ -39,13 +42,13 @@ CompressionType GetCompressionType(const Options& options, int level,
return kNoCompression; return kNoCompression;
} }
// If the user has specified a different compression level for each level, // If the user has specified a different compression level for each level,
// then pick the compresison for that level. // then pick the compression for that level.
if (!options.compression_per_level.empty()) { if (!options.compression_per_level.empty()) {
const int n = options.compression_per_level.size() - 1; const int n = options.compression_per_level.size() - 1;
// It is possible for level_ to be -1; in that case, we use level // It is possible for level_ to be -1; in that case, we use level
// 0's compression. This occurs mostly in backwards compatibility // 0's compression. This occurs mostly in backwards compatibility
// situations when the builder doesn't know what level the file // situations when the builder doesn't know what level the file
// belongs to. Likewise, if level_ is beyond the end of the // belongs to. Likewise, if level is beyond the end of the
// specified compression levels, use the last value. // specified compression levels, use the last value.
return options.compression_per_level[std::max(0, std::min(level, n))]; return options.compression_per_level[std::max(0, std::min(level, n))];
} else { } else {
@ -173,9 +176,12 @@ void CompactionPicker::GetRange(const std::vector<FileMetaData*>& inputs1,
} }
bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) { bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) {
assert(c != nullptr);
// If inputs are empty then there is nothing to expand. // If inputs are empty then there is nothing to expand.
if (!c || c->inputs_[0].empty()) { if (c->inputs_[0].empty()) {
return true; assert(c->inputs_[1].empty());
// This isn't good compaction
return false;
} }
// GetOverlappingInputs will always do the right thing for level-0. // GetOverlappingInputs will always do the right thing for level-0.
@ -427,7 +433,7 @@ Compaction* LevelCompactionPicker::PickCompaction(Version* version,
level = version->compaction_level_[i]; level = version->compaction_level_[i];
if ((version->compaction_score_[i] >= 1)) { if ((version->compaction_score_[i] >= 1)) {
c = PickCompactionBySize(version, level, version->compaction_score_[i]); c = PickCompactionBySize(version, level, version->compaction_score_[i]);
if (ExpandWhileOverlapping(c) == false) { if (c == nullptr || ExpandWhileOverlapping(c) == false) {
delete c; delete c;
c = nullptr; c = nullptr;
} else { } else {

@ -45,7 +45,9 @@ class CorruptionTest {
db_ = nullptr; db_ = nullptr;
options_.create_if_missing = true; options_.create_if_missing = true;
options_.block_size_deviation = 0; // make unit test pass for now BlockBasedTableOptions table_options;
table_options.block_size_deviation = 0; // make unit test pass for now
options_.table_factory.reset(NewBlockBasedTableFactory(table_options));
Reopen(); Reopen();
options_.create_if_missing = false; options_.create_if_missing = false;
} }
@ -60,9 +62,11 @@ class CorruptionTest {
db_ = nullptr; db_ = nullptr;
Options opt = (options ? *options : options_); Options opt = (options ? *options : options_);
opt.env = &env_; opt.env = &env_;
opt.block_cache = tiny_cache_;
opt.block_size_deviation = 0;
opt.arena_block_size = 4096; opt.arena_block_size = 4096;
BlockBasedTableOptions table_options;
table_options.block_cache = tiny_cache_;
table_options.block_size_deviation = 0;
opt.table_factory.reset(NewBlockBasedTableFactory(table_options));
return DB::Open(opt, dbname_, &db_); return DB::Open(opt, dbname_, &db_);
} }
@ -328,6 +332,9 @@ TEST(CorruptionTest, CorruptedDescriptor) {
} }
TEST(CorruptionTest, CompactionInputError) { TEST(CorruptionTest, CompactionInputError) {
Options options;
options.max_background_flushes = 0;
Reopen(&options);
Build(10); Build(10);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_FlushMemTable(); dbi->TEST_FlushMemTable();
@ -347,6 +354,7 @@ TEST(CorruptionTest, CompactionInputErrorParanoid) {
options.paranoid_checks = true; options.paranoid_checks = true;
options.write_buffer_size = 131072; options.write_buffer_size = 131072;
options.max_write_buffer_number = 2; options.max_write_buffer_number = 2;
options.max_background_flushes = 0;
Reopen(&options); Reopen(&options);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_); DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);

@ -131,8 +131,6 @@ TEST(CuckooTableDBTest, Flush) {
ASSERT_EQ("v2", Get("key2")); ASSERT_EQ("v2", Get("key2"));
ASSERT_EQ("v3", Get("key3")); ASSERT_EQ("v3", Get("key3"));
ASSERT_EQ("NOT_FOUND", Get("key4")); ASSERT_EQ("NOT_FOUND", Get("key4"));
ASSERT_EQ("Invalid argument: Length of key is invalid.", Get("somelongkey"));
ASSERT_EQ("Invalid argument: Length of key is invalid.", Get("s"));
// Now add more keys and flush. // Now add more keys and flush.
ASSERT_OK(Put("key4", "v4")); ASSERT_OK(Put("key4", "v4"));
@ -195,6 +193,38 @@ static std::string Key(int i) {
snprintf(buf, sizeof(buf), "key_______%06d", i); snprintf(buf, sizeof(buf), "key_______%06d", i);
return std::string(buf); return std::string(buf);
} }
// Builds an 8-byte key whose contents are the raw in-memory bytes of `i`
// (i.e. the integer exactly as the host stores it, with no byte swapping).
static std::string Uint64Key(uint64_t i) {
  const char* raw_bytes = reinterpret_cast<const char*>(&i);
  return std::string(raw_bytes, 8);
}
} // namespace.
TEST(CuckooTableDBTest, Uint64Comparator) {
Options options = CurrentOptions();
options.comparator = test::Uint64Comparator();
Reopen(&options);
ASSERT_OK(Put(Uint64Key(1), "v1"));
ASSERT_OK(Put(Uint64Key(2), "v2"));
ASSERT_OK(Put(Uint64Key(3), "v3"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v1", Get(Uint64Key(1)));
ASSERT_EQ("v2", Get(Uint64Key(2)));
ASSERT_EQ("v3", Get(Uint64Key(3)));
ASSERT_EQ("NOT_FOUND", Get(Uint64Key(4)));
// Add more keys.
ASSERT_OK(Delete(Uint64Key(2))); // Delete.
ASSERT_OK(Put(Uint64Key(3), "v0")); // Update.
ASSERT_OK(Put(Uint64Key(4), "v4"));
dbfull()->TEST_FlushMemTable();
ASSERT_EQ("v1", Get(Uint64Key(1)));
ASSERT_EQ("NOT_FOUND", Get(Uint64Key(2)));
ASSERT_EQ("v0", Get(Uint64Key(3)));
ASSERT_EQ("v4", Get(Uint64Key(4)));
} }
TEST(CuckooTableDBTest, CompactionTrigger) { TEST(CuckooTableDBTest, CompactionTrigger) {
@ -215,14 +245,38 @@ TEST(CuckooTableDBTest, CompactionTrigger) {
ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx))); ASSERT_OK(Put(Key(idx), std::string(10000, 'a' + idx)));
} }
dbfull()->TEST_WaitForFlushMemTable(); dbfull()->TEST_WaitForFlushMemTable();
dbfull()->TEST_CompactRange(0, nullptr, nullptr); ASSERT_EQ("2", FilesPerLevel());
dbfull()->TEST_CompactRange(0, nullptr, nullptr);
ASSERT_EQ("0,2", FilesPerLevel()); ASSERT_EQ("0,2", FilesPerLevel());
for (int idx = 0; idx < 22; ++idx) { for (int idx = 0; idx < 22; ++idx) {
ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx))); ASSERT_EQ(std::string(10000, 'a' + idx), Get(Key(idx)));
} }
} }
TEST(CuckooTableDBTest, CompactionIntoMultipleFiles) {
  // Flush everything into a single large L0 file, then verify that a manual
  // compaction of level 0 splits it into more than one file in L1.
  const int kNumKeys = 28;
  const int kValueSize = 10000;

  Options options = CurrentOptions();
  options.write_buffer_size = 270 << 10;
  // The target file size is chosen so that two SST files are expected, each
  // holding 14 keys (16 buckets, roughly 156 KB in total).
  options.target_file_size_base = 160 << 10;
  Reopen(&options);

  // Each entry is a ~10KB value (10016 B including the key); 28 are written.
  for (int i = 0; i < kNumKeys; ++i) {
    const std::string value(kValueSize, static_cast<char>('a' + i));
    ASSERT_OK(Put(Key(i), value));
  }

  // After the flush there must be exactly one file, sitting in level 0.
  dbfull()->TEST_WaitForFlushMemTable();
  ASSERT_EQ("1", FilesPerLevel());

  // Compacting level 0 must leave it empty and produce two files in level 1.
  dbfull()->TEST_CompactRange(0, nullptr, nullptr);
  ASSERT_EQ("0,2", FilesPerLevel());

  // Every key must still read back with the value that was written.
  for (int i = 0; i < kNumKeys; ++i) {
    const std::string expected(kValueSize, static_cast<char>('a' + i));
    ASSERT_EQ(expected, Get(Key(i)));
  }
}
TEST(CuckooTableDBTest, SameKeyInsertedInTwoDifferentFilesAndCompacted) { TEST(CuckooTableDBTest, SameKeyInsertedInTwoDifferentFilesAndCompacted) {
// Insert same key twice so that they go to different SST files. Then wait for // Insert same key twice so that they go to different SST files. Then wait for
// compaction and check if the latest value is stored and old value removed. // compaction and check if the latest value is stored and old value removed.

@ -7,7 +7,9 @@
// Use of this source code is governed by a BSD-style license that can be // Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS
#endif
#ifndef GFLAGS #ifndef GFLAGS
#include <cstdio> #include <cstdio>
@ -37,8 +39,8 @@ int main() {
#include "rocksdb/memtablerep.h" #include "rocksdb/memtablerep.h"
#include "rocksdb/write_batch.h" #include "rocksdb/write_batch.h"
#include "rocksdb/slice.h" #include "rocksdb/slice.h"
#include "rocksdb/filter_policy.h"
#include "rocksdb/slice_transform.h" #include "rocksdb/slice_transform.h"
#include "rocksdb/statistics.h"
#include "rocksdb/perf_context.h" #include "rocksdb/perf_context.h"
#include "port/port.h" #include "port/port.h"
#include "port/stack_trace.h" #include "port/stack_trace.h"
@ -146,6 +148,7 @@ DEFINE_int64(merge_keys, -1,
"Number of distinct keys to use for MergeRandom and " "Number of distinct keys to use for MergeRandom and "
"ReadRandomMergeRandom. " "ReadRandomMergeRandom. "
"If negative, there will be FLAGS_num keys."); "If negative, there will be FLAGS_num keys.");
DEFINE_int32(num_column_families, 1, "Number of Column Families to use.");
DEFINE_int64(reads, -1, "Number of read operations to do. " DEFINE_int64(reads, -1, "Number of read operations to do. "
"If negative, do FLAGS_num reads."); "If negative, do FLAGS_num reads.");
@ -162,6 +165,7 @@ DEFINE_int32(duration, 0, "Time in seconds for the random-ops tests to run."
DEFINE_int32(value_size, 100, "Size of each value"); DEFINE_int32(value_size, 100, "Size of each value");
DEFINE_bool(use_uint64_comparator, false, "use Uint64 user comparator");
static bool ValidateKeySize(const char* flagname, int32_t value) { static bool ValidateKeySize(const char* flagname, int32_t value) {
return true; return true;
@ -238,10 +242,11 @@ DEFINE_int32(universal_compression_size_percent, -1,
DEFINE_int64(cache_size, -1, "Number of bytes to use as a cache of uncompressed" DEFINE_int64(cache_size, -1, "Number of bytes to use as a cache of uncompressed"
"data. Negative means use default settings."); "data. Negative means use default settings.");
DEFINE_int32(block_size, rocksdb::Options().block_size, DEFINE_int32(block_size, rocksdb::BlockBasedTableOptions().block_size,
"Number of bytes in a block."); "Number of bytes in a block.");
DEFINE_int32(block_restart_interval, rocksdb::Options().block_restart_interval, DEFINE_int32(block_restart_interval,
rocksdb::BlockBasedTableOptions().block_restart_interval,
"Number of keys between restart points " "Number of keys between restart points "
"for delta encoding of keys."); "for delta encoding of keys.");
@ -302,7 +307,7 @@ DEFINE_string(wal_dir, "", "If not empty, use the given dir for WAL");
DEFINE_int32(num_levels, 7, "The total number of levels"); DEFINE_int32(num_levels, 7, "The total number of levels");
DEFINE_int32(target_file_size_base, 2 * 1048576, "Target file size at level-1"); DEFINE_int64(target_file_size_base, 2 * 1048576, "Target file size at level-1");
DEFINE_int32(target_file_size_multiplier, 1, DEFINE_int32(target_file_size_multiplier, 1,
"A multiplier to compute target level-N file size (N >= 2)"); "A multiplier to compute target level-N file size (N >= 2)");
@ -509,6 +514,9 @@ DEFINE_int64(keys_per_prefix, 0, "control average number of keys generated "
"i.e. use the prefix comes with the generated random number."); "i.e. use the prefix comes with the generated random number.");
DEFINE_bool(enable_io_prio, false, "Lower the background flush/compaction " DEFINE_bool(enable_io_prio, false, "Lower the background flush/compaction "
"threads' IO priority"); "threads' IO priority");
DEFINE_bool(identity_as_first_hash, false, "the first hash function of cuckoo "
"table becomes an identity function. This is only valid when key "
"is 8 bytes");
enum RepFactory { enum RepFactory {
kSkipList, kSkipList,
@ -548,7 +556,9 @@ DEFINE_double(cuckoo_hash_ratio, 0.9, "Hash ratio for Cuckoo SST table.");
DEFINE_bool(use_hash_search, false, "if use kHashSearch " DEFINE_bool(use_hash_search, false, "if use kHashSearch "
"instead of kBinarySearch. " "instead of kBinarySearch. "
"This is valid if only we use BlockTable"); "This is valid if only we use BlockTable");
DEFINE_bool(use_block_based_filter, false, "if use kBlockBasedFilter "
"instead of kFullFilter for filter block. "
"This is valid if only we use BlockTable");
DEFINE_string(merge_operator, "", "The merge operator to use with the database." DEFINE_string(merge_operator, "", "The merge operator to use with the database."
"If a new merge operator is specified, be sure to use fresh" "If a new merge operator is specified, be sure to use fresh"
" database The possible merge operators are defined in" " database The possible merge operators are defined in"
@ -843,12 +853,19 @@ class Duration {
class Benchmark { class Benchmark {
private: private:
shared_ptr<Cache> cache_; std::shared_ptr<Cache> cache_;
shared_ptr<Cache> compressed_cache_; std::shared_ptr<Cache> compressed_cache_;
const FilterPolicy* filter_policy_; std::shared_ptr<const FilterPolicy> filter_policy_;
const SliceTransform* prefix_extractor_; const SliceTransform* prefix_extractor_;
DB* db_; struct DBWithColumnFamilies {
std::vector<DB*> multi_dbs_; std::vector<ColumnFamilyHandle*> cfh;
DB* db;
DBWithColumnFamilies() : db(nullptr) {
cfh.clear();
}
};
DBWithColumnFamilies db_;
std::vector<DBWithColumnFamilies> multi_dbs_;
int64_t num_; int64_t num_;
int value_size_; int value_size_;
int key_size_; int key_size_;
@ -1064,11 +1081,10 @@ class Benchmark {
(FLAGS_cache_numshardbits >= 1 ? (FLAGS_cache_numshardbits >= 1 ?
NewLRUCache(FLAGS_compressed_cache_size, FLAGS_cache_numshardbits) : NewLRUCache(FLAGS_compressed_cache_size, FLAGS_cache_numshardbits) :
NewLRUCache(FLAGS_compressed_cache_size)) : nullptr), NewLRUCache(FLAGS_compressed_cache_size)) : nullptr),
filter_policy_(FLAGS_bloom_bits >= 0 filter_policy_(FLAGS_bloom_bits >= 0 ?
? NewBloomFilterPolicy(FLAGS_bloom_bits) NewBloomFilterPolicy(FLAGS_bloom_bits, FLAGS_use_block_based_filter)
: nullptr), : nullptr),
prefix_extractor_(NewFixedPrefixTransform(FLAGS_prefix_size)), prefix_extractor_(NewFixedPrefixTransform(FLAGS_prefix_size)),
db_(nullptr),
num_(FLAGS_num), num_(FLAGS_num),
value_size_(FLAGS_value_size), value_size_(FLAGS_value_size),
key_size_(FLAGS_key_size), key_size_(FLAGS_key_size),
@ -1099,8 +1115,9 @@ class Benchmark {
} }
~Benchmark() { ~Benchmark() {
delete db_; std::for_each(db_.cfh.begin(), db_.cfh.end(),
delete filter_policy_; [](ColumnFamilyHandle* cfh) { delete cfh; });
delete db_.db;
delete prefix_extractor_; delete prefix_extractor_;
} }
@ -1159,6 +1176,16 @@ class Benchmark {
return base_name + std::to_string(id); return base_name + std::to_string(id);
} }
// Maps a column family index to its name: index 0 yields
// kDefaultColumnFamilyName, every other index yields "column_family_name_"
// followed by the index formatted with %06d (zero-padded to six digits).
std::string ColumnFamilyName(int i) {
  if (i == 0) {
    return kDefaultColumnFamilyName;
  }
  char buf[100];
  snprintf(buf, sizeof(buf), "column_family_name_%06d", i);
  return std::string(buf);
}
void Run() { void Run() {
if (!SanityCheck()) { if (!SanityCheck()) {
exit(1); exit(1);
@ -1313,13 +1340,16 @@ class Benchmark {
name.ToString().c_str()); name.ToString().c_str());
method = nullptr; method = nullptr;
} else { } else {
if (db_ != nullptr) { if (db_.db != nullptr) {
delete db_; std::for_each(db_.cfh.begin(), db_.cfh.end(),
db_ = nullptr; [](ColumnFamilyHandle* cfh) { delete cfh; });
delete db_.db;
db_.db = nullptr;
db_.cfh.clear();
DestroyDB(FLAGS_db, Options()); DestroyDB(FLAGS_db, Options());
} }
for (size_t i = 0; i < multi_dbs_.size(); i++) { for (size_t i = 0; i < multi_dbs_.size(); i++) {
delete multi_dbs_[i]; delete multi_dbs_[i].db;
DestroyDB(GetDbNameForMultiple(FLAGS_db, i), Options()); DestroyDB(GetDbNameForMultiple(FLAGS_db, i), Options());
} }
multi_dbs_.clear(); multi_dbs_.clear();
@ -1491,7 +1521,7 @@ class Benchmark {
void Compress(ThreadState *thread) { void Compress(ThreadState *thread) {
RandomGenerator gen; RandomGenerator gen;
Slice input = gen.Generate(Options().block_size); Slice input = gen.Generate(FLAGS_block_size);
int64_t bytes = 0; int64_t bytes = 0;
int64_t produced = 0; int64_t produced = 0;
bool ok = true; bool ok = true;
@ -1541,7 +1571,7 @@ class Benchmark {
void Uncompress(ThreadState *thread) { void Uncompress(ThreadState *thread) {
RandomGenerator gen; RandomGenerator gen;
Slice input = gen.Generate(Options().block_size); Slice input = gen.Generate(FLAGS_block_size);
std::string compressed; std::string compressed;
bool ok; bool ok;
@ -1617,14 +1647,10 @@ class Benchmark {
} }
void Open() { void Open() {
assert(db_ == nullptr); assert(db_.db == nullptr);
Options options; Options options;
options.create_if_missing = !FLAGS_use_existing_db; options.create_if_missing = !FLAGS_use_existing_db;
options.block_cache = cache_; options.create_missing_column_families = FLAGS_num_column_families > 1;
options.block_cache_compressed = compressed_cache_;
if (cache_ == nullptr) {
options.no_block_cache = true;
}
options.write_buffer_size = FLAGS_write_buffer_size; options.write_buffer_size = FLAGS_write_buffer_size;
options.max_write_buffer_number = FLAGS_max_write_buffer_number; options.max_write_buffer_number = FLAGS_max_write_buffer_number;
options.min_write_buffer_number_to_merge = options.min_write_buffer_number_to_merge =
@ -1632,13 +1658,17 @@ class Benchmark {
options.max_background_compactions = FLAGS_max_background_compactions; options.max_background_compactions = FLAGS_max_background_compactions;
options.max_background_flushes = FLAGS_max_background_flushes; options.max_background_flushes = FLAGS_max_background_flushes;
options.compaction_style = FLAGS_compaction_style_e; options.compaction_style = FLAGS_compaction_style_e;
options.block_size = FLAGS_block_size;
options.block_restart_interval = FLAGS_block_restart_interval;
options.filter_policy = filter_policy_;
if (FLAGS_prefix_size != 0) { if (FLAGS_prefix_size != 0) {
options.prefix_extractor.reset( options.prefix_extractor.reset(
NewFixedPrefixTransform(FLAGS_prefix_size)); NewFixedPrefixTransform(FLAGS_prefix_size));
} }
if (FLAGS_use_uint64_comparator) {
options.comparator = test::Uint64Comparator();
if (FLAGS_key_size != 8) {
fprintf(stderr, "Using Uint64 comparator but key size is not 8.\n");
exit(1);
}
}
options.memtable_prefix_bloom_bits = FLAGS_memtable_bloom_bits; options.memtable_prefix_bloom_bits = FLAGS_memtable_bloom_bits;
options.bloom_locality = FLAGS_bloom_locality; options.bloom_locality = FLAGS_bloom_locality;
options.max_open_files = FLAGS_open_files; options.max_open_files = FLAGS_open_files;
@ -1712,8 +1742,11 @@ class Benchmark {
fprintf(stderr, "Invalid cuckoo_hash_ratio\n"); fprintf(stderr, "Invalid cuckoo_hash_ratio\n");
exit(1); exit(1);
} }
rocksdb::CuckooTableOptions table_options;
table_options.hash_table_ratio = FLAGS_cuckoo_hash_ratio;
table_options.identity_as_first_hash = FLAGS_identity_as_first_hash;
options.table_factory = std::shared_ptr<TableFactory>( options.table_factory = std::shared_ptr<TableFactory>(
NewCuckooTableFactory(FLAGS_cuckoo_hash_ratio)); NewCuckooTableFactory(table_options));
} else { } else {
BlockBasedTableOptions block_based_options; BlockBasedTableOptions block_based_options;
if (FLAGS_use_hash_search) { if (FLAGS_use_hash_search) {
@ -1726,6 +1759,14 @@ class Benchmark {
} else { } else {
block_based_options.index_type = BlockBasedTableOptions::kBinarySearch; block_based_options.index_type = BlockBasedTableOptions::kBinarySearch;
} }
if (cache_ == nullptr) {
block_based_options.no_block_cache = true;
}
block_based_options.block_cache = cache_;
block_based_options.block_cache_compressed = compressed_cache_;
block_based_options.block_size = FLAGS_block_size;
block_based_options.block_restart_interval = FLAGS_block_restart_interval;
block_based_options.filter_policy = filter_policy_;
options.table_factory.reset( options.table_factory.reset(
NewBlockBasedTableFactory(block_based_options)); NewBlockBasedTableFactory(block_based_options));
} }
@ -1816,10 +1857,9 @@ class Benchmark {
OpenDb(options, FLAGS_db, &db_); OpenDb(options, FLAGS_db, &db_);
} else { } else {
multi_dbs_.clear(); multi_dbs_.clear();
multi_dbs_.resize(FLAGS_num_multi_db);
for (int i = 0; i < FLAGS_num_multi_db; i++) { for (int i = 0; i < FLAGS_num_multi_db; i++) {
DB* db; OpenDb(options, GetDbNameForMultiple(FLAGS_db, i), &multi_dbs_[i]);
OpenDb(options, GetDbNameForMultiple(FLAGS_db, i), &db);
multi_dbs_.push_back(db);
} }
} }
if (FLAGS_min_level_to_compress >= 0) { if (FLAGS_min_level_to_compress >= 0) {
@ -1827,12 +1867,27 @@ class Benchmark {
} }
} }
void OpenDb(Options options, std::string db_name, DB** db) { void OpenDb(const Options& options, const std::string& db_name,
DBWithColumnFamilies* db) {
Status s; Status s;
if(FLAGS_readonly) { // Open with column families if necessary.
s = DB::OpenForReadOnly(options, db_name, db); if (FLAGS_num_column_families > 1) {
db->cfh.resize(FLAGS_num_column_families);
std::vector<ColumnFamilyDescriptor> column_families;
for (int i = 0; i < FLAGS_num_column_families; i++) {
column_families.push_back(ColumnFamilyDescriptor(
ColumnFamilyName(i), ColumnFamilyOptions(options)));
}
if (FLAGS_readonly) {
s = DB::OpenForReadOnly(options, db_name, column_families,
&db->cfh, &db->db);
} else {
s = DB::Open(options, db_name, column_families, &db->cfh, &db->db);
}
} else if (FLAGS_readonly) {
s = DB::OpenForReadOnly(options, db_name, &db->db);
} else { } else {
s = DB::Open(options, db_name, db); s = DB::Open(options, db_name, &db->db);
} }
if (!s.ok()) { if (!s.ok()) {
fprintf(stderr, "open error: %s\n", s.ToString().c_str()); fprintf(stderr, "open error: %s\n", s.ToString().c_str());
@ -1900,10 +1955,18 @@ class Benchmark {
}; };
DB* SelectDB(ThreadState* thread) { DB* SelectDB(ThreadState* thread) {
if (db_ != nullptr) { return SelectDBWithCfh(thread)->db;
return db_; }
// Convenience overload: draws the next value from the thread's random
// generator and forwards it to the integer-taking SelectDBWithCfh overload.
DBWithColumnFamilies* SelectDBWithCfh(ThreadState* thread) {
  const auto rand_int = thread->rand.Next();
  return SelectDBWithCfh(rand_int);
}
DBWithColumnFamilies* SelectDBWithCfh(uint64_t rand_int) {
if (db_.db != nullptr) {
return &db_;
} else { } else {
return multi_dbs_[thread->rand.Next() % multi_dbs_.size()]; return &multi_dbs_[rand_int % multi_dbs_.size()];
} }
} }
@ -1912,7 +1975,7 @@ class Benchmark {
const int64_t num_ops = writes_ == 0 ? num_ : writes_; const int64_t num_ops = writes_ == 0 ? num_ : writes_;
size_t num_key_gens = 1; size_t num_key_gens = 1;
if (db_ == nullptr) { if (db_.db == nullptr) {
num_key_gens = multi_dbs_.size(); num_key_gens = multi_dbs_.size();
} }
std::vector<std::unique_ptr<KeyGenerator>> key_gens(num_key_gens); std::vector<std::unique_ptr<KeyGenerator>> key_gens(num_key_gens);
@ -1935,20 +1998,25 @@ class Benchmark {
Slice key = AllocateKey(); Slice key = AllocateKey();
std::unique_ptr<const char[]> key_guard(key.data()); std::unique_ptr<const char[]> key_guard(key.data());
while (!duration.Done(entries_per_batch_)) { while (!duration.Done(entries_per_batch_)) {
size_t id = 0; size_t id = thread->rand.Next() % num_key_gens;
DB* db_to_write = db_; DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(id);
if (db_to_write == nullptr) {
id = thread->rand.Next() % num_key_gens;
db_to_write = multi_dbs_[id];
}
batch.Clear(); batch.Clear();
for (int64_t j = 0; j < entries_per_batch_; j++) { for (int64_t j = 0; j < entries_per_batch_; j++) {
GenerateKeyFromInt(key_gens[id]->Next(), FLAGS_num, &key); int64_t rand_num = key_gens[id]->Next();
GenerateKeyFromInt(rand_num, FLAGS_num, &key);
if (FLAGS_num_column_families <= 1) {
batch.Put(key, gen.Generate(value_size_)); batch.Put(key, gen.Generate(value_size_));
} else {
// We use same rand_num as seed for key and column family so that we
// can deterministically find the cfh corresponding to a particular
// key while reading the key.
batch.Put(db_with_cfh->cfh[rand_num % db_with_cfh->cfh.size()],
key, gen.Generate(value_size_));
}
bytes += value_size_ + key_size_; bytes += value_size_ + key_size_;
} }
s = db_to_write->Write(write_options_, &batch); s = db_with_cfh->db->Write(write_options_, &batch);
thread->stats.FinishedOps(db_to_write, entries_per_batch_); thread->stats.FinishedOps(db_with_cfh->db, entries_per_batch_);
if (!s.ok()) { if (!s.ok()) {
fprintf(stderr, "put error: %s\n", s.ToString().c_str()); fprintf(stderr, "put error: %s\n", s.ToString().c_str());
exit(1); exit(1);
@ -1958,11 +2026,11 @@ class Benchmark {
} }
void ReadSequential(ThreadState* thread) { void ReadSequential(ThreadState* thread) {
if (db_ != nullptr) { if (db_.db != nullptr) {
ReadSequential(thread, db_); ReadSequential(thread, db_.db);
} else { } else {
for (DB* db : multi_dbs_) { for (const auto& db_with_cfh : multi_dbs_) {
ReadSequential(thread, db); ReadSequential(thread, db_with_cfh.db);
} }
} }
} }
@ -1981,11 +2049,11 @@ class Benchmark {
} }
void ReadReverse(ThreadState* thread) { void ReadReverse(ThreadState* thread) {
if (db_ != nullptr) { if (db_.db != nullptr) {
ReadReverse(thread, db_); ReadReverse(thread, db_.db);
} else { } else {
for (DB* db : multi_dbs_) { for (const auto& db_with_cfh : multi_dbs_) {
ReadReverse(thread, db); ReadReverse(thread, db_with_cfh.db);
} }
} }
} }
@ -1996,7 +2064,7 @@ class Benchmark {
int64_t bytes = 0; int64_t bytes = 0;
for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) { for (iter->SeekToLast(); i < reads_ && iter->Valid(); iter->Prev()) {
bytes += iter->key().size() + iter->value().size(); bytes += iter->key().size() + iter->value().size();
thread->stats.FinishedOps(db_, 1); thread->stats.FinishedOps(db, 1);
++i; ++i;
} }
delete iter; delete iter;
@ -2013,13 +2081,24 @@ class Benchmark {
Duration duration(FLAGS_duration, reads_); Duration duration(FLAGS_duration, reads_);
while (!duration.Done(1)) { while (!duration.Done(1)) {
DB* db = SelectDB(thread); DBWithColumnFamilies* db_with_cfh = SelectDBWithCfh(thread);
GenerateKeyFromInt(thread->rand.Next() % FLAGS_num, FLAGS_num, &key); // We use same key_rand as seed for key and column family so that we can
// deterministically find the cfh corresponding to a particular key, as it
// is done in DoWrite method.
int64_t key_rand = thread->rand.Next() % FLAGS_num;
GenerateKeyFromInt(key_rand, FLAGS_num, &key);
read++; read++;
if (db->Get(options, key, &value).ok()) { Status s;
if (FLAGS_num_column_families > 1) {
s = db_with_cfh->db->Get(options,
db_with_cfh->cfh[key_rand % db_with_cfh->cfh.size()], key, &value);
} else {
s = db_with_cfh->db->Get(options, key, &value);
}
if (s.ok()) {
found++; found++;
} }
thread->stats.FinishedOps(db_, 1); thread->stats.FinishedOps(db_with_cfh->db, 1);
} }
char msg[100]; char msg[100];
@ -2061,6 +2140,7 @@ class Benchmark {
++found; ++found;
} }
} }
thread->stats.FinishedOps(db, entries_per_batch_);
} }
for (auto& k : keys) { for (auto& k : keys) {
delete k.data(); delete k.data();
@ -2099,11 +2179,11 @@ class Benchmark {
Iterator* single_iter = nullptr; Iterator* single_iter = nullptr;
std::vector<Iterator*> multi_iters; std::vector<Iterator*> multi_iters;
if (db_ != nullptr) { if (db_.db != nullptr) {
single_iter = db_->NewIterator(options); single_iter = db_.db->NewIterator(options);
} else { } else {
for (DB* db : multi_dbs_) { for (const auto& db_with_cfh : multi_dbs_) {
multi_iters.push_back(db->NewIterator(options)); multi_iters.push_back(db_with_cfh.db->NewIterator(options));
} }
} }
uint64_t last_refresh = FLAGS_env->NowMicros(); uint64_t last_refresh = FLAGS_env->NowMicros();
@ -2116,16 +2196,16 @@ class Benchmark {
if (!FLAGS_use_tailing_iterator && FLAGS_iter_refresh_interval_us >= 0) { if (!FLAGS_use_tailing_iterator && FLAGS_iter_refresh_interval_us >= 0) {
uint64_t now = FLAGS_env->NowMicros(); uint64_t now = FLAGS_env->NowMicros();
if (now - last_refresh > (uint64_t)FLAGS_iter_refresh_interval_us) { if (now - last_refresh > (uint64_t)FLAGS_iter_refresh_interval_us) {
if (db_ != nullptr) { if (db_.db != nullptr) {
delete single_iter; delete single_iter;
single_iter = db_->NewIterator(options); single_iter = db_.db->NewIterator(options);
} else { } else {
for (auto iter : multi_iters) { for (auto iter : multi_iters) {
delete iter; delete iter;
} }
multi_iters.clear(); multi_iters.clear();
for (DB* db : multi_dbs_) { for (const auto& db_with_cfh : multi_dbs_) {
multi_iters.push_back(db->NewIterator(options)); multi_iters.push_back(db_with_cfh.db->NewIterator(options));
} }
} }
} }
@ -2143,7 +2223,7 @@ class Benchmark {
if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) { if (iter_to_use->Valid() && iter_to_use->key().compare(key) == 0) {
found++; found++;
} }
thread->stats.FinishedOps(db_, 1); thread->stats.FinishedOps(db_.db, 1);
} }
delete single_iter; delete single_iter;
for (auto iter : multi_iters) { for (auto iter : multi_iters) {
@ -2243,7 +2323,7 @@ class Benchmark {
fprintf(stderr, "put error: %s\n", s.ToString().c_str()); fprintf(stderr, "put error: %s\n", s.ToString().c_str());
exit(1); exit(1);
} }
thread->stats.FinishedOps(db_, 1); thread->stats.FinishedOps(db_.db, 1);
++num_writes; ++num_writes;
if (writes_per_second_by_10 && num_writes >= writes_per_second_by_10) { if (writes_per_second_by_10 && num_writes >= writes_per_second_by_10) {
@ -2403,7 +2483,7 @@ class Benchmark {
deletes_done++; deletes_done++;
} }
thread->stats.FinishedOps(db_, 1); thread->stats.FinishedOps(db_.db, 1);
} }
char msg[100]; char msg[100];
snprintf(msg, sizeof(msg), snprintf(msg, sizeof(msg),
@ -2542,7 +2622,7 @@ class Benchmark {
fprintf(stderr, "put error: %s\n", s.ToString().c_str()); fprintf(stderr, "put error: %s\n", s.ToString().c_str());
exit(1); exit(1);
} }
thread->stats.FinishedOps(db_, 1); thread->stats.FinishedOps(db, 1);
} }
char msg[100]; char msg[100];
@ -2578,7 +2658,7 @@ class Benchmark {
fprintf(stderr, "merge error: %s\n", s.ToString().c_str()); fprintf(stderr, "merge error: %s\n", s.ToString().c_str());
exit(1); exit(1);
} }
thread->stats.FinishedOps(db_, 1); thread->stats.FinishedOps(db, 1);
} }
// Print some statistics // Print some statistics
@ -2639,7 +2719,7 @@ class Benchmark {
} }
thread->stats.FinishedOps(db_, 1); thread->stats.FinishedOps(db, 1);
} }
char msg[100]; char msg[100];
@ -2656,11 +2736,11 @@ class Benchmark {
} }
void PrintStats(const char* key) { void PrintStats(const char* key) {
if (db_ != nullptr) { if (db_.db != nullptr) {
PrintStats(db_, key, false); PrintStats(db_.db, key, false);
} }
for (DB* db : multi_dbs_) { for (const auto& db_with_cfh : multi_dbs_) {
PrintStats(db, key, true); PrintStats(db_with_cfh.db, key, true);
} }
} }

@ -9,7 +9,10 @@
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h> #include <inttypes.h>
#include <algorithm> #include <algorithm>
#include <string> #include <string>
@ -29,9 +32,9 @@ Status DBImpl::DisableFileDeletions() {
MutexLock l(&mutex_); MutexLock l(&mutex_);
++disable_delete_obsolete_files_; ++disable_delete_obsolete_files_;
if (disable_delete_obsolete_files_ == 1) { if (disable_delete_obsolete_files_ == 1) {
Log(options_.info_log, "File Deletions Disabled"); Log(db_options_.info_log, "File Deletions Disabled");
} else { } else {
Log(options_.info_log, Log(db_options_.info_log,
"File Deletions Disabled, but already disabled. Counter: %d", "File Deletions Disabled, but already disabled. Counter: %d",
disable_delete_obsolete_files_); disable_delete_obsolete_files_);
} }
@ -50,11 +53,11 @@ Status DBImpl::EnableFileDeletions(bool force) {
--disable_delete_obsolete_files_; --disable_delete_obsolete_files_;
} }
if (disable_delete_obsolete_files_ == 0) { if (disable_delete_obsolete_files_ == 0) {
Log(options_.info_log, "File Deletions Enabled"); Log(db_options_.info_log, "File Deletions Enabled");
should_purge_files = true; should_purge_files = true;
FindObsoleteFiles(deletion_state, true); FindObsoleteFiles(deletion_state, true);
} else { } else {
Log(options_.info_log, Log(db_options_.info_log,
"File Deletions Enable, but not really enabled. Counter: %d", "File Deletions Enable, but not really enabled. Counter: %d",
disable_delete_obsolete_files_); disable_delete_obsolete_files_);
} }
@ -62,10 +65,14 @@ Status DBImpl::EnableFileDeletions(bool force) {
if (should_purge_files) { if (should_purge_files) {
PurgeObsoleteFiles(deletion_state); PurgeObsoleteFiles(deletion_state);
} }
LogFlush(options_.info_log); LogFlush(db_options_.info_log);
return Status::OK(); return Status::OK();
} }
int DBImpl::IsFileDeletionsEnabled() const {
return disable_delete_obsolete_files_;
}
Status DBImpl::GetLiveFiles(std::vector<std::string>& ret, Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
uint64_t* manifest_file_size, uint64_t* manifest_file_size,
bool flush_memtable) { bool flush_memtable) {
@ -91,7 +98,7 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
if (!status.ok()) { if (!status.ok()) {
mutex_.Unlock(); mutex_.Unlock();
Log(options_.info_log, "Cannot Flush data %s\n", Log(db_options_.info_log, "Cannot Flush data %s\n",
status.ToString().c_str()); status.ToString().c_str());
return status; return status;
} }
@ -129,7 +136,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
Status s; Status s;
// list wal files in main db dir. // list wal files in main db dir.
VectorLogPtr logs; VectorLogPtr logs;
s = GetSortedWalsOfType(options_.wal_dir, logs, kAliveLogFile); s = GetSortedWalsOfType(db_options_.wal_dir, logs, kAliveLogFile);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
@ -142,7 +149,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
files.clear(); files.clear();
// list wal files in archive dir. // list wal files in archive dir.
std::string archivedir = ArchivalDirectory(options_.wal_dir); std::string archivedir = ArchivalDirectory(db_options_.wal_dir);
if (env_->FileExists(archivedir)) { if (env_->FileExists(archivedir)) {
s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile); s = GetSortedWalsOfType(archivedir, files, kArchivedLogFile);
if (!s.ok()) { if (!s.ok()) {
@ -153,7 +160,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
uint64_t latest_archived_log_number = 0; uint64_t latest_archived_log_number = 0;
if (!files.empty()) { if (!files.empty()) {
latest_archived_log_number = files.back()->LogNumber(); latest_archived_log_number = files.back()->LogNumber();
Log(options_.info_log, "Latest Archived log: %" PRIu64, Log(db_options_.info_log, "Latest Archived log: %" PRIu64,
latest_archived_log_number); latest_archived_log_number);
} }
@ -166,7 +173,7 @@ Status DBImpl::GetSortedWalFiles(VectorLogPtr& files) {
// same log in both db dir and archived dir. Simply // same log in both db dir and archived dir. Simply
// ignore the one in db dir. Note that, if we read // ignore the one in db dir. Note that, if we read
// archived dir first, we would have missed the log file. // archived dir first, we would have missed the log file.
Log(options_.info_log, "%s already moved to archive", Log(db_options_.info_log, "%s already moved to archive",
log->PathName().c_str()); log->PathName().c_str());
} }
} }

File diff suppressed because it is too large Load Diff

@ -30,7 +30,11 @@
#include "util/autovector.h" #include "util/autovector.h"
#include "util/stop_watch.h" #include "util/stop_watch.h"
#include "util/thread_local.h" #include "util/thread_local.h"
#include "util/scoped_arena_iterator.h"
#include "db/internal_stats.h" #include "db/internal_stats.h"
#include "db/write_controller.h"
#include "db/flush_scheduler.h"
#include "db/write_thread.h"
namespace rocksdb { namespace rocksdb {
@ -108,6 +112,10 @@ class DBImpl : public DB {
bool reduce_level = false, int target_level = -1, bool reduce_level = false, int target_level = -1,
uint32_t target_path_id = 0); uint32_t target_path_id = 0);
using DB::SetOptions;
bool SetOptions(ColumnFamilyHandle* column_family,
const std::unordered_map<std::string, std::string>& options_map);
using DB::NumberLevels; using DB::NumberLevels;
virtual int NumberLevels(ColumnFamilyHandle* column_family); virtual int NumberLevels(ColumnFamilyHandle* column_family);
using DB::MaxMemCompactionLevel; using DB::MaxMemCompactionLevel;
@ -127,6 +135,7 @@ class DBImpl : public DB {
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
virtual Status DisableFileDeletions(); virtual Status DisableFileDeletions();
virtual Status EnableFileDeletions(bool force); virtual Status EnableFileDeletions(bool force);
virtual int IsFileDeletionsEnabled() const;
// All the returned filenames start with "/" // All the returned filenames start with "/"
virtual Status GetLiveFiles(std::vector<std::string>&, virtual Status GetLiveFiles(std::vector<std::string>&,
uint64_t* manifest_file_size, uint64_t* manifest_file_size,
@ -172,8 +181,8 @@ class DBImpl : public DB {
// Return an internal iterator over the current state of the database. // Return an internal iterator over the current state of the database.
// The keys of this iterator are internal keys (see format.h). // The keys of this iterator are internal keys (see format.h).
// The returned iterator should be deleted when no longer needed. // The returned iterator should be deleted when no longer needed.
Iterator* TEST_NewInternalIterator(ColumnFamilyHandle* column_family = Iterator* TEST_NewInternalIterator(
nullptr); Arena* arena, ColumnFamilyHandle* column_family = nullptr);
// Return the maximum overlapping data (in bytes) at next level for any // Return the maximum overlapping data (in bytes) at next level for any
// file at a level >= 1. // file at a level >= 1.
@ -201,6 +210,17 @@ class DBImpl : public DB {
SequenceNumber* sequence); SequenceNumber* sequence);
Status TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence); Status TEST_ReadFirstLine(const std::string& fname, SequenceNumber* sequence);
void TEST_LockMutex();
void TEST_UnlockMutex();
// REQUIRES: mutex locked
void* TEST_BeginWrite();
// REQUIRES: mutex locked
// pass the pointer that you got from TEST_BeginWrite()
void TEST_EndWrite(void* w);
#endif // NDEBUG #endif // NDEBUG
// Structure to store information for candidate files to delete. // Structure to store information for candidate files to delete.
@ -274,7 +294,7 @@ class DBImpl : public DB {
// Returns the list of live files in 'live' and the list // Returns the list of live files in 'live' and the list
// of all files in the filesystem in 'candidate_files'. // of all files in the filesystem in 'candidate_files'.
// If force == false and the last call was less than // If force == false and the last call was less than
// options_.delete_obsolete_files_period_micros microseconds ago, // db_options_.delete_obsolete_files_period_micros microseconds ago,
// it will not fill up the deletion_state // it will not fill up the deletion_state
void FindObsoleteFiles(DeletionState& deletion_state, void FindObsoleteFiles(DeletionState& deletion_state,
bool force, bool force,
@ -292,23 +312,21 @@ class DBImpl : public DB {
Env* const env_; Env* const env_;
const std::string dbname_; const std::string dbname_;
unique_ptr<VersionSet> versions_; unique_ptr<VersionSet> versions_;
const DBOptions options_; const DBOptions db_options_;
Statistics* stats_; Statistics* stats_;
Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd, Iterator* NewInternalIterator(const ReadOptions&, ColumnFamilyData* cfd,
SuperVersion* super_version, SuperVersion* super_version, Arena* arena);
Arena* arena = nullptr);
private: private:
friend class DB; friend class DB;
friend class InternalStats; friend class InternalStats;
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
friend class TailingIterator;
friend class ForwardIterator; friend class ForwardIterator;
#endif #endif
friend struct SuperVersion; friend struct SuperVersion;
struct CompactionState; struct CompactionState;
struct Writer;
struct WriteContext; struct WriteContext;
Status NewDB(); Status NewDB();
@ -332,8 +350,9 @@ class DBImpl : public DB {
DeletionState& deletion_state, DeletionState& deletion_state,
LogBuffer* log_buffer); LogBuffer* log_buffer);
Status RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, // REQUIRES: log_numbers are sorted in ascending order
bool read_only); Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
SequenceNumber* max_sequence, bool read_only);
// The following two methods are used to flush a memtable to // The following two methods are used to flush a memtable to
// storage. The first one is used atdatabase RecoveryTime (when the // storage. The first one is used atdatabase RecoveryTime (when the
@ -346,43 +365,13 @@ class DBImpl : public DB {
VersionEdit* edit, uint64_t* filenumber, VersionEdit* edit, uint64_t* filenumber,
LogBuffer* log_buffer); LogBuffer* log_buffer);
uint64_t SlowdownAmount(int n, double bottom, double top); void DelayWrite(uint64_t expiration_time);
// Before applying write operation (such as DBImpl::Write, DBImpl::Flush) Status ScheduleFlushes(WriteContext* context);
// thread should grab the mutex_ and be the first on writers queue.
// BeginWrite is used for it.
// Be aware! Writer's job can be done by other thread (see DBImpl::Write
// for examples), so check it via w.done before applying changes.
//
// Writer* w: writer to be placed in the queue
// uint64_t expiration_time: maximum time to be in the queue
// See also: EndWrite
Status BeginWrite(Writer* w, uint64_t expiration_time);
// After doing write job, we need to remove already used writers from
// writers_ queue and notify head of the queue about it.
// EndWrite is used for this.
//
// Writer* w: Writer, that was added by BeginWrite function
// Writer* last_writer: Since we can join a few Writers (as DBImpl::Write
// does)
// we should pass last_writer as a parameter to
// EndWrite
// (if you don't touch other writers, just pass w)
// Status status: Status of write operation
// See also: BeginWrite
void EndWrite(Writer* w, Writer* last_writer, Status status);
Status MakeRoomForWrite(ColumnFamilyData* cfd,
WriteContext* context,
uint64_t expiration_time);
Status SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd, Status SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd,
WriteContext* context); WriteContext* context);
void BuildBatchGroup(Writer** last_writer,
autovector<WriteBatch*>* write_batch_group);
// Force current memtable contents to be flushed. // Force current memtable contents to be flushed.
Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options); Status FlushMemTable(ColumnFamilyData* cfd, const FlushOptions& options);
@ -527,10 +516,13 @@ class DBImpl : public DB {
std::unique_ptr<Directory> db_directory_; std::unique_ptr<Directory> db_directory_;
// Queue of writers. WriteThread write_thread_;
std::deque<Writer*> writers_;
WriteBatch tmp_batch_; WriteBatch tmp_batch_;
WriteController write_controller_;
FlushScheduler flush_scheduler_;
SnapshotList snapshots_; SnapshotList snapshots_;
// cache for ReadFirstRecord() calls // cache for ReadFirstRecord() calls
@ -599,14 +591,10 @@ class DBImpl : public DB {
bool flush_on_destroy_; // Used when disableWAL is true. bool flush_on_destroy_; // Used when disableWAL is true.
static const int KEEP_LOG_FILE_NUM = 1000; static const int KEEP_LOG_FILE_NUM = 1000;
static const uint64_t kNoTimeOut = std::numeric_limits<uint64_t>::max();
std::string db_absolute_path_; std::string db_absolute_path_;
// count of the number of contiguous delaying writes
int delayed_writes_;
// The options to access storage files // The options to access storage files
const EnvOptions storage_options_; const EnvOptions env_options_;
// A value of true temporarily disables scheduling of background work // A value of true temporarily disables scheduling of background work
bool bg_work_gate_closed_; bool bg_work_gate_closed_;
@ -621,9 +609,6 @@ class DBImpl : public DB {
DBImpl(const DBImpl&); DBImpl(const DBImpl&);
void operator=(const DBImpl&); void operator=(const DBImpl&);
// dump the delayed_writes_ to the log file and reset counter.
void DelayLoggingAndReset();
// Return the earliest snapshot where seqno is visible. // Return the earliest snapshot where seqno is visible.
// Store the snapshot right before that, if any, in prev_snapshot // Store the snapshot right before that, if any, in prev_snapshot
inline SequenceNumber findEarliestVisibleSnapshot( inline SequenceNumber findEarliestVisibleSnapshot(
@ -669,7 +654,6 @@ class DBImpl : public DB {
// it is not equal to src.info_log. // it is not equal to src.info_log.
extern Options SanitizeOptions(const std::string& db, extern Options SanitizeOptions(const std::string& db,
const InternalKeyComparator* icmp, const InternalKeyComparator* icmp,
const InternalFilterPolicy* ipolicy,
const Options& src); const Options& src);
extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src); extern DBOptions SanitizeOptions(const std::string& db, const DBOptions& src);

@ -20,7 +20,8 @@ uint64_t DBImpl::TEST_GetLevel0TotalSize() {
return default_cf_handle_->cfd()->current()->NumLevelBytes(0); return default_cf_handle_->cfd()->current()->NumLevelBytes(0);
} }
Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) { Iterator* DBImpl::TEST_NewInternalIterator(Arena* arena,
ColumnFamilyHandle* column_family) {
ColumnFamilyData* cfd; ColumnFamilyData* cfd;
if (column_family == nullptr) { if (column_family == nullptr) {
cfd = default_cf_handle_->cfd(); cfd = default_cf_handle_->cfd();
@ -33,7 +34,7 @@ Iterator* DBImpl::TEST_NewInternalIterator(ColumnFamilyHandle* column_family) {
SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
mutex_.Unlock(); mutex_.Unlock();
ReadOptions roptions; ReadOptions roptions;
return NewInternalIterator(roptions, cfd, super_version); return NewInternalIterator(roptions, cfd, super_version, arena);
} }
int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes( int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes(
@ -129,5 +130,27 @@ Status DBImpl::TEST_ReadFirstLine(const std::string& fname,
SequenceNumber* sequence) { SequenceNumber* sequence) {
return ReadFirstLine(fname, sequence); return ReadFirstLine(fname, sequence);
} }
void DBImpl::TEST_LockMutex() {
mutex_.Lock();
}
void DBImpl::TEST_UnlockMutex() {
mutex_.Unlock();
}
void* DBImpl::TEST_BeginWrite() {
auto w = new WriteThread::Writer(&mutex_);
Status s = write_thread_.EnterWriteThread(w, 0);
assert(s.ok() && !w->done); // No timeout and nobody should do our job
return reinterpret_cast<void*>(w);
}
void DBImpl::TEST_EndWrite(void* w) {
auto writer = reinterpret_cast<WriteThread::Writer*>(w);
write_thread_.ExitWriteThread(writer, writer, Status::OK());
delete writer;
}
} // namespace rocksdb } // namespace rocksdb
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE

@ -16,7 +16,6 @@
#include <stdint.h> #include <stdint.h>
#include <stdio.h> #include <stdio.h>
#include <vector> #include <vector>
#include <algorithm>
#include "db/db_iter.h" #include "db/db_iter.h"
#include "db/dbformat.h" #include "db/dbformat.h"
#include "db/filename.h" #include "db/filename.h"
@ -42,17 +41,17 @@
namespace rocksdb { namespace rocksdb {
DBImplReadOnly::DBImplReadOnly(const DBOptions& options, DBImplReadOnly::DBImplReadOnly(const DBOptions& db_options,
const std::string& dbname) const std::string& dbname)
: DBImpl(options, dbname) { : DBImpl(db_options, dbname) {
Log(options_.info_log, "Opening the db in read only mode"); Log(db_options_.info_log, "Opening the db in read only mode");
} }
DBImplReadOnly::~DBImplReadOnly() { DBImplReadOnly::~DBImplReadOnly() {
} }
// Implementations of the DB interface // Implementations of the DB interface
Status DBImplReadOnly::Get(const ReadOptions& options, Status DBImplReadOnly::Get(const ReadOptions& read_options,
ColumnFamilyHandle* column_family, const Slice& key, ColumnFamilyHandle* column_family, const Slice& key,
std::string* value) { std::string* value) {
Status s; Status s;
@ -62,33 +61,34 @@ Status DBImplReadOnly::Get(const ReadOptions& options,
SuperVersion* super_version = cfd->GetSuperVersion(); SuperVersion* super_version = cfd->GetSuperVersion();
MergeContext merge_context; MergeContext merge_context;
LookupKey lkey(key, snapshot); LookupKey lkey(key, snapshot);
if (super_version->mem->Get(lkey, value, &s, merge_context, if (super_version->mem->Get(lkey, value, &s, &merge_context)) {
*cfd->options())) {
} else { } else {
super_version->current->Get(options, lkey, value, &s, &merge_context); super_version->current->Get(read_options, lkey, value, &s, &merge_context);
} }
return s; return s;
} }
Iterator* DBImplReadOnly::NewIterator(const ReadOptions& options, Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options,
ColumnFamilyHandle* column_family) { ColumnFamilyHandle* column_family) {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family); auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
auto cfd = cfh->cfd(); auto cfd = cfh->cfd();
SuperVersion* super_version = cfd->GetSuperVersion()->Ref(); SuperVersion* super_version = cfd->GetSuperVersion()->Ref();
SequenceNumber latest_snapshot = versions_->LastSequence(); SequenceNumber latest_snapshot = versions_->LastSequence();
auto db_iter = NewArenaWrappedDbIterator( auto db_iter = NewArenaWrappedDbIterator(
env_, *cfd->options(), cfd->user_comparator(), env_, *cfd->ioptions(), cfd->user_comparator(),
(options.snapshot != nullptr (read_options.snapshot != nullptr
? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_ ? reinterpret_cast<const SnapshotImpl*>(
: latest_snapshot)); read_options.snapshot)->number_
auto internal_iter = : latest_snapshot),
NewInternalIterator(options, cfd, super_version, db_iter->GetArena()); cfd->options()->max_sequential_skip_in_iterations);
auto internal_iter = NewInternalIterator(
read_options, cfd, super_version, db_iter->GetArena());
db_iter->SetIterUnderDBIter(internal_iter); db_iter->SetIterUnderDBIter(internal_iter);
return db_iter; return db_iter;
} }
Status DBImplReadOnly::NewIterators( Status DBImplReadOnly::NewIterators(
const ReadOptions& options, const ReadOptions& read_options,
const std::vector<ColumnFamilyHandle*>& column_families, const std::vector<ColumnFamilyHandle*>& column_families,
std::vector<Iterator*>* iterators) { std::vector<Iterator*>* iterators) {
if (iterators == nullptr) { if (iterators == nullptr) {
@ -101,12 +101,14 @@ Status DBImplReadOnly::NewIterators(
for (auto cfh : column_families) { for (auto cfh : column_families) {
auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd(); auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
auto db_iter = NewArenaWrappedDbIterator( auto db_iter = NewArenaWrappedDbIterator(
env_, *cfd->options(), cfd->user_comparator(), env_, *cfd->ioptions(), cfd->user_comparator(),
options.snapshot != nullptr (read_options.snapshot != nullptr
? reinterpret_cast<const SnapshotImpl*>(options.snapshot)->number_ ? reinterpret_cast<const SnapshotImpl*>(
: latest_snapshot); read_options.snapshot)->number_
: latest_snapshot),
cfd->options()->max_sequential_skip_in_iterations);
auto internal_iter = NewInternalIterator( auto internal_iter = NewInternalIterator(
options, cfd, cfd->GetSuperVersion()->Ref(), db_iter->GetArena()); read_options, cfd, cfd->GetSuperVersion()->Ref(), db_iter->GetArena());
db_iter->SetIterUnderDBIter(internal_iter); db_iter->SetIterUnderDBIter(internal_iter);
iterators->push_back(db_iter); iterators->push_back(db_iter);
} }

@ -74,6 +74,8 @@ class DBImplReadOnly : public DBImpl {
uint32_t target_path_id = 0) override { uint32_t target_path_id = 0) override {
return Status::NotSupported("Not supported operation in read only mode."); return Status::NotSupported("Not supported operation in read only mode.");
} }
#ifndef ROCKSDB_LITE
virtual Status DisableFileDeletions() override { virtual Status DisableFileDeletions() override {
return Status::NotSupported("Not supported operation in read only mode."); return Status::NotSupported("Not supported operation in read only mode.");
} }
@ -85,6 +87,8 @@ class DBImplReadOnly : public DBImpl {
bool flush_memtable = true) override { bool flush_memtable = true) override {
return Status::NotSupported("Not supported operation in read only mode."); return Status::NotSupported("Not supported operation in read only mode.");
} }
#endif // ROCKSDB_LITE
using DBImpl::Flush; using DBImpl::Flush;
virtual Status Flush(const FlushOptions& options, virtual Status Flush(const FlushOptions& options,
ColumnFamilyHandle* column_family) override { ColumnFamilyHandle* column_family) override {

@ -58,22 +58,25 @@ class DBIter: public Iterator {
kReverse kReverse
}; };
DBIter(Env* env, const Options& options, const Comparator* cmp, DBIter(Env* env, const ImmutableCFOptions& ioptions,
Iterator* iter, SequenceNumber s, bool arena_mode) const Comparator* cmp, Iterator* iter, SequenceNumber s,
bool arena_mode, uint64_t max_sequential_skip_in_iterations,
const Slice* iterate_upper_bound = nullptr)
: arena_mode_(arena_mode), : arena_mode_(arena_mode),
env_(env), env_(env),
logger_(options.info_log.get()), logger_(ioptions.info_log),
user_comparator_(cmp), user_comparator_(cmp),
user_merge_operator_(options.merge_operator.get()), user_merge_operator_(ioptions.merge_operator),
iter_(iter), iter_(iter),
sequence_(s), sequence_(s),
direction_(kForward), direction_(kForward),
valid_(false), valid_(false),
current_entry_is_merged_(false), current_entry_is_merged_(false),
statistics_(options.statistics.get()) { statistics_(ioptions.statistics),
iterate_upper_bound_(iterate_upper_bound) {
RecordTick(statistics_, NO_ITERATORS); RecordTick(statistics_, NO_ITERATORS);
has_prefix_extractor_ = (options.prefix_extractor.get() != nullptr); prefix_extractor_ = ioptions.prefix_extractor;
max_skip_ = options.max_sequential_skip_in_iterations; max_skip_ = max_sequential_skip_in_iterations;
} }
virtual ~DBIter() { virtual ~DBIter() {
RecordTick(statistics_, NO_ITERATORS, -1); RecordTick(statistics_, NO_ITERATORS, -1);
@ -132,7 +135,7 @@ class DBIter: public Iterator {
} }
} }
bool has_prefix_extractor_; const SliceTransform* prefix_extractor_;
bool arena_mode_; bool arena_mode_;
Env* const env_; Env* const env_;
Logger* logger_; Logger* logger_;
@ -149,6 +152,7 @@ class DBIter: public Iterator {
bool current_entry_is_merged_; bool current_entry_is_merged_;
Statistics* statistics_; Statistics* statistics_;
uint64_t max_skip_; uint64_t max_skip_;
const Slice* iterate_upper_bound_;
// No copying allowed // No copying allowed
DBIter(const DBIter&); DBIter(const DBIter&);
@ -194,9 +198,8 @@ void DBIter::Next() {
// NOTE: In between, saved_key_ can point to a user key that has // NOTE: In between, saved_key_ can point to a user key that has
// a delete marker // a delete marker
inline void DBIter::FindNextUserEntry(bool skipping) { inline void DBIter::FindNextUserEntry(bool skipping) {
PERF_TIMER_AUTO(find_next_user_entry_time); PERF_TIMER_GUARD(find_next_user_entry_time);
FindNextUserEntryInternal(skipping); FindNextUserEntryInternal(skipping);
PERF_TIMER_STOP(find_next_user_entry_time);
} }
// Actual implementation of DBIter::FindNextUserEntry() // Actual implementation of DBIter::FindNextUserEntry()
@ -208,7 +211,14 @@ void DBIter::FindNextUserEntryInternal(bool skipping) {
uint64_t num_skipped = 0; uint64_t num_skipped = 0;
do { do {
ParsedInternalKey ikey; ParsedInternalKey ikey;
if (ParseKey(&ikey) && ikey.sequence <= sequence_) {
if (ParseKey(&ikey)) {
if (iterate_upper_bound_ != nullptr &&
ikey.user_key.compare(*iterate_upper_bound_) >= 0) {
break;
}
if (ikey.sequence <= sequence_) {
if (skipping && if (skipping &&
user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) { user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) <= 0) {
num_skipped++; // skip this entry num_skipped++; // skip this entry
@ -241,6 +251,7 @@ void DBIter::FindNextUserEntryInternal(bool skipping) {
} }
} }
} }
}
// If we have sequentially iterated via numerous keys and still not // If we have sequentially iterated via numerous keys and still not
// found the next user-key, then it is better to seek so that we can // found the next user-key, then it is better to seek so that we can
// avoid too many key comparisons. We seek to the last occurence of // avoid too many key comparisons. We seek to the last occurence of
@ -399,6 +410,7 @@ bool DBIter::FindValueForCurrentKey() {
case kTypeDeletion: case kTypeDeletion:
operands.clear(); operands.clear();
last_not_merge_type = kTypeDeletion; last_not_merge_type = kTypeDeletion;
PERF_COUNTER_ADD(internal_delete_skipped_count, 1);
break; break;
case kTypeMerge: case kTypeMerge:
assert(user_merge_operator_ != nullptr); assert(user_merge_operator_ != nullptr);
@ -408,6 +420,7 @@ bool DBIter::FindValueForCurrentKey() {
assert(false); assert(false);
} }
PERF_COUNTER_ADD(internal_key_skipped_count, 1);
assert(user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0); assert(user_comparator_->Compare(ikey.user_key, saved_key_.GetKey()) == 0);
iter_->Prev(); iter_->Prev();
++num_skipped; ++num_skipped;
@ -554,12 +567,29 @@ void DBIter::FindParseableKey(ParsedInternalKey* ikey, Direction direction) {
void DBIter::Seek(const Slice& target) { void DBIter::Seek(const Slice& target) {
StopWatch sw(env_, statistics_, DB_SEEK); StopWatch sw(env_, statistics_, DB_SEEK);
// total ordering is not guaranteed if prefix_extractor is set
// hence prefix based seeks will not give correct results
if (iterate_upper_bound_ != nullptr && prefix_extractor_ != nullptr) {
if (!prefix_extractor_->InDomain(*iterate_upper_bound_) ||
!prefix_extractor_->InDomain(target) ||
prefix_extractor_->Transform(*iterate_upper_bound_).compare(
prefix_extractor_->Transform(target)) != 0) {
status_ = Status::InvalidArgument("read_options.iterate_*_bound "
" and seek target need to have the same prefix.");
valid_ = false;
return;
}
}
saved_key_.Clear(); saved_key_.Clear();
// now savved_key is used to store internal key. // now savved_key is used to store internal key.
saved_key_.SetInternalKey(target, sequence_); saved_key_.SetInternalKey(target, sequence_);
PERF_TIMER_AUTO(seek_internal_seek_time);
{
PERF_TIMER_GUARD(seek_internal_seek_time);
iter_->Seek(saved_key_.GetKey()); iter_->Seek(saved_key_.GetKey());
PERF_TIMER_STOP(seek_internal_seek_time); }
if (iter_->Valid()) { if (iter_->Valid()) {
direction_ = kForward; direction_ = kForward;
ClearSavedValue(); ClearSavedValue();
@ -572,14 +602,17 @@ void DBIter::Seek(const Slice& target) {
void DBIter::SeekToFirst() { void DBIter::SeekToFirst() {
// Don't use iter_::Seek() if we set a prefix extractor // Don't use iter_::Seek() if we set a prefix extractor
// because prefix seek wiil be used. // because prefix seek wiil be used.
if (has_prefix_extractor_) { if (prefix_extractor_ != nullptr) {
max_skip_ = std::numeric_limits<uint64_t>::max(); max_skip_ = std::numeric_limits<uint64_t>::max();
} }
direction_ = kForward; direction_ = kForward;
ClearSavedValue(); ClearSavedValue();
PERF_TIMER_AUTO(seek_internal_seek_time);
{
PERF_TIMER_GUARD(seek_internal_seek_time);
iter_->SeekToFirst(); iter_->SeekToFirst();
PERF_TIMER_STOP(seek_internal_seek_time); }
if (iter_->Valid()) { if (iter_->Valid()) {
FindNextUserEntry(false /* not skipping */); FindNextUserEntry(false /* not skipping */);
} else { } else {
@ -590,24 +623,29 @@ void DBIter::SeekToFirst() {
void DBIter::SeekToLast() { void DBIter::SeekToLast() {
// Don't use iter_::Seek() if we set a prefix extractor // Don't use iter_::Seek() if we set a prefix extractor
// because prefix seek wiil be used. // because prefix seek wiil be used.
if (has_prefix_extractor_) { if (prefix_extractor_ != nullptr) {
max_skip_ = std::numeric_limits<uint64_t>::max(); max_skip_ = std::numeric_limits<uint64_t>::max();
} }
direction_ = kReverse; direction_ = kReverse;
ClearSavedValue(); ClearSavedValue();
PERF_TIMER_AUTO(seek_internal_seek_time);
{
PERF_TIMER_GUARD(seek_internal_seek_time);
iter_->SeekToLast(); iter_->SeekToLast();
PERF_TIMER_STOP(seek_internal_seek_time); }
PrevInternal(); PrevInternal();
} }
Iterator* NewDBIterator(Env* env, const Options& options, Iterator* NewDBIterator(Env* env, const ImmutableCFOptions& ioptions,
const Comparator* user_key_comparator, const Comparator* user_key_comparator,
Iterator* internal_iter, Iterator* internal_iter,
const SequenceNumber& sequence) { const SequenceNumber& sequence,
return new DBIter(env, options, user_key_comparator, internal_iter, sequence, uint64_t max_sequential_skip_in_iterations,
false); const Slice* iterate_upper_bound) {
return new DBIter(env, ioptions, user_key_comparator, internal_iter, sequence,
false, max_sequential_skip_in_iterations,
iterate_upper_bound);
} }
ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); } ArenaWrappedDBIter::~ArenaWrappedDBIter() { db_iter_->~DBIter(); }
@ -635,14 +673,20 @@ void ArenaWrappedDBIter::RegisterCleanup(CleanupFunction function, void* arg1,
} }
ArenaWrappedDBIter* NewArenaWrappedDbIterator( ArenaWrappedDBIter* NewArenaWrappedDbIterator(
Env* env, const Options& options, const Comparator* user_key_comparator, Env* env, const ImmutableCFOptions& ioptions,
const SequenceNumber& sequence) { const Comparator* user_key_comparator,
const SequenceNumber& sequence,
uint64_t max_sequential_skip_in_iterations,
const Slice* iterate_upper_bound) {
ArenaWrappedDBIter* iter = new ArenaWrappedDBIter(); ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
Arena* arena = iter->GetArena(); Arena* arena = iter->GetArena();
auto mem = arena->AllocateAligned(sizeof(DBIter)); auto mem = arena->AllocateAligned(sizeof(DBIter));
DBIter* db_iter = new (mem) DBIter* db_iter = new (mem) DBIter(env, ioptions, user_key_comparator,
DBIter(env, options, user_key_comparator, nullptr, sequence, true); nullptr, sequence, true, max_sequential_skip_in_iterations,
iterate_upper_bound);
iter->SetDBIter(db_iter); iter->SetDBIter(db_iter);
return iter; return iter;
} }

@ -24,10 +24,12 @@ class DBIter;
// into appropriate user keys. // into appropriate user keys.
extern Iterator* NewDBIterator( extern Iterator* NewDBIterator(
Env* env, Env* env,
const Options& options, const ImmutableCFOptions& options,
const Comparator *user_key_comparator, const Comparator *user_key_comparator,
Iterator* internal_iter, Iterator* internal_iter,
const SequenceNumber& sequence); const SequenceNumber& sequence,
uint64_t max_sequential_skip_in_iterations,
const Slice* iterate_upper_bound = nullptr);
// A wrapper iterator which wraps DB Iterator and the arena, with which the DB // A wrapper iterator which wraps DB Iterator and the arena, with which the DB
// iterator is supposed be allocated. This class is used as an entry point of // iterator is supposed be allocated. This class is used as an entry point of
@ -67,7 +69,9 @@ class ArenaWrappedDBIter : public Iterator {
// Generate the arena wrapped iterator class. // Generate the arena wrapped iterator class.
extern ArenaWrappedDBIter* NewArenaWrappedDbIterator( extern ArenaWrappedDBIter* NewArenaWrappedDbIterator(
Env* env, const Options& options, const Comparator* user_key_comparator, Env* env, const ImmutableCFOptions& options,
const SequenceNumber& sequence); const Comparator* user_key_comparator,
const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
const Slice* iterate_upper_bound = nullptr);
} // namespace rocksdb } // namespace rocksdb

@ -158,7 +158,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 10,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
@ -191,7 +193,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 10,
options.max_sequential_skip_in_iterations));
db_iter->SeekToFirst(); db_iter->SeekToFirst();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
@ -232,7 +236,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 2,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "b"); ASSERT_EQ(db_iter->key().ToString(), "b");
@ -262,7 +268,9 @@ TEST(DBIteratorTest, DBIteratorPrevNext) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 10,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "c"); ASSERT_EQ(db_iter->key().ToString(), "c");
@ -288,7 +296,9 @@ TEST(DBIteratorTest, DBIteratorEmpty) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 0,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(!db_iter->Valid()); ASSERT_TRUE(!db_iter->Valid());
} }
@ -298,7 +308,9 @@ TEST(DBIteratorTest, DBIteratorEmpty) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 0,
options.max_sequential_skip_in_iterations));
db_iter->SeekToFirst(); db_iter->SeekToFirst();
ASSERT_TRUE(!db_iter->Valid()); ASSERT_TRUE(!db_iter->Valid());
} }
@ -318,7 +330,9 @@ TEST(DBIteratorTest, DBIteratorUseSkipCountSkips) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 2,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "c"); ASSERT_EQ(db_iter->key().ToString(), "c");
@ -357,7 +371,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {
options.statistics = rocksdb::CreateDBStatistics(); options.statistics = rocksdb::CreateDBStatistics();
std::unique_ptr<Iterator> db_iter(NewDBIterator( std::unique_ptr<Iterator> db_iter(NewDBIterator(
env_, options, BytewiseComparator(), internal_iter, i + 2)); env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, i + 2,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
@ -391,7 +407,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter(NewDBIterator( std::unique_ptr<Iterator> db_iter(NewDBIterator(
env_, options, BytewiseComparator(), internal_iter, i + 2)); env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, i + 2,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
@ -418,7 +436,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter(NewDBIterator( std::unique_ptr<Iterator> db_iter(NewDBIterator(
env_, options, BytewiseComparator(), internal_iter, 202)); env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 202,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
@ -449,7 +469,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {
internal_iter->AddPut("c", "200"); internal_iter->AddPut("c", "200");
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, i)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, i,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(!db_iter->Valid()); ASSERT_TRUE(!db_iter->Valid());
@ -464,7 +486,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {
internal_iter->AddPut("c", "200"); internal_iter->AddPut("c", "200");
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 200)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 200,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "c"); ASSERT_EQ(db_iter->key().ToString(), "c");
@ -497,7 +521,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter(NewDBIterator( std::unique_ptr<Iterator> db_iter(NewDBIterator(
env_, options, BytewiseComparator(), internal_iter, i + 2)); env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, i + 2,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
@ -530,7 +556,9 @@ TEST(DBIteratorTest, DBIteratorUseSkip) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter(NewDBIterator( std::unique_ptr<Iterator> db_iter(NewDBIterator(
env_, options, BytewiseComparator(), internal_iter, i + 2)); env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, i + 2,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
@ -570,7 +598,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 1)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 1,
options.max_sequential_skip_in_iterations));
db_iter->SeekToFirst(); db_iter->SeekToFirst();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "a"); ASSERT_EQ(db_iter->key().ToString(), "a");
@ -590,7 +620,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 0,
options.max_sequential_skip_in_iterations));
db_iter->SeekToFirst(); db_iter->SeekToFirst();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "a"); ASSERT_EQ(db_iter->key().ToString(), "a");
@ -609,7 +641,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 2,
options.max_sequential_skip_in_iterations));
db_iter->SeekToFirst(); db_iter->SeekToFirst();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "a"); ASSERT_EQ(db_iter->key().ToString(), "a");
@ -628,7 +662,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 4,
options.max_sequential_skip_in_iterations));
db_iter->SeekToFirst(); db_iter->SeekToFirst();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "a"); ASSERT_EQ(db_iter->key().ToString(), "a");
@ -654,7 +690,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 0,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "a"); ASSERT_EQ(db_iter->key().ToString(), "a");
@ -675,7 +713,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 1)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 1,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "a"); ASSERT_EQ(db_iter->key().ToString(), "a");
@ -696,7 +736,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 2,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "a"); ASSERT_EQ(db_iter->key().ToString(), "a");
@ -717,7 +759,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 3)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 3,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "a"); ASSERT_EQ(db_iter->key().ToString(), "a");
@ -738,7 +782,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 4,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "a"); ASSERT_EQ(db_iter->key().ToString(), "a");
@ -759,7 +805,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 5)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 5,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "a"); ASSERT_EQ(db_iter->key().ToString(), "a");
@ -780,7 +828,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 6)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 6,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "a"); ASSERT_EQ(db_iter->key().ToString(), "a");
@ -803,7 +853,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 0,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "a"); ASSERT_EQ(db_iter->key().ToString(), "a");
@ -824,7 +876,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 1)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 1,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "a"); ASSERT_EQ(db_iter->key().ToString(), "a");
@ -845,7 +899,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 2,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "a"); ASSERT_EQ(db_iter->key().ToString(), "a");
@ -866,7 +922,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 3)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 3,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(!db_iter->Valid()); ASSERT_TRUE(!db_iter->Valid());
} }
@ -883,7 +941,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 4,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "a"); ASSERT_EQ(db_iter->key().ToString(), "a");
@ -904,7 +964,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 5)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 5,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "a"); ASSERT_EQ(db_iter->key().ToString(), "a");
@ -925,7 +987,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 6)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 6,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "a"); ASSERT_EQ(db_iter->key().ToString(), "a");
@ -960,7 +1024,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 0)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 0,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "a"); ASSERT_EQ(db_iter->key().ToString(), "a");
@ -993,7 +1059,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 2)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 2,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
@ -1032,7 +1100,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 4)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 4,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
@ -1071,7 +1141,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 5)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 5,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
@ -1115,7 +1187,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 6)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 6,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
@ -1160,7 +1234,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 7)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 7,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
@ -1199,7 +1275,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 9)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 9,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
@ -1244,7 +1322,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter(NewDBIterator( std::unique_ptr<Iterator> db_iter(NewDBIterator(
env_, options, BytewiseComparator(), internal_iter, 13)); env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 13,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
@ -1290,7 +1370,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter(NewDBIterator( std::unique_ptr<Iterator> db_iter(NewDBIterator(
env_, options, BytewiseComparator(), internal_iter, 14)); env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 14,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
@ -1316,7 +1398,9 @@ TEST(DBIteratorTest, DBIterator) {
internal_iter->Finish(); internal_iter->Finish();
std::unique_ptr<Iterator> db_iter( std::unique_ptr<Iterator> db_iter(
NewDBIterator(env_, options, BytewiseComparator(), internal_iter, 10)); NewDBIterator(env_, ImmutableCFOptions(options),
BytewiseComparator(), internal_iter, 10,
options.max_sequential_skip_in_iterations));
db_iter->SeekToLast(); db_iter->SeekToLast();
ASSERT_TRUE(db_iter->Valid()); ASSERT_TRUE(db_iter->Valid());
ASSERT_EQ(db_iter->key().ToString(), "b"); ASSERT_EQ(db_iter->key().ToString(), "b");

File diff suppressed because it is too large Load Diff

@ -127,26 +127,6 @@ void InternalKeyComparator::FindShortSuccessor(std::string* key) const {
} }
} }
const char* InternalFilterPolicy::Name() const {
return user_policy_->Name();
}
void InternalFilterPolicy::CreateFilter(const Slice* keys, int n,
std::string* dst) const {
// We rely on the fact that the code in table.cc does not mind us
// adjusting keys[].
Slice* mkey = const_cast<Slice*>(keys);
for (int i = 0; i < n; i++) {
mkey[i] = ExtractUserKey(keys[i]);
// TODO(sanjay): Suppress dups?
}
user_policy_->CreateFilter(keys, n, dst);
}
bool InternalFilterPolicy::KeyMayMatch(const Slice& key, const Slice& f) const {
return user_policy_->KeyMayMatch(ExtractUserKey(key), f);
}
LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) { LookupKey::LookupKey(const Slice& user_key, SequenceNumber s) {
size_t usize = user_key.size(); size_t usize = user_key.size();
size_t needed = usize + 13; // A conservative estimate size_t needed = usize + 13; // A conservative estimate

@ -124,17 +124,6 @@ class InternalKeyComparator : public Comparator {
int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const; int Compare(const ParsedInternalKey& a, const ParsedInternalKey& b) const;
}; };
// Filter policy wrapper that converts from internal keys to user keys
class InternalFilterPolicy : public FilterPolicy {
private:
const FilterPolicy* const user_policy_;
public:
explicit InternalFilterPolicy(const FilterPolicy* p) : user_policy_(p) { }
virtual const char* Name() const;
virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const;
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const;
};
// Modules in this directory should keep internal keys wrapped inside // Modules in this directory should keep internal keys wrapped inside
// the following class instead of plain strings so that we do not // the following class instead of plain strings so that we do not
// incorrectly use string comparisons instead of an InternalKeyComparator. // incorrectly use string comparisons instead of an InternalKeyComparator.
@ -255,7 +244,7 @@ class IterKey {
Slice GetKey() const { return Slice(key_, key_size_); } Slice GetKey() const { return Slice(key_, key_size_); }
const size_t Size() { return key_size_; } size_t Size() { return key_size_; }
void Clear() { key_size_ = 0; } void Clear() { key_size_ = 0; }
@ -401,4 +390,12 @@ class InternalKeySliceTransform : public SliceTransform {
const SliceTransform* const transform_; const SliceTransform* const transform_;
}; };
// Reads one record from the piece of a write batch held in *input.
// tag, column_family, key, value and blob are output parameters. Callers own
// the Slices they point to.
// The tag is a ValueType.
// *input is advanced to just past the record.
extern Status ReadRecordFromWriteBatch(Slice* input, char* tag,
uint32_t* column_family, Slice* key,
Slice* value, Slice* blob);
} // namespace rocksdb } // namespace rocksdb

@ -34,6 +34,7 @@ class DeleteFileTest {
DeleteFileTest() { DeleteFileTest() {
db_ = nullptr; db_ = nullptr;
env_ = Env::Default(); env_ = Env::Default();
options_.max_background_flushes = 0;
options_.write_buffer_size = 1024*1024*1000; options_.write_buffer_size = 1024*1024*1000;
options_.target_file_size_base = 1024*1024*1000; options_.target_file_size_base = 1024*1024*1000;
options_.max_bytes_for_level_base = 1024*1024*1000; options_.max_bytes_for_level_base = 1024*1024*1000;

@ -6,7 +6,10 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be // Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS
#endif
#include "db/filename.h" #include "db/filename.h"
#include <inttypes.h> #include <inttypes.h>

@ -0,0 +1,62 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include "db/flush_scheduler.h"
#include <cassert>
#include "db/column_family.h"
namespace rocksdb {
// Appends `cfd` to the back of the pending-flush queue and takes a reference
// on it, so the queue entry keeps the column family alive.
void FlushScheduler::ScheduleFlush(ColumnFamilyData* cfd) {
#ifndef NDEBUG
  // Debug-only bookkeeping: a column family must not be queued twice.
  assert(column_families_set_.count(cfd) == 0);
  column_families_set_.insert(cfd);
#endif  // NDEBUG
  cfd->Ref();
  column_families_.emplace_back(cfd);
}
// Pops queued column families in FIFO order and returns the first one that
// has not been dropped. The returned column family still carries the
// reference taken by ScheduleFlush(); the caller must Unref() it.
//
// Dropped column families found on the way are discarded: their scheduler
// reference is released, and the object is deleted when Unref() reports that
// it should be.
//
// Returns nullptr when the queue holds no live column family. A dropped entry
// is never returned, because its reference has already been released here and
// the object may already be gone.
ColumnFamilyData* FlushScheduler::GetNextColumnFamily() {
  while (!column_families_.empty()) {
    ColumnFamilyData* cfd = column_families_.front();
    column_families_.pop_front();
#ifndef NDEBUG
    // Every popped entry leaves the debug set, including dropped ones, and
    // this happens before a possible delete so the set never keeps a stale
    // pointer.
    auto itr = column_families_set_.find(cfd);
    assert(itr != column_families_set_.end());
    column_families_set_.erase(itr);
#endif  // NDEBUG
    if (!cfd->IsDropped()) {
      return cfd;
    }
    // Dropped: give back the reference taken in ScheduleFlush().
    if (cfd->Unref()) {
      delete cfd;
    }
  }
  return nullptr;
}
// Returns true when no column family is currently queued for flushing.
bool FlushScheduler::Empty() { return column_families_.empty(); }
void FlushScheduler::Clear() {
for (auto cfd : column_families_) {
#ifndef NDEBUG
auto itr = column_families_set_.find(cfd);
assert(itr != column_families_set_.end());
column_families_set_.erase(itr);
#endif // NDEBUG
if (cfd->Unref()) {
delete cfd;
}
}
column_families_.clear();
}
} // namespace rocksdb

@ -0,0 +1,39 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#include <stdint.h>
#include <deque>
#include <set>
#include <vector>
namespace rocksdb {
class ColumnFamilyData;
// FIFO queue of column families that are waiting to be flushed.
//
// This class is thread-compatible. It should only be accessed from a single
// write thread (between BeginWrite() and EndWrite()).
class FlushScheduler {
public:
FlushScheduler() = default;
~FlushScheduler() = default;
// Appends cfd to the queue and takes a reference (Ref()) on it.
void ScheduleFlush(ColumnFamilyData* cfd);
// Returns Ref()-ed column family. Client needs to Unref()
ColumnFamilyData* GetNextColumnFamily();
// Returns true when the queue is empty.
bool Empty();
// Releases the reference held on every queued column family and empties the
// queue.
void Clear();
private:
// Pending column families, oldest first.
std::deque<ColumnFamilyData*> column_families_;
#ifndef NDEBUG
// Debug-only record of queued column families, used by assertions that
// check the queue's bookkeeping.
std::set<ColumnFamilyData*> column_families_set_;
#endif // NDEBUG
};
} // namespace rocksdb

@ -6,9 +6,10 @@
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
#include "db/forward_iterator.h" #include "db/forward_iterator.h"
#include <limits>
#include <string> #include <string>
#include <utility> #include <utility>
#include <limits>
#include "db/db_impl.h" #include "db/db_impl.h"
#include "db/db_iter.h" #include "db/db_iter.h"
#include "db/column_family.h" #include "db/column_family.h"
@ -37,12 +38,16 @@ class LevelIterator : public Iterator {
assert(file_index < files_.size()); assert(file_index < files_.size());
if (file_index != file_index_) { if (file_index != file_index_) {
file_index_ = file_index; file_index_ = file_index;
Reset();
}
valid_ = false;
}
void Reset() {
assert(file_index_ < files_.size());
file_iter_.reset(cfd_->table_cache()->NewIterator( file_iter_.reset(cfd_->table_cache()->NewIterator(
read_options_, *(cfd_->soptions()), cfd_->internal_comparator(), read_options_, *(cfd_->soptions()), cfd_->internal_comparator(),
files_[file_index_]->fd, nullptr /* table_reader_ptr */, false)); files_[file_index_]->fd, nullptr /* table_reader_ptr */, false));
} }
valid_ = false;
}
void SeekToLast() override { void SeekToLast() override {
status_ = Status::NotSupported("LevelIterator::SeekToLast()"); status_ = Status::NotSupported("LevelIterator::SeekToLast()");
valid_ = false; valid_ = false;
@ -63,12 +68,15 @@ class LevelIterator : public Iterator {
assert(file_iter_ != nullptr); assert(file_iter_ != nullptr);
file_iter_->Seek(internal_key); file_iter_->Seek(internal_key);
valid_ = file_iter_->Valid(); valid_ = file_iter_->Valid();
assert(valid_);
} }
void Next() override { void Next() override {
assert(valid_); assert(valid_);
file_iter_->Next(); file_iter_->Next();
while (!file_iter_->Valid()) { for (;;) {
if (file_iter_->status().IsIncomplete() || file_iter_->Valid()) {
valid_ = !file_iter_->status().IsIncomplete();
return;
}
if (file_index_ + 1 >= files_.size()) { if (file_index_ + 1 >= files_.size()) {
valid_ = false; valid_ = false;
return; return;
@ -76,7 +84,6 @@ class LevelIterator : public Iterator {
SetFileIndex(file_index_ + 1); SetFileIndex(file_index_ + 1);
file_iter_->SeekToFirst(); file_iter_->SeekToFirst();
} }
valid_ = file_iter_->Valid();
} }
Slice key() const override { Slice key() const override {
assert(valid_); assert(valid_);
@ -125,9 +132,11 @@ ForwardIterator::~ForwardIterator() {
} }
void ForwardIterator::Cleanup() { void ForwardIterator::Cleanup() {
delete mutable_iter_; if (mutable_iter_ != nullptr) {
mutable_iter_->~Iterator();
}
for (auto* m : imm_iters_) { for (auto* m : imm_iters_) {
delete m; m->~Iterator();
} }
imm_iters_.clear(); imm_iters_.clear();
for (auto* f : l0_iters_) { for (auto* f : l0_iters_) {
@ -160,6 +169,8 @@ void ForwardIterator::SeekToFirst() {
if (sv_ == nullptr || if (sv_ == nullptr ||
sv_ ->version_number != cfd_->GetSuperVersionNumber()) { sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
RebuildIterators(); RebuildIterators();
} else if (status_.IsIncomplete()) {
ResetIncompleteIterators();
} }
SeekInternal(Slice(), true); SeekInternal(Slice(), true);
} }
@ -168,6 +179,8 @@ void ForwardIterator::Seek(const Slice& internal_key) {
if (sv_ == nullptr || if (sv_ == nullptr ||
sv_ ->version_number != cfd_->GetSuperVersionNumber()) { sv_ ->version_number != cfd_->GetSuperVersionNumber()) {
RebuildIterators(); RebuildIterators();
} else if (status_.IsIncomplete()) {
ResetIncompleteIterators();
} }
SeekInternal(internal_key, false); SeekInternal(internal_key, false);
} }
@ -211,7 +224,15 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,
} }
l0_iters_[i]->Seek(internal_key); l0_iters_[i]->Seek(internal_key);
} }
if (l0_iters_[i]->Valid()) {
if (l0_iters_[i]->status().IsIncomplete()) {
// if any of the immutable iterators is incomplete (no-io option was
// used), we are unable to reliably find the smallest key
assert(read_options_.read_tier == kBlockCacheTier);
status_ = l0_iters_[i]->status();
valid_ = false;
return;
} else if (l0_iters_[i]->Valid()) {
immutable_min_heap_.push(l0_iters_[i]); immutable_min_heap_.push(l0_iters_[i]);
} }
} }
@ -280,7 +301,14 @@ void ForwardIterator::SeekInternal(const Slice& internal_key,
level_iters_[level - 1]->SetFileIndex(f_idx); level_iters_[level - 1]->SetFileIndex(f_idx);
seek_to_first ? level_iters_[level - 1]->SeekToFirst() : seek_to_first ? level_iters_[level - 1]->SeekToFirst() :
level_iters_[level - 1]->Seek(internal_key); level_iters_[level - 1]->Seek(internal_key);
if (level_iters_[level - 1]->Valid()) {
if (level_iters_[level - 1]->status().IsIncomplete()) {
// see above
assert(read_options_.read_tier == kBlockCacheTier);
status_ = level_iters_[level - 1]->status();
valid_ = false;
return;
} else if (level_iters_[level - 1]->Valid()) {
immutable_min_heap_.push(level_iters_[level - 1]); immutable_min_heap_.push(level_iters_[level - 1]);
} }
} }
@ -304,7 +332,7 @@ void ForwardIterator::Next() {
assert(valid_); assert(valid_);
if (sv_ == nullptr || if (sv_ == nullptr ||
sv_ ->version_number != cfd_->GetSuperVersionNumber()) { sv_->version_number != cfd_->GetSuperVersionNumber()) {
std::string current_key = key().ToString(); std::string current_key = key().ToString();
Slice old_key(current_key.data(), current_key.size()); Slice old_key(current_key.data(), current_key.size());
@ -320,9 +348,17 @@ void ForwardIterator::Next() {
} }
current_->Next(); current_->Next();
if (current_->Valid() && current_ != mutable_iter_) { if (current_ != mutable_iter_) {
if (current_->status().IsIncomplete()) {
assert(read_options_.read_tier == kBlockCacheTier);
status_ = current_->status();
valid_ = false;
return;
} else if (current_->Valid()) {
immutable_min_heap_.push(current_); immutable_min_heap_.push(current_);
} }
}
UpdateCurrent(); UpdateCurrent();
} }
@ -367,8 +403,8 @@ void ForwardIterator::RebuildIterators() {
Cleanup(); Cleanup();
// New // New
sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_)); sv_ = cfd_->GetReferencedSuperVersion(&(db_->mutex_));
mutable_iter_ = sv_->mem->NewIterator(read_options_); mutable_iter_ = sv_->mem->NewIterator(read_options_, &arena_);
sv_->imm->AddIterators(read_options_, &imm_iters_); sv_->imm->AddIterators(read_options_, &imm_iters_, &arena_);
const auto& l0_files = sv_->current->files_[0]; const auto& l0_files = sv_->current->files_[0];
l0_iters_.reserve(l0_files.size()); l0_iters_.reserve(l0_files.size());
for (const auto* l0 : l0_files) { for (const auto* l0 : l0_files) {
@ -389,6 +425,29 @@ void ForwardIterator::RebuildIterators() {
is_prev_set_ = false; is_prev_set_ = false;
} }
void ForwardIterator::ResetIncompleteIterators() {
const auto& l0_files = sv_->current->files_[0];
for (uint32_t i = 0; i < l0_iters_.size(); ++i) {
assert(i < l0_files.size());
if (!l0_iters_[i]->status().IsIncomplete()) {
continue;
}
delete l0_iters_[i];
l0_iters_[i] = cfd_->table_cache()->NewIterator(
read_options_, *cfd_->soptions(), cfd_->internal_comparator(),
l0_files[i]->fd);
}
for (auto* level_iter : level_iters_) {
if (level_iter && level_iter->status().IsIncomplete()) {
level_iter->Reset();
}
}
current_ = nullptr;
is_prev_set_ = false;
}
void ForwardIterator::UpdateCurrent() { void ForwardIterator::UpdateCurrent() {
if (immutable_min_heap_.empty() && !mutable_iter_->Valid()) { if (immutable_min_heap_.empty() && !mutable_iter_->Valid()) {
current_ = nullptr; current_ = nullptr;
@ -417,7 +476,7 @@ void ForwardIterator::UpdateCurrent() {
} }
bool ForwardIterator::NeedToSeekImmutable(const Slice& target) { bool ForwardIterator::NeedToSeekImmutable(const Slice& target) {
if (!is_prev_set_) { if (!valid_ || !is_prev_set_) {
return true; return true;
} }
Slice prev_key = prev_key_.GetKey(); Slice prev_key = prev_key_.GetKey();

@ -14,6 +14,7 @@
#include "rocksdb/iterator.h" #include "rocksdb/iterator.h"
#include "rocksdb/options.h" #include "rocksdb/options.h"
#include "db/dbformat.h" #include "db/dbformat.h"
#include "util/arena.h"
namespace rocksdb { namespace rocksdb {
@ -73,6 +74,7 @@ class ForwardIterator : public Iterator {
private: private:
void Cleanup(); void Cleanup();
void RebuildIterators(); void RebuildIterators();
void ResetIncompleteIterators();
void SeekInternal(const Slice& internal_key, bool seek_to_first); void SeekInternal(const Slice& internal_key, bool seek_to_first);
void UpdateCurrent(); void UpdateCurrent();
bool NeedToSeekImmutable(const Slice& internal_key); bool NeedToSeekImmutable(const Slice& internal_key);
@ -99,6 +101,7 @@ class ForwardIterator : public Iterator {
IterKey prev_key_; IterKey prev_key_;
bool is_prev_set_; bool is_prev_set_;
Arena arena_;
}; };
} // namespace rocksdb } // namespace rocksdb

@ -7,10 +7,15 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/internal_stats.h" #include "db/internal_stats.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h> #include <inttypes.h>
#include <vector> #include <vector>
#include "db/column_family.h" #include "db/column_family.h"
#include "db/db_impl.h"
namespace rocksdb { namespace rocksdb {
@ -133,6 +138,8 @@ DBPropertyType GetPropertyType(const Slice& property, bool* is_int_property,
} else if (in == "estimate-table-readers-mem") { } else if (in == "estimate-table-readers-mem") {
*need_out_of_mutex = true; *need_out_of_mutex = true;
return kEstimatedUsageByTableReaders; return kEstimatedUsageByTableReaders;
} else if (in == "is-file-deletions-enabled") {
return kIsFileDeletionEnabled;
} }
return kUnknown; return kUnknown;
} }
@ -215,7 +222,7 @@ bool InternalStats::GetStringProperty(DBPropertyType property_type,
} }
bool InternalStats::GetIntProperty(DBPropertyType property_type, bool InternalStats::GetIntProperty(DBPropertyType property_type,
uint64_t* value) const { uint64_t* value, DBImpl* db) const {
Version* current = cfd_->current(); Version* current = cfd_->current();
switch (property_type) { switch (property_type) {
@ -254,6 +261,11 @@ bool InternalStats::GetIntProperty(DBPropertyType property_type,
cfd_->imm()->current()->GetTotalNumEntries() + cfd_->imm()->current()->GetTotalNumEntries() +
current->GetEstimatedActiveKeys(); current->GetEstimatedActiveKeys();
return true; return true;
#ifndef ROCKSDB_LITE
case kIsFileDeletionEnabled:
*value = db->IsFileDeletionsEnabled();
return true;
#endif
default: default:
return false; return false;
} }

@ -42,6 +42,8 @@ enum DBPropertyType : uint32_t {
// the immutable mem tables. // the immutable mem tables.
kEstimatedNumKeys, // Estimated total number of keys in the database. kEstimatedNumKeys, // Estimated total number of keys in the database.
kEstimatedUsageByTableReaders, // Estimated memory by table readers. kEstimatedUsageByTableReaders, // Estimated memory by table readers.
kIsFileDeletionEnabled, // Equals disable_delete_obsolete_files_,
// 0 means file deletions enabled
}; };
extern DBPropertyType GetPropertyType(const Slice& property, extern DBPropertyType GetPropertyType(const Slice& property,
@ -197,7 +199,8 @@ class InternalStats {
bool GetStringProperty(DBPropertyType property_type, const Slice& property, bool GetStringProperty(DBPropertyType property_type, const Slice& property,
std::string* value); std::string* value);
bool GetIntProperty(DBPropertyType property_type, uint64_t* value) const; bool GetIntProperty(DBPropertyType property_type, uint64_t* value,
DBImpl* db) const;
bool GetIntPropertyOutOfMutex(DBPropertyType property_type, Version* version, bool GetIntPropertyOutOfMutex(DBPropertyType property_type, Version* version,
uint64_t* value) const; uint64_t* value) const;

@ -9,6 +9,7 @@
#include "util/testharness.h" #include "util/testharness.h"
#include "util/benchharness.h" #include "util/benchharness.h"
#include "db/version_set.h" #include "db/version_set.h"
#include "db/write_controller.h"
#include "util/mutexlock.h" #include "util/mutexlock.h"
namespace rocksdb { namespace rocksdb {
@ -21,6 +22,7 @@ std::string MakeKey(unsigned int num) {
void BM_LogAndApply(int iters, int num_base_files) { void BM_LogAndApply(int iters, int num_base_files) {
VersionSet* vset; VersionSet* vset;
WriteController wc;
ColumnFamilyData* default_cfd; ColumnFamilyData* default_cfd;
uint64_t fnum = 1; uint64_t fnum = 1;
port::Mutex mu; port::Mutex mu;
@ -47,7 +49,7 @@ void BM_LogAndApply(int iters, int num_base_files) {
options.db_paths.emplace_back(dbname, 0); options.db_paths.emplace_back(dbname, 0);
// The parameter of table cache is passed in as null, so any file I/O // The parameter of table cache is passed in as null, so any file I/O
// operation is likely to fail. // operation is likely to fail.
vset = new VersionSet(dbname, &options, sopt, nullptr); vset = new VersionSet(dbname, &options, sopt, nullptr, &wc);
std::vector<ColumnFamilyDescriptor> dummy; std::vector<ColumnFamilyDescriptor> dummy;
dummy.push_back(ColumnFamilyDescriptor()); dummy.push_back(ColumnFamilyDescriptor());
ASSERT_OK(vset->Recover(dummy)); ASSERT_OK(vset->Recover(dummy));
@ -69,6 +71,7 @@ void BM_LogAndApply(int iters, int num_base_files) {
vedit.AddFile(2, ++fnum, 0, 1 /* file size */, start, limit, 1, 1); vedit.AddFile(2, ++fnum, 0, 1 /* file size */, start, limit, 1, 1);
vset->LogAndApply(default_cfd, &vedit, &mu); vset->LogAndApply(default_cfd, &vedit, &mu);
} }
delete vset;
} }
BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_1_file, 1000, 1) BENCHMARK_NAMED_PARAM(BM_LogAndApply, 1000_iters_1_file, 1000, 1)

@ -31,41 +31,57 @@
namespace rocksdb { namespace rocksdb {
MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) MemTableOptions::MemTableOptions(
const MutableCFOptions& mutable_cf_options, const Options& options)
: write_buffer_size(mutable_cf_options.write_buffer_size),
arena_block_size(mutable_cf_options.arena_block_size),
memtable_prefix_bloom_bits(mutable_cf_options.memtable_prefix_bloom_bits),
memtable_prefix_bloom_probes(
mutable_cf_options.memtable_prefix_bloom_probes),
memtable_prefix_bloom_huge_page_tlb_size(
mutable_cf_options.memtable_prefix_bloom_huge_page_tlb_size),
inplace_update_support(options.inplace_update_support),
inplace_update_num_locks(options.inplace_update_num_locks),
inplace_callback(options.inplace_callback),
max_successive_merges(mutable_cf_options.max_successive_merges),
filter_deletes(mutable_cf_options.filter_deletes) {}
MemTable::MemTable(const InternalKeyComparator& cmp,
const ImmutableCFOptions& ioptions,
const MemTableOptions& moptions)
: comparator_(cmp), : comparator_(cmp),
ioptions_(ioptions),
moptions_(moptions),
refs_(0), refs_(0),
kArenaBlockSize(OptimizeBlockSize(options.arena_block_size)), kArenaBlockSize(OptimizeBlockSize(moptions.arena_block_size)),
kWriteBufferSize(options.write_buffer_size), arena_(moptions.arena_block_size),
arena_(options.arena_block_size), table_(ioptions.memtable_factory->CreateMemTableRep(
table_(options.memtable_factory->CreateMemTableRep( comparator_, &arena_, ioptions.prefix_extractor, ioptions.info_log)),
comparator_, &arena_, options.prefix_extractor.get(),
options.info_log.get())),
num_entries_(0), num_entries_(0),
flush_in_progress_(false), flush_in_progress_(false),
flush_completed_(false), flush_completed_(false),
file_number_(0), file_number_(0),
first_seqno_(0), first_seqno_(0),
mem_next_logfile_number_(0), mem_next_logfile_number_(0),
locks_(options.inplace_update_support ? options.inplace_update_num_locks locks_(moptions.inplace_update_support ? moptions.inplace_update_num_locks
: 0), : 0),
prefix_extractor_(options.prefix_extractor.get()), prefix_extractor_(ioptions.prefix_extractor),
should_flush_(ShouldFlushNow()) { should_flush_(ShouldFlushNow()),
flush_scheduled_(false) {
// if should_flush_ == true without an entry inserted, something must have // if should_flush_ == true without an entry inserted, something must have
// gone wrong already. // gone wrong already.
assert(!should_flush_); assert(!should_flush_);
if (prefix_extractor_ && options.memtable_prefix_bloom_bits > 0) { if (prefix_extractor_ && moptions.memtable_prefix_bloom_bits > 0) {
prefix_bloom_.reset(new DynamicBloom( prefix_bloom_.reset(new DynamicBloom(
&arena_, &arena_,
options.memtable_prefix_bloom_bits, options.bloom_locality, moptions.memtable_prefix_bloom_bits, ioptions.bloom_locality,
options.memtable_prefix_bloom_probes, nullptr, moptions.memtable_prefix_bloom_probes, nullptr,
options.memtable_prefix_bloom_huge_page_tlb_size, moptions.memtable_prefix_bloom_huge_page_tlb_size,
options.info_log.get())); ioptions.info_log));
} }
} }
MemTable::~MemTable() { MemTable::~MemTable() { assert(refs_ == 0); }
assert(refs_ == 0);
}
size_t MemTable::ApproximateMemoryUsage() { size_t MemTable::ApproximateMemoryUsage() {
size_t arena_usage = arena_.ApproximateMemoryUsage(); size_t arena_usage = arena_.ApproximateMemoryUsage();
@ -97,14 +113,16 @@ bool MemTable::ShouldFlushNow() const {
// if we can still allocate one more block without exceeding the // if we can still allocate one more block without exceeding the
// over-allocation ratio, then we should not flush. // over-allocation ratio, then we should not flush.
if (allocated_memory + kArenaBlockSize < if (allocated_memory + kArenaBlockSize <
kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) { moptions_.write_buffer_size +
kArenaBlockSize * kAllowOverAllocationRatio) {
return false; return false;
} }
// if user keeps adding entries that exceeds kWriteBufferSize, we need to // if user keeps adding entries that exceeds moptions.write_buffer_size,
// flush earlier even though we still have much available memory left. // we need to flush earlier even though we still have much available
if (allocated_memory > // memory left.
kWriteBufferSize + kArenaBlockSize * kAllowOverAllocationRatio) { if (allocated_memory > moptions_.write_buffer_size +
kArenaBlockSize * kAllowOverAllocationRatio) {
return true; return true;
} }
@ -174,13 +192,13 @@ const char* EncodeKey(std::string* scratch, const Slice& target) {
class MemTableIterator: public Iterator { class MemTableIterator: public Iterator {
public: public:
MemTableIterator(const MemTable& mem, const ReadOptions& options, MemTableIterator(
bool enforce_total_order, Arena* arena) const MemTable& mem, const ReadOptions& read_options, Arena* arena)
: bloom_(nullptr), : bloom_(nullptr),
prefix_extractor_(mem.prefix_extractor_), prefix_extractor_(mem.prefix_extractor_),
valid_(false), valid_(false),
arena_mode_(arena != nullptr) { arena_mode_(arena != nullptr) {
if (prefix_extractor_ != nullptr && !enforce_total_order) { if (prefix_extractor_ != nullptr && !read_options.total_order_seek) {
bloom_ = mem.prefix_bloom_.get(); bloom_ = mem.prefix_bloom_.get();
iter_ = mem.table_->GetDynamicPrefixIterator(arena); iter_ = mem.table_->GetDynamicPrefixIterator(arena);
} else { } else {
@ -248,15 +266,10 @@ class MemTableIterator: public Iterator {
void operator=(const MemTableIterator&); void operator=(const MemTableIterator&);
}; };
Iterator* MemTable::NewIterator(const ReadOptions& options, Iterator* MemTable::NewIterator(const ReadOptions& read_options, Arena* arena) {
bool enforce_total_order, Arena* arena) { assert(arena != nullptr);
if (arena == nullptr) {
return new MemTableIterator(*this, options, enforce_total_order, nullptr);
} else {
auto mem = arena->AllocateAligned(sizeof(MemTableIterator)); auto mem = arena->AllocateAligned(sizeof(MemTableIterator));
return new (mem) return new (mem) MemTableIterator(*this, read_options, arena);
MemTableIterator(*this, options, enforce_total_order, arena);
}
} }
port::RWMutex* MemTable::GetLock(const Slice& key) { port::RWMutex* MemTable::GetLock(const Slice& key) {
@ -417,8 +430,13 @@ static bool SaveValue(void* arg, const char* entry) {
} }
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
MergeContext& merge_context, const Options& options) { MergeContext* merge_context) {
PERF_TIMER_AUTO(get_from_memtable_time); // The sequence number is updated synchronously in version_set.h
if (IsEmpty()) {
// Avoiding recording stats for speed.
return false;
}
PERF_TIMER_GUARD(get_from_memtable_time);
Slice user_key = key.user_key(); Slice user_key = key.user_key();
bool found_final_value = false; bool found_final_value = false;
@ -436,11 +454,11 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
saver.value = value; saver.value = value;
saver.status = s; saver.status = s;
saver.mem = this; saver.mem = this;
saver.merge_context = &merge_context; saver.merge_context = merge_context;
saver.merge_operator = options.merge_operator.get(); saver.merge_operator = ioptions_.merge_operator;
saver.logger = options.info_log.get(); saver.logger = ioptions_.info_log;
saver.inplace_update_support = options.inplace_update_support; saver.inplace_update_support = moptions_.inplace_update_support;
saver.statistics = options.statistics.get(); saver.statistics = ioptions_.statistics;
table_->Get(key, &saver, SaveValue); table_->Get(key, &saver, SaveValue);
} }
@ -448,7 +466,6 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
if (!found_final_value && merge_in_progress) { if (!found_final_value && merge_in_progress) {
*s = Status::MergeInProgress(""); *s = Status::MergeInProgress("");
} }
PERF_TIMER_STOP(get_from_memtable_time);
PERF_COUNTER_ADD(get_from_memtable_count, 1); PERF_COUNTER_ADD(get_from_memtable_count, 1);
return found_final_value; return found_final_value;
} }
@ -513,8 +530,7 @@ void MemTable::Update(SequenceNumber seq,
bool MemTable::UpdateCallback(SequenceNumber seq, bool MemTable::UpdateCallback(SequenceNumber seq,
const Slice& key, const Slice& key,
const Slice& delta, const Slice& delta) {
const Options& options) {
LookupKey lkey(key, seq); LookupKey lkey(key, seq);
Slice memkey = lkey.memtable_key(); Slice memkey = lkey.memtable_key();
@ -549,7 +565,7 @@ bool MemTable::UpdateCallback(SequenceNumber seq,
std::string str_value; std::string str_value;
WriteLock wl(GetLock(lkey.user_key())); WriteLock wl(GetLock(lkey.user_key()));
auto status = options.inplace_callback(prev_buffer, &new_prev_size, auto status = moptions_.inplace_callback(prev_buffer, &new_prev_size,
delta, &str_value); delta, &str_value);
if (status == UpdateStatus::UPDATED_INPLACE) { if (status == UpdateStatus::UPDATED_INPLACE) {
// Value already updated by callback. // Value already updated by callback.
@ -563,12 +579,12 @@ bool MemTable::UpdateCallback(SequenceNumber seq,
memcpy(p, prev_buffer, new_prev_size); memcpy(p, prev_buffer, new_prev_size);
} }
} }
RecordTick(options.statistics.get(), NUMBER_KEYS_UPDATED); RecordTick(ioptions_.statistics, NUMBER_KEYS_UPDATED);
should_flush_ = ShouldFlushNow(); should_flush_ = ShouldFlushNow();
return true; return true;
} else if (status == UpdateStatus::UPDATED) { } else if (status == UpdateStatus::UPDATED) {
Add(seq, kTypeValue, key, Slice(str_value)); Add(seq, kTypeValue, key, Slice(str_value));
RecordTick(options.statistics.get(), NUMBER_KEYS_WRITTEN); RecordTick(ioptions_.statistics, NUMBER_KEYS_WRITTEN);
should_flush_ = ShouldFlushNow(); should_flush_ = ShouldFlushNow();
return true; return true;
} else if (status == UpdateStatus::UPDATE_FAILED) { } else if (status == UpdateStatus::UPDATE_FAILED) {

@ -10,14 +10,18 @@
#pragma once #pragma once
#include <string> #include <string>
#include <memory> #include <memory>
#include <functional>
#include <deque> #include <deque>
#include <vector>
#include "db/dbformat.h" #include "db/dbformat.h"
#include "db/skiplist.h" #include "db/skiplist.h"
#include "db/version_edit.h" #include "db/version_edit.h"
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/memtablerep.h" #include "rocksdb/memtablerep.h"
#include "rocksdb/immutable_options.h"
#include "util/arena.h" #include "util/arena.h"
#include "util/dynamic_bloom.h" #include "util/dynamic_bloom.h"
#include "util/mutable_cf_options.h"
namespace rocksdb { namespace rocksdb {
@ -26,6 +30,25 @@ class Mutex;
class MemTableIterator; class MemTableIterator;
class MergeContext; class MergeContext;
// Snapshot of the option values a MemTable works with. The constructor copies
// each field from either `mutable_cf_options` or `options`, as noted per
// field below.
struct MemTableOptions {
explicit MemTableOptions(
const MutableCFOptions& mutable_cf_options,
const Options& options);
// Copied from mutable_cf_options; MemTable::ShouldFlushNow() compares
// allocated memory against it.
size_t write_buffer_size;
// Copied from mutable_cf_options; used to size the memtable's arena.
size_t arena_block_size;
// Copied from mutable_cf_options. MemTable creates a prefix bloom filter
// only when a prefix extractor is set and this value is > 0.
uint32_t memtable_prefix_bloom_bits;
// Copied from mutable_cf_options; passed to the prefix bloom filter.
uint32_t memtable_prefix_bloom_probes;
// Copied from mutable_cf_options; passed to the prefix bloom filter.
size_t memtable_prefix_bloom_huge_page_tlb_size;
// Copied from options.
bool inplace_update_support;
// Copied from options; number of locks MemTable allocates when
// inplace_update_support is true (0 locks otherwise).
size_t inplace_update_num_locks;
// Copied from options; invoked by MemTable::UpdateCallback().
UpdateStatus (*inplace_callback)(char* existing_value,
uint32_t* existing_value_size,
Slice delta_value,
std::string* merged_value);
// Copied from mutable_cf_options.
size_t max_successive_merges;
// Copied from mutable_cf_options.
bool filter_deletes;
};
class MemTable { class MemTable {
public: public:
struct KeyComparator : public MemTableRep::KeyComparator { struct KeyComparator : public MemTableRep::KeyComparator {
@ -40,7 +63,8 @@ class MemTable {
// MemTables are reference counted. The initial reference count // MemTables are reference counted. The initial reference count
// is zero and the caller must call Ref() at least once. // is zero and the caller must call Ref() at least once.
explicit MemTable(const InternalKeyComparator& comparator, explicit MemTable(const InternalKeyComparator& comparator,
const Options& options); const ImmutableCFOptions& ioptions,
const MemTableOptions& moptions);
~MemTable(); ~MemTable();
@ -67,7 +91,11 @@ class MemTable {
// This method heuristically determines if the memtable should continue to // This method heuristically determines if the memtable should continue to
// host more data. // host more data.
bool ShouldFlush() const { return should_flush_; } bool ShouldScheduleFlush() const {
return flush_scheduled_ == false && should_flush_;
}
void MarkFlushScheduled() { flush_scheduled_ = true; }
// Return an iterator that yields the contents of the memtable. // Return an iterator that yields the contents of the memtable.
// //
@ -81,9 +109,7 @@ class MemTable {
// arena: If not null, the arena needs to be used to allocate the Iterator. // arena: If not null, the arena needs to be used to allocate the Iterator.
// Calling ~Iterator of the iterator will destroy all the states but // Calling ~Iterator of the iterator will destroy all the states but
// those allocated in arena. // those allocated in arena.
Iterator* NewIterator(const ReadOptions& options, Iterator* NewIterator(const ReadOptions& read_options, Arena* arena);
bool enforce_total_order = false,
Arena* arena = nullptr);
// Add an entry into memtable that maps key to value at the // Add an entry into memtable that maps key to value at the
// specified sequence number and with the specified type. // specified sequence number and with the specified type.
@ -101,7 +127,7 @@ class MemTable {
// store MergeInProgress in s, and return false. // store MergeInProgress in s, and return false.
// Else, return false. // Else, return false.
bool Get(const LookupKey& key, std::string* value, Status* s, bool Get(const LookupKey& key, std::string* value, Status* s,
MergeContext& merge_context, const Options& options); MergeContext* merge_context);
// Attempts to update the new_value inplace, else does normal Add // Attempts to update the new_value inplace, else does normal Add
// Pseudocode // Pseudocode
@ -125,8 +151,7 @@ class MemTable {
// else return false // else return false
bool UpdateCallback(SequenceNumber seq, bool UpdateCallback(SequenceNumber seq,
const Slice& key, const Slice& key,
const Slice& delta, const Slice& delta);
const Options& options);
// Returns the number of successive merge entries starting from the newest // Returns the number of successive merge entries starting from the newest
// entry for the key up to the last non-merge entry or last entry for the // entry for the key up to the last non-merge entry or last entry for the
@ -139,6 +164,9 @@ class MemTable {
// Returns the edits area that is needed for flushing the memtable // Returns the edits area that is needed for flushing the memtable
VersionEdit* GetEdits() { return &edit_; } VersionEdit* GetEdits() { return &edit_; }
// Returns true if no entry has been inserted into the memtable yet, which is
// detected by first_seqno_ still being 0.
bool IsEmpty() const { return first_seqno_ == 0; }
// Returns the sequence number of the first element that was inserted // Returns the sequence number of the first element that was inserted
// into the memtable // into the memtable
SequenceNumber GetFirstSequenceNumber() { return first_seqno_; } SequenceNumber GetFirstSequenceNumber() { return first_seqno_; }
@ -171,8 +199,11 @@ class MemTable {
const Arena& TEST_GetArena() const { return arena_; } const Arena& TEST_GetArena() const { return arena_; }
// Returns a pointer to the ImmutableCFOptions referred to by ioptions_.
// ioptions_ is declared as a reference member, so the pointer addresses the
// referenced object rather than storage owned by this MemTable.
// NOTE(review): the pointer is presumably only valid while that referenced
// object is alive -- confirm against the constructor and its callers.
const ImmutableCFOptions* GetImmutableOptions() const { return &ioptions_; }
// Returns a pointer to this memtable's MemTableOptions (moptions_), which is
// stored by value as a const member of the MemTable.
const MemTableOptions* GetMemTableOptions() const { return &moptions_; }
private: private:
// Dynamically check if we can add more incoming entries. // Dynamically check if we can add more incoming entries
bool ShouldFlushNow() const; bool ShouldFlushNow() const;
friend class MemTableIterator; friend class MemTableIterator;
@ -180,9 +211,10 @@ class MemTable {
friend class MemTableList; friend class MemTableList;
KeyComparator comparator_; KeyComparator comparator_;
const ImmutableCFOptions& ioptions_;
const MemTableOptions moptions_;
int refs_; int refs_;
const size_t kArenaBlockSize; const size_t kArenaBlockSize;
const size_t kWriteBufferSize;
Arena arena_; Arena arena_;
unique_ptr<MemTableRep> table_; unique_ptr<MemTableRep> table_;
@ -215,6 +247,9 @@ class MemTable {
// a flag indicating if a memtable has met the criteria to flush // a flag indicating if a memtable has met the criteria to flush
bool should_flush_; bool should_flush_;
// a flag indicating if flush has been scheduled
bool flush_scheduled_;
}; };
extern const char* EncodeKey(std::string* scratch, const Slice& target); extern const char* EncodeKey(std::string* scratch, const Slice& target);

@ -62,10 +62,9 @@ int MemTableList::size() const {
// Return the most recent value found, if any. // Return the most recent value found, if any.
// Operands stores the list of merge operations to apply, so far. // Operands stores the list of merge operations to apply, so far.
bool MemTableListVersion::Get(const LookupKey& key, std::string* value, bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
Status* s, MergeContext& merge_context, Status* s, MergeContext* merge_context) {
const Options& options) {
for (auto& memtable : memlist_) { for (auto& memtable : memlist_) {
if (memtable->Get(key, value, s, merge_context, options)) { if (memtable->Get(key, value, s, merge_context)) {
return true; return true;
} }
} }
@ -73,9 +72,10 @@ bool MemTableListVersion::Get(const LookupKey& key, std::string* value,
} }
void MemTableListVersion::AddIterators(const ReadOptions& options, void MemTableListVersion::AddIterators(const ReadOptions& options,
std::vector<Iterator*>* iterator_list) { std::vector<Iterator*>* iterator_list,
Arena* arena) {
for (auto& m : memlist_) { for (auto& m : memlist_) {
iterator_list->push_back(m->NewIterator(options)); iterator_list->push_back(m->NewIterator(options, arena));
} }
} }

@ -46,10 +46,10 @@ class MemTableListVersion {
// Search all the memtables starting from the most recent one. // Search all the memtables starting from the most recent one.
// Return the most recent value found, if any. // Return the most recent value found, if any.
bool Get(const LookupKey& key, std::string* value, Status* s, bool Get(const LookupKey& key, std::string* value, Status* s,
MergeContext& merge_context, const Options& options); MergeContext* merge_context);
void AddIterators(const ReadOptions& options, void AddIterators(const ReadOptions& options,
std::vector<Iterator*>* iterator_list); std::vector<Iterator*>* iterator_list, Arena* arena);
void AddIterators(const ReadOptions& options, void AddIterators(const ReadOptions& options,
MergeIteratorBuilder* merge_iter_builder); MergeIteratorBuilder* merge_iter_builder);

@ -192,16 +192,17 @@ extern const uint64_t kPlainTableMagicNumber;
class TestPlainTableReader : public PlainTableReader { class TestPlainTableReader : public PlainTableReader {
public: public:
TestPlainTableReader(const EnvOptions& storage_options, TestPlainTableReader(const EnvOptions& env_options,
const InternalKeyComparator& icomparator, const InternalKeyComparator& icomparator,
EncodingType encoding_type, uint64_t file_size, EncodingType encoding_type, uint64_t file_size,
int bloom_bits_per_key, double hash_table_ratio, int bloom_bits_per_key, double hash_table_ratio,
size_t index_sparseness, size_t index_sparseness,
const TableProperties* table_properties, const TableProperties* table_properties,
unique_ptr<RandomAccessFile>&& file, unique_ptr<RandomAccessFile>&& file,
const Options& options, bool* expect_bloom_not_match, const ImmutableCFOptions& ioptions,
bool* expect_bloom_not_match,
bool store_index_in_file) bool store_index_in_file)
: PlainTableReader(options, std::move(file), storage_options, icomparator, : PlainTableReader(ioptions, std::move(file), env_options, icomparator,
encoding_type, file_size, table_properties), encoding_type, file_size, table_properties),
expect_bloom_not_match_(expect_bloom_not_match) { expect_bloom_not_match_(expect_bloom_not_match) {
Status s = MmapDataFile(); Status s = MmapDataFile();
@ -218,7 +219,7 @@ class TestPlainTableReader : public PlainTableReader {
PlainTablePropertyNames::kBloomVersion); PlainTablePropertyNames::kBloomVersion);
ASSERT_TRUE(bloom_version_ptr != props->user_collected_properties.end()); ASSERT_TRUE(bloom_version_ptr != props->user_collected_properties.end());
ASSERT_EQ(bloom_version_ptr->second, std::string("1")); ASSERT_EQ(bloom_version_ptr->second, std::string("1"));
if (options.bloom_locality > 0) { if (ioptions.bloom_locality > 0) {
auto num_blocks_ptr = props->user_collected_properties.find( auto num_blocks_ptr = props->user_collected_properties.find(
PlainTablePropertyNames::kNumBloomBlocks); PlainTablePropertyNames::kNumBloomBlocks);
ASSERT_TRUE(num_blocks_ptr != props->user_collected_properties.end()); ASSERT_TRUE(num_blocks_ptr != props->user_collected_properties.end());
@ -253,25 +254,26 @@ class TestPlainTableFactory : public PlainTableFactory {
store_index_in_file_(options.store_index_in_file), store_index_in_file_(options.store_index_in_file),
expect_bloom_not_match_(expect_bloom_not_match) {} expect_bloom_not_match_(expect_bloom_not_match) {}
Status NewTableReader(const Options& options, const EnvOptions& soptions, Status NewTableReader(const ImmutableCFOptions& ioptions,
const EnvOptions& env_options,
const InternalKeyComparator& internal_comparator, const InternalKeyComparator& internal_comparator,
unique_ptr<RandomAccessFile>&& file, uint64_t file_size, unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
unique_ptr<TableReader>* table) const override { unique_ptr<TableReader>* table) const override {
TableProperties* props = nullptr; TableProperties* props = nullptr;
auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber,
options.env, options.info_log.get(), &props); ioptions.env, ioptions.info_log, &props);
ASSERT_TRUE(s.ok()); ASSERT_TRUE(s.ok());
if (store_index_in_file_) { if (store_index_in_file_) {
BlockHandle bloom_block_handle; BlockHandle bloom_block_handle;
s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber, s = FindMetaBlock(file.get(), file_size, kPlainTableMagicNumber,
options.env, BloomBlockBuilder::kBloomBlock, ioptions.env, BloomBlockBuilder::kBloomBlock,
&bloom_block_handle); &bloom_block_handle);
ASSERT_TRUE(s.ok()); ASSERT_TRUE(s.ok());
BlockHandle index_block_handle; BlockHandle index_block_handle;
s = FindMetaBlock( s = FindMetaBlock(
file.get(), file_size, kPlainTableMagicNumber, options.env, file.get(), file_size, kPlainTableMagicNumber, ioptions.env,
PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_handle); PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_handle);
ASSERT_TRUE(s.ok()); ASSERT_TRUE(s.ok());
} }
@ -284,9 +286,9 @@ class TestPlainTableFactory : public PlainTableFactory {
DecodeFixed32(encoding_type_prop->second.c_str())); DecodeFixed32(encoding_type_prop->second.c_str()));
std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader( std::unique_ptr<PlainTableReader> new_reader(new TestPlainTableReader(
soptions, internal_comparator, encoding_type, file_size, env_options, internal_comparator, encoding_type, file_size,
bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props, bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, props,
std::move(file), options, expect_bloom_not_match_, std::move(file), ioptions, expect_bloom_not_match_,
store_index_in_file_)); store_index_in_file_));
*table = std::move(new_reader); *table = std::move(new_reader);

@ -31,7 +31,10 @@
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h> #include <inttypes.h>
#include "db/builder.h" #include "db/builder.h"
#include "db/db_impl.h" #include "db/db_impl.h"
@ -46,6 +49,9 @@
#include "rocksdb/comparator.h" #include "rocksdb/comparator.h"
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/immutable_options.h"
#include "util/scoped_arena_iterator.h"
namespace rocksdb { namespace rocksdb {
@ -57,8 +63,8 @@ class Repairer {
: dbname_(dbname), : dbname_(dbname),
env_(options.env), env_(options.env),
icmp_(options.comparator), icmp_(options.comparator),
ipolicy_(options.filter_policy), options_(SanitizeOptions(dbname, &icmp_, options)),
options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)), ioptions_(options_),
raw_table_cache_( raw_table_cache_(
// TableCache can be small since we expect each table to be opened // TableCache can be small since we expect each table to be opened
// once. // once.
@ -66,7 +72,7 @@ class Repairer {
options_.table_cache_remove_scan_count_limit)), options_.table_cache_remove_scan_count_limit)),
next_file_number_(1) { next_file_number_(1) {
table_cache_ = table_cache_ =
new TableCache(&options_, storage_options_, raw_table_cache_.get()); new TableCache(ioptions_, env_options_, raw_table_cache_.get());
edit_ = new VersionEdit(); edit_ = new VersionEdit();
} }
@ -108,9 +114,9 @@ class Repairer {
std::string const dbname_; std::string const dbname_;
Env* const env_; Env* const env_;
InternalKeyComparator const icmp_; const InternalKeyComparator icmp_;
InternalFilterPolicy const ipolicy_; const Options options_;
Options const options_; const ImmutableCFOptions ioptions_;
std::shared_ptr<Cache> raw_table_cache_; std::shared_ptr<Cache> raw_table_cache_;
TableCache* table_cache_; TableCache* table_cache_;
VersionEdit* edit_; VersionEdit* edit_;
@ -120,7 +126,7 @@ class Repairer {
std::vector<uint64_t> logs_; std::vector<uint64_t> logs_;
std::vector<TableInfo> tables_; std::vector<TableInfo> tables_;
uint64_t next_file_number_; uint64_t next_file_number_;
const EnvOptions storage_options_; const EnvOptions env_options_;
Status FindFiles() { Status FindFiles() {
std::vector<std::string> filenames; std::vector<std::string> filenames;
@ -192,7 +198,7 @@ class Repairer {
// Open the log file // Open the log file
std::string logname = LogFileName(dbname_, log); std::string logname = LogFileName(dbname_, log);
unique_ptr<SequentialFile> lfile; unique_ptr<SequentialFile> lfile;
Status status = env_->NewSequentialFile(logname, &lfile, storage_options_); Status status = env_->NewSequentialFile(logname, &lfile, env_options_);
if (!status.ok()) { if (!status.ok()) {
return status; return status;
} }
@ -213,7 +219,8 @@ class Repairer {
std::string scratch; std::string scratch;
Slice record; Slice record;
WriteBatch batch; WriteBatch batch;
MemTable* mem = new MemTable(icmp_, options_); MemTable* mem = new MemTable(icmp_, ioptions_,
MemTableOptions(MutableCFOptions(options_), options_));
auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_); auto cf_mems_default = new ColumnFamilyMemTablesDefault(mem, &options_);
mem->Ref(); mem->Ref();
int counter = 0; int counter = 0;
@ -238,11 +245,15 @@ class Repairer {
// since ExtractMetaData() will also generate edits. // since ExtractMetaData() will also generate edits.
FileMetaData meta; FileMetaData meta;
meta.fd = FileDescriptor(next_file_number_++, 0, 0); meta.fd = FileDescriptor(next_file_number_++, 0, 0);
{
ReadOptions ro; ReadOptions ro;
Iterator* iter = mem->NewIterator(ro, true /* enforce_total_order */); ro.total_order_seek = true;
status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_, Arena arena;
iter, &meta, icmp_, 0, 0, kNoCompression); ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
delete iter; status = BuildTable(dbname_, env_, ioptions_, env_options_, table_cache_,
iter.get(), &meta, icmp_, 0, 0, kNoCompression,
CompressionOptions());
}
delete mem->Unref(); delete mem->Unref();
delete cf_mems_default; delete cf_mems_default;
mem = nullptr; mem = nullptr;
@ -287,7 +298,7 @@ class Repairer {
file_size); file_size);
if (status.ok()) { if (status.ok()) {
Iterator* iter = table_cache_->NewIterator( Iterator* iter = table_cache_->NewIterator(
ReadOptions(), storage_options_, icmp_, t->meta.fd); ReadOptions(), env_options_, icmp_, t->meta.fd);
bool empty = true; bool empty = true;
ParsedInternalKey parsed; ParsedInternalKey parsed;
t->min_sequence = 0; t->min_sequence = 0;
@ -327,7 +338,7 @@ class Repairer {
std::string tmp = TempFileName(dbname_, 1); std::string tmp = TempFileName(dbname_, 1);
unique_ptr<WritableFile> file; unique_ptr<WritableFile> file;
Status status = env_->NewWritableFile( Status status = env_->NewWritableFile(
tmp, &file, env_->OptimizeForManifestWrite(storage_options_)); tmp, &file, env_->OptimizeForManifestWrite(env_options_));
if (!status.ok()) { if (!status.ok()) {
return status; return status;
} }

@ -79,7 +79,8 @@ public:
// for the duration of the returned table's lifetime. // for the duration of the returned table's lifetime.
// //
// *file must remain live while this Table is in use. // *file must remain live while this Table is in use.
static Status Open(const Options& options, const EnvOptions& soptions, static Status Open(const ImmutableCFOptions& options,
const EnvOptions& env_options,
unique_ptr<RandomAccessFile> && file, uint64_t file_size, unique_ptr<RandomAccessFile> && file, uint64_t file_size,
unique_ptr<TableReader>* table_reader); unique_ptr<TableReader>* table_reader);
@ -160,14 +161,14 @@ private:
struct SimpleTableReader::Rep { struct SimpleTableReader::Rep {
~Rep() { ~Rep() {
} }
Rep(const EnvOptions& storage_options, uint64_t index_start_offset, Rep(const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
int num_entries) : uint64_t index_start_offset, int num_entries) :
soptions(storage_options), index_start_offset(index_start_offset), ioptions(ioptions), env_options(env_options),
num_entries(num_entries) { index_start_offset(index_start_offset), num_entries(num_entries) {
} }
Options options; const ImmutableCFOptions& ioptions;
const EnvOptions& soptions; const EnvOptions& env_options;
Status status; Status status;
unique_ptr<RandomAccessFile> file; unique_ptr<RandomAccessFile> file;
uint64_t index_start_offset; uint64_t index_start_offset;
@ -187,8 +188,8 @@ SimpleTableReader::~SimpleTableReader() {
delete rep_; delete rep_;
} }
Status SimpleTableReader::Open(const Options& options, Status SimpleTableReader::Open(const ImmutableCFOptions& ioptions,
const EnvOptions& soptions, const EnvOptions& env_options,
unique_ptr<RandomAccessFile> && file, unique_ptr<RandomAccessFile> && file,
uint64_t size, uint64_t size,
unique_ptr<TableReader>* table_reader) { unique_ptr<TableReader>* table_reader) {
@ -201,12 +202,10 @@ Status SimpleTableReader::Open(const Options& options,
int num_entries = (size - Rep::offset_length - index_start_offset) int num_entries = (size - Rep::offset_length - index_start_offset)
/ (Rep::GetInternalKeyLength() + Rep::offset_length); / (Rep::GetInternalKeyLength() + Rep::offset_length);
SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(soptions, SimpleTableReader::Rep* rep = new SimpleTableReader::Rep(
index_start_offset, ioptions, env_options, index_start_offset, num_entries);
num_entries);
rep->file = std::move(file); rep->file = std::move(file);
rep->options = options;
table_reader->reset(new SimpleTableReader(rep)); table_reader->reset(new SimpleTableReader(rep));
} }
return s; return s;
@ -248,7 +247,7 @@ Status SimpleTableReader::GetOffset(const Slice& target, uint64_t* offset) {
return s; return s;
} }
InternalKeyComparator ikc(rep_->options.comparator); InternalKeyComparator ikc(rep_->ioptions.comparator);
int compare_result = ikc.Compare(tmp_slice, target); int compare_result = ikc.Compare(tmp_slice, target);
if (compare_result < 0) { if (compare_result < 0) {
@ -382,7 +381,7 @@ void SimpleTableIterator::Prev() {
} }
Slice SimpleTableIterator::key() const { Slice SimpleTableIterator::key() const {
Log(table_->rep_->options.info_log, "key!!!!"); Log(table_->rep_->ioptions.info_log, "key!!!!");
return key_; return key_;
} }
@ -401,7 +400,7 @@ public:
// caller to close the file after calling Finish(). The output file // caller to close the file after calling Finish(). The output file
// will be part of level specified by 'level'. A value of -1 means // will be part of level specified by 'level'. A value of -1 means
// that the caller does not know which level the output file will reside. // that the caller does not know which level the output file will reside.
SimpleTableBuilder(const Options& options, WritableFile* file, SimpleTableBuilder(const ImmutableCFOptions& ioptions, WritableFile* file,
CompressionType compression_type); CompressionType compression_type);
// REQUIRES: Either Finish() or Abandon() has been called. // REQUIRES: Either Finish() or Abandon() has been called.
@ -444,7 +443,7 @@ private:
}; };
struct SimpleTableBuilder::Rep { struct SimpleTableBuilder::Rep {
Options options; const ImmutableCFOptions& ioptions;
WritableFile* file; WritableFile* file;
uint64_t offset = 0; uint64_t offset = 0;
Status status; Status status;
@ -463,17 +462,17 @@ struct SimpleTableBuilder::Rep {
std::string index; std::string index;
Rep(const Options& opt, WritableFile* f) : Rep(const ImmutableCFOptions& iopt, WritableFile* f) :
options(opt), file(f) { ioptions(iopt), file(f) {
} }
~Rep() { ~Rep() {
} }
}; };
SimpleTableBuilder::SimpleTableBuilder(const Options& options, SimpleTableBuilder::SimpleTableBuilder(const ImmutableCFOptions& ioptions,
WritableFile* file, WritableFile* file,
CompressionType compression_type) : CompressionType compression_type) :
rep_(new SimpleTableBuilder::Rep(options, file)) { rep_(new SimpleTableBuilder::Rep(ioptions, file)) {
} }
SimpleTableBuilder::~SimpleTableBuilder() { SimpleTableBuilder::~SimpleTableBuilder() {
@ -546,31 +545,45 @@ public:
const char* Name() const override { const char* Name() const override {
return "SimpleTable"; return "SimpleTable";
} }
Status NewTableReader(const Options& options, const EnvOptions& soptions, Status NewTableReader(const ImmutableCFOptions& ioptions,
const EnvOptions& env_options,
const InternalKeyComparator& internal_key, const InternalKeyComparator& internal_key,
unique_ptr<RandomAccessFile>&& file, uint64_t file_size, unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
unique_ptr<TableReader>* table_reader) const; unique_ptr<TableReader>* table_reader) const;
TableBuilder* NewTableBuilder(const Options& options, TableBuilder* NewTableBuilder(
const ImmutableCFOptions& ioptions,
const InternalKeyComparator& internal_key, const InternalKeyComparator& internal_key,
WritableFile* file, WritableFile* file,
CompressionType compression_type) const; const CompressionType compression_type,
const CompressionOptions& compression_opts) const;
// This table factory performs no DB-level option validation: db_opts is not
// inspected and Status::OK() is always returned.
virtual Status SanitizeDBOptions(const DBOptions* db_opts) const override {
return Status::OK();
}
// This table factory reports no printable table options: the result is
// always an empty string.
virtual std::string GetPrintableTableOptions() const override {
return std::string();
}
}; };
Status SimpleTableFactory::NewTableReader( Status SimpleTableFactory::NewTableReader(
const Options& options, const EnvOptions& soptions, const ImmutableCFOptions& ioptions,
const EnvOptions& env_options,
const InternalKeyComparator& internal_key, const InternalKeyComparator& internal_key,
unique_ptr<RandomAccessFile>&& file, uint64_t file_size, unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
unique_ptr<TableReader>* table_reader) const { unique_ptr<TableReader>* table_reader) const {
return SimpleTableReader::Open(options, soptions, std::move(file), file_size, return SimpleTableReader::Open(ioptions, env_options, std::move(file),
table_reader); file_size, table_reader);
} }
TableBuilder* SimpleTableFactory::NewTableBuilder( TableBuilder* SimpleTableFactory::NewTableBuilder(
const Options& options, const InternalKeyComparator& internal_key, const ImmutableCFOptions& ioptions,
WritableFile* file, CompressionType compression_type) const { const InternalKeyComparator& internal_key,
return new SimpleTableBuilder(options, file, compression_type); WritableFile* file, const CompressionType compression_type,
const CompressionOptions& compression_opts) const {
return new SimpleTableBuilder(ioptions, file, compression_type);
} }
class SimpleTableDBTest { class SimpleTableDBTest {

@ -71,7 +71,7 @@ class SnapshotList {
} }
// get the sequence number of the most recent snapshot // get the sequence number of the most recent snapshot
const SequenceNumber GetNewest() { SequenceNumber GetNewest() {
if (empty()) { if (empty()) {
return 0; return 0;
} }

@ -36,12 +36,10 @@ static Slice GetSliceForFileNumber(const uint64_t* file_number) {
sizeof(*file_number)); sizeof(*file_number));
} }
TableCache::TableCache(const Options* options, TableCache::TableCache(const ImmutableCFOptions& ioptions,
const EnvOptions& storage_options, Cache* const cache) const EnvOptions& env_options, Cache* const cache)
: env_(options->env), : ioptions_(ioptions),
db_paths_(options->db_paths), env_options_(env_options),
options_(options),
storage_options_(storage_options),
cache_(cache) {} cache_(cache) {}
TableCache::~TableCache() { TableCache::~TableCache() {
@ -55,7 +53,7 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) {
cache_->Release(handle); cache_->Release(handle);
} }
Status TableCache::FindTable(const EnvOptions& toptions, Status TableCache::FindTable(const EnvOptions& env_options,
const InternalKeyComparator& internal_comparator, const InternalKeyComparator& internal_comparator,
const FileDescriptor& fd, Cache::Handle** handle, const FileDescriptor& fd, Cache::Handle** handle,
const bool no_io) { const bool no_io) {
@ -68,24 +66,24 @@ Status TableCache::FindTable(const EnvOptions& toptions,
return Status::Incomplete("Table not found in table_cache, no_io is set"); return Status::Incomplete("Table not found in table_cache, no_io is set");
} }
std::string fname = std::string fname =
TableFileName(db_paths_, fd.GetNumber(), fd.GetPathId()); TableFileName(ioptions_.db_paths, fd.GetNumber(), fd.GetPathId());
unique_ptr<RandomAccessFile> file; unique_ptr<RandomAccessFile> file;
unique_ptr<TableReader> table_reader; unique_ptr<TableReader> table_reader;
s = env_->NewRandomAccessFile(fname, &file, toptions); s = ioptions_.env->NewRandomAccessFile(fname, &file, env_options);
RecordTick(options_->statistics.get(), NO_FILE_OPENS); RecordTick(ioptions_.statistics, NO_FILE_OPENS);
if (s.ok()) { if (s.ok()) {
if (options_->advise_random_on_open) { if (ioptions_.advise_random_on_open) {
file->Hint(RandomAccessFile::RANDOM); file->Hint(RandomAccessFile::RANDOM);
} }
StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS); StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS);
s = options_->table_factory->NewTableReader( s = ioptions_.table_factory->NewTableReader(
*options_, toptions, internal_comparator, std::move(file), ioptions_, env_options, internal_comparator, std::move(file),
fd.GetFileSize(), &table_reader); fd.GetFileSize(), &table_reader);
} }
if (!s.ok()) { if (!s.ok()) {
assert(table_reader == nullptr); assert(table_reader == nullptr);
RecordTick(options_->statistics.get(), NO_FILE_ERRORS); RecordTick(ioptions_.statistics, NO_FILE_ERRORS);
// We do not cache error results so that if the error is transient, // We do not cache error results so that if the error is transient,
// or somebody repairs the file, we recover automatically. // or somebody repairs the file, we recover automatically.
} else { } else {
@ -97,7 +95,7 @@ Status TableCache::FindTable(const EnvOptions& toptions,
} }
Iterator* TableCache::NewIterator(const ReadOptions& options, Iterator* TableCache::NewIterator(const ReadOptions& options,
const EnvOptions& toptions, const EnvOptions& env_options,
const InternalKeyComparator& icomparator, const InternalKeyComparator& icomparator,
const FileDescriptor& fd, const FileDescriptor& fd,
TableReader** table_reader_ptr, TableReader** table_reader_ptr,
@ -109,7 +107,7 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
Cache::Handle* handle = nullptr; Cache::Handle* handle = nullptr;
Status s; Status s;
if (table_reader == nullptr) { if (table_reader == nullptr) {
s = FindTable(toptions, icomparator, fd, &handle, s = FindTable(env_options, icomparator, fd, &handle,
options.read_tier == kBlockCacheTier); options.read_tier == kBlockCacheTier);
if (!s.ok()) { if (!s.ok()) {
return NewErrorIterator(s, arena); return NewErrorIterator(s, arena);
@ -142,7 +140,7 @@ Status TableCache::Get(const ReadOptions& options,
Status s; Status s;
Cache::Handle* handle = nullptr; Cache::Handle* handle = nullptr;
if (!t) { if (!t) {
s = FindTable(storage_options_, internal_comparator, fd, &handle, s = FindTable(env_options_, internal_comparator, fd, &handle,
options.read_tier == kBlockCacheTier); options.read_tier == kBlockCacheTier);
if (s.ok()) { if (s.ok()) {
t = GetTableReaderFromHandle(handle); t = GetTableReaderFromHandle(handle);
@ -160,8 +158,9 @@ Status TableCache::Get(const ReadOptions& options,
} }
return s; return s;
} }
Status TableCache::GetTableProperties( Status TableCache::GetTableProperties(
const EnvOptions& toptions, const EnvOptions& env_options,
const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, const InternalKeyComparator& internal_comparator, const FileDescriptor& fd,
std::shared_ptr<const TableProperties>* properties, bool no_io) { std::shared_ptr<const TableProperties>* properties, bool no_io) {
Status s; Status s;
@ -174,7 +173,7 @@ Status TableCache::GetTableProperties(
} }
Cache::Handle* table_handle = nullptr; Cache::Handle* table_handle = nullptr;
s = FindTable(toptions, internal_comparator, fd, &table_handle, no_io); s = FindTable(env_options, internal_comparator, fd, &table_handle, no_io);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
@ -186,7 +185,7 @@ Status TableCache::GetTableProperties(
} }
size_t TableCache::GetMemoryUsageByTableReader( size_t TableCache::GetMemoryUsageByTableReader(
const EnvOptions& toptions, const EnvOptions& env_options,
const InternalKeyComparator& internal_comparator, const InternalKeyComparator& internal_comparator,
const FileDescriptor& fd) { const FileDescriptor& fd) {
Status s; Status s;
@ -197,7 +196,7 @@ size_t TableCache::GetMemoryUsageByTableReader(
} }
Cache::Handle* table_handle = nullptr; Cache::Handle* table_handle = nullptr;
s = FindTable(toptions, internal_comparator, fd, &table_handle, true); s = FindTable(env_options, internal_comparator, fd, &table_handle, true);
if (!s.ok()) { if (!s.ok()) {
return 0; return 0;
} }

@ -19,6 +19,7 @@
#include "rocksdb/cache.h" #include "rocksdb/cache.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/table.h" #include "rocksdb/table.h"
#include "rocksdb/options.h"
#include "table/table_reader.h" #include "table/table_reader.h"
namespace rocksdb { namespace rocksdb {
@ -29,8 +30,8 @@ struct FileDescriptor;
class TableCache { class TableCache {
public: public:
TableCache(const Options* options, const EnvOptions& storage_options, TableCache(const ImmutableCFOptions& ioptions,
Cache* cache); const EnvOptions& storage_options, Cache* cache);
~TableCache(); ~TableCache();
// Return an iterator for the specified file number (the corresponding // Return an iterator for the specified file number (the corresponding
@ -91,10 +92,8 @@ class TableCache {
void ReleaseHandle(Cache::Handle* handle); void ReleaseHandle(Cache::Handle* handle);
private: private:
Env* const env_; const ImmutableCFOptions& ioptions_;
const std::vector<DbPath> db_paths_; const EnvOptions& env_options_;
const Options* options_;
const EnvOptions& storage_options_;
Cache* const cache_; Cache* const cache_;
}; };

@ -11,6 +11,7 @@
#include "db/dbformat.h" #include "db/dbformat.h"
#include "db/table_properties_collector.h" #include "db/table_properties_collector.h"
#include "rocksdb/table.h" #include "rocksdb/table.h"
#include "rocksdb/immutable_options.h"
#include "table/block_based_table_factory.h" #include "table/block_based_table_factory.h"
#include "table/meta_blocks.h" #include "table/meta_blocks.h"
#include "table/plain_table_factory.h" #include "table/plain_table_factory.h"
@ -85,12 +86,14 @@ class DumbLogger : public Logger {
// Utilities test functions // Utilities test functions
namespace { namespace {
void MakeBuilder(const Options& options, void MakeBuilder(const Options& options,
const ImmutableCFOptions& ioptions,
const InternalKeyComparator& internal_comparator, const InternalKeyComparator& internal_comparator,
std::unique_ptr<FakeWritableFile>* writable, std::unique_ptr<FakeWritableFile>* writable,
std::unique_ptr<TableBuilder>* builder) { std::unique_ptr<TableBuilder>* builder) {
writable->reset(new FakeWritableFile); writable->reset(new FakeWritableFile);
builder->reset(options.table_factory->NewTableBuilder( builder->reset(ioptions.table_factory->NewTableBuilder(
options, internal_comparator, writable->get(), options.compression)); ioptions, internal_comparator, writable->get(),
options.compression, options.compression_opts));
} }
} // namespace } // namespace
@ -153,7 +156,8 @@ void TestCustomizedTablePropertiesCollector(
// -- Step 1: build table // -- Step 1: build table
std::unique_ptr<TableBuilder> builder; std::unique_ptr<TableBuilder> builder;
std::unique_ptr<FakeWritableFile> writable; std::unique_ptr<FakeWritableFile> writable;
MakeBuilder(options, internal_comparator, &writable, &builder); const ImmutableCFOptions ioptions(options);
MakeBuilder(options, ioptions, internal_comparator, &writable, &builder);
for (const auto& kv : kvs) { for (const auto& kv : kvs) {
if (encode_as_internal) { if (encode_as_internal) {
@ -257,16 +261,17 @@ void TestInternalKeyPropertiesCollector(
// SanitizeOptions(). // SanitizeOptions().
options.info_log = std::make_shared<DumbLogger>(); options.info_log = std::make_shared<DumbLogger>();
options = SanitizeOptions("db", // just a place holder options = SanitizeOptions("db", // just a place holder
&pikc, nullptr, // don't care filter policy &pikc,
options); options);
options.comparator = comparator; options.comparator = comparator;
} else { } else {
options.table_properties_collector_factories = { options.table_properties_collector_factories = {
std::make_shared<InternalKeyPropertiesCollectorFactory>()}; std::make_shared<InternalKeyPropertiesCollectorFactory>()};
} }
const ImmutableCFOptions ioptions(options);
for (int iter = 0; iter < 2; ++iter) { for (int iter = 0; iter < 2; ++iter) {
MakeBuilder(options, pikc, &writable, &builder); MakeBuilder(options, ioptions, pikc, &writable, &builder);
for (const auto& k : keys) { for (const auto& k : keys) {
builder->Add(k.Encode(), "val"); builder->Add(k.Encode(), "val");
} }

@ -163,13 +163,13 @@ class VersionEdit {
// Add the specified file at the specified number. // Add the specified file at the specified number.
// REQUIRES: This version has not been saved (see VersionSet::SaveTo) // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
// REQUIRES: "smallest" and "largest" are smallest and largest keys in file // REQUIRES: "smallest" and "largest" are smallest and largest keys in file
void AddFile(int level, uint64_t file, uint64_t file_size, void AddFile(int level, uint64_t file, uint64_t file_path_id,
uint64_t file_path_id, const InternalKey& smallest, uint64_t file_size, const InternalKey& smallest,
const InternalKey& largest, const SequenceNumber& smallest_seqno, const InternalKey& largest, const SequenceNumber& smallest_seqno,
const SequenceNumber& largest_seqno) { const SequenceNumber& largest_seqno) {
assert(smallest_seqno <= largest_seqno); assert(smallest_seqno <= largest_seqno);
FileMetaData f; FileMetaData f;
f.fd = FileDescriptor(file, file_size, file_path_id); f.fd = FileDescriptor(file, file_path_id, file_size);
f.smallest = smallest; f.smallest = smallest;
f.largest = largest; f.largest = largest;
f.smallest_seqno = smallest_seqno; f.smallest_seqno = smallest_seqno;

@ -9,7 +9,10 @@
#include "db/version_set.h" #include "db/version_set.h"
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h> #include <inttypes.h>
#include <algorithm> #include <algorithm>
#include <map> #include <map>
@ -509,9 +512,9 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
const FileMetaData* file_meta, const FileMetaData* file_meta,
const std::string* fname) { const std::string* fname) {
auto table_cache = cfd_->table_cache(); auto table_cache = cfd_->table_cache();
auto options = cfd_->options(); auto ioptions = cfd_->ioptions();
Status s = table_cache->GetTableProperties( Status s = table_cache->GetTableProperties(
vset_->storage_options_, cfd_->internal_comparator(), file_meta->fd, vset_->env_options_, cfd_->internal_comparator(), file_meta->fd,
tp, true /* no io */); tp, true /* no io */);
if (s.ok()) { if (s.ok()) {
return s; return s;
@ -527,13 +530,13 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
// directly from the properties block in the file. // directly from the properties block in the file.
std::unique_ptr<RandomAccessFile> file; std::unique_ptr<RandomAccessFile> file;
if (fname != nullptr) { if (fname != nullptr) {
s = options->env->NewRandomAccessFile( s = ioptions->env->NewRandomAccessFile(
*fname, &file, vset_->storage_options_); *fname, &file, vset_->env_options_);
} else { } else {
s = options->env->NewRandomAccessFile( s = ioptions->env->NewRandomAccessFile(
TableFileName(vset_->options_->db_paths, file_meta->fd.GetNumber(), TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(),
file_meta->fd.GetPathId()), file_meta->fd.GetPathId()),
&file, vset_->storage_options_); &file, vset_->env_options_);
} }
if (!s.ok()) { if (!s.ok()) {
return s; return s;
@ -545,11 +548,11 @@ Status Version::GetTableProperties(std::shared_ptr<const TableProperties>* tp,
s = ReadTableProperties( s = ReadTableProperties(
file.get(), file_meta->fd.GetFileSize(), file.get(), file_meta->fd.GetFileSize(),
Footer::kInvalidTableMagicNumber /* table's magic number */, Footer::kInvalidTableMagicNumber /* table's magic number */,
vset_->env_, options->info_log.get(), &raw_table_properties); vset_->env_, ioptions->info_log, &raw_table_properties);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
RecordTick(options->statistics.get(), NUMBER_DIRECT_LOAD_TABLE_PROPERTIES); RecordTick(ioptions->statistics, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES);
*tp = std::shared_ptr<const TableProperties>(raw_table_properties); *tp = std::shared_ptr<const TableProperties>(raw_table_properties);
return s; return s;
@ -559,7 +562,7 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) {
for (int level = 0; level < num_levels_; level++) { for (int level = 0; level < num_levels_; level++) {
for (const auto& file_meta : files_[level]) { for (const auto& file_meta : files_[level]) {
auto fname = auto fname =
TableFileName(vset_->options_->db_paths, file_meta->fd.GetNumber(), TableFileName(vset_->db_options_->db_paths, file_meta->fd.GetNumber(),
file_meta->fd.GetPathId()); file_meta->fd.GetPathId());
// 1. If the table is already present in table cache, load table // 1. If the table is already present in table cache, load table
// properties from there. // properties from there.
@ -581,7 +584,7 @@ size_t Version::GetMemoryUsageByTableReaders() {
for (auto& file_level : file_levels_) { for (auto& file_level : file_levels_) {
for (size_t i = 0; i < file_level.num_files; i++) { for (size_t i = 0; i < file_level.num_files; i++) {
total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader( total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader(
vset_->storage_options_, cfd_->internal_comparator(), vset_->env_options_, cfd_->internal_comparator(),
file_level.files[i].fd); file_level.files[i].fd);
} }
} }
@ -596,31 +599,6 @@ uint64_t Version::GetEstimatedActiveKeys() {
return num_non_deletions_ - num_deletions_; return num_non_deletions_ - num_deletions_;
} }
void Version::AddIterators(const ReadOptions& read_options,
const EnvOptions& soptions,
std::vector<Iterator*>* iters) {
// Merge all level zero files together since they may overlap
for (size_t i = 0; i < file_levels_[0].num_files; i++) {
const auto& file = file_levels_[0].files[i];
iters->push_back(cfd_->table_cache()->NewIterator(
read_options, soptions, cfd_->internal_comparator(), file.fd));
}
// For levels > 0, we can use a concatenating iterator that sequentially
// walks through the non-overlapping files in the level, opening them
// lazily.
for (int level = 1; level < num_levels_; level++) {
if (file_levels_[level].num_files != 0) {
iters->push_back(NewTwoLevelIterator(new LevelFileIteratorState(
cfd_->table_cache(), read_options, soptions,
cfd_->internal_comparator(), false /* for_compaction */,
cfd_->options()->prefix_extractor != nullptr),
new LevelFileNumIterator(cfd_->internal_comparator(),
&file_levels_[level])));
}
}
}
void Version::AddIterators(const ReadOptions& read_options, void Version::AddIterators(const ReadOptions& read_options,
const EnvOptions& soptions, const EnvOptions& soptions,
MergeIteratorBuilder* merge_iter_builder) { MergeIteratorBuilder* merge_iter_builder) {
@ -641,7 +619,7 @@ void Version::AddIterators(const ReadOptions& read_options,
new LevelFileIteratorState( new LevelFileIteratorState(
cfd_->table_cache(), read_options, soptions, cfd_->table_cache(), read_options, soptions,
cfd_->internal_comparator(), false /* for_compaction */, cfd_->internal_comparator(), false /* for_compaction */,
cfd_->options()->prefix_extractor != nullptr), cfd_->ioptions()->prefix_extractor != nullptr),
new LevelFileNumIterator(cfd_->internal_comparator(), new LevelFileNumIterator(cfd_->internal_comparator(),
&file_levels_[level]), merge_iter_builder->GetArena())); &file_levels_[level]), merge_iter_builder->GetArena()));
} }
@ -757,10 +735,10 @@ Version::Version(ColumnFamilyData* cfd, VersionSet* vset,
(cfd == nullptr) ? nullptr : internal_comparator_->user_comparator()), (cfd == nullptr) ? nullptr : internal_comparator_->user_comparator()),
table_cache_((cfd == nullptr) ? nullptr : cfd->table_cache()), table_cache_((cfd == nullptr) ? nullptr : cfd->table_cache()),
merge_operator_((cfd == nullptr) ? nullptr merge_operator_((cfd == nullptr) ? nullptr
: cfd->options()->merge_operator.get()), : cfd->ioptions()->merge_operator),
info_log_((cfd == nullptr) ? nullptr : cfd->options()->info_log.get()), info_log_((cfd == nullptr) ? nullptr : cfd->ioptions()->info_log),
db_statistics_((cfd == nullptr) ? nullptr db_statistics_((cfd == nullptr) ? nullptr
: cfd->options()->statistics.get()), : cfd->ioptions()->statistics),
// cfd is nullptr if Version is dummy // cfd is nullptr if Version is dummy
num_levels_(cfd == nullptr ? 0 : cfd->NumberLevels()), num_levels_(cfd == nullptr ? 0 : cfd->NumberLevels()),
num_non_empty_levels_(num_levels_), num_non_empty_levels_(num_levels_),
@ -886,7 +864,7 @@ bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) {
Status s = GetTableProperties(&tp, file_meta); Status s = GetTableProperties(&tp, file_meta);
file_meta->init_stats_from_file = true; file_meta->init_stats_from_file = true;
if (!s.ok()) { if (!s.ok()) {
Log(vset_->options_->info_log, Log(vset_->db_options_->info_log,
"Unable to load table properties for file %" PRIu64 " --- %s\n", "Unable to load table properties for file %" PRIu64 " --- %s\n",
file_meta->fd.GetNumber(), s.ToString().c_str()); file_meta->fd.GetNumber(), s.ToString().c_str());
return false; return false;
@ -969,7 +947,7 @@ void Version::ComputeCompactionScore(
numfiles++; numfiles++;
} }
} }
if (cfd_->options()->compaction_style == kCompactionStyleFIFO) { if (cfd_->ioptions()->compaction_style == kCompactionStyleFIFO) {
score = static_cast<double>(total_size) / score = static_cast<double>(total_size) /
cfd_->options()->compaction_options_fifo.max_table_files_size; cfd_->options()->compaction_options_fifo.max_table_files_size;
} else if (numfiles >= cfd_->options()->level0_stop_writes_trigger) { } else if (numfiles >= cfd_->options()->level0_stop_writes_trigger) {
@ -1038,8 +1016,8 @@ void Version::UpdateNumNonEmptyLevels() {
} }
void Version::UpdateFilesBySize() { void Version::UpdateFilesBySize() {
if (cfd_->options()->compaction_style == kCompactionStyleFIFO || if (cfd_->ioptions()->compaction_style == kCompactionStyleFIFO ||
cfd_->options()->compaction_style == kCompactionStyleUniversal) { cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
// don't need this // don't need this
return; return;
} }
@ -1699,7 +1677,7 @@ class VersionSet::Builder {
for (auto& file_meta : *(levels_[level].added_files)) { for (auto& file_meta : *(levels_[level].added_files)) {
assert (!file_meta->table_reader_handle); assert (!file_meta->table_reader_handle);
cfd_->table_cache()->FindTable( cfd_->table_cache()->FindTable(
base_->vset_->storage_options_, cfd_->internal_comparator(), base_->vset_->env_options_, cfd_->internal_comparator(),
file_meta->fd, &file_meta->table_reader_handle, false); file_meta->fd, &file_meta->table_reader_handle, false);
if (file_meta->table_reader_handle != nullptr) { if (file_meta->table_reader_handle != nullptr) {
// Load table_reader // Load table_reader
@ -1727,13 +1705,14 @@ class VersionSet::Builder {
} }
}; };
VersionSet::VersionSet(const std::string& dbname, const DBOptions* options, VersionSet::VersionSet(const std::string& dbname, const DBOptions* db_options,
const EnvOptions& storage_options, Cache* table_cache) const EnvOptions& env_options, Cache* table_cache,
: column_family_set_(new ColumnFamilySet(dbname, options, storage_options, WriteController* write_controller)
table_cache)), : column_family_set_(new ColumnFamilySet(dbname, db_options, env_options,
env_(options->env), table_cache, write_controller)),
env_(db_options->env),
dbname_(dbname), dbname_(dbname),
options_(options), db_options_(db_options),
next_file_number_(2), next_file_number_(2),
manifest_file_number_(0), // Filled by Recover() manifest_file_number_(0), // Filled by Recover()
pending_manifest_file_number_(0), pending_manifest_file_number_(0),
@ -1741,8 +1720,8 @@ VersionSet::VersionSet(const std::string& dbname, const DBOptions* options,
prev_log_number_(0), prev_log_number_(0),
current_version_number_(0), current_version_number_(0),
manifest_file_size_(0), manifest_file_size_(0),
storage_options_(storage_options), env_options_(env_options),
storage_options_compactions_(storage_options_) {} env_options_compactions_(env_options_) {}
VersionSet::~VersionSet() { VersionSet::~VersionSet() {
// we need to delete column_family_set_ because its destructor depends on // we need to delete column_family_set_ because its destructor depends on
@ -1844,7 +1823,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
assert(pending_manifest_file_number_ == 0); assert(pending_manifest_file_number_ == 0);
if (!descriptor_log_ || if (!descriptor_log_ ||
manifest_file_size_ > options_->max_manifest_file_size) { manifest_file_size_ > db_options_->max_manifest_file_size) {
pending_manifest_file_number_ = NewFileNumber(); pending_manifest_file_number_ = NewFileNumber();
batch_edits.back()->SetNextFile(next_file_number_); batch_edits.back()->SetNextFile(next_file_number_);
new_descriptor_log = true; new_descriptor_log = true;
@ -1872,7 +1851,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
mu->Unlock(); mu->Unlock();
if (!edit->IsColumnFamilyManipulation() && options_->max_open_files == -1) { if (!edit->IsColumnFamilyManipulation() &&
db_options_->max_open_files == -1) {
// unlimited table cache. Pre-load table handle now. // unlimited table cache. Pre-load table handle now.
// Need to do it out of the mutex. // Need to do it out of the mutex.
builder->LoadTableHandlers(); builder->LoadTableHandlers();
@ -1882,15 +1862,15 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
// only one thread can be here at the same time // only one thread can be here at the same time
if (new_descriptor_log) { if (new_descriptor_log) {
// create manifest file // create manifest file
Log(options_->info_log, Log(db_options_->info_log,
"Creating manifest %" PRIu64 "\n", pending_manifest_file_number_); "Creating manifest %" PRIu64 "\n", pending_manifest_file_number_);
unique_ptr<WritableFile> descriptor_file; unique_ptr<WritableFile> descriptor_file;
s = env_->NewWritableFile( s = env_->NewWritableFile(
DescriptorFileName(dbname_, pending_manifest_file_number_), DescriptorFileName(dbname_, pending_manifest_file_number_),
&descriptor_file, env_->OptimizeForManifestWrite(storage_options_)); &descriptor_file, env_->OptimizeForManifestWrite(env_options_));
if (s.ok()) { if (s.ok()) {
descriptor_file->SetPreallocationBlockSize( descriptor_file->SetPreallocationBlockSize(
options_->manifest_preallocation_size); db_options_->manifest_preallocation_size);
descriptor_log_.reset(new log::Writer(std::move(descriptor_file))); descriptor_log_.reset(new log::Writer(std::move(descriptor_file)));
s = WriteSnapshot(descriptor_log_.get()); s = WriteSnapshot(descriptor_log_.get());
} }
@ -1911,19 +1891,20 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
break; break;
} }
} }
if (s.ok()) { if (s.ok() && db_options_->disableDataSync == false) {
if (options_->use_fsync) { if (db_options_->use_fsync) {
StopWatch sw(env_, options_->statistics.get(), StopWatch sw(env_, db_options_->statistics.get(),
MANIFEST_FILE_SYNC_MICROS); MANIFEST_FILE_SYNC_MICROS);
s = descriptor_log_->file()->Fsync(); s = descriptor_log_->file()->Fsync();
} else { } else {
StopWatch sw(env_, options_->statistics.get(), StopWatch sw(env_, db_options_->statistics.get(),
MANIFEST_FILE_SYNC_MICROS); MANIFEST_FILE_SYNC_MICROS);
s = descriptor_log_->file()->Sync(); s = descriptor_log_->file()->Sync();
} }
} }
if (!s.ok()) { if (!s.ok()) {
Log(options_->info_log, "MANIFEST write: %s\n", s.ToString().c_str()); Log(db_options_->info_log, "MANIFEST write: %s\n",
s.ToString().c_str());
bool all_records_in = true; bool all_records_in = true;
for (auto& e : batch_edits) { for (auto& e : batch_edits) {
std::string record; std::string record;
@ -1934,7 +1915,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
} }
} }
if (all_records_in) { if (all_records_in) {
Log(options_->info_log, Log(db_options_->info_log,
"MANIFEST contains log record despite error; advancing to new " "MANIFEST contains log record despite error; advancing to new "
"version to prevent mismatch between in-memory and logged state" "version to prevent mismatch between in-memory and logged state"
" If paranoid is set, then the db is now in readonly mode."); " If paranoid is set, then the db is now in readonly mode.");
@ -1947,10 +1928,10 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
// new CURRENT file that points to it. // new CURRENT file that points to it.
if (s.ok() && new_descriptor_log) { if (s.ok() && new_descriptor_log) {
s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_, s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_,
db_directory); db_options_->disableDataSync ? nullptr : db_directory);
if (s.ok() && pending_manifest_file_number_ > manifest_file_number_) { if (s.ok() && pending_manifest_file_number_ > manifest_file_number_) {
// delete old manifest file // delete old manifest file
Log(options_->info_log, Log(db_options_->info_log,
"Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n", "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n",
manifest_file_number_, pending_manifest_file_number_); manifest_file_number_, pending_manifest_file_number_);
// we don't care about an error here, PurgeObsoleteFiles will take care // we don't care about an error here, PurgeObsoleteFiles will take care
@ -1964,7 +1945,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
new_manifest_file_size = descriptor_log_->file()->GetFileSize(); new_manifest_file_size = descriptor_log_->file()->GetFileSize();
} }
LogFlush(options_->info_log); LogFlush(db_options_->info_log);
mu->Lock(); mu->Lock();
} }
@ -2000,12 +1981,12 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
manifest_file_size_ = new_manifest_file_size; manifest_file_size_ = new_manifest_file_size;
prev_log_number_ = edit->prev_log_number_; prev_log_number_ = edit->prev_log_number_;
} else { } else {
Log(options_->info_log, "Error in committing version %lu to [%s]", Log(db_options_->info_log, "Error in committing version %lu to [%s]",
(unsigned long)v->GetVersionNumber(), (unsigned long)v->GetVersionNumber(),
column_family_data->GetName().c_str()); column_family_data->GetName().c_str());
delete v; delete v;
if (new_descriptor_log) { if (new_descriptor_log) {
Log(options_->info_log, Log(db_options_->info_log,
"Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n", "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n",
manifest_file_number_, pending_manifest_file_number_); manifest_file_number_, pending_manifest_file_number_);
descriptor_log_.reset(); descriptor_log_.reset();
@ -2097,13 +2078,13 @@ Status VersionSet::Recover(
return Status::Corruption("CURRENT file corrupted"); return Status::Corruption("CURRENT file corrupted");
} }
Log(options_->info_log, "Recovering from manifest file: %s\n", Log(db_options_->info_log, "Recovering from manifest file: %s\n",
manifest_filename.c_str()); manifest_filename.c_str());
manifest_filename = dbname_ + "/" + manifest_filename; manifest_filename = dbname_ + "/" + manifest_filename;
unique_ptr<SequentialFile> manifest_file; unique_ptr<SequentialFile> manifest_file;
s = env_->NewSequentialFile(manifest_filename, &manifest_file, s = env_->NewSequentialFile(manifest_filename, &manifest_file,
storage_options_); env_options_);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
@ -2230,7 +2211,7 @@ Status VersionSet::Recover(
if (cfd != nullptr) { if (cfd != nullptr) {
if (edit.has_log_number_) { if (edit.has_log_number_) {
if (cfd->GetLogNumber() > edit.log_number_) { if (cfd->GetLogNumber() > edit.log_number_) {
Log(options_->info_log, Log(db_options_->info_log,
"MANIFEST corruption detected, but ignored - Log numbers in " "MANIFEST corruption detected, but ignored - Log numbers in "
"records NOT monotonically increasing"); "records NOT monotonically increasing");
} else { } else {
@ -2306,7 +2287,7 @@ Status VersionSet::Recover(
assert(builders_iter != builders.end()); assert(builders_iter != builders.end());
auto builder = builders_iter->second; auto builder = builders_iter->second;
if (options_->max_open_files == -1) { if (db_options_->max_open_files == -1) {
// unlimited table cache. Pre-load table handle now. // unlimited table cache. Pre-load table handle now.
// Need to do it out of the mutex. // Need to do it out of the mutex.
builder->LoadTableHandlers(); builder->LoadTableHandlers();
@ -2327,7 +2308,7 @@ Status VersionSet::Recover(
last_sequence_ = last_sequence; last_sequence_ = last_sequence;
prev_log_number_ = prev_log_number; prev_log_number_ = prev_log_number;
Log(options_->info_log, Log(db_options_->info_log,
"Recovered from manifest file:%s succeeded," "Recovered from manifest file:%s succeeded,"
"manifest_file_number is %lu, next_file_number is %lu, " "manifest_file_number is %lu, next_file_number is %lu, "
"last_sequence is %lu, log_number is %lu," "last_sequence is %lu, log_number is %lu,"
@ -2339,7 +2320,7 @@ Status VersionSet::Recover(
column_family_set_->GetMaxColumnFamily()); column_family_set_->GetMaxColumnFamily());
for (auto cfd : *column_family_set_) { for (auto cfd : *column_family_set_) {
Log(options_->info_log, Log(db_options_->info_log,
"Column family [%s] (ID %u), log number is %" PRIu64 "\n", "Column family [%s] (ID %u), log number is %" PRIu64 "\n",
cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber()); cfd->GetName().c_str(), cfd->GetID(), cfd->GetLogNumber());
} }
@ -2422,7 +2403,7 @@ Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
const Options* options, const Options* options,
const EnvOptions& storage_options, const EnvOptions& env_options,
int new_levels) { int new_levels) {
if (new_levels <= 1) { if (new_levels <= 1) {
return Status::InvalidArgument( return Status::InvalidArgument(
@ -2433,7 +2414,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
std::shared_ptr<Cache> tc(NewLRUCache( std::shared_ptr<Cache> tc(NewLRUCache(
options->max_open_files - 10, options->table_cache_numshardbits, options->max_open_files - 10, options->table_cache_numshardbits,
options->table_cache_remove_scan_count_limit)); options->table_cache_remove_scan_count_limit));
VersionSet versions(dbname, options, storage_options, tc.get()); WriteController wc;
VersionSet versions(dbname, options, env_options, tc.get(), &wc);
Status status; Status status;
std::vector<ColumnFamilyDescriptor> dummy; std::vector<ColumnFamilyDescriptor> dummy;
@ -2504,7 +2486,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
bool verbose, bool hex) { bool verbose, bool hex) {
// Open the specified manifest file. // Open the specified manifest file.
unique_ptr<SequentialFile> file; unique_ptr<SequentialFile> file;
Status s = options.env->NewSequentialFile(dscname, &file, storage_options_); Status s = options.env->NewSequentialFile(dscname, &file, env_options_);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
@ -2746,12 +2728,12 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_number,
const std::string& record) const { const std::string& record) const {
std::string fname = std::string fname =
DescriptorFileName(dbname_, manifest_file_number); DescriptorFileName(dbname_, manifest_file_number);
Log(options_->info_log, "ManifestContains: checking %s\n", fname.c_str()); Log(db_options_->info_log, "ManifestContains: checking %s\n", fname.c_str());
unique_ptr<SequentialFile> file; unique_ptr<SequentialFile> file;
Status s = env_->NewSequentialFile(fname, &file, storage_options_); Status s = env_->NewSequentialFile(fname, &file, env_options_);
if (!s.ok()) { if (!s.ok()) {
Log(options_->info_log, "ManifestContains: %s\n", s.ToString().c_str()); Log(db_options_->info_log, "ManifestContains: %s\n", s.ToString().c_str());
Log(options_->info_log, Log(db_options_->info_log,
"ManifestContains: is unable to reopen the manifest file %s", "ManifestContains: is unable to reopen the manifest file %s",
fname.c_str()); fname.c_str());
return false; return false;
@ -2766,7 +2748,7 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_number,
break; break;
} }
} }
Log(options_->info_log, "ManifestContains: result = %d\n", result ? 1 : 0); Log(db_options_->info_log, "ManifestContains: result = %d\n", result ? 1 : 0);
return result; return result;
} }
@ -2794,7 +2776,7 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) {
// approximate offset of "ikey" within the table. // approximate offset of "ikey" within the table.
TableReader* table_reader_ptr; TableReader* table_reader_ptr;
Iterator* iter = v->cfd_->table_cache()->NewIterator( Iterator* iter = v->cfd_->table_cache()->NewIterator(
ReadOptions(), storage_options_, v->cfd_->internal_comparator(), ReadOptions(), env_options_, v->cfd_->internal_comparator(),
files[i]->fd, &table_reader_ptr); files[i]->fd, &table_reader_ptr);
if (table_reader_ptr != nullptr) { if (table_reader_ptr != nullptr) {
result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode()); result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode());
@ -2856,14 +2838,14 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) {
const FileLevel* flevel = c->input_levels(which); const FileLevel* flevel = c->input_levels(which);
for (size_t i = 0; i < flevel->num_files; i++) { for (size_t i = 0; i < flevel->num_files; i++) {
list[num++] = cfd->table_cache()->NewIterator( list[num++] = cfd->table_cache()->NewIterator(
read_options, storage_options_compactions_, read_options, env_options_compactions_,
cfd->internal_comparator(), flevel->files[i].fd, nullptr, cfd->internal_comparator(), flevel->files[i].fd, nullptr,
true /* for compaction */); true /* for compaction */);
} }
} else { } else {
// Create concatenating iterator for the files from this level // Create concatenating iterator for the files from this level
list[num++] = NewTwoLevelIterator(new Version::LevelFileIteratorState( list[num++] = NewTwoLevelIterator(new Version::LevelFileIteratorState(
cfd->table_cache(), read_options, storage_options_, cfd->table_cache(), read_options, env_options_,
cfd->internal_comparator(), true /* for_compaction */, cfd->internal_comparator(), true /* for_compaction */,
false /* prefix enabled */), false /* prefix enabled */),
new Version::LevelFileNumIterator(cfd->internal_comparator(), new Version::LevelFileNumIterator(cfd->internal_comparator(),
@ -2884,7 +2866,7 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) {
#ifndef NDEBUG #ifndef NDEBUG
Version* version = c->column_family_data()->current(); Version* version = c->column_family_data()->current();
if (c->input_version() != version) { if (c->input_version() != version) {
Log(options_->info_log, Log(db_options_->info_log,
"[%s] VerifyCompactionFileConsistency version mismatch", "[%s] VerifyCompactionFileConsistency version mismatch",
c->column_family_data()->GetName().c_str()); c->column_family_data()->GetName().c_str());
} }
@ -2955,11 +2937,11 @@ void VersionSet::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
LiveFileMetaData filemetadata; LiveFileMetaData filemetadata;
filemetadata.column_family_name = cfd->GetName(); filemetadata.column_family_name = cfd->GetName();
uint32_t path_id = file->fd.GetPathId(); uint32_t path_id = file->fd.GetPathId();
if (path_id < options_->db_paths.size()) { if (path_id < db_options_->db_paths.size()) {
filemetadata.db_path = options_->db_paths[path_id].path; filemetadata.db_path = db_options_->db_paths[path_id].path;
} else { } else {
assert(!options_->db_paths.empty()); assert(!db_options_->db_paths.empty());
filemetadata.db_path = options_->db_paths.back().path; filemetadata.db_path = db_options_->db_paths.back().path;
} }
filemetadata.name = MakeTableFileName("", file->fd.GetNumber()); filemetadata.name = MakeTableFileName("", file->fd.GetNumber());
filemetadata.level = level; filemetadata.level = level;
@ -2980,17 +2962,21 @@ void VersionSet::GetObsoleteFiles(std::vector<FileMetaData*>* files) {
} }
ColumnFamilyData* VersionSet::CreateColumnFamily( ColumnFamilyData* VersionSet::CreateColumnFamily(
const ColumnFamilyOptions& options, VersionEdit* edit) { const ColumnFamilyOptions& cf_options, VersionEdit* edit) {
assert(edit->is_column_family_add_); assert(edit->is_column_family_add_);
Version* dummy_versions = new Version(nullptr, this); Version* dummy_versions = new Version(nullptr, this);
auto new_cfd = column_family_set_->CreateColumnFamily( auto new_cfd = column_family_set_->CreateColumnFamily(
edit->column_family_name_, edit->column_family_, dummy_versions, options); edit->column_family_name_, edit->column_family_, dummy_versions,
cf_options);
Version* v = new Version(new_cfd, this, current_version_number_++); Version* v = new Version(new_cfd, this, current_version_number_++);
AppendVersion(new_cfd, v); AppendVersion(new_cfd, v);
new_cfd->CreateNewMemtable(); // GetLatestMutableCFOptions() is safe here without mutex since the
// cfd is not available to client
new_cfd->CreateNewMemtable(MemTableOptions(
*new_cfd->GetLatestMutableCFOptions(), *new_cfd->options()));
new_cfd->SetLogNumber(edit->log_number_); new_cfd->SetLogNumber(edit->log_number_);
return new_cfd; return new_cfd;
} }

@ -34,6 +34,7 @@
#include "db/column_family.h" #include "db/column_family.h"
#include "db/log_reader.h" #include "db/log_reader.h"
#include "db/file_indexer.h" #include "db/file_indexer.h"
#include "db/write_controller.h"
namespace rocksdb { namespace rocksdb {
@ -86,8 +87,6 @@ class Version {
// Append to *iters a sequence of iterators that will // Append to *iters a sequence of iterators that will
// yield the contents of this Version when merged together. // yield the contents of this Version when merged together.
// REQUIRES: This version has been saved (see VersionSet::SaveTo) // REQUIRES: This version has been saved (see VersionSet::SaveTo)
void AddIterators(const ReadOptions&, const EnvOptions& soptions,
std::vector<Iterator*>* iters);
void AddIterators(const ReadOptions&, const EnvOptions& soptions, void AddIterators(const ReadOptions&, const EnvOptions& soptions,
MergeIteratorBuilder* merger_iter_builder); MergeIteratorBuilder* merger_iter_builder);
@ -257,7 +256,7 @@ class Version {
class LevelFileNumIterator; class LevelFileNumIterator;
class LevelFileIteratorState; class LevelFileIteratorState;
bool PrefixMayMatch(const ReadOptions& options, Iterator* level_iter, bool PrefixMayMatch(const ReadOptions& read_options, Iterator* level_iter,
const Slice& internal_prefix) const; const Slice& internal_prefix) const;
// Update num_non_empty_levels_. // Update num_non_empty_levels_.
@ -323,8 +322,8 @@ class Version {
// These are used to pick the best compaction level // These are used to pick the best compaction level
std::vector<double> compaction_score_; std::vector<double> compaction_score_;
std::vector<int> compaction_level_; std::vector<int> compaction_level_;
double max_compaction_score_; // max score in l1 to ln-1 double max_compaction_score_ = 0.0; // max score in l1 to ln-1
int max_compaction_score_level_; // level on which max score occurs int max_compaction_score_level_ = 0; // level on which max score occurs
// A version number that uniquely represents this version. This is // A version number that uniquely represents this version. This is
// used for debugging and logging purposes only. // used for debugging and logging purposes only.
@ -358,8 +357,9 @@ class Version {
class VersionSet { class VersionSet {
public: public:
VersionSet(const std::string& dbname, const DBOptions* options, VersionSet(const std::string& dbname, const DBOptions* db_options,
const EnvOptions& storage_options, Cache* table_cache); const EnvOptions& env_options, Cache* table_cache,
WriteController* write_controller);
~VersionSet(); ~VersionSet();
// Apply *edit to the current version to form a new descriptor that // Apply *edit to the current version to form a new descriptor that
@ -397,7 +397,7 @@ class VersionSet {
// among [4-6] contains files. // among [4-6] contains files.
static Status ReduceNumberOfLevels(const std::string& dbname, static Status ReduceNumberOfLevels(const std::string& dbname,
const Options* options, const Options* options,
const EnvOptions& storage_options, const EnvOptions& env_options,
int new_levels); int new_levels);
// printf contents (for debugging) // printf contents (for debugging)
@ -506,14 +506,14 @@ class VersionSet {
bool ManifestContains(uint64_t manifest_file_number, bool ManifestContains(uint64_t manifest_file_number,
const std::string& record) const; const std::string& record) const;
ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& options, ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
VersionEdit* edit); VersionEdit* edit);
std::unique_ptr<ColumnFamilySet> column_family_set_; std::unique_ptr<ColumnFamilySet> column_family_set_;
Env* const env_; Env* const env_;
const std::string dbname_; const std::string dbname_;
const DBOptions* const options_; const DBOptions* const db_options_;
uint64_t next_file_number_; uint64_t next_file_number_;
uint64_t manifest_file_number_; uint64_t manifest_file_number_;
uint64_t pending_manifest_file_number_; uint64_t pending_manifest_file_number_;
@ -534,12 +534,12 @@ class VersionSet {
std::vector<FileMetaData*> obsolete_files_; std::vector<FileMetaData*> obsolete_files_;
// storage options for all reads and writes except compactions // env options for all reads and writes except compactions
const EnvOptions& storage_options_; const EnvOptions& env_options_;
// storage options used for compactions. This is a copy of // env options used for compactions. This is a copy of
// storage_options_ but with readaheads set to readahead_compactions_. // env_options_ but with readaheads set to readahead_compactions_.
const EnvOptions storage_options_compactions_; const EnvOptions env_options_compactions_;
// No copying allowed // No copying allowed
VersionSet(const VersionSet&); VersionSet(const VersionSet&);

@ -23,10 +23,10 @@
// data: uint8[len] // data: uint8[len]
#include "rocksdb/write_batch.h" #include "rocksdb/write_batch.h"
#include "rocksdb/options.h"
#include "rocksdb/merge_operator.h" #include "rocksdb/merge_operator.h"
#include "db/dbformat.h" #include "db/dbformat.h"
#include "db/db_impl.h" #include "db/db_impl.h"
#include "db/column_family.h"
#include "db/memtable.h" #include "db/memtable.h"
#include "db/snapshot.h" #include "db/snapshot.h"
#include "db/write_batch_internal.h" #include "db/write_batch_internal.h"
@ -80,6 +80,58 @@ int WriteBatch::Count() const {
return WriteBatchInternal::Count(this); return WriteBatchInternal::Count(this);
} }
// Decodes one record from the front of *input.
//
// The leading tag byte is stored in *tag and removed from *input, then the
// fields that belong to that record type are decoded from *input:
//   * kTypeColumnFamilyValue / kTypeValue       -> *key and *value
//   * kTypeColumnFamilyDeletion / kTypeDeletion -> *key
//   * kTypeColumnFamilyMerge / kTypeMerge       -> *key and *value
//   * kTypeLogData                              -> *blob
// For the kTypeColumnFamily* variants a varint32 column family id is decoded
// into *column_family first; for all other types *column_family is set to 0.
//
// Returns Status::Corruption() if *input is empty, if any field fails to
// decode, or if the tag is not one of the types above; Status::OK() otherwise.
//
// REQUIRES: key != nullptr && value != nullptr.
// REQUIRES: blob != nullptr when the record is a kTypeLogData record.
Status ReadRecordFromWriteBatch(Slice* input, char* tag,
                                uint32_t* column_family, Slice* key,
                                Slice* value, Slice* blob) {
  assert(key != nullptr && value != nullptr);
  if (input->empty()) {
    // There is no tag byte to read; indexing an empty Slice would read past
    // the end of the buffer in builds where asserts are compiled out.
    return Status::Corruption("bad WriteBatch record (empty input)");
  }
  *tag = (*input)[0];
  input->remove_prefix(1);
  *column_family = 0;  // default
  switch (*tag) {
    case kTypeColumnFamilyValue:
      if (!GetVarint32(input, column_family)) {
        return Status::Corruption("bad WriteBatch Put");
      }
    // intentional fallthrough
    case kTypeValue:
      if (!GetLengthPrefixedSlice(input, key) ||
          !GetLengthPrefixedSlice(input, value)) {
        return Status::Corruption("bad WriteBatch Put");
      }
      break;
    case kTypeColumnFamilyDeletion:
      if (!GetVarint32(input, column_family)) {
        return Status::Corruption("bad WriteBatch Delete");
      }
    // intentional fallthrough
    case kTypeDeletion:
      if (!GetLengthPrefixedSlice(input, key)) {
        return Status::Corruption("bad WriteBatch Delete");
      }
      break;
    case kTypeColumnFamilyMerge:
      if (!GetVarint32(input, column_family)) {
        return Status::Corruption("bad WriteBatch Merge");
      }
    // intentional fallthrough
    case kTypeMerge:
      if (!GetLengthPrefixedSlice(input, key) ||
          !GetLengthPrefixedSlice(input, value)) {
        return Status::Corruption("bad WriteBatch Merge");
      }
      break;
    case kTypeLogData:
      // Only log-data records need the blob output parameter.
      assert(blob != nullptr);
      if (!GetLengthPrefixedSlice(input, blob)) {
        return Status::Corruption("bad WriteBatch Blob");
      }
      break;
    default:
      return Status::Corruption("unknown WriteBatch tag");
  }
  return Status::OK();
}
Status WriteBatch::Iterate(Handler* handler) const { Status WriteBatch::Iterate(Handler* handler) const {
Slice input(rep_); Slice input(rep_);
if (input.size() < kHeader) { if (input.size() < kHeader) {
@ -91,57 +143,33 @@ Status WriteBatch::Iterate(Handler* handler) const {
int found = 0; int found = 0;
Status s; Status s;
while (s.ok() && !input.empty() && handler->Continue()) { while (s.ok() && !input.empty() && handler->Continue()) {
char tag = input[0]; char tag = 0;
input.remove_prefix(1);
uint32_t column_family = 0; // default uint32_t column_family = 0; // default
s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
&blob);
if (!s.ok()) {
return s;
}
switch (tag) { switch (tag) {
case kTypeColumnFamilyValue: case kTypeColumnFamilyValue:
if (!GetVarint32(&input, &column_family)) {
return Status::Corruption("bad WriteBatch Put");
}
// intentional fallthrough
case kTypeValue: case kTypeValue:
if (GetLengthPrefixedSlice(&input, &key) &&
GetLengthPrefixedSlice(&input, &value)) {
s = handler->PutCF(column_family, key, value); s = handler->PutCF(column_family, key, value);
found++; found++;
} else {
return Status::Corruption("bad WriteBatch Put");
}
break; break;
case kTypeColumnFamilyDeletion: case kTypeColumnFamilyDeletion:
if (!GetVarint32(&input, &column_family)) {
return Status::Corruption("bad WriteBatch Delete");
}
// intentional fallthrough
case kTypeDeletion: case kTypeDeletion:
if (GetLengthPrefixedSlice(&input, &key)) {
s = handler->DeleteCF(column_family, key); s = handler->DeleteCF(column_family, key);
found++; found++;
} else {
return Status::Corruption("bad WriteBatch Delete");
}
break; break;
case kTypeColumnFamilyMerge: case kTypeColumnFamilyMerge:
if (!GetVarint32(&input, &column_family)) {
return Status::Corruption("bad WriteBatch Merge");
}
// intentional fallthrough
case kTypeMerge: case kTypeMerge:
if (GetLengthPrefixedSlice(&input, &key) &&
GetLengthPrefixedSlice(&input, &value)) {
s = handler->MergeCF(column_family, key, value); s = handler->MergeCF(column_family, key, value);
found++; found++;
} else {
return Status::Corruption("bad WriteBatch Merge");
}
break; break;
case kTypeLogData: case kTypeLogData:
if (GetLengthPrefixedSlice(&input, &blob)) {
handler->LogData(blob); handler->LogData(blob);
} else {
return Status::Corruption("bad WriteBatch Blob");
}
break; break;
default: default:
return Status::Corruption("unknown WriteBatch tag"); return Status::Corruption("unknown WriteBatch tag");
@ -186,17 +214,6 @@ void WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
PutLengthPrefixedSlice(&b->rep_, value); PutLengthPrefixedSlice(&b->rep_, value);
} }
namespace {
inline uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
uint32_t column_family_id = 0;
if (column_family != nullptr) {
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
column_family_id = cfh->GetID();
}
return column_family_id;
}
} // namespace
void WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key, void WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) { const Slice& value) {
WriteBatchInternal::Put(this, GetColumnFamilyID(column_family), key, value); WriteBatchInternal::Put(this, GetColumnFamilyID(column_family), key, value);
@ -281,17 +298,17 @@ class MemTableInserter : public WriteBatch::Handler {
public: public:
SequenceNumber sequence_; SequenceNumber sequence_;
ColumnFamilyMemTables* cf_mems_; ColumnFamilyMemTables* cf_mems_;
bool recovery_; bool ignore_missing_column_families_;
uint64_t log_number_; uint64_t log_number_;
DBImpl* db_; DBImpl* db_;
const bool dont_filter_deletes_; const bool dont_filter_deletes_;
MemTableInserter(SequenceNumber sequence, ColumnFamilyMemTables* cf_mems, MemTableInserter(SequenceNumber sequence, ColumnFamilyMemTables* cf_mems,
bool recovery, uint64_t log_number, DB* db, bool ignore_missing_column_families, uint64_t log_number,
const bool dont_filter_deletes) DB* db, const bool dont_filter_deletes)
: sequence_(sequence), : sequence_(sequence),
cf_mems_(cf_mems), cf_mems_(cf_mems),
recovery_(recovery), ignore_missing_column_families_(ignore_missing_column_families),
log_number_(log_number), log_number_(log_number),
db_(reinterpret_cast<DBImpl*>(db)), db_(reinterpret_cast<DBImpl*>(db)),
dont_filter_deletes_(dont_filter_deletes) { dont_filter_deletes_(dont_filter_deletes) {
@ -303,12 +320,18 @@ class MemTableInserter : public WriteBatch::Handler {
bool SeekToColumnFamily(uint32_t column_family_id, Status* s) { bool SeekToColumnFamily(uint32_t column_family_id, Status* s) {
bool found = cf_mems_->Seek(column_family_id); bool found = cf_mems_->Seek(column_family_id);
if (recovery_ && (!found || log_number_ < cf_mems_->GetLogNumber())) { if (!found) {
// if in recovery envoronment: if (ignore_missing_column_families_) {
// * If column family was not found, it might mean that the WAL write *s = Status::OK();
// batch references to the column family that was dropped after the } else {
// insert. We don't want to fail the whole write batch in that case -- we *s = Status::InvalidArgument(
// just ignore the update. "Invalid column family specified in write batch");
}
return false;
}
if (log_number_ != 0 && log_number_ < cf_mems_->GetLogNumber()) {
// This is true only in recovery environment (log_number_ is always 0 in
// non-recovery, regular write code-path)
// * If log_number_ < cf_mems_->GetLogNumber(), this means that column // * If log_number_ < cf_mems_->GetLogNumber(), this means that column
// family already contains updates from this log. We can't apply updates // family already contains updates from this log. We can't apply updates
// twice because of update-in-place or merge workloads -- ignore the // twice because of update-in-place or merge workloads -- ignore the
@ -316,18 +339,8 @@ class MemTableInserter : public WriteBatch::Handler {
*s = Status::OK(); *s = Status::OK();
return false; return false;
} }
if (!found) {
assert(!recovery_);
// If the column family was not found in non-recovery enviornment
// (client's write code-path), we have to fail the write and return
// the failure status to the client.
*s = Status::InvalidArgument(
"Invalid column family specified in write batch");
return false;
}
return true; return true;
} }
virtual Status PutCF(uint32_t column_family_id, const Slice& key, virtual Status PutCF(uint32_t column_family_id, const Slice& key,
const Slice& value) { const Slice& value) {
Status seek_status; Status seek_status;
@ -336,14 +349,15 @@ class MemTableInserter : public WriteBatch::Handler {
return seek_status; return seek_status;
} }
MemTable* mem = cf_mems_->GetMemTable(); MemTable* mem = cf_mems_->GetMemTable();
const Options* options = cf_mems_->GetOptions(); auto* ioptions = mem->GetImmutableOptions();
if (!options->inplace_update_support) { auto* moptions = mem->GetMemTableOptions();
if (!moptions->inplace_update_support) {
mem->Add(sequence_, kTypeValue, key, value); mem->Add(sequence_, kTypeValue, key, value);
} else if (options->inplace_callback == nullptr) { } else if (moptions->inplace_callback == nullptr) {
mem->Update(sequence_, key, value); mem->Update(sequence_, key, value);
RecordTick(options->statistics.get(), NUMBER_KEYS_UPDATED); RecordTick(ioptions->statistics, NUMBER_KEYS_UPDATED);
} else { } else {
if (mem->UpdateCallback(sequence_, key, value, *options)) { if (mem->UpdateCallback(sequence_, key, value)) {
} else { } else {
// key not found in memtable. Do sst get, update, add // key not found in memtable. Do sst get, update, add
SnapshotImpl read_from_snapshot; SnapshotImpl read_from_snapshot;
@ -362,17 +376,17 @@ class MemTableInserter : public WriteBatch::Handler {
char* prev_buffer = const_cast<char*>(prev_value.c_str()); char* prev_buffer = const_cast<char*>(prev_value.c_str());
uint32_t prev_size = prev_value.size(); uint32_t prev_size = prev_value.size();
auto status = options->inplace_callback(s.ok() ? prev_buffer : nullptr, auto status = moptions->inplace_callback(s.ok() ? prev_buffer : nullptr,
s.ok() ? &prev_size : nullptr, s.ok() ? &prev_size : nullptr,
value, &merged_value); value, &merged_value);
if (status == UpdateStatus::UPDATED_INPLACE) { if (status == UpdateStatus::UPDATED_INPLACE) {
// prev_value is updated in-place with final value. // prev_value is updated in-place with final value.
mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size)); mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size));
RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN); RecordTick(ioptions->statistics, NUMBER_KEYS_WRITTEN);
} else if (status == UpdateStatus::UPDATED) { } else if (status == UpdateStatus::UPDATED) {
// merged_value contains the final value. // merged_value contains the final value.
mem->Add(sequence_, kTypeValue, key, Slice(merged_value)); mem->Add(sequence_, kTypeValue, key, Slice(merged_value));
RecordTick(options->statistics.get(), NUMBER_KEYS_WRITTEN); RecordTick(ioptions->statistics, NUMBER_KEYS_WRITTEN);
} }
} }
} }
@ -380,6 +394,7 @@ class MemTableInserter : public WriteBatch::Handler {
// sequence number. Even if the update eventually fails and does not result // sequence number. Even if the update eventually fails and does not result
// in memtable add/update. // in memtable add/update.
sequence_++; sequence_++;
cf_mems_->CheckMemtableFull();
return Status::OK(); return Status::OK();
} }
@ -391,17 +406,18 @@ class MemTableInserter : public WriteBatch::Handler {
return seek_status; return seek_status;
} }
MemTable* mem = cf_mems_->GetMemTable(); MemTable* mem = cf_mems_->GetMemTable();
const Options* options = cf_mems_->GetOptions(); auto* ioptions = mem->GetImmutableOptions();
auto* moptions = mem->GetMemTableOptions();
bool perform_merge = false; bool perform_merge = false;
if (options->max_successive_merges > 0 && db_ != nullptr) { if (moptions->max_successive_merges > 0 && db_ != nullptr) {
LookupKey lkey(key, sequence_); LookupKey lkey(key, sequence_);
// Count the number of successive merges at the head // Count the number of successive merges at the head
// of the key in the memtable // of the key in the memtable
size_t num_merges = mem->CountSuccessiveMergeEntries(lkey); size_t num_merges = mem->CountSuccessiveMergeEntries(lkey);
if (num_merges >= options->max_successive_merges) { if (num_merges >= moptions->max_successive_merges) {
perform_merge = true; perform_merge = true;
} }
} }
@ -425,16 +441,16 @@ class MemTableInserter : public WriteBatch::Handler {
Slice get_value_slice = Slice(get_value); Slice get_value_slice = Slice(get_value);
// 2) Apply this merge // 2) Apply this merge
auto merge_operator = options->merge_operator.get(); auto merge_operator = ioptions->merge_operator;
assert(merge_operator); assert(merge_operator);
std::deque<std::string> operands; std::deque<std::string> operands;
operands.push_front(value.ToString()); operands.push_front(value.ToString());
std::string new_value; std::string new_value;
if (!merge_operator->FullMerge(key, &get_value_slice, operands, if (!merge_operator->FullMerge(key, &get_value_slice, operands,
&new_value, options->info_log.get())) { &new_value, ioptions->info_log)) {
// Failed to merge! // Failed to merge!
RecordTick(options->statistics.get(), NUMBER_MERGE_FAILURES); RecordTick(ioptions->statistics, NUMBER_MERGE_FAILURES);
// Store the delta in memtable // Store the delta in memtable
perform_merge = false; perform_merge = false;
@ -450,6 +466,7 @@ class MemTableInserter : public WriteBatch::Handler {
} }
sequence_++; sequence_++;
cf_mems_->CheckMemtableFull();
return Status::OK(); return Status::OK();
} }
@ -460,8 +477,9 @@ class MemTableInserter : public WriteBatch::Handler {
return seek_status; return seek_status;
} }
MemTable* mem = cf_mems_->GetMemTable(); MemTable* mem = cf_mems_->GetMemTable();
const Options* options = cf_mems_->GetOptions(); auto* ioptions = mem->GetImmutableOptions();
if (!dont_filter_deletes_ && options->filter_deletes) { auto* moptions = mem->GetMemTableOptions();
if (!dont_filter_deletes_ && moptions->filter_deletes) {
SnapshotImpl read_from_snapshot; SnapshotImpl read_from_snapshot;
read_from_snapshot.number_ = sequence_; read_from_snapshot.number_ = sequence_;
ReadOptions ropts; ReadOptions ropts;
@ -472,12 +490,13 @@ class MemTableInserter : public WriteBatch::Handler {
cf_handle = db_->DefaultColumnFamily(); cf_handle = db_->DefaultColumnFamily();
} }
if (!db_->KeyMayExist(ropts, cf_handle, key, &value)) { if (!db_->KeyMayExist(ropts, cf_handle, key, &value)) {
RecordTick(options->statistics.get(), NUMBER_FILTERED_DELETES); RecordTick(ioptions->statistics, NUMBER_FILTERED_DELETES);
return Status::OK(); return Status::OK();
} }
} }
mem->Add(sequence_, kTypeDeletion, key, Slice()); mem->Add(sequence_, kTypeDeletion, key, Slice());
sequence_++; sequence_++;
cf_mems_->CheckMemtableFull();
return Status::OK(); return Status::OK();
} }
}; };
@ -485,10 +504,12 @@ class MemTableInserter : public WriteBatch::Handler {
Status WriteBatchInternal::InsertInto(const WriteBatch* b, Status WriteBatchInternal::InsertInto(const WriteBatch* b,
ColumnFamilyMemTables* memtables, ColumnFamilyMemTables* memtables,
bool recovery, uint64_t log_number, bool ignore_missing_column_families,
DB* db, const bool dont_filter_deletes) { uint64_t log_number, DB* db,
const bool dont_filter_deletes) {
MemTableInserter inserter(WriteBatchInternal::Sequence(b), memtables, MemTableInserter inserter(WriteBatchInternal::Sequence(b), memtables,
recovery, log_number, db, dont_filter_deletes); ignore_missing_column_families, log_number, db,
dont_filter_deletes);
return b->Iterate(&inserter); return b->Iterate(&inserter);
} }

@ -28,6 +28,7 @@ class ColumnFamilyMemTables {
virtual MemTable* GetMemTable() const = 0; virtual MemTable* GetMemTable() const = 0;
virtual const Options* GetOptions() const = 0; virtual const Options* GetOptions() const = 0;
virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0; virtual ColumnFamilyHandle* GetColumnFamilyHandle() = 0;
virtual void CheckMemtableFull() = 0;
}; };
class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables { class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables {
@ -54,6 +55,8 @@ class ColumnFamilyMemTablesDefault : public ColumnFamilyMemTables {
ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; } ColumnFamilyHandle* GetColumnFamilyHandle() override { return nullptr; }
void CheckMemtableFull() override {}
private: private:
bool ok_; bool ok_;
MemTable* mem_; MemTable* mem_;
@ -106,18 +109,18 @@ class WriteBatchInternal {
// Inserts batch entries into memtable // Inserts batch entries into memtable
// If dont_filter_deletes is false AND options.filter_deletes is true, // If dont_filter_deletes is false AND options.filter_deletes is true,
// then --> Drops deletes in batch if db->KeyMayExist returns false // then --> Drops deletes in batch if db->KeyMayExist returns false
// If recovery == true, this means InsertInto is executed on a recovery // If ignore_missing_column_families == true. WriteBatch referencing
// code-path. WriteBatch referencing a dropped column family can be // non-existing column family should be ignored.
// found on a recovery code-path and should be ignored (recovery should not // However, if ignore_missing_column_families == false, any WriteBatch
// fail). Additionally, the memtable will be updated only if // referencing non-existing column family will return a InvalidArgument()
// failure.
//
// If log_number is non-zero, the memtable will be updated only if
// memtables->GetLogNumber() >= log_number // memtables->GetLogNumber() >= log_number
// However, if recovery == false, any WriteBatch referencing
// non-existing column family will return a failure. Also, log_number is
// ignored in that case
static Status InsertInto(const WriteBatch* batch, static Status InsertInto(const WriteBatch* batch,
ColumnFamilyMemTables* memtables, ColumnFamilyMemTables* memtables,
bool recovery = false, uint64_t log_number = 0, bool ignore_missing_column_families = false,
DB* db = nullptr, uint64_t log_number = 0, DB* db = nullptr,
const bool dont_filter_deletes = true); const bool dont_filter_deletes = true);
static void Append(WriteBatch* dst, const WriteBatch* src); static void Append(WriteBatch* dst, const WriteBatch* src);

@ -15,8 +15,10 @@
#include "db/write_batch_internal.h" #include "db/write_batch_internal.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/memtablerep.h" #include "rocksdb/memtablerep.h"
#include "rocksdb/utilities/write_batch_with_index.h"
#include "util/logging.h" #include "util/logging.h"
#include "util/testharness.h" #include "util/testharness.h"
#include "util/scoped_arena_iterator.h"
namespace rocksdb { namespace rocksdb {
@ -25,13 +27,15 @@ static std::string PrintContents(WriteBatch* b) {
auto factory = std::make_shared<SkipListFactory>(); auto factory = std::make_shared<SkipListFactory>();
Options options; Options options;
options.memtable_factory = factory; options.memtable_factory = factory;
MemTable* mem = new MemTable(cmp, options); MemTable* mem = new MemTable(cmp, ImmutableCFOptions(options),
MemTableOptions(MutableCFOptions(options), options));
mem->Ref(); mem->Ref();
std::string state; std::string state;
ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); ColumnFamilyMemTablesDefault cf_mems_default(mem, &options);
Status s = WriteBatchInternal::InsertInto(b, &cf_mems_default); Status s = WriteBatchInternal::InsertInto(b, &cf_mems_default);
int count = 0; int count = 0;
Iterator* iter = mem->NewIterator(ReadOptions()); Arena arena;
ScopedArenaIterator iter(mem->NewIterator(ReadOptions(), &arena));
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ParsedInternalKey ikey; ParsedInternalKey ikey;
memset((void *)&ikey, 0, sizeof(ikey)); memset((void *)&ikey, 0, sizeof(ikey));
@ -66,7 +70,6 @@ static std::string PrintContents(WriteBatch* b) {
state.append("@"); state.append("@");
state.append(NumberToString(ikey.sequence)); state.append(NumberToString(ikey.sequence));
} }
delete iter;
if (!s.ok()) { if (!s.ok()) {
state.append(s.ToString()); state.append(s.ToString());
} else if (count != WriteBatchInternal::Count(b)) { } else if (count != WriteBatchInternal::Count(b)) {
@ -286,6 +289,9 @@ class ColumnFamilyHandleImplDummy : public ColumnFamilyHandleImpl {
explicit ColumnFamilyHandleImplDummy(int id) explicit ColumnFamilyHandleImplDummy(int id)
: ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {} : ColumnFamilyHandleImpl(nullptr, nullptr, nullptr), id_(id) {}
uint32_t GetID() const override { return id_; } uint32_t GetID() const override { return id_; }
const Comparator* user_comparator() const override {
return BytewiseComparator();
}
private: private:
uint32_t id_; uint32_t id_;
@ -316,6 +322,88 @@ TEST(WriteBatchTest, ColumnFamiliesBatchTest) {
handler.seen); handler.seen);
} }
// Writes a mix of Put/Delete/Merge records for several column families into
// a WriteBatchWithIndex, then checks (a) what per-column-family iterators
// return and (b) that the underlying WriteBatch replays the records in
// insertion order.
TEST(WriteBatchTest, ColumnFamiliesBatchWithIndexTest) {
  WriteBatchWithIndex batch;
  ColumnFamilyHandleImplDummy zero(0), two(2), three(3), eight(8);
  batch.Put(&zero, Slice("foo"), Slice("bar"));
  batch.Put(&two, Slice("twofoo"), Slice("bar2"));
  batch.Put(&eight, Slice("eightfoo"), Slice("bar8"));
  batch.Delete(&eight, Slice("eightfoo"));
  batch.Merge(&three, Slice("threethree"), Slice("3three"));
  batch.Put(&zero, Slice("foo"), Slice("bar"));
  batch.Merge(Slice("omom"), Slice("nom"));

  std::unique_ptr<WBWIIterator> iter;

  // Column family 8: expect the Put followed by the Delete of "eightfoo".
  iter.reset(batch.NewIterator(&eight));
  iter->Seek("eightfoo");
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
  ASSERT_EQ("eightfoo", iter->Entry().key.ToString());
  ASSERT_EQ("bar8", iter->Entry().value.ToString());

  iter->Next();
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ(WriteType::kDeleteRecord, iter->Entry().type);
  ASSERT_EQ("eightfoo", iter->Entry().key.ToString());

  iter->Next();
  ASSERT_OK(iter->status());
  ASSERT_TRUE(!iter->Valid());

  // Iterator created without a handle: seeking to "gggg" is expected to land
  // on the Merge of "omom", which is then the last entry.
  iter.reset(batch.NewIterator());
  iter->Seek("gggg");
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type);
  ASSERT_EQ("omom", iter->Entry().key.ToString());
  ASSERT_EQ("nom", iter->Entry().value.ToString());

  iter->Next();
  ASSERT_OK(iter->status());
  ASSERT_TRUE(!iter->Valid());

  // Handle with id 0: expect both Puts of "foo" and then the Merge of "omom"
  // that was issued without a column family handle.
  iter.reset(batch.NewIterator(&zero));
  iter->Seek("foo");
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
  ASSERT_EQ("foo", iter->Entry().key.ToString());
  ASSERT_EQ("bar", iter->Entry().value.ToString());

  iter->Next();
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ(WriteType::kPutRecord, iter->Entry().type);
  ASSERT_EQ("foo", iter->Entry().key.ToString());
  ASSERT_EQ("bar", iter->Entry().value.ToString());

  iter->Next();
  ASSERT_OK(iter->status());
  ASSERT_TRUE(iter->Valid());
  ASSERT_EQ(WriteType::kMergeRecord, iter->Entry().type);
  ASSERT_EQ("omom", iter->Entry().key.ToString());
  ASSERT_EQ("nom", iter->Entry().value.ToString());

  iter->Next();
  ASSERT_OK(iter->status());
  ASSERT_TRUE(!iter->Valid());

  // Replay the underlying WriteBatch. The returned Status is checked so that
  // a corrupted batch fails here instead of only showing up as a string
  // mismatch below.
  TestHandler handler;
  ASSERT_OK(batch.GetWriteBatch()->Iterate(&handler));
  ASSERT_EQ(
      "Put(foo, bar)"
      "PutCF(2, twofoo, bar2)"
      "PutCF(8, eightfoo, bar8)"
      "DeleteCF(8, eightfoo)"
      "MergeCF(3, threethree, 3three)"
      "Put(foo, bar)"
      "Merge(omom, nom)",
      handler.seen);
}
} // namespace rocksdb } // namespace rocksdb
int main(int argc, char** argv) { int main(int argc, char** argv) {

@ -0,0 +1,37 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include "db/write_controller.h"
#include <cassert>
namespace rocksdb {
// Records one more outstanding stop request and returns the token that
// represents it. The matching decrement happens in ~StopWriteToken().
std::unique_ptr<WriteControllerToken> WriteController::GetStopToken() {
  total_stopped_ += 1;
  std::unique_ptr<WriteControllerToken> token(new StopWriteToken(this));
  return token;
}
// Adds delay_us to the accumulated write delay and returns the token that
// represents this contribution. ~DelayWriteToken() subtracts it again.
std::unique_ptr<WriteControllerToken> WriteController::GetDelayToken(
    uint64_t delay_us) {
  total_delay_us_ = total_delay_us_ + delay_us;
  std::unique_ptr<WriteControllerToken> token(
      new DelayWriteToken(this, delay_us));
  return token;
}
// True while at least one stop token is outstanding.
bool WriteController::IsStopped() const {
  const bool has_stop_tokens = (0 < total_stopped_);
  return has_stop_tokens;
}
// Returns the sum (in microseconds) of the delays of all outstanding delay
// tokens.
uint64_t WriteController::GetDelay() const {
  return total_delay_us_;
}
// Releases this token's stop request on the owning controller.
StopWriteToken::~StopWriteToken() {
  int& outstanding_stops = controller_->total_stopped_;
  // Every stop token was counted when it was handed out, so the counter
  // cannot already be zero here.
  assert(outstanding_stops >= 1);
  outstanding_stops -= 1;
}
// Removes this token's contribution from the controller's accumulated delay.
DelayWriteToken::~DelayWriteToken() {
  uint64_t& accumulated_delay_us = controller_->total_delay_us_;
  // The total must still include the amount this token added.
  assert(accumulated_delay_us >= delay_us_);
  accumulated_delay_us -= delay_us_;
}
} // namespace rocksdb

@ -0,0 +1,78 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#include <stdint.h>
#include <memory>
namespace rocksdb {
class WriteControllerToken;
// WriteController controls write stalls in the write code-path. Write stalls
// happen when compaction can't keep up with the write rate.
// All of the methods here (including the WriteControllerToken destructors)
// need to be called while holding the DB mutex.
class WriteController {
 public:
  WriteController() = default;
  ~WriteController() = default;

  // When an actor (column family) requests a stop token, all writes will be
  // stopped until the stop token is released (deleted).
  std::unique_ptr<WriteControllerToken> GetStopToken();

  // When an actor (column family) requests a delay token, the total delay for
  // all writes is increased by delay_us. The delay lasts until the delay
  // token is released.
  std::unique_ptr<WriteControllerToken> GetDelayToken(uint64_t delay_us);

  // These two methods query the state of the WriteController.
  bool IsStopped() const;
  uint64_t GetDelay() const;

 private:
  // The token classes update the counters below directly.
  friend class WriteControllerToken;
  friend class StopWriteToken;
  friend class DelayWriteToken;

  // Number of stop tokens currently outstanding.
  int total_stopped_ = 0;
  // Sum of delay_us over all delay tokens currently outstanding.
  uint64_t total_delay_us_ = 0;
};
// Base class of the tokens handed out by WriteController. It only stores the
// controller pointer; derived classes undo their effect on the controller in
// their destructors.
class WriteControllerToken {
 public:
  explicit WriteControllerToken(WriteController* controller)
      : controller_(controller) {}

  virtual ~WriteControllerToken() = default;

 protected:
  // Controller this token was issued by; accessed by derived destructors.
  WriteController* controller_;

 private:
  // Tokens are not copyable.
  WriteControllerToken(const WriteControllerToken&) = delete;
  void operator=(const WriteControllerToken&) = delete;
};
class StopWriteToken : public WriteControllerToken {
public:
explicit StopWriteToken(WriteController* controller)
: WriteControllerToken(controller) {}
virtual ~StopWriteToken();
};
class DelayWriteToken : public WriteControllerToken {
public:
DelayWriteToken(WriteController* controller, uint64_t delay_us)
: WriteControllerToken(controller), delay_us_(delay_us) {}
virtual ~DelayWriteToken();
private:
uint64_t delay_us_;
};
} // namespace rocksdb

@ -0,0 +1,40 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#include "db/write_controller.h"
#include "util/testharness.h"
namespace rocksdb {
class WriteControllerTest {};
TEST(WriteControllerTest, SanityTest) {
WriteController controller;
auto stop_token_1 = controller.GetStopToken();
auto stop_token_2 = controller.GetStopToken();
ASSERT_EQ(true, controller.IsStopped());
stop_token_1.reset();
ASSERT_EQ(true, controller.IsStopped());
stop_token_2.reset();
ASSERT_EQ(false, controller.IsStopped());
auto delay_token_1 = controller.GetDelayToken(5);
ASSERT_EQ(static_cast<uint64_t>(5), controller.GetDelay());
auto delay_token_2 = controller.GetDelayToken(8);
ASSERT_EQ(static_cast<uint64_t>(13), controller.GetDelay());
delay_token_2.reset();
ASSERT_EQ(static_cast<uint64_t>(5), controller.GetDelay());
delay_token_1.reset();
ASSERT_EQ(static_cast<uint64_t>(0), controller.GetDelay());
delay_token_1.reset();
ASSERT_EQ(false, controller.IsStopped());
}
} // namespace rocksdb
// Test binary entry point: runs all registered tests and uses the harness's
// result as the process exit code. Command-line arguments are not used.
int main(int argc, char** argv) {
  const int exit_code = rocksdb::test::RunAllTests();
  return exit_code;
}

@ -0,0 +1,147 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include "db/write_thread.h"
namespace rocksdb {
// Appends "w" to the writer queue and blocks until one of these holds:
//   1. another writer has already completed w's job (w->done),
//   2. "w" has become the first writer in the queue, or
//   3. the wait timed out (only possible when expiration_time != 0).
// Returns Status::TimedOut() in case 3 (after removing "w" from the queue)
// and Status::OK() otherwise.
Status WriteThread::EnterWriteThread(WriteThread::Writer* w,
                                     uint64_t expiration_time) {
  writers_.push_back(w);

  bool gave_up = false;
  while (!w->done && w != writers_.front()) {
    if (expiration_time == 0) {
      // No deadline: plain wait.
      w->cv.Wait();
      continue;
    }
    if (!w->cv.TimedWait(expiration_time)) {
      // Woken up before the deadline; re-evaluate the loop condition.
      continue;
    }
    if (w->in_batch_group) {
      // The front writer is currently doing the write on behalf of this
      // "timed-out" writer, so it has to wait (without a deadline) until
      // that write completes.
      expiration_time = 0;
      continue;
    }
    gave_up = true;
    break;
  }

  if (!gave_up) {
    return Status::OK();
  }

  // Timed out: take "w" back out of the queue. It is expected to be there.
  auto pos = writers_.begin();
  while (pos != writers_.end() && *pos != w) {
    ++pos;
  }
  assert(pos != writers_.end());
  if (pos != writers_.end()) {
    writers_.erase(pos);
  }

  // writers_.front() might still be in cond_wait without a time-out.
  // It has to be signalled here; otherwise nobody else would wake it up and
  // RocksDB would hang.
  if (!writers_.empty()) {
    writers_.front()->cv.Signal();
  }
  return Status::TimedOut();
}
// Removes from the front of the queue every writer up to and including
// "last_writer". Each removed writer other than "w" itself receives "status",
// is marked done and is signalled. Afterwards the writer that ends up at the
// head of the queue (if any) is signalled.
void WriteThread::ExitWriteThread(WriteThread::Writer* w,
                                  WriteThread::Writer* last_writer,
                                  Status status) {
  bool reached_last = false;
  while (!reached_last && !writers_.empty()) {
    Writer* const head = writers_.front();
    writers_.pop_front();
    if (head != w) {
      // This writer's work was carried out for it; hand over the outcome
      // and wake it up.
      head->status = status;
      head->done = true;
      head->cv.Signal();
    }
    reached_last = (head == last_writer);
  }

  // Wake the new head of the write queue.
  if (!writers_.empty()) {
    writers_.front()->cv.Signal();
  }
}
// This function will be called only when the first writer succeeds.
// All writers in the to-be-built batch group will be processed.
//
// Starting with the writer at the head of the queue, appends the batches of
// the writers queued directly behind it to "write_batch_group" until a
// writer is found that must not share the group or the size cap is exceeded.
// Every follower that is added gets in_batch_group set, and "*last_writer"
// is left pointing at the last writer whose batch is part of the group.
//
// REQUIRES: Writer list must be non-empty
// REQUIRES: First writer must have a non-nullptr batch
void WriteThread::BuildBatchGroup(WriteThread::Writer** last_writer,
                                  autovector<WriteBatch*>* write_batch_group) {
  assert(!writers_.empty());
  Writer* const leader = writers_.front();
  assert(leader->batch != nullptr);

  size_t group_bytes = WriteBatchInternal::ByteSize(leader->batch);
  write_batch_group->push_back(leader->batch);
  *last_writer = leader;

  // The group may grow up to a fixed maximum. If the leading write is small
  // the cap is lowered so that the small write is not slowed down too much.
  size_t byte_limit = 1 << 20;
  if (group_bytes <= (128 << 10)) {
    byte_limit = group_bytes + (128 << 10);
  }

  // Index 0 is the leader itself, so candidates start at index 1.
  const size_t queued = writers_.size();
  for (size_t i = 1; i < queued; ++i) {
    Writer* const follower = writers_[i];

    const bool must_stay_separate =
        // A sync write must not ride in a batch handled by a non-sync write.
        (follower->sync && !leader->sync) ||
        // A write that needs the WAL must not join a batch with WAL disabled.
        (!follower->disableWAL && leader->disableWAL) ||
        // A write with a shorter timeout might have to be aborted instead
        // of executed, so it is not included.
        (follower->timeout_hint_us < leader->timeout_hint_us) ||
        // A nullptr batch means this is not a write at all; it runs alone.
        (follower->batch == nullptr);
    if (must_stay_separate) {
      break;
    }

    group_bytes += WriteBatchInternal::ByteSize(follower->batch);
    if (group_bytes > byte_limit) {
      // Adding this batch would make the group too big.
      break;
    }

    write_batch_group->push_back(follower->batch);
    follower->in_batch_group = true;
    *last_writer = follower;
  }
}
} // namespace rocksdb

@ -0,0 +1,80 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#include <stdint.h>
#include <deque>
#include <limits>
#include "rocksdb/status.h"
#include "db/write_batch_internal.h"
#include "util/autovector.h"
#include "port/port.h"
namespace rocksdb {
// WriteThread maintains the queue of pending writers. A writer enters the
// queue with EnterWriteThread, may have its batch grouped with others by
// BuildBatchGroup, and is removed again by ExitWriteThread.
class WriteThread {
public:
// Default value of Writer::timeout_hint_us.
static const uint64_t kNoTimeOut = std::numeric_limits<uint64_t>::max();
// Information kept for every waiting writer
struct Writer {
// Result of the write; filled in by ExitWriteThread for writers whose work
// was carried out by another writer.
Status status;
// Batch to write. nullptr marks a queue entry that is not a write; such an
// entry is never merged into a batch group.
WriteBatch* batch;
// sync and disableWAL are compared against the group leader's settings in
// BuildBatchGroup to decide whether this writer may join the group.
bool sync;
bool disableWAL;
// Set by BuildBatchGroup once this writer's batch has been added to a
// group. EnterWriteThread keeps waiting after a time-out while this is set.
bool in_batch_group;
// Set by ExitWriteThread when another writer has finished this writer's
// job.
bool done;
// Compared in BuildBatchGroup: a writer with a smaller value than the
// group leader's is not merged into the leader's group.
uint64_t timeout_hint_us;
// Condition variable the writer waits on inside EnterWriteThread.
port::CondVar cv;
// Creates a writer with no batch, all flags cleared and no time-out hint;
// "mu" is handed to the condition variable.
explicit Writer(port::Mutex* mu)
: batch(nullptr),
sync(false),
disableWAL(false),
in_batch_group(false),
done(false),
timeout_hint_us(kNoTimeOut),
cv(mu) {}
};
WriteThread() = default;
~WriteThread() = default;
// Before applying write operation (such as DBImpl::Write, DBImpl::Flush)
// thread should grab the mutex_ and be the first on writers queue.
// EnterWriteThread is used for it.
// Be aware! Writer's job can be done by other thread (see DBImpl::Write
// for examples), so check it via w.done before applying changes.
//
// Writer* w: writer to be placed in the queue
// uint64_t expiration_time: maximum time to be in the queue; 0 means wait
//                           without a time-out
// Returns Status::TimedOut() if the wait timed out (w is then removed from
// the queue again), Status::OK() otherwise.
// See also: ExitWriteThread
// REQUIRES: db mutex held
Status EnterWriteThread(Writer* w, uint64_t expiration_time);
// After doing write job, we need to remove already used writers from
// writers_ queue and notify head of the queue about it.
// ExitWriteThread is used for this.
//
// Writer* w: Writer, that was added by EnterWriteThread function
// Writer* last_writer: Since we can join a few Writers (as DBImpl::Write
//                      does) we should pass last_writer as a parameter to
//                      ExitWriteThread
//                      (if you don't touch other writers, just pass w)
// Status status: Status of write operation
// See also: EnterWriteThread
// REQUIRES: db mutex held
void ExitWriteThread(Writer* w, Writer* last_writer, Status status);
// Collects the batch of the first queued writer, followed by the batches of
// the compatible writers queued directly behind it, into write_batch_group.
//
// Writer** last_writer: set to the last writer whose batch was added
// autovector<WriteBatch*>* write_batch_group: receives the grouped batches
// REQUIRES: Writer list must be non-empty
// REQUIRES: First writer must have a non-nullptr batch
void BuildBatchGroup(Writer** last_writer,
autovector<WriteBatch*>* write_batch_group);
private:
// Queue of writers.
std::deque<Writer*> writers_;
};
} // namespace rocksdb

@ -75,6 +75,8 @@ typedef struct rocksdb_iterator_t rocksdb_iterator_t;
typedef struct rocksdb_logger_t rocksdb_logger_t; typedef struct rocksdb_logger_t rocksdb_logger_t;
typedef struct rocksdb_mergeoperator_t rocksdb_mergeoperator_t; typedef struct rocksdb_mergeoperator_t rocksdb_mergeoperator_t;
typedef struct rocksdb_options_t rocksdb_options_t; typedef struct rocksdb_options_t rocksdb_options_t;
typedef struct rocksdb_block_based_table_options_t
rocksdb_block_based_table_options_t;
typedef struct rocksdb_randomfile_t rocksdb_randomfile_t; typedef struct rocksdb_randomfile_t rocksdb_randomfile_t;
typedef struct rocksdb_readoptions_t rocksdb_readoptions_t; typedef struct rocksdb_readoptions_t rocksdb_readoptions_t;
typedef struct rocksdb_seqfile_t rocksdb_seqfile_t; typedef struct rocksdb_seqfile_t rocksdb_seqfile_t;
@ -346,6 +348,34 @@ extern void rocksdb_writebatch_iterate(
void (*deleted)(void*, const char* k, size_t klen)); void (*deleted)(void*, const char* k, size_t klen));
extern const char* rocksdb_writebatch_data(rocksdb_writebatch_t*, size_t *size); extern const char* rocksdb_writebatch_data(rocksdb_writebatch_t*, size_t *size);
/* Block based table options */
extern rocksdb_block_based_table_options_t*
rocksdb_block_based_options_create();
extern void rocksdb_block_based_options_destroy(
rocksdb_block_based_table_options_t* options);
extern void rocksdb_block_based_options_set_block_size(
rocksdb_block_based_table_options_t* options, size_t block_size);
extern void rocksdb_block_based_options_set_block_size_deviation(
rocksdb_block_based_table_options_t* options, int block_size_deviation);
extern void rocksdb_block_based_options_set_block_restart_interval(
rocksdb_block_based_table_options_t* options, int block_restart_interval);
extern void rocksdb_block_based_options_set_filter_policy(
rocksdb_block_based_table_options_t* options,
rocksdb_filterpolicy_t* filter_policy);
extern void rocksdb_block_based_options_set_no_block_cache(
rocksdb_block_based_table_options_t* options,
unsigned char no_block_cache);
extern void rocksdb_block_based_options_set_block_cache(
rocksdb_block_based_table_options_t* options, rocksdb_cache_t* block_cache);
extern void rocksdb_block_based_options_set_block_cache_compressed(
rocksdb_block_based_table_options_t* options,
rocksdb_cache_t* block_cache_compressed);
extern void rocksdb_block_based_options_set_whole_key_filtering(
rocksdb_block_based_table_options_t*, unsigned char);
extern void rocksdb_options_set_block_based_table_factory(
rocksdb_options_t *opt, rocksdb_block_based_table_options_t* table_options);
/* Options */ /* Options */
extern rocksdb_options_t* rocksdb_options_create(); extern rocksdb_options_t* rocksdb_options_create();
@ -353,7 +383,7 @@ extern void rocksdb_options_destroy(rocksdb_options_t*);
extern void rocksdb_options_increase_parallelism( extern void rocksdb_options_increase_parallelism(
rocksdb_options_t* opt, int total_threads); rocksdb_options_t* opt, int total_threads);
extern void rocksdb_options_optimize_for_point_lookup( extern void rocksdb_options_optimize_for_point_lookup(
rocksdb_options_t* opt); rocksdb_options_t* opt, uint64_t block_cache_size_mb);
extern void rocksdb_options_optimize_level_style_compaction( extern void rocksdb_options_optimize_level_style_compaction(
rocksdb_options_t* opt, uint64_t memtable_memory_budget); rocksdb_options_t* opt, uint64_t memtable_memory_budget);
extern void rocksdb_options_optimize_universal_style_compaction( extern void rocksdb_options_optimize_universal_style_compaction(
@ -376,9 +406,6 @@ extern void rocksdb_options_set_compression_per_level(
rocksdb_options_t* opt, rocksdb_options_t* opt,
int* level_values, int* level_values,
size_t num_levels); size_t num_levels);
extern void rocksdb_options_set_filter_policy(
rocksdb_options_t*,
rocksdb_filterpolicy_t*);
extern void rocksdb_options_set_create_if_missing( extern void rocksdb_options_set_create_if_missing(
rocksdb_options_t*, unsigned char); rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_create_missing_column_families( extern void rocksdb_options_set_create_missing_column_families(
@ -392,13 +419,8 @@ extern void rocksdb_options_set_info_log(rocksdb_options_t*, rocksdb_logger_t*);
extern void rocksdb_options_set_info_log_level(rocksdb_options_t*, int); extern void rocksdb_options_set_info_log_level(rocksdb_options_t*, int);
extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t); extern void rocksdb_options_set_write_buffer_size(rocksdb_options_t*, size_t);
extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int); extern void rocksdb_options_set_max_open_files(rocksdb_options_t*, int);
extern void rocksdb_options_set_cache(rocksdb_options_t*, rocksdb_cache_t*);
extern void rocksdb_options_set_cache_compressed(rocksdb_options_t*, rocksdb_cache_t*);
extern void rocksdb_options_set_block_size(rocksdb_options_t*, size_t);
extern void rocksdb_options_set_block_restart_interval(rocksdb_options_t*, int);
extern void rocksdb_options_set_compression_options( extern void rocksdb_options_set_compression_options(
rocksdb_options_t*, int, int, int); rocksdb_options_t*, int, int, int);
extern void rocksdb_options_set_whole_key_filtering(rocksdb_options_t*, unsigned char);
extern void rocksdb_options_set_prefix_extractor( extern void rocksdb_options_set_prefix_extractor(
rocksdb_options_t*, rocksdb_slicetransform_t*); rocksdb_options_t*, rocksdb_slicetransform_t*);
extern void rocksdb_options_set_num_levels(rocksdb_options_t*, int); extern void rocksdb_options_set_num_levels(rocksdb_options_t*, int);
@ -449,8 +471,6 @@ extern void rocksdb_options_set_arena_block_size(
rocksdb_options_t*, size_t); rocksdb_options_t*, size_t);
extern void rocksdb_options_set_use_fsync( extern void rocksdb_options_set_use_fsync(
rocksdb_options_t*, int); rocksdb_options_t*, int);
extern void rocksdb_options_set_db_stats_log_interval(
rocksdb_options_t*, int);
extern void rocksdb_options_set_db_log_dir( extern void rocksdb_options_set_db_log_dir(
rocksdb_options_t*, const char*); rocksdb_options_t*, const char*);
extern void rocksdb_options_set_wal_dir( extern void rocksdb_options_set_wal_dir(
@ -493,7 +513,6 @@ extern void rocksdb_options_set_max_sequential_skip_in_iterations(
rocksdb_options_t*, uint64_t); rocksdb_options_t*, uint64_t);
extern void rocksdb_options_set_disable_data_sync(rocksdb_options_t*, int); extern void rocksdb_options_set_disable_data_sync(rocksdb_options_t*, int);
extern void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t*, int); extern void rocksdb_options_set_disable_auto_compactions(rocksdb_options_t*, int);
extern void rocksdb_options_set_disable_seek_compaction(rocksdb_options_t*, int);
extern void rocksdb_options_set_delete_obsolete_files_period_micros( extern void rocksdb_options_set_delete_obsolete_files_period_micros(
rocksdb_options_t*, uint64_t); rocksdb_options_t*, uint64_t);
extern void rocksdb_options_set_source_compaction_factor(rocksdb_options_t*, int); extern void rocksdb_options_set_source_compaction_factor(rocksdb_options_t*, int);
@ -679,6 +698,10 @@ extern void rocksdb_readoptions_set_fill_cache(
extern void rocksdb_readoptions_set_snapshot( extern void rocksdb_readoptions_set_snapshot(
rocksdb_readoptions_t*, rocksdb_readoptions_t*,
const rocksdb_snapshot_t*); const rocksdb_snapshot_t*);
extern void rocksdb_readoptions_set_iterate_upper_bound(
rocksdb_readoptions_t*,
const char* key,
size_t keylen);
extern void rocksdb_readoptions_set_read_tier( extern void rocksdb_readoptions_set_read_tier(
rocksdb_readoptions_t*, int); rocksdb_readoptions_t*, int);
extern void rocksdb_readoptions_set_tailing( extern void rocksdb_readoptions_set_tailing(

@ -127,9 +127,6 @@ class Cache {
void LRU_Append(Handle* e); void LRU_Append(Handle* e);
void Unref(Handle* e); void Unref(Handle* e);
struct Rep;
Rep* rep_;
// No copying allowed // No copying allowed
Cache(const Cache&); Cache(const Cache&);
void operator=(const Cache&); void operator=(const Cache&);

@ -9,6 +9,7 @@
#ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_ #ifndef STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
#define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_ #define STORAGE_ROCKSDB_INCLUDE_COMPACTION_FILTER_H_
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>

@ -123,7 +123,7 @@ class DB {
// Open DB with column families. // Open DB with column families.
// db_options specify database specific options // db_options specify database specific options
// column_families is the vector of all column families in the databse, // column_families is the vector of all column families in the database,
// containing column family name and options. You need to open ALL column // containing column family name and options. You need to open ALL column
// families in the database. To get the list of column families, you can use // families in the database. To get the list of column families, you can use
// ListColumnFamilies(). Also, you can open only a subset of column families // ListColumnFamilies(). Also, you can open only a subset of column families
@ -359,6 +359,14 @@ class DB {
return CompactRange(DefaultColumnFamily(), begin, end, reduce_level, return CompactRange(DefaultColumnFamily(), begin, end, reduce_level,
target_level, target_path_id); target_level, target_path_id);
} }
// Sets options for the given column family from a map of string keys to
// string values.
// This default implementation ignores both arguments and returns true.
// NOTE(review): presumably concrete DB implementations override this to
// actually apply new_options; the meaning of the return value is not
// documented here -- confirm.
virtual bool SetOptions(ColumnFamilyHandle* column_family,
const std::unordered_map<std::string, std::string>& new_options) {
return true;
}
// Convenience overload: forwards new_options to the two-argument
// SetOptions() using DefaultColumnFamily() and returns its result.
virtual bool SetOptions(
const std::unordered_map<std::string, std::string>& new_options) {
return SetOptions(DefaultColumnFamily(), new_options);
}
// Number of levels used for this DB. // Number of levels used for this DB.
virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0; virtual int NumberLevels(ColumnFamilyHandle* column_family) = 0;

@ -21,11 +21,52 @@
#define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ #define STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_
#include <string> #include <string>
#include <memory>
namespace rocksdb { namespace rocksdb {
class Slice; class Slice;
// Interface for an object that is fed a series of keys and then produces
// the filter for them.
class FilterBitsBuilder {
 public:
  virtual ~FilterBitsBuilder() = default;

  // Adds "key" to the filter under construction. The implementation is free
  // to keep the key in any form it likes, e.g. as a hash or as the original
  // bytes. Keys arrive in sorted order and the same key may be added more
  // than once.
  virtual void AddKey(const Slice& key) = 0;

  // Builds the filter from all keys added so far and returns the filter
  // bits as a Slice. Ownership of the underlying data is passed to "*buf".
  virtual Slice Finish(std::unique_ptr<const char[]>* buf) = 0;
};
// Interface for an object that answers whether a key may be contained in a
// filter. It should be initialized with the Slice produced by a
// FilterBitsBuilder.
class FilterBitsReader {
 public:
  virtual ~FilterBitsReader() = default;

  // Returns whether "entry" matches the bits in the filter.
  virtual bool MayMatch(const Slice& entry) = 0;
};
// We add a new format of filter block called full filter block
// This new interface gives you more space of customization
//
// For the full filter block, you can plug in your version by implement
// the FilterBitsBuilder and FilterBitsReader
//
// There are two sets of interface in FilterPolicy
// Set 1: CreateFilter, KeyMayMatch: used for blockbased filter
// Set 2: GetFilterBitsBuilder, GetFilterBitsReader, they are used for
// full filter.
// Set 1 MUST be implemented correctly, Set 2 is optional
// RocksDB would first try using functions in Set 2. if they return nullptr,
// it would use Set 1 instead.
// You can choose filter type in NewBloomFilterPolicy
class FilterPolicy { class FilterPolicy {
public: public:
virtual ~FilterPolicy(); virtual ~FilterPolicy();
@ -51,11 +92,28 @@ class FilterPolicy {
// This method may return true or false if the key was not on the // This method may return true or false if the key was not on the
// list, but it should aim to return false with a high probability. // list, but it should aim to return false with a high probability.
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0; virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const = 0;
// Get the FilterBitsBuilder, which is ONLY used for full filter block.
// The builder takes individual keys and then generates the filter.
// The default implementation returns nullptr, i.e. this policy provides no
// full-filter builder.
// NOTE(review): ownership of the returned pointer is not stated here --
// confirm against the callers.
virtual FilterBitsBuilder* GetFilterBitsBuilder() const {
return nullptr;
}
// Get the FilterBitsReader, which is ONLY used for full filter block.
// The reader tells whether a key can be in the filter given by "contents".
// The input slice should NOT be deleted by FilterPolicy.
// The default implementation ignores "contents" and returns nullptr, i.e.
// this policy provides no full-filter reader.
virtual FilterBitsReader* GetFilterBitsReader(const Slice& contents) const {
return nullptr;
}
}; };
// Return a new filter policy that uses a bloom filter with approximately // Return a new filter policy that uses a bloom filter with approximately
// the specified number of bits per key. A good value for bits_per_key // the specified number of bits per key.
//
// bits_per_key: bits per key in bloom filter. A good value for bits_per_key
// is 10, which yields a filter with ~ 1% false positive rate. // is 10, which yields a filter with ~ 1% false positive rate.
// use_block_based_builder: use block based filter rather than full filter.
// If you want to build a full filter, it needs to be set to false.
// //
// Callers must delete the result after any database that is using the // Callers must delete the result after any database that is using the
// result has been closed. // result has been closed.
@ -67,8 +125,8 @@ class FilterPolicy {
// ignores trailing spaces, it would be incorrect to use a // ignores trailing spaces, it would be incorrect to use a
// FilterPolicy (like NewBloomFilterPolicy) that does not ignore // FilterPolicy (like NewBloomFilterPolicy) that does not ignore
// trailing spaces in keys. // trailing spaces in keys.
extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key); extern const FilterPolicy* NewBloomFilterPolicy(int bits_per_key,
bool use_block_based_builder = true);
} }
#endif // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_ #endif // STORAGE_ROCKSDB_INCLUDE_FILTER_POLICY_H_

@ -6,6 +6,7 @@
#pragma once #pragma once
#include <string> #include <string>
#include "rocksdb/table.h"
namespace rocksdb { namespace rocksdb {
@ -37,7 +38,8 @@ class FlushBlockPolicyFactory {
// Callers must delete the result after any database that is using the // Callers must delete the result after any database that is using the
// result has been closed. // result has been closed.
virtual FlushBlockPolicy* NewFlushBlockPolicy( virtual FlushBlockPolicy* NewFlushBlockPolicy(
const Options& options, const BlockBuilder& data_block_builder) const = 0; const BlockBasedTableOptions& table_options,
const BlockBuilder& data_block_builder) const = 0;
virtual ~FlushBlockPolicyFactory() { } virtual ~FlushBlockPolicyFactory() { }
}; };
@ -51,7 +53,7 @@ class FlushBlockBySizePolicyFactory : public FlushBlockPolicyFactory {
} }
virtual FlushBlockPolicy* NewFlushBlockPolicy( virtual FlushBlockPolicy* NewFlushBlockPolicy(
const Options& options, const BlockBasedTableOptions& table_options,
const BlockBuilder& data_block_builder) const override; const BlockBuilder& data_block_builder) const override;
}; };

@ -0,0 +1,84 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#include <vector>
#include "rocksdb/options.h"
namespace rocksdb {
// ImmutableCFOptions is a data struct used internally by RocksDB. It contains
// a subset of Options that should not be changed during the entire lifetime
// of the DB. You shouldn't need to access this data structure unless you are
// implementing a new TableFactory. Raw pointers defined in this struct do
// not own the data they point to; Options holds the shared_ptr to that data.
struct ImmutableCFOptions {
// Constructed from a full Options object.
// NOTE(review): presumably copies the fields below out of "options"; the
// definition is not part of this header -- confirm.
explicit ImmutableCFOptions(const Options& options);
CompactionStyle compaction_style;
CompactionOptionsUniversal compaction_options_universal;
const SliceTransform* prefix_extractor;
const Comparator* comparator;
MergeOperator* merge_operator;
const CompactionFilter* compaction_filter;
CompactionFilterFactory* compaction_filter_factory;
CompactionFilterFactoryV2* compaction_filter_factory_v2;
Logger* info_log;
Statistics* statistics;
InfoLogLevel info_log_level;
Env* env;
// Allow the OS to mmap file for reading sst tables. Default: false
bool allow_mmap_reads;
// Allow the OS to mmap file for writing. Default: false
bool allow_mmap_writes;
std::vector<DbPath> db_paths;
MemTableRepFactory* memtable_factory;
TableFactory* table_factory;
Options::TablePropertiesCollectorFactories
table_properties_collector_factories;
bool advise_random_on_open;
// This option is required by PlainTableReader. May need to move it
// to PlainTableOptions just like bloom_bits_per_key
uint32_t bloom_locality;
bool purge_redundant_kvs_while_flush;
uint32_t min_partial_merge_operands;
bool disable_data_sync;
bool use_fsync;
CompressionType compression;
std::vector<CompressionType> compression_per_level;
CompressionOptions compression_opts;
Options::AccessHint access_hint_on_compaction_start;
};
} // namespace rocksdb

@ -27,7 +27,9 @@ struct IOStatsContext {
uint64_t bytes_read; uint64_t bytes_read;
}; };
#ifndef IOS_CROSS_COMPILE
extern __thread IOStatsContext iostats_context; extern __thread IOStatsContext iostats_context;
#endif // IOS_CROSS_COMPILE
} // namespace rocksdb } // namespace rocksdb

@ -14,6 +14,7 @@
#include <memory> #include <memory>
#include <vector> #include <vector>
#include <stdint.h> #include <stdint.h>
#include <unordered_map>
#include "rocksdb/version.h" #include "rocksdb/version.h"
#include "rocksdb/universal_compaction.h" #include "rocksdb/universal_compaction.h"
@ -57,6 +58,7 @@ enum CompactionStyle : char {
kCompactionStyleFIFO = 0x2, // FIFO compaction style kCompactionStyleFIFO = 0x2, // FIFO compaction style
}; };
struct CompactionOptionsFIFO { struct CompactionOptionsFIFO {
// once the total sum of table files reaches this, we will delete the oldest // once the total sum of table files reaches this, we will delete the oldest
// table file // table file
@ -97,7 +99,8 @@ struct ColumnFamilyOptions {
// Use this if you don't need to keep the data sorted, i.e. you'll never use // Use this if you don't need to keep the data sorted, i.e. you'll never use
// an iterator, only Put() and Get() API calls // an iterator, only Put() and Get() API calls
ColumnFamilyOptions* OptimizeForPointLookup(); ColumnFamilyOptions* OptimizeForPointLookup(
uint64_t block_cache_size_mb);
// Default values for some parameters in ColumnFamilyOptions are not // Default values for some parameters in ColumnFamilyOptions are not
// optimized for heavy workloads and big datasets, which means you might // optimized for heavy workloads and big datasets, which means you might
@ -206,34 +209,6 @@ struct ColumnFamilyOptions {
// individual write buffers. Default: 1 // individual write buffers. Default: 1
int min_write_buffer_number_to_merge; int min_write_buffer_number_to_merge;
// Control over blocks (user data is stored in a set of blocks, and
// a block is the unit of reading from disk).
// If non-NULL use the specified cache for blocks.
// If NULL, rocksdb will automatically create and use an 8MB internal cache.
// Default: nullptr
std::shared_ptr<Cache> block_cache;
// If non-NULL use the specified cache for compressed blocks.
// If NULL, rocksdb will not use a compressed block cache.
// Default: nullptr
std::shared_ptr<Cache> block_cache_compressed;
// Approximate size of user data packed per block. Note that the
// block size specified here corresponds to uncompressed data. The
// actual size of the unit read from disk may be smaller if
// compression is enabled. This parameter can be changed dynamically.
//
// Default: 4K
size_t block_size;
// Number of keys between restart points for delta encoding of keys.
// This parameter can be changed dynamically. Most clients should
// leave this parameter alone.
//
// Default: 16
int block_restart_interval;
// Compress blocks using the specified compression algorithm. This // Compress blocks using the specified compression algorithm. This
// parameter can be changed dynamically. // parameter can be changed dynamically.
// //
@ -251,29 +226,17 @@ struct ColumnFamilyOptions {
CompressionType compression; CompressionType compression;
// Different levels can have different compression policies. There // Different levels can have different compression policies. There
// are cases where most lower levels would like to quick compression // are cases where most lower levels would like to use quick compression
// algorithm while the higher levels (which have more data) use // algorithms while the higher levels (which have more data) use
// compression algorithms that have better compression but could // compression algorithms that have better compression but could
// be slower. This array, if non nullptr, should have an entry for // be slower. This array, if non-empty, should have an entry for
// each level of the database. This array, if non nullptr, overides the // each level of the database; these override the value specified in
// value specified in the previous field 'compression'. The caller is // the previous field 'compression'.
// reponsible for allocating memory and initializing the values in it
// before invoking Open(). The caller is responsible for freeing this
// array and it could be freed anytime after the return from Open().
// This could have been a std::vector but that makes the equivalent
// java/C api hard to construct.
std::vector<CompressionType> compression_per_level; std::vector<CompressionType> compression_per_level;
// different options for compression algorithms // different options for compression algorithms
CompressionOptions compression_opts; CompressionOptions compression_opts;
// If non-nullptr, use the specified filter policy to reduce disk reads.
// Many applications will benefit from passing the result of
// NewBloomFilterPolicy() here.
//
// Default: nullptr
const FilterPolicy* filter_policy;
// If non-nullptr, use the specified function to determine the // If non-nullptr, use the specified function to determine the
// prefixes for keys. These prefixes will be placed in the filter. // prefixes for keys. These prefixes will be placed in the filter.
// Depending on the workload, this can reduce the number of read-IOP // Depending on the workload, this can reduce the number of read-IOP
@ -290,12 +253,6 @@ struct ColumnFamilyOptions {
// Default: nullptr // Default: nullptr
std::shared_ptr<const SliceTransform> prefix_extractor; std::shared_ptr<const SliceTransform> prefix_extractor;
// If true, place whole keys in the filter (not just prefixes).
// This must generally be true for gets to be efficient.
//
// Default: true
bool whole_key_filtering;
// Number of levels for this database // Number of levels for this database
int num_levels; int num_levels;
@ -331,7 +288,7 @@ struct ColumnFamilyOptions {
// and each file on level-3 will be 200MB. // and each file on level-3 will be 200MB.
// by default target_file_size_base is 2MB. // by default target_file_size_base is 2MB.
int target_file_size_base; uint64_t target_file_size_base;
// by default target_file_size_multiplier is 1, which means // by default target_file_size_multiplier is 1, which means
// by default files in different levels will have similar size. // by default files in different levels will have similar size.
int target_file_size_multiplier; int target_file_size_multiplier;
@ -375,18 +332,6 @@ struct ColumnFamilyOptions {
// stop building a single file in a level->level+1 compaction. // stop building a single file in a level->level+1 compaction.
int max_grandparent_overlap_factor; int max_grandparent_overlap_factor;
// We decided to remove seek compaction from RocksDB because:
// 1) It makes more sense for spinning disk workloads, while RocksDB is
// primarily designed for flash and memory,
// 2) It added some complexity to the important code-paths,
// 3) None of our internal customers were really using it.
//
// Since we removed seek compaction, this option is now obsolete.
// We left it here for backwards compatiblity (otherwise it would break the
// build), but we'll remove it at some point.
// Default: true
bool disable_seek_compaction;
// Puts are delayed 0-1 ms when any level has a compaction score that exceeds // Puts are delayed 0-1 ms when any level has a compaction score that exceeds
// soft_rate_limit. This is ignored when == 0.0. // soft_rate_limit. This is ignored when == 0.0.
// CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not // CONSTRAINT: soft_rate_limit <= hard_rate_limit. If this constraint does not
@ -399,17 +344,9 @@ struct ColumnFamilyOptions {
// Default: 0 (disabled) // Default: 0 (disabled)
double hard_rate_limit; double hard_rate_limit;
// Max time a put will be stalled when hard_rate_limit is enforced. If 0, then // DEPRECATED -- this option is no longer used
// there is no limit.
// Default: 1000
unsigned int rate_limit_delay_max_milliseconds; unsigned int rate_limit_delay_max_milliseconds;
// Disable block cache. If this is set to true,
// then no block cache should be used, and the block_cache should
// point to a nullptr object.
// Default: false
bool no_block_cache;
// size of one block in arena memory allocation. // size of one block in arena memory allocation.
// If <= 0, a proper value is automatically calculated (usually 1/10 of // If <= 0, a proper value is automatically calculated (usually 1/10 of
// writer_buffer_size). // writer_buffer_size).
@ -433,14 +370,6 @@ struct ColumnFamilyOptions {
// Default: true // Default: true
bool purge_redundant_kvs_while_flush; bool purge_redundant_kvs_while_flush;
// This is used to close a block before it reaches the configured
// 'block_size'. If the percentage of free space in the current block is less
// than this specified number and adding a new record to the block will
// exceed the configured block size, then this block will be closed and the
// new record will be written to the next block.
// Default is 10.
int block_size_deviation;
// The compaction style. Default: kCompactionStyleLevel // The compaction style. Default: kCompactionStyleLevel
CompactionStyle compaction_style; CompactionStyle compaction_style;
@ -475,10 +404,24 @@ struct ColumnFamilyOptions {
std::shared_ptr<MemTableRepFactory> memtable_factory; std::shared_ptr<MemTableRepFactory> memtable_factory;
// This is a factory that provides TableFactory objects. // This is a factory that provides TableFactory objects.
// Default: a factory that provides a default implementation of // Default: a block-based table factory that provides a default
// Table and TableBuilder. // implementation of TableBuilder and TableReader with default
// BlockBasedTableOptions.
std::shared_ptr<TableFactory> table_factory; std::shared_ptr<TableFactory> table_factory;
// Block-based table related options are moved to BlockBasedTableOptions.
// Related options that were originally here but now moved include:
// no_block_cache
// block_cache
// block_cache_compressed
// block_size
// block_size_deviation
// block_restart_interval
// filter_policy
// whole_key_filtering
// If you'd like to customize some of these options, you will need to
// use NewBlockBasedTableFactory() to construct a new table factory.
// This option allows user to to collect their own interested statistics of // This option allows user to to collect their own interested statistics of
// the tables. // the tables.
// Default: empty vector -- no user-defined statistics collection will be // Default: empty vector -- no user-defined statistics collection will be
@ -669,7 +612,7 @@ struct DBOptions {
// it does not use any locks to prevent concurrent updates. // it does not use any locks to prevent concurrent updates.
std::shared_ptr<Statistics> statistics; std::shared_ptr<Statistics> statistics;
// If true, then the contents of data files are not synced // If true, then the contents of manifest and data files are not synced
// to stable storage. Their contents remain in the OS buffers till the // to stable storage. Their contents remain in the OS buffers till the
// OS decides to flush them. This option is good for bulk-loading // OS decides to flush them. This option is good for bulk-loading
// of data. Once the bulk-loading is complete, please issue a // of data. Once the bulk-loading is complete, please issue a
@ -684,9 +627,6 @@ struct DBOptions {
// Default: false // Default: false
bool use_fsync; bool use_fsync;
// This options is not used!!
int db_stats_log_interval;
// A list of paths where SST files can be put into, with its target size. // A list of paths where SST files can be put into, with its target size.
// Newer data is placed into paths specified earlier in the vector while // Newer data is placed into paths specified earlier in the vector while
// older data gradually moves to paths specified later in the vector. // older data gradually moves to paths specified later in the vector.
@ -844,12 +784,13 @@ struct DBOptions {
// Specify the file access pattern once a compaction is started. // Specify the file access pattern once a compaction is started.
// It will be applied to all input files of a compaction. // It will be applied to all input files of a compaction.
// Default: NORMAL // Default: NORMAL
enum { enum AccessHint {
NONE, NONE,
NORMAL, NORMAL,
SEQUENTIAL, SEQUENTIAL,
WILLNEED WILLNEED
} access_hint_on_compaction_start; };
AccessHint access_hint_on_compaction_start;
// Use adaptive mutex, which spins in the user space before resorting // Use adaptive mutex, which spins in the user space before resorting
// to kernel. This could reduce context switch when the mutex is not // to kernel. This could reduce context switch when the mutex is not
@ -958,6 +899,18 @@ struct ReadOptions {
// ! DEPRECATED // ! DEPRECATED
// const Slice* prefix; // const Slice* prefix;
// "iterate_upper_bound" defines the extent up to which the forward iterator
// can return entries. Once the bound is reached, Valid() will be false.
// "iterate_upper_bound" is exclusive, i.e. the bound value is
// not a valid entry. If prefix_extractor is not null, the Seek target
// and iterate_upper_bound need to have the same prefix.
// This is because ordering is not guaranteed outside of the prefix domain.
// There is no lower bound on the iterator. If needed, that can be easily
// implemented.
//
// Default: nullptr
const Slice* iterate_upper_bound;
// Specify if this read request should process data that ALREADY // Specify if this read request should process data that ALREADY
// resides on a particular cache. If the required data is not // resides on a particular cache. If the required data is not
// found at the specified cache, then Status::Incomplete is returned. // found at the specified cache, then Status::Incomplete is returned.
@ -972,18 +925,27 @@ struct ReadOptions {
// Not supported in ROCKSDB_LITE mode! // Not supported in ROCKSDB_LITE mode!
bool tailing; bool tailing;
// Enable a total order seek regardless of index format (e.g. hash index)
// used in the table. Some table formats (e.g. plain table) may not support
// this option.
bool total_order_seek;
ReadOptions() ReadOptions()
: verify_checksums(true), : verify_checksums(true),
fill_cache(true), fill_cache(true),
snapshot(nullptr), snapshot(nullptr),
iterate_upper_bound(nullptr),
read_tier(kReadAllTier), read_tier(kReadAllTier),
tailing(false) {} tailing(false),
total_order_seek(false) {}
ReadOptions(bool cksum, bool cache) ReadOptions(bool cksum, bool cache)
: verify_checksums(cksum), : verify_checksums(cksum),
fill_cache(cache), fill_cache(cache),
snapshot(nullptr), snapshot(nullptr),
iterate_upper_bound(nullptr),
read_tier(kReadAllTier), read_tier(kReadAllTier),
tailing(false) {} tailing(false),
total_order_seek(false) {}
}; };
// Options that control write operations // Options that control write operations
@ -1021,7 +983,17 @@ struct WriteOptions {
// Default: 0 // Default: 0
uint64_t timeout_hint_us; uint64_t timeout_hint_us;
WriteOptions() : sync(false), disableWAL(false), timeout_hint_us(0) {} // If true and if user is trying to write to column families that don't exist
// (they were dropped), ignore the write (don't return an error). If there
// are multiple writes in a WriteBatch, other writes will succeed.
// Default: false
bool ignore_missing_column_families;
WriteOptions()
: sync(false),
disableWAL(false),
timeout_hint_us(0),
ignore_missing_column_families(false) {}
}; };
// Options that control flush operations // Options that control flush operations
@ -1043,6 +1015,12 @@ extern Options GetOptions(size_t total_write_buffer_limit,
int read_amplification_threshold = 8, int read_amplification_threshold = 8,
int write_amplification_threshold = 32, int write_amplification_threshold = 32,
uint64_t target_db_size = 68719476736 /* 64GB */); uint64_t target_db_size = 68719476736 /* 64GB */);
// NOTE(review): this declaration has no description. Judging only by the
// signature, it presumably fills *new_options starting from base_options and
// applying the name/value pairs in options_map, with the bool return value
// reporting success -- confirm against the definition before relying on it.
bool GetOptionsFromStrings(
const Options& base_options,
const std::unordered_map<std::string, std::string>& options_map,
Options* new_options);
} // namespace rocksdb } // namespace rocksdb
#endif // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_ #endif // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_

@ -212,7 +212,6 @@ enum Histograms : uint32_t {
READ_BLOCK_COMPACTION_MICROS, READ_BLOCK_COMPACTION_MICROS,
READ_BLOCK_GET_MICROS, READ_BLOCK_GET_MICROS,
WRITE_RAW_BLOCK_MICROS, WRITE_RAW_BLOCK_MICROS,
STALL_L0_SLOWDOWN_COUNT, STALL_L0_SLOWDOWN_COUNT,
STALL_MEMTABLE_COMPACTION_COUNT, STALL_MEMTABLE_COMPACTION_COUNT,
STALL_L0_NUM_FILES_COUNT, STALL_L0_NUM_FILES_COUNT,
@ -220,6 +219,7 @@ enum Histograms : uint32_t {
SOFT_RATE_LIMIT_DELAY_COUNT, SOFT_RATE_LIMIT_DELAY_COUNT,
NUM_FILES_IN_SINGLE_COMPACTION, NUM_FILES_IN_SINGLE_COMPACTION,
DB_SEEK, DB_SEEK,
WRITE_STALL,
HISTOGRAM_ENUM_MAX, HISTOGRAM_ENUM_MAX,
}; };

@ -96,7 +96,7 @@ class Status {
// Returns true iff the status indicates Incomplete // Returns true iff the status indicates Incomplete
bool IsIncomplete() const { return code() == kIncomplete; } bool IsIncomplete() const { return code() == kIncomplete; }
// Returns true iff the status indicates Incomplete // Returns true iff the status indicates Shutdown In progress
bool IsShutdownInProgress() const { return code() == kShutdownInProgress; } bool IsShutdownInProgress() const { return code() == kShutdownInProgress; }
bool IsTimedOut() const { return code() == kTimedOut; } bool IsTimedOut() const { return code() == kTimedOut; }

@ -23,6 +23,7 @@
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/iterator.h" #include "rocksdb/iterator.h"
#include "rocksdb/options.h" #include "rocksdb/options.h"
#include "rocksdb/immutable_options.h"
#include "rocksdb/status.h" #include "rocksdb/status.h"
namespace rocksdb { namespace rocksdb {
@ -84,6 +85,46 @@ struct BlockBasedTableOptions {
// protected with this checksum type. Old table files will still be readable, // protected with this checksum type. Old table files will still be readable,
// even though they have different checksum type. // even though they have different checksum type.
ChecksumType checksum = kCRC32c; ChecksumType checksum = kCRC32c;
// Disable block cache. If this is set to true,
// then no block cache should be used, and the block_cache should
// point to a nullptr object.
bool no_block_cache = false;
// If non-NULL use the specified cache for blocks.
// If NULL, rocksdb will automatically create and use an 8MB internal cache.
std::shared_ptr<Cache> block_cache = nullptr;
// If non-NULL use the specified cache for compressed blocks.
// If NULL, rocksdb will not use a compressed block cache.
std::shared_ptr<Cache> block_cache_compressed = nullptr;
// Approximate size of user data packed per block. Note that the
// block size specified here corresponds to uncompressed data. The
// actual size of the unit read from disk may be smaller if
// compression is enabled. This parameter can be changed dynamically.
size_t block_size = 4 * 1024;
// This is used to close a block before it reaches the configured
// 'block_size'. If the percentage of free space in the current block is less
// than this specified number and adding a new record to the block will
// exceed the configured block size, then this block will be closed and the
// new record will be written to the next block.
int block_size_deviation = 10;
// Number of keys between restart points for delta encoding of keys.
// This parameter can be changed dynamically. Most clients should
// leave this parameter alone.
int block_restart_interval = 16;
// If non-nullptr, use the specified filter policy to reduce disk reads.
// Many applications will benefit from passing the result of
// NewBloomFilterPolicy() here.
std::shared_ptr<const FilterPolicy> filter_policy = nullptr;
// If true, place whole keys in the filter (not just prefixes).
// This must generally be true for gets to be efficient.
bool whole_key_filtering = true;
}; };
// Table Properties that are specific to block-based table properties. // Table Properties that are specific to block-based table properties.
@ -126,47 +167,49 @@ struct PlainTablePropertyNames {
const uint32_t kPlainTableVariableLength = 0; const uint32_t kPlainTableVariableLength = 0;
struct PlainTableOptions { struct PlainTableOptions {
// @user_key_len: plain table has optimization for fix-sized keys, which can be // @user_key_len: plain table has optimization for fix-sized keys, which can
// specified via user_key_len. Alternatively, you can pass // be specified via user_key_len. Alternatively, you can pass
// `kPlainTableVariableLength` if your keys have variable // `kPlainTableVariableLength` if your keys have variable
// lengths. // lengths.
uint32_t user_key_len = kPlainTableVariableLength; uint32_t user_key_len = kPlainTableVariableLength;
// @bloom_bits_per_key: the number of bits used for bloom filer per prefix. You // @bloom_bits_per_key: the number of bits used for bloom filer per prefix.
// may disable it by passing a zero. // You may disable it by passing a zero.
int bloom_bits_per_key = 10; int bloom_bits_per_key = 10;
// @hash_table_ratio: the desired utilization of the hash table used for prefix // @hash_table_ratio: the desired utilization of the hash table used for
// hashing. hash_table_ratio = number of prefixes / #buckets // prefix hashing.
// in the hash table // hash_table_ratio = number of prefixes / #buckets in the
double hash_table_ratio = 0.75; // hash table
double hash_table_ratio = 0.75;
// @index_sparseness: inside each prefix, need to build one index record for how
// many keys for binary search inside each hash bucket. // @index_sparseness: inside each prefix, need to build one index record for
// For encoding type kPrefix, the value will be used when // how many keys for binary search inside each hash bucket.
// writing to determine an interval to rewrite the full key. // For encoding type kPrefix, the value will be used when
// It will also be used as a suggestion and satisfied when // writing to determine an interval to rewrite the full
// possible. // key. It will also be used as a suggestion and satisfied
size_t index_sparseness = 16; // when possible.
size_t index_sparseness = 16;
// @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
// Otherwise from huge page TLB. The user needs to reserve // @huge_page_tlb_size: if <=0, allocate hash indexes and blooms from malloc.
// huge pages for it to be allocated, like: // Otherwise from huge page TLB. The user needs to
// sysctl -w vm.nr_hugepages=20 // reserve huge pages for it to be allocated, like:
// See linux doc Documentation/vm/hugetlbpage.txt // sysctl -w vm.nr_hugepages=20
size_t huge_page_tlb_size = 0; // See linux doc Documentation/vm/hugetlbpage.txt
size_t huge_page_tlb_size = 0;
// @encoding_type: how to encode the keys. See enum EncodingType above for
// the choices. The value will determine how to encode keys // @encoding_type: how to encode the keys. See enum EncodingType above for
// when writing to a new SST file. This value will be stored // the choices. The value will determine how to encode keys
// inside the SST file which will be used when reading from the // when writing to a new SST file. This value will be stored
// file, which makes it possible for users to choose different // inside the SST file which will be used when reading from
// encoding type when reopening a DB. Files with different // the file, which makes it possible for users to choose
// encoding types can co-exist in the same DB and can be read. // different encoding type when reopening a DB. Files with
EncodingType encoding_type = kPlain; // different encoding types can co-exist in the same DB and
// can be read.
// @full_scan_mode: mode for reading the whole file one record by one without EncodingType encoding_type = kPlain;
// using the index.
// @full_scan_mode: mode for reading the whole file one record by one without
// using the index.
bool full_scan_mode = false; bool full_scan_mode = false;
// @store_index_in_file: compute plain table index and bloom filter during // @store_index_in_file: compute plain table index and bloom filter during
@ -185,15 +228,59 @@ extern TableFactory* NewPlainTableFactory(const PlainTableOptions& options =
PlainTableOptions()); PlainTableOptions());
struct CuckooTablePropertyNames { struct CuckooTablePropertyNames {
// The key that is used to fill empty buckets.
static const std::string kEmptyKey; static const std::string kEmptyKey;
// Fixed length of value.
static const std::string kValueLength; static const std::string kValueLength;
static const std::string kNumHashTable; // Number of hash functions used in Cuckoo Hash.
static const std::string kMaxNumBuckets; static const std::string kNumHashFunc;
// It denotes the number of buckets in a Cuckoo Block. Given a key and a
// particular hash function, a Cuckoo Block is a set of consecutive buckets,
// where starting bucket id is given by the hash function on the key. In case
// of a collision during inserting the key, the builder tries to insert the
// key in other locations of the cuckoo block before using the next hash
// function. This reduces cache miss during read operation in case of
// collision.
static const std::string kCuckooBlockSize;
// Size of the hash table. Use this number to compute the modulo of the hash
// function. The actual number of buckets will be kHashTableSize +
// kCuckooBlockSize - 1. The last kCuckooBlockSize - 1 buckets are used to
// accommodate the Cuckoo Block from the end of the hash table, due to the
// cache-friendly implementation.
static const std::string kHashTableSize;
// Denotes if the key sorted in the file is Internal Key (if false)
// or User Key only (if true).
static const std::string kIsLastLevel; static const std::string kIsLastLevel;
// Indicate if using identity function for the first hash function.
static const std::string kIdentityAsFirstHash;
};
// Options accepted by NewCuckooTableFactory(). Every field has an in-class
// default, so a default-constructed CuckooTableOptions is usable as is.
struct CuckooTableOptions {
// Determines the utilization of hash tables. Smaller values
// result in larger hash tables with fewer collisions.
double hash_table_ratio = 0.9;
// A property used by builder to determine the depth to go to
// to search for a path to displace elements in case of
// collision. See Builder.MakeSpaceForKey method. Higher
// values result in more efficient hash tables with fewer
// lookups but take more time to build.
uint32_t max_search_depth = 100;
// In case of collision while inserting, the builder
// attempts to insert in the next cuckoo_block_size
// locations before skipping over to the next Cuckoo hash
// function. This makes lookups more cache friendly in case
// of collisions.
uint32_t cuckoo_block_size = 5;
// If this option is enabled, the user key is treated as uint64_t and its
// value is used as the hash value directly. This option changes the builder's
// behavior. The reader ignores this option and behaves according to what is
// specified in the table property.
bool identity_as_first_hash = false;
}; };
extern TableFactory* NewCuckooTableFactory(double hash_table_ratio = 0.9, // Cuckoo Table Factory for SST table format using Cache Friendly Cuckoo Hashing
uint32_t max_search_depth = 100); extern TableFactory* NewCuckooTableFactory(
const CuckooTableOptions& table_options = CuckooTableOptions());
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE
@ -220,14 +307,15 @@ class TableFactory {
// and cache the table object returned. // and cache the table object returned.
// (1) SstFileReader (for SST Dump) opens the table and dump the table // (1) SstFileReader (for SST Dump) opens the table and dump the table
// contents using the interator of the table. // contents using the interator of the table.
// options and soptions are options. options is the general options. // ImmutableCFOptions is a subset of Options that can not be altered.
// EnvOptions is a subset of Options that will be used by Env.
// Multiple configured can be accessed from there, including and not // Multiple configured can be accessed from there, including and not
// limited to block cache and key comparators. // limited to block cache and key comparators.
// file is a file handler to handle the file for the table // file is a file handler to handle the file for the table
// file_size is the physical file size of the file // file_size is the physical file size of the file
// table_reader is the output table reader // table_reader is the output table reader
virtual Status NewTableReader( virtual Status NewTableReader(
const Options& options, const EnvOptions& soptions, const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
const InternalKeyComparator& internal_comparator, const InternalKeyComparator& internal_comparator,
unique_ptr<RandomAccessFile>&& file, uint64_t file_size, unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
unique_ptr<TableReader>* table_reader) const = 0; unique_ptr<TableReader>* table_reader) const = 0;
@ -245,14 +333,27 @@ class TableFactory {
// (4) When running Repairer, it creates a table builder to convert logs to // (4) When running Repairer, it creates a table builder to convert logs to
// SST files (In Repairer::ConvertLogToTable() by calling BuildTable()) // SST files (In Repairer::ConvertLogToTable() by calling BuildTable())
// //
// options is the general options. Multiple configured can be acceseed from // ImmutableCFOptions is a subset of Options that can not be altered.
// there, including and not limited to compression options. // Multiple configured can be acceseed from there, including and not limited
// file is a handle of a writable file. It is the caller's responsibility to // to compression options. file is a handle of a writable file.
// keep the file open and close the file after closing the table builder. // It is the caller's responsibility to keep the file open and close the file
// compression_type is the compression type to use in this table. // after closing the table builder. compression_type is the compression type
// to use in this table.
virtual TableBuilder* NewTableBuilder( virtual TableBuilder* NewTableBuilder(
const Options& options, const InternalKeyComparator& internal_comparator, const ImmutableCFOptions& ioptions,
WritableFile* file, CompressionType compression_type) const = 0; const InternalKeyComparator& internal_comparator,
WritableFile* file, const CompressionType compression_type,
const CompressionOptions& compression_opts) const = 0;
// Sanitizes the specified DB Options.
//
// If the function cannot find a way to sanitize the input DB Options,
// a non-ok Status will be returned.
virtual Status SanitizeDBOptions(const DBOptions* db_opts) const = 0;
// Return a string that contains printable format of table configurations.
// RocksDB prints configurations at DB Open().
virtual std::string GetPrintableTableOptions() const = 0;
}; };
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE

@ -10,7 +10,10 @@
#pragma once #pragma once
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS
#endif
#include <inttypes.h> #include <inttypes.h>
#include <string> #include <string>
#include <map> #include <map>
@ -127,9 +130,41 @@ struct BackupInfo {
int64_t timestamp; int64_t timestamp;
uint64_t size; uint64_t size;
uint32_t number_files;
BackupInfo() {} BackupInfo() {}
BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size)
: backup_id(_backup_id), timestamp(_timestamp), size(_size) {} BackupInfo(BackupID _backup_id, int64_t _timestamp, uint64_t _size,
uint32_t _number_files)
: backup_id(_backup_id), timestamp(_timestamp), size(_size),
number_files(_number_files) {}
};
// Holds two counters: the number of successful backups and the number of
// failed backups. Only the constructors and the destructor are defined
// inline; the other member functions are declared here and defined elsewhere.
class BackupStatistics {
public:
// Starts both counters at zero.
BackupStatistics() {
number_success_backup = 0;
number_fail_backup = 0;
}
// Starts the counters at the supplied values.
BackupStatistics(uint32_t _number_success_backup,
uint32_t _number_fail_backup)
: number_success_backup(_number_success_backup),
number_fail_backup(_number_fail_backup) {}
~BackupStatistics() {}
// NOTE(review): presumably each of these adds one to the matching counter;
// the definitions are not in this header -- confirm.
void IncrementNumberSuccessBackup();
void IncrementNumberFailBackup();
// NOTE(review): presumably plain accessors for the two counters -- confirm.
uint32_t GetNumberSuccessBackup() const;
uint32_t GetNumberFailBackup() const;
// NOTE(review): output format is defined elsewhere; not visible here.
std::string ToString() const;
private:
// Count of backups recorded as successful.
uint32_t number_success_backup;
// Count of backups recorded as failed.
uint32_t number_fail_backup;
}; };
class BackupEngineReadOnly { class BackupEngineReadOnly {

@ -0,0 +1,105 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A WriteBatchWithIndex with a binary searchable index built for all the keys
// inserted.
#pragma once
#include "rocksdb/comparator.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "rocksdb/write_batch.h"
namespace rocksdb {
class ColumnFamilyHandle;
struct SliceParts;
class Comparator;
// Record kinds used to tag a WriteEntry. Enumerator values are implicit and
// start at zero, in the order listed.
enum WriteType {
  kPutRecord,
  kMergeRecord,
  kDeleteRecord,
  kLogDataRecord
};
// An entry for a Put, Merge or Delete operation in a write batch. Used in
// WBWIIterator, whose Entry() returns a const reference to one of these.
struct WriteEntry {
// Which kind of record this entry represents.
WriteType type;
// Key of the record.
Slice key;
// Value of the record.
// NOTE(review): presumably empty/unused for kDeleteRecord entries -- confirm.
Slice value;
};
// Iterator of one column family out of a WriteBatchWithIndex.
// This is a pure interface: every operation below is pure virtual, and
// instances are obtained from WriteBatchWithIndex::NewIterator().
class WBWIIterator {
public:
virtual ~WBWIIterator() {}
// NOTE(review): presumably true while the iterator is positioned on an
// entry, so that Entry() may be called -- confirm with the implementation.
virtual bool Valid() const = 0;
// Repositions the iterator using `key`. NOTE(review): presumably moves to the
// first entry at or after `key` in the index order -- confirm.
virtual void Seek(const Slice& key) = 0;
// Advances the iterator.
virtual void Next() = 0;
// Returns the entry at the current position by const reference.
// NOTE(review): the lifetime of the referenced WriteEntry is not visible
// here; presumably it is only valid until the iterator moves -- confirm.
virtual const WriteEntry& Entry() const = 0;
// Reports the iterator's status.
virtual Status status() const = 0;
};
// A WriteBatchWithIndex with a binary searchable index built for all the keys
// inserted.
// In Put(), Merge() or Delete(), the same function of the wrapped WriteBatch
// will be called. At the same time, indexes will be built.
// By calling GetWriteBatch(), a user will get the WriteBatch for the data
// they inserted, which can be used for DB::Write().
// A user can call NewIterator() to create an iterator.
//
// NOTE(review): the class holds a raw `Rep*` and declares a destructor but
// no copy constructor or copy assignment. If the destructor frees `rep`
// (its definition is not visible here), copying an instance would be
// unsafe -- confirm, and consider deleting the copy operations.
class WriteBatchWithIndex {
public:
// backup_index_comparator: the backup comparator used to compare keys
// within the same column family, if column family is not given in the
// interface, or we can't find a column family from the column family handle
// passed in, backup_index_comparator will be used for the column family.
// reserved_bytes: reserved bytes in underlying WriteBatch
explicit WriteBatchWithIndex(
const Comparator* backup_index_comparator = BytewiseComparator(),
size_t reserved_bytes = 0);
virtual ~WriteBatchWithIndex();
// Returns the wrapped WriteBatch holding the data inserted so far.
WriteBatch* GetWriteBatch();
// The mutators below come in pairs: one taking an explicit column family
// handle and one without.
// NOTE(review): the overloads without a handle presumably target the
// default column family, as NewIterator() below does -- confirm.
virtual void Put(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value);
virtual void Put(const Slice& key, const Slice& value);
virtual void Merge(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value);
virtual void Merge(const Slice& key, const Slice& value);
// Takes only a blob and no key.
// NOTE(review): presumably forwarded to the wrapped WriteBatch without
// adding anything to the index -- confirm.
virtual void PutLogData(const Slice& blob);
virtual void Delete(ColumnFamilyHandle* column_family, const Slice& key);
virtual void Delete(const Slice& key);
virtual void Delete(ColumnFamilyHandle* column_family, const SliceParts& key);
virtual void Delete(const SliceParts& key);
// Create an iterator of a column family. User can call iterator.Seek() to
// search to the next entry of or after a key. Keys will be iterated in the
// order given by index_comparator. For multiple updates on the same key,
// each update will be returned as a separate entry, in the order of update
// time.
// NOTE(review): a raw pointer is returned; presumably the caller owns it and
// must delete it -- confirm.
virtual WBWIIterator* NewIterator(ColumnFamilyHandle* column_family);
// Create an iterator of the default column family.
virtual WBWIIterator* NewIterator();
private:
// Opaque implementation state; Rep is only forward-declared in this header.
struct Rep;
Rep* rep;
};
} // namespace rocksdb

@ -152,6 +152,7 @@ class WriteBatch {
private: private:
friend class WriteBatchInternal; friend class WriteBatchInternal;
protected:
std::string rep_; // See comment in write_batch.cc for the format of rep_ std::string rep_; // See comment in write_batch.cc for the format of rep_
// Intentionally copyable // Intentionally copyable

@ -1,4 +1,4 @@
NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv NATIVE_JAVA_CLASSES = org.rocksdb.RocksDB org.rocksdb.Options org.rocksdb.WriteBatch org.rocksdb.WriteBatchInternal org.rocksdb.WriteBatchTest org.rocksdb.WriteOptions org.rocksdb.BackupableDB org.rocksdb.BackupableDBOptions org.rocksdb.Statistics org.rocksdb.RocksIterator org.rocksdb.VectorMemTableConfig org.rocksdb.SkipListMemTableConfig org.rocksdb.HashLinkedListMemTableConfig org.rocksdb.HashSkipListMemTableConfig org.rocksdb.PlainTableConfig org.rocksdb.BlockBasedTableConfig org.rocksdb.ReadOptions org.rocksdb.Filter org.rocksdb.BloomFilter org.rocksdb.RestoreOptions org.rocksdb.RestoreBackupableDB org.rocksdb.RocksEnv org.rocksdb.GenericRateLimiterConfig
NATIVE_INCLUDE = ./include NATIVE_INCLUDE = ./include
ROCKSDB_JAR = rocksdbjni.jar ROCKSDB_JAR = rocksdbjni.jar

@ -35,16 +35,11 @@ public class RocksDBSample {
assert(db == null); assert(db == null);
} }
Filter filter = new BloomFilter(10);
options.setCreateIfMissing(true) options.setCreateIfMissing(true)
.createStatistics() .createStatistics()
.setWriteBufferSize(8 * SizeUnit.KB) .setWriteBufferSize(8 * SizeUnit.KB)
.setMaxWriteBufferNumber(3) .setMaxWriteBufferNumber(3)
.setDisableSeekCompaction(true)
.setBlockSize(64 * SizeUnit.KB)
.setMaxBackgroundCompactions(10) .setMaxBackgroundCompactions(10)
.setFilter(filter)
.setCacheNumShardBits(6)
.setCompressionType(CompressionType.SNAPPY_COMPRESSION) .setCompressionType(CompressionType.SNAPPY_COMPRESSION)
.setCompactionStyle(CompactionStyle.UNIVERSAL); .setCompactionStyle(CompactionStyle.UNIVERSAL);
Statistics stats = options.statisticsPtr(); Statistics stats = options.statisticsPtr();
@ -52,10 +47,7 @@ public class RocksDBSample {
assert(options.createIfMissing() == true); assert(options.createIfMissing() == true);
assert(options.writeBufferSize() == 8 * SizeUnit.KB); assert(options.writeBufferSize() == 8 * SizeUnit.KB);
assert(options.maxWriteBufferNumber() == 3); assert(options.maxWriteBufferNumber() == 3);
assert(options.disableSeekCompaction() == true);
assert(options.blockSize() == 64 * SizeUnit.KB);
assert(options.maxBackgroundCompactions() == 10); assert(options.maxBackgroundCompactions() == 10);
assert(options.cacheNumShardBits() == 6);
assert(options.compressionType() == CompressionType.SNAPPY_COMPRESSION); assert(options.compressionType() == CompressionType.SNAPPY_COMPRESSION);
assert(options.compactionStyle() == CompactionStyle.UNIVERSAL); assert(options.compactionStyle() == CompactionStyle.UNIVERSAL);
@ -80,8 +72,23 @@ public class RocksDBSample {
assert(options.memTableFactoryName().equals("SkipListFactory")); assert(options.memTableFactoryName().equals("SkipListFactory"));
options.setTableFormatConfig(new PlainTableConfig()); options.setTableFormatConfig(new PlainTableConfig());
// Plain-Table requires mmap read
options.setAllowMmapReads(true);
assert(options.tableFactoryName().equals("PlainTable")); assert(options.tableFactoryName().equals("PlainTable"));
options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000,
10000, 10));
options.setRateLimiterConfig(new GenericRateLimiterConfig(10000000));
BlockBasedTableConfig table_options = new BlockBasedTableConfig();
table_options.setBlockCacheSize(64 * SizeUnit.KB)
.setFilterBitsPerKey(10)
.setCacheNumShardBits(6);
assert(table_options.blockCacheSize() == 64 * SizeUnit.KB);
assert(table_options.cacheNumShardBits() == 6);
options.setTableFormatConfig(table_options);
assert(options.tableFactoryName().equals("BlockBasedTable"));
try { try {
db = RocksDB.open(options, db_path_not_found); db = RocksDB.open(options, db_path_not_found);
db.put("hello".getBytes(), "world".getBytes()); db.put("hello".getBytes(), "world".getBytes());
@ -120,6 +127,29 @@ public class RocksDBSample {
System.out.println(""); System.out.println("");
} }
// write batch test
WriteOptions writeOpt = new WriteOptions();
for (int i = 10; i <= 19; ++i) {
WriteBatch batch = new WriteBatch();
for (int j = 10; j <= 19; ++j) {
batch.put(String.format("%dx%d", i, j).getBytes(),
String.format("%d", i * j).getBytes());
}
db.write(writeOpt, batch);
batch.dispose();
}
for (int i = 10; i <= 19; ++i) {
for (int j = 10; j <= 19; ++j) {
assert(new String(
db.get(String.format("%dx%d", i, j).getBytes())).equals(
String.format("%d", i * j)));
System.out.format("%s ", new String(db.get(
String.format("%dx%d", i, j).getBytes())));
}
System.out.println("");
}
writeOpt.dispose();
value = db.get("1x1".getBytes()); value = db.get("1x1".getBytes());
assert(value != null); assert(value != null);
value = db.get("world".getBytes()); value = db.get("world".getBytes());
@ -254,6 +284,5 @@ public class RocksDBSample {
// be sure to dispose c++ pointers // be sure to dispose c++ pointers
options.dispose(); options.dispose();
readOptions.dispose(); readOptions.dispose();
filter.dispose();
} }
} }

@ -0,0 +1,210 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
package org.rocksdb;
/**
 * The config for block based table sst format.
 *
 * BlockBasedTable is a RocksDB's default SST file format.
 */
public class BlockBasedTableConfig extends TableFormatConfig {
  /**
   * Creates a config populated with the defaults documented on the
   * individual setters.
   */
  public BlockBasedTableConfig() {
    noBlockCache_ = false;
    blockCacheSize_ = 8 * 1024 * 1024;
    // Spelled out so that every value handed to the native factory is
    // visibly initialized here; 0 is also the implicit Java default.
    numShardBits_ = 0;
    blockSize_ = 4 * 1024;
    blockSizeDeviation_ = 10;
    blockRestartInterval_ = 16;
    wholeKeyFiltering_ = true;
    bitsPerKey_ = 0;
  }

  /**
   * Disable block cache. If this is set to true,
   * then no block cache should be used, and the block_cache should
   * point to a nullptr object.
   * Default: false
   *
   * @param noBlockCache true to disable the block cache
   * @return the reference to the current config.
   */
  public BlockBasedTableConfig setNoBlockCache(boolean noBlockCache) {
    noBlockCache_ = noBlockCache;
    return this;
  }

  /**
   * @return if block cache is disabled
   */
  public boolean noBlockCache() {
    return noBlockCache_;
  }

  /**
   * Set the amount of cache in bytes that will be used by RocksDB.
   * If cacheSize is non-positive, then cache will not be used.
   * DEFAULT: 8M
   *
   * @param blockCacheSize block cache size in bytes
   * @return the reference to the current config.
   */
  public BlockBasedTableConfig setBlockCacheSize(long blockCacheSize) {
    blockCacheSize_ = blockCacheSize;
    return this;
  }

  /**
   * @return block cache size in bytes
   */
  public long blockCacheSize() {
    return blockCacheSize_;
  }

  /**
   * Controls the number of shards for the block cache.
   * This is applied only if cacheSize is set to non-negative.
   *
   * @param numShardBits the number of shard bits. The resulting
   *     number of shards would be 2 ^ numShardBits. Any negative
   *     number means use default settings.
   * @return the reference to the current config.
   */
  public BlockBasedTableConfig setCacheNumShardBits(int numShardBits) {
    numShardBits_ = numShardBits;
    return this;
  }

  /**
   * Returns the number of shard bits used in the block cache.
   * The resulting number of shards would be 2 ^ (returned value).
   * Any negative number means use default settings.
   *
   * @return the number of shard bits used in the block cache.
   */
  public int cacheNumShardBits() {
    return numShardBits_;
  }

  /**
   * Approximate size of user data packed per block. Note that the
   * block size specified here corresponds to uncompressed data. The
   * actual size of the unit read from disk may be smaller if
   * compression is enabled. This parameter can be changed dynamically.
   * Default: 4K
   *
   * @param blockSize block size in bytes
   * @return the reference to the current config.
   */
  public BlockBasedTableConfig setBlockSize(long blockSize) {
    blockSize_ = blockSize;
    return this;
  }

  /**
   * @return block size in bytes
   */
  public long blockSize() {
    return blockSize_;
  }

  /**
   * This is used to close a block before it reaches the configured
   * 'block_size'. If the percentage of free space in the current block is less
   * than this specified number and adding a new record to the block will
   * exceed the configured block size, then this block will be closed and the
   * new record will be written to the next block.
   * Default is 10.
   *
   * @param blockSizeDeviation the deviation to block size allowed
   * @return the reference to the current config.
   */
  public BlockBasedTableConfig setBlockSizeDeviation(int blockSizeDeviation) {
    blockSizeDeviation_ = blockSizeDeviation;
    return this;
  }

  /**
   * @return the deviation to block size allowed.
   */
  public int blockSizeDeviation() {
    return blockSizeDeviation_;
  }

  /**
   * Set block restart interval
   *
   * @param restartInterval block restart interval.
   * @return the reference to the current config.
   */
  public BlockBasedTableConfig setBlockRestartInterval(int restartInterval) {
    blockRestartInterval_ = restartInterval;
    return this;
  }

  /**
   * @return block restart interval
   */
  public int blockRestartInterval() {
    return blockRestartInterval_;
  }

  /**
   * If true, place whole keys in the filter (not just prefixes).
   * This must generally be true for gets to be efficient.
   * Default: true
   *
   * @param wholeKeyFiltering if enable whole key filtering
   * @return the reference to the current config.
   */
  public BlockBasedTableConfig setWholeKeyFiltering(boolean wholeKeyFiltering) {
    wholeKeyFiltering_ = wholeKeyFiltering;
    return this;
  }

  /**
   * @return if whole key filtering is enabled
   */
  public boolean wholeKeyFiltering() {
    return wholeKeyFiltering_;
  }

  /**
   * Sets the number of filter bits per key that is handed to the native
   * table factory, which uses a filter policy to reduce disk reads.
   * Default: 0
   *
   * NOTE(review): presumably a non-positive value means that no filter
   * policy is installed - confirm against the native implementation.
   *
   * @param bitsPerKey number of filter bits per key.
   * @return the reference to the current config.
   */
  public BlockBasedTableConfig setFilterBitsPerKey(int bitsPerKey) {
    bitsPerKey_ = bitsPerKey;
    return this;
  }

  /**
   * Creates the native table factory from the values currently stored in
   * this config and returns its handle.
   */
  @Override protected long newTableFactoryHandle() {
    return newTableFactoryHandle(noBlockCache_, blockCacheSize_, numShardBits_,
        blockSize_, blockSizeDeviation_, blockRestartInterval_,
        wholeKeyFiltering_, bitsPerKey_);
  }

  private native long newTableFactoryHandle(
      boolean noBlockCache, long blockCacheSize, int numShardbits,
      long blockSize, int blockSizeDeviation, int blockRestartInterval,
      boolean wholeKeyFiltering, int bitsPerKey);

  private boolean noBlockCache_;
  private long blockCacheSize_;
  private int numShardBits_;
  private long blockSize_;
  private int blockSizeDeviation_;
  private int blockRestartInterval_;
  private boolean wholeKeyFiltering_;
  private int bitsPerKey_;
}

@ -0,0 +1,36 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
package org.rocksdb;
/**
 * Rate limiter configuration, used to control the write rate of flush and
 * compaction. It holds a byte rate, a refill period and a fairness value
 * and passes them to the native side when a handle is requested.
 */
public class GenericRateLimiterConfig extends RateLimiterConfig {
  // Values used by the single-argument constructor.
  private static final long DEFAULT_REFILL_PERIOD_MICROS = 100 * 1000;
  private static final int DEFAULT_FAIRNESS = 10;

  private final long rateBytesPerSecond_;
  private final long refillPeriodMicros_;
  private final int fairness_;

  /**
   * Creates a config with the given rate and the default refill period
   * and fairness.
   *
   * @param rateBytesPerSecond rate in bytes per second.
   */
  public GenericRateLimiterConfig(long rateBytesPerSecond) {
    this(rateBytesPerSecond, DEFAULT_REFILL_PERIOD_MICROS, DEFAULT_FAIRNESS);
  }

  /**
   * Creates a config with every value given explicitly.
   *
   * @param rateBytesPerSecond rate in bytes per second.
   * @param refillPeriodMicros refill period in microseconds.
   * @param fairness fairness value passed to the native rate limiter.
   */
  public GenericRateLimiterConfig(long rateBytesPerSecond,
      long refillPeriodMicros, int fairness) {
    this.rateBytesPerSecond_ = rateBytesPerSecond;
    this.refillPeriodMicros_ = refillPeriodMicros;
    this.fairness_ = fairness;
  }

  /**
   * Asks the native side for a rate limiter built from the stored values.
   */
  @Override protected long newRateLimiterHandle() {
    final long handle = newRateLimiterHandle(
        rateBytesPerSecond_, refillPeriodMicros_, fairness_);
    return handle;
  }

  private native long newRateLimiterHandle(long rateBytesPerSecond,
      long refillPeriodMicros, int fairness);
}

@ -139,135 +139,6 @@ public class Options extends RocksObject {
return maxWriteBufferNumber(nativeHandle_); return maxWriteBufferNumber(nativeHandle_);
} }
/**
 * Approximate size of user data packed per block. Note that the
 * block size specified here corresponds to uncompressed data. The
 * actual size of the unit read from disk may be smaller if
 * compression is enabled. This parameter can be changed dynamically.
 *
 * Default: 4K
 *
 * @param blockSize the size of each block in bytes.
 * @return the instance of the current Options.
 * @see RocksDB#open
 */
public Options setBlockSize(long blockSize) {
  assert(isInitialized());
  setBlockSize(nativeHandle_, blockSize);
  return this;
}
/**
 * Returns the size of a block in bytes.
 *
 * @return block size.
 * @see #setBlockSize(long)
 */
public long blockSize() {
  assert(isInitialized());
  return blockSize(nativeHandle_);
}
/**
 * Use the specified filter policy to reduce disk reads.
 *
 * Filter should not be disposed before options instances using this filter is
 * disposed. If dispose() function is not called, then filter object will be
 * GC'd automatically.
 *
 * Filter instance can be re-used in multiple options instances.
 *
 * @param filter the filter policy java instance; its native handle is
 *     passed to the native options and a reference to it is kept by this
 *     Options instance.
 * @return the instance of the current Options.
 * @see RocksDB#open
 */
public Options setFilter(Filter filter) {
  assert(isInitialized());
  setFilterHandle(nativeHandle_, filter.nativeHandle_);
  // Keep a Java-side reference to the filter alongside the native handle.
  filter_ = filter;
  return this;
}
private native void setFilterHandle(long optHandle, long filterHandle);
/**
 * Disable compaction triggered by seek.
 * With bloomfilter and fast storage, a miss on one level
 * is very cheap if the file handle is cached in table cache
 * (which is true if max_open_files is large).
 * Default: true
 *
 * @param disableSeekCompaction a boolean value to specify whether
 *     to disable seek compaction.
 * @return the instance of the current Options.
 * @see RocksDB#open
 */
public Options setDisableSeekCompaction(boolean disableSeekCompaction) {
  assert(isInitialized());
  setDisableSeekCompaction(nativeHandle_, disableSeekCompaction);
  return this;
}
/**
 * Returns true if disable seek compaction is set to true.
 *
 * @return true if disable seek compaction is set to true.
 * @see #setDisableSeekCompaction(boolean)
 */
public boolean disableSeekCompaction() {
  assert(isInitialized());
  return disableSeekCompaction(nativeHandle_);
}
/**
 * Set the amount of cache in bytes that will be used by RocksDB.
 * If cacheSize is non-positive, then cache will not be used.
 *
 * DEFAULT: 8M
 *
 * @param cacheSize the cache size in bytes.
 * @return the reference to the current option.
 * @see #setCacheNumShardBits(int)
 */
public Options setCacheSize(long cacheSize) {
  // Only stored on the Java side here; no native call is made.
  cacheSize_ = cacheSize;
  return this;
}
/**
 * Returns the cache size, in bytes, currently stored in this Options
 * instance.
 *
 * @return the amount of cache in bytes that will be used by RocksDB.
 * @see #cacheNumShardBits()
 */
public long cacheSize() {
  return this.cacheSize_;
}
/**
 * Controls the number of shards for the block cache.
 * This is applied only if cacheSize is set to non-negative.
 *
 * @param numShardBits the number of shard bits. The resulting
 *     number of shards would be 2 ^ numShardBits. Any negative
 *     number means use default settings.
 * @return the reference to the current option.
 * @see #setCacheSize(long)
 */
public Options setCacheNumShardBits(int numShardBits) {
  this.numShardBits_ = numShardBits;
  return this;
}
/**
 * Returns the number of shard bits used in the block cache.
 * The resulting number of shards would be 2 ^ (returned value).
 * Any negative number means use default settings.
 *
 * @return the number of shard bits used in the block cache.
 * @see #cacheSize()
 */
public int cacheNumShardBits() {
  return this.numShardBits_;
}
/** /**
* If true, an error will be thrown during RocksDB.open() if the * If true, an error will be thrown during RocksDB.open() if the
* database already exists. * database already exists.
@ -437,40 +308,6 @@ public class Options extends RocksObject {
} }
private native void setUseFsync(long handle, boolean useFsync); private native void setUseFsync(long handle, boolean useFsync);
/**
 * The time interval in seconds between each two consecutive stats logs.
 * This number controls how often a new scribe log about
 * db deploy stats is written out.
 * -1 indicates no logging at all.
 *
 * @return the time interval in seconds between each two consecutive
 *     stats logs, as read from the native options.
 */
public int dbStatsLogInterval() {
  assert isInitialized();
  final int intervalSeconds = dbStatsLogInterval(nativeHandle_);
  return intervalSeconds;
}
private native int dbStatsLogInterval(long handle);
/**
 * The time interval in seconds between each two consecutive stats logs.
 * This number controls how often a new scribe log about
 * db deploy stats is written out.
 * -1 indicates no logging at all.
 * Default value is 1800 (half an hour).
 *
 * @param dbStatsLogInterval the time interval in seconds between each
 *     two consecutive stats logs.
 * @return the reference to the current option.
 */
public Options setDbStatsLogInterval(int dbStatsLogInterval) {
  assert isInitialized();
  // Forward the value to the native options object.
  this.setDbStatsLogInterval(nativeHandle_, dbStatsLogInterval);
  return this;
}
private native void setDbStatsLogInterval(
    long handle, int dbStatsLogInterval);
/** /**
* Returns the directory of info log. * Returns the directory of info log.
* *
@ -1271,6 +1108,19 @@ public class Options extends RocksObject {
return this; return this;
} }
/**
 * Use to control write rate of flush and compaction. Flush has higher
 * priority than compaction. Rate limiting is disabled if nullptr.
 * Default: nullptr
 *
 * Note that "nullptr" above refers to the native option when this method
 * has not been called; {@code config} itself is dereferenced here and
 * must not be null. Each call asks {@code config} for a new native rate
 * limiter handle and passes it to the native options.
 *
 * @param config rate limiter config; must not be null.
 * @return the instance of the current Options.
 */
public Options setRateLimiterConfig(RateLimiterConfig config) {
  // Same guard as the other setters that touch nativeHandle_.
  assert(isInitialized());
  setRateLimiter(nativeHandle_, config.newRateLimiterHandle());
  return this;
}
/** /**
* Returns the name of the current mem table representation. * Returns the name of the current mem table representation.
* Memtable format can be set using setTableFormatConfig. * Memtable format can be set using setTableFormatConfig.
@ -1400,33 +1250,6 @@ public class Options extends RocksObject {
} }
private native void setCompactionStyle(long handle, byte compactionStyle); private native void setCompactionStyle(long handle, byte compactionStyle);
/**
 * If true, place whole keys in the filter (not just prefixes).
 * This must generally be true for gets to be efficient.
 * Default: true
 *
 * @return if true, then whole-key-filtering is on.
 */
public boolean wholeKeyFiltering() {
  final boolean enabled = wholeKeyFiltering(nativeHandle_);
  return enabled;
}
private native boolean wholeKeyFiltering(long handle);
/**
 * If true, place whole keys in the filter (not just prefixes).
 * This must generally be true for gets to be efficient.
 * Default: true
 *
 * @param wholeKeyFiltering if true, then whole-key-filtering is on.
 * @return the reference to the current option.
 */
public Options setWholeKeyFiltering(boolean wholeKeyFiltering) {
  // Forward the flag to the native options object.
  this.setWholeKeyFiltering(nativeHandle_, wholeKeyFiltering);
  return this;
}
private native void setWholeKeyFiltering(
    long handle, boolean wholeKeyFiltering);
/** /**
* If level-styled compaction is used, then this number determines * If level-styled compaction is used, then this number determines
* the total number of levels. * the total number of levels.
@ -1900,35 +1723,6 @@ public class Options extends RocksObject {
private native void setRateLimitDelayMaxMilliseconds( private native void setRateLimitDelayMaxMilliseconds(
long handle, int rateLimitDelayMaxMilliseconds); long handle, int rateLimitDelayMaxMilliseconds);
/**
 * Disable block cache. If this is set to true,
 * then no block cache should be used, and the block_cache should
 * point to a nullptr object.
 * Default: false
 *
 * @return true if block cache is disabled.
 */
public boolean noBlockCache() {
  final boolean disabled = noBlockCache(nativeHandle_);
  return disabled;
}
private native boolean noBlockCache(long handle);
/**
 * Disable block cache. If this is set to true,
 * then no block cache should be used, and the block_cache should
 * point to a nullptr object.
 * Default: false
 *
 * @param noBlockCache true if block-cache is disabled.
 * @return the reference to the current option.
 */
public Options setNoBlockCache(boolean noBlockCache) {
  // Forward the flag to the native options object.
  this.setNoBlockCache(nativeHandle_, noBlockCache);
  return this;
}
private native void setNoBlockCache(
    long handle, boolean noBlockCache);
/** /**
* The size of one block in arena memory allocation. * The size of one block in arena memory allocation.
* If <= 0, a proper value is automatically calculated (usually 1/10 of * If <= 0, a proper value is automatically calculated (usually 1/10 of
@ -2026,39 +1820,6 @@ public class Options extends RocksObject {
private native void setPurgeRedundantKvsWhileFlush( private native void setPurgeRedundantKvsWhileFlush(
long handle, boolean purgeRedundantKvsWhileFlush); long handle, boolean purgeRedundantKvsWhileFlush);
/**
 * This is used to close a block before it reaches the configured
 * 'block_size'. If the percentage of free space in the current block is less
 * than this specified number and adding a new record to the block will
 * exceed the configured block size, then this block will be closed and the
 * new record will be written to the next block.
 * Default is 10.
 *
 * @return the block size deviation, i.e. the free-space percentage
 *     described above.
 */
public int blockSizeDeviation() {
  return blockSizeDeviation(nativeHandle_);
}
private native int blockSizeDeviation(long handle);
/**
 * This is used to close a block before it reaches the configured
 * 'block_size'. If the percentage of free space in the current block is less
 * than this specified number and adding a new record to the block will
 * exceed the configured block size, then this block will be closed and the
 * new record will be written to the next block.
 * Default is 10.
 *
 * @param blockSizeDeviation the block size deviation, i.e. the free-space
 *     percentage described above.
 * @return the reference to the current option.
 */
public Options setBlockSizeDeviation(int blockSizeDeviation) {
  setBlockSizeDeviation(nativeHandle_, blockSizeDeviation);
  return this;
}
private native void setBlockSizeDeviation(
    long handle, int blockSizeDeviation);
/** /**
* If true, compaction will verify checksum on every read that happens * If true, compaction will verify checksum on every read that happens
* as part of compaction * as part of compaction
@ -2440,11 +2201,6 @@ public class Options extends RocksObject {
private native void setMaxWriteBufferNumber( private native void setMaxWriteBufferNumber(
long handle, int maxWriteBufferNumber); long handle, int maxWriteBufferNumber);
private native int maxWriteBufferNumber(long handle); private native int maxWriteBufferNumber(long handle);
private native void setBlockSize(long handle, long blockSize);
private native long blockSize(long handle);
private native void setDisableSeekCompaction(
long handle, boolean disableSeekCompaction);
private native boolean disableSeekCompaction(long handle);
private native void setMaxBackgroundCompactions( private native void setMaxBackgroundCompactions(
long handle, int maxBackgroundCompactions); long handle, int maxBackgroundCompactions);
private native int maxBackgroundCompactions(long handle); private native int maxBackgroundCompactions(long handle);
@ -2452,6 +2208,8 @@ public class Options extends RocksObject {
private native long statisticsPtr(long optHandle); private native long statisticsPtr(long optHandle);
private native void setMemTableFactory(long handle, long factoryHandle); private native void setMemTableFactory(long handle, long factoryHandle);
private native void setRateLimiter(long handle,
long rateLimiterHandle);
private native String memTableFactoryName(long handle); private native String memTableFactoryName(long handle);
private native void setTableFactory(long handle, long factoryHandle); private native void setTableFactory(long handle, long factoryHandle);
@ -2462,6 +2220,5 @@ public class Options extends RocksObject {
long cacheSize_; long cacheSize_;
int numShardBits_; int numShardBits_;
Filter filter_;
RocksEnv env_; RocksEnv env_;
} }

@ -0,0 +1,20 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
package org.rocksdb;
/**
 * Config for rate limiter, which is used to control write rate of flush and
 * compaction.
 */
public abstract class RateLimiterConfig {
  /**
   * Creates the native rate limiter described by this config and returns
   * its handle.
   *
   * This function should only be called by Options.setRateLimiterConfig(),
   * which will create a c++ shared-pointer to the c++ RateLimiter
   * that is associated with the Java RateLimiterConfig.
   *
   * @return handle of the newly created native rate limiter.
   */
  protected abstract long newRateLimiterHandle();
}

@ -130,8 +130,7 @@ public class RocksDB extends RocksObject {
// in RocksDB can prevent Java to GC during the life-time of // in RocksDB can prevent Java to GC during the life-time of
// the currently-created RocksDB. // the currently-created RocksDB.
RocksDB db = new RocksDB(); RocksDB db = new RocksDB();
db.open(options.nativeHandle_, options.cacheSize_, db.open(options.nativeHandle_, path);
options.numShardBits_, path);
db.storeOptionsInstance(options); db.storeOptionsInstance(options);
return db; return db;
@ -349,8 +348,7 @@ public class RocksDB extends RocksObject {
// native methods // native methods
protected native void open( protected native void open(
long optionsHandle, long cacheSize, int numShardBits, long optionsHandle, String path) throws RocksDBException;
String path) throws RocksDBException;
protected native void put( protected native void put(
long handle, byte[] key, int keyLen, long handle, byte[] key, int keyLen,
byte[] value, int valueLen) throws RocksDBException; byte[] value, int valueLen) throws RocksDBException;

@ -255,7 +255,7 @@ public class DbBenchmark {
for (long j = 0; j < entriesPerBatch_; j++) { for (long j = 0; j < entriesPerBatch_; j++) {
getKey(key, i + j, keyRange_); getKey(key, i + j, keyRange_);
DbBenchmark.this.gen_.generate(value); DbBenchmark.this.gen_.generate(value);
db_.put(writeOpt_, key, value); batch.put(key, value);
stats_.finishedSingleOp(keySize_ + valueSize_); stats_.finishedSingleOp(keySize_ + valueSize_);
} }
db_.write(writeOpt_, batch); db_.write(writeOpt_, batch);
@ -446,7 +446,6 @@ public class DbBenchmark {
randSeed_ = (Long) flags.get(Flag.seed); randSeed_ = (Long) flags.get(Flag.seed);
databaseDir_ = (String) flags.get(Flag.db); databaseDir_ = (String) flags.get(Flag.db);
writesPerSeconds_ = (Integer) flags.get(Flag.writes_per_second); writesPerSeconds_ = (Integer) flags.get(Flag.writes_per_second);
cacheSize_ = (Long) flags.get(Flag.cache_size);
memtable_ = (String) flags.get(Flag.memtablerep); memtable_ = (String) flags.get(Flag.memtablerep);
maxWriteBufferNumber_ = (Integer) flags.get(Flag.max_write_buffer_number); maxWriteBufferNumber_ = (Integer) flags.get(Flag.max_write_buffer_number);
prefixSize_ = (Integer) flags.get(Flag.prefix_size); prefixSize_ = (Integer) flags.get(Flag.prefix_size);
@ -491,7 +490,6 @@ public class DbBenchmark {
} }
private void prepareOptions(Options options) { private void prepareOptions(Options options) {
options.setCacheSize(cacheSize_);
if (!useExisting_) { if (!useExisting_) {
options.setCreateIfMissing(true); options.setCreateIfMissing(true);
} else { } else {
@ -521,6 +519,13 @@ public class DbBenchmark {
if (usePlainTable_) { if (usePlainTable_) {
options.setTableFormatConfig( options.setTableFormatConfig(
new PlainTableConfig().setKeySize(keySize_)); new PlainTableConfig().setKeySize(keySize_));
} else {
BlockBasedTableConfig table_options = new BlockBasedTableConfig();
table_options.setBlockSize((Long)flags_.get(Flag.block_size))
.setBlockCacheSize((Long)flags_.get(Flag.cache_size))
.setFilterBitsPerKey((Integer)flags_.get(Flag.bloom_bits))
.setCacheNumShardBits((Integer)flags_.get(Flag.cache_numshardbits));
options.setTableFormatConfig(table_options);
} }
options.setWriteBufferSize( options.setWriteBufferSize(
(Long)flags_.get(Flag.write_buffer_size)); (Long)flags_.get(Flag.write_buffer_size));
@ -532,12 +537,6 @@ public class DbBenchmark {
(Integer)flags_.get(Flag.max_background_compactions)); (Integer)flags_.get(Flag.max_background_compactions));
options.setMaxBackgroundFlushes( options.setMaxBackgroundFlushes(
(Integer)flags_.get(Flag.max_background_flushes)); (Integer)flags_.get(Flag.max_background_flushes));
options.setCacheSize(
(Long)flags_.get(Flag.cache_size));
options.setCacheNumShardBits(
(Integer)flags_.get(Flag.cache_numshardbits));
options.setBlockSize(
(Long)flags_.get(Flag.block_size));
options.setMaxOpenFiles( options.setMaxOpenFiles(
(Integer)flags_.get(Flag.open_files)); (Integer)flags_.get(Flag.open_files));
options.setTableCacheRemoveScanCountLimit( options.setTableCacheRemoveScanCountLimit(
@ -548,8 +547,6 @@ public class DbBenchmark {
(Boolean)flags_.get(Flag.use_fsync)); (Boolean)flags_.get(Flag.use_fsync));
options.setWalDir( options.setWalDir(
(String)flags_.get(Flag.wal_dir)); (String)flags_.get(Flag.wal_dir));
options.setDisableSeekCompaction(
(Boolean)flags_.get(Flag.disable_seek_compaction));
options.setDeleteObsoleteFilesPeriodMicros( options.setDeleteObsoleteFilesPeriodMicros(
(Integer)flags_.get(Flag.delete_obsolete_files_period_micros)); (Integer)flags_.get(Flag.delete_obsolete_files_period_micros));
options.setTableCacheNumshardbits( options.setTableCacheNumshardbits(
@ -604,15 +601,6 @@ public class DbBenchmark {
(Integer)flags_.get(Flag.max_successive_merges)); (Integer)flags_.get(Flag.max_successive_merges));
options.setWalTtlSeconds((Long)flags_.get(Flag.wal_ttl_seconds)); options.setWalTtlSeconds((Long)flags_.get(Flag.wal_ttl_seconds));
options.setWalSizeLimitMB((Long)flags_.get(Flag.wal_size_limit_MB)); options.setWalSizeLimitMB((Long)flags_.get(Flag.wal_size_limit_MB));
int bloomBits = (Integer)flags_.get(Flag.bloom_bits);
if (bloomBits > 0) {
// Internally, options will keep a reference to this BloomFilter.
// This will disallow Java to GC this BloomFilter. In addition,
// options.dispose() will release the c++ object of this BloomFilter.
// As a result, the caller should not directly call
// BloomFilter.dispose().
options.setFilter(new BloomFilter(bloomBits));
}
/* TODO(yhchiang): enable the following parameters /* TODO(yhchiang): enable the following parameters
options.setCompressionType((String)flags_.get(Flag.compression_type)); options.setCompressionType((String)flags_.get(Flag.compression_type));
options.setCompressionLevel((Integer)flags_.get(Flag.compression_level)); options.setCompressionLevel((Integer)flags_.get(Flag.compression_level));
@ -1160,7 +1148,7 @@ public class DbBenchmark {
return Integer.parseInt(value); return Integer.parseInt(value);
} }
}, },
block_size(defaultOptions_.blockSize(), block_size(defaultBlockBasedTableOptions_.blockSize(),
"Number of bytes in a block.") { "Number of bytes in a block.") {
@Override public Object parseValue(String value) { @Override public Object parseValue(String value) {
return Long.parseLong(value); return Long.parseLong(value);
@ -1312,12 +1300,6 @@ public class DbBenchmark {
return Integer.parseInt(value); return Integer.parseInt(value);
} }
}, },
disable_seek_compaction(false,"Option to disable compaction\n" +
"\ttriggered by read.") {
@Override public Object parseValue(String value) {
return parseBoolean(value);
}
},
delete_obsolete_files_period_micros(0,"Option to delete\n" + delete_obsolete_files_period_micros(0,"Option to delete\n" +
"\tobsolete files periodically. 0 means that obsolete files are\n" + "\tobsolete files periodically. 0 means that obsolete files are\n" +
"\tdeleted after every compaction run.") { "\tdeleted after every compaction run.") {
@ -1597,7 +1579,6 @@ public class DbBenchmark {
final int threadNum_; final int threadNum_;
final int writesPerSeconds_; final int writesPerSeconds_;
final long randSeed_; final long randSeed_;
final long cacheSize_;
final boolean useExisting_; final boolean useExisting_;
final String databaseDir_; final String databaseDir_;
double compressionRatio_; double compressionRatio_;
@ -1620,6 +1601,8 @@ public class DbBenchmark {
// as the scope of a static member equals to the scope of the problem, // as the scope of a static member equals to the scope of the problem,
// we let its c++ pointer to be disposed in its finalizer. // we let its c++ pointer to be disposed in its finalizer.
static Options defaultOptions_ = new Options(); static Options defaultOptions_ = new Options();
static BlockBasedTableConfig defaultBlockBasedTableOptions_ =
new BlockBasedTableConfig();
String compressionType_; String compressionType_;
CompressionType compression_; CompressionType compression_;
} }

@ -52,12 +52,6 @@ public class OptionsTest {
assert(opt.useFsync() == boolValue); assert(opt.useFsync() == boolValue);
} }
{ // DbStatsLogInterval test
int intValue = rand.nextInt();
opt.setDbStatsLogInterval(intValue);
assert(opt.dbStatsLogInterval() == intValue);
}
{ // DbLogDir test { // DbLogDir test
String str = "path/to/DbLogDir"; String str = "path/to/DbLogDir";
opt.setDbLogDir(str); opt.setDbLogDir(str);
@ -214,24 +208,6 @@ public class OptionsTest {
assert(opt.minWriteBufferNumberToMerge() == intValue); assert(opt.minWriteBufferNumberToMerge() == intValue);
} }
{ // BlockSize test
long longValue = rand.nextLong();
opt.setBlockSize(longValue);
assert(opt.blockSize() == longValue);
}
{ // BlockRestartInterval test
int intValue = rand.nextInt();
opt.setBlockRestartInterval(intValue);
assert(opt.blockRestartInterval() == intValue);
}
{ // WholeKeyFiltering test
boolean boolValue = rand.nextBoolean();
opt.setWholeKeyFiltering(boolValue);
assert(opt.wholeKeyFiltering() == boolValue);
}
{ // NumLevels test { // NumLevels test
int intValue = rand.nextInt(); int intValue = rand.nextInt();
opt.setNumLevels(intValue); opt.setNumLevels(intValue);
@ -304,12 +280,6 @@ public class OptionsTest {
assert(opt.maxGrandparentOverlapFactor() == intValue); assert(opt.maxGrandparentOverlapFactor() == intValue);
} }
{ // DisableSeekCompaction test
boolean boolValue = rand.nextBoolean();
opt.setDisableSeekCompaction(boolValue);
assert(opt.disableSeekCompaction() == boolValue);
}
{ // SoftRateLimit test { // SoftRateLimit test
double doubleValue = rand.nextDouble(); double doubleValue = rand.nextDouble();
opt.setSoftRateLimit(doubleValue); opt.setSoftRateLimit(doubleValue);
@ -328,12 +298,6 @@ public class OptionsTest {
assert(opt.rateLimitDelayMaxMilliseconds() == intValue); assert(opt.rateLimitDelayMaxMilliseconds() == intValue);
} }
{ // NoBlockCache test
boolean boolValue = rand.nextBoolean();
opt.setNoBlockCache(boolValue);
assert(opt.noBlockCache() == boolValue);
}
{ // ArenaBlockSize test { // ArenaBlockSize test
long longValue = rand.nextLong(); long longValue = rand.nextLong();
opt.setArenaBlockSize(longValue); opt.setArenaBlockSize(longValue);
@ -352,12 +316,6 @@ public class OptionsTest {
assert(opt.purgeRedundantKvsWhileFlush() == boolValue); assert(opt.purgeRedundantKvsWhileFlush() == boolValue);
} }
{ // BlockSizeDeviation test
int intValue = rand.nextInt();
opt.setBlockSizeDeviation(intValue);
assert(opt.blockSizeDeviation() == intValue);
}
{ // VerifyChecksumsInCompaction test { // VerifyChecksumsInCompaction test
boolean boolValue = rand.nextBoolean(); boolean boolValue = rand.nextBoolean();
opt.setVerifyChecksumsInCompaction(boolValue); opt.setVerifyChecksumsInCompaction(boolValue);

@ -5,6 +5,7 @@
// //
// This file implements the "bridge" between Java and C++ for MemTables. // This file implements the "bridge" between Java and C++ for MemTables.
#include "rocksjni/portal.h"
#include "include/org_rocksdb_HashSkipListMemTableConfig.h" #include "include/org_rocksdb_HashSkipListMemTableConfig.h"
#include "include/org_rocksdb_HashLinkedListMemTableConfig.h" #include "include/org_rocksdb_HashLinkedListMemTableConfig.h"
#include "include/org_rocksdb_VectorMemTableConfig.h" #include "include/org_rocksdb_VectorMemTableConfig.h"
@ -20,7 +21,7 @@ jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle(
JNIEnv* env, jobject jobj, jlong jbucket_count, JNIEnv* env, jobject jobj, jlong jbucket_count,
jint jheight, jint jbranching_factor) { jint jheight, jint jbranching_factor) {
return reinterpret_cast<jlong>(rocksdb::NewHashSkipListRepFactory( return reinterpret_cast<jlong>(rocksdb::NewHashSkipListRepFactory(
static_cast<size_t>(jbucket_count), rocksdb::jlong_to_size_t(jbucket_count),
static_cast<int32_t>(jheight), static_cast<int32_t>(jheight),
static_cast<int32_t>(jbranching_factor))); static_cast<int32_t>(jbranching_factor)));
} }
@ -33,7 +34,7 @@ jlong Java_org_rocksdb_HashSkipListMemTableConfig_newMemTableFactoryHandle(
jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle( jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle(
JNIEnv* env, jobject jobj, jlong jbucket_count) { JNIEnv* env, jobject jobj, jlong jbucket_count) {
return reinterpret_cast<jlong>(rocksdb::NewHashLinkListRepFactory( return reinterpret_cast<jlong>(rocksdb::NewHashLinkListRepFactory(
static_cast<size_t>(jbucket_count))); rocksdb::jlong_to_size_t(jbucket_count)));
} }
/* /*
@ -44,7 +45,7 @@ jlong Java_org_rocksdb_HashLinkedListMemTableConfig_newMemTableFactoryHandle(
jlong Java_org_rocksdb_VectorMemTableConfig_newMemTableFactoryHandle( jlong Java_org_rocksdb_VectorMemTableConfig_newMemTableFactoryHandle(
JNIEnv* env, jobject jobj, jlong jreserved_size) { JNIEnv* env, jobject jobj, jlong jreserved_size) {
return reinterpret_cast<jlong>(new rocksdb::VectorRepFactory( return reinterpret_cast<jlong>(new rocksdb::VectorRepFactory(
static_cast<size_t>(jreserved_size))); rocksdb::jlong_to_size_t(jreserved_size)));
} }
/* /*

@ -21,7 +21,7 @@
#include "rocksdb/memtablerep.h" #include "rocksdb/memtablerep.h"
#include "rocksdb/table.h" #include "rocksdb/table.h"
#include "rocksdb/slice_transform.h" #include "rocksdb/slice_transform.h"
#include "rocksdb/filter_policy.h" #include "rocksdb/rate_limiter.h"
/* /*
* Class: org_rocksdb_Options * Class: org_rocksdb_Options
@ -71,7 +71,7 @@ jboolean Java_org_rocksdb_Options_createIfMissing(
void Java_org_rocksdb_Options_setWriteBufferSize( void Java_org_rocksdb_Options_setWriteBufferSize(
JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_buffer_size) { JNIEnv* env, jobject jobj, jlong jhandle, jlong jwrite_buffer_size) {
reinterpret_cast<rocksdb::Options*>(jhandle)->write_buffer_size = reinterpret_cast<rocksdb::Options*>(jhandle)->write_buffer_size =
static_cast<size_t>(jwrite_buffer_size); rocksdb::jlong_to_size_t(jwrite_buffer_size);
} }
@ -118,17 +118,6 @@ jlong Java_org_rocksdb_Options_statisticsPtr(
return reinterpret_cast<jlong>(st); return reinterpret_cast<jlong>(st);
} }
/*
* Class: org_rocksdb_Options
* Method: setFilterHandle
* Signature: (JJ)V
*/
void Java_org_rocksdb_Options_setFilterHandle(
    JNIEnv* env, jobject jobj, jlong jopt_handle, jlong jfilter_handle) {
  // Both jlong arguments are reinterpreted as native object addresses.
  auto* options = reinterpret_cast<rocksdb::Options*>(jopt_handle);
  auto* policy = reinterpret_cast<rocksdb::FilterPolicy*>(jfilter_handle);
  options->filter_policy = policy;
}
/* /*
* Class: org_rocksdb_Options * Class: org_rocksdb_Options
* Method: maxWriteBufferNumber * Method: maxWriteBufferNumber
@ -139,49 +128,6 @@ jint Java_org_rocksdb_Options_maxWriteBufferNumber(
return reinterpret_cast<rocksdb::Options*>(jhandle)->max_write_buffer_number; return reinterpret_cast<rocksdb::Options*>(jhandle)->max_write_buffer_number;
} }
/*
* Class: org_rocksdb_Options
* Method: setBlockSize
* Signature: (JJ)V
*/
void Java_org_rocksdb_Options_setBlockSize(
    JNIEnv* env, jobject jobj, jlong jhandle, jlong jblock_size) {
  // jhandle is reinterpreted as the address of the native Options object.
  auto* options = reinterpret_cast<rocksdb::Options*>(jhandle);
  const size_t block_size = static_cast<size_t>(jblock_size);
  options->block_size = block_size;
}
/*
* Class: org_rocksdb_Options
* Method: blockSize
* Signature: (J)J
*/
jlong Java_org_rocksdb_Options_blockSize(
    JNIEnv* env, jobject jobj, jlong jhandle) {
  // Returns the block_size field of the Options object behind jhandle.
  auto* options = reinterpret_cast<rocksdb::Options*>(jhandle);
  return options->block_size;
}
/*
* Class: org_rocksdb_Options
* Method: setDisableSeekCompaction
* Signature: (JZ)V
*/
void Java_org_rocksdb_Options_setDisableSeekCompaction(
    JNIEnv* env, jobject jobj, jlong jhandle,
    jboolean jdisable_seek_compaction) {
  // Stores the Java-side flag in Options::disable_seek_compaction.
  auto* options = reinterpret_cast<rocksdb::Options*>(jhandle);
  options->disable_seek_compaction = jdisable_seek_compaction;
}
/*
* Class: org_rocksdb_Options
* Method: disableSeekCompaction
* Signature: (J)Z
*/
jboolean Java_org_rocksdb_Options_disableSeekCompaction(
    JNIEnv* env, jobject jobj, jlong jhandle) {
  // Returns Options::disable_seek_compaction of the object behind jhandle.
  auto* options = reinterpret_cast<rocksdb::Options*>(jhandle);
  return options->disable_seek_compaction;
}
/* /*
* Class: org_rocksdb_Options * Class: org_rocksdb_Options
* Method: errorIfExists * Method: errorIfExists
@ -287,27 +233,6 @@ void Java_org_rocksdb_Options_setUseFsync(
static_cast<bool>(use_fsync); static_cast<bool>(use_fsync);
} }
/*
* Class: org_rocksdb_Options
* Method: dbStatsLogInterval
* Signature: (J)I
*/
jint Java_org_rocksdb_Options_dbStatsLogInterval(
    JNIEnv* env, jobject jobj, jlong jhandle) {
  // Returns Options::db_stats_log_interval of the object behind jhandle.
  auto* options = reinterpret_cast<rocksdb::Options*>(jhandle);
  return options->db_stats_log_interval;
}
/*
* Class: org_rocksdb_Options
* Method: setDbStatsLogInterval
* Signature: (JI)V
*/
void Java_org_rocksdb_Options_setDbStatsLogInterval(
    JNIEnv* env, jobject jobj, jlong jhandle, jint db_stats_log_interval) {
  // Stores the interval, converted to int, in Options::db_stats_log_interval.
  auto* options = reinterpret_cast<rocksdb::Options*>(jhandle);
  const int interval = static_cast<int>(db_stats_log_interval);
  options->db_stats_log_interval = interval;
}
/* /*
* Class: org_rocksdb_Options * Class: org_rocksdb_Options
* Method: dbLogDir * Method: dbLogDir
@ -438,7 +363,7 @@ jlong Java_org_rocksdb_Options_maxLogFileSize(
void Java_org_rocksdb_Options_setMaxLogFileSize( void Java_org_rocksdb_Options_setMaxLogFileSize(
JNIEnv* env, jobject jobj, jlong jhandle, jlong max_log_file_size) { JNIEnv* env, jobject jobj, jlong jhandle, jlong max_log_file_size) {
reinterpret_cast<rocksdb::Options*>(jhandle)->max_log_file_size = reinterpret_cast<rocksdb::Options*>(jhandle)->max_log_file_size =
static_cast<size_t>(max_log_file_size); rocksdb::jlong_to_size_t(max_log_file_size);
} }
/* /*
@ -459,7 +384,7 @@ jlong Java_org_rocksdb_Options_logFileTimeToRoll(
void Java_org_rocksdb_Options_setLogFileTimeToRoll( void Java_org_rocksdb_Options_setLogFileTimeToRoll(
JNIEnv* env, jobject jobj, jlong jhandle, jlong log_file_time_to_roll) { JNIEnv* env, jobject jobj, jlong jhandle, jlong log_file_time_to_roll) {
reinterpret_cast<rocksdb::Options*>(jhandle)->log_file_time_to_roll = reinterpret_cast<rocksdb::Options*>(jhandle)->log_file_time_to_roll =
static_cast<size_t>(log_file_time_to_roll); rocksdb::jlong_to_size_t(log_file_time_to_roll);
} }
/* /*
@ -480,7 +405,7 @@ jlong Java_org_rocksdb_Options_keepLogFileNum(
void Java_org_rocksdb_Options_setKeepLogFileNum( void Java_org_rocksdb_Options_setKeepLogFileNum(
JNIEnv* env, jobject jobj, jlong jhandle, jlong keep_log_file_num) { JNIEnv* env, jobject jobj, jlong jhandle, jlong keep_log_file_num) {
reinterpret_cast<rocksdb::Options*>(jhandle)->keep_log_file_num = reinterpret_cast<rocksdb::Options*>(jhandle)->keep_log_file_num =
static_cast<size_t>(keep_log_file_num); rocksdb::jlong_to_size_t(keep_log_file_num);
} }
/* /*
@ -535,6 +460,17 @@ void Java_org_rocksdb_Options_setMemTableFactory(
reinterpret_cast<rocksdb::MemTableRepFactory*>(jfactory_handle)); reinterpret_cast<rocksdb::MemTableRepFactory*>(jfactory_handle));
} }
/*
* Class: org_rocksdb_Options
* Method: setRateLimiter
* Signature: (JJ)V
*/
void Java_org_rocksdb_Options_setRateLimiter(
    JNIEnv* env, jobject jobj, jlong jhandle, jlong jrate_limiter_handle) {
  // Both jlong arguments are reinterpreted as native object addresses; the
  // limiter pointer is handed to Options::rate_limiter via reset().
  auto* options = reinterpret_cast<rocksdb::Options*>(jhandle);
  auto* limiter =
      reinterpret_cast<rocksdb::RateLimiter*>(jrate_limiter_handle);
  options->rate_limiter.reset(limiter);
}
/* /*
* Class: org_rocksdb_Options * Class: org_rocksdb_Options
* Method: tableCacheNumshardbits * Method: tableCacheNumshardbits
@ -585,7 +521,8 @@ void Java_org_rocksdb_Options_setTableCacheRemoveScanCountLimit(
void Java_org_rocksdb_Options_useFixedLengthPrefixExtractor( void Java_org_rocksdb_Options_useFixedLengthPrefixExtractor(
JNIEnv* env, jobject jobj, jlong jhandle, jint jprefix_length) { JNIEnv* env, jobject jobj, jlong jhandle, jint jprefix_length) {
reinterpret_cast<rocksdb::Options*>(jhandle)->prefix_extractor.reset( reinterpret_cast<rocksdb::Options*>(jhandle)->prefix_extractor.reset(
rocksdb::NewFixedPrefixTransform(static_cast<size_t>(jprefix_length))); rocksdb::NewFixedPrefixTransform(
rocksdb::jlong_to_size_t(jprefix_length)));
} }
/* /*
@ -649,7 +586,7 @@ jlong Java_org_rocksdb_Options_manifestPreallocationSize(
void Java_org_rocksdb_Options_setManifestPreallocationSize( void Java_org_rocksdb_Options_setManifestPreallocationSize(
JNIEnv* env, jobject jobj, jlong jhandle, jlong preallocation_size) { JNIEnv* env, jobject jobj, jlong jhandle, jlong preallocation_size) {
reinterpret_cast<rocksdb::Options*>(jhandle)->manifest_preallocation_size = reinterpret_cast<rocksdb::Options*>(jhandle)->manifest_preallocation_size =
static_cast<size_t>(preallocation_size); rocksdb::jlong_to_size_t(preallocation_size);
} }
/* /*
@ -914,27 +851,6 @@ void Java_org_rocksdb_Options_setMinWriteBufferNumberToMerge(
static_cast<int>(jmin_write_buffer_number_to_merge); static_cast<int>(jmin_write_buffer_number_to_merge);
} }
/*
* Class: org_rocksdb_Options
* Method: blockRestartInterval
* Signature: (J)I
*/
jint Java_org_rocksdb_Options_blockRestartInterval(
    JNIEnv* env, jobject jobj, jlong jhandle) {
  // Returns Options::block_restart_interval of the object behind jhandle.
  auto* options = reinterpret_cast<rocksdb::Options*>(jhandle);
  return options->block_restart_interval;
}
/*
* Class: org_rocksdb_Options
* Method: setBlockRestartInterval
* Signature: (JI)V
*/
void Java_org_rocksdb_Options_setBlockRestartInterval(
    JNIEnv* env, jobject jobj, jlong jhandle, jint jblock_restart_interval) {
  // Stores the interval, converted to int, in Options::block_restart_interval.
  auto* options = reinterpret_cast<rocksdb::Options*>(jhandle);
  const int interval = static_cast<int>(jblock_restart_interval);
  options->block_restart_interval = interval;
}
/* /*
* Class: org_rocksdb_Options * Class: org_rocksdb_Options
* Method: setCompressionType * Method: setCompressionType
@ -977,27 +893,6 @@ jbyte Java_org_rocksdb_Options_compactionStyle(
return reinterpret_cast<rocksdb::Options*>(jhandle)->compaction_style; return reinterpret_cast<rocksdb::Options*>(jhandle)->compaction_style;
} }
/*
* Class: org_rocksdb_Options
* Method: wholeKeyFiltering
* Signature: (J)Z
*/
jboolean Java_org_rocksdb_Options_wholeKeyFiltering(
    JNIEnv* env, jobject jobj, jlong jhandle) {
  // Returns Options::whole_key_filtering of the object behind jhandle.
  auto* options = reinterpret_cast<rocksdb::Options*>(jhandle);
  return options->whole_key_filtering;
}
/*
* Class: org_rocksdb_Options
* Method: setWholeKeyFiltering
* Signature: (JZ)V
*/
void Java_org_rocksdb_Options_setWholeKeyFiltering(
    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jwhole_key_filtering) {
  // Stores the flag, converted to bool, in Options::whole_key_filtering.
  auto* options = reinterpret_cast<rocksdb::Options*>(jhandle);
  const bool enabled = static_cast<bool>(jwhole_key_filtering);
  options->whole_key_filtering = enabled;
}
/* /*
* Class: org_rocksdb_Options * Class: org_rocksdb_Options
* Method: numLevels * Method: numLevels
@ -1345,27 +1240,6 @@ void Java_org_rocksdb_Options_setRateLimitDelayMaxMilliseconds(
static_cast<int>(jrate_limit_delay_max_milliseconds); static_cast<int>(jrate_limit_delay_max_milliseconds);
} }
/*
* Class: org_rocksdb_Options
* Method: noBlockCache
* Signature: (J)Z
*/
jboolean Java_org_rocksdb_Options_noBlockCache(
    JNIEnv* env, jobject jobj, jlong jhandle) {
  // Returns Options::no_block_cache of the object behind jhandle.
  auto* options = reinterpret_cast<rocksdb::Options*>(jhandle);
  return options->no_block_cache;
}
/*
* Class: org_rocksdb_Options
* Method: setNoBlockCache
* Signature: (JZ)V
*/
void Java_org_rocksdb_Options_setNoBlockCache(
    JNIEnv* env, jobject jobj, jlong jhandle, jboolean jno_block_cache) {
  // Stores the flag, converted to bool, in Options::no_block_cache.
  auto* options = reinterpret_cast<rocksdb::Options*>(jhandle);
  const bool disabled = static_cast<bool>(jno_block_cache);
  options->no_block_cache = disabled;
}
/* /*
* Class: org_rocksdb_Options * Class: org_rocksdb_Options
* Method: arenaBlockSize * Method: arenaBlockSize
@ -1384,7 +1258,7 @@ jlong Java_org_rocksdb_Options_arenaBlockSize(
void Java_org_rocksdb_Options_setArenaBlockSize( void Java_org_rocksdb_Options_setArenaBlockSize(
JNIEnv* env, jobject jobj, jlong jhandle, jlong jarena_block_size) { JNIEnv* env, jobject jobj, jlong jhandle, jlong jarena_block_size) {
reinterpret_cast<rocksdb::Options*>(jhandle)->arena_block_size = reinterpret_cast<rocksdb::Options*>(jhandle)->arena_block_size =
static_cast<size_t>(jarena_block_size); rocksdb::jlong_to_size_t(jarena_block_size);
} }
/* /*
@ -1435,28 +1309,6 @@ void Java_org_rocksdb_Options_setPurgeRedundantKvsWhileFlush(
static_cast<bool>(jpurge_redundant_kvs_while_flush); static_cast<bool>(jpurge_redundant_kvs_while_flush);
} }
/*
* Class: org_rocksdb_Options
* Method: blockSizeDeviation
* Signature: (J)I
*/
jint Java_org_rocksdb_Options_blockSizeDeviation(
    JNIEnv* env, jobject jobj, jlong jhandle) {
  // Returns Options::block_size_deviation of the object behind jhandle.
  auto* options = reinterpret_cast<rocksdb::Options*>(jhandle);
  return options->block_size_deviation;
}
/*
* Class: org_rocksdb_Options
* Method: setBlockSizeDeviation
* Signature: (JI)V
*/
void Java_org_rocksdb_Options_setBlockSizeDeviation(
    JNIEnv* env, jobject jobj, jlong jhandle,
    jint jblock_size_deviation) {
  // Stores the deviation, converted to int, in Options::block_size_deviation.
  auto* options = reinterpret_cast<rocksdb::Options*>(jhandle);
  const int deviation = static_cast<int>(jblock_size_deviation);
  options->block_size_deviation = deviation;
}
/* /*
* Class: org_rocksdb_Options * Class: org_rocksdb_Options
* Method: verifyChecksumsInCompaction * Method: verifyChecksumsInCompaction
@ -1571,7 +1423,7 @@ void Java_org_rocksdb_Options_setInplaceUpdateNumLocks(
jlong jinplace_update_num_locks) { jlong jinplace_update_num_locks) {
reinterpret_cast<rocksdb::Options*>( reinterpret_cast<rocksdb::Options*>(
jhandle)->inplace_update_num_locks = jhandle)->inplace_update_num_locks =
static_cast<size_t>(jinplace_update_num_locks); rocksdb::jlong_to_size_t(jinplace_update_num_locks);
} }
/* /*
@ -1662,7 +1514,7 @@ void Java_org_rocksdb_Options_setMaxSuccessiveMerges(
JNIEnv* env, jobject jobj, jlong jhandle, JNIEnv* env, jobject jobj, jlong jhandle,
jlong jmax_successive_merges) { jlong jmax_successive_merges) {
reinterpret_cast<rocksdb::Options*>(jhandle)->max_successive_merges = reinterpret_cast<rocksdb::Options*>(jhandle)->max_successive_merges =
static_cast<size_t>(jmax_successive_merges); rocksdb::jlong_to_size_t(jmax_successive_merges);
} }
/* /*

@ -11,12 +11,19 @@
#define JAVA_ROCKSJNI_PORTAL_H_ #define JAVA_ROCKSJNI_PORTAL_H_
#include <jni.h> #include <jni.h>
#include <limits>
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/filter_policy.h" #include "rocksdb/filter_policy.h"
#include "rocksdb/utilities/backupable_db.h" #include "rocksdb/utilities/backupable_db.h"
namespace rocksdb { namespace rocksdb {
// Converts a Java long to size_t. The value is first viewed as an unsigned
// 64-bit integer; if that exceeds the largest size_t, the result saturates
// at std::numeric_limits<size_t>::max(), otherwise it is cast directly.
inline size_t jlong_to_size_t(const jlong& jvalue) {
  const size_t largest = std::numeric_limits<size_t>::max();
  const uint64_t as_unsigned = static_cast<uint64_t>(jvalue);
  if (as_unsigned > static_cast<uint64_t>(largest)) {
    return largest;
  }
  return static_cast<size_t>(jvalue);
}
// The portal class for org.rocksdb.RocksDB // The portal class for org.rocksdb.RocksDB
class RocksDBJni { class RocksDBJni {
public: public:

@ -0,0 +1,24 @@
// Copyright (c) 2014, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// This file implements the "bridge" between Java and C++ for RateLimiter.
#include "rocksjni/portal.h"
#include "include/org_rocksdb_GenericRateLimiterConfig.h"
#include "rocksdb/rate_limiter.h"
/*
* Class: org_rocksdb_GenericRateLimiterConfig
* Method: newRateLimiterHandle
* Signature: (JJI)J
*/
jlong Java_org_rocksdb_GenericRateLimiterConfig_newRateLimiterHandle(
    JNIEnv* env, jobject jobj, jlong jrate_bytes_per_second,
    jlong jrefill_period_micros, jint jfairness) {
  // Convert the Java-side arguments, then return the address of the limiter
  // produced by NewGenericRateLimiter as an opaque jlong handle.
  const size_t rate_bytes_per_second =
      rocksdb::jlong_to_size_t(jrate_bytes_per_second);
  const size_t refill_period_micros =
      rocksdb::jlong_to_size_t(jrefill_period_micros);
  const int32_t fairness = static_cast<int32_t>(jfairness);
  auto* limiter = rocksdb::NewGenericRateLimiter(
      rate_bytes_per_second, refill_period_micros, fairness);
  return reinterpret_cast<jlong>(limiter);
}

@ -26,21 +26,8 @@
* Signature: (JLjava/lang/String;)V * Signature: (JLjava/lang/String;)V
*/ */
void Java_org_rocksdb_RocksDB_open( void Java_org_rocksdb_RocksDB_open(
JNIEnv* env, jobject jdb, jlong jopt_handle, JNIEnv* env, jobject jdb, jlong jopt_handle, jstring jdb_path) {
jlong jcache_size, jint jnum_shardbits, jstring jdb_path) {
auto opt = reinterpret_cast<rocksdb::Options*>(jopt_handle); auto opt = reinterpret_cast<rocksdb::Options*>(jopt_handle);
if (jcache_size > 0) {
opt->no_block_cache = false;
if (jnum_shardbits >= 1) {
opt->block_cache = rocksdb::NewLRUCache(jcache_size, jnum_shardbits);
} else {
opt->block_cache = rocksdb::NewLRUCache(jcache_size);
}
} else {
opt->no_block_cache = true;
opt->block_cache = nullptr;
}
rocksdb::DB* db = nullptr; rocksdb::DB* db = nullptr;
const char* db_path = env->GetStringUTFChars(jdb_path, 0); const char* db_path = env->GetStringUTFChars(jdb_path, 0);
rocksdb::Status s = rocksdb::DB::Open(*opt, db_path, &db); rocksdb::Status s = rocksdb::DB::Open(*opt, db_path, &db);

@ -7,7 +7,10 @@
#include <jni.h> #include <jni.h>
#include "include/org_rocksdb_PlainTableConfig.h" #include "include/org_rocksdb_PlainTableConfig.h"
#include "include/org_rocksdb_BlockBasedTableConfig.h"
#include "rocksdb/table.h" #include "rocksdb/table.h"
#include "rocksdb/cache.h"
#include "rocksdb/filter_policy.h"
/* /*
* Class: org_rocksdb_PlainTableConfig * Class: org_rocksdb_PlainTableConfig
@ -24,3 +27,34 @@ jlong Java_org_rocksdb_PlainTableConfig_newTableFactoryHandle(
options.index_sparseness = jindex_sparseness; options.index_sparseness = jindex_sparseness;
return reinterpret_cast<jlong>(rocksdb::NewPlainTableFactory(options)); return reinterpret_cast<jlong>(rocksdb::NewPlainTableFactory(options));
} }
/*
* Class: org_rocksdb_BlockBasedTableConfig
* Method: newTableFactoryHandle
* Signature: (ZJIJIIZI)J
*/
jlong Java_org_rocksdb_BlockBasedTableConfig_newTableFactoryHandle(
    JNIEnv* env, jobject jobj, jboolean no_block_cache, jlong block_cache_size,
    jint num_shardbits, jlong block_size, jint block_size_deviation,
    jint block_restart_interval, jboolean whole_key_filtering,
    jint bits_per_key) {
  rocksdb::BlockBasedTableOptions table_options;

  // Plain scalar settings are copied straight from the Java arguments.
  table_options.no_block_cache = no_block_cache;
  table_options.block_size = block_size;
  table_options.block_size_deviation = block_size_deviation;
  table_options.block_restart_interval = block_restart_interval;
  table_options.whole_key_filtering = whole_key_filtering;

  // An LRU cache is created only when caching is enabled and a positive
  // capacity was given; the shard-bit overload is used for positive shard bits.
  const bool wants_cache = !no_block_cache && block_cache_size > 0;
  if (wants_cache) {
    table_options.block_cache =
        (num_shardbits > 0)
            ? rocksdb::NewLRUCache(block_cache_size, num_shardbits)
            : rocksdb::NewLRUCache(block_cache_size);
  }

  // A bloom filter policy is installed only for a positive bits-per-key.
  if (bits_per_key > 0) {
    table_options.filter_policy.reset(
        rocksdb::NewBloomFilterPolicy(bits_per_key));
  }

  auto* factory = rocksdb::NewBlockBasedTableFactory(table_options);
  return reinterpret_cast<jlong>(factory);
}

@ -12,12 +12,14 @@
#include "include/org_rocksdb_WriteBatchTest.h" #include "include/org_rocksdb_WriteBatchTest.h"
#include "rocksjni/portal.h" #include "rocksjni/portal.h"
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/immutable_options.h"
#include "db/memtable.h" #include "db/memtable.h"
#include "rocksdb/write_batch.h" #include "rocksdb/write_batch.h"
#include "db/write_batch_internal.h" #include "db/write_batch_internal.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "rocksdb/memtablerep.h" #include "rocksdb/memtablerep.h"
#include "util/logging.h" #include "util/logging.h"
#include "util/scoped_arena_iterator.h"
#include "util/testharness.h" #include "util/testharness.h"
/* /*
@ -28,7 +30,7 @@
void Java_org_rocksdb_WriteBatch_newWriteBatch( void Java_org_rocksdb_WriteBatch_newWriteBatch(
JNIEnv* env, jobject jobj, jint jreserved_bytes) { JNIEnv* env, jobject jobj, jint jreserved_bytes) {
rocksdb::WriteBatch* wb = new rocksdb::WriteBatch( rocksdb::WriteBatch* wb = new rocksdb::WriteBatch(
static_cast<size_t>(jreserved_bytes)); rocksdb::jlong_to_size_t(jreserved_bytes));
rocksdb::WriteBatchJni::setHandle(env, jobj, wb); rocksdb::WriteBatchJni::setHandle(env, jobj, wb);
} }
@ -202,14 +204,18 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(
auto factory = std::make_shared<rocksdb::SkipListFactory>(); auto factory = std::make_shared<rocksdb::SkipListFactory>();
rocksdb::Options options; rocksdb::Options options;
options.memtable_factory = factory; options.memtable_factory = factory;
rocksdb::MemTable* mem = new rocksdb::MemTable(cmp, options); rocksdb::MemTable* mem = new rocksdb::MemTable(
cmp, rocksdb::ImmutableCFOptions(options),
rocksdb::MemTableOptions(rocksdb::MutableCFOptions(options), options));
mem->Ref(); mem->Ref();
std::string state; std::string state;
rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem, &options); rocksdb::ColumnFamilyMemTablesDefault cf_mems_default(mem, &options);
rocksdb::Status s = rocksdb::Status s =
rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default); rocksdb::WriteBatchInternal::InsertInto(b, &cf_mems_default);
int count = 0; int count = 0;
rocksdb::Iterator* iter = mem->NewIterator(rocksdb::ReadOptions()); rocksdb::Arena arena;
rocksdb::ScopedArenaIterator iter(mem->NewIterator(
rocksdb::ReadOptions(), &arena));
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
rocksdb::ParsedInternalKey ikey; rocksdb::ParsedInternalKey ikey;
memset(reinterpret_cast<void*>(&ikey), 0, sizeof(ikey)); memset(reinterpret_cast<void*>(&ikey), 0, sizeof(ikey));
@ -244,7 +250,6 @@ jbyteArray Java_org_rocksdb_WriteBatchTest_getContents(
state.append("@"); state.append("@");
state.append(rocksdb::NumberToString(ikey.sequence)); state.append(rocksdb::NumberToString(ikey.sequence));
} }
delete iter;
if (!s.ok()) { if (!s.ok()) {
state.append(s.ToString()); state.append(s.ToString());
} else if (count != rocksdb::WriteBatchInternal::Count(b)) { } else if (count != rocksdb::WriteBatchInternal::Count(b)) {

@ -33,7 +33,7 @@ const char* GetExecutableName() {
char link[1024]; char link[1024];
snprintf(link, sizeof(link), "/proc/%d/exe", getpid()); snprintf(link, sizeof(link), "/proc/%d/exe", getpid());
auto read = readlink(link, name, sizeof(name)); auto read = readlink(link, name, sizeof(name) - 1);
if (-1 == read) { if (-1 == read) {
return nullptr; return nullptr;
} else { } else {

@ -39,7 +39,7 @@ extern const uint64_t kLegacyBlockBasedTableMagicNumber;
extern const uint64_t kCuckooTableMagicNumber; extern const uint64_t kCuckooTableMagicNumber;
Status AdaptiveTableFactory::NewTableReader( Status AdaptiveTableFactory::NewTableReader(
const Options& options, const EnvOptions& soptions, const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
const InternalKeyComparator& icomp, unique_ptr<RandomAccessFile>&& file, const InternalKeyComparator& icomp, unique_ptr<RandomAccessFile>&& file,
uint64_t file_size, unique_ptr<TableReader>* table) const { uint64_t file_size, unique_ptr<TableReader>* table) const {
Footer footer; Footer footer;
@ -50,24 +50,59 @@ Status AdaptiveTableFactory::NewTableReader(
if (footer.table_magic_number() == kPlainTableMagicNumber || if (footer.table_magic_number() == kPlainTableMagicNumber ||
footer.table_magic_number() == kLegacyPlainTableMagicNumber) { footer.table_magic_number() == kLegacyPlainTableMagicNumber) {
return plain_table_factory_->NewTableReader( return plain_table_factory_->NewTableReader(
options, soptions, icomp, std::move(file), file_size, table); ioptions, env_options, icomp, std::move(file), file_size, table);
} else if (footer.table_magic_number() == kBlockBasedTableMagicNumber || } else if (footer.table_magic_number() == kBlockBasedTableMagicNumber ||
footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) { footer.table_magic_number() == kLegacyBlockBasedTableMagicNumber) {
return block_based_table_factory_->NewTableReader( return block_based_table_factory_->NewTableReader(
options, soptions, icomp, std::move(file), file_size, table); ioptions, env_options, icomp, std::move(file), file_size, table);
} else if (footer.table_magic_number() == kCuckooTableMagicNumber) { } else if (footer.table_magic_number() == kCuckooTableMagicNumber) {
return cuckoo_table_factory_->NewTableReader( return cuckoo_table_factory_->NewTableReader(
options, soptions, icomp, std::move(file), file_size, table); ioptions, env_options, icomp, std::move(file), file_size, table);
} else { } else {
return Status::NotSupported("Unidentified table format"); return Status::NotSupported("Unidentified table format");
} }
} }
TableBuilder* AdaptiveTableFactory::NewTableBuilder( TableBuilder* AdaptiveTableFactory::NewTableBuilder(
const Options& options, const InternalKeyComparator& internal_comparator, const ImmutableCFOptions& ioptions,
WritableFile* file, CompressionType compression_type) const { const InternalKeyComparator& internal_comparator,
return table_factory_to_write_->NewTableBuilder(options, internal_comparator, WritableFile* file, const CompressionType compression_type,
file, compression_type); const CompressionOptions& compression_opts) const {
return table_factory_to_write_->NewTableBuilder(
ioptions, internal_comparator, file, compression_type, compression_opts);
}
std::string AdaptiveTableFactory::GetPrintableTableOptions() const {
std::string ret;
ret.reserve(20000);
const int kBufferSize = 200;
char buffer[kBufferSize];
if (!table_factory_to_write_) {
snprintf(buffer, kBufferSize, " write factory (%s) options:\n%s\n",
table_factory_to_write_->Name(),
table_factory_to_write_->GetPrintableTableOptions().c_str());
ret.append(buffer);
}
if (!plain_table_factory_) {
snprintf(buffer, kBufferSize, " %s options:\n%s\n",
plain_table_factory_->Name(),
plain_table_factory_->GetPrintableTableOptions().c_str());
ret.append(buffer);
}
if (!block_based_table_factory_) {
snprintf(buffer, kBufferSize, " %s options:\n%s\n",
block_based_table_factory_->Name(),
block_based_table_factory_->GetPrintableTableOptions().c_str());
ret.append(buffer);
}
if (!cuckoo_table_factory_) {
snprintf(buffer, kBufferSize, " %s options:\n%s\n",
cuckoo_table_factory_->Name(),
cuckoo_table_factory_->GetPrintableTableOptions().c_str());
ret.append(buffer);
}
return ret;
} }
extern TableFactory* NewAdaptiveTableFactory( extern TableFactory* NewAdaptiveTableFactory(

@ -6,12 +6,12 @@
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
#include <string>
#include "rocksdb/options.h" #include "rocksdb/options.h"
#include "rocksdb/table.h" #include "rocksdb/table.h"
namespace rocksdb { namespace rocksdb {
struct Options;
struct EnvOptions; struct EnvOptions;
using std::unique_ptr; using std::unique_ptr;
@ -30,16 +30,32 @@ class AdaptiveTableFactory : public TableFactory {
std::shared_ptr<TableFactory> block_based_table_factory, std::shared_ptr<TableFactory> block_based_table_factory,
std::shared_ptr<TableFactory> plain_table_factory, std::shared_ptr<TableFactory> plain_table_factory,
std::shared_ptr<TableFactory> cuckoo_table_factory); std::shared_ptr<TableFactory> cuckoo_table_factory);
const char* Name() const override { return "AdaptiveTableFactory"; } const char* Name() const override { return "AdaptiveTableFactory"; }
Status NewTableReader(const Options& options, const EnvOptions& soptions,
Status NewTableReader(
const ImmutableCFOptions& ioptions, const EnvOptions& env_options,
const InternalKeyComparator& internal_comparator, const InternalKeyComparator& internal_comparator,
unique_ptr<RandomAccessFile>&& file, uint64_t file_size, unique_ptr<RandomAccessFile>&& file, uint64_t file_size,
unique_ptr<TableReader>* table) const override; unique_ptr<TableReader>* table) const override;
TableBuilder* NewTableBuilder(const Options& options,
TableBuilder* NewTableBuilder(
const ImmutableCFOptions& ioptions,
const InternalKeyComparator& icomparator, const InternalKeyComparator& icomparator,
WritableFile* file, WritableFile* file,
CompressionType compression_type) const const CompressionType compression_type,
override; const CompressionOptions& compression_opts) const override;
// Checks db_opts for use with this factory: returns Status::OK() when
// allow_mmap_reads is set and Status::NotSupported otherwise.
Status SanitizeDBOptions(const DBOptions* db_opts) const override {
  if (db_opts->allow_mmap_reads) {
    return Status::OK();
  }
  return Status::NotSupported(
      "AdaptiveTable with allow_mmap_reads == false is not supported.");
}
std::string GetPrintableTableOptions() const override;
private: private:
std::shared_ptr<TableFactory> table_factory_to_write_; std::shared_ptr<TableFactory> table_factory_to_write_;

@ -297,12 +297,10 @@ uint32_t Block::NumRestarts() const {
return DecodeFixed32(data_ + size_ - sizeof(uint32_t)); return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
} }
Block::Block(const BlockContents& contents) Block::Block(BlockContents&& contents)
: data_(contents.data.data()), : contents_(std::move(contents)),
size_(contents.data.size()), data_(contents_.data.data()),
owned_(contents.heap_allocated), size_(contents_.data.size()) {
cachable_(contents.cachable),
compression_type_(contents.compression_type) {
if (size_ < sizeof(uint32_t)) { if (size_ < sizeof(uint32_t)) {
size_ = 0; // Error marker size_ = 0; // Error marker
} else { } else {
@ -315,13 +313,8 @@ Block::Block(const BlockContents& contents)
} }
} }
Block::~Block() { Iterator* Block::NewIterator(
if (owned_) { const Comparator* cmp, BlockIter* iter, bool total_order_seek) {
delete[] data_;
}
}
Iterator* Block::NewIterator(const Comparator* cmp, BlockIter* iter) {
if (size_ < 2*sizeof(uint32_t)) { if (size_ < 2*sizeof(uint32_t)) {
if (iter != nullptr) { if (iter != nullptr) {
iter->SetStatus(Status::Corruption("bad block contents")); iter->SetStatus(Status::Corruption("bad block contents"));
@ -339,12 +332,17 @@ Iterator* Block::NewIterator(const Comparator* cmp, BlockIter* iter) {
return NewEmptyIterator(); return NewEmptyIterator();
} }
} else { } else {
BlockHashIndex* hash_index_ptr =
total_order_seek ? nullptr : hash_index_.get();
BlockPrefixIndex* prefix_index_ptr =
total_order_seek ? nullptr : prefix_index_.get();
if (iter != nullptr) { if (iter != nullptr) {
iter->Initialize(cmp, data_, restart_offset_, num_restarts, iter->Initialize(cmp, data_, restart_offset_, num_restarts,
hash_index_.get(), prefix_index_.get()); hash_index_ptr, prefix_index_ptr);
} else { } else {
iter = new BlockIter(cmp, data_, restart_offset_, num_restarts, iter = new BlockIter(cmp, data_, restart_offset_, num_restarts,
hash_index_.get(), prefix_index_.get()); hash_index_ptr, prefix_index_ptr);
} }
} }

@ -14,6 +14,10 @@
#include "rocksdb/iterator.h" #include "rocksdb/iterator.h"
#include "rocksdb/options.h" #include "rocksdb/options.h"
#include "db/dbformat.h" #include "db/dbformat.h"
#include "table/block_prefix_index.h"
#include "table/block_hash_index.h"
#include "format.h"
namespace rocksdb { namespace rocksdb {
@ -26,15 +30,17 @@ class BlockPrefixIndex;
class Block { class Block {
public: public:
// Initialize the block with the specified contents. // Initialize the block with the specified contents.
explicit Block(const BlockContents& contents); explicit Block(BlockContents&& contents);
~Block(); ~Block() = default;
size_t size() const { return size_; } size_t size() const { return size_; }
const char* data() const { return data_; } const char* data() const { return data_; }
bool cachable() const { return cachable_; } bool cachable() const { return contents_.cachable; }
uint32_t NumRestarts() const; uint32_t NumRestarts() const;
CompressionType compression_type() const { return compression_type_; } CompressionType compression_type() const {
return contents_.compression_type;
}
// If hash index lookup is enabled and `use_hash_index` is true. This block // If hash index lookup is enabled and `use_hash_index` is true. This block
// will do hash lookup for the key prefix. // will do hash lookup for the key prefix.
@ -45,8 +51,12 @@ class Block {
// //
// If iter is null, return new Iterator // If iter is null, return new Iterator
// If iter is not null, update this one and return it as Iterator* // If iter is not null, update this one and return it as Iterator*
//
// If total_order_seek is true, hash_index_ and prefix_index_ are ignored.
// This option only applies for index block. For data block, hash_index_
// and prefix_index_ are null, so this option does not matter.
Iterator* NewIterator(const Comparator* comparator, Iterator* NewIterator(const Comparator* comparator,
BlockIter* iter = nullptr); BlockIter* iter = nullptr, bool total_order_seek = true);
void SetBlockHashIndex(BlockHashIndex* hash_index); void SetBlockHashIndex(BlockHashIndex* hash_index);
void SetBlockPrefixIndex(BlockPrefixIndex* prefix_index); void SetBlockPrefixIndex(BlockPrefixIndex* prefix_index);
@ -54,12 +64,10 @@ class Block {
size_t ApproximateMemoryUsage() const; size_t ApproximateMemoryUsage() const;
private: private:
const char* data_; BlockContents contents_;
size_t size_; const char* data_; // contents_.data.data()
size_t size_; // contents_.data.size()
uint32_t restart_offset_; // Offset in data_ of restart array uint32_t restart_offset_; // Offset in data_ of restart array
bool owned_; // Block owns data_[]
bool cachable_;
CompressionType compression_type_;
std::unique_ptr<BlockHashIndex> hash_index_; std::unique_ptr<BlockHashIndex> hash_index_;
std::unique_ptr<BlockPrefixIndex> prefix_index_; std::unique_ptr<BlockPrefixIndex> prefix_index_;

@ -7,7 +7,7 @@
// Use of this source code is governed by a BSD-style license that can be // Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/filter_block.h" #include "table/block_based_filter_block.h"
#include "db/dbformat.h" #include "db/dbformat.h"
#include "rocksdb/filter_policy.h" #include "rocksdb/filter_policy.h"
@ -15,20 +15,39 @@
namespace rocksdb { namespace rocksdb {
namespace {
bool SamePrefix(const SliceTransform* prefix_extractor,
const Slice& key1, const Slice& key2) {
if (!prefix_extractor->InDomain(key1) &&
!prefix_extractor->InDomain(key2)) {
return true;
} else if (!prefix_extractor->InDomain(key1) ||
!prefix_extractor->InDomain(key2)) {
return false;
} else {
return (prefix_extractor->Transform(key1) ==
prefix_extractor->Transform(key2));
}
}
} // namespace
// See doc/table_format.txt for an explanation of the filter block format. // See doc/table_format.txt for an explanation of the filter block format.
// Generate new filter every 2KB of data // Generate new filter every 2KB of data
static const size_t kFilterBaseLg = 11; static const size_t kFilterBaseLg = 11;
static const size_t kFilterBase = 1 << kFilterBaseLg; static const size_t kFilterBase = 1 << kFilterBaseLg;
FilterBlockBuilder::FilterBlockBuilder(const Options& opt, BlockBasedFilterBlockBuilder::BlockBasedFilterBlockBuilder(
const Comparator* internal_comparator) const SliceTransform* prefix_extractor,
: policy_(opt.filter_policy), const BlockBasedTableOptions& table_opt)
prefix_extractor_(opt.prefix_extractor.get()), : policy_(table_opt.filter_policy.get()),
whole_key_filtering_(opt.whole_key_filtering), prefix_extractor_(prefix_extractor),
comparator_(internal_comparator) {} whole_key_filtering_(table_opt.whole_key_filtering) {
assert(policy_);
}
void FilterBlockBuilder::StartBlock(uint64_t block_offset) { void BlockBasedFilterBlockBuilder::StartBlock(uint64_t block_offset) {
uint64_t filter_index = (block_offset / kFilterBase); uint64_t filter_index = (block_offset / kFilterBase);
assert(filter_index >= filter_offsets_.size()); assert(filter_index >= filter_offsets_.size());
while (filter_index > filter_offsets_.size()) { while (filter_index > filter_offsets_.size()) {
@ -36,59 +55,45 @@ void FilterBlockBuilder::StartBlock(uint64_t block_offset) {
} }
} }
bool FilterBlockBuilder::SamePrefix(const Slice &key1, void BlockBasedFilterBlockBuilder::Add(const Slice& key) {
const Slice &key2) const { added_to_start_ = 0;
if (!prefix_extractor_->InDomain(key1) && if (whole_key_filtering_) {
!prefix_extractor_->InDomain(key2)) { AddKey(key);
return true; added_to_start_ = 1;
} else if (!prefix_extractor_->InDomain(key1) || }
!prefix_extractor_->InDomain(key2)) { if (prefix_extractor_ && prefix_extractor_->InDomain(key)) {
return false; AddPrefix(key);
} else {
return (prefix_extractor_->Transform(key1) ==
prefix_extractor_->Transform(key2));
} }
} }
void FilterBlockBuilder::AddKey(const Slice& key) { // Add key to filter if needed
// get slice for most recently added entry inline void BlockBasedFilterBlockBuilder::AddKey(const Slice& key) {
Slice prev;
size_t added_to_start = 0;
// add key to filter if needed
if (whole_key_filtering_) {
start_.push_back(entries_.size()); start_.push_back(entries_.size());
++added_to_start;
entries_.append(key.data(), key.size()); entries_.append(key.data(), key.size());
} }
if (start_.size() > added_to_start) { // Add prefix to filter if needed
size_t prev_start = start_[start_.size() - 1 - added_to_start]; inline void BlockBasedFilterBlockBuilder::AddPrefix(const Slice& key) {
// get slice for most recently added entry
Slice prev;
if (start_.size() > added_to_start_) {
size_t prev_start = start_[start_.size() - 1 - added_to_start_];
const char* base = entries_.data() + prev_start; const char* base = entries_.data() + prev_start;
size_t length = entries_.size() - prev_start; size_t length = entries_.size() - prev_start;
prev = Slice(base, length); prev = Slice(base, length);
} }
// add prefix to filter if needed
if (prefix_extractor_ && prefix_extractor_->InDomain(ExtractUserKey(key))) {
// If prefix_extractor_, this filter_block layer assumes we only
// operate on internal keys.
Slice user_key = ExtractUserKey(key);
// this assumes prefix(prefix(key)) == prefix(key), as the last // this assumes prefix(prefix(key)) == prefix(key), as the last
// entry in entries_ may be either a key or prefix, and we use // entry in entries_ may be either a key or prefix, and we use
// prefix(last entry) to get the prefix of the last key. // prefix(last entry) to get the prefix of the last key.
if (prev.size() == 0 || if (prev.size() == 0 || !SamePrefix(prefix_extractor_, key, prev)) {
!SamePrefix(user_key, ExtractUserKey(prev))) { Slice prefix = prefix_extractor_->Transform(key);
Slice prefix = prefix_extractor_->Transform(user_key);
InternalKey internal_prefix_tmp(prefix, 0, kTypeValue);
Slice internal_prefix = internal_prefix_tmp.Encode();
start_.push_back(entries_.size()); start_.push_back(entries_.size());
entries_.append(internal_prefix.data(), internal_prefix.size()); entries_.append(prefix.data(), prefix.size());
}
} }
} }
Slice FilterBlockBuilder::Finish() { Slice BlockBasedFilterBlockBuilder::Finish() {
if (!start_.empty()) { if (!start_.empty()) {
GenerateFilter(); GenerateFilter();
} }
@ -104,7 +109,7 @@ Slice FilterBlockBuilder::Finish() {
return Slice(result_); return Slice(result_);
} }
void FilterBlockBuilder::GenerateFilter() { void BlockBasedFilterBlockBuilder::GenerateFilter() {
const size_t num_entries = start_.size(); const size_t num_entries = start_.size();
if (num_entries == 0) { if (num_entries == 0) {
// Fast path if there are no keys for this filter // Fast path if there are no keys for this filter
@ -117,7 +122,7 @@ void FilterBlockBuilder::GenerateFilter() {
tmp_entries_.resize(num_entries); tmp_entries_.resize(num_entries);
for (size_t i = 0; i < num_entries; i++) { for (size_t i = 0; i < num_entries; i++) {
const char* base = entries_.data() + start_[i]; const char* base = entries_.data() + start_[i];
size_t length = start_[i+1] - start_[i]; size_t length = start_[i + 1] - start_[i];
tmp_entries_[i] = Slice(base, length); tmp_entries_[i] = Slice(base, length);
} }
@ -130,49 +135,52 @@ void FilterBlockBuilder::GenerateFilter() {
start_.clear(); start_.clear();
} }
FilterBlockReader::FilterBlockReader( BlockBasedFilterBlockReader::BlockBasedFilterBlockReader(
const Options& opt, const Slice& contents, bool delete_contents_after_use) const SliceTransform* prefix_extractor,
: policy_(opt.filter_policy), const BlockBasedTableOptions& table_opt, BlockContents&& contents)
prefix_extractor_(opt.prefix_extractor.get()), : policy_(table_opt.filter_policy.get()),
whole_key_filtering_(opt.whole_key_filtering), prefix_extractor_(prefix_extractor),
whole_key_filtering_(table_opt.whole_key_filtering),
data_(nullptr), data_(nullptr),
offset_(nullptr), offset_(nullptr),
num_(0), num_(0),
base_lg_(0) { base_lg_(0),
size_t n = contents.size(); contents_(std::move(contents)) {
assert(policy_);
size_t n = contents_.data.size();
if (n < 5) return; // 1 byte for base_lg_ and 4 for start of offset array if (n < 5) return; // 1 byte for base_lg_ and 4 for start of offset array
base_lg_ = contents[n-1]; base_lg_ = contents_.data[n - 1];
uint32_t last_word = DecodeFixed32(contents.data() + n - 5); uint32_t last_word = DecodeFixed32(contents_.data.data() + n - 5);
if (last_word > n - 5) return; if (last_word > n - 5) return;
data_ = contents.data(); data_ = contents_.data.data();
offset_ = data_ + last_word; offset_ = data_ + last_word;
num_ = (n - 5 - last_word) / 4; num_ = (n - 5 - last_word) / 4;
if (delete_contents_after_use) {
filter_data.reset(contents.data());
}
} }
bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, bool BlockBasedFilterBlockReader::KeyMayMatch(const Slice& key,
const Slice& key) { uint64_t block_offset) {
assert(block_offset != kNotValid);
if (!whole_key_filtering_) { if (!whole_key_filtering_) {
return true; return true;
} }
return MayMatch(block_offset, key); return MayMatch(key, block_offset);
} }
bool FilterBlockReader::PrefixMayMatch(uint64_t block_offset, bool BlockBasedFilterBlockReader::PrefixMayMatch(const Slice& prefix,
const Slice& prefix) { uint64_t block_offset) {
assert(block_offset != kNotValid);
if (!prefix_extractor_) { if (!prefix_extractor_) {
return true; return true;
} }
return MayMatch(block_offset, prefix); return MayMatch(prefix, block_offset);
} }
bool FilterBlockReader::MayMatch(uint64_t block_offset, const Slice& entry) { bool BlockBasedFilterBlockReader::MayMatch(const Slice& entry,
uint64_t block_offset) {
uint64_t index = block_offset >> base_lg_; uint64_t index = block_offset >> base_lg_;
if (index < num_) { if (index < num_) {
uint32_t start = DecodeFixed32(offset_ + index*4); uint32_t start = DecodeFixed32(offset_ + index * 4);
uint32_t limit = DecodeFixed32(offset_ + index*4 + 4); uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4);
if (start <= limit && limit <= (uint32_t)(offset_ - data_)) { if (start <= limit && limit <= (uint32_t)(offset_ - data_)) {
Slice filter = Slice(data_ + start, limit - start); Slice filter = Slice(data_ + start, limit - start);
return policy_->KeyMayMatch(entry, filter); return policy_->KeyMayMatch(entry, filter);
@ -184,7 +192,7 @@ bool FilterBlockReader::MayMatch(uint64_t block_offset, const Slice& entry) {
return true; // Errors are treated as potential matches return true; // Errors are treated as potential matches
} }
size_t FilterBlockReader::ApproximateMemoryUsage() const { size_t BlockBasedFilterBlockReader::ApproximateMemoryUsage() const {
return num_ * 4 + 5 + (offset_ - data_); return num_ * 4 + 5 + (offset_ - data_);
} }
} }

@ -0,0 +1,101 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// A filter block is stored near the end of a Table file. It contains
// filters (e.g., bloom filters) for all data blocks in the table combined
// into a single filter block.
#pragma once
#include <stddef.h>
#include <stdint.h>
#include <string>
#include <memory>
#include <vector>
#include "rocksdb/options.h"
#include "rocksdb/slice.h"
#include "rocksdb/slice_transform.h"
#include "table/filter_block.h"
#include "util/hash.h"
namespace rocksdb {
// A BlockBasedFilterBlockBuilder is used to construct all of the filters for a
// particular Table. It generates a single string which is stored as
// a special block in the Table.
//
// The sequence of calls to BlockBasedFilterBlockBuilder must match the regexp:
// (StartBlock Add*)* Finish
class BlockBasedFilterBlockBuilder : public FilterBlockBuilder {
public:
// Keeps prefix_extractor and the filter policy held by table_opt as raw
// pointers; the builder does not take ownership of either.
// prefix_extractor may be nullptr, in which case Add() records no prefixes.
// NOTE(review): the constructor in the .cc file asserts that
// table_opt.filter_policy is set -- confirm every caller guarantees this.
BlockBasedFilterBlockBuilder(const SliceTransform* prefix_extractor,
const BlockBasedTableOptions& table_opt);
// Always true for this builder.
virtual bool IsBlockBased() override { return true; }
// Announces the data block that starts at block_offset. The implementation
// maps the offset to a filter index by dividing by the 2KB filter base.
virtual void StartBlock(uint64_t block_offset) override;
// Records key for the current filter: the whole key when whole-key filtering
// is enabled, and its prefix when a prefix extractor is set and the key is in
// the extractor's domain.
virtual void Add(const Slice& key) override;
// Returns a Slice over result_, the builder's internal buffer.
// NOTE(review): presumably the Slice is only usable while this builder is
// alive and unmodified -- confirm with callers.
virtual Slice Finish() override;
private:
// Appends key to entries_ and records its starting offset in start_.
void AddKey(const Slice& key);
// Appends the prefix of key to entries_, unless the previously recorded entry
// already has the same prefix.
void AddPrefix(const Slice& key);
// Consumes the entries recorded in start_/entries_ and clears start_.
// NOTE(review): only part of the definition is visible in this change --
// confirm the remaining steps in the .cc file.
void GenerateFilter();
// important: all of these might point to invalid addresses
// at the time of destruction of this filter block. destructor
// should NOT dereference them.
const FilterPolicy* policy_;
const SliceTransform* prefix_extractor_;
bool whole_key_filtering_;
std::string entries_; // Flattened entry contents
std::vector<size_t> start_; // Starting index in entries_ of each entry
// Reset to 0 at the start of every Add() and set to 1 once the whole key has
// been recorded; AddPrefix() uses it to skip that just-added key when looking
// up the previous entry. Not initialized by the constructor.
uint32_t added_to_start_; // To indicate if key is added
std::string result_; // Filter data computed so far
std::vector<Slice> tmp_entries_; // policy_->CreateFilter() argument
std::vector<uint32_t> filter_offsets_;
// No copying allowed: declared private and intentionally left undefined.
BlockBasedFilterBlockBuilder(const BlockBasedFilterBlockBuilder&);
void operator=(const BlockBasedFilterBlockBuilder&);
};
// A BlockBasedFilterBlockReader parses a filter block read from an SST table.
// KeyMayMatch and PrefixMayMatch trigger the actual filter check.
class BlockBasedFilterBlockReader : public FilterBlockReader {
public:
// REQUIRES: "contents" and *policy must stay live while *this is live.
// "contents" is moved into the contents_ member.
// NOTE(review): if the BlockContents does not own its bytes, the underlying
// buffer presumably still has to outlive this reader -- confirm.
BlockBasedFilterBlockReader(const SliceTransform* prefix_extractor,
const BlockBasedTableOptions& table_opt,
BlockContents&& contents);
// Always true for this reader.
virtual bool IsBlockBased() override { return true; }
// Returns true without consulting the filter when whole-key filtering is
// disabled; otherwise checks the filter covering block_offset.
// Although block_offset defaults to kNotValid, the implementation asserts
// block_offset != kNotValid, so callers must pass a real offset.
virtual bool KeyMayMatch(const Slice& key,
uint64_t block_offset = kNotValid) override;
// Returns true without consulting the filter when no prefix extractor is
// set; otherwise checks the filter covering block_offset. The same
// block_offset != kNotValid assertion applies.
virtual bool PrefixMayMatch(const Slice& prefix,
uint64_t block_offset = kNotValid) override;
// Computed as num_ * 4 + 5 + (offset_ - data_).
virtual size_t ApproximateMemoryUsage() const override;
private:
const FilterPolicy* policy_;
const SliceTransform* prefix_extractor_;
bool whole_key_filtering_;
const char* data_; // Pointer to filter data (at block-start)
const char* offset_; // Pointer to beginning of offset array (at block-end)
size_t num_; // Number of entries in offset array
size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file)
// The block passed to the constructor; data_ and offset_ point into it.
BlockContents contents_;
// Looks up the filter at index (block_offset >> base_lg_) and asks policy_
// whether entry may match. Errors are treated as potential matches.
bool MayMatch(const Slice& entry, uint64_t block_offset);
// No copying allowed: declared private and intentionally left undefined.
BlockBasedFilterBlockReader(const BlockBasedFilterBlockReader&);
void operator=(const BlockBasedFilterBlockReader&);
};
} // namespace rocksdb

@ -0,0 +1,242 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2012 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "table/block_based_filter_block.h"
#include <memory>
#include "rocksdb/filter_policy.h"
#include "util/coding.h"
#include "util/hash.h"
#include "util/logging.h"
#include "util/testharness.h"
#include "util/testutil.h"
namespace rocksdb {
// For testing: emit an array with one hash value per key
class TestHashFilter : public FilterPolicy {
public:
virtual const char* Name() const {
return "TestHashFilter";
}
virtual void CreateFilter(const Slice* keys, int n, std::string* dst) const {
for (int i = 0; i < n; i++) {
uint32_t h = Hash(keys[i].data(), keys[i].size(), 1);
PutFixed32(dst, h);
}
}
virtual bool KeyMayMatch(const Slice& key, const Slice& filter) const {
uint32_t h = Hash(key.data(), key.size(), 1);
for (unsigned int i = 0; i + 4 <= filter.size(); i += 4) {
if (h == DecodeFixed32(filter.data() + i)) {
return true;
}
}
return false;
}
};
// Fixture for the FilterBlockTest cases: provides BlockBasedTableOptions whose
// filter_policy is a TestHashFilter.
class FilterBlockTest {
public:
// NOTE(review): not referenced by any test in this file; the policy the tests
// actually use is the separate instance installed in table_options_ below.
TestHashFilter policy_;
BlockBasedTableOptions table_options_;
FilterBlockTest() {
// Hands a newly allocated TestHashFilter to table_options_.filter_policy.
table_options_.filter_policy.reset(new TestHashFilter());
}
};
// A builder that never saw a block or a key must still produce the minimal
// encoding (a zero fixed32 followed by the base-lg byte 0x0b), and a reader
// over that encoding must answer every lookup with "may match".
TEST(FilterBlockTest, EmptyBuilder) {
  BlockBasedFilterBlockBuilder filter_builder(nullptr, table_options_);
  BlockContents filter_contents(filter_builder.Finish(), false,
                                kNoCompression);
  ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(filter_contents.data));

  BlockBasedFilterBlockReader filter_reader(nullptr, table_options_,
                                            std::move(filter_contents));
  ASSERT_TRUE(filter_reader.KeyMayMatch("foo", 0));
  ASSERT_TRUE(filter_reader.KeyMayMatch("foo", 100000));
}
// Block offsets 100, 200 and 300 all fall inside the first 2KB filter range,
// so every added key is expected to match at offset 100 while keys that were
// never added are rejected.
TEST(FilterBlockTest, SingleChunk) {
  BlockBasedFilterBlockBuilder filter_builder(nullptr, table_options_);
  filter_builder.StartBlock(100);
  filter_builder.Add("foo");
  filter_builder.Add("bar");
  filter_builder.Add("box");
  filter_builder.StartBlock(200);
  filter_builder.Add("box");
  filter_builder.StartBlock(300);
  filter_builder.Add("hello");

  BlockContents filter_contents(filter_builder.Finish(), false,
                                kNoCompression);
  BlockBasedFilterBlockReader filter_reader(nullptr, table_options_,
                                            std::move(filter_contents));

  // Keys that were added.
  ASSERT_TRUE(filter_reader.KeyMayMatch("foo", 100));
  ASSERT_TRUE(filter_reader.KeyMayMatch("bar", 100));
  ASSERT_TRUE(filter_reader.KeyMayMatch("box", 100));
  ASSERT_TRUE(filter_reader.KeyMayMatch("hello", 100));
  ASSERT_TRUE(filter_reader.KeyMayMatch("foo", 100));

  // Keys that were never added.
  ASSERT_TRUE(!filter_reader.KeyMayMatch("missing", 100));
  ASSERT_TRUE(!filter_reader.KeyMayMatch("other", 100));
}
// Spreads keys over several 2KB filter ranges and verifies that each range
// only reports the keys added to blocks starting inside it, including a
// range that received no keys at all.
TEST(FilterBlockTest, MultiChunk) {
  BlockBasedFilterBlockBuilder filter_builder(nullptr, table_options_);

  // Range 0: offsets 0 and 2000 are both below 2048.
  filter_builder.StartBlock(0);
  filter_builder.Add("foo");
  filter_builder.StartBlock(2000);
  filter_builder.Add("bar");

  // Range 1.
  filter_builder.StartBlock(3100);
  filter_builder.Add("box");

  // Range 2 gets no keys.

  // Final range.
  filter_builder.StartBlock(9000);
  filter_builder.Add("box");
  filter_builder.Add("hello");

  BlockContents filter_contents(filter_builder.Finish(), false,
                                kNoCompression);
  BlockBasedFilterBlockReader filter_reader(nullptr, table_options_,
                                            std::move(filter_contents));

  // Range 0 holds "foo" and "bar" only.
  ASSERT_TRUE(filter_reader.KeyMayMatch("foo", 0));
  ASSERT_TRUE(filter_reader.KeyMayMatch("bar", 2000));
  ASSERT_TRUE(!filter_reader.KeyMayMatch("box", 0));
  ASSERT_TRUE(!filter_reader.KeyMayMatch("hello", 0));

  // Range 1 holds "box" only.
  ASSERT_TRUE(filter_reader.KeyMayMatch("box", 3100));
  ASSERT_TRUE(!filter_reader.KeyMayMatch("foo", 3100));
  ASSERT_TRUE(!filter_reader.KeyMayMatch("bar", 3100));
  ASSERT_TRUE(!filter_reader.KeyMayMatch("hello", 3100));

  // Range 2 is empty, so nothing matches.
  ASSERT_TRUE(!filter_reader.KeyMayMatch("foo", 4100));
  ASSERT_TRUE(!filter_reader.KeyMayMatch("bar", 4100));
  ASSERT_TRUE(!filter_reader.KeyMayMatch("box", 4100));
  ASSERT_TRUE(!filter_reader.KeyMayMatch("hello", 4100));

  // Final range holds "box" and "hello" only.
  ASSERT_TRUE(filter_reader.KeyMayMatch("box", 9000));
  ASSERT_TRUE(filter_reader.KeyMayMatch("hello", 9000));
  ASSERT_TRUE(!filter_reader.KeyMayMatch("foo", 9000));
  ASSERT_TRUE(!filter_reader.KeyMayMatch("bar", 9000));
}
// Fixture for the block-based filter block tests.
// It installs the policy returned by NewBloomFilterPolicy(10); the tests that
// use it drive the builder and reader through FilterBlockBuilder* and
// FilterBlockReader* base-class pointers.
// NOTE(review): the original comment says these tests "use new interface in
// FilterPolicy to create filter builder/reader", yet the tests construct the
// BlockBased* classes directly -- confirm the intended meaning.
class BlockBasedFilterBlockTest {
public:
BlockBasedTableOptions table_options_;
BlockBasedFilterBlockTest() {
// NOTE(review): 10 is presumably the bits-per-key setting -- confirm
// against the NewBloomFilterPolicy declaration.
table_options_.filter_policy.reset(NewBloomFilterPolicy(10));
}
// Empty destructor; performs no work of its own.
~BlockBasedFilterBlockTest() {}
};
// Same scenario as an empty builder, but driven through the
// FilterBlockBuilder / FilterBlockReader base-class interfaces: the encoding
// must be the minimal one and every lookup must report "may match".
TEST(BlockBasedFilterBlockTest, BlockBasedEmptyBuilder) {
  // Owned through std::unique_ptr so both objects are released on every exit
  // path. The reader is declared after the builder and is therefore destroyed
  // first, while the buffer returned by builder->Finish() still exists.
  std::unique_ptr<FilterBlockBuilder> builder(
      new BlockBasedFilterBlockBuilder(nullptr, table_options_));
  BlockContents block(builder->Finish(), false, kNoCompression);
  ASSERT_EQ("\\x00\\x00\\x00\\x00\\x0b", EscapeString(block.data));

  std::unique_ptr<FilterBlockReader> reader(new BlockBasedFilterBlockReader(
      nullptr, table_options_, std::move(block)));
  ASSERT_TRUE(reader->KeyMayMatch("foo", 0));
  ASSERT_TRUE(reader->KeyMayMatch("foo", 100000));
}
// All block offsets (100, 200, 300) fall inside the first 2KB filter range,
// so every added key must match at offset 100 and unknown keys must not.
// Driven through the FilterBlockBuilder / FilterBlockReader interfaces.
TEST(BlockBasedFilterBlockTest, BlockBasedSingleChunk) {
  // Owned through std::unique_ptr so both objects are released on every exit
  // path. The reader is declared after the builder and is therefore destroyed
  // first, while the buffer returned by builder->Finish() still exists.
  std::unique_ptr<FilterBlockBuilder> builder(
      new BlockBasedFilterBlockBuilder(nullptr, table_options_));
  builder->StartBlock(100);
  builder->Add("foo");
  builder->Add("bar");
  builder->Add("box");
  builder->StartBlock(200);
  builder->Add("box");
  builder->StartBlock(300);
  builder->Add("hello");
  BlockContents block(builder->Finish(), false, kNoCompression);

  std::unique_ptr<FilterBlockReader> reader(new BlockBasedFilterBlockReader(
      nullptr, table_options_, std::move(block)));
  ASSERT_TRUE(reader->KeyMayMatch("foo", 100));
  ASSERT_TRUE(reader->KeyMayMatch("bar", 100));
  ASSERT_TRUE(reader->KeyMayMatch("box", 100));
  ASSERT_TRUE(reader->KeyMayMatch("hello", 100));
  ASSERT_TRUE(reader->KeyMayMatch("foo", 100));
  ASSERT_TRUE(!reader->KeyMayMatch("missing", 100));
  ASSERT_TRUE(!reader->KeyMayMatch("other", 100));
}
// Spreads keys over several 2KB filter ranges, including one that receives
// no keys, and checks that each range only reports its own keys. Driven
// through the FilterBlockBuilder / FilterBlockReader interfaces.
TEST(BlockBasedFilterBlockTest, BlockBasedMultiChunk) {
  // Owned through std::unique_ptr so both objects are released on every exit
  // path. The reader is declared after the builder and is therefore destroyed
  // first, while the buffer returned by builder->Finish() still exists.
  std::unique_ptr<FilterBlockBuilder> builder(
      new BlockBasedFilterBlockBuilder(nullptr, table_options_));

  // First filter
  builder->StartBlock(0);
  builder->Add("foo");
  builder->StartBlock(2000);
  builder->Add("bar");

  // Second filter
  builder->StartBlock(3100);
  builder->Add("box");

  // Third filter is empty

  // Last filter
  builder->StartBlock(9000);
  builder->Add("box");
  builder->Add("hello");

  BlockContents block(builder->Finish(), false, kNoCompression);
  std::unique_ptr<FilterBlockReader> reader(new BlockBasedFilterBlockReader(
      nullptr, table_options_, std::move(block)));

  // Check first filter
  ASSERT_TRUE(reader->KeyMayMatch("foo", 0));
  ASSERT_TRUE(reader->KeyMayMatch("bar", 2000));
  ASSERT_TRUE(!reader->KeyMayMatch("box", 0));
  ASSERT_TRUE(!reader->KeyMayMatch("hello", 0));

  // Check second filter
  ASSERT_TRUE(reader->KeyMayMatch("box", 3100));
  ASSERT_TRUE(!reader->KeyMayMatch("foo", 3100));
  ASSERT_TRUE(!reader->KeyMayMatch("bar", 3100));
  ASSERT_TRUE(!reader->KeyMayMatch("hello", 3100));

  // Check third filter (empty)
  ASSERT_TRUE(!reader->KeyMayMatch("foo", 4100));
  ASSERT_TRUE(!reader->KeyMayMatch("bar", 4100));
  ASSERT_TRUE(!reader->KeyMayMatch("box", 4100));
  ASSERT_TRUE(!reader->KeyMayMatch("hello", 4100));

  // Check last filter
  ASSERT_TRUE(reader->KeyMayMatch("box", 9000));
  ASSERT_TRUE(reader->KeyMayMatch("hello", 9000));
  ASSERT_TRUE(!reader->KeyMayMatch("foo", 9000));
  ASSERT_TRUE(!reader->KeyMayMatch("bar", 9000));
}
} // namespace rocksdb
// Entry point of the test binary: hands control to
// rocksdb::test::RunAllTests() and returns its result as the exit status.
// The command-line arguments are not consulted.
int main(int argc, char** argv) {
  const int exit_status = rocksdb::test::RunAllTests();
  return exit_status;
}

Some files were not shown because too many files have changed in this diff Show More

Loading…
Cancel
Save