From 08864df2125d777527b642052a70edc824f0a5c0 Mon Sep 17 00:00:00 2001 From: Islam AbdelRahman Date: Mon, 27 Feb 2017 17:36:06 -0800 Subject: [PATCH] Move advanced column family options to advanced_options.h Summary: For the sake of making our options simpler, we should keep options.h as simple as possible and move more advanced/less common options to advanced_options.h. I started with ColumnFamilyOptions and also did some re-ordering. I have moved all ColumnFamilyOptions to advanced_options.h and only left these options in options.h ``` const Comparator* comparator = BytewiseComparator(); std::shared_ptr<MergeOperator> merge_operator = nullptr; const CompactionFilter* compaction_filter = nullptr; std::shared_ptr<CompactionFilterFactory> compaction_filter_factory = nullptr; size_t write_buffer_size = 64 << 20; CompressionType compression; int level0_file_num_compaction_trigger = 4; bool disable_auto_compactions = false; ``` Please feel free to comment on specific options if you think they should be advanced or should not be. Closes https://github.com/facebook/rocksdb/pull/1847 Differential Revision: D4519996 Pulled By: IslamAbdelRahman fbshipit-source-id: abebd9a --- include/rocksdb/advanced_options.h | 558 ++++++++++++++++++++++++++ include/rocksdb/options.h | 609 ++--------------------------- util/options.cc | 69 ++-- util/options_helper.h | 116 +++--- util/options_settable_test.cc | 41 +- 5 files changed, 720 insertions(+), 673 deletions(-) create mode 100644 include/rocksdb/advanced_options.h diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h new file mode 100644 index 000000000..4b8c5c3e1 --- /dev/null +++ b/include/rocksdb/advanced_options.h @@ -0,0 +1,558 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory.
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once + +#include + +#include "rocksdb/memtablerep.h" +#include "rocksdb/universal_compaction.h" + +namespace rocksdb { + +class Slice; +class SliceTransform; +enum CompressionType : unsigned char; +class TablePropertiesCollectorFactory; +class TableFactory; +struct Options; + +enum CompactionStyle : char { + // level based compaction style + kCompactionStyleLevel = 0x0, + // Universal compaction style + // Not supported in ROCKSDB_LITE. + kCompactionStyleUniversal = 0x1, + // FIFO compaction style + // Not supported in ROCKSDB_LITE + kCompactionStyleFIFO = 0x2, + // Disable background compaction. Compaction jobs are submitted + // via CompactFiles(). + // Not supported in ROCKSDB_LITE + kCompactionStyleNone = 0x3, +}; + +// In Level-based comapction, it Determines which file from a level to be +// picked to merge to the next level. We suggest people try +// kMinOverlappingRatio first when you tune your database. +enum CompactionPri : char { + // Slightly Priotize larger files by size compensated by #deletes + kByCompensatedSize = 0x0, + // First compact files whose data's latest update time is oldest. + // Try this if you only update some hot keys in small ranges. + kOldestLargestSeqFirst = 0x1, + // First compact files whose range hasn't been compacted to the next level + // for the longest. If your updates are random across the key space, + // write amplification is slightly better with this option. + kOldestSmallestSeqFirst = 0x2, + // First compact files whose ratio between overlapping size in next level + // and its size is the smallest. It in many cases can optimize write + // amplification. 
+ kMinOverlappingRatio = 0x3, +}; + +struct CompactionOptionsFIFO { + // once the total sum of table files reaches this, we will delete the oldest + // table file + // Default: 1GB + uint64_t max_table_files_size; + + CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {} +}; + +// Compression options for different compression algorithms like Zlib +struct CompressionOptions { + int window_bits; + int level; + int strategy; + // Maximum size of dictionary used to prime the compression library. Currently + // this dictionary will be constructed by sampling the first output file in a + // subcompaction when the target level is bottommost. This dictionary will be + // loaded into the compression library before compressing/uncompressing each + // data block of subsequent files in the subcompaction. Effectively, this + // improves compression ratios when there are repetitions across data blocks. + // A value of 0 indicates the feature is disabled. + // Default: 0. + uint32_t max_dict_bytes; + + CompressionOptions() + : window_bits(-14), level(-1), strategy(0), max_dict_bytes(0) {} + CompressionOptions(int wbits, int _lev, int _strategy, int _max_dict_bytes) + : window_bits(wbits), + level(_lev), + strategy(_strategy), + max_dict_bytes(_max_dict_bytes) {} +}; + +enum UpdateStatus { // Return status For inplace update callback + UPDATE_FAILED = 0, // Nothing to update + UPDATED_INPLACE = 1, // Value updated inplace + UPDATED = 2, // No inplace update. Merged value set +}; + + +struct AdvancedColumnFamilyOptions { + // The maximum number of write buffers that are built up in memory. + // The default and the minimum number is 2, so that when 1 write buffer + // is being flushed to storage, new writes can continue to the other + // write buffer. + // If max_write_buffer_number > 3, writing will be slowed down to + // options.delayed_write_rate if we are writing to the last write buffer + // allowed. 
+ // + // Default: 2 + // + // Dynamically changeable through SetOptions() API + int max_write_buffer_number = 2; + + // The minimum number of write buffers that will be merged together + // before writing to storage. If set to 1, then + // all write buffers are flushed to L0 as individual files and this increases + // read amplification because a get request has to check in all of these + // files. Also, an in-memory merge may result in writing lesser + // data to storage if there are duplicate records in each of these + // individual write buffers. Default: 1 + int min_write_buffer_number_to_merge = 1; + + // The total maximum number of write buffers to maintain in memory including + // copies of buffers that have already been flushed. Unlike + // max_write_buffer_number, this parameter does not affect flushing. + // This controls the minimum amount of write history that will be available + // in memory for conflict checking when Transactions are used. + // + // When using an OptimisticTransactionDB: + // If this value is too low, some transactions may fail at commit time due + // to not being able to determine whether there were any write conflicts. + // + // When using a TransactionDB: + // If Transaction::SetSnapshot is used, TransactionDB will read either + // in-memory write buffers or SST files to do write-conflict checking. + // Increasing this value can reduce the number of reads to SST files + // done for conflict detection. + // + // Setting this value to 0 will cause write buffers to be freed immediately + // after they are flushed. + // If this value is set to -1, 'max_write_buffer_number' will be used. + // + // Default: + // If using a TransactionDB/OptimisticTransactionDB, the default value will + // be set to the value of 'max_write_buffer_number' if it is not explicitly + // set by the user. Otherwise, the default is 0. + int max_write_buffer_number_to_maintain = 0; + + // Allows thread-safe inplace updates. 
If this is true, there is no way to + // achieve point-in-time consistency using snapshot or iterator (assuming + // concurrent updates). Hence iterator and multi-get will return results + // which are not consistent as of any point-in-time. + // If inplace_callback function is not set, + // Put(key, new_value) will update inplace the existing_value iff + // * key exists in current memtable + // * new sizeof(new_value) <= sizeof(existing_value) + // * existing_value for that key is a put i.e. kTypeValue + // If inplace_callback function is set, check doc for inplace_callback. + // Default: false. + bool inplace_update_support = false; + + // Number of locks used for inplace update + // Default: 10000, if inplace_update_support = true, else 0. + // + // Dynamically changeable through SetOptions() API + size_t inplace_update_num_locks = 10000; + + // existing_value - pointer to previous value (from both memtable and sst). + // nullptr if key doesn't exist + // existing_value_size - pointer to size of existing_value). + // nullptr if key doesn't exist + // delta_value - Delta value to be merged with the existing_value. + // Stored in transaction logs. + // merged_value - Set when delta is applied on the previous value. + + // Applicable only when inplace_update_support is true, + // this callback function is called at the time of updating the memtable + // as part of a Put operation, lets say Put(key, delta_value). It allows the + // 'delta_value' specified as part of the Put operation to be merged with + // an 'existing_value' of the key in the database. + + // If the merged value is smaller in size that the 'existing_value', + // then this function can update the 'existing_value' buffer inplace and + // the corresponding 'existing_value'_size pointer, if it wishes to. + // The callback should return UpdateStatus::UPDATED_INPLACE. + // In this case. (In this case, the snapshot-semantics of the rocksdb + // Iterator is not atomic anymore). 
+ + // If the merged value is larger in size than the 'existing_value' or the + // application does not wish to modify the 'existing_value' buffer inplace, + // then the merged value should be returned via *merge_value. It is set by + // merging the 'existing_value' and the Put 'delta_value'. The callback should + // return UpdateStatus::UPDATED in this case. This merged value will be added + // to the memtable. + + // If merging fails or the application does not wish to take any action, + // then the callback should return UpdateStatus::UPDATE_FAILED. + + // Please remember that the original call from the application is Put(key, + // delta_value). So the transaction log (if enabled) will still contain (key, + // delta_value). The 'merged_value' is not stored in the transaction log. + // Hence the inplace_callback function should be consistent across db reopens. + + // Default: nullptr + UpdateStatus (*inplace_callback)(char* existing_value, + uint32_t* existing_value_size, + Slice delta_value, + std::string* merged_value) = nullptr; + + // if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0, + // create prefix bloom for memtable with the size of + // write_buffer_size * memtable_prefix_bloom_size_ratio. + // If it is larger than 0.25, it is santinized to 0.25. + // + // Default: 0 (disable) + // + // Dynamically changeable through SetOptions() API + double memtable_prefix_bloom_size_ratio = 0.0; + + // Page size for huge page for the arena used by the memtable. If <=0, it + // won't allocate from huge page but from malloc. + // Users are responsible to reserve huge pages for it to be allocated. For + // example: + // sysctl -w vm.nr_hugepages=20 + // See linux doc Documentation/vm/hugetlbpage.txt + // If there isn't enough free huge page available, it will fall back to + // malloc. 
+ // + // Dynamically changeable through SetOptions() API + size_t memtable_huge_page_size = 0; + + // If non-nullptr, memtable will use the specified function to extract + // prefixes for keys, and for each prefix maintain a hint of insert location + // to reduce CPU usage for inserting keys with the prefix. Keys out of + // domain of the prefix extractor will be insert without using hints. + // + // Currently only the default skiplist based memtable implements the feature. + // All other memtable implementation will ignore the option. It incurs ~250 + // additional bytes of memory overhead to store a hint for each prefix. + // Also concurrent writes (when allow_concurrent_memtable_write is true) will + // ignore the option. + // + // The option is best suited for workloads where keys will likely to insert + // to a location close the the last inserted key with the same prefix. + // One example could be inserting keys of the form (prefix + timestamp), + // and keys of the same prefix always comes in with time order. Another + // example would be updating the same key over and over again, in which case + // the prefix can be the key itself. + // + // Default: nullptr (disable) + std::shared_ptr + memtable_insert_with_hint_prefix_extractor = nullptr; + + // Control locality of bloom filter probes to improve cache miss rate. + // This option only applies to memtable prefix bloom and plaintable + // prefix bloom. It essentially limits every bloom checking to one cache line. + // This optimization is turned off when set to 0, and positive number to turn + // it on. + // Default: 0 + uint32_t bloom_locality = 0; + + // size of one block in arena memory allocation. + // If <= 0, a proper value is automatically calculated (usually 1/8 of + // writer_buffer_size, rounded up to a multiple of 4KB). 
+ // + // There are two additional restriction of the The specified size: + // (1) size should be in the range of [4096, 2 << 30] and + // (2) be the multiple of the CPU word (which helps with the memory + // alignment). + // + // We'll automatically check and adjust the size number to make sure it + // conforms to the restrictions. + // + // Default: 0 + // + // Dynamically changeable through SetOptions() API + size_t arena_block_size = 0; + + // Different levels can have different compression policies. There + // are cases where most lower levels would like to use quick compression + // algorithms while the higher levels (which have more data) use + // compression algorithms that have better compression but could + // be slower. This array, if non-empty, should have an entry for + // each level of the database; these override the value specified in + // the previous field 'compression'. + // + // NOTICE if level_compaction_dynamic_level_bytes=true, + // compression_per_level[0] still determines L0, but other elements + // of the array are based on base level (the level L0 files are merged + // to), and may not match the level users see from info log for metadata. + // If L0 files are merged to level-n, then, for i>0, compression_per_level[i] + // determines compaction type for level n+i-1. + // For example, if we have three 5 levels, and we determine to merge L0 + // data to L4 (which means L1..L3 will be empty), then the new files go to + // L4 uses compression type compression_per_level[1]. + // If now L0 is merged to L2. Data goes to L2 will be compressed + // according to compression_per_level[1], L3 using compression_per_level[2] + // and L4 using compression_per_level[3]. Compaction for each level can + // change when data grows. + std::vector compression_per_level; + + // Number of levels for this database + int num_levels = 7; + + // Soft limit on number of level-0 files. We start slowing down writes at this + // point. 
A value <0 means that no writing slow down will be triggered by + // number of files in level-0. + // + // Default: 20 + // + // Dynamically changeable through SetOptions() API + int level0_slowdown_writes_trigger = 20; + + // Maximum number of level-0 files. We stop writes at this point. + // + // Default: 36 + // + // Dynamically changeable through SetOptions() API + int level0_stop_writes_trigger = 36; + + // Target file size for compaction. + // target_file_size_base is per-file size for level-1. + // Target file size for level L can be calculated by + // target_file_size_base * (target_file_size_multiplier ^ (L-1)) + // For example, if target_file_size_base is 2MB and + // target_file_size_multiplier is 10, then each file on level-1 will + // be 2MB, and each file on level 2 will be 20MB, + // and each file on level-3 will be 200MB. + // + // Default: 64MB. + // + // Dynamically changeable through SetOptions() API + uint64_t target_file_size_base = 64 * 1048576; + + // By default target_file_size_multiplier is 1, which means + // by default files in different levels will have similar size. + // + // Dynamically changeable through SetOptions() API + int target_file_size_multiplier = 1; + + // If true, RocksDB will pick target size of each level dynamically. + // We will pick a base level b >= 1. L0 will be directly merged into level b, + // instead of always into level 1. Level 1 to b-1 need to be empty. + // We try to pick b and its target size so that + // 1. target size is in the range of + // (max_bytes_for_level_base / max_bytes_for_level_multiplier, + // max_bytes_for_level_base] + // 2. target size of the last level (level num_levels-1) equals to extra size + // of the level. + // At the same time max_bytes_for_level_multiplier and + // max_bytes_for_level_multiplier_additional are still satisfied. 
+ // + // With this option on, from an empty DB, we make last level the base level, + // which means merging L0 data into the last level, until it exceeds + // max_bytes_for_level_base. And then we make the second last level to be + // base level, to start to merge L0 data to second last level, with its + // target size to be 1/max_bytes_for_level_multiplier of the last level's + // extra size. After the data accumulates more so that we need to move the + // base level to the third last one, and so on. + // + // For example, assume max_bytes_for_level_multiplier=10, num_levels=6, + // and max_bytes_for_level_base=10MB. + // Target sizes of level 1 to 5 starts with: + // [- - - - 10MB] + // with base level is level. Target sizes of level 1 to 4 are not applicable + // because they will not be used. + // Until the size of Level 5 grows to more than 10MB, say 11MB, we make + // base target to level 4 and now the targets looks like: + // [- - - 1.1MB 11MB] + // While data are accumulated, size targets are tuned based on actual data + // of level 5. When level 5 has 50MB of data, the target is like: + // [- - - 5MB 50MB] + // Until level 5's actual size is more than 100MB, say 101MB. Now if we keep + // level 4 to be the base level, its target size needs to be 10.1MB, which + // doesn't satisfy the target size range. So now we make level 3 the target + // size and the target sizes of the levels look like: + // [- - 1.01MB 10.1MB 101MB] + // In the same way, while level 5 further grows, all levels' targets grow, + // like + // [- - 5MB 50MB 500MB] + // Until level 5 exceeds 1000MB and becomes 1001MB, we make level 2 the + // base level and make levels' target sizes like this: + // [- 1.001MB 10.01MB 100.1MB 1001MB] + // and go on... + // + // By doing it, we give max_bytes_for_level_multiplier a priority against + // max_bytes_for_level_base, for a more predictable LSM tree shape. It is + // useful to limit worse case space amplification. 
+ // + // max_bytes_for_level_multiplier_additional is ignored with this flag on. + // + // Turning this feature on or off for an existing DB can cause unexpected + // LSM tree structure so it's not recommended. + // + // NOTE: this option is experimental + // + // Default: false + bool level_compaction_dynamic_level_bytes = false; + + // Default: 10. + // + // Dynamically changeable through SetOptions() API + double max_bytes_for_level_multiplier = 10; + + // Different max-size multipliers for different levels. + // These are multiplied by max_bytes_for_level_multiplier to arrive + // at the max-size of each level. + // + // Default: 1 + // + // Dynamically changeable through SetOptions() API + std::vector max_bytes_for_level_multiplier_additional = + std::vector(num_levels, 1); + + // We try to limit number of bytes in one compaction to be lower than this + // threshold. But it's not guaranteed. + // Value 0 will be sanitized. + // + // Default: result.target_file_size_base * 25 + uint64_t max_compaction_bytes = 0; + + // All writes will be slowed down to at least delayed_write_rate if estimated + // bytes needed to be compaction exceed this threshold. + // + // Default: 64GB + uint64_t soft_pending_compaction_bytes_limit = 64 * 1073741824ull; + + // All writes are stopped if estimated bytes needed to be compaction exceed + // this threshold. + // + // Default: 256GB + uint64_t hard_pending_compaction_bytes_limit = 256 * 1073741824ull; + + // The compaction style. Default: kCompactionStyleLevel + CompactionStyle compaction_style = kCompactionStyleLevel; + + // If level compaction_style = kCompactionStyleLevel, for each level, + // which files are prioritized to be picked to compact. 
+ // Default: kByCompensatedSize + CompactionPri compaction_pri = kByCompensatedSize; + + // The options needed to support Universal Style compactions + CompactionOptionsUniversal compaction_options_universal; + + // The options for FIFO compaction style + CompactionOptionsFIFO compaction_options_fifo; + + // An iteration->Next() sequentially skips over keys with the same + // user-key unless this option is set. This number specifies the number + // of keys (with the same userkey) that will be sequentially + // skipped before a reseek is issued. + // + // Default: 8 + // + // Dynamically changeable through SetOptions() API + uint64_t max_sequential_skip_in_iterations = 8; + + // This is a factory that provides MemTableRep objects. + // Default: a factory that provides a skip-list-based implementation of + // MemTableRep. + std::shared_ptr memtable_factory = + std::shared_ptr(new SkipListFactory); + + // Block-based table related options are moved to BlockBasedTableOptions. + // Related options that were originally here but now moved include: + // no_block_cache + // block_cache + // block_cache_compressed + // block_size + // block_size_deviation + // block_restart_interval + // filter_policy + // whole_key_filtering + // If you'd like to customize some of these options, you will need to + // use NewBlockBasedTableFactory() to construct a new table factory. + + // This option allows user to collect their own interested statistics of + // the tables. + // Default: empty vector -- no user-defined statistics collection will be + // performed. + typedef std::vector> + TablePropertiesCollectorFactories; + TablePropertiesCollectorFactories table_properties_collector_factories; + + // Maximum number of successive merge operations on a key in the memtable. 
+ // + // When a merge operation is added to the memtable and the maximum number of + // successive merges is reached, the value of the key will be calculated and + // inserted into the memtable instead of the merge operation. This will + // ensure that there are never more than max_successive_merges merge + // operations in the memtable. + // + // Default: 0 (disabled) + // + // Dynamically changeable through SetOptions() API + size_t max_successive_merges = 0; + + // This flag specifies that the implementation should optimize the filters + // mainly for cases where keys are found rather than also optimize for keys + // missed. This would be used in cases where the application knows that + // there are very few misses or the performance in the case of misses is not + // important. + // + // For now, this flag allows us to not store filters for the last level i.e + // the largest level which contains data of the LSM store. For keys which + // are hits, the filters in this level are not useful because we will search + // for the data anyway. NOTE: the filters in other levels are still useful + // even for key hit because they tell us whether to look in that level or go + // to the higher level. + // + // Default: false + bool optimize_filters_for_hits = false; + + // After writing every SST file, reopen it and read all the keys. + // Default: false + bool paranoid_file_checks = false; + + // In debug mode, RocksDB run consistency checks on the LSM everytime the LSM + // change (Flush, Compaction, AddFile). These checks are disabled in release + // mode, use this option to enable them in release mode as well. + // Default: false + bool force_consistency_checks = false; + + // Measure IO stats in compactions and flushes, if true. 
+ // Default: false + bool report_bg_io_stats = false; + + // Create ColumnFamilyOptions with default values for all fields + AdvancedColumnFamilyOptions(); + // Create ColumnFamilyOptions from Options + explicit AdvancedColumnFamilyOptions(const Options& options); + + // ---------------- DEPRECATED OPTIONS ---------------- + + // DEPRECATED + // This does not do anything anymore. + int max_mem_compaction_level; + + // DEPRECATED -- this options is no longer used + // Puts are delayed to options.delayed_write_rate when any level has a + // compaction score that exceeds soft_rate_limit. This is ignored when == 0.0. + // + // Default: 0 (disabled) + // + // Dynamically changeable through SetOptions() API + double soft_rate_limit = 0.0; + + // DEPRECATED -- this options is no longer used + double hard_rate_limit = 0.0; + + // DEPRECATED -- this options is no longer used + unsigned int rate_limit_delay_max_milliseconds = 100; + + // DEPREACTED + // Does not have any effect. + bool purge_redundant_kvs_while_flush = true; +}; + +} // namespace rocksdb diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 2f703dd4c..53ce0aefc 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -17,10 +17,10 @@ #include #include +#include "rocksdb/advanced_options.h" #include "rocksdb/comparator.h" #include "rocksdb/env.h" #include "rocksdb/listener.h" -#include "rocksdb/memtablerep.h" #include "rocksdb/universal_compaction.h" #include "rocksdb/version.h" #include "rocksdb/write_buffer_manager.h" @@ -42,12 +42,9 @@ class FilterPolicy; class Logger; class MergeOperator; class Snapshot; -class TableFactory; class MemTableRepFactory; -class TablePropertiesCollectorFactory; class RateLimiter; class Slice; -class SliceTransform; class Statistics; class InternalKeyComparator; class WalFilter; @@ -79,113 +76,9 @@ enum CompressionType : unsigned char { kDisableCompressionOption = 0xff, }; -enum CompactionStyle : char { - // level based compaction style - 
kCompactionStyleLevel = 0x0, - // Universal compaction style - // Not supported in ROCKSDB_LITE. - kCompactionStyleUniversal = 0x1, - // FIFO compaction style - // Not supported in ROCKSDB_LITE - kCompactionStyleFIFO = 0x2, - // Disable background compaction. Compaction jobs are submitted - // via CompactFiles(). - // Not supported in ROCKSDB_LITE - kCompactionStyleNone = 0x3, -}; - -// In Level-based comapction, it Determines which file from a level to be -// picked to merge to the next level. We suggest people try -// kMinOverlappingRatio first when you tune your database. -enum CompactionPri : char { - // Slightly Priotize larger files by size compensated by #deletes - kByCompensatedSize = 0x0, - // First compact files whose data's latest update time is oldest. - // Try this if you only update some hot keys in small ranges. - kOldestLargestSeqFirst = 0x1, - // First compact files whose range hasn't been compacted to the next level - // for the longest. If your updates are random across the key space, - // write amplification is slightly better with this option. - kOldestSmallestSeqFirst = 0x2, - // First compact files whose ratio between overlapping size in next level - // and its size is the smallest. It in many cases can optimize write - // amplification. 
- kMinOverlappingRatio = 0x3, -}; - -enum class WALRecoveryMode : char { - // Original levelDB recovery - // We tolerate incomplete record in trailing data on all logs - // Use case : This is legacy behavior (default) - kTolerateCorruptedTailRecords = 0x00, - // Recover from clean shutdown - // We don't expect to find any corruption in the WAL - // Use case : This is ideal for unit tests and rare applications that - // can require high consistency guarantee - kAbsoluteConsistency = 0x01, - // Recover to point-in-time consistency - // We stop the WAL playback on discovering WAL inconsistency - // Use case : Ideal for systems that have disk controller cache like - // hard disk, SSD without super capacitor that store related data - kPointInTimeRecovery = 0x02, - // Recovery after a disaster - // We ignore any corruption in the WAL and try to salvage as much data as - // possible - // Use case : Ideal for last ditch effort to recover data or systems that - // operate with low grade unrelated data - kSkipAnyCorruptedRecords = 0x03, -}; - -struct CompactionOptionsFIFO { - // once the total sum of table files reaches this, we will delete the oldest - // table file - // Default: 1GB - uint64_t max_table_files_size; - - CompactionOptionsFIFO() : max_table_files_size(1 * 1024 * 1024 * 1024) {} -}; - -// Compression options for different compression algorithms like Zlib -struct CompressionOptions { - int window_bits; - int level; - int strategy; - // Maximum size of dictionary used to prime the compression library. Currently - // this dictionary will be constructed by sampling the first output file in a - // subcompaction when the target level is bottommost. This dictionary will be - // loaded into the compression library before compressing/uncompressing each - // data block of subsequent files in the subcompaction. Effectively, this - // improves compression ratios when there are repetitions across data blocks. - // A value of 0 indicates the feature is disabled. 
- // Default: 0. - uint32_t max_dict_bytes; - - CompressionOptions() - : window_bits(-14), level(-1), strategy(0), max_dict_bytes(0) {} - CompressionOptions(int wbits, int _lev, int _strategy, int _max_dict_bytes) - : window_bits(wbits), - level(_lev), - strategy(_strategy), - max_dict_bytes(_max_dict_bytes) {} -}; - -enum UpdateStatus { // Return status For inplace update callback - UPDATE_FAILED = 0, // Nothing to update - UPDATED_INPLACE = 1, // Value updated inplace - UPDATED = 2, // No inplace update. Merged value set -}; - -struct DbPath { - std::string path; - uint64_t target_size; // Target size of total files under the path, in byte. - - DbPath() : target_size(0) {} - DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {} -}; - struct Options; -struct ColumnFamilyOptions { +struct ColumnFamilyOptions : public AdvancedColumnFamilyOptions { // The function recovers options to a previous version. Only 4.6 or later // versions are supported. ColumnFamilyOptions* OldDefaults(int rocksdb_major_version = 4, @@ -295,54 +188,6 @@ struct ColumnFamilyOptions { // Dynamically changeable through SetOptions() API size_t write_buffer_size = 64 << 20; - // The maximum number of write buffers that are built up in memory. - // The default and the minimum number is 2, so that when 1 write buffer - // is being flushed to storage, new writes can continue to the other - // write buffer. - // If max_write_buffer_number > 3, writing will be slowed down to - // options.delayed_write_rate if we are writing to the last write buffer - // allowed. - // - // Default: 2 - // - // Dynamically changeable through SetOptions() API - int max_write_buffer_number = 2; - - // The minimum number of write buffers that will be merged together - // before writing to storage. If set to 1, then - // all write buffers are flushed to L0 as individual files and this increases - // read amplification because a get request has to check in all of these - // files. 
Also, an in-memory merge may result in writing lesser - // data to storage if there are duplicate records in each of these - // individual write buffers. Default: 1 - int min_write_buffer_number_to_merge = 1; - - // The total maximum number of write buffers to maintain in memory including - // copies of buffers that have already been flushed. Unlike - // max_write_buffer_number, this parameter does not affect flushing. - // This controls the minimum amount of write history that will be available - // in memory for conflict checking when Transactions are used. - // - // When using an OptimisticTransactionDB: - // If this value is too low, some transactions may fail at commit time due - // to not being able to determine whether there were any write conflicts. - // - // When using a TransactionDB: - // If Transaction::SetSnapshot is used, TransactionDB will read either - // in-memory write buffers or SST files to do write-conflict checking. - // Increasing this value can reduce the number of reads to SST files - // done for conflict detection. - // - // Setting this value to 0 will cause write buffers to be freed immediately - // after they are flushed. - // If this value is set to -1, 'max_write_buffer_number' will be used. - // - // Default: - // If using a TransactionDB/OptimisticTransactionDB, the default value will - // be set to the value of 'max_write_buffer_number' if it is not explicitly - // set by the user. Otherwise, the default is 0. - int max_write_buffer_number_to_maintain = 0; - // Compress blocks using the specified compression algorithm. This // parameter can be changed dynamically. // @@ -359,29 +204,6 @@ struct ColumnFamilyOptions { // efficiently detect that and will switch to uncompressed mode. CompressionType compression; - // Different levels can have different compression policies. 
There - // are cases where most lower levels would like to use quick compression - // algorithms while the higher levels (which have more data) use - // compression algorithms that have better compression but could - // be slower. This array, if non-empty, should have an entry for - // each level of the database; these override the value specified in - // the previous field 'compression'. - // - // NOTICE if level_compaction_dynamic_level_bytes=true, - // compression_per_level[0] still determines L0, but other elements - // of the array are based on base level (the level L0 files are merged - // to), and may not match the level users see from info log for metadata. - // If L0 files are merged to level-n, then, for i>0, compression_per_level[i] - // determines compaction type for level n+i-1. - // For example, if we have three 5 levels, and we determine to merge L0 - // data to L4 (which means L1..L3 will be empty), then the new files go to - // L4 uses compression type compression_per_level[1]. - // If now L0 is merged to L2. Data goes to L2 will be compressed - // according to compression_per_level[1], L3 using compression_per_level[2] - // and L4 using compression_per_level[3]. Compaction for each level can - // change when data grows. - std::vector compression_per_level; - // Compression algorithm that will be used for the bottommost level that // contain files. If level-compaction is used, this option will only affect // levels after base level. @@ -392,6 +214,14 @@ struct ColumnFamilyOptions { // different options for compression algorithms CompressionOptions compression_opts; + // Number of files to trigger level-0 compaction. A value <0 means that + // level-0 compaction will not be triggered by number of files at all. + // + // Default: 4 + // + // Dynamically changeable through SetOptions() API + int level0_file_num_compaction_trigger = 4; + // If non-nullptr, use the specified function to determine the // prefixes for keys. 
These prefixes will be placed in the filter. // Depending on the workload, this can reduce the number of read-IOP @@ -408,56 +238,6 @@ struct ColumnFamilyOptions { // Default: nullptr std::shared_ptr prefix_extractor = nullptr; - // Number of levels for this database - int num_levels = 7; - - // Number of files to trigger level-0 compaction. A value <0 means that - // level-0 compaction will not be triggered by number of files at all. - // - // Default: 4 - // - // Dynamically changeable through SetOptions() API - int level0_file_num_compaction_trigger = 4; - - // Soft limit on number of level-0 files. We start slowing down writes at this - // point. A value <0 means that no writing slow down will be triggered by - // number of files in level-0. - // - // Default: 20 - // - // Dynamically changeable through SetOptions() API - int level0_slowdown_writes_trigger = 20; - - // Maximum number of level-0 files. We stop writes at this point. - // - // Default: 36 - // - // Dynamically changeable through SetOptions() API - int level0_stop_writes_trigger = 36; - - // This does not do anything anymore. Deprecated. - int max_mem_compaction_level; - - // Target file size for compaction. - // target_file_size_base is per-file size for level-1. - // Target file size for level L can be calculated by - // target_file_size_base * (target_file_size_multiplier ^ (L-1)) - // For example, if target_file_size_base is 2MB and - // target_file_size_multiplier is 10, then each file on level-1 will - // be 2MB, and each file on level 2 will be 20MB, - // and each file on level-3 will be 200MB. - // - // Default: 64MB. - // - // Dynamically changeable through SetOptions() API - uint64_t target_file_size_base = 64 * 1048576; - - // By default target_file_size_multiplier is 1, which means - // by default files in different levels will have similar size. 
- // - // Dynamically changeable through SetOptions() API - int target_file_size_multiplier = 1; - // Control maximum total data size for a level. // max_bytes_for_level_base is the max total for level-1. // Maximum number of bytes for level L can be calculated as @@ -472,355 +252,18 @@ struct ColumnFamilyOptions { // Dynamically changeable through SetOptions() API uint64_t max_bytes_for_level_base = 256 * 1048576; - // If true, RocksDB will pick target size of each level dynamically. - // We will pick a base level b >= 1. L0 will be directly merged into level b, - // instead of always into level 1. Level 1 to b-1 need to be empty. - // We try to pick b and its target size so that - // 1. target size is in the range of - // (max_bytes_for_level_base / max_bytes_for_level_multiplier, - // max_bytes_for_level_base] - // 2. target size of the last level (level num_levels-1) equals to extra size - // of the level. - // At the same time max_bytes_for_level_multiplier and - // max_bytes_for_level_multiplier_additional are still satisfied. - // - // With this option on, from an empty DB, we make last level the base level, - // which means merging L0 data into the last level, until it exceeds - // max_bytes_for_level_base. And then we make the second last level to be - // base level, to start to merge L0 data to second last level, with its - // target size to be 1/max_bytes_for_level_multiplier of the last level's - // extra size. After the data accumulates more so that we need to move the - // base level to the third last one, and so on. - // - // For example, assume max_bytes_for_level_multiplier=10, num_levels=6, - // and max_bytes_for_level_base=10MB. - // Target sizes of level 1 to 5 starts with: - // [- - - - 10MB] - // with base level is level. Target sizes of level 1 to 4 are not applicable - // because they will not be used. 
- // Until the size of Level 5 grows to more than 10MB, say 11MB, we make - // base target to level 4 and now the targets looks like: - // [- - - 1.1MB 11MB] - // While data are accumulated, size targets are tuned based on actual data - // of level 5. When level 5 has 50MB of data, the target is like: - // [- - - 5MB 50MB] - // Until level 5's actual size is more than 100MB, say 101MB. Now if we keep - // level 4 to be the base level, its target size needs to be 10.1MB, which - // doesn't satisfy the target size range. So now we make level 3 the target - // size and the target sizes of the levels look like: - // [- - 1.01MB 10.1MB 101MB] - // In the same way, while level 5 further grows, all levels' targets grow, - // like - // [- - 5MB 50MB 500MB] - // Until level 5 exceeds 1000MB and becomes 1001MB, we make level 2 the - // base level and make levels' target sizes like this: - // [- 1.001MB 10.01MB 100.1MB 1001MB] - // and go on... - // - // By doing it, we give max_bytes_for_level_multiplier a priority against - // max_bytes_for_level_base, for a more predictable LSM tree shape. It is - // useful to limit worse case space amplification. - // - // max_bytes_for_level_multiplier_additional is ignored with this flag on. - // - // Turning this feature on or off for an existing DB can cause unexpected - // LSM tree structure so it's not recommended. - // - // NOTE: this option is experimental - // - // Default: false - bool level_compaction_dynamic_level_bytes = false; - - // Default: 10. - // - // Dynamically changeable through SetOptions() API - double max_bytes_for_level_multiplier = 10; - - // Different max-size multipliers for different levels. - // These are multiplied by max_bytes_for_level_multiplier to arrive - // at the max-size of each level. 
- // - // Default: 1 - // - // Dynamically changeable through SetOptions() API - std::vector max_bytes_for_level_multiplier_additional = - std::vector(num_levels, 1); - - // We try to limit number of bytes in one compaction to be lower than this - // threshold. But it's not guaranteed. - // Value 0 will be sanitized. - // - // Default: result.target_file_size_base * 25 - uint64_t max_compaction_bytes = 0; - - // DEPRECATED -- this options is no longer used - // Puts are delayed to options.delayed_write_rate when any level has a - // compaction score that exceeds soft_rate_limit. This is ignored when == 0.0. - // - // Default: 0 (disabled) - // - // Dynamically changeable through SetOptions() API - double soft_rate_limit = 0.0; - - // DEPRECATED -- this options is no longer used - double hard_rate_limit = 0.0; - - // All writes will be slowed down to at least delayed_write_rate if estimated - // bytes needed to be compaction exceed this threshold. - // - // Default: 64GB - uint64_t soft_pending_compaction_bytes_limit = 64 * 1073741824ull; - - // All writes are stopped if estimated bytes needed to be compaction exceed - // this threshold. - // - // Default: 256GB - uint64_t hard_pending_compaction_bytes_limit = 256 * 1073741824ull; - - // DEPRECATED -- this options is no longer used - unsigned int rate_limit_delay_max_milliseconds = 100; - - // size of one block in arena memory allocation. - // If <= 0, a proper value is automatically calculated (usually 1/8 of - // writer_buffer_size, rounded up to a multiple of 4KB). - // - // There are two additional restriction of the The specified size: - // (1) size should be in the range of [4096, 2 << 30] and - // (2) be the multiple of the CPU word (which helps with the memory - // alignment). - // - // We'll automatically check and adjust the size number to make sure it - // conforms to the restrictions. 
- // - // Default: 0 - // - // Dynamically changeable through SetOptions() API - size_t arena_block_size = 0; - // Disable automatic compactions. Manual compactions can still // be issued on this column family // // Dynamically changeable through SetOptions() API bool disable_auto_compactions = false; - // DEPREACTED - // Does not have any effect. - bool purge_redundant_kvs_while_flush = true; - - // The compaction style. Default: kCompactionStyleLevel - CompactionStyle compaction_style = kCompactionStyleLevel; - - // If level compaction_style = kCompactionStyleLevel, for each level, - // which files are prioritized to be picked to compact. - // Default: kByCompensatedSize - CompactionPri compaction_pri = kByCompensatedSize; - - - // The options needed to support Universal Style compactions - CompactionOptionsUniversal compaction_options_universal; - - // The options for FIFO compaction style - CompactionOptionsFIFO compaction_options_fifo; - - // An iteration->Next() sequentially skips over keys with the same - // user-key unless this option is set. This number specifies the number - // of keys (with the same userkey) that will be sequentially - // skipped before a reseek is issued. - // - // Default: 8 - // - // Dynamically changeable through SetOptions() API - uint64_t max_sequential_skip_in_iterations = 8; - - // This is a factory that provides MemTableRep objects. - // Default: a factory that provides a skip-list-based implementation of - // MemTableRep. - std::shared_ptr memtable_factory = - std::shared_ptr(new SkipListFactory); - // This is a factory that provides TableFactory objects. // Default: a block-based table factory that provides a default // implementation of TableBuilder and TableReader with default // BlockBasedTableOptions. std::shared_ptr table_factory; - // Block-based table related options are moved to BlockBasedTableOptions. 
- // Related options that were originally here but now moved include: - // no_block_cache - // block_cache - // block_cache_compressed - // block_size - // block_size_deviation - // block_restart_interval - // filter_policy - // whole_key_filtering - // If you'd like to customize some of these options, you will need to - // use NewBlockBasedTableFactory() to construct a new table factory. - - // This option allows user to collect their own interested statistics of - // the tables. - // Default: empty vector -- no user-defined statistics collection will be - // performed. - typedef std::vector> - TablePropertiesCollectorFactories; - TablePropertiesCollectorFactories table_properties_collector_factories; - - // Allows thread-safe inplace updates. If this is true, there is no way to - // achieve point-in-time consistency using snapshot or iterator (assuming - // concurrent updates). Hence iterator and multi-get will return results - // which are not consistent as of any point-in-time. - // If inplace_callback function is not set, - // Put(key, new_value) will update inplace the existing_value iff - // * key exists in current memtable - // * new sizeof(new_value) <= sizeof(existing_value) - // * existing_value for that key is a put i.e. kTypeValue - // If inplace_callback function is set, check doc for inplace_callback. - // Default: false. - bool inplace_update_support = false; - - // Number of locks used for inplace update - // Default: 10000, if inplace_update_support = true, else 0. - // - // Dynamically changeable through SetOptions() API - size_t inplace_update_num_locks = 10000; - - // existing_value - pointer to previous value (from both memtable and sst). - // nullptr if key doesn't exist - // existing_value_size - pointer to size of existing_value). - // nullptr if key doesn't exist - // delta_value - Delta value to be merged with the existing_value. - // Stored in transaction logs. - // merged_value - Set when delta is applied on the previous value. 
- - // Applicable only when inplace_update_support is true, - // this callback function is called at the time of updating the memtable - // as part of a Put operation, lets say Put(key, delta_value). It allows the - // 'delta_value' specified as part of the Put operation to be merged with - // an 'existing_value' of the key in the database. - - // If the merged value is smaller in size that the 'existing_value', - // then this function can update the 'existing_value' buffer inplace and - // the corresponding 'existing_value'_size pointer, if it wishes to. - // The callback should return UpdateStatus::UPDATED_INPLACE. - // In this case. (In this case, the snapshot-semantics of the rocksdb - // Iterator is not atomic anymore). - - // If the merged value is larger in size than the 'existing_value' or the - // application does not wish to modify the 'existing_value' buffer inplace, - // then the merged value should be returned via *merge_value. It is set by - // merging the 'existing_value' and the Put 'delta_value'. The callback should - // return UpdateStatus::UPDATED in this case. This merged value will be added - // to the memtable. - - // If merging fails or the application does not wish to take any action, - // then the callback should return UpdateStatus::UPDATE_FAILED. - - // Please remember that the original call from the application is Put(key, - // delta_value). So the transaction log (if enabled) will still contain (key, - // delta_value). The 'merged_value' is not stored in the transaction log. - // Hence the inplace_callback function should be consistent across db reopens. - - // Default: nullptr - UpdateStatus (*inplace_callback)(char* existing_value, - uint32_t* existing_value_size, - Slice delta_value, - std::string* merged_value) = nullptr; - - // if prefix_extractor is set and memtable_prefix_bloom_size_ratio is not 0, - // create prefix bloom for memtable with the size of - // write_buffer_size * memtable_prefix_bloom_size_ratio. 
- // If it is larger than 0.25, it is santinized to 0.25. - // - // Default: 0 (disable) - // - // Dynamically changeable through SetOptions() API - double memtable_prefix_bloom_size_ratio = 0.0; - - // Page size for huge page for the arena used by the memtable. If <=0, it - // won't allocate from huge page but from malloc. - // Users are responsible to reserve huge pages for it to be allocated. For - // example: - // sysctl -w vm.nr_hugepages=20 - // See linux doc Documentation/vm/hugetlbpage.txt - // If there isn't enough free huge page available, it will fall back to - // malloc. - // - // Dynamically changeable through SetOptions() API - size_t memtable_huge_page_size = 0; - - // If non-nullptr, memtable will use the specified function to extract - // prefixes for keys, and for each prefix maintain a hint of insert location - // to reduce CPU usage for inserting keys with the prefix. Keys out of - // domain of the prefix extractor will be insert without using hints. - // - // Currently only the default skiplist based memtable implements the feature. - // All other memtable implementation will ignore the option. It incurs ~250 - // additional bytes of memory overhead to store a hint for each prefix. - // Also concurrent writes (when allow_concurrent_memtable_write is true) will - // ignore the option. - // - // The option is best suited for workloads where keys will likely to insert - // to a location close the the last inserted key with the same prefix. - // One example could be inserting keys of the form (prefix + timestamp), - // and keys of the same prefix always comes in with time order. Another - // example would be updating the same key over and over again, in which case - // the prefix can be the key itself. - // - // Default: nullptr (disable) - std::shared_ptr - memtable_insert_with_hint_prefix_extractor = nullptr; - - // Control locality of bloom filter probes to improve cache miss rate. 
- // This option only applies to memtable prefix bloom and plaintable - // prefix bloom. It essentially limits every bloom checking to one cache line. - // This optimization is turned off when set to 0, and positive number to turn - // it on. - // Default: 0 - uint32_t bloom_locality = 0; - - // Maximum number of successive merge operations on a key in the memtable. - // - // When a merge operation is added to the memtable and the maximum number of - // successive merges is reached, the value of the key will be calculated and - // inserted into the memtable instead of the merge operation. This will - // ensure that there are never more than max_successive_merges merge - // operations in the memtable. - // - // Default: 0 (disabled) - // - // Dynamically changeable through SetOptions() API - size_t max_successive_merges = 0; - - // This flag specifies that the implementation should optimize the filters - // mainly for cases where keys are found rather than also optimize for keys - // missed. This would be used in cases where the application knows that - // there are very few misses or the performance in the case of misses is not - // important. - // - // For now, this flag allows us to not store filters for the last level i.e - // the largest level which contains data of the LSM store. For keys which - // are hits, the filters in this level are not useful because we will search - // for the data anyway. NOTE: the filters in other levels are still useful - // even for key hit because they tell us whether to look in that level or go - // to the higher level. - // - // Default: false - bool optimize_filters_for_hits = false; - - // After writing every SST file, reopen it and read all the keys. - // Default: false - bool paranoid_file_checks = false; - - // In debug mode, RocksDB run consistency checks on the LSM everytime the LSM - // change (Flush, Compaction, AddFile). 
These checks are disabled in release - // mode, use this option to enable them in release mode as well. - // Default: false - bool force_consistency_checks = false; - - // Measure IO stats in compactions and flushes, if true. - // Default: false - bool report_bg_io_stats = false; - // Create ColumnFamilyOptions with default values for all fields ColumnFamilyOptions(); // Create ColumnFamilyOptions from Options @@ -829,6 +272,38 @@ struct ColumnFamilyOptions { void Dump(Logger* log) const; }; +enum class WALRecoveryMode : char { + // Original levelDB recovery + // We tolerate incomplete record in trailing data on all logs + // Use case : This is legacy behavior (default) + kTolerateCorruptedTailRecords = 0x00, + // Recover from clean shutdown + // We don't expect to find any corruption in the WAL + // Use case : This is ideal for unit tests and rare applications that + // can require high consistency guarantee + kAbsoluteConsistency = 0x01, + // Recover to point-in-time consistency + // We stop the WAL playback on discovering WAL inconsistency + // Use case : Ideal for systems that have disk controller cache like + // hard disk, SSD without super capacitor that store related data + kPointInTimeRecovery = 0x02, + // Recovery after a disaster + // We ignore any corruption in the WAL and try to salvage as much data as + // possible + // Use case : Ideal for last ditch effort to recover data or systems that + // operate with low grade unrelated data + kSkipAnyCorruptedRecords = 0x03, +}; + +struct DbPath { + std::string path; + uint64_t target_size; // Target size of total files under the path, in byte. + + DbPath() : target_size(0) {} + DbPath(const std::string& p, uint64_t t) : path(p), target_size(t) {} +}; + + struct DBOptions { // The function recovers options to the option as in version 4.6. 
DBOptions* OldDefaults(int rocksdb_major_version = 4, diff --git a/util/options.cc b/util/options.cc index b16d0feb4..b45dc93bd 100644 --- a/util/options.cc +++ b/util/options.cc @@ -36,53 +36,42 @@ namespace rocksdb { -ColumnFamilyOptions::ColumnFamilyOptions() - : compression(Snappy_Supported() ? kSnappyCompression : kNoCompression), - table_factory( - std::shared_ptr(new BlockBasedTableFactory())) { +AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions() { assert(memtable_factory.get() != nullptr); } -ColumnFamilyOptions::ColumnFamilyOptions(const Options& options) - : comparator(options.comparator), - merge_operator(options.merge_operator), - compaction_filter(options.compaction_filter), - compaction_filter_factory(options.compaction_filter_factory), - write_buffer_size(options.write_buffer_size), - max_write_buffer_number(options.max_write_buffer_number), +AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) + : max_write_buffer_number(options.max_write_buffer_number), min_write_buffer_number_to_merge( options.min_write_buffer_number_to_merge), max_write_buffer_number_to_maintain( options.max_write_buffer_number_to_maintain), - compression(options.compression), + inplace_update_support(options.inplace_update_support), + inplace_update_num_locks(options.inplace_update_num_locks), + inplace_callback(options.inplace_callback), + memtable_prefix_bloom_size_ratio( + options.memtable_prefix_bloom_size_ratio), + memtable_huge_page_size(options.memtable_huge_page_size), + memtable_insert_with_hint_prefix_extractor( + options.memtable_insert_with_hint_prefix_extractor), + bloom_locality(options.bloom_locality), + arena_block_size(options.arena_block_size), compression_per_level(options.compression_per_level), - bottommost_compression(options.bottommost_compression), - compression_opts(options.compression_opts), - prefix_extractor(options.prefix_extractor), num_levels(options.num_levels), - level0_file_num_compaction_trigger( - 
options.level0_file_num_compaction_trigger), level0_slowdown_writes_trigger(options.level0_slowdown_writes_trigger), level0_stop_writes_trigger(options.level0_stop_writes_trigger), target_file_size_base(options.target_file_size_base), target_file_size_multiplier(options.target_file_size_multiplier), - max_bytes_for_level_base(options.max_bytes_for_level_base), level_compaction_dynamic_level_bytes( options.level_compaction_dynamic_level_bytes), max_bytes_for_level_multiplier(options.max_bytes_for_level_multiplier), max_bytes_for_level_multiplier_additional( options.max_bytes_for_level_multiplier_additional), max_compaction_bytes(options.max_compaction_bytes), - soft_rate_limit(options.soft_rate_limit), soft_pending_compaction_bytes_limit( options.soft_pending_compaction_bytes_limit), hard_pending_compaction_bytes_limit( options.hard_pending_compaction_bytes_limit), - rate_limit_delay_max_milliseconds( - options.rate_limit_delay_max_milliseconds), - arena_block_size(options.arena_block_size), - disable_auto_compactions(options.disable_auto_compactions), - purge_redundant_kvs_while_flush(options.purge_redundant_kvs_while_flush), compaction_style(options.compaction_style), compaction_pri(options.compaction_pri), compaction_options_universal(options.compaction_options_universal), @@ -90,18 +79,8 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options) max_sequential_skip_in_iterations( options.max_sequential_skip_in_iterations), memtable_factory(options.memtable_factory), - table_factory(options.table_factory), table_properties_collector_factories( options.table_properties_collector_factories), - inplace_update_support(options.inplace_update_support), - inplace_update_num_locks(options.inplace_update_num_locks), - inplace_callback(options.inplace_callback), - memtable_prefix_bloom_size_ratio( - options.memtable_prefix_bloom_size_ratio), - memtable_huge_page_size(options.memtable_huge_page_size), - memtable_insert_with_hint_prefix_extractor( - 
options.memtable_insert_with_hint_prefix_extractor), - bloom_locality(options.bloom_locality), max_successive_merges(options.max_successive_merges), optimize_filters_for_hits(options.optimize_filters_for_hits), paranoid_file_checks(options.paranoid_file_checks), @@ -114,6 +93,28 @@ ColumnFamilyOptions::ColumnFamilyOptions(const Options& options) } } +ColumnFamilyOptions::ColumnFamilyOptions() + : compression(Snappy_Supported() ? kSnappyCompression : kNoCompression), + table_factory( + std::shared_ptr(new BlockBasedTableFactory())) {} + +ColumnFamilyOptions::ColumnFamilyOptions(const Options& options) + : AdvancedColumnFamilyOptions(options), + comparator(options.comparator), + merge_operator(options.merge_operator), + compaction_filter(options.compaction_filter), + compaction_filter_factory(options.compaction_filter_factory), + write_buffer_size(options.write_buffer_size), + compression(options.compression), + bottommost_compression(options.bottommost_compression), + compression_opts(options.compression_opts), + level0_file_num_compaction_trigger( + options.level0_file_num_compaction_trigger), + prefix_extractor(options.prefix_extractor), + max_bytes_for_level_base(options.max_bytes_for_level_base), + disable_auto_compactions(options.disable_auto_compactions), + table_factory(options.table_factory) {} + DBOptions::DBOptions() {} DBOptions::DBOptions(const Options& options) diff --git a/util/options_helper.h b/util/options_helper.h index df0eafe37..85e727877 100644 --- a/util/options_helper.h +++ b/util/options_helper.h @@ -365,6 +365,21 @@ static std::unordered_map db_options_type_info = { OptionType::kBoolean, OptionVerificationType::kNormal, true, offsetof(struct MutableDBOptions, avoid_flush_during_shutdown)}}}; +// offset_of is used to get the offset of a class data member +// ex: offset_of(&ColumnFamilyOptions::num_levels) +// This call will return the offset of num_levels in ColumnFamilyOptions class +// +// This is the same as offsetof() but allow us to work 
with non standard-layout +// classes and structures +// refs: +// http://en.cppreference.com/w/cpp/concept/StandardLayoutType +// https://gist.github.com/graphitemaster/494f21190bb2c63c5516 +template +inline int offset_of(T1 T2::*member) { + static T2 obj; + return int(size_t(&(obj.*member)) - size_t(&obj)); +} + static std::unordered_map cf_options_type_info = { /* not yet supported CompactionOptionsFIFO compaction_options_fifo; @@ -379,45 +394,44 @@ static std::unordered_map cf_options_type_info = { std::string* merged_value); */ {"report_bg_io_stats", - {offsetof(struct ColumnFamilyOptions, report_bg_io_stats), - OptionType::kBoolean, OptionVerificationType::kNormal, true, + {offset_of(&ColumnFamilyOptions::report_bg_io_stats), OptionType::kBoolean, + OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, report_bg_io_stats)}}, {"compaction_measure_io_stats", {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, false, 0}}, {"disable_auto_compactions", - {offsetof(struct ColumnFamilyOptions, disable_auto_compactions), + {offset_of(&ColumnFamilyOptions::disable_auto_compactions), OptionType::kBoolean, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, disable_auto_compactions)}}, {"filter_deletes", {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, true, 0}}, {"inplace_update_support", - {offsetof(struct ColumnFamilyOptions, inplace_update_support), + {offset_of(&ColumnFamilyOptions::inplace_update_support), OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, {"level_compaction_dynamic_level_bytes", - {offsetof(struct ColumnFamilyOptions, - level_compaction_dynamic_level_bytes), + {offset_of(&ColumnFamilyOptions::level_compaction_dynamic_level_bytes), OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, {"optimize_filters_for_hits", - {offsetof(struct ColumnFamilyOptions, optimize_filters_for_hits), + {offset_of(&ColumnFamilyOptions::optimize_filters_for_hits), 
OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, {"paranoid_file_checks", - {offsetof(struct ColumnFamilyOptions, paranoid_file_checks), + {offset_of(&ColumnFamilyOptions::paranoid_file_checks), OptionType::kBoolean, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, paranoid_file_checks)}}, {"force_consistency_checks", - {offsetof(struct ColumnFamilyOptions, force_consistency_checks), + {offset_of(&ColumnFamilyOptions::force_consistency_checks), OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, {"purge_redundant_kvs_while_flush", - {offsetof(struct ColumnFamilyOptions, purge_redundant_kvs_while_flush), + {offset_of(&ColumnFamilyOptions::purge_redundant_kvs_while_flush), OptionType::kBoolean, OptionVerificationType::kNormal, false, 0}}, {"verify_checksums_in_compaction", {0, OptionType::kBoolean, OptionVerificationType::kDeprecated, true, 0}}, {"soft_pending_compaction_bytes_limit", - {offsetof(struct ColumnFamilyOptions, soft_pending_compaction_bytes_limit), + {offset_of(&ColumnFamilyOptions::soft_pending_compaction_bytes_limit), OptionType::kUInt64T, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, soft_pending_compaction_bytes_limit)}}, {"hard_pending_compaction_bytes_limit", - {offsetof(struct ColumnFamilyOptions, hard_pending_compaction_bytes_limit), + {offset_of(&ColumnFamilyOptions::hard_pending_compaction_bytes_limit), OptionType::kUInt64T, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, hard_pending_compaction_bytes_limit)}}, {"hard_rate_limit", @@ -425,21 +439,21 @@ static std::unordered_map cf_options_type_info = { {"soft_rate_limit", {0, OptionType::kDouble, OptionVerificationType::kDeprecated, true, 0}}, {"max_compaction_bytes", - {offsetof(struct ColumnFamilyOptions, max_compaction_bytes), + {offset_of(&ColumnFamilyOptions::max_compaction_bytes), OptionType::kUInt64T, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, 
max_compaction_bytes)}}, {"expanded_compaction_factor", {0, OptionType::kInt, OptionVerificationType::kDeprecated, true, 0}}, {"level0_file_num_compaction_trigger", - {offsetof(struct ColumnFamilyOptions, level0_file_num_compaction_trigger), + {offset_of(&ColumnFamilyOptions::level0_file_num_compaction_trigger), OptionType::kInt, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, level0_file_num_compaction_trigger)}}, {"level0_slowdown_writes_trigger", - {offsetof(struct ColumnFamilyOptions, level0_slowdown_writes_trigger), + {offset_of(&ColumnFamilyOptions::level0_slowdown_writes_trigger), OptionType::kInt, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, level0_slowdown_writes_trigger)}}, {"level0_stop_writes_trigger", - {offsetof(struct ColumnFamilyOptions, level0_stop_writes_trigger), + {offset_of(&ColumnFamilyOptions::level0_stop_writes_trigger), OptionType::kInt, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, level0_stop_writes_trigger)}}, {"max_grandparent_overlap_factor", @@ -447,53 +461,53 @@ static std::unordered_map cf_options_type_info = { {"max_mem_compaction_level", {0, OptionType::kInt, OptionVerificationType::kDeprecated, false, 0}}, {"max_write_buffer_number", - {offsetof(struct ColumnFamilyOptions, max_write_buffer_number), + {offset_of(&ColumnFamilyOptions::max_write_buffer_number), OptionType::kInt, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, max_write_buffer_number)}}, {"max_write_buffer_number_to_maintain", - {offsetof(struct ColumnFamilyOptions, max_write_buffer_number_to_maintain), + {offset_of(&ColumnFamilyOptions::max_write_buffer_number_to_maintain), OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, {"min_write_buffer_number_to_merge", - {offsetof(struct ColumnFamilyOptions, min_write_buffer_number_to_merge), + {offset_of(&ColumnFamilyOptions::min_write_buffer_number_to_merge), OptionType::kInt, 
OptionVerificationType::kNormal, false, 0}}, {"num_levels", - {offsetof(struct ColumnFamilyOptions, num_levels), OptionType::kInt, + {offset_of(&ColumnFamilyOptions::num_levels), OptionType::kInt, OptionVerificationType::kNormal, false, 0}}, {"source_compaction_factor", {0, OptionType::kInt, OptionVerificationType::kDeprecated, true, 0}}, {"target_file_size_multiplier", - {offsetof(struct ColumnFamilyOptions, target_file_size_multiplier), + {offset_of(&ColumnFamilyOptions::target_file_size_multiplier), OptionType::kInt, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, target_file_size_multiplier)}}, {"arena_block_size", - {offsetof(struct ColumnFamilyOptions, arena_block_size), - OptionType::kSizeT, OptionVerificationType::kNormal, true, + {offset_of(&ColumnFamilyOptions::arena_block_size), OptionType::kSizeT, + OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, arena_block_size)}}, {"inplace_update_num_locks", - {offsetof(struct ColumnFamilyOptions, inplace_update_num_locks), + {offset_of(&ColumnFamilyOptions::inplace_update_num_locks), OptionType::kSizeT, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, inplace_update_num_locks)}}, {"max_successive_merges", - {offsetof(struct ColumnFamilyOptions, max_successive_merges), + {offset_of(&ColumnFamilyOptions::max_successive_merges), OptionType::kSizeT, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, max_successive_merges)}}, {"memtable_huge_page_size", - {offsetof(struct ColumnFamilyOptions, memtable_huge_page_size), + {offset_of(&ColumnFamilyOptions::memtable_huge_page_size), OptionType::kSizeT, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, memtable_huge_page_size)}}, {"memtable_prefix_bloom_huge_page_tlb_size", {0, OptionType::kSizeT, OptionVerificationType::kDeprecated, true, 0}}, {"write_buffer_size", - {offsetof(struct ColumnFamilyOptions, write_buffer_size), - OptionType::kSizeT, 
OptionVerificationType::kNormal, true, + {offset_of(&ColumnFamilyOptions::write_buffer_size), OptionType::kSizeT, + OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, write_buffer_size)}}, {"bloom_locality", - {offsetof(struct ColumnFamilyOptions, bloom_locality), - OptionType::kUInt32T, OptionVerificationType::kNormal, false, 0}}, + {offset_of(&ColumnFamilyOptions::bloom_locality), OptionType::kUInt32T, + OptionVerificationType::kNormal, false, 0}}, {"memtable_prefix_bloom_bits", {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, true, 0}}, {"memtable_prefix_bloom_size_ratio", - {offsetof(struct ColumnFamilyOptions, memtable_prefix_bloom_size_ratio), + {offset_of(&ColumnFamilyOptions::memtable_prefix_bloom_size_ratio), OptionType::kDouble, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, memtable_prefix_bloom_size_ratio)}}, {"memtable_prefix_bloom_probes", @@ -501,72 +515,72 @@ static std::unordered_map cf_options_type_info = { {"min_partial_merge_operands", {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, true, 0}}, {"max_bytes_for_level_base", - {offsetof(struct ColumnFamilyOptions, max_bytes_for_level_base), + {offset_of(&ColumnFamilyOptions::max_bytes_for_level_base), OptionType::kUInt64T, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, max_bytes_for_level_base)}}, {"max_bytes_for_level_multiplier", - {offsetof(struct ColumnFamilyOptions, max_bytes_for_level_multiplier), + {offset_of(&ColumnFamilyOptions::max_bytes_for_level_multiplier), OptionType::kDouble, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, max_bytes_for_level_multiplier)}}, {"max_bytes_for_level_multiplier_additional", - {offsetof(struct ColumnFamilyOptions, - max_bytes_for_level_multiplier_additional), + {offset_of( + &ColumnFamilyOptions::max_bytes_for_level_multiplier_additional), OptionType::kVectorInt, OptionVerificationType::kNormal, true, offsetof(struct 
MutableCFOptions, max_bytes_for_level_multiplier_additional)}}, {"max_sequential_skip_in_iterations", - {offsetof(struct ColumnFamilyOptions, max_sequential_skip_in_iterations), + {offset_of(&ColumnFamilyOptions::max_sequential_skip_in_iterations), OptionType::kUInt64T, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, max_sequential_skip_in_iterations)}}, {"target_file_size_base", - {offsetof(struct ColumnFamilyOptions, target_file_size_base), + {offset_of(&ColumnFamilyOptions::target_file_size_base), OptionType::kUInt64T, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, target_file_size_base)}}, {"rate_limit_delay_max_milliseconds", {0, OptionType::kUInt, OptionVerificationType::kDeprecated, false, 0}}, {"compression", - {offsetof(struct ColumnFamilyOptions, compression), + {offset_of(&ColumnFamilyOptions::compression), OptionType::kCompressionType, OptionVerificationType::kNormal, true, offsetof(struct MutableCFOptions, compression)}}, {"compression_per_level", - {offsetof(struct ColumnFamilyOptions, compression_per_level), + {offset_of(&ColumnFamilyOptions::compression_per_level), OptionType::kVectorCompressionType, OptionVerificationType::kNormal, false, 0}}, {"bottommost_compression", - {offsetof(struct ColumnFamilyOptions, bottommost_compression), + {offset_of(&ColumnFamilyOptions::bottommost_compression), OptionType::kCompressionType, OptionVerificationType::kNormal, false, 0}}, {"comparator", - {offsetof(struct ColumnFamilyOptions, comparator), OptionType::kComparator, + {offset_of(&ColumnFamilyOptions::comparator), OptionType::kComparator, OptionVerificationType::kByName, false, 0}}, {"prefix_extractor", - {offsetof(struct ColumnFamilyOptions, prefix_extractor), + {offset_of(&ColumnFamilyOptions::prefix_extractor), OptionType::kSliceTransform, OptionVerificationType::kByNameAllowNull, false, 0}}, {"memtable_insert_with_hint_prefix_extractor", - {offsetof(struct ColumnFamilyOptions, - 
memtable_insert_with_hint_prefix_extractor), + {offset_of( + &ColumnFamilyOptions::memtable_insert_with_hint_prefix_extractor), OptionType::kSliceTransform, OptionVerificationType::kByNameAllowNull, false, 0}}, {"memtable_factory", - {offsetof(struct ColumnFamilyOptions, memtable_factory), + {offset_of(&ColumnFamilyOptions::memtable_factory), OptionType::kMemTableRepFactory, OptionVerificationType::kByName, false, 0}}, {"table_factory", - {offsetof(struct ColumnFamilyOptions, table_factory), - OptionType::kTableFactory, OptionVerificationType::kByName, false, 0}}, + {offset_of(&ColumnFamilyOptions::table_factory), OptionType::kTableFactory, + OptionVerificationType::kByName, false, 0}}, {"compaction_filter", - {offsetof(struct ColumnFamilyOptions, compaction_filter), + {offset_of(&ColumnFamilyOptions::compaction_filter), OptionType::kCompactionFilter, OptionVerificationType::kByName, false, 0}}, {"compaction_filter_factory", - {offsetof(struct ColumnFamilyOptions, compaction_filter_factory), + {offset_of(&ColumnFamilyOptions::compaction_filter_factory), OptionType::kCompactionFilterFactory, OptionVerificationType::kByName, false, 0}}, {"merge_operator", - {offsetof(struct ColumnFamilyOptions, merge_operator), + {offset_of(&ColumnFamilyOptions::merge_operator), OptionType::kMergeOperator, OptionVerificationType::kByName, false, 0}}, {"compaction_style", - {offsetof(struct ColumnFamilyOptions, compaction_style), + {offset_of(&ColumnFamilyOptions::compaction_style), OptionType::kCompactionStyle, OptionVerificationType::kNormal, false, 0}}}; diff --git a/util/options_settable_test.cc b/util/options_settable_test.cc index 374a2c6a8..45fa94ef5 100644 --- a/util/options_settable_test.cc +++ b/util/options_settable_test.cc @@ -314,32 +314,31 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { // options in the blacklist need to appear in the same order as in // ColumnFamilyOptions. 
const OffsetGap kColumnFamilyOptionsBlacklist = { - {offsetof(struct ColumnFamilyOptions, comparator), sizeof(Comparator*)}, - {offsetof(struct ColumnFamilyOptions, merge_operator), - sizeof(std::shared_ptr)}, - {offsetof(struct ColumnFamilyOptions, compaction_filter), - sizeof(const CompactionFilter*)}, - {offsetof(struct ColumnFamilyOptions, compaction_filter_factory), - sizeof(std::shared_ptr)}, - {offsetof(struct ColumnFamilyOptions, compression_per_level), - sizeof(std::vector)}, - {offsetof(struct ColumnFamilyOptions, prefix_extractor), + {offset_of(&ColumnFamilyOptions::inplace_callback), + sizeof(UpdateStatus(*)(char*, uint32_t*, Slice, std::string*))}, + {offset_of( + &ColumnFamilyOptions::memtable_insert_with_hint_prefix_extractor), sizeof(std::shared_ptr)}, - {offsetof(struct ColumnFamilyOptions, - max_bytes_for_level_multiplier_additional), + {offset_of(&ColumnFamilyOptions::compression_per_level), + sizeof(std::vector)}, + {offset_of( + &ColumnFamilyOptions::max_bytes_for_level_multiplier_additional), sizeof(std::vector)}, - {offsetof(struct ColumnFamilyOptions, memtable_factory), + {offset_of(&ColumnFamilyOptions::memtable_factory), sizeof(std::shared_ptr)}, - {offsetof(struct ColumnFamilyOptions, table_factory), - sizeof(std::shared_ptr)}, - {offsetof(struct ColumnFamilyOptions, - table_properties_collector_factories), + {offset_of(&ColumnFamilyOptions::table_properties_collector_factories), sizeof(ColumnFamilyOptions::TablePropertiesCollectorFactories)}, - {offsetof(struct ColumnFamilyOptions, inplace_callback), - sizeof(UpdateStatus(*)(char*, uint32_t*, Slice, std::string*))}, - {offsetof(struct ColumnFamilyOptions, - memtable_insert_with_hint_prefix_extractor), + {offset_of(&ColumnFamilyOptions::comparator), sizeof(Comparator*)}, + {offset_of(&ColumnFamilyOptions::merge_operator), + sizeof(std::shared_ptr)}, + {offset_of(&ColumnFamilyOptions::compaction_filter), + sizeof(const CompactionFilter*)}, + 
{offset_of(&ColumnFamilyOptions::compaction_filter_factory), + sizeof(std::shared_ptr)}, + {offset_of(&ColumnFamilyOptions::prefix_extractor), sizeof(std::shared_ptr)}, + {offset_of(&ColumnFamilyOptions::table_factory), + sizeof(std::shared_ptr)}, }; char* options_ptr = new char[sizeof(ColumnFamilyOptions)];