Organize + modernize ReadOptions (#11430)

Summary:
Roughly group the ReadOptions fields into those that apply generally and those that only apply to range scans. Also use default member initializers (the field assignment idiom) to simplify specification of default values.

Also rearrange some fields to reduce unused padding: sizeof(ReadOptions) was 144 on my system and is now 136.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11430

Test Plan: existing tests, no functional change intended

Reviewed By: hx235

Differential Revision: D45626508

Pulled By: pdillinger

fbshipit-source-id: 227d4158c5123405324f273ded2eb9d8bce86364
oxigraph-8.3.2
Peter Dillinger 2 years ago committed by Facebook GitHub Bot
parent 736b3c4909
commit e1d1c50317
  1. 289
      include/rocksdb/options.h
  2. 82
      options/options.cc

@ -1457,12 +1457,119 @@ enum ReadTier {
// Options that control read operations // Options that control read operations
struct ReadOptions { struct ReadOptions {
// *** BEGIN options relevant to point lookups as well as scans ***
// If "snapshot" is non-nullptr, read as of the supplied snapshot // If "snapshot" is non-nullptr, read as of the supplied snapshot
// (which must belong to the DB that is being read and which must // (which must belong to the DB that is being read and which must
// not have been released). If "snapshot" is nullptr, use an implicit // not have been released). If "snapshot" is nullptr, use an implicit
// snapshot of the state at the beginning of this read operation. // snapshot of the state at the beginning of this read operation.
// Default: nullptr const Snapshot* snapshot = nullptr;
const Snapshot* snapshot;
// Timestamp of operation. Read should return the latest data visible to the
// specified timestamp. All timestamps of the same database must be of the
// same length and format. The user is responsible for providing a customized
// compare function via Comparator to order <key, timestamp> tuples.
// For iterator, iter_start_ts is the lower bound (older) and timestamp
// serves as the upper bound. Versions of the same record that fall in
// the timestamp range will be returned. If iter_start_ts is nullptr,
// only the most recent version visible to timestamp is returned.
// The user-specified timestamp feature is still under active development,
// and the API is subject to change.
const Slice* timestamp = nullptr;
const Slice* iter_start_ts = nullptr;
// Deadline for completing an API call (Get/MultiGet/Seek/Next for now)
// in microseconds.
// It should be set to microseconds since epoch, i.e., gettimeofday or
// equivalent plus allowed duration in microseconds. The best way is to use
// env->NowMicros() + some timeout.
// This is best effort. The call may exceed the deadline if there is IO
// involved and the file system doesn't support deadlines, or because the
// deadline is checked periodically rather than for every key when
// processing a batch.
std::chrono::microseconds deadline = std::chrono::microseconds::zero();
// A timeout in microseconds to be passed to the underlying FileSystem for
// reads. As opposed to deadline, this determines the timeout for each
// individual file read request. If a MultiGet/Get/Seek/Next etc call
// results in multiple reads, each read can last up to io_timeout us.
std::chrono::microseconds io_timeout = std::chrono::microseconds::zero();
// Specify if this read request should process data that ALREADY
// resides on a particular cache. If the required data is not
// found at the specified cache, then Status::Incomplete is returned.
ReadTier read_tier = kReadAllTier;
// For file reads associated with this option, charge the internal rate
// limiter (see `DBOptions::rate_limiter`) at the specified priority. The
// special value `Env::IO_TOTAL` disables charging the rate limiter.
//
// The rate limiting is bypassed no matter this option's value for file reads
// on plain tables (these can exist when `ColumnFamilyOptions::table_factory`
// is a `PlainTableFactory`) and cuckoo tables (these can exist when
// `ColumnFamilyOptions::table_factory` is a `CuckooTableFactory`).
//
// The bytes charged to rate limiter may not exactly match the file read bytes
// since there are some seemingly insignificant reads, like for file
// headers/footers, that we currently do not charge to rate limiter.
Env::IOPriority rate_limiter_priority = Env::IO_TOTAL;
// It limits the maximum cumulative value size of the keys in batch while
// reading through MultiGet. Once the cumulative value size exceeds this
// soft limit then all the remaining keys are returned with status Aborted.
uint64_t value_size_soft_limit = std::numeric_limits<uint64_t>::max();
// If true, all data read from underlying storage will be
// verified against corresponding checksums.
bool verify_checksums = true;
// Should the "data block"/"index block" read for this iteration be placed in
// block cache?
// Callers may wish to set this field to false for bulk scans.
// This helps avoid changing the eviction order of existing items in the
// block cache.
bool fill_cache = true;
// If true, range tombstones handling will be skipped in key lookup paths.
// For DB instances that don't use DeleteRange() calls, this setting can
// be used to optimize the read performance.
// Note that, if this assumption (of no previous DeleteRange() calls) is
// broken, stale keys could be served in read paths.
bool ignore_range_deletions = false;
// Experimental
//
// If async_io is enabled, RocksDB will prefetch some of the data
// asynchronously. RocksDB applies it when reads are sequential and when
// using its internal automatic prefetching.
bool async_io = false;
// Experimental
//
// If async_io is set, then this flag controls whether we read SST files
// in multiple levels asynchronously. Enabling this flag can help reduce
// MultiGet latency by maximizing the number of SST files read in
// parallel if the keys in the MultiGet batch are in different levels. It
// comes at the expense of slightly higher CPU overhead.
bool optimize_multiget_for_io = true;
// *** END options relevant to point lookups (as well as scans) ***
// *** BEGIN options only relevant to iterators or scans ***
// RocksDB does auto-readahead for iterators on noticing more than two reads
// for a table file. The readahead starts at 8KB and doubles on every
// additional read up to 256KB.
// This option can help if most of the range scans are large, and if it is
// determined that a larger readahead than that enabled by auto-readahead is
// needed.
// Using a large readahead size (> 2MB) can typically improve the performance
// of forward iteration on spinning disks.
size_t readahead_size = 0;
// A threshold for the number of keys that can be skipped before failing an
// iterator seek as incomplete. The default value of 0 should be used to
// never fail a request as incomplete, even on skipping too many keys.
uint64_t max_skippable_internal_keys = 0;
// `iterate_lower_bound` defines the smallest key at which the backward // `iterate_lower_bound` defines the smallest key at which the backward
// iterator can return an entry. Once the bound is passed, Valid() will be // iterator can return an entry. Once the bound is passed, Valid() will be
@ -1475,8 +1582,7 @@ struct ReadOptions {
// //
// In case of user_defined timestamp, if enabled, iterate_lower_bound should // In case of user_defined timestamp, if enabled, iterate_lower_bound should
// point to key without timestamp part. // point to key without timestamp part.
// Default: nullptr const Slice* iterate_lower_bound = nullptr;
const Slice* iterate_lower_bound;
// "iterate_upper_bound" defines the extent up to which the forward iterator // "iterate_upper_bound" defines the extent up to which the forward iterator
// can return entries. Once the bound is reached, Valid() will be false. // can return entries. Once the bound is reached, Valid() will be false.
@ -1496,63 +1602,24 @@ struct ReadOptions {
// //
// In case of user_defined timestamp, if enabled, iterate_upper_bound should // In case of user_defined timestamp, if enabled, iterate_upper_bound should
// point to key without timestamp part. // point to key without timestamp part.
// Default: nullptr const Slice* iterate_upper_bound = nullptr;
const Slice* iterate_upper_bound;
// RocksDB does auto-readahead for iterators on noticing more than two reads
// for a table file. The readahead starts at 8KB and doubles on every
// additional read up to 256KB.
// This option can help if most of the range scans are large, and if it is
// determined that a larger readahead than that enabled by auto-readahead is
// needed.
// Using a large readahead size (> 2MB) can typically improve the performance
// of forward iteration on spinning disks.
// Default: 0
size_t readahead_size;
// A threshold for the number of keys that can be skipped before failing an
// iterator seek as incomplete. The default value of 0 should be used to
// never fail a request as incomplete, even on skipping too many keys.
// Default: 0
uint64_t max_skippable_internal_keys;
// Specify if this read request should process data that ALREADY
// resides on a particular cache. If the required data is not
// found at the specified cache, then Status::Incomplete is returned.
// Default: kReadAllTier
ReadTier read_tier;
// If true, all data read from underlying storage will be
// verified against corresponding checksums.
// Default: true
bool verify_checksums;
// Should the "data block"/"index block" read for this iteration be placed in
// block cache?
// Callers may wish to set this field to false for bulk scans.
// This would help not to the change eviction order of existing items in the
// block cache.
// Default: true
bool fill_cache;
// Specify to create a tailing iterator -- a special iterator that has a // Specify to create a tailing iterator -- a special iterator that has a
// view of the complete database (i.e. it can also be used to read newly // view of the complete database (i.e. it can also be used to read newly
// added data) and is optimized for sequential reads. It will return records // added data) and is optimized for sequential reads. It will return records
// that were inserted into the database after the creation of the iterator. // that were inserted into the database after the creation of the iterator.
// Default: false bool tailing = false;
bool tailing;
// This options is not used anymore. It was to turn on a functionality that // This options is not used anymore. It was to turn on a functionality that
// has been removed. // has been removed. DEPRECATED
bool managed; bool managed = false;
// Enable a total order seek regardless of index format (e.g. hash index) // Enable a total order seek regardless of index format (e.g. hash index)
// used in the table. Some table format (e.g. plain table) may not support // used in the table. Some table format (e.g. plain table) may not support
// this option. // this option.
// If true when calling Get(), we also skip prefix bloom when reading from // If true when calling Get(), we also skip prefix bloom when reading from
// block based table, which only affects Get() performance. // block based table, which only affects Get() performance.
// Default: false bool total_order_seek = false;
bool total_order_seek;
// When true, by default use total_order_seek = true, and RocksDB can // When true, by default use total_order_seek = true, and RocksDB can
// selectively enable prefix seek mode if won't generate a different result // selectively enable prefix seek mode if won't generate a different result
@ -1568,84 +1635,21 @@ struct ReadOptions {
// iterators. (We are also assuming the new condition on // iterators. (We are also assuming the new condition on
// IsSameLengthImmediateSuccessor is satisfied; see its BUG section). // IsSameLengthImmediateSuccessor is satisfied; see its BUG section).
// A bug example is in DBTest2::AutoPrefixMode1, search for "BUG". // A bug example is in DBTest2::AutoPrefixMode1, search for "BUG".
// Default: false bool auto_prefix_mode = false;
bool auto_prefix_mode;
// Enforce that the iterator only iterates over the same prefix as the seek. // Enforce that the iterator only iterates over the same prefix as the seek.
// This option is effective only for prefix seeks, i.e. prefix_extractor is // This option is effective only for prefix seeks, i.e. prefix_extractor is
// non-null for the column family and total_order_seek is false. Unlike // non-null for the column family and total_order_seek is false. Unlike
// iterate_upper_bound, prefix_same_as_start only works within a prefix // iterate_upper_bound, prefix_same_as_start only works within a prefix
// but in both directions. // but in both directions.
// Default: false bool prefix_same_as_start = false;
bool prefix_same_as_start;
// Keep the blocks loaded by the iterator pinned in memory as long as the // Keep the blocks loaded by the iterator pinned in memory as long as the
// iterator is not deleted, If used when reading from tables created with // iterator is not deleted, If used when reading from tables created with
// BlockBasedTableOptions::use_delta_encoding = false, // BlockBasedTableOptions::use_delta_encoding = false,
// Iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to // Iterator's property "rocksdb.iterator.is-key-pinned" is guaranteed to
// return 1. // return 1.
// Default: false bool pin_data = false;
bool pin_data;
// If true, when PurgeObsoleteFile is called in CleanupIteratorState, we
// schedule a background job in the flush job queue and delete obsolete files
// in background.
// Default: false
bool background_purge_on_iterator_cleanup;
// If true, range tombstones handling will be skipped in key lookup paths.
// For DB instances that don't use DeleteRange() calls, this setting can
// be used to optimize the read performance.
// Note that, if this assumption (of no previous DeleteRange() calls) is
// broken, stale keys could be served in read paths.
// Default: false
bool ignore_range_deletions;
// A callback to determine whether relevant keys for this scan exist in a
// given table based on the table's properties. The callback is passed the
// properties of each table during iteration. If the callback returns false,
// the table will not be scanned. This option only affects Iterators and has
// no impact on point lookups.
// Default: empty (every table will be scanned)
std::function<bool(const TableProperties&)> table_filter;
// Timestamp of operation. Read should return the latest data visible to the
// specified timestamp. All timestamps of the same database must be of the
// same length and format. The user is responsible for providing a customized
// compare function via Comparator to order <key, timestamp> tuples.
// For iterator, iter_start_ts is the lower bound (older) and timestamp
// serves as the upper bound. Versions of the same record that fall in
// the timestamp range will be returned. If iter_start_ts is nullptr,
// only the most recent version visible to timestamp is returned.
// The user-specified timestamp feature is still under active development,
// and the API is subject to change.
// Default: nullptr
const Slice* timestamp;
const Slice* iter_start_ts;
// Deadline for completing an API call (Get/MultiGet/Seek/Next for now)
// in microseconds.
// It should be set to microseconds since epoch, i.e, gettimeofday or
// equivalent plus allowed duration in microseconds. The best way is to use
// env->NowMicros() + some timeout.
// This is best efforts. The call may exceed the deadline if there is IO
// involved and the file system doesn't support deadlines, or due to
// checking for deadline periodically rather than for every key if
// processing a batch
std::chrono::microseconds deadline;
// A timeout in microseconds to be passed to the underlying FileSystem for
// reads. As opposed to deadline, this determines the timeout for each
// individual file read request. If a MultiGet/Get/Seek/Next etc call
// results in multiple reads, each read can last up to io_timeout us.
std::chrono::microseconds io_timeout;
// It limits the maximum cumulative value size of the keys in batch while
// reading through MultiGet. Once the cumulative value size exceeds this
// soft limit then all the remaining keys are returned with status Aborted.
//
// Default: std::numeric_limits<uint64_t>::max()
uint64_t value_size_soft_limit;
// For iterators, RocksDB does auto-readahead on noticing more than two // For iterators, RocksDB does auto-readahead on noticing more than two
// sequential reads for a table file if user doesn't provide readahead_size. // sequential reads for a table file if user doesn't provide readahead_size.
@ -1656,52 +1660,29 @@ struct ReadOptions {
// //
// By enabling this option, RocksDB will do some enhancements for // By enabling this option, RocksDB will do some enhancements for
// prefetching the data. // prefetching the data.
// bool adaptive_readahead = false;
// Default: false
bool adaptive_readahead;
// For file reads associated with this option, charge the internal rate // If true, when PurgeObsoleteFile is called in CleanupIteratorState, we
// limiter (see `DBOptions::rate_limiter`) at the specified priority. The // schedule a background job in the flush job queue and delete obsolete files
// special value `Env::IO_TOTAL` disables charging the rate limiter. // in background.
// bool background_purge_on_iterator_cleanup = false;
// The rate limiting is bypassed no matter this option's value for file reads
// on plain tables (these can exist when `ColumnFamilyOptions::table_factory`
// is a `PlainTableFactory`) and cuckoo tables (these can exist when
// `ColumnFamilyOptions::table_factory` is a `CuckooTableFactory`).
//
// The bytes charged to rate limiter may not exactly match the file read bytes
// since there are some seemingly insignificant reads, like for file
// headers/footers, that we currently do not charge to rate limiter.
//
// Default: `Env::IO_TOTAL`.
Env::IOPriority rate_limiter_priority = Env::IO_TOTAL;
// Experimental // A callback to determine whether relevant keys for this scan exist in a
// // given table based on the table's properties. The callback is passed the
// If async_io is enabled, RocksDB will prefetch some of data asynchronously. // properties of each table during iteration. If the callback returns false,
// RocksDB apply it if reads are sequential and its internal automatic // the table will not be scanned. This option only affects Iterators and has
// prefetching. // no impact on point lookups.
// // Default: empty (every table will be scanned)
// Default: false std::function<bool(const TableProperties&)> table_filter;
bool async_io;
// Experimental // *** END options only relevant to iterators or scans ***
//
// If async_io is set, then this flag controls whether we read SST files
// in multiple levels asynchronously. Enabling this flag can help reduce
// MultiGet latency by maximizing the number of SST files read in
// parallel if the keys in the MultiGet batch are in different levels. It
// comes at the expense of slightly higher CPU overhead.
//
// Default: true
bool optimize_multiget_for_io;
// ** For RocksDB internal use only ** // ** For RocksDB internal use only **
Env::IOActivity io_activity; Env::IOActivity io_activity = Env::IOActivity::kUnknown;
ReadOptions(); ReadOptions() {}
ReadOptions(bool cksum, bool cache); ReadOptions(bool _verify_checksums, bool _fill_cache);
explicit ReadOptions(Env::IOActivity io_activity); explicit ReadOptions(Env::IOActivity _io_activity);
}; };
// Options that control write operations // Options that control write operations

@ -682,85 +682,11 @@ DBOptions* DBOptions::IncreaseParallelism(int total_threads) {
env->SetBackgroundThreads(1, Env::HIGH); env->SetBackgroundThreads(1, Env::HIGH);
return this; return this;
} }
ReadOptions::ReadOptions()
: snapshot(nullptr), ReadOptions::ReadOptions(bool _verify_checksums, bool _fill_cache)
iterate_lower_bound(nullptr), : verify_checksums(_verify_checksums), fill_cache(_fill_cache) {}
iterate_upper_bound(nullptr),
readahead_size(0),
max_skippable_internal_keys(0),
read_tier(kReadAllTier),
verify_checksums(true),
fill_cache(true),
tailing(false),
managed(false),
total_order_seek(false),
auto_prefix_mode(false),
prefix_same_as_start(false),
pin_data(false),
background_purge_on_iterator_cleanup(false),
ignore_range_deletions(false),
timestamp(nullptr),
iter_start_ts(nullptr),
deadline(std::chrono::microseconds::zero()),
io_timeout(std::chrono::microseconds::zero()),
value_size_soft_limit(std::numeric_limits<uint64_t>::max()),
adaptive_readahead(false),
async_io(false),
optimize_multiget_for_io(true),
io_activity(Env::IOActivity::kUnknown) {}
ReadOptions::ReadOptions(bool cksum, bool cache)
: snapshot(nullptr),
iterate_lower_bound(nullptr),
iterate_upper_bound(nullptr),
readahead_size(0),
max_skippable_internal_keys(0),
read_tier(kReadAllTier),
verify_checksums(cksum),
fill_cache(cache),
tailing(false),
managed(false),
total_order_seek(false),
auto_prefix_mode(false),
prefix_same_as_start(false),
pin_data(false),
background_purge_on_iterator_cleanup(false),
ignore_range_deletions(false),
timestamp(nullptr),
iter_start_ts(nullptr),
deadline(std::chrono::microseconds::zero()),
io_timeout(std::chrono::microseconds::zero()),
value_size_soft_limit(std::numeric_limits<uint64_t>::max()),
adaptive_readahead(false),
async_io(false),
optimize_multiget_for_io(true),
io_activity(Env::IOActivity::kUnknown) {}
ReadOptions::ReadOptions(Env::IOActivity _io_activity) ReadOptions::ReadOptions(Env::IOActivity _io_activity)
: snapshot(nullptr), : io_activity(_io_activity) {}
iterate_lower_bound(nullptr),
iterate_upper_bound(nullptr),
readahead_size(0),
max_skippable_internal_keys(0),
read_tier(kReadAllTier),
verify_checksums(true),
fill_cache(true),
tailing(false),
managed(false),
total_order_seek(false),
auto_prefix_mode(false),
prefix_same_as_start(false),
pin_data(false),
background_purge_on_iterator_cleanup(false),
ignore_range_deletions(false),
timestamp(nullptr),
iter_start_ts(nullptr),
deadline(std::chrono::microseconds::zero()),
io_timeout(std::chrono::microseconds::zero()),
value_size_soft_limit(std::numeric_limits<uint64_t>::max()),
adaptive_readahead(false),
async_io(false),
optimize_multiget_for_io(true),
io_activity(_io_activity) {}
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE

Loading…
Cancel
Save