// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). // // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #ifndef __STDC_FORMAT_MACROS #define __STDC_FORMAT_MACROS #endif #include #include #include #include #include "options/options_helper.h" #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/convenience.h" #include "rocksdb/flush_block_policy.h" #include "table/block_based/block_based_table_factory.h" #include "table/block_based/block_based_table_builder.h" #include "table/block_based/block_based_table_reader.h" #include "table/format.h" #include "util/mutexlock.h" #include "util/string_util.h" namespace rocksdb { void TailPrefetchStats::RecordEffectiveSize(size_t len) { MutexLock l(&mutex_); if (num_records_ < kNumTracked) { num_records_++; } records_[next_++] = len; if (next_ == kNumTracked) { next_ = 0; } } size_t TailPrefetchStats::GetSuggestedPrefetchSize() { std::vector sorted; { MutexLock l(&mutex_); if (num_records_ == 0) { return 0; } sorted.assign(records_, records_ + num_records_); } // Of the historic size, we find the maximum one that satisifis the condtiion // that if prefetching all, less than 1/8 will be wasted. std::sort(sorted.begin(), sorted.end()); // Assuming we have 5 data points, and after sorting it looks like this: // // +---+ // +---+ | | // | | | | // | | | | // | | | | // | | | | // +---+ | | | | // | | | | | | // +---+ | | | | | | // | | | | | | | | // +---+ | | | | | | | | // | | | | | | | | | | // | | | | | | | | | | // | | | | | | | | | | // | | | | | | | | | | // | | | | | | | | | | // +---+ +---+ +---+ +---+ +---+ // // and we use every of the value as a candidate, and estimate how much we // wasted, compared to read. For example, when we use the 3rd record // as candiate. This area is what we read: // +---+ // +---+ | | // | | | | // | | | | // | | | | // | | | | // *** *** *** ***+ *** *** *** *** ** // * | | | | | | // +---+ | | | | | * // * | | | | | | | | // +---+ | | | | | | | * // * | | | | X | | | | | // | | | | | | | | | * // * | | | | | | | | | // | | | | | | | | | * // * | | | | | | | | | // *** *** ***-*** ***--*** ***--*** +**** // which is (size of the record) X (number of records). // // While wasted is this area: // +---+ // +---+ | | // | | | | // | | | | // | | | | // | | | | // *** *** *** ****---+ | | | | // * * | | | | | // * *-*** *** | | | | | // * * | | | | | | | // *--** *** | | | | | | | // | | | | | X | | | | | // | | | | | | | | | | // | | | | | | | | | | // | | | | | | | | | | // | | | | | | | | | | // +---+ +---+ +---+ +---+ +---+ // // Which can be calculated iteratively. // The difference between wasted using 4st and 3rd record, will // be following area: // +---+ // +--+ +-+ ++ +-+ +-+ +---+ | | // + xxxxxxxxxxxxxxxxxxxxxxxx | | | | // xxxxxxxxxxxxxxxxxxxxxxxx | | | | // + xxxxxxxxxxxxxxxxxxxxxxxx | | | | // | xxxxxxxxxxxxxxxxxxxxxxxx | | | | // +-+ +-+ +-+ ++ +---+ +--+ | | | // | | | | | | | // +---+ ++ | | | | | | // | | | | | | X | | | // +---+ ++ | | | | | | | | // | | | | | | | | | | // | | | | | | | | | | // | | | | | | | | | | // | | | | | | | | | | // | | | | | | | | | | // +---+ +---+ +---+ +---+ +---+ // // which will be the size difference between 4st and 3rd record, // times 3, which is number of records before the 4st. // Here we assume that all data within the prefetch range will be useful. In // reality, it may not be the case when a partial block is inside the range, // or there are data in the middle that is not read. We ignore those cases // for simplicity. assert(!sorted.empty()); size_t prev_size = sorted[0]; size_t max_qualified_size = sorted[0]; size_t wasted = 0; for (size_t i = 1; i < sorted.size(); i++) { size_t read = sorted[i] * sorted.size(); wasted += (sorted[i] - prev_size) * i; if (wasted <= read / 8) { max_qualified_size = sorted[i]; } prev_size = sorted[i]; } const size_t kMaxPrefetchSize = 512 * 1024; // Never exceed 512KB return std::min(kMaxPrefetchSize, max_qualified_size); } BlockBasedTableFactory::BlockBasedTableFactory( const BlockBasedTableOptions& _table_options) : table_options_(_table_options) { if (table_options_.flush_block_policy_factory == nullptr) { table_options_.flush_block_policy_factory.reset( new FlushBlockBySizePolicyFactory()); } if (table_options_.no_block_cache) { table_options_.block_cache.reset(); } else if (table_options_.block_cache == nullptr) { table_options_.block_cache = NewLRUCache(8 << 20); } if (table_options_.block_size_deviation < 0 || table_options_.block_size_deviation > 100) { table_options_.block_size_deviation = 0; } if (table_options_.block_restart_interval < 1) { table_options_.block_restart_interval = 1; } if (table_options_.index_block_restart_interval < 1) { table_options_.index_block_restart_interval = 1; } if (table_options_.partition_filters && table_options_.index_type != BlockBasedTableOptions::kTwoLevelIndexSearch) { // We do not support partitioned filters without partitioning indexes table_options_.partition_filters = false; } } Status BlockBasedTableFactory::NewTableReader( const TableReaderOptions& table_reader_options, std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table_reader, bool prefetch_index_and_filter_in_cache) const { return BlockBasedTable::Open( table_reader_options.ioptions, table_reader_options.env_options, table_options_, table_reader_options.internal_comparator, std::move(file), file_size, table_reader, table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache, table_reader_options.skip_filters, table_reader_options.level, table_reader_options.immortal, table_reader_options.largest_seqno, &tail_prefetch_stats_); } TableBuilder* BlockBasedTableFactory::NewTableBuilder( const TableBuilderOptions& table_builder_options, uint32_t column_family_id, WritableFileWriter* file) const { auto table_builder = new BlockBasedTableBuilder( table_builder_options.ioptions, table_builder_options.moptions, table_options_, table_builder_options.internal_comparator, table_builder_options.int_tbl_prop_collector_factories, column_family_id, file, table_builder_options.compression_type, table_builder_options.sample_for_compression, table_builder_options.compression_opts, table_builder_options.skip_filters, table_builder_options.column_family_name, table_builder_options.creation_time, table_builder_options.oldest_key_time, table_builder_options.target_file_size, table_builder_options.file_creation_time); return table_builder; } Status BlockBasedTableFactory::SanitizeOptions( const DBOptions& db_opts, const ColumnFamilyOptions& cf_opts) const { if (table_options_.index_type == BlockBasedTableOptions::kHashSearch && cf_opts.prefix_extractor == nullptr) { return Status::InvalidArgument( "Hash index is specified for block-based " "table, but prefix_extractor is not given"); } if (table_options_.cache_index_and_filter_blocks && table_options_.no_block_cache) { return Status::InvalidArgument( "Enable cache_index_and_filter_blocks, " ", but block cache is disabled"); } if (table_options_.pin_l0_filter_and_index_blocks_in_cache && table_options_.no_block_cache) { return Status::InvalidArgument( "Enable pin_l0_filter_and_index_blocks_in_cache, " ", but block cache is disabled"); } if (!BlockBasedTableSupportedVersion(table_options_.format_version)) { return Status::InvalidArgument( "Unsupported BlockBasedTable format_version. Please check " "include/rocksdb/table.h for more info"); } if (table_options_.block_align && (cf_opts.compression != kNoCompression)) { return Status::InvalidArgument( "Enable block_align, but compression " "enabled"); } if (table_options_.block_align && (table_options_.block_size & (table_options_.block_size - 1))) { return Status::InvalidArgument( "Block alignment requested but block size is not a power of 2"); } if (table_options_.data_block_index_type == BlockBasedTableOptions::kDataBlockBinaryAndHash && table_options_.data_block_hash_table_util_ratio <= 0) { return Status::InvalidArgument( "data_block_hash_table_util_ratio should be greater than 0 when " "data_block_index_type is set to kDataBlockBinaryAndHash"); } if (db_opts.unordered_write && cf_opts.max_successive_merges > 0) { // TODO(myabandeh): support it return Status::InvalidArgument( "max_successive_merges larger than 0 is currently inconsistent with " "unordered_write"); } return Status::OK(); } std::string BlockBasedTableFactory::GetPrintableTableOptions() const { std::string ret; ret.reserve(20000); const int kBufferSize = 200; char buffer[kBufferSize]; snprintf(buffer, kBufferSize, " flush_block_policy_factory: %s (%p)\n", table_options_.flush_block_policy_factory->Name(), static_cast(table_options_.flush_block_policy_factory.get())); ret.append(buffer); snprintf(buffer, kBufferSize, " cache_index_and_filter_blocks: %d\n", table_options_.cache_index_and_filter_blocks); ret.append(buffer); snprintf(buffer, kBufferSize, " cache_index_and_filter_blocks_with_high_priority: %d\n", table_options_.cache_index_and_filter_blocks_with_high_priority); ret.append(buffer); snprintf(buffer, kBufferSize, " pin_l0_filter_and_index_blocks_in_cache: %d\n", table_options_.pin_l0_filter_and_index_blocks_in_cache); ret.append(buffer); snprintf(buffer, kBufferSize, " pin_top_level_index_and_filter: %d\n", table_options_.pin_top_level_index_and_filter); ret.append(buffer); snprintf(buffer, kBufferSize, " index_type: %d\n", table_options_.index_type); ret.append(buffer); snprintf(buffer, kBufferSize, " data_block_index_type: %d\n", table_options_.data_block_index_type); ret.append(buffer); snprintf(buffer, kBufferSize, " index_shortening: %d\n", static_cast(table_options_.index_shortening)); ret.append(buffer); snprintf(buffer, kBufferSize, " data_block_hash_table_util_ratio: %lf\n", table_options_.data_block_hash_table_util_ratio); ret.append(buffer); snprintf(buffer, kBufferSize, " hash_index_allow_collision: %d\n", table_options_.hash_index_allow_collision); ret.append(buffer); snprintf(buffer, kBufferSize, " checksum: %d\n", table_options_.checksum); ret.append(buffer); snprintf(buffer, kBufferSize, " no_block_cache: %d\n", table_options_.no_block_cache); ret.append(buffer); snprintf(buffer, kBufferSize, " block_cache: %p\n", static_cast(table_options_.block_cache.get())); ret.append(buffer); if (table_options_.block_cache) { const char* block_cache_name = table_options_.block_cache->Name(); if (block_cache_name != nullptr) { snprintf(buffer, kBufferSize, " block_cache_name: %s\n", block_cache_name); ret.append(buffer); } ret.append(" block_cache_options:\n"); ret.append(table_options_.block_cache->GetPrintableOptions()); } snprintf(buffer, kBufferSize, " block_cache_compressed: %p\n", static_cast(table_options_.block_cache_compressed.get())); ret.append(buffer); if (table_options_.block_cache_compressed) { const char* block_cache_compressed_name = table_options_.block_cache_compressed->Name(); if (block_cache_compressed_name != nullptr) { snprintf(buffer, kBufferSize, " block_cache_name: %s\n", block_cache_compressed_name); ret.append(buffer); } ret.append(" block_cache_compressed_options:\n"); ret.append(table_options_.block_cache_compressed->GetPrintableOptions()); } snprintf(buffer, kBufferSize, " persistent_cache: %p\n", static_cast(table_options_.persistent_cache.get())); ret.append(buffer); if (table_options_.persistent_cache) { snprintf(buffer, kBufferSize, " persistent_cache_options:\n"); ret.append(buffer); ret.append(table_options_.persistent_cache->GetPrintableOptions()); } snprintf(buffer, kBufferSize, " block_size: %" ROCKSDB_PRIszt "\n", table_options_.block_size); ret.append(buffer); snprintf(buffer, kBufferSize, " block_size_deviation: %d\n", table_options_.block_size_deviation); ret.append(buffer); snprintf(buffer, kBufferSize, " block_restart_interval: %d\n", table_options_.block_restart_interval); ret.append(buffer); snprintf(buffer, kBufferSize, " index_block_restart_interval: %d\n", table_options_.index_block_restart_interval); ret.append(buffer); snprintf(buffer, kBufferSize, " metadata_block_size: %" PRIu64 "\n", table_options_.metadata_block_size); ret.append(buffer); snprintf(buffer, kBufferSize, " partition_filters: %d\n", table_options_.partition_filters); ret.append(buffer); snprintf(buffer, kBufferSize, " use_delta_encoding: %d\n", table_options_.use_delta_encoding); ret.append(buffer); snprintf(buffer, kBufferSize, " filter_policy: %s\n", table_options_.filter_policy == nullptr ? "nullptr" : table_options_.filter_policy->Name()); ret.append(buffer); snprintf(buffer, kBufferSize, " whole_key_filtering: %d\n", table_options_.whole_key_filtering); ret.append(buffer); snprintf(buffer, kBufferSize, " verify_compression: %d\n", table_options_.verify_compression); ret.append(buffer); snprintf(buffer, kBufferSize, " read_amp_bytes_per_bit: %d\n", table_options_.read_amp_bytes_per_bit); ret.append(buffer); snprintf(buffer, kBufferSize, " format_version: %d\n", table_options_.format_version); ret.append(buffer); snprintf(buffer, kBufferSize, " enable_index_compression: %d\n", table_options_.enable_index_compression); ret.append(buffer); snprintf(buffer, kBufferSize, " block_align: %d\n", table_options_.block_align); ret.append(buffer); return ret; } #ifndef ROCKSDB_LITE namespace { bool SerializeSingleBlockBasedTableOption( std::string* opt_string, const BlockBasedTableOptions& bbt_options, const std::string& name, const std::string& delimiter) { auto iter = block_based_table_type_info.find(name); if (iter == block_based_table_type_info.end()) { return false; } auto& opt_info = iter->second; const char* opt_address = reinterpret_cast(&bbt_options) + opt_info.offset; std::string value; bool result = SerializeSingleOptionHelper(opt_address, opt_info.type, &value); if (result) { *opt_string = name + "=" + value + delimiter; } return result; } } // namespace Status BlockBasedTableFactory::GetOptionString( std::string* opt_string, const std::string& delimiter) const { assert(opt_string); opt_string->clear(); for (auto iter = block_based_table_type_info.begin(); iter != block_based_table_type_info.end(); ++iter) { if (iter->second.verification == OptionVerificationType::kDeprecated) { // If the option is no longer used in rocksdb and marked as deprecated, // we skip it in the serialization. continue; } std::string single_output; bool result = SerializeSingleBlockBasedTableOption( &single_output, table_options_, iter->first, delimiter); assert(result); if (result) { opt_string->append(single_output); } } return Status::OK(); } #else Status BlockBasedTableFactory::GetOptionString( std::string* /*opt_string*/, const std::string& /*delimiter*/) const { return Status::OK(); } #endif // !ROCKSDB_LITE const BlockBasedTableOptions& BlockBasedTableFactory::table_options() const { return table_options_; } #ifndef ROCKSDB_LITE namespace { std::string ParseBlockBasedTableOption(const std::string& name, const std::string& org_value, BlockBasedTableOptions* new_options, bool input_strings_escaped = false, bool ignore_unknown_options = false) { const std::string& value = input_strings_escaped ? UnescapeOptionString(org_value) : org_value; if (!input_strings_escaped) { // if the input string is not escaped, it means this function is // invoked from SetOptions, which takes the old format. if (name == "block_cache" || name == "block_cache_compressed") { // cache options can be specified in the following format // "block_cache={capacity=1M;num_shard_bits=4; // strict_capacity_limit=true;high_pri_pool_ratio=0.5;}" // To support backward compatibility, the following format // is also supported. // "block_cache=1M" std::shared_ptr cache; // block_cache is specified in format block_cache=. if (value.find('=') == std::string::npos) { cache = NewLRUCache(ParseSizeT(value)); } else { LRUCacheOptions cache_opts; if (!ParseOptionHelper(reinterpret_cast(&cache_opts), OptionType::kLRUCacheOptions, value)) { return "Invalid cache options"; } cache = NewLRUCache(cache_opts); } if (name == "block_cache") { new_options->block_cache = cache; } else { new_options->block_cache_compressed = cache; } return ""; } else if (name == "filter_policy") { // Expect the following format // bloomfilter:int:bool const std::string kName = "bloomfilter:"; if (value.compare(0, kName.size(), kName) != 0) { return "Invalid filter policy name"; } size_t pos = value.find(':', kName.size()); if (pos == std::string::npos) { return "Invalid filter policy config, missing bits_per_key"; } int bits_per_key = ParseInt(trim(value.substr(kName.size(), pos - kName.size()))); bool use_block_based_builder = ParseBoolean("use_block_based_builder", trim(value.substr(pos + 1))); new_options->filter_policy.reset( NewBloomFilterPolicy(bits_per_key, use_block_based_builder)); return ""; } } const auto iter = block_based_table_type_info.find(name); if (iter == block_based_table_type_info.end()) { if (ignore_unknown_options) { return ""; } else { return "Unrecognized option"; } } const auto& opt_info = iter->second; if (opt_info.verification != OptionVerificationType::kDeprecated && !ParseOptionHelper(reinterpret_cast(new_options) + opt_info.offset, opt_info.type, value)) { return "Invalid value"; } return ""; } } // namespace Status GetBlockBasedTableOptionsFromString( const BlockBasedTableOptions& table_options, const std::string& opts_str, BlockBasedTableOptions* new_table_options) { std::unordered_map opts_map; Status s = StringToMap(opts_str, &opts_map); if (!s.ok()) { return s; } return GetBlockBasedTableOptionsFromMap(table_options, opts_map, new_table_options); } Status GetBlockBasedTableOptionsFromMap( const BlockBasedTableOptions& table_options, const std::unordered_map& opts_map, BlockBasedTableOptions* new_table_options, bool input_strings_escaped, bool ignore_unknown_options) { assert(new_table_options); *new_table_options = table_options; for (const auto& o : opts_map) { auto error_message = ParseBlockBasedTableOption( o.first, o.second, new_table_options, input_strings_escaped, ignore_unknown_options); if (error_message != "") { const auto iter = block_based_table_type_info.find(o.first); if (iter == block_based_table_type_info.end() || !input_strings_escaped || // !input_strings_escaped indicates // the old API, where everything is // parsable. (iter->second.verification != OptionVerificationType::kByName && iter->second.verification != OptionVerificationType::kByNameAllowNull && iter->second.verification != OptionVerificationType::kByNameAllowFromNull && iter->second.verification != OptionVerificationType::kDeprecated)) { // Restore "new_options" to the default "base_options". *new_table_options = table_options; return Status::InvalidArgument("Can't parse BlockBasedTableOptions:", o.first + " " + error_message); } } } return Status::OK(); } Status VerifyBlockBasedTableFactory( const BlockBasedTableFactory* base_tf, const BlockBasedTableFactory* file_tf, OptionsSanityCheckLevel sanity_check_level) { if ((base_tf != nullptr) != (file_tf != nullptr) && sanity_check_level > kSanityLevelNone) { return Status::Corruption( "[RocksDBOptionsParser]: Inconsistent TableFactory class type"); } if (base_tf == nullptr) { return Status::OK(); } assert(file_tf != nullptr); const auto& base_opt = base_tf->table_options(); const auto& file_opt = file_tf->table_options(); for (auto& pair : block_based_table_type_info) { if (pair.second.verification == OptionVerificationType::kDeprecated) { // We skip checking deprecated variables as they might // contain random values since they might not be initialized continue; } if (BBTOptionSanityCheckLevel(pair.first) <= sanity_check_level) { if (!AreEqualOptions(reinterpret_cast(&base_opt), reinterpret_cast(&file_opt), pair.second, pair.first, nullptr)) { return Status::Corruption( "[RocksDBOptionsParser]: " "failed the verification on BlockBasedTableOptions::", pair.first); } } } return Status::OK(); } #endif // !ROCKSDB_LITE TableFactory* NewBlockBasedTableFactory( const BlockBasedTableOptions& _table_options) { return new BlockBasedTableFactory(_table_options); } const std::string BlockBasedTableFactory::kName = "BlockBasedTable"; const std::string BlockBasedTablePropertyNames::kIndexType = "rocksdb.block.based.table.index.type"; const std::string BlockBasedTablePropertyNames::kWholeKeyFiltering = "rocksdb.block.based.table.whole.key.filtering"; const std::string BlockBasedTablePropertyNames::kPrefixFiltering = "rocksdb.block.based.table.prefix.filtering"; const std::string kHashIndexPrefixesBlock = "rocksdb.hashindex.prefixes"; const std::string kHashIndexPrefixesMetadataBlock = "rocksdb.hashindex.metadata"; const std::string kPropTrue = "1"; const std::string kPropFalse = "0"; } // namespace rocksdb