@ -11,23 +11,29 @@
# include <assert.h>
# include <inttypes.h>
# include <map>
# include <stdio.h>
# include "rocksdb/flush_block_policy.h"
# include <map>
# include <memory>
# include "db/dbformat.h"
# include "rocksdb/cache.h"
# include "rocksdb/comparator.h"
# include "table/table_builder.h"
# include "rocksdb/env.h"
# include "rocksdb/filter_policy.h"
# include "rocksdb/flush_block_policy.h"
# include "rocksdb/options.h"
# include "db/dbforma t.h"
# include "table/block_based_table_reader.h"
# include "rocks db/table .h"
# include "table/block.h"
# include "table/block_based_table_reader.h"
# include "table/block_builder.h"
# include "table/filter_block.h"
# include "table/format.h"
# include "table/meta_blocks.h"
# include "table/table_builder.h"
# include "util/coding.h"
# include "util/crc32c.h"
# include "util/stop_watch.h"
@ -36,11 +42,167 @@ namespace rocksdb {
namespace {
static bool GoodCompressionRatio ( size_t compressed_size , size_t raw_size ) {
typedef BlockBasedTableOptions : : IndexType IndexType ;
// The interface for building index.
// Instruction for adding a new concrete IndexBuilder:
// 1. Create a subclass instantiated from IndexBuilder.
// 2. Add a new entry associated with that subclass in TableOptions::IndexType.
// 3. Add a create function for the new subclass in CreateIndexBuilder.
// Note: we can devise more advanced design to simplify the process for adding
// new subclass, which will, on the other hand, increase the code complexity and
// catch unwanted attention from readers. Given that we won't add/change
// indexes frequently, it makes sense to just embrace a more straightforward
// design that just works.
class IndexBuilder {
public :
explicit IndexBuilder ( const Comparator * comparator )
: comparator_ ( comparator ) { }
virtual ~ IndexBuilder ( ) { }
// Add a new index entry to index block.
// To allow further optimization, we provide `last_key_in_current_block` and
// `first_key_in_next_block`, based on which the specific implementation can
// determine the best index key to be used for the index block.
// @last_key_in_current_block: this parameter maybe overridden with the value
// "substitute key".
// @first_key_in_next_block: it will be nullptr if the entry being added is
// the last one in the table
//
// REQUIRES: Finish() has not yet been called.
virtual void AddEntry ( std : : string * last_key_in_current_block ,
const Slice * first_key_in_next_block ,
const BlockHandle & block_handle ) = 0 ;
// Inform the index builder that all entries has been written. Block builder
// may therefore perform any operation required for block finalization.
//
// REQUIRES: Finish() has not yet been called.
virtual Slice Finish ( ) = 0 ;
// Get the estimated size for index block.
virtual size_t EstimatedSize ( ) const = 0 ;
protected :
const Comparator * comparator_ ;
} ;
// This index builder builds space-efficient index block for binary-search-based
// index.
//
// Optimizations:
// 1. Made block's `block_restart_interval` to be 1, which will avoid linear
// search when doing index lookup.
// 2. Shorten the key length for index block. Other than honestly using the
// last key in the data block as the index key, we instead find a shortest
// substitute key that serves the same function.
class BinarySearchIndexBuilder : public IndexBuilder {
public :
explicit BinarySearchIndexBuilder ( const Comparator * comparator )
: IndexBuilder ( comparator ) ,
index_block_builder_ ( 1 /* block_restart_interval == 1 */ , comparator ) { }
virtual void AddEntry ( std : : string * last_key_in_current_block ,
const Slice * first_key_in_next_block ,
const BlockHandle & block_handle ) override {
if ( first_key_in_next_block ! = nullptr ) {
comparator_ - > FindShortestSeparator ( last_key_in_current_block ,
* first_key_in_next_block ) ;
} else {
comparator_ - > FindShortSuccessor ( last_key_in_current_block ) ;
}
std : : string handle_encoding ;
block_handle . EncodeTo ( & handle_encoding ) ;
index_block_builder_ . Add ( * last_key_in_current_block , handle_encoding ) ;
}
virtual Slice Finish ( ) override { return index_block_builder_ . Finish ( ) ; }
virtual size_t EstimatedSize ( ) const {
return index_block_builder_ . CurrentSizeEstimate ( ) ;
}
private :
BlockBuilder index_block_builder_ ;
} ;
// Creates an index builder of the requested type.
//
// Returns a newly allocated builder for kBinarySearch. Any other type trips
// an assertion and yields nullptr.
IndexBuilder* CreateIndexBuilder(IndexType type, const Comparator* comparator) {
  if (type == BlockBasedTableOptions::kBinarySearch) {
    return new BinarySearchIndexBuilder(comparator);
  }

  // Unknown index type.
  assert(!" Do not recognize the index type ");
  return nullptr;
}
// Returns true when the compressed form is strictly smaller than the raw
// size minus one eighth of it (i.e. compression saved more than 12.5%).
bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
  const size_t one_eighth_of_raw = raw_size / 8u;
  const size_t size_limit = raw_size - one_eighth_of_raw;
  return compressed_size < size_limit;
}
// Tries to compress `raw` with the method named by `*type`.
//
// The contents of `*compressed_output` are returned only when (1) the
// requested compression method succeeds on this platform and (2)
// GoodCompressionRatio accepts the resulting size. In every other case `raw`
// is returned as-is and `*type` is set to kNoCompression.
Slice CompressBlock(const Slice& raw,
                    const CompressionOptions& compression_options,
                    CompressionType* type, std::string* compressed_output) {
  bool compressed = false;

  switch (*type) {
    case kNoCompression:
      // Nothing to do; `*type` already describes the returned data.
      return raw;
    case kSnappyCompression:
      compressed = port::Snappy_Compress(compression_options, raw.data(),
                                         raw.size(), compressed_output);
      break;
    case kZlibCompression:
      compressed = port::Zlib_Compress(compression_options, raw.data(),
                                       raw.size(), compressed_output);
      break;
    case kBZip2Compression:
      compressed = port::BZip2_Compress(compression_options, raw.data(),
                                        raw.size(), compressed_output);
      break;
    case kLZ4Compression:
      compressed = port::LZ4_Compress(compression_options, raw.data(),
                                      raw.size(), compressed_output);
      break;
    case kLZ4HCCompression:
      compressed = port::LZ4HC_Compress(compression_options, raw.data(),
                                        raw.size(), compressed_output);
      break;
    default:
      // Do not recognize this compression type.
      break;
  }

  // The ratio is only examined after a successful compression call.
  if (compressed &&
      GoodCompressionRatio(compressed_output->size(), raw.size())) {
    return *compressed_output;
  }

  // Compression method is not supported, or the ratio is not good enough:
  // fall back to the uncompressed form.
  *type = kNoCompression;
  return raw;
}
} // anonymous namespace
// kBlockBasedTableMagicNumber was picked by running
@ -51,6 +213,46 @@ static bool GoodCompressionRatio(size_t compressed_size, size_t raw_size) {
extern const uint64_t kBlockBasedTableMagicNumber
= 0xdb4775248b80fb57ull ;
// A collector that collects properties of interest to block-based table.
// For now this class looks heavy-weight since we only write one additional
// property.
// But in the forseeable future, we will add more and more properties that are
// specific to block-based table.
class BlockBasedTableBuilder : : BlockBasedTablePropertiesCollector
: public TablePropertiesCollector {
public :
BlockBasedTablePropertiesCollector (
BlockBasedTableOptions : : IndexType index_type )
: index_type_ ( index_type ) { }
virtual Status Add ( const Slice & key , const Slice & value ) {
// Intentionally left blank. Have no interest in collecting stats for
// individual key/value pairs.
return Status : : OK ( ) ;
}
virtual Status Finish ( UserCollectedProperties * properties ) {
std : : string val ;
PutFixed32 ( & val , static_cast < uint32_t > ( index_type_ ) ) ;
properties - > insert ( { BlockBasedTablePropertyNames : : kIndexType , val } ) ;
return Status : : OK ( ) ;
}
// The name of the properties collector can be used for debugging purpose.
virtual const char * Name ( ) const {
return " BlockBasedTablePropertiesCollector " ;
}
virtual UserCollectedProperties GetReadableProperties ( ) const {
// Intentionally left blank.
return UserCollectedProperties ( ) ;
}
private :
BlockBasedTableOptions : : IndexType index_type_ ;
} ;
struct BlockBasedTableBuilder : : Rep {
Options options ;
const InternalKeyComparator & internal_comparator ;
@ -58,7 +260,8 @@ struct BlockBasedTableBuilder::Rep {
uint64_t offset = 0 ;
Status status ;
BlockBuilder data_block ;
BlockBuilder index_block ;
std : : unique_ptr < IndexBuilder > index_builder ;
std : : string last_key ;
CompressionType compression_type ;
TableProperties props ;
@ -75,28 +278,31 @@ struct BlockBasedTableBuilder::Rep {
Rep ( const Options & opt , const InternalKeyComparator & icomparator ,
WritableFile * f , FlushBlockPolicyFactory * flush_block_policy_factory ,
CompressionType compression_type )
CompressionType compression_type , IndexType index_block_type )
: options ( opt ) ,
internal_comparator ( icomparator ) ,
file ( f ) ,
data_block ( options , & internal_comparator ) ,
// To avoid linear scan, we make the block_restart_interval to be `1`
// in index block builder
index_block ( 1 /* block_restart_interval */ , & internal_comparator ) ,
index_builder (
CreateIndexBuilder ( index_block_type , & internal_comparator ) ) ,
compression_type ( compression_type ) ,
filter_block ( opt . filter_policy = = nullptr
? nullptr
: new FilterBlockBuilder ( opt , & internal_comparator ) ) ,
flush_block_policy ( flush_block_policy_factory - > NewFlushBlockPolicy (
options , data_block ) ) { }
options , data_block ) ) {
options . table_properties_collectors . push_back (
std : : make_shared < BlockBasedTablePropertiesCollector > ( index_block_type ) ) ;
}
} ;
BlockBasedTableBuilder : : BlockBasedTableBuilder (
const Options & options , const InternalKeyComparator & internal_comparator ,
WritableFile * file , FlushBlockPolicyFactory * flush_block_policy_factory ,
const Options & options , const BlockBasedTableOptions & table_options ,
const InternalKeyComparator & internal_comparator , WritableFile * file ,
CompressionType compression_type )
: rep_ ( new Rep ( options , internal_comparator , file ,
flush_block_policy_factory , compression_type ) ) {
table_options . flush_block_policy_factory . get ( ) ,
compression_type , table_options . index_type ) ) {
if ( rep_ - > filter_block ! = nullptr ) {
rep_ - > filter_block - > StartBlock ( 0 ) ;
}
@ -136,10 +342,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
// entries in the first block and < all entries in subsequent
// blocks.
if ( ok ( ) ) {
r - > internal_comparator . FindShortestSeparator ( & r - > last_key , key ) ;
std : : string handle_encoding ;
r - > pending_handle . EncodeTo ( & handle_encoding ) ;
r - > index_block . Add ( r - > last_key , Slice ( handle_encoding ) ) ;
r - > index_builder - > AddEntry ( & r - > last_key , & key , r - > pending_handle ) ;
}
}
@ -179,88 +382,25 @@ void BlockBasedTableBuilder::Flush() {
void BlockBasedTableBuilder : : WriteBlock ( BlockBuilder * block ,
BlockHandle * handle ) {
WriteBlock ( block - > Finish ( ) , handle ) ;
block - > Reset ( ) ;
}
void BlockBasedTableBuilder : : WriteBlock ( const Slice & raw_block_contents ,
BlockHandle * handle ) {
// File format contains a sequence of blocks where each block has:
// block_data: uint8[n]
// type: uint8
// crc: uint32
assert ( ok ( ) ) ;
Rep * r = rep_ ;
Slice raw = block - > Finish ( ) ;
Slice block_contents ;
std : : string * compressed = & r - > compressed_output ;
CompressionType type = r - > compression_type ;
switch ( type ) {
case kNoCompression :
block_contents = raw ;
break ;
case kSnappyCompression : {
std : : string * compressed = & r - > compressed_output ;
if ( port : : Snappy_Compress ( r - > options . compression_opts , raw . data ( ) ,
raw . size ( ) , compressed ) & &
GoodCompressionRatio ( compressed - > size ( ) , raw . size ( ) ) ) {
block_contents = * compressed ;
} else {
// Snappy not supported, or not good compression ratio, so just
// store uncompressed form
block_contents = raw ;
type = kNoCompression ;
}
break ;
}
case kZlibCompression :
if ( port : : Zlib_Compress ( r - > options . compression_opts , raw . data ( ) ,
raw . size ( ) , compressed ) & &
GoodCompressionRatio ( compressed - > size ( ) , raw . size ( ) ) ) {
block_contents = * compressed ;
} else {
// Zlib not supported, or not good compression ratio, so just
// store uncompressed form
block_contents = raw ;
type = kNoCompression ;
}
break ;
case kBZip2Compression :
if ( port : : BZip2_Compress ( r - > options . compression_opts , raw . data ( ) ,
raw . size ( ) , compressed ) & &
GoodCompressionRatio ( compressed - > size ( ) , raw . size ( ) ) ) {
block_contents = * compressed ;
} else {
// BZip not supported, or not good compression ratio, so just
// store uncompressed form
block_contents = raw ;
type = kNoCompression ;
}
break ;
case kLZ4Compression :
if ( port : : LZ4_Compress ( r - > options . compression_opts , raw . data ( ) ,
raw . size ( ) , compressed ) & &
GoodCompressionRatio ( compressed - > size ( ) , raw . size ( ) ) ) {
block_contents = * compressed ;
} else {
// LZ4 not supported, or not good compression ratio, so just
// store uncompressed form
block_contents = raw ;
type = kNoCompression ;
}
break ;
case kLZ4HCCompression :
if ( port : : LZ4HC_Compress ( r - > options . compression_opts , raw . data ( ) ,
raw . size ( ) , compressed ) & &
GoodCompressionRatio ( compressed - > size ( ) , raw . size ( ) ) ) {
block_contents = * compressed ;
} else {
// LZ4 not supported, or not good compression ratio, so just
// store uncompressed form
block_contents = raw ;
type = kNoCompression ;
}
break ;
}
auto type = r - > compression_type ;
auto block_contents =
CompressBlock ( raw_block_contents , r - > options . compression_opts , & type ,
& r - > compressed_output ) ;
WriteRawBlock ( block_contents , type , handle ) ;
r - > compressed_output . clear ( ) ;
block - > Reset ( ) ;
}
void BlockBasedTableBuilder : : WriteRawBlock ( const Slice & block_contents ,
@ -364,11 +504,8 @@ Status BlockBasedTableBuilder::Finish() {
// block, we will finish writing all index entries here and flush them
// to storage after metaindex block is written.
if ( ok ( ) & & ! empty_data_block ) {
r - > internal_comparator . FindShortSuccessor ( & r - > last_key ) ;
std : : string handle_encoding ;
r - > pending_handle . EncodeTo ( & handle_encoding ) ;
r - > index_block . Add ( r - > last_key , handle_encoding ) ;
r - > index_builder - > AddEntry ( & r - > last_key , nullptr /* no next data block */ ,
r - > pending_handle ) ;
}
// Write meta blocks and metaindex block with the following order.
@ -394,11 +531,12 @@ Status BlockBasedTableBuilder::Finish() {
r - > props . filter_policy_name = r - > options . filter_policy ! = nullptr ?
r - > options . filter_policy - > Name ( ) : " " ;
r - > props . index_size =
r - > index_block . CurrentSizeEstimat e ( ) + kBlockTrailerSize ;
r - > index_builder - > EstimatedSiz e ( ) + kBlockTrailerSize ;
// Add basic properties
property_block_builder . AddTableProperty ( r - > props ) ;
// Add user-collected properties
NotifyCollectTableCollectorsOnFinish (
r - > options . table_properties_collectors ,
r - > options . info_log . get ( ) ,
@ -425,7 +563,7 @@ Status BlockBasedTableBuilder::Finish() {
// Write index block
if ( ok ( ) ) {
WriteBlock ( & r - > index_block , & index_block_handle ) ;
WriteBlock ( r - > index_builder - > Finish ( ) , & index_block_handle ) ;
}
// Write footer