@ -15,6 +15,8 @@
# include <map>
# include <map>
# include <memory>
# include <memory>
# include <string>
# include <unordered_map>
# include "db/dbformat.h"
# include "db/dbformat.h"
@ -41,6 +43,8 @@
namespace rocksdb {
namespace rocksdb {
extern const std : : string kHashIndexPrefixesBlock ;
extern const std : : string kHashIndexPrefixesMetadataBlock ;
namespace {
namespace {
typedef BlockBasedTableOptions : : IndexType IndexType ;
typedef BlockBasedTableOptions : : IndexType IndexType ;
@ -57,6 +61,14 @@ typedef BlockBasedTableOptions::IndexType IndexType;
// design that just works.
// design that just works.
class IndexBuilder {
class IndexBuilder {
public :
public :
// Index builder will construct a set of blocks which contain:
// 1. One primary index block.
// 2. (Optional) a set of metablocks that contains the metadata of the
// primary index.
struct IndexBlocks {
Slice index_block_contents ;
std : : unordered_map < std : : string , Slice > meta_blocks ;
} ;
explicit IndexBuilder ( const Comparator * comparator )
explicit IndexBuilder ( const Comparator * comparator )
: comparator_ ( comparator ) { }
: comparator_ ( comparator ) { }
@ -72,15 +84,19 @@ class IndexBuilder {
// the last one in the table
// the last one in the table
//
//
// REQUIRES: Finish() has not yet been called.
// REQUIRES: Finish() has not yet been called.
virtual void AddEntry ( std : : string * last_key_in_current_block ,
virtual void AddIndex Entry ( std : : string * last_key_in_current_block ,
const Slice * first_key_in_next_block ,
const Slice * first_key_in_next_block ,
const BlockHandle & block_handle ) = 0 ;
const BlockHandle & block_handle ) = 0 ;
// This method will be called whenever a key is added. The subclasses may
// override OnKeyAdded() if they need to collect additional information.
virtual void OnKeyAdded ( const Slice & key ) { }
// Inform the index builder that all entries has been written. Block builder
// Inform the index builder that all entries has been written. Block builder
// may therefore perform any operation required for block finalization.
// may therefore perform any operation required for block finalization.
//
//
// REQUIRES: Finish() has not yet been called.
// REQUIRES: Finish() has not yet been called.
virtual Slice Finish ( ) = 0 ;
virtual Status Finish ( IndexBlocks * index_blocks ) = 0 ;
// Get the estimated size for index block.
// Get the estimated size for index block.
virtual size_t EstimatedSize ( ) const = 0 ;
virtual size_t EstimatedSize ( ) const = 0 ;
@ -103,7 +119,7 @@ class ShortenedIndexBuilder : public IndexBuilder {
: IndexBuilder ( comparator ) ,
: IndexBuilder ( comparator ) ,
index_block_builder_ ( 1 /* block_restart_interval == 1 */ , comparator ) { }
index_block_builder_ ( 1 /* block_restart_interval == 1 */ , comparator ) { }
virtual void AddEntry ( std : : string * last_key_in_current_block ,
virtual void AddIndex Entry ( std : : string * last_key_in_current_block ,
const Slice * first_key_in_next_block ,
const Slice * first_key_in_next_block ,
const BlockHandle & block_handle ) override {
const BlockHandle & block_handle ) override {
if ( first_key_in_next_block ! = nullptr ) {
if ( first_key_in_next_block ! = nullptr ) {
@ -118,7 +134,10 @@ class ShortenedIndexBuilder : public IndexBuilder {
index_block_builder_ . Add ( * last_key_in_current_block , handle_encoding ) ;
index_block_builder_ . Add ( * last_key_in_current_block , handle_encoding ) ;
}
}
virtual Slice Finish ( ) override { return index_block_builder_ . Finish ( ) ; }
virtual Status Finish ( IndexBlocks * index_blocks ) {
index_blocks - > index_block_contents = index_block_builder_ . Finish ( ) ;
return Status : : OK ( ) ;
}
virtual size_t EstimatedSize ( ) const {
virtual size_t EstimatedSize ( ) const {
return index_block_builder_ . CurrentSizeEstimate ( ) ;
return index_block_builder_ . CurrentSizeEstimate ( ) ;
@ -128,38 +147,125 @@ class ShortenedIndexBuilder : public IndexBuilder {
BlockBuilder index_block_builder_ ;
BlockBuilder index_block_builder_ ;
} ;
} ;
// FullKeyIndexBuilder is also based on BlockBuilder. It works pretty much like
// HashIndexBuilder contains a binary-searchable primary index and the
// ShortenedIndexBuilder, but preserves the full key instead the substitude key.
// metadata for secondary hash index construction.
class FullKeyIndexBuilder : public IndexBuilder {
// The metadata for hash index consists two parts:
// - a metablock that compactly contains a sequence of prefixes. All prefixes
// are stored consectively without any metadata (like, prefix sizes) being
// stored, which is kept in the other metablock.
// - a metablock contains the metadata of the prefixes, including prefix size,
// restart index and number of block it spans. The format looks like:
//
// +-----------------+---------------------------+---------------------+ <=prefix 1
// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
// +-----------------+---------------------------+---------------------+ <=prefix 2
// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
// +-----------------+---------------------------+---------------------+
// | |
// | .... |
// | |
// +-----------------+---------------------------+---------------------+ <=prefix n
// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
// +-----------------+---------------------------+---------------------+
//
// The reason of separating these two metablocks is to enable the efficiently
// reuse the first metablock during hash index construction without unnecessary
// data copy or small heap allocations for prefixes.
class HashIndexBuilder : public IndexBuilder {
public :
public :
explicit FullKeyIndexBuilder ( const Comparator * comparator )
explicit HashIndexBuilder ( const Comparator * comparator ,
const SliceTransform * hash_key_extractor )
: IndexBuilder ( comparator ) ,
: IndexBuilder ( comparator ) ,
index_block_builder_ ( 1 /* block_restart_interval == 1 */ , comparator ) { }
primary_index_builder ( comparator ) ,
hash_key_extractor_ ( hash_key_extractor ) { }
virtual void AddEntry ( std : : string * last_key_in_current_block ,
virtual void AddIndex Entry ( std : : string * last_key_in_current_block ,
const Slice * first_key_in_next_block ,
const Slice * first_key_in_next_block ,
const BlockHandle & block_handle ) override {
const BlockHandle & block_handle ) override {
std : : string handle_encoding ;
+ + current_restart_index_ ;
block_handle . EncodeTo ( & handle_encoding ) ;
primary_index_builder . AddIndexEntry ( last_key_in_current_block ,
index_block_builder_ . Add ( * last_key_in_current_block , handle_encoding ) ;
first_key_in_nex t_block , block_ handle) ;
}
}
virtual Slice Finish ( ) override { return index_block_builder_ . Finish ( ) ; }
virtual void OnKeyAdded ( const Slice & key ) override {
auto key_prefix = hash_key_extractor_ - > Transform ( key ) ;
bool is_first_entry = pending_block_num_ = = 0 ;
// Keys may share the prefix
if ( is_first_entry | | pending_entry_prefix_ ! = key_prefix ) {
if ( ! is_first_entry ) {
FlushPendingPrefix ( ) ;
}
// need a hard copy otherwise the underlying data changes all the time.
// TODO(kailiu) ToString() is expensive. We may speed up can avoid data
// copy.
pending_entry_prefix_ = key_prefix . ToString ( ) ;
pending_block_num_ = 1 ;
pending_entry_index_ = current_restart_index_ ;
} else {
// entry number increments when keys share the prefix reside in
// differnt data blocks.
auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1 ;
assert ( last_restart_index < = current_restart_index_ ) ;
if ( last_restart_index ! = current_restart_index_ ) {
+ + pending_block_num_ ;
}
}
}
virtual Status Finish ( IndexBlocks * index_blocks ) {
FlushPendingPrefix ( ) ;
primary_index_builder . Finish ( index_blocks ) ;
index_blocks - > meta_blocks . insert (
{ kHashIndexPrefixesBlock . c_str ( ) , prefix_block_ } ) ;
index_blocks - > meta_blocks . insert (
{ kHashIndexPrefixesMetadataBlock . c_str ( ) , prefix_meta_block_ } ) ;
return Status : : OK ( ) ;
}
virtual size_t EstimatedSize ( ) const {
virtual size_t EstimatedSize ( ) const {
return index_block_builder_ . CurrentSizeEstimate ( ) ;
return primary_index_builder . EstimatedSize ( ) + prefix_block_ . size ( ) +
prefix_meta_block_ . size ( ) ;
}
}
private :
private :
BlockBuilder index_block_builder_ ;
void FlushPendingPrefix ( ) {
prefix_block_ . append ( pending_entry_prefix_ . data ( ) ,
pending_entry_prefix_ . size ( ) ) ;
PutVarint32 ( & prefix_meta_block_ , pending_entry_prefix_ . size ( ) ) ;
PutVarint32 ( & prefix_meta_block_ , pending_entry_index_ ) ;
PutVarint32 ( & prefix_meta_block_ , pending_block_num_ ) ;
}
ShortenedIndexBuilder primary_index_builder ;
const SliceTransform * hash_key_extractor_ ;
// stores a sequence of prefixes
std : : string prefix_block_ ;
// stores the metadata of prefixes
std : : string prefix_meta_block_ ;
// The following 3 variables keeps unflushed prefix and its metadata.
// The details of block_num and entry_index can be found in
// "block_hash_index.{h,cc}"
uint32_t pending_block_num_ = 0 ;
uint32_t pending_entry_index_ = 0 ;
std : : string pending_entry_prefix_ ;
uint64_t current_restart_index_ = 0 ;
} ;
} ;
// Create a index builder based on its type.
// Create a index builder based on its type.
IndexBuilder * CreateIndexBuilder ( IndexType type , const Comparator * comparator ) {
IndexBuilder * CreateIndexBuilder ( IndexType type , const Comparator * comparator ,
const SliceTransform * prefix_extractor ) {
switch ( type ) {
switch ( type ) {
case BlockBasedTableOptions : : kBinarySearch : {
case BlockBasedTableOptions : : kBinarySearch : {
return new ShortenedIndexBuilder ( comparator ) ;
return new ShortenedIndexBuilder ( comparator ) ;
}
}
case BlockBasedTableOptions : : kHashSearch : {
return new HashIndexBuilder ( comparator , prefix_extractor ) ;
}
default : {
default : {
assert ( ! " Do not recognize the index type " ) ;
assert ( ! " Do not recognize the index type " ) ;
return nullptr ;
return nullptr ;
@ -249,7 +355,7 @@ extern const uint64_t kLegacyBlockBasedTableMagicNumber = 0xdb4775248b80fb57ull;
class BlockBasedTableBuilder : : BlockBasedTablePropertiesCollector
class BlockBasedTableBuilder : : BlockBasedTablePropertiesCollector
: public TablePropertiesCollector {
: public TablePropertiesCollector {
public :
public :
BlockBasedTablePropertiesCollector (
explicit BlockBasedTablePropertiesCollector (
BlockBasedTableOptions : : IndexType index_type )
BlockBasedTableOptions : : IndexType index_type )
: index_type_ ( index_type ) { }
: index_type_ ( index_type ) { }
@ -288,6 +394,8 @@ struct BlockBasedTableBuilder::Rep {
uint64_t offset = 0 ;
uint64_t offset = 0 ;
Status status ;
Status status ;
BlockBuilder data_block ;
BlockBuilder data_block ;
InternalKeySliceTransform internal_prefix_transform ;
std : : unique_ptr < IndexBuilder > index_builder ;
std : : unique_ptr < IndexBuilder > index_builder ;
std : : string last_key ;
std : : string last_key ;
@ -316,8 +424,9 @@ struct BlockBasedTableBuilder::Rep {
internal_comparator ( icomparator ) ,
internal_comparator ( icomparator ) ,
file ( f ) ,
file ( f ) ,
data_block ( options , & internal_comparator ) ,
data_block ( options , & internal_comparator ) ,
index_builder (
internal_prefix_transform ( options . prefix_extractor . get ( ) ) ,
CreateIndexBuilder ( index_block_type , & internal_comparator ) ) ,
index_builder ( CreateIndexBuilder ( index_block_type , & internal_comparator ,
& this - > internal_prefix_transform ) ) ,
compression_type ( compression_type ) ,
compression_type ( compression_type ) ,
checksum_type ( checksum_type ) ,
checksum_type ( checksum_type ) ,
filter_block ( opt . filter_policy = = nullptr
filter_block ( opt . filter_policy = = nullptr
@ -335,16 +444,13 @@ struct BlockBasedTableBuilder::Rep {
}
}
} ;
} ;
// TODO(sdong): Currently only write out binary search index. In
// BlockBasedTableReader, Hash index will be built using binary search index.
BlockBasedTableBuilder : : BlockBasedTableBuilder (
BlockBasedTableBuilder : : BlockBasedTableBuilder (
const Options & options , const BlockBasedTableOptions & table_options ,
const Options & options , const BlockBasedTableOptions & table_options ,
const InternalKeyComparator & internal_comparator , WritableFile * file ,
const InternalKeyComparator & internal_comparator , WritableFile * file ,
CompressionType compression_type )
CompressionType compression_type )
: rep_ ( new Rep ( options , internal_comparator , file ,
: rep_ ( new Rep ( options , internal_comparator , file ,
table_options . flush_block_policy_factory . get ( ) ,
table_options . flush_block_policy_factory . get ( ) ,
compression_type ,
compression_type , table_options . index_type ,
BlockBasedTableOptions : : IndexType : : kBinarySearch ,
table_options . checksum ) ) {
table_options . checksum ) ) {
if ( rep_ - > filter_block ! = nullptr ) {
if ( rep_ - > filter_block ! = nullptr ) {
rep_ - > filter_block - > StartBlock ( 0 ) ;
rep_ - > filter_block - > StartBlock ( 0 ) ;
@ -370,7 +476,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
if ( r - > props . num_entries > 0 ) {
if ( r - > props . num_entries > 0 ) {
assert ( r - > internal_comparator . Compare ( key , Slice ( r - > last_key ) ) > 0 ) ;
assert ( r - > internal_comparator . Compare ( key , Slice ( r - > last_key ) ) > 0 ) ;
}
}
r - > index_builder - > OnKeyAdded ( key ) ;
auto should_flush = r - > flush_block_policy - > Update ( key , value ) ;
auto should_flush = r - > flush_block_policy - > Update ( key , value ) ;
if ( should_flush ) {
if ( should_flush ) {
assert ( ! r - > data_block . empty ( ) ) ;
assert ( ! r - > data_block . empty ( ) ) ;
@ -385,7 +491,7 @@ void BlockBasedTableBuilder::Add(const Slice& key, const Slice& value) {
// entries in the first block and < all entries in subsequent
// entries in the first block and < all entries in subsequent
// blocks.
// blocks.
if ( ok ( ) ) {
if ( ok ( ) ) {
r - > index_builder - > AddEntry ( & r - > last_key , & key , r - > pending_handle ) ;
r - > index_builder - > AddIndex Entry ( & r - > last_key , & key , r - > pending_handle ) ;
}
}
}
}
@ -561,24 +667,36 @@ Status BlockBasedTableBuilder::Finish() {
// block, we will finish writing all index entries here and flush them
// block, we will finish writing all index entries here and flush them
// to storage after metaindex block is written.
// to storage after metaindex block is written.
if ( ok ( ) & & ! empty_data_block ) {
if ( ok ( ) & & ! empty_data_block ) {
r - > index_builder - > AddEntry ( & r - > last_key , nullptr /* no next data block */ ,
r - > index_builder - > AddIndexEntry (
r - > pending_handle ) ;
& r - > last_key , nullptr /* no next data block */ , r - > pending_handle ) ;
}
IndexBuilder : : IndexBlocks index_blocks ;
auto s = r - > index_builder - > Finish ( & index_blocks ) ;
if ( ! s . ok ( ) ) {
return s ;
}
}
// Write meta blocks and metaindex block with the following order.
// Write meta blocks and metaindex block with the following order.
// 1. [meta block: filter]
// 1. [meta block: filter]
// 2. [meta block: properties]
// 2. [other meta blocks]
// 3. [metaindex block]
// 3. [meta block: properties]
if ( ok ( ) ) {
// 4. [metaindex block]
MetaIndexBuilder meta_index_builer ;
// write meta blocks
MetaIndexBuilder meta_index_builder ;
for ( const auto & item : index_blocks . meta_blocks ) {
BlockHandle block_handle ;
WriteBlock ( item . second , & block_handle ) ;
meta_index_builder . Add ( item . first , block_handle ) ;
}
// Write filter block.
if ( ok ( ) ) {
if ( r - > filter_block ! = nullptr ) {
if ( r - > filter_block ! = nullptr ) {
// Add mapping from "<filter_block_prefix>.Name" to location
// Add mapping from "<filter_block_prefix>.Name" to location
// of filter data.
// of filter data.
std : : string key = BlockBasedTable : : kFilterBlockPrefix ;
std : : string key = BlockBasedTable : : kFilterBlockPrefix ;
key . append ( r - > options . filter_policy - > Name ( ) ) ;
key . append ( r - > options . filter_policy - > Name ( ) ) ;
meta_index_builer . Add ( key , filter_block_handle ) ;
meta_index_build er . Add ( key , filter_block_handle ) ;
}
}
// Write properties block.
// Write properties block.
@ -605,20 +723,16 @@ Status BlockBasedTableBuilder::Finish() {
& properties_block_handle
& properties_block_handle
) ;
) ;
meta_index_builer . Add ( kPropertiesBlock ,
meta_index_builder . Add ( kPropertiesBlock , properties_block_handle ) ;
properties_block_handle ) ;
} // end of properties block writing
} // end of properties block writing
} // meta blocks
WriteRawBlock (
meta_index_builer . Finish ( ) ,
kNoCompression ,
& metaindex_block_handle
) ;
} // meta blocks and metaindex block.
// Write index block
// Write index block
if ( ok ( ) ) {
if ( ok ( ) ) {
WriteBlock ( r - > index_builder - > Finish ( ) , & index_block_handle ) ;
// flush the meta index block
WriteRawBlock ( meta_index_builder . Finish ( ) , kNoCompression ,
& metaindex_block_handle ) ;
WriteBlock ( index_blocks . index_block_contents , & index_block_handle ) ;
}
}
// Write footer
// Write footer
@ -685,7 +799,6 @@ uint64_t BlockBasedTableBuilder::FileSize() const {
return rep_ - > offset ;
return rep_ - > offset ;
}
}
const std : : string BlockBasedTable : : kFilterBlockPrefix =
const std : : string BlockBasedTable : : kFilterBlockPrefix = " filter. " ;
" filter. " ;
} // namespace rocksdb
} // namespace rocksdb