@@ -32,8 +32,7 @@
#include "util/random.h"
#include "util/sync_point.h"
#include "util/timer_queue.h"
#include "utilities/transactions/optimistic_transaction.h"
#include "utilities/transactions/optimistic_transaction_db_impl.h"
#include "utilities/blob_db/blob_db_iterator.h"

namespace {
int kBlockBasedTableVersionFormat = 2;
@@ -78,7 +77,7 @@ class BlobHandle {
  void EncodeTo(std::string* dst) const;

  Status DecodeFrom(Slice* input);
  Status DecodeFrom(const Slice& input);

  void clear();
@@ -109,10 +108,12 @@ void BlobHandle::clear() {
  compression_ = kNoCompression;
}

Status BlobHandle::DecodeFrom(Slice* input) {
  if (GetVarint64(input, &file_number_) && GetVarint64(input, &offset_) &&
      GetVarint64(input, &size_)) {
    compression_ = static_cast<CompressionType>(input->data()[0]);
Status BlobHandle::DecodeFrom(const Slice& input) {
  Slice s(input);
  Slice* p = &s;
  if (GetVarint64(p, &file_number_) && GetVarint64(p, &offset_) &&
      GetVarint64(p, &size_)) {
    compression_ = static_cast<CompressionType>(p->data()[0]);
    return Status::OK();
  } else {
    clear();
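The new const-ref overload copies the slice internally before consuming varints, so decoding no longer advances the caller's slice. A minimal usage sketch (illustrative only; `encoded` is a hypothetical std::string holding an encoded index entry):

// Sketch: the caller's slice is left intact by the const-ref overload.
Slice entry(encoded);
BlobHandle handle;
Status s = handle.DecodeFrom(entry);
assert(entry.size() == encoded.size());  // unchanged; the Slice* overload
                                         // would have consumed the varints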
@@ -149,8 +150,7 @@ void EvictAllVersionsCompactionListener::InternalListener::OnCompaction(
        value_type ==
            CompactionEventListener::CompactionListenerValueType::kValue) {
      BlobHandle handle;
      Slice lsmval(existing_value);
      Status s = handle.DecodeFrom(&lsmval);
      Status s = handle.DecodeFrom(existing_value);
      if (s.ok()) {
        if (impl_->debug_level_ >= 3)
          ROCKS_LOG_INFO(impl_->db_options_.info_log,
@@ -211,8 +211,6 @@ Status BlobDBImpl::LinkToBaseDB(DB* db) {
  env_ = db_->GetEnv();

  opt_db_.reset(new OptimisticTransactionDBImpl(db, false));

  Status s = env_->CreateDirIfMissing(blob_dir_);
  if (!s.ok()) {
    ROCKS_LOG_WARN(db_options_.info_log,
@@ -237,7 +235,6 @@ BlobDBOptions BlobDBImpl::GetBlobDBOptions() const { return bdb_options_; }
BlobDBImpl::BlobDBImpl(DB* db, const BlobDBOptions& blob_db_options)
    : BlobDB(db),
      db_impl_(static_cast_with_check<DBImpl, DB>(db)),
      opt_db_(new OptimisticTransactionDBImpl(db, false)),
      wo_set_(false),
      bdb_options_(blob_db_options),
      db_options_(db->GetOptions()),
@@ -827,8 +824,8 @@ Status BlobDBImpl::Write(const WriteOptions& opts, WriteBatch* updates) {
      extendTTL(&(bfile->ttl_range_), expiration);
    }

    return WriteBatchInternal::Put(&updates_blob_, column_family_id, key,
                                   index_entry);
    return WriteBatchInternal::PutBlobIndex(&updates_blob_, column_family_id,
                                            key, index_entry);
  }

  virtual Status DeleteCF(uint32_t column_family_id,
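WriteBatchInternal::PutBlobIndex stores the index entry under a dedicated value type rather than as a plain Put, which is what lets the read path report is_blob_index and distinguish an on-disk blob pointer from a value inlined in the LSM. A sketch of the intended write/read pairing, using only calls that appear elsewhere in this patch (illustrative, not new code in the change):

// Write side: record the blob pointer under the blob-index value type.
WriteBatch batch;
Status s = WriteBatchInternal::PutBlobIndex(&batch, column_family_id, key,
                                            index_entry);

// Read side: GetImpl reports whether we got back an index or a real value.
bool is_blob_index = false;
PinnableSlice value;
s = db_impl_->GetImpl(ReadOptions(), cfh, key, &value,
                      nullptr /*value_found*/, nullptr /*read_callback*/,
                      &is_blob_index);
// is_blob_index == true  -> value holds an encoded BlobHandle
// is_blob_index == false -> value holds a user value stored inline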
@@ -997,18 +994,6 @@ Status BlobDBImpl::PutUntil(const WriteOptions& options, const Slice& key,
  std::string headerbuf;
  Writer::ConstructBlobHeader(&headerbuf, key, value, expiration, -1);

  // this is another more safer way to do it, where you keep the writeLock
  // for the entire write path. this will increase latency and reduce
  // throughput
  // WriteLock lockbfile_w(&bfile->mutex_);
  // std::shared_ptr<Writer> writer =
  //     CheckOrCreateWriterLocked(bfile);

  if (debug_level_ >= 3)
    ROCKS_LOG_DEBUG(
        db_options_.info_log, ">Adding KEY FILE: %s: KEY: %s VALSZ: %d",
        bfile->PathName().c_str(), key.ToString().c_str(), value.size());

  std::string index_entry;
  Status s = AppendBlob(bfile, headerbuf, key, value, &index_entry);
  if (!s.ok()) {
@@ -1022,11 +1007,17 @@ Status BlobDBImpl::PutUntil(const WriteOptions& options, const Slice& key,
  }

  WriteBatch batch;
  batch.Put(key, index_entry);
  uint32_t column_family_id =
      reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->GetID();
  s = WriteBatchInternal::PutBlobIndex(&batch, column_family_id, key,
                                       index_entry);

  // this goes to the base db and can be expensive
  if (s.ok()) {
    s = db_->Write(options, &batch);
  }

  if (s.ok()) {
    // this is the sequence number of the write.
    SequenceNumber sn = WriteBatchInternal::Sequence(&batch);
    bfile->ExtendSequenceRange(sn);
@@ -1035,7 +1026,6 @@ Status BlobDBImpl::PutUntil(const WriteOptions& options, const Slice& key,
    extendTTL(&(bfile->ttl_range_), expiration);
  }

  if (s.ok()) {
    s = CloseBlobFileIfNeeded(bfile);
  }
@@ -1112,21 +1102,16 @@ std::vector<Status> BlobDBImpl::MultiGet(
  // fetch the index entry and read from the file.
  ReadOptions ro(read_options);
  bool snapshot_created = SetSnapshotIfNeeded(&ro);

  std::vector<std::string> values_lsm;
  values_lsm.resize(keys.size());
  auto statuses = db_->MultiGet(ro, keys, &values_lsm);
  TEST_SYNC_POINT("BlobDBImpl::MultiGet:AfterIndexEntryGet:1");
  TEST_SYNC_POINT("BlobDBImpl::MultiGet:AfterIndexEntryGet:2");

  values->resize(keys.size());
  assert(statuses.size() == keys.size());
  assert(values_lsm.size() == keys.size());
  for (size_t i = 0; i < keys.size(); ++i) {
    if (!statuses[i].ok()) {
      continue;
    }
    Status s = CommonGet(keys[i], values_lsm[i], &((*values)[i]));
    statuses[i] = s;
  std::vector<Status> statuses;
  statuses.reserve(keys.size());
  values->clear();
  values->reserve(keys.size());
  PinnableSlice value;
  for (size_t i = 0; i < keys.size(); i++) {
    statuses.push_back(Get(ro, DefaultColumnFamily(), keys[i], &value));
    values->push_back(value.ToString());
    value.Reset();
  }
  if (snapshot_created) {
    db_->ReleaseSnapshot(ro.snapshot);
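MultiGet is now a simple loop over Get() with a single reused PinnableSlice; Reset() drops the previous pin so the slice is safe to hand back to Get(). The reuse pattern in isolation (a minimal sketch; `consume` is a hypothetical caller-side function):

PinnableSlice value;
for (const Slice& key : keys) {
  Status s = db->Get(ReadOptions(), db->DefaultColumnFamily(), key, &value);
  if (s.ok()) {
    consume(value);  // copy out here if the data must outlive the loop
  }
  value.Reset();     // release the pin before reusing the slice
}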
@@ -1143,12 +1128,11 @@ bool BlobDBImpl::SetSnapshotIfNeeded(ReadOptions* read_options) {
  return true;
}

Status BlobDBImpl::CommonGet(const Slice& key, const std::string& index_entry,
                             std::string* value) {
Status BlobDBImpl::GetBlobValue(const Slice& key, const Slice& index_entry,
                                PinnableSlice* value) {
  assert(value != nullptr);
  Slice index_entry_slice(index_entry);
  BlobHandle handle;
  Status s = handle.DecodeFrom(&index_entry_slice);
  Status s = handle.DecodeFrom(index_entry);
  if (!s.ok()) return s;

  // offset has to have a certain minimum, as we will read the CRC
@@ -1179,9 +1163,8 @@ Status BlobDBImpl::CommonGet(const Slice& key, const std::string& index_entry,
    bfile = hitr->second;
  }

  // 0 - size
  if (!handle.size() && value != nullptr) {
    value->clear();
  if (handle.size() == 0 && value != nullptr) {
    value->PinSelf("");
    return Status::OK();
  }
@@ -1189,7 +1172,7 @@ Status BlobDBImpl::CommonGet(const Slice& key, const std::string& index_entry,
  std::shared_ptr<RandomAccessFileReader> reader =
      GetOrOpenRandomAccessReader(bfile, env_, env_options_);

  std::string* valueptr = value;
  std::string* valueptr = value->GetSelf();
  std::string value_c;
  if (bdb_options_.compression != kNoCompression) {
    valueptr = &value_c;
@@ -1251,9 +1234,11 @@ Status BlobDBImpl::CommonGet(const Slice& key, const std::string& index_entry,
        blob_value.data(), blob_value.size(), &contents,
        kBlockBasedTableVersionFormat, Slice(), bdb_options_.compression,
        *(cfh->cfd()->ioptions()));
    *value = contents.data.ToString();
    *(value->GetSelf()) = contents.data.ToString();
  }

  value->PinSelf();

  return s;
}
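GetBlobValue fills its output through the PinnableSlice self-buffer: GetSelf() exposes the slice's internal std::string, and PinSelf() makes the slice view that buffer once it is filled. The contract in miniature (a sketch using only the public PinnableSlice API):

PinnableSlice value;
std::string* buf = value.GetSelf();  // internal buffer; not yet visible
buf->assign("blob contents");        // fill it (read, decompress, etc.)
value.PinSelf();                     // value.data()/size() now view the buffer
assert(value == Slice("blob contents"));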
@@ -1271,13 +1256,16 @@ Status BlobDBImpl::Get(const ReadOptions& read_options,
  bool snapshot_created = SetSnapshotIfNeeded(&ro);

  Status s;
  std::string index_entry;
  s = db_->Get(ro, key, &index_entry);
  bool is_blob_index = false;
  s = db_impl_->GetImpl(ro, column_family, key, value, nullptr /*value_found*/,
                        nullptr /*read_callback*/, &is_blob_index);
  TEST_SYNC_POINT("BlobDBImpl::Get:AfterIndexEntryGet:1");
  TEST_SYNC_POINT("BlobDBImpl::Get:AfterIndexEntryGet:2");
  if (s.ok()) {
    s = CommonGet(key, index_entry, value->GetSelf());
    value->PinSelf();
    if (is_blob_index) {
      PinnableSlice index_entry = std::move(*value);
      s = GetBlobValue(key, index_entry, value);
    }
  }
  if (snapshot_created) {
    db_->ReleaseSnapshot(ro.snapshot);
@@ -1285,15 +1273,6 @@
  return s;
}
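When GetImpl reports a blob index, Get() moves the fetched entry out of the output slice and reuses that slice for the real value. Moving a PinnableSlice transfers its pin (or self-buffer) to the destination, so index_entry stays valid while *value is overwritten. The move property in isolation (sketch):

PinnableSlice a;
a.PinSelf("encoded-index-entry");
PinnableSlice b = std::move(a);  // b now owns the bytes; a must not be reused
assert(b == Slice("encoded-index-entry"));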
Slice BlobDBIterator::value() const {
  TEST_SYNC_POINT("BlobDBIterator::value:BeforeGetBlob:1");
  TEST_SYNC_POINT("BlobDBIterator::value:BeforeGetBlob:2");
  Slice index_entry = iter_->value();
  Status s =
      db_impl_->CommonGet(iter_->key(), index_entry.ToString(false), &vpart_);
  return Slice(vpart_);
}

std::pair<bool, int64_t> BlobDBImpl::SanityCheck(bool aborted) {
  if (aborted) return std::make_pair(false, -1);
@@ -1411,14 +1390,13 @@ bool BlobDBImpl::FindFileAndEvictABlob(uint64_t file_number, uint64_t key_size,
  return true;
}

bool BlobDBImpl::MarkBlobDeleted(const Slice& key, const Slice& lsmValue) {
  Slice val(lsmValue);
bool BlobDBImpl::MarkBlobDeleted(const Slice& key, const Slice& index_entry) {
  BlobHandle handle;
  Status s = handle.DecodeFrom(&val);
  Status s = handle.DecodeFrom(index_entry);
  if (!s.ok()) {
    ROCKS_LOG_INFO(db_options_.info_log,
                   "Could not parse lsm val in MarkBlobDeleted %s",
                   lsmValue.ToString().c_str());
                   index_entry.ToString().c_str());
    return false;
  }
  bool succ = FindFileAndEvictABlob(handle.filenumber(), key.size(),
@@ -1618,7 +1596,52 @@ std::pair<bool, int64_t> BlobDBImpl::WaStats(bool aborted) {
  return std::make_pair(true, -1);
}

////////////////////////////////////////////////////////////////////////////////
// Write callback for garbage collection to check if key has been updated
// since last read. Similar to how OptimisticTransaction works. See inline
// comment in GCFileAndUpdateLSM().
class BlobDBImpl::GarbageCollectionWriteCallback : public WriteCallback {
 public:
  GarbageCollectionWriteCallback(ColumnFamilyData* cfd, const Slice& key,
                                 SequenceNumber upper_bound)
      : cfd_(cfd), key_(key), upper_bound_(upper_bound) {}

  virtual Status Callback(DB* db) override {
    auto* db_impl = reinterpret_cast<DBImpl*>(db);
    auto* sv = db_impl->GetAndRefSuperVersion(cfd_);
    SequenceNumber latest_seq = 0;
    bool found_record_for_key = false;
    bool is_blob_index = false;
    Status s = db_impl->GetLatestSequenceForKey(
        sv, key_, false /*cache_only*/, &latest_seq, &found_record_for_key,
        &is_blob_index);
    db_impl->ReturnAndCleanupSuperVersion(cfd_, sv);
    if (!s.ok() && !s.IsNotFound()) {
      // Error.
      assert(!s.IsBusy());
      return s;
    }
    if (s.IsNotFound()) {
      assert(!found_record_for_key);
      return Status::Busy("Key deleted");
    }
    assert(found_record_for_key);
    assert(is_blob_index);
    if (latest_seq > upper_bound_) {
      return Status::Busy("Key overwritten");
    }
    return s;
  }

  virtual bool AllowWriteBatching() override { return false; }

 private:
  ColumnFamilyData* cfd_;
  // Key to check
  Slice key_;
  // Upper bound of sequence number to proceed.
  SequenceNumber upper_bound_;
};
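The callback runs inside DBImpl::WriteWithCallback() once the write reaches the front of the write queue, before anything is applied; returning Status::Busy() vetoes the write without surfacing it as an error. Schematically, the GC loop below pairs it with an ordinary WriteBatch (sketch mirroring the usage later in this patch):

GarbageCollectionWriteCallback callback(cfd, key, latest_seq);
WriteBatch batch;
Status s = batch.Delete(key);  // or WriteBatchInternal::PutBlobIndex(...)
if (s.ok()) {
  s = db_impl_->WriteWithCallback(WriteOptions(), &batch, &callback);
}
if (s.IsBusy()) {
  // The key was overwritten or deleted after latest_seq was sampled;
  // skip this blob record instead of failing the GC pass.
}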
// iterate over the blobs sequentially and check if the blob sequence number
// is the latest. If it is the latest, preserve it, otherwise delete it
// if it is TTL based, and the TTL has expired, then
@@ -1631,7 +1654,6 @@ std::pair<bool, int64_t> BlobDBImpl::WaStats(bool aborted) {
//
// if it is not TTL based, then we can blow the key if the key has been
// DELETED in the LSM
////////////////////////////////////////////////////////////////////////////////
Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
                                      GCStats* gc_stats) {
  uint64_t now = EpochNow();
@@ -1656,14 +1678,14 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
  bool first_gc = bfptr->gc_once_after_open_;

  ColumnFamilyHandle* cfh = bfptr->GetColumnFamily(db_);
  auto* cfh = bfptr->GetColumnFamily(db_);
  auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
  auto column_family_id = cfd->GetID();
  bool has_ttl = header.HasTTL();

  // this reads the key but skips the blob
  Reader::ReadLevel shallow = Reader::kReadHeaderKey;

  assert(opt_db_);

  bool no_relocation_ttl = (has_ttl && now >= bfptr->GetTTLRange().second);

  bool no_relocation_lsmdel = false;
@@ -1683,23 +1705,11 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
  BlobLogRecord record;
  std::shared_ptr<BlobFile> newfile;
  std::shared_ptr<Writer> new_writer;
  Transaction* transaction = nullptr;
  uint64_t blob_offset = 0;
  bool retry = false;

  static const WriteOptions kGarbageCollectionWriteOptions = []() {
    WriteOptions write_options;
    // It is ok to ignore column families that were dropped.
    write_options.ignore_missing_column_families = true;
    return write_options;
  }();

  while (true) {
    assert(s.ok());
    if (retry) {
      // Retry in case transaction fail with Status::TryAgain.
      retry = false;
    } else {
      // Read the next blob record.
      Status read_record_status =
          reader->ReadRecord(&record, shallow, &blob_offset);
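The block removed above built kGarbageCollectionWriteOptions with an immediately invoked lambda, a C++11 idiom for constructing a configured constant exactly once (thread-safe via magic statics). The idiom in isolation:

// Runs the lambda a single time, even with concurrent first callers.
static const WriteOptions kOpts = []() {
  WriteOptions write_options;
  write_options.ignore_missing_column_families = true;
  return write_options;
}();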
@@ -1709,33 +1719,38 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
      break;
    }
    gc_stats->blob_count++;
    }

    transaction =
        opt_db_->BeginTransaction(kGarbageCollectionWriteOptions,
                                  OptimisticTransactionOptions(), transaction);

    std::string index_entry;
    Status get_status = transaction->GetForUpdate(ReadOptions(), cfh,
                                                  record.Key(), &index_entry);
    TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:AfterGetForUpdate");
    if (get_status.IsNotFound()) {
      // Key has been deleted. Drop the blob record.
      continue;
    }
    if (!get_status.ok()) {
    // Similar to OptimisticTransaction, we obtain latest_seq from the
    // base DB, which is guaranteed to be no smaller than the sequence of
    // the current key. We use a WriteCallback to check the key sequence
    // at write time. If the key sequence is larger than latest_seq, we know
    // a new version was inserted and the old blob can be discarded.
    //
    // We cannot use OptimisticTransaction because we need to pass the
    // is_blob_index flag to GetImpl.
    SequenceNumber latest_seq = GetLatestSequenceNumber();
    bool is_blob_index = false;
    PinnableSlice index_entry;
    Status get_status = db_impl_->GetImpl(
        ReadOptions(), cfh, record.Key(), &index_entry, nullptr /*value_found*/,
        nullptr /*read_callback*/, &is_blob_index);
    TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:AfterGetFromBaseDB");
    if (!get_status.ok() && !get_status.IsNotFound()) {
      // error
      s = get_status;
      ROCKS_LOG_ERROR(db_options_.info_log,
                      "Error while getting index entry: %s",
                      s.ToString().c_str());
      break;
    }
    if (get_status.IsNotFound() || !is_blob_index) {
      // Either the key is deleted or updated with a newer version which is
      // inlined in the LSM.
      continue;
    }

    // TODO(yiwu): We should have an override of GetForUpdate returning a
    // PinnableSlice.
    Slice index_entry_slice(index_entry);
    BlobHandle handle;
    s = handle.DecodeFrom(&index_entry_slice);
    s = handle.DecodeFrom(index_entry);
    if (!s.ok()) {
      ROCKS_LOG_ERROR(db_options_.info_log,
                      "Error while decoding index entry: %s",
@@ -1748,21 +1763,24 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
      continue;
    }

    GarbageCollectionWriteCallback callback(cfd, record.Key(), latest_seq);

    // If the key has expired, remove it from the base DB.
    if (no_relocation_ttl || (has_ttl && now >= record.GetTTL())) {
      gc_stats->num_deletes++;
      gc_stats->deleted_size += record.GetBlobSize();
      TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:BeforeDelete");
      transaction->Delete(cfh, record.Key());
      Status delete_status = transaction->Commit();
      WriteBatch delete_batch;
      Status delete_status = delete_batch.Delete(record.Key());
      if (delete_status.ok()) {
        delete_status = db_impl_->WriteWithCallback(WriteOptions(),
                                                    &delete_batch, &callback);
      }
      if (delete_status.ok()) {
        gc_stats->delete_succeeded++;
      } else if (delete_status.IsBusy()) {
        // The key was overwritten in the meantime. Drop the blob record.
        gc_stats->overwritten_while_delete++;
      } else if (delete_status.IsTryAgain()) {
        // Retry the transaction.
        retry = true;
      } else {
        // We hit an error.
        s = delete_status;
@@ -1829,29 +1847,27 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
        BlobLogRecord::kHeaderSize + record.Key().size() + record.Blob().size();

    TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:BeforeRelocate");
    transaction->Put(cfh, record.Key(), new_index_entry);
    Status put_status = transaction->Commit();
    if (put_status.ok()) {
    WriteBatch rewrite_batch;
    Status rewrite_status = WriteBatchInternal::PutBlobIndex(
        &rewrite_batch, column_family_id, record.Key(), new_index_entry);
    if (rewrite_status.ok()) {
      rewrite_status = db_impl_->WriteWithCallback(WriteOptions(),
                                                   &rewrite_batch, &callback);
    }
    if (rewrite_status.ok()) {
      gc_stats->relocate_succeeded++;
    } else if (put_status.IsBusy()) {
    } else if (rewrite_status.IsBusy()) {
      // The key was overwritten in the meantime. Drop the blob record.
      gc_stats->overwritten_while_relocate++;
    } else if (put_status.IsTryAgain()) {
      // Retry the transaction.
      // TODO(yiwu): On retry, we can reuse the new blob record.
      retry = true;
    } else {
      // We hit an error.
      s = put_status;
      s = rewrite_status;
      ROCKS_LOG_ERROR(db_options_.info_log, "Error while relocating key: %s",
                      s.ToString().c_str());
      break;
    }
  }  // end of ReadRecord loop

  if (transaction != nullptr) {
    delete transaction;
  }
  ROCKS_LOG_INFO(
      db_options_.info_log,
      "%s blob file %" PRIu64
@@ -2195,12 +2211,20 @@ std::pair<bool, int64_t> BlobDBImpl::RunGC(bool aborted) {
}

Iterator* BlobDBImpl::NewIterator(const ReadOptions& read_options) {
  auto* cfd =
      reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily())->cfd();
  // Get a snapshot to avoid a blob file being deleted between the time we
  // fetch the index entry and the time we read from the file.
  ReadOptions ro(read_options);
  bool snapshot_created = SetSnapshotIfNeeded(&ro);
  return new BlobDBIterator(db_->NewIterator(ro), this, snapshot_created,
                            ro.snapshot);
  ManagedSnapshot* own_snapshot = nullptr;
  const Snapshot* snapshot = read_options.snapshot;
  if (snapshot == nullptr) {
    own_snapshot = new ManagedSnapshot(db_);
    snapshot = own_snapshot->snapshot();
  }
  auto* iter = db_impl_->NewIteratorImpl(
      read_options, cfd, snapshot->GetSequenceNumber(),
      nullptr /*read_callback*/, true /*allow_blob*/);
  return new BlobDBIterator(own_snapshot, iter, this);
}
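ManagedSnapshot gives the iterator RAII ownership of its snapshot: BlobDBIterator now receives own_snapshot and releases it on destruction, so blob files referenced by index entries cannot be evicted while the iterator lives. The ownership pattern in isolation (sketch; assumes only the public ManagedSnapshot API):

{
  ManagedSnapshot snap(db);          // acquires db->GetSnapshot()
  ReadOptions ro;
  ro.snapshot = snap.snapshot();
  // ... reads here observe a stable view of blob index entries ...
}                                    // destructor releases the snapshot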
Status DestroyBlobDB(const std::string& dbname, const Options& options,