@@ -30,6 +30,7 @@
 #include "util/logging.h"
 #include "util/mutexlock.h"
 #include "util/random.h"
+#include "util/sync_point.h"
 #include "util/timer_queue.h"
 #include "utilities/transactions/optimistic_transaction_db_impl.h"
 #include "utilities/transactions/optimistic_transaction.h"
@@ -951,6 +952,7 @@ Slice BlobDBImpl::GetCompressedSlice(const Slice& raw,
 Status BlobDBImpl::PutUntil(const WriteOptions& options,
                             ColumnFamilyHandle* column_family, const Slice& key,
                             const Slice& value_unc, uint64_t expiration) {
+  TEST_SYNC_POINT("BlobDBImpl::PutUntil:Start");
   MutexLock l(&write_mutex_);
   UpdateWriteOptions(options);
@@ -1022,6 +1024,7 @@ Status BlobDBImpl::PutUntil(const WriteOptions& options,
   CloseIf(bfile);
+  TEST_SYNC_POINT("BlobDBImpl::PutUntil:Finish");
   return s;
 }
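
Note: the TEST_SYNC_POINT markers added here compile to no-ops in release
builds; they only fire in debug builds where the util/sync_point.h harness is
active. A minimal sketch of how a test could hook the two new points, assuming
the standard rocksdb::SyncPoint test API (the callback bodies are placeholders):

    // Sketch only: wiring test callbacks to the sync points added above.
    #include "util/sync_point.h"

    void HookPutUntilForTest() {
      rocksdb::SyncPoint::GetInstance()->SetCallBack(
          "BlobDBImpl::PutUntil:Start",
          [](void* /*arg*/) { /* e.g. unblock a concurrent GC thread */ });
      rocksdb::SyncPoint::GetInstance()->SetCallBack(
          "BlobDBImpl::PutUntil:Finish",
          [](void* /*arg*/) { /* e.g. assert on post-write state */ });
      rocksdb::SyncPoint::GetInstance()->EnableProcessing();

      // ... run the PutUntil workload under test ...

      rocksdb::SyncPoint::GetInstance()->DisableProcessing();
      rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
    }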
@@ -1655,8 +1658,8 @@ std::pair<bool, int64_t> BlobDBImpl::WaStats(bool aborted) {
 // DELETED in the LSM
 ////////////////////////////////////////////////////////////////////////////////
 Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
-                                      GCStats* gcstats) {
-  uint64_t tt = EpochNow();
+                                      GCStats* gc_stats) {
+  uint64_t now = EpochNow();
   std::shared_ptr<Reader> reader =
       bfptr->OpenSequentialReader(env_, db_options_, env_options_);
@@ -1679,8 +1682,6 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
   bool first_gc = bfptr->gc_once_after_open_;
   ColumnFamilyHandle* cfh = bfptr->GetColumnFamily(db_);
-  auto cfhi = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh);
-  auto cfd = cfhi->cfd();
   bool has_ttl = header.HasTTL();
   // this reads the key but skips the blob
@@ -1688,7 +1689,7 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
   assert(opt_db_);
-  bool no_relocation_ttl = (has_ttl && tt > bfptr->GetTTLRange().second);
+  bool no_relocation_ttl = (has_ttl && now >= bfptr->GetTTLRange().second);
   bool no_relocation_lsmdel = false;
   {
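
Besides the rename, the comparison changes from strict (`tt > ...`) to
inclusive (`now >= ...`): a record whose expiration equals the current epoch
second is now treated as expired. A minimal illustration of the boundary (the
helper name is hypothetical, for illustration only):

    // Hypothetical helper showing the new inclusive expiration check.
    bool IsExpired(uint64_t expiration, uint64_t now) {
      return now >= expiration;  // expiration == now counts as expired
    }
    // IsExpired(100, 100) == true; the old "now > expiration" returned false.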
@@ -1707,64 +1708,120 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
   BlobLogRecord record;
   std::shared_ptr<BlobFile> newfile;
   std::shared_ptr<Writer> new_writer;
+  Transaction* transaction = nullptr;
+  uint64_t blob_offset = 0;
+  bool retry = false;
+
+  static const WriteOptions kGarbageCollectionWriteOptions = []() {
+    WriteOptions write_options;
+    // TODO(yiwu): Disable WAL for garbage collection to make it compatible
+    // with use cases that don't use WAL. However, without WAL there are at
+    // least two issues with crash:
+    // 1. If a key is dropped from the blob file (e.g. due to TTL) right
+    //    before a crash, the key may still be present in the LSM after
+    //    restart.
+    // 2. If a key is relocated to another blob file right before a crash,
+    //    after restart the new offset may be lost, with the old offset
+    //    pointing to the removed blob file.
+    // We need to have a better recovery mechanism to address these issues.
+    write_options.disableWAL = true;
+    // It is ok to ignore column families that were dropped.
+    write_options.ignore_missing_column_families = true;
+    return write_options;
+  }();
 
-  while (reader->ReadRecord(&record, shallow).ok()) {
-    gcstats->blob_count++;
-
-    bool del_this = false;
-    bool reloc_this = false;
-
-    // TODO(yiwu): The following logic should use GetForUpdate() from
-    // optimistic transaction to check if the key is current, otherwise
-    // there can be another writer sneak in between sequence number of
-    // and the deletion.
+  while (true) {
+    assert(s.ok());
+    if (retry) {
+      // Retry in case transaction fail with Status::TryAgain.
+      retry = false;
+    } else {
+      // Read the next blob record.
+      Status read_record_status =
+          reader->ReadRecord(&record, shallow, &blob_offset);
+      // Exit if we reach the end of blob file.
+      // TODO(yiwu): properly handle ReadRecord error.
+      if (!read_record_status.ok()) {
+        break;
+      }
+      gc_stats->blob_count++;
+    }
 
-    // this particular TTL has expired
-    if (no_relocation_ttl || (has_ttl && tt > record.GetTTL())) {
-      del_this = true;
-    } else if (!first_gc) {
-      SequenceNumber seq = kMaxSequenceNumber;
-      bool found_record_for_key = false;
-      SuperVersion* sv = db_impl_->GetAndRefSuperVersion(cfd);
-      if (sv == nullptr) {
-        Status result =
-            Status::InvalidArgument("Could not access column family 0");
-        return result;
-      }
-      Status s1 = db_impl_->GetLatestSequenceForKey(
-          sv, record.Key(), false, &seq, &found_record_for_key);
-      if (found_record_for_key && seq == record.GetSN()) {
-        reloc_this = true;
-      }
-      db_impl_->ReturnAndCleanupSuperVersion(cfd, sv);
-    }
+    transaction =
+        opt_db_->BeginTransaction(kGarbageCollectionWriteOptions,
                                  OptimisticTransactionOptions(), transaction);
+
+    std::string index_entry;
+    Status get_status = transaction->GetForUpdate(ReadOptions(), cfh,
+                                                  record.Key(), &index_entry);
+    TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:AfterGetForUpdate");
+    if (get_status.IsNotFound()) {
+      // Key has been deleted. Drop the blob record.
+      continue;
+    }
+    if (!get_status.ok()) {
+      s = get_status;
+      ROCKS_LOG_ERROR(db_options_.info_log,
+                      "Error while getting index entry: %s",
+                      s.ToString().c_str());
+      break;
+    }
+
+    // TODO(yiwu): We should have an override of GetForUpdate returning a
+    // PinnableSlice.
+    Slice index_entry_slice(index_entry);
+    BlobHandle handle;
+    s = handle.DecodeFrom(&index_entry_slice);
+    if (!s.ok()) {
+      ROCKS_LOG_ERROR(db_options_.info_log,
+                      "Error while decoding index entry: %s",
+                      s.ToString().c_str());
+      break;
+    }
+    if (handle.filenumber() != bfptr->BlobFileNumber() ||
+        handle.offset() != blob_offset) {
+      // Key has been overwritten. Drop the blob record.
+      continue;
+    }
 
-    if (del_this) {
-      gcstats->num_deletes++;
-      gcstats->deleted_size += record.GetBlobSize();
-      if (first_gc) continue;
-
-      Transaction* txn = opt_db_->BeginTransaction(
-          write_options_, OptimisticTransactionOptions(), nullptr);
-      txn->Delete(cfh, record.Key());
-      Status s1 = txn->Commit();
-      // chances that this DELETE will fail is low. If it fails, it would be
-      // because a new version of the key came in at this time, which will
-      // override the current version being iterated on.
-      if (!s1.IsBusy()) {
-        // assume that failures happen due to new writes.
-        gcstats->overrided_while_delete++;
-      }
-      delete txn;
-    }
+    // If key has expired, remove it from base DB.
+    if (no_relocation_ttl || (has_ttl && now >= record.GetTTL())) {
+      gc_stats->num_deletes++;
+      gc_stats->deleted_size += record.GetBlobSize();
+      TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:BeforeDelete");
+      transaction->Delete(cfh, record.Key());
+      Status delete_status = transaction->Commit();
+      if (delete_status.ok()) {
+        gc_stats->delete_succeeded++;
+      } else if (delete_status.IsBusy()) {
+        // The key is overwritten in the meanwhile. Drop the blob record.
+        gc_stats->overwritten_while_delete++;
+      } else if (delete_status.IsTryAgain()) {
+        // Retry the transaction.
+        retry = true;
+      } else {
+        // We hit an error.
+        s = delete_status;
+        ROCKS_LOG_ERROR(db_options_.info_log,
+                        "Error while deleting expired key: %s",
+                        s.ToString().c_str());
+        break;
+      }
+      // Continue to next blob record or retry.
+      continue;
+    }
 
-    if (reloc_this) {
+    if (first_gc) {
+      // Do not relocate blob record for initial GC.
+      continue;
+    }
+
+    // Relocate the blob record to new file.
     if (!newfile) {
       // new file
       std::string reason("GC of ");
       reason += bfptr->PathName();
       newfile = NewBlobFile(reason);
-      gcstats->newfile = newfile;
+      gc_stats->newfile = newfile;
 
       new_writer = CheckOrCreateWriterLocked(newfile);
       newfile->header_ = std::move(header);
@@ -1777,7 +1834,7 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
         ROCKS_LOG_ERROR(db_options_.info_log,
                         "File: %s - header writing failed",
                         newfile->PathName().c_str());
-        return s;
+        break;
       }
 
       WriteLock wl(&mutex_);
@@ -1786,57 +1843,64 @@ Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
       blob_files_.insert(std::make_pair(newfile->BlobFileNumber(), newfile));
     }
 
-    gcstats->num_relocs++;
-    std::string index_entry;
-    uint64_t blob_offset = 0;
-    uint64_t key_offset = 0;
+    gc_stats->num_relocate++;
+    std::string new_index_entry;
+    uint64_t new_blob_offset = 0;
+    uint64_t new_key_offset = 0;
     // write the blob to the blob log.
-    s = new_writer->AddRecord(record.Key(), record.Blob(), &key_offset,
-                              &blob_offset, record.GetTTL());
+    s = new_writer->AddRecord(record.Key(), record.Blob(), &new_key_offset,
+                              &new_blob_offset, record.GetTTL());
 
-    BlobHandle handle;
-    handle.set_filenumber(newfile->BlobFileNumber());
-    handle.set_size(record.Blob().size());
-    handle.set_offset(blob_offset);
-    handle.set_compression(bdb_options_.compression);
-    handle.EncodeTo(&index_entry);
+    BlobHandle new_handle;
+    new_handle.set_filenumber(newfile->BlobFileNumber());
+    new_handle.set_size(record.Blob().size());
+    new_handle.set_offset(new_blob_offset);
+    new_handle.set_compression(bdb_options_.compression);
+    new_handle.EncodeTo(&new_index_entry);
 
     new_writer->AddRecordFooter(record.GetSN());
     newfile->blob_count_++;
     newfile->file_size_ += BlobLogRecord::kHeaderSize + record.Key().size() +
                            record.Blob().size() + BlobLogRecord::kFooterSize;
 
-    Transaction* txn = opt_db_->BeginTransaction(
-        write_options_, OptimisticTransactionOptions(), nullptr);
-    txn->Put(cfh, record.Key(), index_entry);
-    Status s1 = txn->Commit();
-    // chances that this Put will fail is low. If it fails, it would be
-    // because a new version of the key came in at this time, which will
-    // override the current version being iterated on.
-    if (s1.IsBusy()) {
-      ROCKS_LOG_INFO(db_options_.info_log,
-                     "Optimistic transaction failed: %s put bn: %" PRIu32,
-                     bfptr->PathName().c_str(), gcstats->blob_count);
+    TEST_SYNC_POINT("BlobDBImpl::GCFileAndUpdateLSM:BeforeRelocate");
+    transaction->Put(cfh, record.Key(), new_index_entry);
+    Status put_status = transaction->Commit();
+    if (put_status.ok()) {
+      gc_stats->relocate_succeeded++;
+    } else if (put_status.IsBusy()) {
+      // The key is overwritten in the meanwhile. Drop the blob record.
+      gc_stats->overwritten_while_relocate++;
+    } else if (put_status.IsTryAgain()) {
+      // Retry the transaction.
+      // TODO(yiwu): On retry, we can reuse the new blob record.
+      retry = true;
     } else {
-      gcstats->succ_relocs++;
-      ROCKS_LOG_DEBUG(db_options_.info_log,
-                      "Successfully added put back into LSM: %s bn: %" PRIu32,
-                      bfptr->PathName().c_str(), gcstats->blob_count);
+      // We hit an error.
+      s = put_status;
+      ROCKS_LOG_ERROR(db_options_.info_log, "Error while relocating key: %s",
+                      s.ToString().c_str());
+      break;
     }
-    delete txn;
-    }
   }  // end of ReadRecord loop
 
-  if (gcstats->newfile) total_blob_space_ += newfile->file_size_;
-
-  ROCKS_LOG_INFO(db_options_.info_log,
-                 "File: %s Num deletes %" PRIu32 " Num relocs: %" PRIu32
-                 " Succ Deletes: %" PRIu32 " Succ relocs: %" PRIu32,
-                 bfptr->PathName().c_str(), gcstats->num_deletes,
-                 gcstats->num_relocs, gcstats->succ_deletes_lsm,
-                 gcstats->succ_relocs);
+  if (transaction != nullptr) {
+    delete transaction;
+  }
+  ROCKS_LOG_INFO(
+      db_options_.info_log,
+      "%s blob file %" PRIu64 ". Total blob records: %" PRIu64
+      ", Deletes: %" PRIu64 "/%" PRIu64
+      " succeeded, Relocates: %" PRIu64 "/%" PRIu64 " succeeded.",
+      s.ok() ? "Successfully garbage collected" : "Failed to garbage collect",
+      bfptr->BlobFileNumber(), gc_stats->blob_count, gc_stats->delete_succeeded,
+      gc_stats->num_deletes, gc_stats->relocate_succeeded,
+      gc_stats->num_relocate);
+  if (newfile != nullptr) {
+    total_blob_space_ += newfile->file_size_;
+    ROCKS_LOG_INFO(db_options_.info_log, "New blob file %" PRIu64 ".",
+                   newfile->BlobFileNumber());
+  }
+
   return s;
 }
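
The Busy/TryAgain handling above is the standard optimistic-concurrency
pattern: Commit() returns Status::Busy() when another writer touched the key
after GetForUpdate(), and Status::TryAgain() when the transaction needs to be
rebuilt and retried. A standalone sketch of the same pattern against the
public OptimisticTransactionDB API (the function name, key, and value are
placeholders; only the default column family is used):

    #include "rocksdb/utilities/optimistic_transaction_db.h"

    // Sketch: conditional overwrite with retry, mirroring the GC loop's
    // Busy/TryAgain handling above.
    rocksdb::Status UpdateIfUnchanged(rocksdb::OptimisticTransactionDB* txn_db,
                                      const std::string& key,
                                      const std::string& new_value) {
      rocksdb::WriteOptions write_options;
      rocksdb::Transaction* txn = nullptr;
      rocksdb::Status s;
      while (true) {
        // Passing the old transaction back in reuses its allocation, as the
        // GC loop does via BeginTransaction(..., transaction).
        txn = txn_db->BeginTransaction(
            write_options, rocksdb::OptimisticTransactionOptions(), txn);
        std::string existing;
        s = txn->GetForUpdate(rocksdb::ReadOptions(), key, &existing);
        if (!s.ok() && !s.IsNotFound()) break;
        txn->Put(key, new_value);
        s = txn->Commit();
        if (s.IsTryAgain()) continue;  // rebuild and retry the transaction
        break;  // ok, Busy (conflict; caller decides), or hard error
      }
      delete txn;
      return s;
    }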
@@ -2119,15 +2183,17 @@ std::pair<bool, int64_t> BlobDBImpl::RunGC(bool aborted) {
   // in this collect the set of files, which became obsolete
   std::vector<std::shared_ptr<BlobFile>> obsoletes;
   for (auto bfile : to_process) {
-    GCStats gcstats;
-    Status s = GCFileAndUpdateLSM(bfile, &gcstats);
-    if (!s.ok()) continue;
+    GCStats gc_stats;
+    Status s = GCFileAndUpdateLSM(bfile, &gc_stats);
+    if (!s.ok()) {
+      continue;
+    }
 
     if (bfile->gc_once_after_open_.load()) {
       WriteLock lockbfile_w(&bfile->mutex_);
 
-      bfile->deleted_size_ = gcstats.deleted_size;
-      bfile->deleted_count_ = gcstats.num_deletes;
+      bfile->deleted_size_ = gc_stats.deleted_size;
+      bfile->deleted_count_ = gc_stats.num_deletes;
       bfile->gc_once_after_open_ = false;
     } else {
       obsoletes.push_back(bfile);
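
For reference, the counters this patch touches imply a GCStats shape roughly
like the following. This is reconstructed only from the usages visible in this
diff, not the authoritative definition; the widths are inferred from the
PRIu64 log format above:

    // Reconstructed from the fields used in this diff; see the BlobDB
    // sources for the actual struct.
    struct GCStats {
      uint64_t blob_count = 0;                  // records scanned
      uint64_t num_deletes = 0;                 // expired records seen
      uint64_t deleted_size = 0;                // bytes of expired blobs
      uint64_t delete_succeeded = 0;            // expired keys removed from LSM
      uint64_t overwritten_while_delete = 0;    // delete lost a write race
      uint64_t num_relocate = 0;                // records copied to new file
      uint64_t relocate_succeeded = 0;          // index entries updated in LSM
      uint64_t overwritten_while_relocate = 0;  // relocate lost a write race
      std::shared_ptr<BlobFile> newfile;        // relocation destination
    };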