@ -9,7 +9,7 @@
# ifndef ROCKSDB_LITE
# include "rocksdb/ utilities/checkpoint.h"
# include "utilities/checkpoint/checkpoint_impl .h"
# ifndef __STDC_FORMAT_MACROS
# define __STDC_FORMAT_MACROS
@ -18,37 +18,20 @@
# include <inttypes.h>
# include <algorithm>
# include <string>
# include <vector>
# include "db/wal_manager.h"
# include "port/port.h"
# include "rocksdb/db.h"
# include "rocksdb/env.h"
# include "rocksdb/transaction_log.h"
# include "rocksdb/utilities/checkpoint.h"
# include "util/file_util.h"
# include "util/filename.h"
# include "util/sync_point.h"
namespace rocksdb {
// Concrete Checkpoint implementation bound to the DB instance passed to its
// constructor.
class CheckpointImpl : public Checkpoint {
public :
// Creates a Checkpoint object to be used for creating openable snapshots.
// Only the db pointer is stored; this class declares no destructor, so the
// DB object is never released from here.
explicit CheckpointImpl ( DB * db ) : db_ ( db ) { }
// Builds an openable snapshot of RocksDB on the same disk. It accepts an
// output directory on the same disk and places under that directory:
// (1) hard-linked SST files pointing to existing live SST files
// (SST files will be copied if the output directory is on a different
// filesystem)
// (2) a copied manifest file and other files
// The directory should not already exist and will be created by this API.
// The directory will be an absolute path.
//
// checkpoint_dir: destination directory of the snapshot.
// log_size_for_flush: threshold forwarded to the implementation to decide
// whether the memtable flush can be skipped.
// Returns the Status of the operation.
//
// The using-declaration keeps the base-class CreateCheckpoint overloads
// visible; without it the override below would hide them.
using Checkpoint : : CreateCheckpoint ;
virtual Status CreateCheckpoint ( const std : : string & checkpoint_dir ,
uint64_t log_size_for_flush ) override ;
private :
// Database this checkpoint object operates on.
DB * db_ ;
} ;
Status Checkpoint : : Create ( DB * db , Checkpoint * * checkpoint_ptr ) {
* checkpoint_ptr = new CheckpointImpl ( db ) ;
return Status : : OK ( ) ;
@ -62,16 +45,9 @@ Status Checkpoint::CreateCheckpoint(const std::string& checkpoint_dir,
// Builds an openable snapshot of RocksDB
Status CheckpointImpl : : CreateCheckpoint ( const std : : string & checkpoint_dir ,
uint64_t log_size_for_flush ) {
Status s ;
std : : vector < std : : string > live_files ;
uint64_t manifest_file_size = 0 ;
DBOptions db_options = db_ - > GetDBOptions ( ) ;
uint64_t min_log_num = port : : kMaxUint64 ;
uint64_t sequence_number = db_ - > GetLatestSequenceNumber ( ) ;
bool same_fs = true ;
VectorLogPtr live_wal_files ;
s = db_ - > GetEnv ( ) - > FileExists ( checkpoint_dir ) ;
Status s = db_ - > GetEnv ( ) - > FileExists ( checkpoint_dir ) ;
if ( s . ok ( ) ) {
return Status : : InvalidArgument ( " Directory exists " ) ;
} else if ( ! s . IsNotFound ( ) ) {
@ -79,29 +55,124 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir,
return s ;
}
s = db_ - > DisableFileDeletions ( ) ;
ROCKS_LOG_INFO (
db_options . info_log ,
" Started the snapshot process -- creating snapshot in directory %s " ,
checkpoint_dir . c_str ( ) ) ;
std : : string full_private_path = checkpoint_dir + " .tmp " ;
// create snapshot directory
s = db_ - > GetEnv ( ) - > CreateDir ( full_private_path ) ;
uint64_t sequence_number = 0 ;
if ( s . ok ( ) ) {
db_ - > DisableFileDeletions ( ) ;
s = CreateCustomCheckpoint (
db_options ,
[ & ] ( const std : : string & src_dirname , const std : : string & fname ,
FileType ) {
ROCKS_LOG_INFO ( db_options . info_log , " Hard Linking %s " , fname . c_str ( ) ) ;
return db_ - > GetEnv ( ) - > LinkFile ( src_dirname + fname ,
full_private_path + fname ) ;
} /* link_file_cb */ ,
[ & ] ( const std : : string & src_dirname , const std : : string & fname ,
uint64_t size_limit_bytes , FileType ) {
ROCKS_LOG_INFO ( db_options . info_log , " Copying %s " , fname . c_str ( ) ) ;
return CopyFile ( db_ - > GetEnv ( ) , src_dirname + fname ,
full_private_path + fname , size_limit_bytes ,
db_options . use_fsync ) ;
} /* copy_file_cb */ ,
[ & ] ( const std : : string & fname , const std : : string & contents , FileType ) {
ROCKS_LOG_INFO ( db_options . info_log , " Creating %s " , fname . c_str ( ) ) ;
return CreateFile ( db_ - > GetEnv ( ) , full_private_path + fname , contents ) ;
} /* create_file_cb */ ,
& sequence_number , log_size_for_flush ) ;
// we copied all the files, enable file deletions
db_ - > EnableFileDeletions ( false ) ;
}
if ( s . ok ( ) ) {
// move tmp private backup to real snapshot directory
s = db_ - > GetEnv ( ) - > RenameFile ( full_private_path , checkpoint_dir ) ;
}
if ( s . ok ( ) ) {
unique_ptr < Directory > checkpoint_directory ;
db_ - > GetEnv ( ) - > NewDirectory ( checkpoint_dir , & checkpoint_directory ) ;
if ( checkpoint_directory ! = nullptr ) {
s = checkpoint_directory - > Fsync ( ) ;
}
}
if ( s . ok ( ) ) {
// here we know that we succeeded and installed the new snapshot
ROCKS_LOG_INFO ( db_options . info_log , " Snapshot DONE. All is good " ) ;
ROCKS_LOG_INFO ( db_options . info_log , " Snapshot sequence number: % " PRIu64 ,
sequence_number ) ;
} else {
// clean all the files we might have created
ROCKS_LOG_INFO ( db_options . info_log , " Snapshot failed -- %s " ,
s . ToString ( ) . c_str ( ) ) ;
// we have to delete the dir and all its children
std : : vector < std : : string > subchildren ;
db_ - > GetEnv ( ) - > GetChildren ( full_private_path , & subchildren ) ;
for ( auto & subchild : subchildren ) {
std : : string subchild_path = full_private_path + " / " + subchild ;
Status s1 = db_ - > GetEnv ( ) - > DeleteFile ( subchild_path ) ;
ROCKS_LOG_INFO ( db_options . info_log , " Delete file %s -- %s " ,
subchild_path . c_str ( ) , s1 . ToString ( ) . c_str ( ) ) ;
}
// finally delete the private dir
Status s1 = db_ - > GetEnv ( ) - > DeleteDir ( full_private_path ) ;
ROCKS_LOG_INFO ( db_options . info_log , " Delete dir %s -- %s " ,
full_private_path . c_str ( ) , s1 . ToString ( ) . c_str ( ) ) ;
}
return s ;
}
Status CheckpointImpl : : CreateCustomCheckpoint (
const DBOptions & db_options ,
std : : function < Status ( const std : : string & src_dirname ,
const std : : string & src_fname , FileType type ) >
link_file_cb ,
std : : function < Status ( const std : : string & src_dirname ,
const std : : string & src_fname ,
uint64_t size_limit_bytes , FileType type ) >
copy_file_cb ,
std : : function < Status ( const std : : string & fname , const std : : string & contents ,
FileType type ) >
create_file_cb ,
uint64_t * sequence_number , uint64_t log_size_for_flush ) {
Status s ;
std : : vector < std : : string > live_files ;
uint64_t manifest_file_size = 0 ;
uint64_t min_log_num = port : : kMaxUint64 ;
* sequence_number = db_ - > GetLatestSequenceNumber ( ) ;
bool same_fs = true ;
VectorLogPtr live_wal_files ;
bool flush_memtable = true ;
if ( s . ok ( ) ) {
if ( ! db_options . allow_2pc ) {
// If outstanding log files are small, we skip the flush.
s = db_ - > GetSortedWalFiles ( live_wal_files ) ;
if ( log_size_for_flush = = port : : kMaxUint64 ) {
flush_memtable = false ;
} else if ( log_size_for_flush > 0 ) {
// If outstanding log files are small, we skip the flush.
s = db_ - > GetSortedWalFiles ( live_wal_files ) ;
if ( ! s . ok ( ) ) {
db_ - > EnableFileDeletions ( false ) ;
return s ;
}
if ( ! s . ok ( ) ) {
return s ;
}
// Don't flush column families if total log size is smaller than
// log_size_for_flush. We copy the log files instead.
// We may be able to cover 2PC case too.
uint64_t total_wal_size = 0 ;
for ( auto & wal : live_wal_files ) {
total_wal_size + = wal - > SizeFileBytes ( ) ;
}
if ( total_wal_size < log_size_for_flush ) {
flush_memtable = false ;
// Don't flush column families if total log size is smaller than
// log_size_for_flush. We copy the log files instead.
// We may be able to cover 2PC case too.
uint64_t total_wal_size = 0 ;
for ( auto & wal : live_wal_files ) {
total_wal_size + = wal - > SizeFileBytes ( ) ;
}
if ( total_wal_size < log_size_for_flush ) {
flush_memtable = false ;
}
live_wal_files . clear ( ) ;
}
live_wal_files . clear ( ) ;
}
// this will return live_files prefixed with "/"
@ -112,7 +183,6 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir,
// Need to refetch the live files to recapture the snapshot.
if ( ! db_ - > GetIntProperty ( DB : : Properties : : kMinLogNumberToKeep ,
& min_log_num ) ) {
db_ - > EnableFileDeletions ( false ) ;
return Status : : InvalidArgument (
" 2PC enabled but cannot fine the min log number to keep. " ) ;
}
@ -127,12 +197,12 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir,
// be skipped. 000003.log contains commit message of tnx1, but we don't
// have respective prepare record for it.
// In order to avoid this situation, we need to force flush to make sure
// all transactions committed before getting min_log_num will be flushed
// all transactions committed before getting min_log_num will be flushed
// to SST files.
// We cannot get min_log_num before calling the GetLiveFiles() for the
// first time, because if we do that, all the logs files will be included,
// far more than needed.
s = db_ - > GetLiveFiles ( live_files , & manifest_file_size , /* flush */ tru e) ;
s = db_ - > GetLiveFiles ( live_files , & manifest_file_size , flush_memtabl e) ;
}
TEST_SYNC_POINT ( " CheckpointImpl::CreateCheckpoint:SavedLiveFiles1 " ) ;
@ -143,20 +213,10 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir,
s = db_ - > GetSortedWalFiles ( live_wal_files ) ;
}
if ( ! s . ok ( ) ) {
db_ - > EnableFileDeletions ( false ) ;
return s ;
}
size_t wal_size = live_wal_files . size ( ) ;
ROCKS_LOG_INFO (
db_options . info_log ,
" Started the snapshot process -- creating snapshot in directory %s " ,
checkpoint_dir . c_str ( ) ) ;
std : : string full_private_path = checkpoint_dir + " .tmp " ;
// create snapshot directory
s = db_ - > GetEnv ( ) - > CreateDir ( full_private_path ) ;
// copy/hard link live_files
std : : string manifest_fname , current_fname ;
@ -188,25 +248,21 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir,
// * if it's kDescriptorFile, limit the size to manifest_file_size
// * always copy if cross-device link
if ( ( type = = kTableFile ) & & same_fs ) {
ROCKS_LOG_INFO ( db_options . info_log , " Hard Linking %s " , src_fname . c_str ( ) ) ;
s = db_ - > GetEnv ( ) - > LinkFile ( db_ - > GetName ( ) + src_fname ,
full_private_path + src_fname ) ;
s = link_file_cb ( db_ - > GetName ( ) , src_fname , type ) ;
if ( s . IsNotSupported ( ) ) {
same_fs = false ;
s = Status : : OK ( ) ;
}
}
if ( ( type ! = kTableFile ) | | ( ! same_fs ) ) {
ROCKS_LOG_INFO ( db_options . info_log , " Copying %s " , src_fname . c_str ( ) ) ;
s = CopyFile ( db_ - > GetEnv ( ) , db_ - > GetName ( ) + src_fname ,
full_private_path + src_fname ,
( type = = kDescriptorFile ) ? manifest_file_size : 0 ,
db_options . use_fsync ) ;
s = copy_file_cb ( db_ - > GetName ( ) , src_fname ,
( type = = kDescriptorFile ) ? manifest_file_size : 0 ,
type ) ;
}
}
if ( s . ok ( ) & & ! current_fname . empty ( ) & & ! manifest_fname . empty ( ) ) {
s = CreateFile ( db_ - > GetEnv ( ) , full_private_path + current_fname ,
manifest_fname . substr ( 1 ) + " \n " ) ;
create_file_cb ( current_fname , manifest_fname . substr ( 1 ) + " \n " ,
kCurrentFile ) ;
}
ROCKS_LOG_INFO ( db_options . info_log , " Number of log files % " ROCKSDB_PRIszt ,
live_wal_files . size ( ) ) ;
@ -216,82 +272,32 @@ Status CheckpointImpl::CreateCheckpoint(const std::string& checkpoint_dir,
for ( size_t i = 0 ; s . ok ( ) & & i < wal_size ; + + i ) {
if ( ( live_wal_files [ i ] - > Type ( ) = = kAliveLogFile ) & &
( ! flush_memtable | |
live_wal_files [ i ] - > StartSequence ( ) > = sequence_number | |
live_wal_files [ i ] - > StartSequence ( ) > = * sequence_number | |
live_wal_files [ i ] - > LogNumber ( ) > = min_log_num ) ) {
if ( i + 1 = = wal_size ) {
ROCKS_LOG_INFO ( db_options . info_log , " Copying %s " ,
live_wal_files [ i ] - > PathName ( ) . c_str ( ) ) ;
s = CopyFile ( db_ - > GetEnv ( ) ,
db_options . wal_dir + live_wal_files [ i ] - > PathName ( ) ,
full_private_path + live_wal_files [ i ] - > PathName ( ) ,
live_wal_files [ i ] - > SizeFileBytes ( ) , db_options . use_fsync ) ;
s = copy_file_cb ( db_options . wal_dir , live_wal_files [ i ] - > PathName ( ) ,
live_wal_files [ i ] - > SizeFileBytes ( ) , kLogFile ) ;
break ;
}
if ( same_fs ) {
// we only care about live log files
ROCKS_LOG_INFO ( db_options . info_log , " Hard Linking %s " ,
live_wal_files [ i ] - > PathName ( ) . c_str ( ) ) ;
s = db_ - > GetEnv ( ) - > LinkFile (
db_options . wal_dir + live_wal_files [ i ] - > PathName ( ) ,
full_private_path + live_wal_files [ i ] - > PathName ( ) ) ;
s = link_file_cb ( db_options . wal_dir , live_wal_files [ i ] - > PathName ( ) ,
kLogFile ) ;
if ( s . IsNotSupported ( ) ) {
same_fs = false ;
s = Status : : OK ( ) ;
}
}
if ( ! same_fs ) {
ROCKS_LOG_INFO ( db_options . info_log , " Copying %s " ,
live_wal_files [ i ] - > PathName ( ) . c_str ( ) ) ;
s = CopyFile ( db_ - > GetEnv ( ) ,
db_options . wal_dir + live_wal_files [ i ] - > PathName ( ) ,
full_private_path + live_wal_files [ i ] - > PathName ( ) , 0 ,
db_options . use_fsync ) ;
s = copy_file_cb ( db_options . wal_dir , live_wal_files [ i ] - > PathName ( ) , 0 ,
kLogFile ) ;
}
}
}
// we copied all the files, enable file deletions
db_ - > EnableFileDeletions ( false ) ;
if ( s . ok ( ) ) {
// move tmp private backup to real snapshot directory
s = db_ - > GetEnv ( ) - > RenameFile ( full_private_path , checkpoint_dir ) ;
}
if ( s . ok ( ) ) {
unique_ptr < Directory > checkpoint_directory ;
db_ - > GetEnv ( ) - > NewDirectory ( checkpoint_dir , & checkpoint_directory ) ;
if ( checkpoint_directory ! = nullptr ) {
s = checkpoint_directory - > Fsync ( ) ;
}
}
if ( ! s . ok ( ) ) {
// clean all the files we might have created
ROCKS_LOG_INFO ( db_options . info_log , " Snapshot failed -- %s " ,
s . ToString ( ) . c_str ( ) ) ;
// we have to delete the dir and all its children
std : : vector < std : : string > subchildren ;
db_ - > GetEnv ( ) - > GetChildren ( full_private_path , & subchildren ) ;
for ( auto & subchild : subchildren ) {
std : : string subchild_path = full_private_path + " / " + subchild ;
Status s1 = db_ - > GetEnv ( ) - > DeleteFile ( subchild_path ) ;
ROCKS_LOG_INFO ( db_options . info_log , " Delete file %s -- %s " ,
subchild_path . c_str ( ) , s1 . ToString ( ) . c_str ( ) ) ;
}
// finally delete the private dir
Status s1 = db_ - > GetEnv ( ) - > DeleteDir ( full_private_path ) ;
ROCKS_LOG_INFO ( db_options . info_log , " Delete dir %s -- %s " ,
full_private_path . c_str ( ) , s1 . ToString ( ) . c_str ( ) ) ;
return s ;
}
// here we know that we succeeded and installed the new snapshot
ROCKS_LOG_INFO ( db_options . info_log , " Snapshot DONE. All is good " ) ;
ROCKS_LOG_INFO ( db_options . info_log , " Snapshot sequence number: % " PRIu64 ,
sequence_number ) ;
return s ;
}
} // namespace rocksdb
# endif // ROCKSDB_LITE