Merge branch 'master' into wal_filter

main
Praveen Rao 9 years ago
commit 4ce117c4d5
  1. 59
      CMakeLists.txt
  2. 10
      Makefile
  3. 2
      appveyor.yml
  4. 2
      appveyordailytests.yml
  5. 3
      arcanist_util/cpp_linter/FbcodeCppLinter.php
  6. 2
      build_tools/build_detect_platform
  7. 2
      build_tools/fbcode_config.sh
  8. 2
      build_tools/fbcode_config4.8.1.sh
  9. 31
      build_tools/rocksdb-lego-determinator
  10. 5
      db/c.cc
  11. 7
      db/column_family_test.cc
  12. 2
      db/compaction_job_stats_test.cc
  13. 2
      db/compaction_job_test.cc
  14. 4
      db/db_compaction_filter_test.cc
  15. 15
      db/db_compaction_test.cc
  16. 6
      db/db_dynamic_level_test.cc
  17. 76
      db/db_impl.cc
  18. 2
      db/db_impl.h
  19. 3
      db/db_iter_test.cc
  20. 6
      db/db_log_iter_test.cc
  21. 4
      db/db_table_properties_test.cc
  22. 6
      db/db_tailing_iter_test.cc
  23. 11
      db/db_test.cc
  24. 7
      db/db_test_util.cc
  25. 7
      db/db_test_util.h
  26. 6
      db/db_universal_compaction_test.cc
  27. 8
      db/db_wal_test.cc
  28. 8
      db/fault_injection_test.cc
  29. 2
      db/flush_job_test.cc
  30. 14
      db/log_format.h
  31. 161
      db/log_reader.cc
  32. 27
      db/log_reader.h
  33. 216
      db/log_test.cc
  34. 68
      db/log_writer.cc
  35. 16
      db/log_writer.h
  36. 6
      db/repair.cc
  37. 6
      db/transaction_log_impl.cc
  38. 11
      db/version_builder.cc
  39. 2
      db/version_builder.h
  40. 43
      db/version_set.cc
  41. 2
      db/version_set.h
  42. 4
      db/wal_manager.cc
  43. 5
      db/wal_manager_test.cc
  44. 180
      include/posix/io_posix.h
  45. 2
      include/rocksdb/c.h
  46. 6
      include/rocksdb/env.h
  47. 10
      include/rocksdb/options.h
  48. 3
      include/rocksdb/utilities/stackable_db.h
  49. 2
      include/rocksdb/version.h
  50. 53
      java/rocksjni/options.cc
  51. 1
      src.mk
  52. 12
      tools/db_crashtest.py
  53. 16
      tools/ldb_cmd.cc
  54. 11
      util/env.cc
  55. 786
      util/env_posix.cc
  56. 637
      util/io_posix.cc
  57. 4
      util/options.cc
  58. 3
      util/options_helper.h
  59. 6
      util/options_test.cc
  60. 8
      utilities/checkpoint/checkpoint_test.cc
  61. 4
      utilities/document/json_document.cc

@ -14,10 +14,15 @@
# 3. Run cmake to generate project files for Windows, add more options to enable required third-party libraries.
# See thirdparty.inc for more information.
# sample command: cmake -G "Visual Studio 12 Win64" -DGFLAGS=1 -DSNAPPY=1 -DJEMALLOC=1 ..
# 4. Then build the project in debug mode (you may want to add /m:<N> flag to run msbuild in <N> parallel threads)
# msbuild ALL_BUILD.vcxproj
# 4. Then build the project in debug mode (you may want to add /m[:<N>] flag to run msbuild in <N> parallel threads
# or simply /m ot use all avail cores)
# msbuild rocksdb.sln
#
# rocksdb.sln build features exclusions of test only code in Release. If you build ALL_BUILD then everything
# will be attempted but test only code does not build in Release mode.
#
# 5. And release mode (/m[:<N>] is also supported)
# msbuild ALL_BUILD.vcxproj /p:Configuration=Release
# msbuild rocksdb.sln /p:Configuration=Release
#
cmake_minimum_required(VERSION 2.6)
@ -83,6 +88,7 @@ set(LIBS ${ROCKSDB_LIBS} ${THIRDPARTY_LIBS} ${SYSTEM_LIBS})
add_subdirectory(third-party/gtest-1.7.0/fused-src/gtest)
# Main library source code
set(SOURCES
db/builder.cc
db/c.cc
@ -100,7 +106,6 @@ set(SOURCES
db/db_impl_experimental.cc
db/db_impl_readonly.cc
db/db_iter.cc
db/db_test_util.cc
db/event_helpers.cc
db/experimental.cc
db/filename.cc
@ -252,6 +257,12 @@ set(SOURCES
utilities/write_batch_with_index/write_batch_with_index_internal.cc
)
# For test util library that is build only in DEBUG mode
# and linked to tests. Add test only code that is not #ifdefed for Release here.
set(TESTUTIL_SOURCE
db/db_test_util.cc
)
add_library(rocksdblib${ARTIFACT_SUFFIX} ${SOURCES})
set_target_properties(rocksdblib${ARTIFACT_SUFFIX} PROPERTIES COMPILE_FLAGS "/Fd${CMAKE_CFG_INTDIR}/rocksdblib${ARTIFACT_SUFFIX}.pdb")
add_dependencies(rocksdblib${ARTIFACT_SUFFIX} GenerateBuildVersion)
@ -367,7 +378,7 @@ set(TESTS
utilities/write_batch_with_index/write_batch_with_index_test.cc
)
set(EXES ${APPS} ${TESTS})
set(EXES ${APPS})
foreach(sourcefile ${EXES})
string(REPLACE ".cc" "" exename ${sourcefile})
@ -376,12 +387,42 @@ foreach(sourcefile ${EXES})
target_link_libraries(${exename}${ARTIFACT_SUFFIX} ${LIBS})
endforeach(sourcefile ${EXES})
# test utilities are only build in debug
set(TESTUTILLIB testutillib${ARTIFACT_SUFFIX})
add_library(${TESTUTILLIB} STATIC ${TESTUTIL_SOURCE})
set_target_properties(${TESTUTILLIB} PROPERTIES COMPILE_FLAGS "/Fd${CMAKE_CFG_INTDIR}/testutillib${ARTIFACT_SUFFIX}.pdb")
set_target_properties(${TESTUTILLIB}
PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1
EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1
EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1
)
# Tests are excluded from Release builds
set(TEST_EXES ${TESTS})
foreach(sourcefile ${TEST_EXES})
string(REPLACE ".cc" "" exename ${sourcefile})
string(REGEX REPLACE "^((.+)/)+" "" exename ${exename})
add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile})
set_target_properties(${exename}${ARTIFACT_SUFFIX}
PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1
EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1
EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1
)
target_link_libraries(${exename}${ARTIFACT_SUFFIX} ${LIBS} testutillib${ARTIFACT_SUFFIX})
endforeach(sourcefile ${TEST_EXES})
# C executables must link to a shared object
set(C_EXES ${C_TESTS})
set(C_TEST_EXES ${C_TESTS})
foreach(sourcefile ${C_EXES})
foreach(sourcefile ${C_TEST_EXES})
string(REPLACE ".c" "" exename ${sourcefile})
string(REGEX REPLACE "^((.+)/)+" "" exename ${exename})
add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile})
target_link_libraries(${exename}${ARTIFACT_SUFFIX} rocksdb${ARTIFACT_SUFFIX})
endforeach(sourcefile ${C_TESTS})
set_target_properties(${exename}${ARTIFACT_SUFFIX}
PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1
EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1
EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1
)
target_link_libraries(${exename}${ARTIFACT_SUFFIX} rocksdb${ARTIFACT_SUFFIX} testutillib${ARTIFACT_SUFFIX})
endforeach(sourcefile ${C_TEST_EXES})

@ -41,6 +41,14 @@ ifeq ($(MAKECMDGOALS),dbg)
DEBUG_LEVEL=2
endif
ifeq ($(MAKECMDGOALS),clean)
DEBUG_LEVEL=0
endif
ifeq ($(MAKECMDGOALS),release)
DEBUG_LEVEL=0
endif
ifeq ($(MAKECMDGOALS),shared_lib)
DEBUG_LEVEL=0
endif
@ -404,7 +412,7 @@ dbg: $(LIBRARY) $(BENCHMARKS) tools $(TESTS)
# creates static library and programs
release:
$(MAKE) clean
OPT="-DNDEBUG -O2" $(MAKE) static_lib tools db_bench
DEBUG_LEVEL=0 $(MAKE) static_lib tools db_bench
coverage:
$(MAKE) clean

@ -5,7 +5,7 @@ before_build:
- cmake -G "Visual Studio 12 Win64" ..
- cd ..
build:
project: build\ALL_BUILD.vcxproj
project: build\rocksdb.sln
parallel: true
verbosity: minimal
test: off

@ -5,7 +5,7 @@ before_build:
- cmake -G "Visual Studio 12 Win64" -DOPTDBG=1 ..
- cd ..
build:
project: build\ALL_BUILD.vcxproj
project: build\rocksdb.sln
parallel: true
verbosity: minimal
test:

@ -88,6 +88,9 @@ class FbcodeCppLinter extends ArcanistLinter {
}
private function getCppLintOutput($path) {
if (!array_key_exists($path, $this->rawLintOutput)) {
return array();
}
list($output) = $this->rawLintOutput[$path];
$msgs = array();

@ -45,7 +45,7 @@ fi
# we depend on C++11
PLATFORM_CXXFLAGS="-std=c++11"
# we currently depend on POSIX platform
COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX"
COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX"
# Default to fbcode gcc on internal fb machines
if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then

@ -116,7 +116,7 @@ else
fi
CFLAGS+=" $DEPS_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE"
CXXFLAGS+=" $CFLAGS"
EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB"

@ -91,7 +91,7 @@ else
fi
CFLAGS+=" $DEPS_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE"
CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DZSTD -DNUMA"
CXXFLAGS+=" $CFLAGS"

@ -70,6 +70,13 @@ TSAN="COMPILE_WITH_TSAN=1"
DISABLE_JEMALLOC="DISABLE_JEMALLOC=1"
PARSER="'parser':'egrep \'Failure|^#|Abort\''"
ARTIFACTS=" 'artifacts': [
{
'name':'database',
'paths':[ '/dev/shm/rocksdb' ],
}
]"
#
# A mechanism to disable tests temporarily
#
@ -256,6 +263,26 @@ LITE_BUILD_COMMANDS="[
}
]"
#
# RocksDB lite tests
#
LITE_UNIT_TEST_COMMANDS="[
{
'name':'Rocksdb Lite Unit Test',
'oncall':'$ONCALL',
'steps': [
$CLEANUP_ENV,
{
'name':'Build RocksDB debug version',
'shell':'$SHM $LITE $DEBUG make J=1 check',
'user':'root',
$PARSER
},
],
$REPORT
}
]"
#
# RocksDB stress/crash test
#
@ -280,6 +307,7 @@ STRESS_CRASH_TEST_COMMANDS="[
$PARSER
}
],
$ARTIFACTS,
$REPORT
}
]"
@ -567,6 +595,9 @@ case $1 in
lite)
echo $LITE_BUILD_COMMANDS
;;
lite_test)
echo $LITE_UNIT_TEST_COMMANDS
;;
stress_crash)
echo $STRESS_CRASH_TEST_COMMANDS
;;

@ -1687,6 +1687,11 @@ void rocksdb_options_set_keep_log_file_num(rocksdb_options_t* opt, size_t v) {
opt->rep.keep_log_file_num = v;
}
void rocksdb_options_set_recycle_log_file_num(rocksdb_options_t* opt,
size_t v) {
opt->rep.recycle_log_file_num = v;
}
void rocksdb_options_set_soft_rate_limit(rocksdb_options_t* opt, double v) {
opt->rep.soft_rate_limit = v;
}

@ -23,8 +23,6 @@
#include "util/sync_point.h"
#include "utilities/merge_operators.h"
#if !(defined NDEBUG) || !defined(OS_WIN)
namespace rocksdb {
namespace {
@ -1262,13 +1260,8 @@ TEST_F(ColumnFamilyTest, FlushAndDropRaceCondition) {
}
} // namespace rocksdb
#endif
int main(int argc, char** argv) {
#if !(defined NDEBUG) || !defined(OS_WIN)
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
#else
return 0;
#endif
}

@ -64,7 +64,7 @@
#include "util/xfunc.h"
#include "utilities/merge_operators.h"
#if !defined(IOS_CROSS_COMPILE) && (!defined(NDEBUG) || !defined(OS_WIN))
#if !defined(IOS_CROSS_COMPILE)
#ifndef ROCKSDB_LITE
namespace rocksdb {

@ -198,7 +198,7 @@ class CompactionJobTest : public testing::Test {
unique_ptr<WritableFileWriter> file_writer(
new WritableFileWriter(std::move(file), env_options_));
{
log::Writer log(std::move(file_writer));
log::Writer log(std::move(file_writer), 0, false);
std::string record;
new_db.EncodeTo(&record);
s = log.AddRecord(record);

@ -625,11 +625,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterSnapshot) {
} // namespace rocksdb
int main(int argc, char** argv) {
#if !(defined NDEBUG) || !defined(OS_WIN)
rocksdb::port::InstallStackTraceHandler();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
#else
return 0;
#endif
}

@ -14,7 +14,7 @@
namespace rocksdb {
// SYNC_POINT is not supported in released Windows mode.
#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
#if !defined(ROCKSDB_LITE)
class DBCompactionTest : public DBTestBase {
public:
@ -1227,6 +1227,7 @@ TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) {
TEST_P(DBCompactionTestWithParam, ManualCompaction) {
Options options = CurrentOptions();
options.max_subcompactions = max_subcompactions_;
options.statistics = rocksdb::CreateDBStatistics();
CreateAndReopenWithCF({"pikachu"}, options);
// iter - 0 with 7 levels
@ -1258,7 +1259,14 @@ TEST_P(DBCompactionTestWithParam, ManualCompaction) {
// Compact all
MakeTables(1, "a", "z", 1);
ASSERT_EQ("1,0,2", FilesPerLevel(1));
uint64_t prev_block_cache_add =
options.statistics->getTickerCount(BLOCK_CACHE_ADD);
db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr);
// Verify manual compaction doesn't fill block cache
ASSERT_EQ(prev_block_cache_add,
options.statistics->getTickerCount(BLOCK_CACHE_ADD));
ASSERT_EQ("0,0,1", FilesPerLevel(1));
if (iter == 0) {
@ -1266,6 +1274,7 @@ TEST_P(DBCompactionTestWithParam, ManualCompaction) {
options.max_background_flushes = 0;
options.num_levels = 3;
options.create_if_missing = true;
options.statistics = rocksdb::CreateDBStatistics();
DestroyAndReopen(options);
CreateAndReopenWithCF({"pikachu"}, options);
}
@ -1843,11 +1852,11 @@ TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) {
INSTANTIATE_TEST_CASE_P(DBCompactionTestWithParam, DBCompactionTestWithParam,
::testing::Values(1, 4));
#endif // (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
#endif // !defined(ROCKSDB_LITE)
} // namespace rocksdb
int main(int argc, char** argv) {
#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
#if !defined(ROCKSDB_LITE)
rocksdb::port::InstallStackTraceHandler();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();

@ -10,7 +10,7 @@
// Introduction of SyncPoint effectively disabled building and running this test
// in Release build.
// which is a pity, it is a good test
#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
#if !defined(ROCKSDB_LITE)
#include "db/db_test_util.h"
#include "port/stack_trace.h"
@ -484,10 +484,10 @@ TEST_F(DBTestDynamicLevel, MigrateToDynamicLevelMaxBytesBase) {
}
} // namespace rocksdb
#endif // (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
#endif // !defined(ROCKSDB_LITE)
int main(int argc, char** argv) {
#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
#if !defined(ROCKSDB_LITE)
rocksdb::port::InstallStackTraceHandler();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();

@ -151,6 +151,10 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
}
}
if (result.WAL_ttl_seconds > 0 || result.WAL_size_limit_MB > 0) {
result.recycle_log_file_num = false;
}
if (result.wal_dir.empty()) {
// Use dbname as default
result.wal_dir = dbname;
@ -417,7 +421,7 @@ Status DBImpl::NewDB() {
file->SetPreallocationBlockSize(db_options_.manifest_preallocation_size);
unique_ptr<WritableFileWriter> file_writer(
new WritableFileWriter(std::move(file), env_options));
log::Writer log(std::move(file_writer));
log::Writer log(std::move(file_writer), 0, false);
std::string record;
new_db.EncodeTo(&record);
s = log.AddRecord(record);
@ -602,7 +606,13 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
// find newly obsoleted log files
while (alive_log_files_.begin()->number < min_log_number) {
auto& earliest = *alive_log_files_.begin();
job_context->log_delete_files.push_back(earliest.number);
if (db_options_.recycle_log_file_num > log_recycle_files.size()) {
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
"adding log %" PRIu64 " to recycle list\n", earliest.number);
log_recycle_files.push_back(earliest.number);
} else {
job_context->log_delete_files.push_back(earliest.number);
}
total_log_size_ -= earliest.size;
alive_log_files_.pop_front();
// Current log should always stay alive since it can't have
@ -1110,28 +1120,13 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
// paranoid_checks==false so that corruptions cause entire commits
// to be skipped instead of propagating bad information (like overly
// large sequence numbers).
log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/,
0 /*initial_offset*/);
log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter,
true /*checksum*/, 0 /*initial_offset*/, log_number);
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
"Recovering log #%" PRIu64 " mode %d skip-recovery %d", log_number,
db_options_.wal_recovery_mode, !continue_replay_log);
// Determine if we should tolerate incomplete records at the tail end of the
// log
bool report_eof_inconsistency;
if (db_options_.wal_recovery_mode ==
WALRecoveryMode::kAbsoluteConsistency) {
// in clean shutdown we don't expect any error in the log files
report_eof_inconsistency = true;
} else {
// for other modes ignore only incomplete records in the last log file
// which is presumably due to write in progress during restart
report_eof_inconsistency = false;
// TODO krad: Evaluate if we need to move to a more strict mode where we
// restrict the inconsistency to only the last log
}
// Read all the records and add to a memtable
std::string scratch;
Slice record;
@ -1146,9 +1141,10 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
}
}
while (continue_replay_log &&
reader.ReadRecord(&record, &scratch, report_eof_inconsistency) &&
status.ok()) {
while (
continue_replay_log &&
reader.ReadRecord(&record, &scratch, db_options_.wal_recovery_mode) &&
status.ok()) {
if (record.size() < 12) {
reporter.Corruption(record.size(),
Status::Corruption("log record too small"));
@ -4142,6 +4138,12 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
// Do this without holding the dbmutex lock.
assert(versions_->prev_log_number() == 0);
bool creating_new_log = !log_empty_;
uint64_t recycle_log_number = 0;
if (creating_new_log && db_options_.recycle_log_file_num &&
!log_recycle_files.empty()) {
recycle_log_number = log_recycle_files.front();
log_recycle_files.pop_front();
}
uint64_t new_log_number =
creating_new_log ? versions_->NewFileNumber() : logfile_number_;
SuperVersion* new_superversion = nullptr;
@ -4152,17 +4154,28 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) {
if (creating_new_log) {
EnvOptions opt_env_opt =
env_->OptimizeForLogWrite(env_options_, db_options_);
s = NewWritableFile(env_,
LogFileName(db_options_.wal_dir, new_log_number),
&lfile, opt_env_opt);
if (recycle_log_number) {
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
"reusing log %" PRIu64 " from recycle list\n", recycle_log_number);
s = env_->ReuseWritableFile(
LogFileName(db_options_.wal_dir, new_log_number),
LogFileName(db_options_.wal_dir, recycle_log_number), &lfile,
opt_env_opt);
} else {
s = NewWritableFile(env_,
LogFileName(db_options_.wal_dir, new_log_number),
&lfile, opt_env_opt);
}
if (s.ok()) {
// Our final size should be less than write_buffer_size
// (compression, etc) but err on the side of caution.
lfile->SetPreallocationBlockSize(
1.1 * mutable_cf_options.write_buffer_size);
lfile->SetPreallocationBlockSize(1.1 *
mutable_cf_options.write_buffer_size);
unique_ptr<WritableFileWriter> file_writer(
new WritableFileWriter(std::move(lfile), opt_env_opt));
new_log = new log::Writer(std::move(file_writer));
new_log = new log::Writer(std::move(file_writer),
new_log_number,
db_options_.recycle_log_file_num > 0);
}
}
@ -4801,8 +4814,11 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname,
impl->logfile_number_ = new_log_number;
unique_ptr<WritableFileWriter> file_writer(
new WritableFileWriter(std::move(lfile), opt_env_options));
impl->logs_.emplace_back(new_log_number,
new log::Writer(std::move(file_writer)));
impl->logs_.emplace_back(
new_log_number,
new log::Writer(std::move(file_writer),
new_log_number,
impl->db_options_.recycle_log_file_num > 0));
// set column family handles
for (auto cf : column_families) {

@ -556,6 +556,8 @@ class DBImpl : public DB {
// * whenever there is an error in background flush or compaction
InstrumentedCondVar bg_cv_;
uint64_t logfile_number_;
std::deque<uint64_t>
log_recycle_files; // a list of log files that we can recycle
bool log_dir_synced_;
bool log_empty_;
ColumnFamilyHandleImpl* default_cf_handle_;

@ -1943,8 +1943,6 @@ TEST_F(DBIterWithMergeIterTest, InnerMergeIterator2) {
ASSERT_EQ(db_iter_->value().ToString(), "4");
}
#if !(defined NDEBUG) || !defined(OS_WIN)
TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace1) {
// Test Prev() when one child iterator is at its end but more rows
// are added.
@ -2295,7 +2293,6 @@ TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace8) {
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
}
#endif // #if !(defined NDEBUG) || !defined(OS_WIN)
} // namespace rocksdb
int main(int argc, char** argv) {

@ -10,7 +10,7 @@
// Introduction of SyncPoint effectively disabled building and running this test
// in Release build.
// which is a pity, it is a good test
#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
#if !defined(ROCKSDB_LITE)
#include "db/db_test_util.h"
#include "port/stack_trace.h"
@ -277,10 +277,10 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorBlobs) {
}
} // namespace rocksdb
#endif // (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
#endif // !defined(ROCKSDB_LITE)
int main(int argc, char** argv) {
#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
#if !defined(ROCKSDB_LITE)
rocksdb::port::InstallStackTraceHandler();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();

@ -207,11 +207,7 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) {
#endif // ROCKSDB_LITE
int main(int argc, char** argv) {
#if !(defined NDEBUG) || !defined(OS_WIN)
rocksdb::port::InstallStackTraceHandler();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
#else
return 0;
#endif
}

@ -10,7 +10,7 @@
// Introduction of SyncPoint effectively disabled building and running this test
// in Release build.
// which is a pity, it is a good test
#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
#if !defined(ROCKSDB_LITE)
#include "db/db_test_util.h"
#include "db/forward_iterator.h"
@ -646,10 +646,10 @@ TEST_F(DBTestTailingIterator, ManagedTailingIteratorSeekToSame) {
} // namespace rocksdb
#endif // (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
#endif // !defined(ROCKSDB_LITE)
int main(int argc, char** argv) {
#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
#if !defined(ROCKSDB_LITE)
rocksdb::port::InstallStackTraceHandler();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();

@ -10,8 +10,6 @@
// Introduction of SyncPoint effectively disabled building and running this test
// in Release build.
// which is a pity, it is a good test
#if !(defined NDEBUG) || !defined(OS_WIN)
#include <algorithm>
#include <iostream>
#include <set>
@ -5075,7 +5073,9 @@ class RecoveryTestHelper {
ASSERT_OK(db_options.env->NewWritableFile(fname, &file, env_options));
unique_ptr<WritableFileWriter> file_writer(
new WritableFileWriter(std::move(file), env_options));
current_log_writer.reset(new log::Writer(std::move(file_writer)));
current_log_writer.reset(new log::Writer(
std::move(file_writer), current_log_number,
db_options.recycle_log_file_num > 0));
for (int i = 0; i < kKeysPerWALFile; i++) {
std::string key = "key" + ToString(count++);
@ -10445,14 +10445,9 @@ INSTANTIATE_TEST_CASE_P(BloomStatsTestWithParam, BloomStatsTestWithParam,
#endif // ROCKSDB_LITE
} // namespace rocksdb
#endif
int main(int argc, char** argv) {
#if !(defined NDEBUG) || !defined(OS_WIN)
rocksdb::port::InstallStackTraceHandler();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
#else
return 0;
#endif
}

@ -58,12 +58,9 @@ DBTestBase::DBTestBase(const std::string path)
}
DBTestBase::~DBTestBase() {
// SyncPoint is not supported in Released Windows Mode.
#if !(defined NDEBUG) || !defined(OS_WIN)
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
rocksdb::SyncPoint::GetInstance()->LoadDependency({});
rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
#endif // !(defined NDEBUG) || !defined(OS_WIN)
Close();
Options options;
options.db_paths.emplace_back(dbname_, 0);
@ -343,6 +340,10 @@ Options DBTestBase::CurrentOptions(
options.row_cache = NewLRUCache(1024 * 1024);
break;
}
case kRecycleLogFiles: {
options.recycle_log_file_num = 2;
break;
}
case kLevelSubcompactions: {
options.max_subcompactions = 4;
break;

@ -437,9 +437,10 @@ class DBTestBase : public testing::Test {
kFIFOCompaction = 26,
kOptimizeFiltersForHits = 27,
kRowCache = 28,
kLevelSubcompactions = 29,
kUniversalSubcompactions = 30,
kEnd = 29
kRecycleLogFiles = 29,
kLevelSubcompactions = 30,
kUniversalSubcompactions = 31,
kEnd = 30
};
int option_config_;

@ -9,7 +9,7 @@
#include "db/db_test_util.h"
#include "port/stack_trace.h"
#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
#if !defined(ROCKSDB_LITE)
#include "util/sync_point.h"
namespace rocksdb {
@ -1210,10 +1210,10 @@ INSTANTIATE_TEST_CASE_P(DBTestUniversalManualCompactionOutputPathId,
} // namespace rocksdb
#endif // (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
#endif // !defined(ROCKSDB_LITE)
int main(int argc, char** argv) {
#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE)
#if !defined(ROCKSDB_LITE)
rocksdb::port::InstallStackTraceHandler();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();

@ -9,9 +9,7 @@
#include "db/db_test_util.h"
#include "port/stack_trace.h"
#if !(defined NDEBUG) || !defined(OS_WIN)
#include "util/sync_point.h"
#endif
namespace rocksdb {
class DBWALTest : public DBTestBase {
@ -70,7 +68,6 @@ TEST_F(DBWALTest, RollLog) {
} while (ChangeOptions());
}
#if !(defined NDEBUG) || !defined(OS_WIN)
TEST_F(DBWALTest, SyncWALNotBlockWrite) {
Options options = CurrentOptions();
options.max_write_buffer_number = 4;
@ -130,15 +127,10 @@ TEST_F(DBWALTest, SyncWALNotWaitWrite) {
ASSERT_EQ(Get("foo2"), "bar2");
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
}
#endif
} // namespace rocksdb
int main(int argc, char** argv) {
#if !(defined NDEBUG) || !defined(OS_WIN)
rocksdb::port::InstallStackTraceHandler();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
#else
return 0;
#endif
}

@ -11,8 +11,6 @@
// the last "sync". It then checks for data loss errors by purposely dropping
// file data (or entire files) not protected by a "sync".
#if !(defined NDEBUG) || !defined(OS_WIN)
#include <map>
#include <set>
#include "db/db_impl.h"
@ -902,13 +900,7 @@ INSTANTIATE_TEST_CASE_P(FaultTest, FaultInjectionTest, ::testing::Bool());
} // namespace rocksdb
#endif // #if !(defined NDEBUG) || !defined(OS_WIN)
int main(int argc, char** argv) {
#if !(defined NDEBUG) || !defined(OS_WIN)
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
#else
return 0;
#endif
}

@ -61,7 +61,7 @@ class FlushJobTest : public testing::Test {
unique_ptr<WritableFileWriter> file_writer(
new WritableFileWriter(std::move(file), EnvOptions()));
{
log::Writer log(std::move(file_writer));
log::Writer log(std::move(file_writer), 0, false);
std::string record;
new_db.EncodeTo(&record);
s = log.AddRecord(record);

@ -22,14 +22,24 @@ enum RecordType {
// For fragments
kFirstType = 2,
kMiddleType = 3,
kLastType = 4
kLastType = 4,
// For recycled log files
kRecyclableFullType = 5,
kRecyclableFirstType = 6,
kRecyclableMiddleType = 7,
kRecyclableLastType = 8,
};
static const int kMaxRecordType = kLastType;
static const int kMaxRecordType = kRecyclableLastType;
static const unsigned int kBlockSize = 32768;
// Header is checksum (4 bytes), type (1 byte), length (2 bytes).
static const int kHeaderSize = 4 + 1 + 2;
// Recyclable header is checksum (4 bytes), type (1 byte), log number
// (4 bytes), length (2 bytes).
static const int kRecyclableHeaderSize = 4 + 1 + 4 + 2;
} // namespace log
} // namespace rocksdb

@ -21,9 +21,12 @@ namespace log {
Reader::Reporter::~Reporter() {
}
Reader::Reader(unique_ptr<SequentialFileReader>&& _file, Reporter* reporter,
bool checksum, uint64_t initial_offset)
: file_(std::move(_file)),
Reader::Reader(std::shared_ptr<Logger> info_log,
unique_ptr<SequentialFileReader>&& _file,
Reporter* reporter, bool checksum, uint64_t initial_offset,
uint64_t log_num)
: info_log_(info_log),
file_(std::move(_file)),
reporter_(reporter),
checksum_(checksum),
backing_store_(new char[kBlockSize]),
@ -33,7 +36,8 @@ Reader::Reader(unique_ptr<SequentialFileReader>&& _file, Reporter* reporter,
eof_offset_(0),
last_record_offset_(0),
end_of_buffer_offset_(0),
initial_offset_(initial_offset) {}
initial_offset_(initial_offset),
log_number_(log_num) {}
Reader::~Reader() {
delete[] backing_store_;
@ -62,8 +66,15 @@ bool Reader::SkipToInitialBlock() {
return true;
}
// For kAbsoluteConsistency, on clean shutdown we don't expect any error
// in the log files. For other modes, we can ignore only incomplete records
// in the last log file, which are presumably due to a write in progress
// during restart (or from log recycling).
//
// TODO krad: Evaluate if we need to move to a more strict mode where we
// restrict the inconsistency to only the last log
bool Reader::ReadRecord(Slice* record, std::string* scratch,
const bool report_eof_inconsistency) {
WALRecoveryMode wal_recovery_mode) {
if (last_record_offset_ < initial_offset_) {
if (!SkipToInitialBlock()) {
return false;
@ -80,10 +91,11 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
Slice fragment;
while (true) {
uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
const unsigned int record_type =
ReadPhysicalRecord(&fragment, report_eof_inconsistency);
size_t drop_size;
const unsigned int record_type = ReadPhysicalRecord(&fragment, &drop_size);
switch (record_type) {
case kFullType:
case kRecyclableFullType:
if (in_fragmented_record && !scratch->empty()) {
// Handle bug in earlier versions of log::Writer where
// it could emit an empty kFirstType record at the tail end
@ -98,6 +110,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
return true;
case kFirstType:
case kRecyclableFirstType:
if (in_fragmented_record && !scratch->empty()) {
// Handle bug in earlier versions of log::Writer where
// it could emit an empty kFirstType record at the tail end
@ -111,6 +124,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
break;
case kMiddleType:
case kRecyclableMiddleType:
if (!in_fragmented_record) {
ReportCorruption(fragment.size(),
"missing start of fragmented record(1)");
@ -120,6 +134,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
break;
case kLastType:
case kRecyclableLastType:
if (!in_fragmented_record) {
ReportCorruption(fragment.size(),
"missing start of fragmented record(2)");
@ -131,9 +146,17 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
}
break;
case kBadHeader:
if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) {
// in clean shutdown we don't expect any error in the log files
ReportCorruption(drop_size, "truncated header");
}
// fall-thru
case kEof:
if (in_fragmented_record) {
if (report_eof_inconsistency) {
if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) {
// in clean shutdown we don't expect any error in the log files
ReportCorruption(scratch->size(), "error reading trailing data");
}
// This can be caused by the writer dying immediately after
@ -143,6 +166,23 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch,
}
return false;
case kOldRecord:
if (wal_recovery_mode != WALRecoveryMode::kSkipAnyCorruptedRecords) {
// Treat a record from a previous instance of the log as EOF.
if (in_fragmented_record) {
if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) {
// in clean shutdown we don't expect any error in the log files
ReportCorruption(scratch->size(), "error reading trailing data");
}
// This can be caused by the writer dying immediately after
// writing a physical record but before completing the next; don't
// treat it as a corruption, just ignore the entire logical record.
scratch->clear();
}
return false;
}
// fall-thru
case kBadRecord:
if (in_fragmented_record) {
ReportCorruption(scratch->size(), "error in middle of record");
@ -244,36 +284,49 @@ void Reader::ReportDrop(size_t bytes, const Status& reason) {
}
}
unsigned int Reader::ReadPhysicalRecord(Slice* result,
const bool report_eof_inconsistency) {
bool Reader::ReadMore(size_t* drop_size, int *error) {
if (!eof_ && !read_error_) {
// Last read was a full read, so this is a trailer to skip
buffer_.clear();
Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
end_of_buffer_offset_ += buffer_.size();
if (!status.ok()) {
buffer_.clear();
ReportDrop(kBlockSize, status);
read_error_ = true;
*error = kEof;
return false;
} else if (buffer_.size() < (size_t)kBlockSize) {
eof_ = true;
eof_offset_ = buffer_.size();
}
return true;
} else {
// Note that if buffer_ is non-empty, we have a truncated header at the
// end of the file, which can be caused by the writer crashing in the
// middle of writing the header. Unless explicitly requested we don't
// considering this an error, just report EOF.
if (buffer_.size()) {
*drop_size = buffer_.size();
buffer_.clear();
*error = kBadHeader;
return false;
}
buffer_.clear();
*error = kEof;
return false;
}
}
unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) {
while (true) {
// We need at least the minimum header size
if (buffer_.size() < (size_t)kHeaderSize) {
if (!eof_ && !read_error_) {
// Last read was a full read, so this is a trailer to skip
buffer_.clear();
Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
end_of_buffer_offset_ += buffer_.size();
if (!status.ok()) {
buffer_.clear();
ReportDrop(kBlockSize, status);
read_error_ = true;
return kEof;
} else if (buffer_.size() < (size_t)kBlockSize) {
eof_ = true;
eof_offset_ = buffer_.size();
}
continue;
} else {
// Note that if buffer_ is non-empty, we have a truncated header at the
// end of the file, which can be caused by the writer crashing in the
// middle of writing the header. Unless explicitly requested we don't
// considering this an error, just report EOF.
if (buffer_.size() && report_eof_inconsistency) {
ReportCorruption(buffer_.size(), "truncated header");
}
buffer_.clear();
return kEof;
int r;
if (!ReadMore(drop_size, &r)) {
return r;
}
continue;
}
// Parse the header
@ -282,18 +335,34 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result,
const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
const unsigned int type = header[6];
const uint32_t length = a | (b << 8);
if (kHeaderSize + length > buffer_.size()) {
size_t drop_size = buffer_.size();
int header_size = kHeaderSize;
if (type >= kRecyclableFullType && type <= kRecyclableLastType) {
header_size = kRecyclableHeaderSize;
// We need enough for the larger header
if (buffer_.size() < (size_t)kRecyclableHeaderSize) {
int r;
if (!ReadMore(drop_size, &r)) {
return r;
}
continue;
}
const uint32_t log_num = DecodeFixed32(header + 7);
if (log_num != log_number_) {
return kOldRecord;
}
}
if (header_size + length > buffer_.size()) {
*drop_size = buffer_.size();
buffer_.clear();
if (!eof_) {
ReportCorruption(drop_size, "bad record length");
ReportCorruption(*drop_size, "bad record length");
return kBadRecord;
}
// If the end of the file has been reached without reading |length| bytes
// of payload, assume the writer died in the middle of writing the record.
// Don't report a corruption unless requested.
if (drop_size && report_eof_inconsistency) {
ReportCorruption(drop_size, "truncated header");
if (*drop_size) {
return kBadHeader;
}
return kEof;
}
@ -311,29 +380,29 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result,
// Check crc
if (checksum_) {
uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6);
if (actual_crc != expected_crc) {
// Drop the rest of the buffer since "length" itself may have
// been corrupted and if we trust it, we could find some
// fragment of a real log record that just happens to look
// like a valid log record.
size_t drop_size = buffer_.size();
*drop_size = buffer_.size();
buffer_.clear();
ReportCorruption(drop_size, "checksum mismatch");
ReportCorruption(*drop_size, "checksum mismatch");
return kBadRecord;
}
}
buffer_.remove_prefix(kHeaderSize + length);
buffer_.remove_prefix(header_size + length);
// Skip physical record that started before initial_offset_
if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length <
if (end_of_buffer_offset_ - buffer_.size() - header_size - length <
initial_offset_) {
result->clear();
return kBadRecord;
}
*result = Slice(header + kHeaderSize, length);
*result = Slice(header + header_size, length);
return type;
}
}

@ -14,10 +14,12 @@
#include "db/log_format.h"
#include "rocksdb/slice.h"
#include "rocksdb/status.h"
#include "rocksdb/options.h"
namespace rocksdb {
class SequentialFileReader;
class Logger;
using std::unique_ptr;
namespace log {
@ -51,8 +53,10 @@ class Reader {
//
// The Reader will start reading at the first record located at physical
// position >= initial_offset within the file.
Reader(unique_ptr<SequentialFileReader>&& file, Reporter* reporter,
bool checksum, uint64_t initial_offset);
Reader(std::shared_ptr<Logger> info_log,
unique_ptr<SequentialFileReader>&& file,
Reporter* reporter, bool checksum, uint64_t initial_offset,
uint64_t log_num);
~Reader();
@ -62,7 +66,8 @@ class Reader {
// will only be valid until the next mutating operation on this
// reader or the next mutation to *scratch.
bool ReadRecord(Slice* record, std::string* scratch,
bool report_eof_inconsistency = false);
WALRecoveryMode wal_recovery_mode =
WALRecoveryMode::kTolerateCorruptedTailRecords);
// Returns the physical offset of the last record returned by ReadRecord.
//
@ -84,6 +89,7 @@ class Reader {
SequentialFileReader* file() { return file_.get(); }
private:
std::shared_ptr<Logger> info_log_;
const unique_ptr<SequentialFileReader> file_;
Reporter* const reporter_;
bool const checksum_;
@ -104,6 +110,9 @@ class Reader {
// Offset at which to start looking for the first record to return
uint64_t const initial_offset_;
// which log number this is
uint64_t const log_number_;
// Extend record types with the following special values
enum {
kEof = kMaxRecordType + 1,
@ -112,7 +121,11 @@ class Reader {
// * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
// * The record is a 0-length record (No drop is reported)
// * The record is below constructor's initial_offset (No drop is reported)
kBadRecord = kMaxRecordType + 2
kBadRecord = kMaxRecordType + 2,
// Returned when we fail to read a valid header.
kBadHeader = kMaxRecordType + 3,
// Returned when we read an old record from a previous user of the log.
kOldRecord = kMaxRecordType + 4,
};
// Skips all blocks that are completely before "initial_offset_".
@ -121,8 +134,10 @@ class Reader {
bool SkipToInitialBlock();
// Return type, or one of the preceding special values
unsigned int ReadPhysicalRecord(Slice* result,
bool report_eof_inconsistency = false);
unsigned int ReadPhysicalRecord(Slice* result, size_t* drop_size);
// Read some more
bool ReadMore(size_t* drop_size, int *error);
// Reports dropped bytes to the reporter.
// buffer_ must be updated to remove the dropped bytes prior to invocation.

@ -43,7 +43,7 @@ static std::string RandomSkewedString(int i, Random* rnd) {
return BigString(NumberString(i), rnd->Skewed(17));
}
class LogTest : public testing::Test {
class LogTest : public ::testing::TestWithParam<int> {
private:
class StringSource : public SequentialFile {
public:
@ -153,19 +153,26 @@ class LogTest : public testing::Test {
// Record metadata for testing initial offset functionality
static size_t initial_offset_record_sizes_[];
static uint64_t initial_offset_last_record_offsets_[];
uint64_t initial_offset_last_record_offsets_[4];
public:
LogTest()
: reader_contents_(),
dest_holder_(
test::GetWritableFileWriter(
new test::StringSink(&reader_contents_))),
dest_holder_(test::GetWritableFileWriter(
new test::StringSink(&reader_contents_))),
source_holder_(
test::GetSequentialFileReader(new StringSource(reader_contents_))),
writer_(std::move(dest_holder_)),
reader_(std::move(source_holder_), &report_, true /*checksum*/,
0 /*initial_offset*/) {}
writer_(std::move(dest_holder_), 123, GetParam()),
reader_(NULL, std::move(source_holder_), &report_, true /*checksum*/,
0 /*initial_offset*/, 123) {
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
initial_offset_last_record_offsets_[0] = 0;
initial_offset_last_record_offsets_[1] = header_size + 10000;
initial_offset_last_record_offsets_[2] = 2 * (header_size + 10000);
initial_offset_last_record_offsets_[3] = 2 * (header_size + 10000) +
(2 * log::kBlockSize - 1000) +
3 * header_size;
}
void Write(const std::string& msg) {
writer_.AddRecord(Slice(msg));
@ -175,10 +182,11 @@ class LogTest : public testing::Test {
return dest_contents().size();
}
std::string Read(const bool report_eof_inconsistency = false) {
std::string Read(const WALRecoveryMode wal_recovery_mode =
WALRecoveryMode::kTolerateCorruptedTailRecords) {
std::string scratch;
Slice record;
if (reader_.ReadRecord(&record, &scratch, report_eof_inconsistency)) {
if (reader_.ReadRecord(&record, &scratch, wal_recovery_mode)) {
return record.ToString();
} else {
return "EOF";
@ -200,9 +208,11 @@ class LogTest : public testing::Test {
dest->Drop(bytes);
}
void FixChecksum(int header_offset, int len) {
void FixChecksum(int header_offset, int len, bool recyclable) {
// Compute crc of type/len/data
uint32_t crc = crc32c::Value(&dest_contents()[header_offset+6], 1 + len);
int header_size = recyclable ? kRecyclableHeaderSize : kHeaderSize;
uint32_t crc = crc32c::Value(&dest_contents()[header_offset + 6],
header_size - 6 + len);
crc = crc32c::Mask(crc);
EncodeFixed32(&dest_contents()[header_offset], crc);
}
@ -259,8 +269,8 @@ class LogTest : public testing::Test {
unique_ptr<SequentialFileReader> file_reader(
test::GetSequentialFileReader(new StringSource(reader_contents_)));
unique_ptr<Reader> offset_reader(
new Reader(std::move(file_reader), &report_, true /*checksum*/,
WrittenBytes() + offset_past_end));
new Reader(NULL, std::move(file_reader), &report_,
true /*checksum*/, WrittenBytes() + offset_past_end, 123));
Slice record;
std::string scratch;
ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch));
@ -271,8 +281,9 @@ class LogTest : public testing::Test {
WriteInitialOffsetLog();
unique_ptr<SequentialFileReader> file_reader(
test::GetSequentialFileReader(new StringSource(reader_contents_)));
unique_ptr<Reader> offset_reader(new Reader(
std::move(file_reader), &report_, true /*checksum*/, initial_offset));
unique_ptr<Reader> offset_reader(
new Reader(NULL, std::move(file_reader), &report_,
true /*checksum*/, initial_offset, 123));
Slice record;
std::string scratch;
ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch));
@ -291,16 +302,9 @@ size_t LogTest::initial_offset_record_sizes_[] =
2 * log::kBlockSize - 1000, // Span three blocks
1};
uint64_t LogTest::initial_offset_last_record_offsets_[] =
{0,
kHeaderSize + 10000,
2 * (kHeaderSize + 10000),
2 * (kHeaderSize + 10000) +
(2 * log::kBlockSize - 1000) + 3 * kHeaderSize};
TEST_F(LogTest, Empty) { ASSERT_EQ("EOF", Read()); }
TEST_P(LogTest, Empty) { ASSERT_EQ("EOF", Read()); }
TEST_F(LogTest, ReadWrite) {
TEST_P(LogTest, ReadWrite) {
Write("foo");
Write("bar");
Write("");
@ -313,7 +317,7 @@ TEST_F(LogTest, ReadWrite) {
ASSERT_EQ("EOF", Read()); // Make sure reads at eof work
}
TEST_F(LogTest, ManyBlocks) {
TEST_P(LogTest, ManyBlocks) {
for (int i = 0; i < 100000; i++) {
Write(NumberString(i));
}
@ -323,7 +327,7 @@ TEST_F(LogTest, ManyBlocks) {
ASSERT_EQ("EOF", Read());
}
TEST_F(LogTest, Fragmentation) {
TEST_P(LogTest, Fragmentation) {
Write("small");
Write(BigString("medium", 50000));
Write(BigString("large", 100000));
@ -333,11 +337,12 @@ TEST_F(LogTest, Fragmentation) {
ASSERT_EQ("EOF", Read());
}
TEST_F(LogTest, MarginalTrailer) {
TEST_P(LogTest, MarginalTrailer) {
// Make a trailer that is exactly the same length as an empty record.
const int n = kBlockSize - 2*kHeaderSize;
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
const int n = kBlockSize - 2 * header_size;
Write(BigString("foo", n));
ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes());
ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes());
Write("");
Write("bar");
ASSERT_EQ(BigString("foo", n), Read());
@ -346,11 +351,12 @@ TEST_F(LogTest, MarginalTrailer) {
ASSERT_EQ("EOF", Read());
}
TEST_F(LogTest, MarginalTrailer2) {
TEST_P(LogTest, MarginalTrailer2) {
// Make a trailer that is exactly the same length as an empty record.
const int n = kBlockSize - 2*kHeaderSize;
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
const int n = kBlockSize - 2 * header_size;
Write(BigString("foo", n));
ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes());
ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes());
Write("bar");
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_EQ("bar", Read());
@ -359,10 +365,11 @@ TEST_F(LogTest, MarginalTrailer2) {
ASSERT_EQ("", ReportMessage());
}
TEST_F(LogTest, ShortTrailer) {
const int n = kBlockSize - 2*kHeaderSize + 4;
TEST_P(LogTest, ShortTrailer) {
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
const int n = kBlockSize - 2 * header_size + 4;
Write(BigString("foo", n));
ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes());
Write("");
Write("bar");
ASSERT_EQ(BigString("foo", n), Read());
@ -371,15 +378,16 @@ TEST_F(LogTest, ShortTrailer) {
ASSERT_EQ("EOF", Read());
}
TEST_F(LogTest, AlignedEof) {
const int n = kBlockSize - 2*kHeaderSize + 4;
TEST_P(LogTest, AlignedEof) {
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
const int n = kBlockSize - 2 * header_size + 4;
Write(BigString("foo", n));
ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes());
ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes());
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_EQ("EOF", Read());
}
TEST_F(LogTest, RandomRead) {
TEST_P(LogTest, RandomRead) {
const int N = 500;
Random write_rnd(301);
for (int i = 0; i < N; i++) {
@ -394,7 +402,7 @@ TEST_F(LogTest, RandomRead) {
// Tests of all the error paths in log_reader.cc follow:
TEST_F(LogTest, ReadError) {
TEST_P(LogTest, ReadError) {
Write("foo");
ForceError();
ASSERT_EQ("EOF", Read());
@ -402,17 +410,17 @@ TEST_F(LogTest, ReadError) {
ASSERT_EQ("OK", MatchError("read error"));
}
TEST_F(LogTest, BadRecordType) {
TEST_P(LogTest, BadRecordType) {
Write("foo");
// Type is stored in header[6]
IncrementByte(6, 100);
FixChecksum(0, 3);
FixChecksum(0, 3, false);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3U, DroppedBytes());
ASSERT_EQ("OK", MatchError("unknown record type"));
}
TEST_F(LogTest, TruncatedTrailingRecordIsIgnored) {
TEST_P(LogTest, TruncatedTrailingRecordIsIgnored) {
Write("foo");
ShrinkSize(4); // Drop all payload as well as a header byte
ASSERT_EQ("EOF", Read());
@ -421,17 +429,18 @@ TEST_F(LogTest, TruncatedTrailingRecordIsIgnored) {
ASSERT_EQ("", ReportMessage());
}
TEST_F(LogTest, TruncatedTrailingRecordIsNotIgnored) {
TEST_P(LogTest, TruncatedTrailingRecordIsNotIgnored) {
Write("foo");
ShrinkSize(4); // Drop all payload as well as a header byte
ASSERT_EQ("EOF", Read(/*report_eof_inconsistency*/ true));
ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
// Truncated last record is ignored, not treated as an error
ASSERT_GT(DroppedBytes(), 0U);
ASSERT_EQ("OK", MatchError("Corruption: truncated header"));
}
TEST_F(LogTest, BadLength) {
const int kPayloadSize = kBlockSize - kHeaderSize;
TEST_P(LogTest, BadLength) {
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
const int kPayloadSize = kBlockSize - header_size;
Write(BigString("bar", kPayloadSize));
Write("foo");
// Least significant size byte is stored in header[4].
@ -441,7 +450,7 @@ TEST_F(LogTest, BadLength) {
ASSERT_EQ("OK", MatchError("bad record length"));
}
TEST_F(LogTest, BadLengthAtEndIsIgnored) {
TEST_P(LogTest, BadLengthAtEndIsIgnored) {
Write("foo");
ShrinkSize(1);
ASSERT_EQ("EOF", Read());
@ -449,63 +458,63 @@ TEST_F(LogTest, BadLengthAtEndIsIgnored) {
ASSERT_EQ("", ReportMessage());
}
TEST_F(LogTest, BadLengthAtEndIsNotIgnored) {
TEST_P(LogTest, BadLengthAtEndIsNotIgnored) {
Write("foo");
ShrinkSize(1);
ASSERT_EQ("EOF", Read(/*report_eof_inconsistency=*/true));
ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
ASSERT_GT(DroppedBytes(), 0U);
ASSERT_EQ("OK", MatchError("Corruption: truncated header"));
}
TEST_F(LogTest, ChecksumMismatch) {
Write("foo");
IncrementByte(0, 10);
TEST_P(LogTest, ChecksumMismatch) {
Write("foooooo");
IncrementByte(0, 14);
ASSERT_EQ("EOF", Read());
ASSERT_EQ(10U, DroppedBytes());
ASSERT_EQ(14U + 4 * !!GetParam(), DroppedBytes());
ASSERT_EQ("OK", MatchError("checksum mismatch"));
}
TEST_F(LogTest, UnexpectedMiddleType) {
TEST_P(LogTest, UnexpectedMiddleType) {
Write("foo");
SetByte(6, kMiddleType);
FixChecksum(0, 3);
SetByte(6, GetParam() ? kRecyclableMiddleType : kMiddleType);
FixChecksum(0, 3, !!GetParam());
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3U, DroppedBytes());
ASSERT_EQ("OK", MatchError("missing start"));
}
TEST_F(LogTest, UnexpectedLastType) {
TEST_P(LogTest, UnexpectedLastType) {
Write("foo");
SetByte(6, kLastType);
FixChecksum(0, 3);
SetByte(6, GetParam() ? kRecyclableLastType : kLastType);
FixChecksum(0, 3, !!GetParam());
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3U, DroppedBytes());
ASSERT_EQ("OK", MatchError("missing start"));
}
TEST_F(LogTest, UnexpectedFullType) {
TEST_P(LogTest, UnexpectedFullType) {
Write("foo");
Write("bar");
SetByte(6, kFirstType);
FixChecksum(0, 3);
SetByte(6, GetParam() ? kRecyclableFirstType : kFirstType);
FixChecksum(0, 3, !!GetParam());
ASSERT_EQ("bar", Read());
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3U, DroppedBytes());
ASSERT_EQ("OK", MatchError("partial record without end"));
}
TEST_F(LogTest, UnexpectedFirstType) {
TEST_P(LogTest, UnexpectedFirstType) {
Write("foo");
Write(BigString("bar", 100000));
SetByte(6, kFirstType);
FixChecksum(0, 3);
SetByte(6, GetParam() ? kRecyclableFirstType : kFirstType);
FixChecksum(0, 3, !!GetParam());
ASSERT_EQ(BigString("bar", 100000), Read());
ASSERT_EQ("EOF", Read());
ASSERT_EQ(3U, DroppedBytes());
ASSERT_EQ("OK", MatchError("partial record without end"));
}
TEST_F(LogTest, MissingLastIsIgnored) {
TEST_P(LogTest, MissingLastIsIgnored) {
Write(BigString("bar", kBlockSize));
// Remove the LAST block, including header.
ShrinkSize(14);
@ -514,16 +523,16 @@ TEST_F(LogTest, MissingLastIsIgnored) {
ASSERT_EQ(0U, DroppedBytes());
}
TEST_F(LogTest, MissingLastIsNotIgnored) {
TEST_P(LogTest, MissingLastIsNotIgnored) {
Write(BigString("bar", kBlockSize));
// Remove the LAST block, including header.
ShrinkSize(14);
ASSERT_EQ("EOF", Read(/*report_eof_inconsistency=*/true));
ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
ASSERT_GT(DroppedBytes(), 0U);
ASSERT_EQ("OK", MatchError("Corruption: error reading trailing data"));
}
TEST_F(LogTest, PartialLastIsIgnored) {
TEST_P(LogTest, PartialLastIsIgnored) {
Write(BigString("bar", kBlockSize));
// Cause a bad record length in the LAST block.
ShrinkSize(1);
@ -532,18 +541,18 @@ TEST_F(LogTest, PartialLastIsIgnored) {
ASSERT_EQ(0U, DroppedBytes());
}
TEST_F(LogTest, PartialLastIsNotIgnored) {
TEST_P(LogTest, PartialLastIsNotIgnored) {
Write(BigString("bar", kBlockSize));
// Cause a bad record length in the LAST block.
ShrinkSize(1);
ASSERT_EQ("EOF", Read(/*report_eof_inconsistency=*/true));
ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency));
ASSERT_GT(DroppedBytes(), 0U);
ASSERT_EQ("OK", MatchError(
"Corruption: truncated headerCorruption: "
"error reading trailing data"));
}
TEST_F(LogTest, ErrorJoinsRecords) {
TEST_P(LogTest, ErrorJoinsRecords) {
// Consider two fragmented records:
// first(R1) last(R1) first(R2) last(R2)
// where the middle two fragments disappear. We do not want
@ -566,46 +575,60 @@ TEST_F(LogTest, ErrorJoinsRecords) {
ASSERT_GE(dropped, 2 * kBlockSize);
}
TEST_F(LogTest, ReadStart) { CheckInitialOffsetRecord(0, 0); }
TEST_P(LogTest, ReadStart) { CheckInitialOffsetRecord(0, 0); }
TEST_F(LogTest, ReadSecondOneOff) { CheckInitialOffsetRecord(1, 1); }
TEST_P(LogTest, ReadSecondOneOff) { CheckInitialOffsetRecord(1, 1); }
TEST_F(LogTest, ReadSecondTenThousand) { CheckInitialOffsetRecord(10000, 1); }
TEST_P(LogTest, ReadSecondTenThousand) { CheckInitialOffsetRecord(10000, 1); }
TEST_F(LogTest, ReadSecondStart) { CheckInitialOffsetRecord(10007, 1); }
TEST_P(LogTest, ReadSecondStart) {
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
CheckInitialOffsetRecord(10000 + header_size, 1);
}
TEST_F(LogTest, ReadThirdOneOff) { CheckInitialOffsetRecord(10008, 2); }
TEST_P(LogTest, ReadThirdOneOff) {
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
CheckInitialOffsetRecord(10000 + header_size + 1, 2);
}
TEST_F(LogTest, ReadThirdStart) { CheckInitialOffsetRecord(20014, 2); }
TEST_P(LogTest, ReadThirdStart) {
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
CheckInitialOffsetRecord(20000 + 2 * header_size, 2);
}
TEST_F(LogTest, ReadFourthOneOff) { CheckInitialOffsetRecord(20015, 3); }
TEST_P(LogTest, ReadFourthOneOff) {
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
CheckInitialOffsetRecord(20000 + 2 * header_size + 1, 3);
}
TEST_F(LogTest, ReadFourthFirstBlockTrailer) {
TEST_P(LogTest, ReadFourthFirstBlockTrailer) {
CheckInitialOffsetRecord(log::kBlockSize - 4, 3);
}
TEST_F(LogTest, ReadFourthMiddleBlock) {
TEST_P(LogTest, ReadFourthMiddleBlock) {
CheckInitialOffsetRecord(log::kBlockSize + 1, 3);
}
TEST_F(LogTest, ReadFourthLastBlock) {
TEST_P(LogTest, ReadFourthLastBlock) {
CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3);
}
TEST_F(LogTest, ReadFourthStart) {
TEST_P(LogTest, ReadFourthStart) {
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
CheckInitialOffsetRecord(
2 * (kHeaderSize + 1000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize,
2 * (header_size + 1000) + (2 * log::kBlockSize - 1000) + 3 * header_size,
3);
}
TEST_F(LogTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); }
TEST_P(LogTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); }
TEST_F(LogTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); }
TEST_P(LogTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); }
TEST_F(LogTest, ClearEofSingleBlock) {
TEST_P(LogTest, ClearEofSingleBlock) {
Write("foo");
Write("bar");
ForceEOF(3 + kHeaderSize + 2);
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
ForceEOF(3 + header_size + 2);
ASSERT_EQ("foo", Read());
UnmarkEOF();
ASSERT_EQ("bar", Read());
@ -617,12 +640,13 @@ TEST_F(LogTest, ClearEofSingleBlock) {
ASSERT_TRUE(IsEOF());
}
TEST_F(LogTest, ClearEofMultiBlock) {
TEST_P(LogTest, ClearEofMultiBlock) {
size_t num_full_blocks = 5;
size_t n = (kBlockSize - kHeaderSize) * num_full_blocks + 25;
int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize;
size_t n = (kBlockSize - header_size) * num_full_blocks + 25;
Write(BigString("foo", n));
Write(BigString("bar", n));
ForceEOF(n + num_full_blocks * kHeaderSize + 10);
ForceEOF(n + num_full_blocks * header_size + header_size + 3);
ASSERT_EQ(BigString("foo", n), Read());
ASSERT_TRUE(IsEOF());
UnmarkEOF();
@ -634,7 +658,7 @@ TEST_F(LogTest, ClearEofMultiBlock) {
ASSERT_TRUE(IsEOF());
}
TEST_F(LogTest, ClearEofError) {
TEST_P(LogTest, ClearEofError) {
// If an error occurs during Read() in UnmarkEOF(), the records contained
// in the buffer should be returned on subsequent calls of ReadRecord()
// until no more full records are left, whereafter ReadRecord() should return
@ -652,7 +676,7 @@ TEST_F(LogTest, ClearEofError) {
ASSERT_EQ("EOF", Read());
}
TEST_F(LogTest, ClearEofError2) {
TEST_P(LogTest, ClearEofError2) {
Write("foo");
Write("bar");
UnmarkEOF();
@ -666,6 +690,8 @@ TEST_F(LogTest, ClearEofError2) {
ASSERT_EQ("OK", MatchError("read error"));
}
INSTANTIATE_TEST_CASE_P(bool, LogTest, ::testing::Values(0, 2));
} // namespace log
} // namespace rocksdb

@ -18,8 +18,12 @@
namespace rocksdb {
namespace log {
Writer::Writer(unique_ptr<WritableFileWriter>&& dest)
: dest_(std::move(dest)), block_offset_(0) {
Writer::Writer(unique_ptr<WritableFileWriter>&& dest,
uint64_t log_number, bool recycle_log_files)
: dest_(std::move(dest)),
block_offset_(0),
log_number_(log_number),
recycle_log_files_(recycle_log_files) {
for (int i = 0; i <= kMaxRecordType; i++) {
char t = static_cast<char>(i);
type_crc_[i] = crc32c::Value(&t, 1);
@ -33,6 +37,10 @@ Status Writer::AddRecord(const Slice& slice) {
const char* ptr = slice.data();
size_t left = slice.size();
// Header size varies depending on whether we are recycling or not.
const int header_size =
recycle_log_files_ ? kRecyclableHeaderSize : kHeaderSize;
// Fragment the record if necessary and emit it. Note that if slice
// is empty, we still want to iterate once to emit a single
// zero-length record
@ -41,32 +49,34 @@ Status Writer::AddRecord(const Slice& slice) {
do {
const int leftover = kBlockSize - block_offset_;
assert(leftover >= 0);
if (leftover < kHeaderSize) {
if (leftover < header_size) {
// Switch to a new block
if (leftover > 0) {
// Fill the trailer (literal below relies on kHeaderSize being 7)
assert(kHeaderSize == 7);
dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover));
// Fill the trailer (literal below relies on kHeaderSize and
// kRecyclableHeaderSize being <= 11)
assert(header_size <= 11);
dest_->Append(
Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", leftover));
}
block_offset_ = 0;
}
// Invariant: we never leave < kHeaderSize bytes in a block.
assert(static_cast<int>(kBlockSize) - block_offset_ >= kHeaderSize);
// Invariant: we never leave < header_size bytes in a block.
assert(static_cast<int>(kBlockSize) - block_offset_ >= header_size);
const size_t avail = kBlockSize - block_offset_ - kHeaderSize;
const size_t avail = kBlockSize - block_offset_ - header_size;
const size_t fragment_length = (left < avail) ? left : avail;
RecordType type;
const bool end = (left == fragment_length);
if (begin && end) {
type = kFullType;
type = recycle_log_files_ ? kRecyclableFullType : kFullType;
} else if (begin) {
type = kFirstType;
type = recycle_log_files_ ? kRecyclableFirstType : kFirstType;
} else if (end) {
type = kLastType;
type = recycle_log_files_ ? kRecyclableLastType : kLastType;
} else {
type = kMiddleType;
type = recycle_log_files_ ? kRecyclableMiddleType : kMiddleType;
}
s = EmitPhysicalRecord(type, ptr, fragment_length);
@ -79,28 +89,48 @@ Status Writer::AddRecord(const Slice& slice) {
Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) {
assert(n <= 0xffff); // Must fit in two bytes
assert(block_offset_ + kHeaderSize + n <= kBlockSize);
size_t header_size;
char buf[kRecyclableHeaderSize];
// Format the header
char buf[kHeaderSize];
buf[4] = static_cast<char>(n & 0xff);
buf[5] = static_cast<char>(n >> 8);
buf[6] = static_cast<char>(t);
uint32_t crc = type_crc_[t];
if (t < kRecyclableFullType) {
// Legacy record format
assert(block_offset_ + kHeaderSize + n <= kBlockSize);
header_size = kHeaderSize;
} else {
// Recyclable record format
assert(block_offset_ + kRecyclableHeaderSize + n <= kBlockSize);
header_size = kRecyclableHeaderSize;
// Only encode low 32-bits of the 64-bit log number. This means
// we will fail to detect an old record if we recycled a log from
// ~4 billion logs ago, but that is effectively impossible, and
// even if it were we'dbe far more likely to see a false positive
// on the 32-bit CRC.
EncodeFixed32(buf + 7, static_cast<uint32_t>(log_number_));
crc = crc32c::Extend(crc, buf + 7, 4);
}
// Compute the crc of the record type and the payload.
uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);
crc = crc32c::Mask(crc); // Adjust for storage
crc = crc32c::Extend(crc, ptr, n);
crc = crc32c::Mask(crc); // Adjust for storage
EncodeFixed32(buf, crc);
// Write the header and the payload
Status s = dest_->Append(Slice(buf, kHeaderSize));
Status s = dest_->Append(Slice(buf, header_size));
if (s.ok()) {
s = dest_->Append(Slice(ptr, n));
if (s.ok()) {
s = dest_->Flush();
}
}
block_offset_ += kHeaderSize + n;
block_offset_ += header_size + n;
return s;
}

@ -43,7 +43,7 @@ namespace log {
* Data is written out in kBlockSize chunks. If next record does not fit
* into the space left, the leftover space will be padded with \0.
*
* Record format:
* Legacy record format:
*
* +---------+-----------+-----------+--- ... ---+
* |CRC (4B) | Size (2B) | Type (1B) | Payload |
@ -57,13 +57,23 @@ namespace log {
* blocks that are larger than kBlockSize
* Payload = Byte stream as long as specified by the payload size
*
* Recyclable record format:
*
* +---------+-----------+-----------+----------------+--- ... ---+
* |CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload |
* +---------+-----------+-----------+----------------+--- ... ---+
*
* Same as above, with the addition of
* Log number = 32bit log file number, so that we can distinguish between
* records written by the most recent log writer vs a previous one.
*/
class Writer {
public:
// Create a writer that will append data to "*dest".
// "*dest" must be initially empty.
// "*dest" must remain live while this Writer is in use.
explicit Writer(unique_ptr<WritableFileWriter>&& dest);
explicit Writer(unique_ptr<WritableFileWriter>&& dest,
uint64_t log_number, bool recycle_log_files);
~Writer();
Status AddRecord(const Slice& slice);
@ -74,6 +84,8 @@ class Writer {
private:
unique_ptr<WritableFileWriter> dest_;
int block_offset_; // Current offset in block
uint64_t log_number_;
bool recycle_log_files_;
// crc32c values for all supported record types. These are
// pre-computed to reduce the overhead of computing the crc of the

@ -249,8 +249,8 @@ class Repairer {
// corruptions cause entire commits to be skipped instead of
// propagating bad information (like overly large sequence
// numbers).
log::Reader reader(std::move(lfile_reader), &reporter,
true /*enable checksum*/, 0 /*initial_offset*/);
log::Reader reader(options_.info_log, std::move(lfile_reader), &reporter,
true /*enable checksum*/, 0 /*initial_offset*/, log);
// Read all the records and add to a memtable
std::string scratch;
@ -413,7 +413,7 @@ class Repairer {
{
unique_ptr<WritableFileWriter> file_writer(
new WritableFileWriter(std::move(file), env_options));
log::Writer log(std::move(file_writer));
log::Writer log(std::move(file_writer), 0, false);
std::string record;
edit_->EncodeTo(&record);
status = log.AddRecord(record);

@ -262,8 +262,10 @@ Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) {
return s;
}
assert(file);
currentLogReader_.reset(new log::Reader(std::move(file), &reporter_,
read_options_.verify_checksums_, 0));
currentLogReader_.reset(new log::Reader(options_->info_log,
std::move(file), &reporter_,
read_options_.verify_checksums_, 0,
logFile->LogNumber()));
return Status::OK();
}
} // namespace rocksdb

@ -82,6 +82,7 @@ class VersionBuilder::Rep {
};
const EnvOptions& env_options_;
Logger* info_log_;
TableCache* table_cache_;
VersionStorageInfo* base_vstorage_;
LevelState* levels_;
@ -89,9 +90,10 @@ class VersionBuilder::Rep {
FileComparator level_nonzero_cmp_;
public:
Rep(const EnvOptions& env_options, TableCache* table_cache,
Rep(const EnvOptions& env_options, Logger* info_log, TableCache* table_cache,
VersionStorageInfo* base_vstorage)
: env_options_(env_options),
info_log_(info_log),
table_cache_(table_cache),
base_vstorage_(base_vstorage) {
levels_ = new LevelState[base_vstorage_->num_levels()];
@ -335,15 +337,16 @@ class VersionBuilder::Rep {
if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) {
// File is deleted: do nothing
} else {
vstorage->AddFile(level, f);
vstorage->AddFile(level, f, info_log_);
}
}
};
VersionBuilder::VersionBuilder(const EnvOptions& env_options,
TableCache* table_cache,
VersionStorageInfo* base_vstorage)
: rep_(new Rep(env_options, table_cache, base_vstorage)) {}
VersionStorageInfo* base_vstorage,
Logger* info_log)
: rep_(new Rep(env_options, info_log, table_cache, base_vstorage)) {}
VersionBuilder::~VersionBuilder() { delete rep_; }
void VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) {
rep_->CheckConsistency(vstorage);

@ -24,7 +24,7 @@ class InternalStats;
class VersionBuilder {
public:
VersionBuilder(const EnvOptions& env_options, TableCache* table_cache,
VersionStorageInfo* base_vstorage);
VersionStorageInfo* base_vstorage, Logger* info_log = nullptr);
~VersionBuilder();
void CheckConsistency(VersionStorageInfo* vstorage);
void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number,

@ -524,7 +524,7 @@ class BaseReferencedVersionBuilder {
explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd)
: version_builder_(new VersionBuilder(
cfd->current()->version_set()->env_options(), cfd->table_cache(),
cfd->current()->storage_info())),
cfd->current()->storage_info(), cfd->ioptions()->info_log)),
version_(cfd->current()) {
version_->Ref();
}
@ -1262,13 +1262,27 @@ bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) {
}
} // anonymous namespace
void VersionStorageInfo::AddFile(int level, FileMetaData* f) {
void VersionStorageInfo::AddFile(int level, FileMetaData* f, Logger* info_log) {
auto* level_files = &files_[level];
// Must not overlap
assert(level <= 0 || level_files->empty() ||
internal_comparator_->Compare(
(*level_files)[level_files->size() - 1]->largest, f->smallest) <
0);
#ifndef NDEBUG
if (level > 0 && !level_files->empty() &&
internal_comparator_->Compare(
(*level_files)[level_files->size() - 1]->largest, f->smallest) >= 0) {
auto* f2 = (*level_files)[level_files->size() - 1];
if (info_log != nullptr) {
Error(info_log, "Adding new file %" PRIu64
" range (%s, %s) to level %d but overlapping "
"with existing file %" PRIu64 " %s %s",
f->fd.GetNumber(), f->smallest.DebugString(true).c_str(),
f->largest.DebugString(true).c_str(), level, f2->fd.GetNumber(),
f2->smallest.DebugString(true).c_str(),
f2->largest.DebugString(true).c_str());
LogFlush(info_log);
}
assert(false);
}
#endif
f->refs++;
level_files->push_back(f);
}
@ -2112,7 +2126,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
unique_ptr<WritableFileWriter> file_writer(
new WritableFileWriter(std::move(descriptor_file), opt_env_opts));
descriptor_log_.reset(new log::Writer(std::move(file_writer)));
descriptor_log_.reset(new log::Writer(std::move(file_writer), 0, false));
s = WriteSnapshot(descriptor_log_.get());
}
}
@ -2376,8 +2390,8 @@ Status VersionSet::Recover(
{
VersionSet::LogReporter reporter;
reporter.status = &s;
log::Reader reader(std::move(manifest_file_reader), &reporter,
true /*checksum*/, 0 /*initial_offset*/);
log::Reader reader(NULL, std::move(manifest_file_reader), &reporter,
true /*checksum*/, 0 /*initial_offset*/, 0);
Slice record;
std::string scratch;
while (reader.ReadRecord(&record, &scratch) && s.ok()) {
@ -2629,8 +2643,8 @@ Status VersionSet::ListColumnFamilies(std::vector<std::string>* column_families,
column_family_names.insert({0, kDefaultColumnFamilyName});
VersionSet::LogReporter reporter;
reporter.status = &s;
log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/,
0 /*initial_offset*/);
log::Reader reader(NULL, std::move(file_reader), &reporter, true /*checksum*/,
0 /*initial_offset*/, 0);
Slice record;
std::string scratch;
while (reader.ReadRecord(&record, &scratch) && s.ok()) {
@ -2787,8 +2801,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname,
{
VersionSet::LogReporter reporter;
reporter.status = &s;
log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/,
0 /*initial_offset*/);
log::Reader reader(NULL, std::move(file_reader), &reporter,
true /*checksum*/, 0 /*initial_offset*/, 0);
Slice record;
std::string scratch;
while (reader.ReadRecord(&record, &scratch) && s.ok()) {
@ -3040,7 +3054,8 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_num,
}
file_reader.reset(new SequentialFileReader(std::move(file)));
}
log::Reader reader(std::move(file_reader), nullptr, true /*checksum*/, 0);
log::Reader reader(NULL, std::move(file_reader), nullptr,
true /*checksum*/, 0, 0);
Slice r;
std::string scratch;
bool result = false;

@ -97,7 +97,7 @@ class VersionStorageInfo {
void Reserve(int level, size_t size) { files_[level].reserve(size); }
void AddFile(int level, FileMetaData* f);
void AddFile(int level, FileMetaData* f, Logger* info_log = nullptr);
void SetFinalized();

@ -448,8 +448,8 @@ Status WalManager::ReadFirstLine(const std::string& fname,
reporter.fname = fname.c_str();
reporter.status = &status;
reporter.ignore_error = !db_options_.paranoid_checks;
log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/,
0 /*initial_offset*/);
log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter,
true /*checksum*/, 0 /*initial_offset*/, *sequence);
std::string scratch;
Slice record;

@ -77,7 +77,7 @@ class WalManagerTest : public testing::Test {
ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_));
unique_ptr<WritableFileWriter> file_writer(
new WritableFileWriter(std::move(file), env_options_));
current_log_writer_.reset(new log::Writer(std::move(file_writer)));
current_log_writer_.reset(new log::Writer(std::move(file_writer), 0, false));
}
void CreateArchiveLogs(int num_logs, int entries_per_log) {
@ -127,7 +127,8 @@ TEST_F(WalManagerTest, ReadFirstRecordCache) {
unique_ptr<WritableFileWriter> file_writer(
new WritableFileWriter(std::move(file), EnvOptions()));
log::Writer writer(std::move(file_writer));
log::Writer writer(std::move(file_writer), 1,
db_options_.recycle_log_file_num > 0);
WriteBatch batch;
batch.Put("foo", "bar");
WriteBatchInternal::SetSequence(&batch, 10);

@ -0,0 +1,180 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <unistd.h>
#include "rocksdb/env.h"
// For non linux platform, the following macros are used only as place
// holder.
#if !(defined OS_LINUX) && !(defined CYGWIN)
#define POSIX_FADV_NORMAL 0 /* [MC1] no further special treatment */
#define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */
#define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
#define POSIX_FADV_WILLNEED 3 /* [MC1] will need these pages */
#define POSIX_FADV_DONTNEED 4 /* [MC1] dont need these pages */
#endif
namespace rocksdb {
static Status IOError(const std::string& context, int err_number) {
return Status::IOError(context, strerror(err_number));
}
class PosixSequentialFile : public SequentialFile {
private:
std::string filename_;
FILE* file_;
int fd_;
bool use_os_buffer_;
public:
PosixSequentialFile(const std::string& fname, FILE* f,
const EnvOptions& options);
virtual ~PosixSequentialFile();
virtual Status Read(size_t n, Slice* result, char* scratch) override;
virtual Status Skip(uint64_t n) override;
virtual Status InvalidateCache(size_t offset, size_t length) override;
};
class PosixRandomAccessFile : public RandomAccessFile {
private:
std::string filename_;
int fd_;
bool use_os_buffer_;
public:
PosixRandomAccessFile(const std::string& fname, int fd,
const EnvOptions& options);
virtual ~PosixRandomAccessFile();
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const override;
#ifdef OS_LINUX
virtual size_t GetUniqueId(char* id, size_t max_size) const override;
#endif
virtual void Hint(AccessPattern pattern) override;
virtual Status InvalidateCache(size_t offset, size_t length) override;
};
class PosixWritableFile : public WritableFile {
private:
const std::string filename_;
int fd_;
uint64_t filesize_;
#ifdef ROCKSDB_FALLOCATE_PRESENT
bool allow_fallocate_;
bool fallocate_with_keep_size_;
#endif
public:
PosixWritableFile(const std::string& fname, int fd,
const EnvOptions& options);
~PosixWritableFile();
// Means Close() will properly take care of truncate
// and it does not need any additional information
virtual Status Truncate(uint64_t size) override { return Status::OK(); }
virtual Status Close() override;
virtual Status Append(const Slice& data) override;
virtual Status Flush() override;
virtual Status Sync() override;
virtual Status Fsync() override;
virtual bool IsSyncThreadSafe() const override;
virtual uint64_t GetFileSize() override;
virtual Status InvalidateCache(size_t offset, size_t length) override;
#ifdef ROCKSDB_FALLOCATE_PRESENT
virtual Status Allocate(off_t offset, off_t len) override;
virtual Status RangeSync(off_t offset, off_t nbytes) override;
virtual size_t GetUniqueId(char* id, size_t max_size) const override;
#endif
};
class PosixMmapReadableFile : public RandomAccessFile {
private:
int fd_;
std::string filename_;
void* mmapped_region_;
size_t length_;
public:
PosixMmapReadableFile(const int fd, const std::string& fname, void* base,
size_t length, const EnvOptions& options);
virtual ~PosixMmapReadableFile();
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const override;
virtual Status InvalidateCache(size_t offset, size_t length) override;
};
class PosixMmapFile : public WritableFile {
private:
std::string filename_;
int fd_;
size_t page_size_;
size_t map_size_; // How much extra memory to map at a time
char* base_; // The mapped region
char* limit_; // Limit of the mapped region
char* dst_; // Where to write next (in range [base_,limit_])
char* last_sync_; // Where have we synced up to
uint64_t file_offset_; // Offset of base_ in file
#ifdef ROCKSDB_FALLOCATE_PRESENT
bool allow_fallocate_; // If false, fallocate calls are bypassed
bool fallocate_with_keep_size_;
#endif
// Roundup x to a multiple of y
static size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; }
size_t TruncateToPageBoundary(size_t s) {
s -= (s & (page_size_ - 1));
assert((s % page_size_) == 0);
return s;
}
Status MapNewRegion();
Status UnmapCurrentRegion();
Status Msync();
public:
PosixMmapFile(const std::string& fname, int fd, size_t page_size,
const EnvOptions& options);
~PosixMmapFile();
// Means Close() will properly take care of truncate
// and it does not need any additional information
virtual Status Truncate(uint64_t size) override { return Status::OK(); }
virtual Status Close() override;
virtual Status Append(const Slice& data) override;
virtual Status Flush() override;
virtual Status Sync() override;
virtual Status Fsync() override;
virtual uint64_t GetFileSize() override;
virtual Status InvalidateCache(size_t offset, size_t length) override;
#ifdef ROCKSDB_FALLOCATE_PRESENT
virtual Status Allocate(off_t offset, off_t len) override;
#endif
};
class PosixDirectory : public Directory {
public:
explicit PosixDirectory(int fd) : fd_(fd) {}
~PosixDirectory() { close(fd_); }
virtual Status Fsync() override {
if (fsync(fd_) == -1) {
return IOError("directory", errno);
}
return Status::OK();
}
private:
int fd_;
};
} // namespace rocksdb

@ -572,6 +572,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_log_file_time_to_roll(
rocksdb_options_t*, size_t);
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_keep_log_file_num(
rocksdb_options_t*, size_t);
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_recycle_log_file_num(
rocksdb_options_t*, size_t);
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_soft_rate_limit(
rocksdb_options_t*, double);
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hard_rate_limit(

@ -139,6 +139,12 @@ class Env {
unique_ptr<WritableFile>* result,
const EnvOptions& options) = 0;
// Reuse an existing file by renaming it and opening it as writable.
virtual Status ReuseWritableFile(const std::string& fname,
const std::string& old_fname,
unique_ptr<WritableFile>* result,
const EnvOptions& options);
// Create an object that represents a directory. Will fail if directory
// doesn't exist. If the directory exists, it will open the directory
// and create a new Directory object.

@ -958,6 +958,16 @@ struct DBOptions {
// Default: 1000
size_t keep_log_file_num;
// Recycle log files.
// If non-zero, we will reuse previously written log files for new
// logs, overwriting the old data. The value indicates how many
// such files we will keep around at any point in time for later
// use. This is more efficient because the blocks are already
// allocated and fdatasync does not need to update the inode after
// each write.
// Default: 0
size_t recycle_log_file_num;
// manifest file is rolled over on reaching this limit.
// The older manifest file be deleted.
// The default value is MAX_INT so that roll-over does not take place.

@ -148,7 +148,8 @@ class StackableDB : public DB {
virtual void GetApproximateSizes(ColumnFamilyHandle* column_family,
const Range* r, int n, uint64_t* sizes,
bool include_memtable = false) override {
return db_->GetApproximateSizes(column_family, r, n, sizes);
return db_->GetApproximateSizes(column_family, r, n, sizes,
include_memtable);
}
using DB::CompactRange;

@ -5,7 +5,7 @@
#pragma once
#define ROCKSDB_MAJOR 4
#define ROCKSDB_MINOR 1
#define ROCKSDB_MINOR 2
#define ROCKSDB_PATCH 0
// Do not use these. We made the mistake of declaring macros starting with

@ -576,6 +576,33 @@ void Java_org_rocksdb_Options_setKeepLogFileNum(
}
}
/*
* Class: org_rocksdb_Options
* Method: recycleLogFiles
* Signature: (J)J
*/
jlong Java_org_rocksdb_Options_recycleLogFileNum(JNIEnv* env, jobject jobj,
jlong jhandle) {
return reinterpret_cast<rocksdb::Options*>(jhandle)->recycle_log_file_num;
}
/*
* Class: org_rocksdb_Options
* Method: setRecycleLogFiles
* Signature: (JJ)V
*/
void Java_org_rocksdb_Options_setRecycleLogFiles(JNIEnv* env, jobject jobj,
jlong jhandle,
jlong recycle_log_file_num) {
rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(recycle_log_file_num);
if (s.ok()) {
reinterpret_cast<rocksdb::Options*>(jhandle)->recycle_log_file_num =
recycle_log_file_num;
} else {
rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
}
}
/*
* Class: org_rocksdb_Options
* Method: maxManifestFileSize
@ -3533,6 +3560,32 @@ jlong Java_org_rocksdb_DBOptions_keepLogFileNum(
return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->keep_log_file_num;
}
/*
* Class: org_rocksdb_DBOptions
* Method: setRecycleLogFiles
* Signature: (JJ)V
*/
void Java_org_rocksdb_DBOptions_setRecycleLogFileNum(
JNIEnv* env, jobject jobj, jlong jhandle, jlong recycle_log_file_num) {
rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(recycle_log_file_num);
if (s.ok()) {
reinterpret_cast<rocksdb::DBOptions*>(jhandle)->recycle_log_file_num =
recycle_log_file_num;
} else {
rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s);
}
}
/*
* Class: org_rocksdb_DBOptions
* Method: recycleLogFiles
* Signature: (J)J
*/
jlong Java_org_rocksdb_DBOptions_recycleLogFileNum(JNIEnv* env, jobject jobj,
jlong jhandle) {
return reinterpret_cast<rocksdb::DBOptions*>(jhandle)->recycle_log_file_num;
}
/*
* Class: org_rocksdb_DBOptions
* Method: setMaxManifestFileSize

@ -95,6 +95,7 @@ LIB_SOURCES = \
util/env.cc \
util/env_hdfs.cc \
util/env_posix.cc \
util/io_posix.cc \
util/file_util.cc \
util/file_reader_writer.cc \
util/filter_policy.cc \

@ -57,7 +57,6 @@ def get_dbname(test_name):
return dbname
blackbox_default_params = {
'db': lambda: get_dbname('blackbox'),
# total time for this script to test db_stress
"duration": 6000,
# time for one db_stress instance to run
@ -69,7 +68,6 @@ blackbox_default_params = {
}
whitebox_default_params = {
'db': lambda: get_dbname('whitebox'),
"duration": 10000,
"log2_keys_per_lock": 10,
"nooverwritepercent": 1,
@ -110,7 +108,6 @@ simple_default_params = {
}
blackbox_simple_default_params = {
'db': lambda: get_dbname('blackbox'),
"duration": 6000,
"interval": 120,
"open_files": -1,
@ -120,7 +117,6 @@ blackbox_simple_default_params = {
}
whitebox_simple_default_params = {
'db': lambda: get_dbname('whitebox'),
"duration": 10000,
"log2_keys_per_lock": 10,
"nooverwritepercent": 1,
@ -166,7 +162,7 @@ def gen_cmd(params):
# in case of unsafe crashes in RocksDB.
def blackbox_crash_main(args):
cmd_params = gen_cmd_params(args)
dbname = get_dbname('blackbox')
exit_time = time.time() + cmd_params['duration']
print("Running blackbox-crash-test with \n"
@ -180,7 +176,7 @@ def blackbox_crash_main(args):
run_had_errors = False
killtime = time.time() + cmd_params['interval']
cmd = gen_cmd(cmd_params)
cmd = gen_cmd(dict(cmd_params.items() + {'db': dbname}.items()))
child = subprocess.Popen([cmd],
stderr=subprocess.PIPE, shell=True)
@ -226,6 +222,7 @@ def blackbox_crash_main(args):
# kill_random_test that causes rocksdb to crash at various points in code.
def whitebox_crash_main(args):
cmd_params = gen_cmd_params(args)
dbname = get_dbname('whitebox')
cur_time = time.time()
exit_time = cur_time + cmd_params['duration']
@ -285,7 +282,8 @@ def whitebox_crash_main(args):
"ops_per_thread": cmd_params['ops_per_thread'],
}
cmd = gen_cmd(dict(cmd_params.items() + additional_opts.items()))
cmd = gen_cmd(dict(cmd_params.items() + additional_opts.items()
+ {'db': dbname}.items()))
print "Running:" + cmd + "\n"

@ -1438,7 +1438,21 @@ void DumpWalFile(std::string wal_file, bool print_header, bool print_values,
}
} else {
StdErrReporter reporter;
log::Reader reader(move(wal_file_reader), &reporter, true, 0);
uint64_t log_number;
FileType type;
// we need the log number, but ParseFilename expects dbname/NNN.log.
string sanitized = wal_file;
size_t lastslash = sanitized.rfind('/');
if (lastslash != std::string::npos)
sanitized = sanitized.substr(lastslash + 1);
if (!ParseFileName(sanitized, &log_number, &type)) {
// bogus input, carry on as best we can
log_number = 0;
}
DBOptions db_options;
log::Reader reader(db_options.info_log, move(wal_file_reader), &reporter,
true, 0, log_number);
string scratch;
WriteBatch batch;
Slice record;

@ -27,6 +27,17 @@ uint64_t Env::GetThreadID() const {
return hasher(std::this_thread::get_id());
}
Status Env::ReuseWritableFile(const std::string& fname,
const std::string& old_fname,
unique_ptr<WritableFile>* result,
const EnvOptions& options) {
Status s = RenameFile(old_fname, fname);
if (!s.ok()) {
return s;
}
return NewWritableFile(fname, result, options);
}
SequentialFile::~SequentialFile() {
}

@ -6,13 +6,14 @@
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <deque>
#include <set>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#if defined(OS_LINUX)
#include <linux/fs.h>
#endif
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@ -26,25 +27,7 @@
#include <sys/time.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#if defined(OS_LINUX)
#include <linux/fs.h>
#endif
#include <signal.h>
#include <algorithm>
#include "rocksdb/env.h"
#include "rocksdb/slice.h"
#include "port/port.h"
#include "util/coding.h"
#include "util/logging.h"
#include "util/posix_logger.h"
#include "util/random.h"
#include "util/iostats_context_imp.h"
#include "util/string_util.h"
#include "util/sync_point.h"
#include "util/thread_status_updater.h"
#include "util/thread_status_util.h"
// Get nano time includes
#if defined(OS_LINUX) || defined(OS_FREEBSD)
#elif defined(__MACH__)
@ -53,6 +36,19 @@
#else
#include <chrono>
#endif
#include <deque>
#include <set>
#include "port/port.h"
#include "posix/io_posix.h"
#include "rocksdb/slice.h"
#include "util/coding.h"
#include "util/iostats_context_imp.h"
#include "util/logging.h"
#include "util/posix_logger.h"
#include "util/string_util.h"
#include "util/sync_point.h"
#include "util/thread_status_updater.h"
#include "util/thread_status_util.h"
#if !defined(TMPFS_MAGIC)
#define TMPFS_MAGIC 0x01021994
@ -64,31 +60,10 @@
#define EXT4_SUPER_MAGIC 0xEF53
#endif
// For non linux platform, the following macros are used only as place
// holder.
#if !(defined OS_LINUX) && !(defined CYGWIN)
#define POSIX_FADV_NORMAL 0 /* [MC1] no further special treatment */
#define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */
#define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */
#define POSIX_FADV_WILLNEED 3 /* [MC1] will need these pages */
#define POSIX_FADV_DONTNEED 4 /* [MC1] dont need these pages */
#endif
namespace rocksdb {
namespace {
// A wrapper for fadvise, if the platform doesn't support fadvise,
// it will simply return Status::NotSupport.
int Fadvise(int fd, off_t offset, size_t len, int advice) {
#ifdef OS_LINUX
return posix_fadvise(fd, offset, len, advice);
#else
return 0; // simply do nothing.
#endif
}
ThreadStatusUpdater* CreateThreadStatusUpdater() {
return new ThreadStatusUpdater();
}
@ -97,677 +72,6 @@ ThreadStatusUpdater* CreateThreadStatusUpdater() {
static std::set<std::string> lockedFiles;
static port::Mutex mutex_lockedFiles;
static Status IOError(const std::string& context, int err_number) {
return Status::IOError(context, strerror(err_number));
}
#if defined(OS_LINUX)
namespace {
static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
if (max_size < kMaxVarint64Length*3) {
return 0;
}
struct stat buf;
int result = fstat(fd, &buf);
if (result == -1) {
return 0;
}
long version = 0;
result = ioctl(fd, FS_IOC_GETVERSION, &version);
if (result == -1) {
return 0;
}
uint64_t uversion = (uint64_t)version;
char* rid = id;
rid = EncodeVarint64(rid, buf.st_dev);
rid = EncodeVarint64(rid, buf.st_ino);
rid = EncodeVarint64(rid, uversion);
assert(rid >= id);
return static_cast<size_t>(rid-id);
}
}
#endif
class PosixSequentialFile: public SequentialFile {
private:
std::string filename_;
FILE* file_;
int fd_;
bool use_os_buffer_;
public:
PosixSequentialFile(const std::string& fname, FILE* f,
const EnvOptions& options)
: filename_(fname), file_(f), fd_(fileno(f)),
use_os_buffer_(options.use_os_buffer) {
}
virtual ~PosixSequentialFile() { fclose(file_); }
virtual Status Read(size_t n, Slice* result, char* scratch) override {
Status s;
size_t r = 0;
do {
r = fread_unlocked(scratch, 1, n, file_);
} while (r == 0 && ferror(file_) && errno == EINTR);
*result = Slice(scratch, r);
if (r < n) {
if (feof(file_)) {
// We leave status as ok if we hit the end of the file
// We also clear the error so that the reads can continue
// if a new data is written to the file
clearerr(file_);
} else {
// A partial read with an error: return a non-ok status
s = IOError(filename_, errno);
}
}
if (!use_os_buffer_) {
// we need to fadvise away the entire range of pages because
// we do not want readahead pages to be cached.
Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
}
return s;
}
virtual Status Skip(uint64_t n) override {
if (fseek(file_, static_cast<long int>(n), SEEK_CUR)) {
return IOError(filename_, errno);
}
return Status::OK();
}
virtual Status InvalidateCache(size_t offset, size_t length) override {
#ifndef OS_LINUX
return Status::OK();
#else
// free OS pages
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
if (ret == 0) {
return Status::OK();
}
return IOError(filename_, errno);
#endif
}
};
// pread() based random-access
class PosixRandomAccessFile: public RandomAccessFile {
private:
std::string filename_;
int fd_;
bool use_os_buffer_;
public:
PosixRandomAccessFile(const std::string& fname, int fd,
const EnvOptions& options)
: filename_(fname), fd_(fd), use_os_buffer_(options.use_os_buffer) {
assert(!options.use_mmap_reads || sizeof(void*) < 8);
}
virtual ~PosixRandomAccessFile() { close(fd_); }
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const override {
Status s;
ssize_t r = -1;
size_t left = n;
char* ptr = scratch;
while (left > 0) {
r = pread(fd_, ptr, left, static_cast<off_t>(offset));
if (r <= 0) {
if (errno == EINTR) {
continue;
}
break;
}
ptr += r;
offset += r;
left -= r;
}
*result = Slice(scratch, (r < 0) ? 0 : n - left);
if (r < 0) {
// An error: return a non-ok status
s = IOError(filename_, errno);
}
if (!use_os_buffer_) {
// we need to fadvise away the entire range of pages because
// we do not want readahead pages to be cached.
Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
}
return s;
}
#ifdef OS_LINUX
virtual size_t GetUniqueId(char* id, size_t max_size) const override {
return GetUniqueIdFromFile(fd_, id, max_size);
}
#endif
virtual void Hint(AccessPattern pattern) override {
switch(pattern) {
case NORMAL:
Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL);
break;
case RANDOM:
Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM);
break;
case SEQUENTIAL:
Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
break;
case WILLNEED:
Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED);
break;
case DONTNEED:
Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
break;
default:
assert(false);
break;
}
}
virtual Status InvalidateCache(size_t offset, size_t length) override {
#ifndef OS_LINUX
return Status::OK();
#else
// free OS pages
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
if (ret == 0) {
return Status::OK();
}
return IOError(filename_, errno);
#endif
}
};
// mmap() based random-access
class PosixMmapReadableFile: public RandomAccessFile {
private:
int fd_;
std::string filename_;
void* mmapped_region_;
size_t length_;
public:
// base[0,length-1] contains the mmapped contents of the file.
PosixMmapReadableFile(const int fd, const std::string& fname,
void* base, size_t length,
const EnvOptions& options)
: fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
fd_ = fd_ + 0; // suppress the warning for used variables
assert(options.use_mmap_reads);
assert(options.use_os_buffer);
}
virtual ~PosixMmapReadableFile() {
int ret = munmap(mmapped_region_, length_);
if (ret != 0) {
fprintf(stdout, "failed to munmap %p length %" ROCKSDB_PRIszt " \n",
mmapped_region_, length_);
}
}
virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const override {
Status s;
if (offset > length_) {
*result = Slice();
return IOError(filename_, EINVAL);
} else if (offset + n > length_) {
n = static_cast<size_t>(length_ - offset);
}
*result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
return s;
}
virtual Status InvalidateCache(size_t offset, size_t length) override {
#ifndef OS_LINUX
return Status::OK();
#else
// free OS pages
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
if (ret == 0) {
return Status::OK();
}
return IOError(filename_, errno);
#endif
}
};
// We preallocate up to an extra megabyte and use memcpy to append new
// data to the file. This is safe since we either properly close the
// file before reading from it, or for log files, the reading code
// knows enough to skip zero suffixes.
class PosixMmapFile : public WritableFile {
private:
std::string filename_;
int fd_;
size_t page_size_;
size_t map_size_; // How much extra memory to map at a time
char* base_; // The mapped region
char* limit_; // Limit of the mapped region
char* dst_; // Where to write next (in range [base_,limit_])
char* last_sync_; // Where have we synced up to
uint64_t file_offset_; // Offset of base_ in file
#ifdef ROCKSDB_FALLOCATE_PRESENT
bool allow_fallocate_; // If false, fallocate calls are bypassed
bool fallocate_with_keep_size_;
#endif
// Roundup x to a multiple of y
static size_t Roundup(size_t x, size_t y) {
return ((x + y - 1) / y) * y;
}
size_t TruncateToPageBoundary(size_t s) {
s -= (s & (page_size_ - 1));
assert((s % page_size_) == 0);
return s;
}
Status UnmapCurrentRegion() {
TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds);
if (base_ != nullptr) {
int munmap_status = munmap(base_, limit_ - base_);
if (munmap_status != 0) {
return IOError(filename_, munmap_status);
}
file_offset_ += limit_ - base_;
base_ = nullptr;
limit_ = nullptr;
last_sync_ = nullptr;
dst_ = nullptr;
// Increase the amount we map the next time, but capped at 1MB
if (map_size_ < (1<<20)) {
map_size_ *= 2;
}
}
return Status::OK();
}
Status MapNewRegion() {
#ifdef ROCKSDB_FALLOCATE_PRESENT
assert(base_ == nullptr);
TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds);
// we can't fallocate with FALLOC_FL_KEEP_SIZE here
if (allow_fallocate_) {
IOSTATS_TIMER_GUARD(allocate_nanos);
int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
if (alloc_status != 0) {
// fallback to posix_fallocate
alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
}
if (alloc_status != 0) {
return Status::IOError("Error allocating space to file : " + filename_ +
"Error : " + strerror(alloc_status));
}
}
TEST_KILL_RANDOM("PosixMmapFile::Append:1", rocksdb_kill_odds);
void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED,
fd_, file_offset_);
if (ptr == MAP_FAILED) {
return Status::IOError("MMap failed on " + filename_);
}
TEST_KILL_RANDOM("PosixMmapFile::Append:2", rocksdb_kill_odds);
base_ = reinterpret_cast<char*>(ptr);
limit_ = base_ + map_size_;
dst_ = base_;
last_sync_ = base_;
return Status::OK();
#else
return Status::NotSupported("This platform doesn't support fallocate()");
#endif
}
Status Msync() {
if (dst_ == last_sync_) {
return Status::OK();
}
// Find the beginnings of the pages that contain the first and last
// bytes to be synced.
size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
last_sync_ = dst_;
TEST_KILL_RANDOM("PosixMmapFile::Msync:0", rocksdb_kill_odds);
if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
return IOError(filename_, errno);
}
return Status::OK();
}
public:
PosixMmapFile(const std::string& fname, int fd, size_t page_size,
const EnvOptions& options)
: filename_(fname),
fd_(fd),
page_size_(page_size),
map_size_(Roundup(65536, page_size)),
base_(nullptr),
limit_(nullptr),
dst_(nullptr),
last_sync_(nullptr),
file_offset_(0) {
#ifdef ROCKSDB_FALLOCATE_PRESENT
allow_fallocate_ = options.allow_fallocate;
fallocate_with_keep_size_ = options.fallocate_with_keep_size;
#endif
assert((page_size & (page_size - 1)) == 0);
assert(options.use_mmap_writes);
}
~PosixMmapFile() {
if (fd_ >= 0) {
PosixMmapFile::Close();
}
}
virtual Status Append(const Slice& data) override {
const char* src = data.data();
size_t left = data.size();
while (left > 0) {
assert(base_ <= dst_);
assert(dst_ <= limit_);
size_t avail = limit_ - dst_;
if (avail == 0) {
Status s = UnmapCurrentRegion();
if (!s.ok()) {
return s;
}
s = MapNewRegion();
if (!s.ok()) {
return s;
}
TEST_KILL_RANDOM("PosixMmapFile::Append:0", rocksdb_kill_odds);
}
size_t n = (left <= avail) ? left : avail;
memcpy(dst_, src, n);
dst_ += n;
src += n;
left -= n;
}
return Status::OK();
}
// Means Close() will properly take care of truncate
// and it does not need any additional information
virtual Status Truncate(uint64_t size) override {
return Status::OK();
}
virtual Status Close() override {
Status s;
size_t unused = limit_ - dst_;
s = UnmapCurrentRegion();
if (!s.ok()) {
s = IOError(filename_, errno);
} else if (unused > 0) {
// Trim the extra space at the end of the file
if (ftruncate(fd_, file_offset_ - unused) < 0) {
s = IOError(filename_, errno);
}
}
if (close(fd_) < 0) {
if (s.ok()) {
s = IOError(filename_, errno);
}
}
fd_ = -1;
base_ = nullptr;
limit_ = nullptr;
return s;
}
virtual Status Flush() override {
return Status::OK();
}
virtual Status Sync() override {
if (fdatasync(fd_) < 0) {
return IOError(filename_, errno);
}
return Msync();
}
/**
* Flush data as well as metadata to stable storage.
*/
virtual Status Fsync() override {
if (fsync(fd_) < 0) {
return IOError(filename_, errno);
}
return Msync();
}
/**
* Get the size of valid data in the file. This will not match the
* size that is returned from the filesystem because we use mmap
* to extend file by map_size every time.
*/
virtual uint64_t GetFileSize() override {
size_t used = dst_ - base_;
return file_offset_ + used;
}
virtual Status InvalidateCache(size_t offset, size_t length) override {
#ifndef OS_LINUX
return Status::OK();
#else
// free OS pages
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
if (ret == 0) {
return Status::OK();
}
return IOError(filename_, errno);
#endif
}
#ifdef ROCKSDB_FALLOCATE_PRESENT
virtual Status Allocate(off_t offset, off_t len) override {
TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds);
int alloc_status = 0;
if (allow_fallocate_) {
alloc_status =
fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
offset, len);
}
if (alloc_status == 0) {
return Status::OK();
} else {
return IOError(filename_, errno);
}
}
#endif
};
// Use posix write to write data to a file.
class PosixWritableFile : public WritableFile {
private:
const std::string filename_;
int fd_;
uint64_t filesize_;
#ifdef ROCKSDB_FALLOCATE_PRESENT
bool allow_fallocate_;
bool fallocate_with_keep_size_;
#endif
public:
PosixWritableFile(const std::string& fname, int fd, const EnvOptions& options)
: filename_(fname), fd_(fd), filesize_(0) {
#ifdef ROCKSDB_FALLOCATE_PRESENT
allow_fallocate_ = options.allow_fallocate;
fallocate_with_keep_size_ = options.fallocate_with_keep_size;
#endif
assert(!options.use_mmap_writes);
}
~PosixWritableFile() {
if (fd_ >= 0) {
PosixWritableFile::Close();
}
}
virtual Status Append(const Slice& data) override {
const char* src = data.data();
size_t left = data.size();
while (left != 0) {
ssize_t done = write(fd_, src, left);
if (done < 0) {
if (errno == EINTR) {
continue;
}
return IOError(filename_, errno);
}
left -= done;
src += done;
}
filesize_ += data.size();
return Status::OK();
}
// Means Close() will properly take care of truncate
// and it does not need any additional information
virtual Status Truncate(uint64_t size) override {
return Status::OK();
}
virtual Status Close() override {
Status s;
size_t block_size;
size_t last_allocated_block;
GetPreallocationStatus(&block_size, &last_allocated_block);
if (last_allocated_block > 0) {
// trim the extra space preallocated at the end of the file
// NOTE(ljin): we probably don't want to surface failure as an IOError,
// but it will be nice to log these errors.
int dummy __attribute__((unused));
dummy = ftruncate(fd_, filesize_);
#ifdef ROCKSDB_FALLOCATE_PRESENT
// in some file systems, ftruncate only trims trailing space if the
// new file size is smaller than the current size. Calling fallocate
// with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused
// blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following
// filesystems:
// XFS (since Linux 2.6.38)
// ext4 (since Linux 3.0)
// Btrfs (since Linux 3.7)
// tmpfs (since Linux 3.5)
// We ignore error since failure of this operation does not affect
// correctness.
IOSTATS_TIMER_GUARD(allocate_nanos);
if (allow_fallocate_) {
fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_,
block_size * last_allocated_block - filesize_);
}
#endif
}
if (close(fd_) < 0) {
s = IOError(filename_, errno);
}
fd_ = -1;
return s;
}
// write out the cached data to the OS cache
virtual Status Flush() override {
return Status::OK();
}
virtual Status Sync() override {
if (fdatasync(fd_) < 0) {
return IOError(filename_, errno);
}
return Status::OK();
}
virtual Status Fsync() override {
if (fsync(fd_) < 0) {
return IOError(filename_, errno);
}
return Status::OK();
}
virtual bool IsSyncThreadSafe() const override {
return true;
}
virtual uint64_t GetFileSize() override { return filesize_; }
virtual Status InvalidateCache(size_t offset, size_t length) override {
#ifndef OS_LINUX
return Status::OK();
#else
// free OS pages
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
if (ret == 0) {
return Status::OK();
}
return IOError(filename_, errno);
#endif
}
#ifdef ROCKSDB_FALLOCATE_PRESENT
virtual Status Allocate(off_t offset, off_t len) override {
TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds);
IOSTATS_TIMER_GUARD(allocate_nanos);
int alloc_status = 0;
if (allow_fallocate_) {
alloc_status =
fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0,
offset, len);
}
if (alloc_status == 0) {
return Status::OK();
} else {
return IOError(filename_, errno);
}
}
virtual Status RangeSync(off_t offset, off_t nbytes) override {
if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) {
return Status::OK();
} else {
return IOError(filename_, errno);
}
}
virtual size_t GetUniqueId(char* id, size_t max_size) const override {
return GetUniqueIdFromFile(fd_, id, max_size);
}
#endif
};
class PosixDirectory : public Directory {
public:
explicit PosixDirectory(int fd) : fd_(fd) {}
~PosixDirectory() {
close(fd_);
}
virtual Status Fsync() override {
if (fsync(fd_) == -1) {
return IOError("directory", errno);
}
return Status::OK();
}
private:
int fd_;
};
static int LockOrUnlock(const std::string& fname, int fd, bool lock) {
mutex_lockedFiles.Lock();
if (lock) {
@ -806,12 +110,6 @@ static int LockOrUnlock(const std::string& fname, int fd, bool lock) {
return value;
}
class PosixFileLock : public FileLock {
public:
int fd_;
std::string filename;
};
void PthreadCall(const char* label, int result) {
if (result != 0) {
fprintf(stderr, "pthread %s: %s\n", label, strerror(result));
@ -819,6 +117,12 @@ void PthreadCall(const char* label, int result) {
}
}
class PosixFileLock : public FileLock {
public:
int fd_;
std::string filename;
};
class PosixEnv : public Env {
public:
PosixEnv();
@ -933,6 +237,50 @@ class PosixEnv : public Env {
return s;
}
virtual Status ReuseWritableFile(const std::string& fname,
const std::string& old_fname,
unique_ptr<WritableFile>* result,
const EnvOptions& options) override {
result->reset();
Status s;
int fd = -1;
do {
IOSTATS_TIMER_GUARD(open_nanos);
fd = open(old_fname.c_str(), O_RDWR, 0644);
} while (fd < 0 && errno == EINTR);
if (fd < 0) {
s = IOError(fname, errno);
} else {
SetFD_CLOEXEC(fd, &options);
// rename into place
if (rename(old_fname.c_str(), fname.c_str()) != 0) {
Status r = IOError(old_fname, errno);
close(fd);
return r;
}
if (options.use_mmap_writes) {
if (!checkedDiskForMmap_) {
// this will be executed once in the program's lifetime.
// do not use mmapWrite on non ext-3/xfs/tmpfs systems.
if (!SupportsFastAllocate(fname)) {
forceMmapOff = true;
}
checkedDiskForMmap_ = true;
}
}
if (options.use_mmap_writes && !forceMmapOff) {
result->reset(new PosixMmapFile(fname, fd, page_size_, options));
} else {
// disable mmap writes
EnvOptions no_mmap_writes_options = options;
no_mmap_writes_options.use_mmap_writes = false;
result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options));
}
}
return s;
}
virtual Status NewDirectory(const std::string& name,
unique_ptr<Directory>* result) override {
result->reset();

@ -0,0 +1,637 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifdef ROCKSDB_LIB_IO_POSIX
#include "posix/io_posix.h"
#include <errno.h>
#include <fcntl.h>
#if defined(OS_LINUX)
#include <linux/fs.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#ifdef OS_LINUX
#include <sys/statfs.h>
#include <sys/syscall.h>
#endif
#include "port/port.h"
#include "rocksdb/slice.h"
#include "util/coding.h"
#include "util/iostats_context_imp.h"
#include "util/posix_logger.h"
#include "util/string_util.h"
#include "util/sync_point.h"
namespace rocksdb {
// A wrapper for fadvise, if the platform doesn't support fadvise,
// it will simply return Status::NotSupport.
int Fadvise(int fd, off_t offset, size_t len, int advice) {
#ifdef OS_LINUX
return posix_fadvise(fd, offset, len, advice);
#else
return 0; // simply do nothing.
#endif
}
/*
* PosixSequentialFile
*/
PosixSequentialFile::PosixSequentialFile(const std::string& fname, FILE* f,
const EnvOptions& options)
: filename_(fname),
file_(f),
fd_(fileno(f)),
use_os_buffer_(options.use_os_buffer) {}
PosixSequentialFile::~PosixSequentialFile() { fclose(file_); }
Status PosixSequentialFile::Read(size_t n, Slice* result, char* scratch) {
Status s;
size_t r = 0;
do {
r = fread_unlocked(scratch, 1, n, file_);
} while (r == 0 && ferror(file_) && errno == EINTR);
*result = Slice(scratch, r);
if (r < n) {
if (feof(file_)) {
// We leave status as ok if we hit the end of the file
// We also clear the error so that the reads can continue
// if a new data is written to the file
clearerr(file_);
} else {
// A partial read with an error: return a non-ok status
s = IOError(filename_, errno);
}
}
if (!use_os_buffer_) {
// we need to fadvise away the entire range of pages because
// we do not want readahead pages to be cached.
Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
}
return s;
}
Status PosixSequentialFile::Skip(uint64_t n) {
if (fseek(file_, static_cast<long int>(n), SEEK_CUR)) {
return IOError(filename_, errno);
}
return Status::OK();
}
Status PosixSequentialFile::InvalidateCache(size_t offset, size_t length) {
#ifndef OS_LINUX
return Status::OK();
#else
// free OS pages
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
if (ret == 0) {
return Status::OK();
}
return IOError(filename_, errno);
#endif
}
#if defined(OS_LINUX)
namespace {
static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
if (max_size < kMaxVarint64Length * 3) {
return 0;
}
struct stat buf;
int result = fstat(fd, &buf);
if (result == -1) {
return 0;
}
long version = 0;
result = ioctl(fd, FS_IOC_GETVERSION, &version);
if (result == -1) {
return 0;
}
uint64_t uversion = (uint64_t)version;
char* rid = id;
rid = EncodeVarint64(rid, buf.st_dev);
rid = EncodeVarint64(rid, buf.st_ino);
rid = EncodeVarint64(rid, uversion);
assert(rid >= id);
return static_cast<size_t>(rid - id);
}
}
#endif
/*
* PosixRandomAccessFile
*
* pread() based random-access
*/
PosixRandomAccessFile::PosixRandomAccessFile(const std::string& fname, int fd,
const EnvOptions& options)
: filename_(fname), fd_(fd), use_os_buffer_(options.use_os_buffer) {
assert(!options.use_mmap_reads || sizeof(void*) < 8);
}
PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); }
Status PosixRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const {
Status s;
ssize_t r = -1;
size_t left = n;
char* ptr = scratch;
while (left > 0) {
r = pread(fd_, ptr, left, static_cast<off_t>(offset));
if (r <= 0) {
if (errno == EINTR) {
continue;
}
break;
}
ptr += r;
offset += r;
left -= r;
}
*result = Slice(scratch, (r < 0) ? 0 : n - left);
if (r < 0) {
// An error: return a non-ok status
s = IOError(filename_, errno);
}
if (!use_os_buffer_) {
// we need to fadvise away the entire range of pages because
// we do not want readahead pages to be cached.
Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages
}
return s;
}
#ifdef OS_LINUX
size_t PosixRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
return GetUniqueIdFromFile(fd_, id, max_size);
}
#endif
void PosixRandomAccessFile::Hint(AccessPattern pattern) {
switch (pattern) {
case NORMAL:
Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL);
break;
case RANDOM:
Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM);
break;
case SEQUENTIAL:
Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
break;
case WILLNEED:
Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED);
break;
case DONTNEED:
Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
break;
default:
assert(false);
break;
}
}
Status PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
#ifndef OS_LINUX
return Status::OK();
#else
// free OS pages
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
if (ret == 0) {
return Status::OK();
}
return IOError(filename_, errno);
#endif
}
/*
* PosixMmapReadableFile
*
* mmap() based random-access
*/
// base[0,length-1] contains the mmapped contents of the file.
PosixMmapReadableFile::PosixMmapReadableFile(const int fd,
const std::string& fname,
void* base, size_t length,
const EnvOptions& options)
: fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
fd_ = fd_ + 0; // suppress the warning for used variables
assert(options.use_mmap_reads);
assert(options.use_os_buffer);
}
PosixMmapReadableFile::~PosixMmapReadableFile() {
int ret = munmap(mmapped_region_, length_);
if (ret != 0) {
fprintf(stdout, "failed to munmap %p length %" ROCKSDB_PRIszt " \n",
mmapped_region_, length_);
}
}
Status PosixMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const {
Status s;
if (offset > length_) {
*result = Slice();
return IOError(filename_, EINVAL);
} else if (offset + n > length_) {
n = static_cast<size_t>(length_ - offset);
}
*result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
return s;
}
Status PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
#ifndef OS_LINUX
return Status::OK();
#else
// free OS pages
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
if (ret == 0) {
return Status::OK();
}
return IOError(filename_, errno);
#endif
}
/*
* PosixMmapFile
*
* We preallocate up to an extra megabyte and use memcpy to append new
* data to the file. This is safe since we either properly close the
* file before reading from it, or for log files, the reading code
* knows enough to skip zero suffixes.
*/
Status PosixMmapFile::UnmapCurrentRegion() {
TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds);
if (base_ != nullptr) {
int munmap_status = munmap(base_, limit_ - base_);
if (munmap_status != 0) {
return IOError(filename_, munmap_status);
}
file_offset_ += limit_ - base_;
base_ = nullptr;
limit_ = nullptr;
last_sync_ = nullptr;
dst_ = nullptr;
// Increase the amount we map the next time, but capped at 1MB
if (map_size_ < (1 << 20)) {
map_size_ *= 2;
}
}
return Status::OK();
}
Status PosixMmapFile::MapNewRegion() {
#ifdef ROCKSDB_FALLOCATE_PRESENT
assert(base_ == nullptr);
TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds);
// we can't fallocate with FALLOC_FL_KEEP_SIZE here
if (allow_fallocate_) {
IOSTATS_TIMER_GUARD(allocate_nanos);
int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
if (alloc_status != 0) {
// fallback to posix_fallocate
alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
}
if (alloc_status != 0) {
return Status::IOError("Error allocating space to file : " + filename_ +
"Error : " + strerror(alloc_status));
}
}
TEST_KILL_RANDOM("PosixMmapFile::Append:1", rocksdb_kill_odds);
void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_,
file_offset_);
if (ptr == MAP_FAILED) {
return Status::IOError("MMap failed on " + filename_);
}
TEST_KILL_RANDOM("PosixMmapFile::Append:2", rocksdb_kill_odds);
base_ = reinterpret_cast<char*>(ptr);
limit_ = base_ + map_size_;
dst_ = base_;
last_sync_ = base_;
return Status::OK();
#else
return Status::NotSupported("This platform doesn't support fallocate()");
#endif
}
Status PosixMmapFile::Msync() {
if (dst_ == last_sync_) {
return Status::OK();
}
// Find the beginnings of the pages that contain the first and last
// bytes to be synced.
size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
last_sync_ = dst_;
TEST_KILL_RANDOM("PosixMmapFile::Msync:0", rocksdb_kill_odds);
if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
return IOError(filename_, errno);
}
return Status::OK();
}
PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size,
const EnvOptions& options)
: filename_(fname),
fd_(fd),
page_size_(page_size),
map_size_(Roundup(65536, page_size)),
base_(nullptr),
limit_(nullptr),
dst_(nullptr),
last_sync_(nullptr),
file_offset_(0) {
#ifdef ROCKSDB_FALLOCATE_PRESENT
allow_fallocate_ = options.allow_fallocate;
fallocate_with_keep_size_ = options.fallocate_with_keep_size;
#endif
assert((page_size & (page_size - 1)) == 0);
assert(options.use_mmap_writes);
}
PosixMmapFile::~PosixMmapFile() {
if (fd_ >= 0) {
PosixMmapFile::Close();
}
}
Status PosixMmapFile::Append(const Slice& data) {
const char* src = data.data();
size_t left = data.size();
while (left > 0) {
assert(base_ <= dst_);
assert(dst_ <= limit_);
size_t avail = limit_ - dst_;
if (avail == 0) {
Status s = UnmapCurrentRegion();
if (!s.ok()) {
return s;
}
s = MapNewRegion();
if (!s.ok()) {
return s;
}
TEST_KILL_RANDOM("PosixMmapFile::Append:0", rocksdb_kill_odds);
}
size_t n = (left <= avail) ? left : avail;
memcpy(dst_, src, n);
dst_ += n;
src += n;
left -= n;
}
return Status::OK();
}
Status PosixMmapFile::Close() {
Status s;
size_t unused = limit_ - dst_;
s = UnmapCurrentRegion();
if (!s.ok()) {
s = IOError(filename_, errno);
} else if (unused > 0) {
// Trim the extra space at the end of the file
if (ftruncate(fd_, file_offset_ - unused) < 0) {
s = IOError(filename_, errno);
}
}
if (close(fd_) < 0) {
if (s.ok()) {
s = IOError(filename_, errno);
}
}
fd_ = -1;
base_ = nullptr;
limit_ = nullptr;
return s;
}
Status PosixMmapFile::Flush() { return Status::OK(); }
Status PosixMmapFile::Sync() {
if (fdatasync(fd_) < 0) {
return IOError(filename_, errno);
}
return Msync();
}
/**
* Flush data as well as metadata to stable storage.
*/
Status PosixMmapFile::Fsync() {
if (fsync(fd_) < 0) {
return IOError(filename_, errno);
}
return Msync();
}
/**
* Get the size of valid data in the file. This will not match the
* size that is returned from the filesystem because we use mmap
* to extend file by map_size every time.
*/
uint64_t PosixMmapFile::GetFileSize() {
size_t used = dst_ - base_;
return file_offset_ + used;
}
Status PosixMmapFile::InvalidateCache(size_t offset, size_t length) {
#ifndef OS_LINUX
return Status::OK();
#else
// free OS pages
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
if (ret == 0) {
return Status::OK();
}
return IOError(filename_, errno);
#endif
}
#ifdef ROCKSDB_FALLOCATE_PRESENT
Status PosixMmapFile::Allocate(off_t offset, off_t len) {
TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds);
int alloc_status = 0;
if (allow_fallocate_) {
alloc_status = fallocate(
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
}
if (alloc_status == 0) {
return Status::OK();
} else {
return IOError(filename_, errno);
}
}
#endif
/*
* PosixWritableFile
*
* Use posix write to write data to a file.
*/
PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
const EnvOptions& options)
: filename_(fname), fd_(fd), filesize_(0) {
#ifdef ROCKSDB_FALLOCATE_PRESENT
allow_fallocate_ = options.allow_fallocate;
fallocate_with_keep_size_ = options.fallocate_with_keep_size;
#endif
assert(!options.use_mmap_writes);
}
PosixWritableFile::~PosixWritableFile() {
if (fd_ >= 0) {
PosixWritableFile::Close();
}
}
Status PosixWritableFile::Append(const Slice& data) {
const char* src = data.data();
size_t left = data.size();
while (left != 0) {
ssize_t done = write(fd_, src, left);
if (done < 0) {
if (errno == EINTR) {
continue;
}
return IOError(filename_, errno);
}
left -= done;
src += done;
}
filesize_ += data.size();
return Status::OK();
}
Status PosixWritableFile::Close() {
Status s;
size_t block_size;
size_t last_allocated_block;
GetPreallocationStatus(&block_size, &last_allocated_block);
if (last_allocated_block > 0) {
// trim the extra space preallocated at the end of the file
// NOTE(ljin): we probably don't want to surface failure as an IOError,
// but it will be nice to log these errors.
int dummy __attribute__((unused));
dummy = ftruncate(fd_, filesize_);
#ifdef ROCKSDB_FALLOCATE_PRESENT
// in some file systems, ftruncate only trims trailing space if the
// new file size is smaller than the current size. Calling fallocate
// with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused
// blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following
// filesystems:
// XFS (since Linux 2.6.38)
// ext4 (since Linux 3.0)
// Btrfs (since Linux 3.7)
// tmpfs (since Linux 3.5)
// We ignore error since failure of this operation does not affect
// correctness.
IOSTATS_TIMER_GUARD(allocate_nanos);
if (allow_fallocate_) {
fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_,
block_size * last_allocated_block - filesize_);
}
#endif
}
if (close(fd_) < 0) {
s = IOError(filename_, errno);
}
fd_ = -1;
return s;
}
// write out the cached data to the OS cache
Status PosixWritableFile::Flush() { return Status::OK(); }
Status PosixWritableFile::Sync() {
if (fdatasync(fd_) < 0) {
return IOError(filename_, errno);
}
return Status::OK();
}
Status PosixWritableFile::Fsync() {
if (fsync(fd_) < 0) {
return IOError(filename_, errno);
}
return Status::OK();
}
bool PosixWritableFile::IsSyncThreadSafe() const { return true; }
uint64_t PosixWritableFile::GetFileSize() { return filesize_; }
Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) {
#ifndef OS_LINUX
return Status::OK();
#else
// free OS pages
int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
if (ret == 0) {
return Status::OK();
}
return IOError(filename_, errno);
#endif
}
#ifdef ROCKSDB_FALLOCATE_PRESENT
Status PosixWritableFile::Allocate(off_t offset, off_t len) {
TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds);
IOSTATS_TIMER_GUARD(allocate_nanos);
int alloc_status = 0;
if (allow_fallocate_) {
alloc_status = fallocate(
fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len);
}
if (alloc_status == 0) {
return Status::OK();
} else {
return IOError(filename_, errno);
}
}
Status PosixWritableFile::RangeSync(off_t offset, off_t nbytes) {
if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) {
return Status::OK();
} else {
return IOError(filename_, errno);
}
}
size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const {
return GetUniqueIdFromFile(fd_, id, max_size);
}
#endif
} // namespace rocksdb
#endif

@ -232,6 +232,7 @@ DBOptions::DBOptions()
max_log_file_size(0),
log_file_time_to_roll(0),
keep_log_file_num(1000),
recycle_log_file_num(0),
max_manifest_file_size(std::numeric_limits<uint64_t>::max()),
table_cache_numshardbits(4),
WAL_ttl_seconds(0),
@ -290,6 +291,7 @@ DBOptions::DBOptions(const Options& options)
max_log_file_size(options.max_log_file_size),
log_file_time_to_roll(options.log_file_time_to_roll),
keep_log_file_num(options.keep_log_file_num),
recycle_log_file_num(options.recycle_log_file_num),
max_manifest_file_size(options.max_manifest_file_size),
table_cache_numshardbits(options.table_cache_numshardbits),
WAL_ttl_seconds(options.WAL_ttl_seconds),
@ -348,6 +350,8 @@ void DBOptions::Dump(Logger* log) const {
log_file_time_to_roll);
Header(log, " Options.keep_log_file_num: %" ROCKSDB_PRIszt,
keep_log_file_num);
Header(log, " Options.recycle_log_file_num: %" ROCKSDB_PRIszt,
recycle_log_file_num);
Header(log, " Options.allow_os_buffer: %d", allow_os_buffer);
Header(log, " Options.allow_mmap_reads: %d", allow_mmap_reads);
Header(log, " Options.allow_fallocate: %d", allow_fallocate);

@ -207,6 +207,9 @@ static std::unordered_map<std::string, OptionTypeInfo> db_options_type_info = {
{"keep_log_file_num",
{offsetof(struct DBOptions, keep_log_file_num), OptionType::kSizeT,
OptionVerificationType::kNormal}},
{"recycle_log_file_num",
{offsetof(struct DBOptions, recycle_log_file_num), OptionType::kSizeT,
OptionVerificationType::kNormal}},
{"log_file_time_to_roll",
{offsetof(struct DBOptions, log_file_time_to_roll), OptionType::kSizeT,
OptionVerificationType::kNormal}},

@ -323,6 +323,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
{"max_log_file_size", "37"},
{"log_file_time_to_roll", "38"},
{"keep_log_file_num", "39"},
{"recycle_log_file_num", "5"},
{"max_manifest_file_size", "40"},
{"table_cache_numshardbits", "41"},
{"WAL_ttl_seconds", "43"},
@ -339,7 +340,8 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
{"new_table_reader_for_compaction_inputs", "true"},
{"compaction_readahead_size", "100"},
{"bytes_per_sync", "47"},
{"wal_bytes_per_sync", "48"}, };
{"wal_bytes_per_sync", "48"},
};
ColumnFamilyOptions base_cf_opt;
ColumnFamilyOptions new_cf_opt;
@ -431,6 +433,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
ASSERT_EQ(new_db_opt.max_log_file_size, 37U);
ASSERT_EQ(new_db_opt.log_file_time_to_roll, 38U);
ASSERT_EQ(new_db_opt.keep_log_file_num, 39U);
ASSERT_EQ(new_db_opt.recycle_log_file_num, 5U);
ASSERT_EQ(new_db_opt.max_manifest_file_size, static_cast<uint64_t>(40));
ASSERT_EQ(new_db_opt.table_cache_numshardbits, 41);
ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast<uint64_t>(43));
@ -692,6 +695,7 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) {
db_opt->skip_stats_update_on_db_open = rnd->Uniform(2);
db_opt->use_adaptive_mutex = rnd->Uniform(2);
db_opt->use_fsync = rnd->Uniform(2);
db_opt->recycle_log_file_num = rnd->Uniform(2);
// int options
db_opt->max_background_compactions = rnd->Uniform(100);

@ -10,8 +10,6 @@
// Syncpoint prevents us building and running tests in release
#ifndef ROCKSDB_LITE
#if !defined(NDEBUG) || !defined(OS_WIN)
#ifndef OS_WIN
#include <unistd.h>
#endif
@ -350,16 +348,10 @@ TEST_F(DBTest, CheckpointCF) {
} // namespace rocksdb
#endif
int main(int argc, char** argv) {
#if !defined(NDEBUG) || !defined(OS_WIN)
rocksdb::port::InstallStackTraceHandler();
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
#else
return 0;
#endif
}
#else

@ -588,8 +588,8 @@ JSONDocument::const_item_iterator::~const_item_iterator() {
JSONDocument::const_item_iterator::value_type
JSONDocument::const_item_iterator::operator*() {
return {std::string(it_->getKeyStr(), it_->klen()),
JSONDocument(it_->value(), false)};
return JSONDocument::const_item_iterator::value_type(std::string(it_->getKeyStr(), it_->klen()),
JSONDocument(it_->value(), false));
}
JSONDocument::ItemsIteratorGenerator::ItemsIteratorGenerator(

Loading…
Cancel
Save