diff --git a/CMakeLists.txt b/CMakeLists.txt index ccbe14a00..6cb80cd10 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -603,6 +603,7 @@ set(SOURCES util/filename.cc util/filter_policy.cc util/hash.cc + util/jemalloc_nodump_allocator.cc util/log_buffer.cc util/murmurhash.cc util/random.cc diff --git a/TARGETS b/TARGETS index 68a8e6f7f..5040ff153 100644 --- a/TARGETS +++ b/TARGETS @@ -226,6 +226,7 @@ cpp_library( "util/filename.cc", "util/filter_policy.cc", "util/hash.cc", + "util/jemalloc_nodump_allocator.cc", "util/log_buffer.cc", "util/murmurhash.cc", "util/random.cc", @@ -931,6 +932,11 @@ ROCKS_TESTS = [ "db/range_del_aggregator_test.cc", "serial", ], + [ + "range_tombstone_fragmenter_test", + "db/range_tombstone_fragmenter_test.cc", + "serial", + ], [ "rate_limiter_test", "util/rate_limiter_test.cc", diff --git a/include/rocksdb/memory_allocator.h b/include/rocksdb/memory_allocator.h index 30b77dfdf..15aab65fc 100644 --- a/include/rocksdb/memory_allocator.h +++ b/include/rocksdb/memory_allocator.h @@ -5,6 +5,10 @@ #pragma once +#include "rocksdb/status.h" + +#include + namespace rocksdb { // MemoryAllocator is an interface that a client can implement to supply custom @@ -18,10 +22,12 @@ class MemoryAllocator { // Name of the cache allocator, printed in the log virtual const char* Name() const = 0; - // Allocate a block of at least size size + // Allocate a block of at least size. Has to be thread-safe. virtual void* Allocate(size_t size) = 0; - // Deallocate previously allocated block + + // Deallocate previously allocated block. Has to be thread-safe. virtual void Deallocate(void* p) = 0; + // Returns the memory size of the block allocated at p. The default // implementation that just returns the original allocation_size is fine. 
virtual size_t UsableSize(void* /*p*/, size_t allocation_size) const { @@ -30,4 +36,9 @@ class MemoryAllocator { } }; +// Generates a cache allocator which allocates through jemalloc and utilizes +// MADV_DONTDUMP through madvise to exclude cache items from core dumps. +extern Status NewJemallocNodumpAllocator( + std::shared_ptr* memory_allocator); + } // namespace rocksdb diff --git a/src.mk b/src.mk index 7ebd93a15..cfe9dcd2f 100644 --- a/src.mk +++ b/src.mk @@ -143,6 +143,7 @@ LIB_SOURCES = \ util/filename.cc \ util/filter_policy.cc \ util/hash.cc \ + util/jemalloc_nodump_allocator.cc \ util/log_buffer.cc \ util/murmurhash.cc \ util/random.cc \ diff --git a/util/jemalloc_nodump_allocator.cc b/util/jemalloc_nodump_allocator.cc new file mode 100644 index 000000000..c1391649f --- /dev/null +++ b/util/jemalloc_nodump_allocator.cc @@ -0,0 +1,126 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). 
+
+#include "util/jemalloc_nodump_allocator.h"
+
+#include <cerrno>
+#include <string>
+
+#include "util/string_util.h"
+
+namespace rocksdb {
+
+#ifdef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+std::atomic<extent_alloc_t*> JemallocNodumpAllocator::original_alloc_{nullptr};
+
+JemallocNodumpAllocator::JemallocNodumpAllocator(
+    unsigned arena_index, int flags, std::unique_ptr<extent_hooks_t>&& hooks)
+    : arena_index_(arena_index), flags_(flags), hooks_(std::move(hooks)) {
+  assert(arena_index != 0);
+}
+
+void* JemallocNodumpAllocator::Allocate(size_t size) {
+  return mallocx(size, flags_);
+}
+
+void JemallocNodumpAllocator::Deallocate(void* p) { dallocx(p, flags_); }
+
+// Calls the saved jemalloc alloc hook, then marks the extent MADV_DONTDUMP.
+void* JemallocNodumpAllocator::Alloc(extent_hooks_t* extent, void* new_addr,
+                                     size_t size, size_t alignment, bool* zero,
+                                     bool* commit, unsigned arena_ind) {
+  extent_alloc_t* original_alloc =
+      original_alloc_.load(std::memory_order_relaxed);
+  assert(original_alloc != nullptr);
+  void* result = original_alloc(extent, new_addr, size, alignment, zero, commit,
+                                arena_ind);
+  if (result != nullptr && madvise(result, size, MADV_DONTDUMP) != 0) {
+    // madvise() only returns -1 on failure; the real error code is errno.
+    fprintf(
+        stderr,
+        "JemallocNodumpAllocator failed to set MADV_DONTDUMP, error code: %d\n",
+        errno);
+    assert(false);
+  }
+  return result;
+}
+
+JemallocNodumpAllocator::~JemallocNodumpAllocator() {
+  assert(arena_index_ != 0);
+  // hooks_ is still alive here: members are destroyed after this body runs.
+  int ret = DestroyArena(arena_index_);
+  if (ret != 0) {
+    fprintf(stderr, "Failed to destroy jemalloc arena, error code: %d\n", ret);
+  }
+}
+
+size_t JemallocNodumpAllocator::UsableSize(void* p,
+                                           size_t /*allocation_size*/) const {
+  return malloc_usable_size(static_cast<void*>(p));
+}
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+Status NewJemallocNodumpAllocator(
+    std::shared_ptr<MemoryAllocator>* memory_allocator) {
+  if (memory_allocator == nullptr) {
+    return Status::InvalidArgument("memory_allocator must be non-null.");
+  }
+#ifndef ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+  *memory_allocator = nullptr;
+  return Status::NotSupported(
+      "JemallocNodumpAllocator only available with jemalloc version >= 5 "
+      "and MADV_DONTDUMP is available.");
+#else
+  // Create arena.
+  unsigned arena_index = 0;
+  size_t arena_index_size = sizeof(arena_index);
+  int ret =
+      mallctl("arenas.create", &arena_index, &arena_index_size, nullptr, 0);
+  if (ret != 0) {
+    return Status::Incomplete("Failed to create jemalloc arena, error code: " +
+                              ToString(ret));
+  }
+  assert(arena_index != 0);
+  int flags = MALLOCX_ARENA(arena_index) | MALLOCX_TCACHE_NONE;
+  std::string key = "arena." + ToString(arena_index) + ".extent_hooks";
+
+  // Read existing hooks.
+  extent_hooks_t* hooks = nullptr;
+  size_t hooks_size = sizeof(hooks);
+  ret = mallctl(key.c_str(), &hooks, &hooks_size, nullptr, 0);
+  if (ret != 0) {
+    JemallocNodumpAllocator::DestroyArena(arena_index);  // Avoid arena leak.
+    return Status::Incomplete("Failed to read existing hooks, error code: " +
+                              ToString(ret));
+  }
+
+  // Store existing alloc.
+  extent_alloc_t* original_alloc = hooks->alloc;
+  extent_alloc_t* expected = nullptr;
+  bool success __attribute__((__unused__)) =
+      JemallocNodumpAllocator::original_alloc_.compare_exchange_strong(
+          expected, original_alloc);
+  assert(success || original_alloc == expected);
+
+  // Set the custom hook.
+  std::unique_ptr<extent_hooks_t> new_hooks(new extent_hooks_t(*hooks));
+  new_hooks->alloc = &JemallocNodumpAllocator::Alloc;
+  extent_hooks_t* hooks_ptr = new_hooks.get();
+  ret = mallctl(key.c_str(), nullptr, nullptr, &hooks_ptr, sizeof(hooks_ptr));
+  if (ret != 0) {
+    JemallocNodumpAllocator::DestroyArena(arena_index);  // Avoid arena leak.
+    return Status::Incomplete("Failed to set custom hook, error code: " +
+                              ToString(ret));
+  }
+
+  // Create cache allocator.
+  memory_allocator->reset(
+      new JemallocNodumpAllocator(arena_index, flags, std::move(new_hooks)));
+  return Status::OK();
+#endif  // ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+}
+
+}  // namespace rocksdb
diff --git a/util/jemalloc_nodump_allocator.h b/util/jemalloc_nodump_allocator.h
new file mode 100644
index 000000000..69826fafe
--- /dev/null
+++ b/util/jemalloc_nodump_allocator.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+#include <string>
+
+#include "rocksdb/memory_allocator.h"
+#include "util/string_util.h"
+
+#if defined(ROCKSDB_JEMALLOC) && defined(ROCKSDB_PLATFORM_POSIX)
+
+#include <jemalloc/jemalloc.h>
+#include <sys/mman.h>
+
+#if (JEMALLOC_VERSION_MAJOR >= 5) && defined(MADV_DONTDUMP)
+#define ROCKSDB_JEMALLOC_NODUMP_ALLOCATOR
+
+namespace rocksdb {
+
+// MemoryAllocator that allocates through mallocx()/dallocx() using flags_ and
+// destroys the jemalloc arena arena_index_ when it is destructed. Instances
+// are set up by NewJemallocNodumpAllocator().
+class JemallocNodumpAllocator : public MemoryAllocator {
+ public:
+  // arena_index: index of the jemalloc arena to destroy on destruction; must
+  //   be non-zero. flags: passed to every mallocx()/dallocx() call.
+  // hooks: extent hooks object; ownership is taken and kept until destruction.
+  JemallocNodumpAllocator(unsigned arena_index, int flags,
+                          std::unique_ptr<extent_hooks_t>&& hooks);
+  // Destroys the arena; a failure is only reported on stderr.
+  ~JemallocNodumpAllocator();
+
+  const char* Name() const override { return "JemallocNodumpAllocator"; }
+  void* Allocate(size_t size) override;
+  void Deallocate(void* p) override;
+  // Returns malloc_usable_size(p); allocation_size is ignored.
+  size_t UsableSize(void* p, size_t allocation_size) const override;
+
+ private:
+  friend Status NewJemallocNodumpAllocator(
+      std::shared_ptr<MemoryAllocator>* memory_allocator);
+
+  // Custom alloc hook to replace jemalloc default alloc.
+  static void* Alloc(extent_hooks_t* extent, void* new_addr, size_t size,
+                     size_t alignment, bool* zero, bool* commit,
+                     unsigned arena_ind);
+
+  // Destroys the jemalloc arena with the given index and returns the
+  // mallctl() result (0 on success). Shared by the destructor and by the
+  // failure paths of NewJemallocNodumpAllocator().
+  static int DestroyArena(unsigned arena_index) {
+    const std::string key = "arena." + ToString(arena_index) + ".destroy";
+    return mallctl(key.c_str(), nullptr, nullptr, nullptr, 0);
+  }
+
+  // A function pointer to jemalloc default alloc. Use atomic to make sure
+  // NewJemallocNodumpAllocator is thread-safe.
+  //
+  // Hack: original_alloc_ needs to be static for Alloc() to access it.
+  // alloc needs to be static to pass to jemalloc as function pointer.
+  static std::atomic<extent_alloc_t*> original_alloc_;
+
+  unsigned arena_index_;
+  int flags_;
+  const std::unique_ptr<extent_hooks_t> hooks_;
+};
+
+}  // namespace rocksdb
+#endif  // (JEMALLOC_VERSION_MAJOR >= 5) && MADV_DONTDUMP
+#endif  // ROCKSDB_JEMALLOC && ROCKSDB_PLATFORM_POSIX