From f0660d5253cddd84a1d5d5caafa592a9ec448d79 Mon Sep 17 00:00:00 2001 From: Radheshyam Balasundaram Date: Mon, 7 Jul 2014 10:53:31 -0700 Subject: [PATCH] Adding NUMA support to db_bench tests Summary: Changes: - Adding enable_numa flag to db_bench.cc - Using numa.h library to bind memory and cpu of threads to a fixed NUMA node Result: There seems to be no significant change in the micros/op time with enable_numa turned on. I also tried this with other implementations, including a combination of pthread_setaffinity_np, sched_setaffinity and set_mempolicy methods. It'd be great if someone could point out where I'm going wrong and if we can achieve a better micros/op. Test Plan: Ran db_bench tests using the following command: ./db_bench --db=/mnt/tmp --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=134217728 --max_bytes_for_level_base=1073741824 --disable_wal=0 --wal_dir=/mnt/tmp --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --duration=300 --benchmarks=readwhilewriting --use_existing_db=1 --num=157286400 --threads=24 --writes_per_second=10240 --enable_numa=[False/True] The tests were run on a private devserver with 24 cores and the db was prepopulated using the filluniquerandom test. 
The tests resulted in 0.145 us/op with enable_numa=False and 0.161 us/op with enable_numa=True. Reviewers: sdong, yhchiang, ljin, igor Reviewed By: ljin, igor Subscribers: igor, leveldb Differential Revision: https://reviews.facebook.net/D19353 --- build_tools/build_detect_platform | 12 +++++++++ build_tools/fbcode.gcc481.sh | 11 +++++--- db/db_bench.cc | 43 +++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 3 deletions(-) diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index c8ed00487..6ea1a63d0 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -21,6 +21,7 @@ # -DLEVELDB_PLATFORM_NOATOMIC if it is not # -DSNAPPY if the Snappy library is present # -DLZ4 if the LZ4 library is present +# -DNUMA if the NUMA library is present # # Using gflags in rocksdb: # Our project depends on gflags, which requires users to take some extra steps @@ -272,6 +273,17 @@ EOF PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -llz4" fi + # Test whether numa is available + $CXX $CFLAGS -x c++ - -o /dev/null -lnuma 2>/dev/null <<EOF + #include <numa.h> + #include <numaif.h> + int main() {} +EOF + if [ "$?" = 0 ]; then + COMMON_FLAGS="$COMMON_FLAGS -DNUMA" + PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lnuma" + fi + # Test whether tcmalloc is available $CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null < +#include +#endif + #include #include #include @@ -173,6 +178,14 @@ DEFINE_double(compression_ratio, 0.5, "Arrange to generate values that shrink" DEFINE_bool(histogram, false, "Print histogram of operation timings"); +DEFINE_bool(enable_numa, false, + "Make operations aware of NUMA architecture and bind memory " + "and cpus corresponding to nodes together. In NUMA, memory " + "in same node as CPUs are closer when compared to memory in " + "other nodes. Reads can be faster when the process is bound to " + "CPU and memory of same node. 
Use \"$numactl --hardware\" command " + "to see NUMA memory architecture."); + DEFINE_int64(write_buffer_size, rocksdb::Options().write_buffer_size, "Number of bytes to buffer in memtable before compacting"); @@ -863,6 +876,18 @@ class Benchmark { * num_) / 1048576.0)); fprintf(stdout, "Write rate limit: %d\n", FLAGS_writes_per_second); + if (FLAGS_enable_numa) { + fprintf(stderr, "Running in NUMA enabled mode.\n"); +#ifndef NUMA + fprintf(stderr, "NUMA is not defined in the system.\n"); + exit(1); +#else + if (numa_available() == -1) { + fprintf(stderr, "NUMA is not supported by the system.\n"); + exit(1); + } +#endif + } switch (FLAGS_compression_type_e) { case rocksdb::kNoCompression: fprintf(stdout, "Compression: none\n"); @@ -1348,7 +1373,25 @@ class Benchmark { shared.start = false; ThreadArg* arg = new ThreadArg[n]; + for (int i = 0; i < n; i++) { +#ifdef NUMA + if (FLAGS_enable_numa) { + // Performs a local allocation of memory to threads in numa node. + int n_nodes = numa_num_task_nodes(); // Number of nodes in NUMA. + numa_exit_on_error = 1; + int numa_node = i % n_nodes; + bitmask* nodes = numa_allocate_nodemask(); + numa_bitmask_clearall(nodes); + numa_bitmask_setbit(nodes, numa_node); + // numa_bind() call binds the process to the node and these + // properties are passed on to the thread that is created in + // StartThread method called later in the loop. + numa_bind(nodes); + numa_set_strict(1); + numa_free_nodemask(nodes); + } +#endif arg[i].bm = this; arg[i].method = method; arg[i].shared = &shared;