From f0660d5253cddd84a1d5d5caafa592a9ec448d79 Mon Sep 17 00:00:00 2001
From: Radheshyam Balasundaram <rbs@fb.com>
Date: Mon, 7 Jul 2014 10:53:31 -0700
Subject: [PATCH] Adding NUMA support to db_bench tests

Summary:
Changes:
- Adding numa_aware flag to db_bench.cc
- Using numa.h library to bind memory and cpu of threads to a fixed NUMA node
Result: There seems to be no significant change in the micros/op time with numa_aware enabled. I also tried this with other implementations, including a combination of pthread_setaffinity_np, sched_setaffinity and set_mempolicy methods. It'd be great if someone could point out where I'm going wrong and if we can achieve a better micors/op.

Test Plan:
Ran db_bench tests using following command:
./db_bench --db=/mnt/tmp --num_levels=6 --key_size=20 --prefix_size=20 --keys_per_prefix=0 --value_size=100 --block_size=4096 --cache_size=17179869184 --cache_numshardbits=6 --compression_type=none --compression_ratio=1 --min_level_to_compress=-1 --disable_seek_compaction=1 --hard_rate_limit=2 --write_buffer_size=134217728 --max_write_buffer_number=2 --level0_file_num_compaction_trigger=8 --target_file_size_base=134217728 --max_bytes_for_level_base=1073741824 --disable_wal=0 --wal_dir=/mnt/tmp --sync=0 --disable_data_sync=1 --verify_checksum=1 --delete_obsolete_files_period_micros=314572800 --max_grandparent_overlap_factor=10 --max_background_compactions=4 --max_background_flushes=0 --level0_slowdown_writes_trigger=16 --level0_stop_writes_trigger=24 --statistics=0 --stats_per_interval=0 --stats_interval=1048576 --histogram=0 --use_plain_table=1 --open_files=-1 --mmap_read=1 --mmap_write=0 --memtablerep=prefix_hash --bloom_bits=10 --bloom_locality=1 --perf_level=0 --duration=300 --benchmarks=readwhilewriting --use_existing_db=1 --num=157286400 --threads=24 --writes_per_second=10240 --numa_aware=[False/True]

The tests were run in private devserver with 24 cores and the db was prepopulated using filluniquerandom test. The tests resulted in 0.145 us/op with numa_aware=False and 0.161 us/op with numa_aware=True.

Reviewers: sdong, yhchiang, ljin, igor

Reviewed By: ljin, igor

Subscribers: igor, leveldb

Differential Revision: https://reviews.facebook.net/D19353
---
 build_tools/build_detect_platform | 12 +++++++++
 build_tools/fbcode.gcc481.sh      | 11 +++++---
 db/db_bench.cc                    | 43 +++++++++++++++++++++++++++++++
 3 files changed, 63 insertions(+), 3 deletions(-)
diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform
index c8ed00487..6ea1a63d0 100755
--- a/build_tools/build_detect_platform
+++ b/build_tools/build_detect_platform
@@ -21,6 +21,7 @@
 #       -DLEVELDB_PLATFORM_NOATOMIC if it is not
 #       -DSNAPPY                    if the Snappy library is present
 #       -DLZ4                       if the LZ4 library is present
+#       -DNUMA                      if the NUMA library is present
 #
 # Using gflags in rocksdb:
 # Our project depends on gflags, which requires users to take some extra steps
@@ -272,6 +273,17 @@ EOF
         PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -llz4"
     fi
 
+    # Test whether numa is available
+    $CXX $CFLAGS -x c++ - -o /dev/null -lnuma 2>/dev/null  <<EOF
+      #include <numa.h>
+      #inlcude <numaif.h>
+      int main() {}
+EOF
+    if [ "$?" = 0 ]; then
+        COMMON_FLAGS="$COMMON_FLAGS -DNUMA"
+        PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lnuma"
+    fi
+
     # Test whether tcmalloc is available
     $CXX $CFLAGS -x c++ - -o /dev/null -ltcmalloc 2>/dev/null  <<EOF
       int main() {}
diff --git a/build_tools/fbcode.gcc481.sh b/build_tools/fbcode.gcc481.sh
index c1399099c..5426e3f9a 100644
--- a/build_tools/fbcode.gcc481.sh
+++ b/build_tools/fbcode.gcc481.sh
@@ -55,22 +55,27 @@ GFLAGS_LIBS=" $TOOLCHAIN_LIB_BASE/gflags/gflags-1.6/c3f970a/lib/libgflags.a"
 JEMALLOC_INCLUDE=" -I $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/include/"
 JEMALLOC_LIB=" -Wl,--whole-archive $TOOLCHAIN_LIB_BASE/jemalloc/jemalloc-3.4.1/4d53c6f/lib/libjemalloc.a"
 
+# location of numa
+NUMA_REV=829d10dac0230f99cd7e1778869d2adf3da24b65
+NUMA_INCLUDE=" -I /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/include/"
+NUMA_LIB=" /mnt/gvfs/third-party2/numa/$NUMA_REV/2.0.8/gcc-4.8.1-glibc-2.17/c3f970a/lib/libnuma.a"
+
 # use Intel SSE support for checksum calculations
 export USE_SSE=" -msse -msse4.2 "
 
 CC="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/gcc"
-CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE"
+CXX="$TOOLCHAIN_EXECUTABLES/gcc/gcc-4.8.1/cc6c9dc/bin/g++ $JINCLUDE $SNAPPY_INCLUDE $ZLIB_INCLUDE $BZIP_INCLUDE $LZ4_INCLUDE $GFLAGS_INCLUDE $NUMA_INCLUDE"
 AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
 RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
 
 CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic"
 CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
 CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT"
-CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4"
+CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB -DBZIP2 -DLZ4 -DNUMA"
 
 EXEC_LDFLAGS="-Wl,--dynamic-linker,/usr/local/fbcode/gcc-4.8.1-glibc-2.17/lib/ld.so"
 EXEC_LDFLAGS+=" -Wl,--no-whole-archive $TOOLCHAIN_LIB_BASE/libunwind/libunwind-1.0.1/675d945/lib/libunwind.a"
-EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS"
+EXEC_LDFLAGS+=" $HDFSLIB $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $GFLAGS_LIBS $NUMA_LIB"
 
 PLATFORM_LDFLAGS="$LIBGCC_LIBS $GLIBC_LIBS "
 
diff --git a/db/db_bench.cc b/db/db_bench.cc
index 201133c66..b74ce28e7 100644
--- a/db/db_bench.cc
+++ b/db/db_bench.cc
@@ -17,6 +17,11 @@ int main() {
 }
 #else
 
+#ifdef NUMA
+#include <numa.h>
+#include <numaif.h>
+#endif
+
 #include <inttypes.h>
 #include <cstddef>
 #include <sys/types.h>
@@ -173,6 +178,14 @@ DEFINE_double(compression_ratio, 0.5, "Arrange to generate values that shrink"
 
 DEFINE_bool(histogram, false, "Print histogram of operation timings");
 
+DEFINE_bool(enable_numa, false,
+            "Make operations aware of NUMA architecture and bind memory "
+            "and cpus corresponding to nodes together. In NUMA, memory "
+            "in same node as CPUs are closer when compared to memory in "
+            "other nodes. Reads can be faster when the process is bound to "
+            "CPU and memory of same node. Use \"$numactl --hardware\" command "
+            "to see NUMA memory architecture.");
+
 DEFINE_int64(write_buffer_size, rocksdb::Options().write_buffer_size,
              "Number of bytes to buffer in memtable before compacting");
 
@@ -863,6 +876,18 @@ class Benchmark {
               * num_)
              / 1048576.0));
     fprintf(stdout, "Write rate limit: %d\n", FLAGS_writes_per_second);
+    if (FLAGS_enable_numa) {
+      fprintf(stderr, "Running in NUMA enabled mode.\n");
+#ifndef NUMA
+      fprintf(stderr, "NUMA is not defined in the system.\n");
+      exit(1);
+#else
+      if (numa_available() == -1) {
+        fprintf(stderr, "NUMA is not supported by the system.\n");
+        exit(1);
+      }
+#endif
+    }
     switch (FLAGS_compression_type_e) {
       case rocksdb::kNoCompression:
         fprintf(stdout, "Compression: none\n");
@@ -1348,7 +1373,25 @@ class Benchmark {
     shared.start = false;
 
     ThreadArg* arg = new ThreadArg[n];
+
     for (int i = 0; i < n; i++) {
+#ifdef NUMA
+      if (FLAGS_enable_numa) {
+        // Performs a local allocation of memory to threads in numa node.
+        int n_nodes = numa_num_task_nodes();  // Number of nodes in NUMA.
+        numa_exit_on_error = 1;
+        int numa_node = i % n_nodes;
+        bitmask* nodes = numa_allocate_nodemask();
+        numa_bitmask_clearall(nodes);
+        numa_bitmask_setbit(nodes, numa_node);
+        // numa_bind() call binds the process to the node and these
+        // properties are passed on to the thread that is created in
+        // StartThread method called later in the loop.
+        numa_bind(nodes);
+        numa_set_strict(1);
+        numa_free_nodemask(nodes);
+      }
+#endif
       arg[i].bm = this;
       arg[i].method = method;
       arg[i].shared = &shared;