From 58410aee44e902735659b80364eecc0e075676e9 Mon Sep 17 00:00:00 2001
From: Maysam Yabandeh
Date: Thu, 3 Aug 2017 10:36:50 -0700
Subject: [PATCH] Fix the overflow bug in AwaitState

Summary:
https://github.com/facebook/rocksdb/issues/2559 reports an overflow in
AwaitState. nbronson debugged the issue and presented the fix, which is
applied in this patch. Moreover, this patch adds more comments to clarify
the logic in AwaitState.

I tried both 16 and 64 threads on the update benchmark. The fix lowers CPU
usage by about 1.6% but also lowers throughput by 1.6% and 2% at 16 and 64
threads, respectively. Apparently the bug had biased the adaptation toward
spinning more often.

Benchmarks:
TEST_TMPDIR=/dev/shm/tmpdb time ./db_bench --benchmarks="fillrandom" --threads=16 --num=2000000
TEST_TMPDIR=/dev/shm/tmpdb time ./db_bench --use_existing_db=1 --benchmarks="updaterandom[X3]" --threads=16 --num=2000000
TEST_TMPDIR=/dev/shm/tmpdb time ./db_bench --use_existing_db=1 --benchmarks="updaterandom[X3]" --threads=64 --num=200000

Results:
$ cat update-16t-bug.txt | tail -4
updaterandom [AVG 3 runs] : 234117 ops/sec; 51.8 MB/sec
updaterandom [MEDIAN 3 runs] : 233581 ops/sec; 51.7 MB/sec
3896.42user 1539.12system 6:50.61elapsed 1323%CPU (0avgtext+0avgdata 331308maxresident)k
0inputs+0outputs (0major+1281001minor)pagefaults 0swaps

$ cat update-16t-fixed.txt | tail -4
updaterandom [AVG 3 runs] : 230364 ops/sec; 51.0 MB/sec
updaterandom [MEDIAN 3 runs] : 226169 ops/sec; 50.0 MB/sec
3865.46user 1568.32system 6:57.63elapsed 1301%CPU (0avgtext+0avgdata 315012maxresident)k
0inputs+0outputs (0major+1342568minor)pagefaults 0swaps

$ cat update-64t-bug.txt | tail -4
updaterandom [AVG 3 runs] : 261878 ops/sec; 57.9 MB/sec
updaterandom [MEDIAN 3 runs] : 262859 ops/sec; 58.2 MB/sec
926.27user 578.06system 2:27.46elapsed 1020%CPU (0avgtext+0avgdata 475480maxresident)k
0inputs+0outputs (0major+1058728minor)pagefaults 0swaps

$ cat update-64t-fixed.txt | tail -4
updaterandom [AVG 3 runs] : 256699 ops/sec; 56.8 MB/sec
updaterandom [MEDIAN 3 runs] : 256380 ops/sec; 56.7 MB/sec
933.47user 575.37system 2:30.41elapsed 1003%CPU (0avgtext+0avgdata 482340maxresident)k
0inputs+0outputs (0major+1078557minor)pagefaults 0swaps

Closes https://github.com/facebook/rocksdb/pull/2679

Differential Revision: D5553732

Pulled By: maysamyabandeh

fbshipit-source-id: 98b72dc3a8e0f22ea29d4f7c7790af10c369c5bb
---
 db/write_thread.cc | 32 ++++++++++++++++++++++++++------
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/db/write_thread.cc b/db/write_thread.cc
index 4a9fc1406..2d3b34602 100644
--- a/db/write_thread.cc
+++ b/db/write_thread.cc
@@ -57,6 +57,10 @@ uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask,
                                 AdaptationContext* ctx) {
   uint8_t state;
 
+  // 1. Busy loop using "pause" for 1 micro sec
+  // 2. Else SOMETIMES busy loop using "yield" for 100 micro sec (default)
+  // 3. Else blocking wait
+
   // On a modern Xeon each loop takes about 7 nanoseconds (most of which
   // is the effect of the pause instruction), so 200 iterations is a bit
   // more than a microsecond. This is long enough that waits longer than
@@ -114,13 +118,21 @@ uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask,
 
   const size_t kMaxSlowYieldsWhileSpinning = 3;
 
+  // Whether the yield approach has any credit in this context. The credit is
+  // added by yield being successful before timing out, and decreased otherwise.
+  auto& yield_credit = ctx->value;
+  // Update the yield_credit based on sample runs or right after a hard failure
   bool update_ctx = false;
+  // Should we reinforce the yield credit
   bool would_spin_again = false;
+  // The sampling base for updating the yield credit. The sampling rate is
+  // 1/sampling_base.
+  const int sampling_base = 256;
 
   if (max_yield_usec_ > 0) {
-    update_ctx = Random::GetTLSInstance()->OneIn(256);
+    update_ctx = Random::GetTLSInstance()->OneIn(sampling_base);
 
-    if (update_ctx || ctx->value.load(std::memory_order_relaxed) >= 0) {
+    if (update_ctx || yield_credit.load(std::memory_order_relaxed) >= 0) {
       // we're updating the adaptation statistics, or spinning has >
       // 50% chance of being shorter than max_yield_usec_ and causing no
       // involuntary context switches
@@ -149,7 +161,7 @@ uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask,
           // accurate enough to measure the yield duration
           ++slow_yield_count;
           if (slow_yield_count >= kMaxSlowYieldsWhileSpinning) {
-            // Not just one ivcsw, but several. Immediately update ctx
+            // Not just one ivcsw, but several. Immediately update yield_credit
             // and fall back to blocking
             update_ctx = true;
             break;
@@ -165,11 +177,19 @@ uint8_t WriteThread::AwaitState(Writer* w, uint8_t goal_mask,
   }
 
   if (update_ctx) {
-    auto v = ctx->value.load(std::memory_order_relaxed);
+    // Since our update is sample-based, it is ok if a thread overwrites the
+    // updates made by other threads; the update does not have to be atomic.
+    auto v = yield_credit.load(std::memory_order_relaxed);
     // fixed point exponential decay with decay constant 1/1024, with +1
     // and -1 scaled to avoid overflow for int32_t
-    v = v + (v / 1024) + (would_spin_again ? 1 : -1) * 16384;
-    ctx->value.store(v, std::memory_order_relaxed);
+    //
+    // On each update the positive credit is decayed by a factor of 1/1024
+    // (i.e., 0.1%). If the sampled yield was successful, the credit is also
+    // increased by X. Setting X=2^17 ensures that the credit never exceeds
+    // 2^17*2^10=2^27, which is lower than 2^31, the upper bound of int32_t.
+    // The same logic applies to negative credits.
+    v = v - (v / 1024) + (would_spin_again ? 1 : -1) * 131072;
+    yield_credit.store(v, std::memory_order_relaxed);
   }
 
   assert((state & goal_mask) != 0);
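For readers unfamiliar with AwaitState, the three phases named in the first
hunk can be sketched as below. This is a condensed, hypothetical illustration;
the names, constants, and condition-variable plumbing are illustrative, not
RocksDB's actual code:

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <mutex>
#include <thread>
#if defined(__x86_64__) || defined(__i386__)
#include <immintrin.h>  // _mm_pause
#endif

// Waits until `ready` becomes true. The notifier must set `ready` and then
// call cv.notify_all() while holding (or after releasing) `mu`.
void AwaitReady(std::atomic<bool>& ready, bool may_yield,
                std::mutex& mu, std::condition_variable& cv) {
  // Phase 1: ~1 micro sec of busy looping with "pause" (about 200
  // iterations at ~7 ns per loop, per the patch's comment).
  for (int i = 0; i < 200; ++i) {
    if (ready.load(std::memory_order_acquire)) return;
#if defined(__x86_64__) || defined(__i386__)
    _mm_pause();
#endif
  }
  // Phase 2: SOMETIMES loop with yield for up to ~100 micro sec, when the
  // sampled credit suggests yielding tends to succeed before timing out.
  if (may_yield) {
    auto deadline = std::chrono::steady_clock::now() +
                    std::chrono::microseconds(100);
    while (std::chrono::steady_clock::now() < deadline) {
      if (ready.load(std::memory_order_acquire)) return;
      std::this_thread::yield();
    }
  }
  // Phase 3: blocking wait.
  std::unique_lock<std::mutex> lock(mu);
  cv.wait(lock, [&] { return ready.load(std::memory_order_acquire); });
}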
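The second hunk's claim that the credit update "does not have to be atomic"
rests on the OneIn(sampling_base) sampling: a lost update merely drops one
sample and slightly delays adaptation. A minimal sketch of that pattern,
assuming generic names rather than RocksDB's Random API:

#include <atomic>
#include <cstdint>
#include <random>

std::atomic<int32_t> yield_credit{0};  // shared adaptation state

void MaybeUpdateCredit(bool yield_succeeded) {
  thread_local std::mt19937 rng{std::random_device{}()};
  // Sample roughly 1 in 256 calls, mirroring OneIn(sampling_base).
  if ((rng() & 0xFF) != 0) return;
  // Plain load/store pair: a concurrent sampler may overwrite this update,
  // which only loses one sample; correctness does not depend on it.
  int32_t v = yield_credit.load(std::memory_order_relaxed);
  v = v - (v / 1024) + (yield_succeeded ? 1 : -1) * 131072;
  yield_credit.store(v, std::memory_order_relaxed);
}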
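To see why the last hunk's change from "v + (v / 1024)" to "v - (v / 1024)"
matters: the old rule is positive feedback, so a long run of successful
samples grows the credit geometrically until it wraps int32_t, while the new
rule decays toward the fixed point 131072 * 1024 = 2^27. A standalone
simulation of both rules (not part of the patch):

#include <cstdint>
#include <cstdio>

int main() {
  // Old (buggy) rule: v = v + (v / 1024) + 16384. Tracked in 64 bits here
  // so the simulation itself does not hit signed overflow (UB in C++).
  int64_t old_v = 0;
  int steps = 0;
  while (old_v <= INT32_MAX) {
    old_v = old_v + (old_v / 1024) + 16384;
    ++steps;
  }
  printf("old rule exceeds INT32_MAX after %d successful samples\n", steps);

  // Fixed rule: v = v - (v / 1024) + 131072 converges to the fixed point
  // 131072 * 1024 = 2^27, comfortably below 2^31.
  int32_t new_v = 0;
  for (int i = 0; i < 100000; ++i) {
    new_v = new_v - (new_v / 1024) + 131072;
  }
  printf("new rule settles at %d (2^27 = %d)\n", (int)new_v, 1 << 27);
  return 0;
}

With integer division the fixed rule reaches exactly 2^27 and stays there,
which is the bound the added comment in the patch describes.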