rcu: Improve SRCU's wait_idx() comments

[platform/adaptation/renesas_rcar/renesas_kernel.git] / kernel / srcu.c
diff --git a/kernel/srcu.c b/kernel/srcu.c

index ba35f3a..1fecb4d 100644 (file)
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -73,19 +73,102 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
  #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
  
  /*
- * srcu_readers_active_idx -- returns approximate number of readers
- *     active on the specified rank of per-CPU counters.
+ * Returns approximate number of readers active on the specified rank
+ * of per-CPU counters.  Also snapshots each counter's value in the
+ * corresponding element of sp->snap[] for later use validating
+ * the sum.
   */
+static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+{
+       int cpu;
+       unsigned long sum = 0;
+       unsigned long t;
  
-static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+       for_each_possible_cpu(cpu) {
+               t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
+               sum += t;
+               sp->snap[cpu] = t;
+       }
+       return sum & SRCU_REF_MASK;
+}
+
+/*
+ * To be called from the update side after an index flip.  Returns true
+ * if the modulo sum of the counters is stably zero, false if there is
+ * some possibility of non-zero.
+ */
+static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
  {
         int cpu;
-       int sum;
  
-       sum = 0;
+       /*
+        * Note that srcu_readers_active_idx() can incorrectly return
+        * zero even though there is a pre-existing reader throughout.
+        * To see this, suppose that task A is in a very long SRCU
+        * read-side critical section that started on CPU 0, and that
+        * no other reader exists, so that the modulo sum of the counters
+        * is equal to one.  Then suppose that task B starts executing
+        * srcu_readers_active_idx(), summing up to CPU 1, and then that
+        * task C starts reading on CPU 0, so that its increment is not
+        * summed, but finishes reading on CPU 2, so that its decrement
+        * -is- summed.  Then when task B completes its sum, it will
+        * incorrectly get zero, despite the fact that task A has been
+        * in its SRCU read-side critical section the whole time.
+        *
+        * We therefore do a validation step should srcu_readers_active_idx()
+        * return zero.
+        */
+       if (srcu_readers_active_idx(sp, idx) != 0)
+               return false;
+
+       /*
+        * Since the caller recently flipped ->completed, we can see at
+        * most one increment of each CPU's counter from this point
+        * forward.  The reason for this is that the reader CPU must have
+        * fetched the index before srcu_readers_active_idx checked
+        * that CPU's counter, but not yet incremented its counter.
+        * Its eventual counter increment will follow the read in
+        * srcu_readers_active_idx(), and that increment is immediately
+        * followed by smp_mb() B.  Because smp_mb() D is between
+        * the ->completed flip and srcu_readers_active_idx()'s read,
+        * that CPU's subsequent load of ->completed must see the new
+        * value, and therefore increment the counter in the other rank.
+        */
+       smp_mb(); /* A */
+
+       /*
+        * Now, we check the ->snap array that srcu_readers_active_idx()
+        * filled in from the per-CPU counter values. Since
+        * __srcu_read_lock() increments the upper bits of the per-CPU
+        * counter, an increment/decrement pair will change the value
+        * of the counter.  Since there is only one possible increment,
+        * the only way to wrap the counter is to have a huge number of
+        * counter decrements, which requires a huge number of tasks and
+        * huge SRCU read-side critical-section nesting levels, even on
+        * 32-bit systems.
+        *
+        * All of the ways of confusing the readings require that the scan
+        * in srcu_readers_active_idx() see the read-side task's decrement,
+        * but not its increment.  However, between that decrement and
+        * increment are smb_mb() B and C.  Either or both of these pair
+        * with smp_mb() A above to ensure that the scan below will see
+        * the read-side tasks's increment, thus noting a difference in
+        * the counter values between the two passes.
+        *
+        * Therefore, if srcu_readers_active_idx() returned zero, and
+        * none of the counters changed, we know that the zero was the
+        * correct sum.
+        *
+        * Of course, it is possible that a task might be delayed
+        * for a very long time in __srcu_read_lock() after fetching
+        * the index but before incrementing its counter.  This
+        * possibility will be dealt with in __synchronize_srcu().
+        */
         for_each_possible_cpu(cpu)
-               sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx];
-       return sum;
+               if (sp->snap[cpu] !=
+                   ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]))
+                       return false;  /* False zero reading! */
+       return true;
  }
  
  /**
@@ -131,10 +214,11 @@ int __srcu_read_lock(struct srcu_struct *sp)
         int idx;
  
         preempt_disable();
-       idx = sp->completed & 0x1;
-       barrier();  /* ensure compiler looks -once- at sp->completed. */
-       per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++;
-       srcu_barrier();  /* ensure compiler won't misorder critical section. */
+       idx = rcu_dereference_index_check(sp->completed,
+                                         rcu_read_lock_sched_held()) & 0x1;
+       ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) +=
+               SRCU_USAGE_COUNT + 1;
+       smp_mb(); /* B */  /* Avoid leaking the critical section. */
         preempt_enable();
         return idx;
  }
@@ -149,8 +233,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
  void __srcu_read_unlock(struct srcu_struct *sp, int idx)
  {
         preempt_disable();
-       srcu_barrier();  /* ensure compiler won't misorder critical section. */
-       per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
+       smp_mb(); /* C */  /* Avoid leaking the critical section. */
+       ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1;
         preempt_enable();
  }
  EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -163,104 +247,114 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
   * we repeatedly block for 1-millisecond time periods.  This approach
   * has done well in testing, so there is no need for a config parameter.
   */
-#define SYNCHRONIZE_SRCU_READER_DELAY 10
+#define SYNCHRONIZE_SRCU_READER_DELAY 5
  
  /*
- * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
+ * Wait until all pre-existing readers complete.  Such readers
+ * will have used the index specified by "idx".
   */
-static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
+static void wait_idx(struct srcu_struct *sp, int idx, bool expedited)
  {
-       int idx;
-
-       rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
-                          !lock_is_held(&rcu_bh_lock_map) &&
-                          !lock_is_held(&rcu_lock_map) &&
-                          !lock_is_held(&rcu_sched_lock_map),
-                          "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
-
-       idx = sp->completed;
-       mutex_lock(&sp->mutex);
+       int trycount = 0;
  
         /*
-        * Check to see if someone else did the work for us while we were
-        * waiting to acquire the lock.  We need -two- advances of
-        * the counter, not just one.  If there was but one, we might have
-        * shown up -after- our helper's first synchronize_sched(), thus
-        * having failed to prevent CPU-reordering races with concurrent
-        * srcu_read_unlock()s on other CPUs (see comment below).  So we
-        * either (1) wait for two or (2) supply the second ourselves.
+        * If a reader fetches the index before the ->completed increment,
+        * but increments its counter after srcu_readers_active_idx_check()
+        * sums it, then smp_mb() D will pair with __srcu_read_lock()'s
+        * smp_mb() B to ensure that the SRCU read-side critical section
+        * will see any updates that the current task performed before its
+        * call to synchronize_srcu(), or to synchronize_srcu_expedited(),
+        * as the case may be.
          */
+       smp_mb(); /* D */
  
-       if ((sp->completed - idx) >= 2) {
-               mutex_unlock(&sp->mutex);
-               return;
+       /*
+        * SRCU read-side critical sections are normally short, so wait
+        * a small amount of time before possibly blocking.
+        */
+       if (!srcu_readers_active_idx_check(sp, idx)) {
+               udelay(SYNCHRONIZE_SRCU_READER_DELAY);
+               while (!srcu_readers_active_idx_check(sp, idx)) {
+                       if (expedited && ++ trycount < 10)
+                               udelay(SYNCHRONIZE_SRCU_READER_DELAY);
+                       else
+                               schedule_timeout_interruptible(1);
+               }
         }
  
-       sync_func();  /* Force memory barrier on all CPUs. */
-
         /*
-        * The preceding synchronize_sched() ensures that any CPU that
-        * sees the new value of sp->completed will also see any preceding
-        * changes to data structures made by this CPU.  This prevents
-        * some other CPU from reordering the accesses in its SRCU
-        * read-side critical section to precede the corresponding
-        * srcu_read_lock() -- ensuring that such references will in
-        * fact be protected.
+        * The following smp_mb() E pairs with srcu_read_unlock()'s
+        * smp_mb C to ensure that if srcu_readers_active_idx_check()
+        * sees srcu_read_unlock()'s counter decrement, then any
+        * of the current task's subsequent code will happen after
+        * that SRCU read-side critical section.
          *
-        * So it is now safe to do the flip.
+        * It also ensures the order between the above waiting and
+        * the next flipping.
          */
+       smp_mb(); /* E */
+}
  
-       idx = sp->completed & 0x1;
+static void srcu_flip(struct srcu_struct *sp)
+{
         sp->completed++;
+}
  
-       sync_func();  /* Force memory barrier on all CPUs. */
-
-       /*
-        * At this point, because of the preceding synchronize_sched(),
-        * all srcu_read_lock() calls using the old counters have completed.
-        * Their corresponding critical sections might well be still
-        * executing, but the srcu_read_lock() primitives themselves
-        * will have finished executing.  We initially give readers
-        * an arbitrarily chosen 10 microseconds to get out of their
-        * SRCU read-side critical sections, then loop waiting 1/HZ
-        * seconds per iteration.  The 10-microsecond value has done
-        * very well in testing.
-        */
+/*
+ * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
+ */
+static void __synchronize_srcu(struct srcu_struct *sp, bool expedited)
+{
+       int busy_idx;
  
-       if (srcu_readers_active_idx(sp, idx))
-               udelay(SYNCHRONIZE_SRCU_READER_DELAY);
-       while (srcu_readers_active_idx(sp, idx))
-               schedule_timeout_interruptible(1);
+       rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
+                          !lock_is_held(&rcu_bh_lock_map) &&
+                          !lock_is_held(&rcu_lock_map) &&
+                          !lock_is_held(&rcu_sched_lock_map),
+                          "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
  
-       sync_func();  /* Force memory barrier on all CPUs. */
+       mutex_lock(&sp->mutex);
+       busy_idx = sp->completed & 0X1UL;
  
         /*
-        * The preceding synchronize_sched() forces all srcu_read_unlock()
-        * primitives that were executing concurrently with the preceding
-        * for_each_possible_cpu() loop to have completed by this point.
-        * More importantly, it also forces the corresponding SRCU read-side
-        * critical sections to have also completed, and the corresponding
-        * references to SRCU-protected data items to be dropped.
+        * If we recently flipped the index, there will be some readers
+        * using idx=0 and others using idx=1.  Therefore, two calls to
+        * wait_idx()s suffice to ensure that all pre-existing readers
+        * have completed:
+        *
+        * __synchronize_srcu() {
+        *      wait_idx(sp, 0, expedited);
+        *      wait_idx(sp, 1, expedited);
+        * }
          *
-        * Note:
+        * Starvation is prevented by the fact that we flip the index.
+        * While we wait on one index to clear out, almost all new readers
+        * will be using the other index.  The number of new readers using the
+        * index we are waiting on is sharply bounded by roughly the number
+        * of CPUs.
          *
-        *      Despite what you might think at first glance, the
-        *      preceding synchronize_sched() -must- be within the
-        *      critical section ended by the following mutex_unlock().
-        *      Otherwise, a task taking the early exit can race
-        *      with a srcu_read_unlock(), which might have executed
-        *      just before the preceding srcu_readers_active() check,
-        *      and whose CPU might have reordered the srcu_read_unlock()
-        *      with the preceding critical section.  In this case, there
-        *      is nothing preventing the synchronize_sched() task that is
-        *      taking the early exit from freeing a data structure that
-        *      is still being referenced (out of order) by the task
-        *      doing the srcu_read_unlock().
+        * How can new readers possibly using the old pre-flip value of
+        * the index?  Consider the following sequence of events:
          *
-        *      Alternatively, the comparison with "2" on the early exit
-        *      could be changed to "3", but this increases synchronize_srcu()
-        *      latency for bulk loads.  So the current code is preferred.
+        * Suppose that during the previous grace period, a reader
+        * picked up the old value of the index, but did not increment
+        * its counter until after the previous instance of
+        * __synchronize_srcu() did the counter summation and recheck.
+        * That previous grace period was OK because the reader did
+        * not start until after the grace period started, so the grace
+        * period was not obligated to wait for that reader.
+        *
+        * However, this sequence of events is quite improbable, so
+        * this call to wait_idx(), which waits on really old readers
+        * describe in this comment above, will almost never need to wait.
          */
+       wait_idx(sp, 1 - busy_idx, expedited);
+
+       /* Flip the index to avoid reader-induced starvation. */
+       srcu_flip(sp);
+
+       /* Wait for recent pre-existing readers. */
+       wait_idx(sp, busy_idx, expedited);
  
         mutex_unlock(&sp->mutex);
  }
@@ -281,7 +375,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
   */
  void synchronize_srcu(struct srcu_struct *sp)
  {
-       __synchronize_srcu(sp, synchronize_sched);
+       __synchronize_srcu(sp, 0);
  }
  EXPORT_SYMBOL_GPL(synchronize_srcu);
  
@@ -289,18 +383,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
   * synchronize_srcu_expedited - Brute-force SRCU grace period
   * @sp: srcu_struct with which to synchronize.
   *
- * Wait for an SRCU grace period to elapse, but use a "big hammer"
- * approach to force the grace period to end quickly.  This consumes
- * significant time on all CPUs and is unfriendly to real-time workloads,
- * so is thus not recommended for any sort of common-case code.  In fact,
- * if you are using synchronize_srcu_expedited() in a loop, please
- * restructure your code to batch your updates, and then use a single
- * synchronize_srcu() instead.
+ * Wait for an SRCU grace period to elapse, but be more aggressive about
+ * spinning rather than blocking when waiting.
   *
   * Note that it is illegal to call this function while holding any lock
- * that is acquired by a CPU-hotplug notifier.  And yes, it is also illegal
- * to call this function from a CPU-hotplug notifier.  Failing to observe
- * these restriction will result in deadlock.  It is also illegal to call
+ * that is acquired by a CPU-hotplug notifier.  It is also illegal to call
   * synchronize_srcu_expedited() from the corresponding SRCU read-side
   * critical section; doing so will result in deadlock.  However, it is
   * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
@@ -309,7 +396,7 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
   */
  void synchronize_srcu_expedited(struct srcu_struct *sp)
  {
-       __synchronize_srcu(sp, synchronize_sched_expedited);
+       __synchronize_srcu(sp, 1);
  }
  EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);