#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
/*
- * srcu_readers_active_idx -- returns approximate number of readers
- * active on the specified rank of per-CPU counters.
+ * Returns approximate number of readers active on the specified rank
+ * of per-CPU counters. Also snapshots each counter's value in the
+ * corresponding element of sp->snap[] for later use validating
+ * the sum.
*/
+static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+{
+ int cpu;
+ unsigned long sum = 0;
+ unsigned long t;
-static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+ for_each_possible_cpu(cpu) {
+ t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
+ sum += t;
+ sp->snap[cpu] = t;
+ }
+ return sum & SRCU_REF_MASK;
+}
+
+/*
+ * To be called from the update side after an index flip. Returns true
+ * if the modulo sum of the counters is stably zero, false if there is
+ * some possibility of non-zero.
+ */
+static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
{
int cpu;
- int sum;
- sum = 0;
+ /*
+ * Note that srcu_readers_active_idx() can incorrectly return
+ * zero even though there is a pre-existing reader throughout.
+ * To see this, suppose that task A is in a very long SRCU
+ * read-side critical section that started on CPU 0, and that
+ * no other reader exists, so that the modulo sum of the counters
+ * is equal to one. Then suppose that task B starts executing
+ * srcu_readers_active_idx(), summing up to CPU 1, and then that
+ * task C starts reading on CPU 0, so that its increment is not
+ * summed, but finishes reading on CPU 2, so that its decrement
+ * -is- summed. Then when task B completes its sum, it will
+ * incorrectly get zero, despite the fact that task A has been
+ * in its SRCU read-side critical section the whole time.
+ *
+ * We therefore do a validation step should srcu_readers_active_idx()
+ * return zero.
+ */
+ if (srcu_readers_active_idx(sp, idx) != 0)
+ return false;
+
+ /*
+ * Since the caller recently flipped ->completed, we can see at
+ * most one increment of each CPU's counter from this point
+ * forward. The reason for this is that the reader CPU must have
+ * fetched the index before srcu_readers_active_idx checked
+ * that CPU's counter, but not yet incremented its counter.
+ * Its eventual counter increment will follow the read in
+ * srcu_readers_active_idx(), and that increment is immediately
+ * followed by smp_mb() B. Because smp_mb() D is between
+ * the ->completed flip and srcu_readers_active_idx()'s read,
+ * that CPU's subsequent load of ->completed must see the new
+ * value, and therefore increment the counter in the other rank.
+ */
+ smp_mb(); /* A */
+
+ /*
+ * Now, we check the ->snap array that srcu_readers_active_idx()
+ * filled in from the per-CPU counter values. Since
+ * __srcu_read_lock() increments the upper bits of the per-CPU
+ * counter, an increment/decrement pair will change the value
+ * of the counter. Since there is only one possible increment,
+ * the only way to wrap the counter is to have a huge number of
+ * counter decrements, which requires a huge number of tasks and
+ * huge SRCU read-side critical-section nesting levels, even on
+ * 32-bit systems.
+ *
+ * All of the ways of confusing the readings require that the scan
+ * in srcu_readers_active_idx() see the read-side task's decrement,
+ * but not its increment. However, between that decrement and
+ * increment are smb_mb() B and C. Either or both of these pair
+ * with smp_mb() A above to ensure that the scan below will see
+ * the read-side tasks's increment, thus noting a difference in
+ * the counter values between the two passes.
+ *
+ * Therefore, if srcu_readers_active_idx() returned zero, and
+ * none of the counters changed, we know that the zero was the
+ * correct sum.
+ *
+ * Of course, it is possible that a task might be delayed
+ * for a very long time in __srcu_read_lock() after fetching
+ * the index but before incrementing its counter. This
+ * possibility will be dealt with in __synchronize_srcu().
+ */
for_each_possible_cpu(cpu)
- sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx];
- return sum;
+ if (sp->snap[cpu] !=
+ ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]))
+ return false; /* False zero reading! */
+ return true;
}
/**
int idx;
preempt_disable();
- idx = sp->completed & 0x1;
- barrier(); /* ensure compiler looks -once- at sp->completed. */
- per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++;
- srcu_barrier(); /* ensure compiler won't misorder critical section. */
+ idx = rcu_dereference_index_check(sp->completed,
+ rcu_read_lock_sched_held()) & 0x1;
+ ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) +=
+ SRCU_USAGE_COUNT + 1;
+ smp_mb(); /* B */ /* Avoid leaking the critical section. */
preempt_enable();
return idx;
}
void __srcu_read_unlock(struct srcu_struct *sp, int idx)
{
preempt_disable();
- srcu_barrier(); /* ensure compiler won't misorder critical section. */
- per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
+ smp_mb(); /* C */ /* Avoid leaking the critical section. */
+ ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1;
preempt_enable();
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);
* we repeatedly block for 1-millisecond time periods. This approach
* has done well in testing, so there is no need for a config parameter.
*/
-#define SYNCHRONIZE_SRCU_READER_DELAY 10
+#define SYNCHRONIZE_SRCU_READER_DELAY 5
/*
- * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
+ * Wait until all pre-existing readers complete. Such readers
+ * will have used the index specified by "idx".
*/
-static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
+static void wait_idx(struct srcu_struct *sp, int idx, bool expedited)
{
- int idx;
-
- rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
- !lock_is_held(&rcu_bh_lock_map) &&
- !lock_is_held(&rcu_lock_map) &&
- !lock_is_held(&rcu_sched_lock_map),
- "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
-
- idx = sp->completed;
- mutex_lock(&sp->mutex);
+ int trycount = 0;
/*
- * Check to see if someone else did the work for us while we were
- * waiting to acquire the lock. We need -two- advances of
- * the counter, not just one. If there was but one, we might have
- * shown up -after- our helper's first synchronize_sched(), thus
- * having failed to prevent CPU-reordering races with concurrent
- * srcu_read_unlock()s on other CPUs (see comment below). So we
- * either (1) wait for two or (2) supply the second ourselves.
+ * If a reader fetches the index before the ->completed increment,
+ * but increments its counter after srcu_readers_active_idx_check()
+ * sums it, then smp_mb() D will pair with __srcu_read_lock()'s
+ * smp_mb() B to ensure that the SRCU read-side critical section
+ * will see any updates that the current task performed before its
+ * call to synchronize_srcu(), or to synchronize_srcu_expedited(),
+ * as the case may be.
*/
+ smp_mb(); /* D */
- if ((sp->completed - idx) >= 2) {
- mutex_unlock(&sp->mutex);
- return;
+ /*
+ * SRCU read-side critical sections are normally short, so wait
+ * a small amount of time before possibly blocking.
+ */
+ if (!srcu_readers_active_idx_check(sp, idx)) {
+ udelay(SYNCHRONIZE_SRCU_READER_DELAY);
+ while (!srcu_readers_active_idx_check(sp, idx)) {
+ if (expedited && ++ trycount < 10)
+ udelay(SYNCHRONIZE_SRCU_READER_DELAY);
+ else
+ schedule_timeout_interruptible(1);
+ }
}
- sync_func(); /* Force memory barrier on all CPUs. */
-
/*
- * The preceding synchronize_sched() ensures that any CPU that
- * sees the new value of sp->completed will also see any preceding
- * changes to data structures made by this CPU. This prevents
- * some other CPU from reordering the accesses in its SRCU
- * read-side critical section to precede the corresponding
- * srcu_read_lock() -- ensuring that such references will in
- * fact be protected.
+ * The following smp_mb() E pairs with srcu_read_unlock()'s
+ * smp_mb C to ensure that if srcu_readers_active_idx_check()
+ * sees srcu_read_unlock()'s counter decrement, then any
+ * of the current task's subsequent code will happen after
+ * that SRCU read-side critical section.
*
- * So it is now safe to do the flip.
+ * It also ensures the order between the above waiting and
+ * the next flipping.
*/
+ smp_mb(); /* E */
+}
- idx = sp->completed & 0x1;
+static void srcu_flip(struct srcu_struct *sp)
+{
sp->completed++;
+}
- sync_func(); /* Force memory barrier on all CPUs. */
-
- /*
- * At this point, because of the preceding synchronize_sched(),
- * all srcu_read_lock() calls using the old counters have completed.
- * Their corresponding critical sections might well be still
- * executing, but the srcu_read_lock() primitives themselves
- * will have finished executing. We initially give readers
- * an arbitrarily chosen 10 microseconds to get out of their
- * SRCU read-side critical sections, then loop waiting 1/HZ
- * seconds per iteration. The 10-microsecond value has done
- * very well in testing.
- */
+/*
+ * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
+ */
+static void __synchronize_srcu(struct srcu_struct *sp, bool expedited)
+{
+ int busy_idx;
- if (srcu_readers_active_idx(sp, idx))
- udelay(SYNCHRONIZE_SRCU_READER_DELAY);
- while (srcu_readers_active_idx(sp, idx))
- schedule_timeout_interruptible(1);
+ rcu_lockdep_assert(!lock_is_held(&sp->dep_map) &&
+ !lock_is_held(&rcu_bh_lock_map) &&
+ !lock_is_held(&rcu_lock_map) &&
+ !lock_is_held(&rcu_sched_lock_map),
+ "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
- sync_func(); /* Force memory barrier on all CPUs. */
+ mutex_lock(&sp->mutex);
+ busy_idx = sp->completed & 0X1UL;
/*
- * The preceding synchronize_sched() forces all srcu_read_unlock()
- * primitives that were executing concurrently with the preceding
- * for_each_possible_cpu() loop to have completed by this point.
- * More importantly, it also forces the corresponding SRCU read-side
- * critical sections to have also completed, and the corresponding
- * references to SRCU-protected data items to be dropped.
+ * If we recently flipped the index, there will be some readers
+ * using idx=0 and others using idx=1. Therefore, two calls to
+ * wait_idx()s suffice to ensure that all pre-existing readers
+ * have completed:
+ *
+ * __synchronize_srcu() {
+ * wait_idx(sp, 0, expedited);
+ * wait_idx(sp, 1, expedited);
+ * }
*
- * Note:
+ * Starvation is prevented by the fact that we flip the index.
+ * While we wait on one index to clear out, almost all new readers
+ * will be using the other index. The number of new readers using the
+ * index we are waiting on is sharply bounded by roughly the number
+ * of CPUs.
*
- * Despite what you might think at first glance, the
- * preceding synchronize_sched() -must- be within the
- * critical section ended by the following mutex_unlock().
- * Otherwise, a task taking the early exit can race
- * with a srcu_read_unlock(), which might have executed
- * just before the preceding srcu_readers_active() check,
- * and whose CPU might have reordered the srcu_read_unlock()
- * with the preceding critical section. In this case, there
- * is nothing preventing the synchronize_sched() task that is
- * taking the early exit from freeing a data structure that
- * is still being referenced (out of order) by the task
- * doing the srcu_read_unlock().
+ * How can new readers possibly using the old pre-flip value of
+ * the index? Consider the following sequence of events:
*
- * Alternatively, the comparison with "2" on the early exit
- * could be changed to "3", but this increases synchronize_srcu()
- * latency for bulk loads. So the current code is preferred.
+ * Suppose that during the previous grace period, a reader
+ * picked up the old value of the index, but did not increment
+ * its counter until after the previous instance of
+ * __synchronize_srcu() did the counter summation and recheck.
+ * That previous grace period was OK because the reader did
+ * not start until after the grace period started, so the grace
+ * period was not obligated to wait for that reader.
+ *
+ * However, this sequence of events is quite improbable, so
+ * this call to wait_idx(), which waits on really old readers
+ * describe in this comment above, will almost never need to wait.
*/
+ wait_idx(sp, 1 - busy_idx, expedited);
+
+ /* Flip the index to avoid reader-induced starvation. */
+ srcu_flip(sp);
+
+ /* Wait for recent pre-existing readers. */
+ wait_idx(sp, busy_idx, expedited);
mutex_unlock(&sp->mutex);
}
*/
void synchronize_srcu(struct srcu_struct *sp)
{
- __synchronize_srcu(sp, synchronize_sched);
+ __synchronize_srcu(sp, 0);
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
* synchronize_srcu_expedited - Brute-force SRCU grace period
* @sp: srcu_struct with which to synchronize.
*
- * Wait for an SRCU grace period to elapse, but use a "big hammer"
- * approach to force the grace period to end quickly. This consumes
- * significant time on all CPUs and is unfriendly to real-time workloads,
- * so is thus not recommended for any sort of common-case code. In fact,
- * if you are using synchronize_srcu_expedited() in a loop, please
- * restructure your code to batch your updates, and then use a single
- * synchronize_srcu() instead.
+ * Wait for an SRCU grace period to elapse, but be more aggressive about
+ * spinning rather than blocking when waiting.
*
* Note that it is illegal to call this function while holding any lock
- * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
- * to call this function from a CPU-hotplug notifier. Failing to observe
- * these restriction will result in deadlock. It is also illegal to call
+ * that is acquired by a CPU-hotplug notifier. It is also illegal to call
* synchronize_srcu_expedited() from the corresponding SRCU read-side
* critical section; doing so will result in deadlock. However, it is
* perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
*/
void synchronize_srcu_expedited(struct srcu_struct *sp)
{
- __synchronize_srcu(sp, synchronize_sched_expedited);
+ __synchronize_srcu(sp, 1);
}
EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);