Merge branches 'doc.2022.06.21a', 'fixes.2022.07.19a', 'nocb.2022.07.19a', 'poll...
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 3b9f45e..62e5147 100644
@@ -1779,6 +1779,79 @@ static void rcu_strict_gp_boundary(void *unused)
        invoke_rcu_core();
 }
 
+// Has rcu_init() been invoked?  This is used (for example) to determine
+// whether spinlocks may be acquired safely.
+static bool rcu_init_invoked(void)
+{
+       return !!rcu_state.n_online_cpus;
+}
+
+// Make the polled API aware of the beginning of a grace period.
+static void rcu_poll_gp_seq_start(unsigned long *snap)
+{
+       struct rcu_node *rnp = rcu_get_root();
+
+       if (rcu_init_invoked())
+               raw_lockdep_assert_held_rcu_node(rnp);
+
+       // If RCU was idle, note beginning of GP.
+       if (!rcu_seq_state(rcu_state.gp_seq_polled))
+               rcu_seq_start(&rcu_state.gp_seq_polled);
+
+       // Either way, record current state.
+       *snap = rcu_state.gp_seq_polled;
+}
+
+// Make the polled API aware of the end of a grace period.
+static void rcu_poll_gp_seq_end(unsigned long *snap)
+{
+       struct rcu_node *rnp = rcu_get_root();
+
+       if (rcu_init_invoked())
+               raw_lockdep_assert_held_rcu_node(rnp);
+
+       // If the previously noted GP is still in effect, record the
+       // end of that GP.  Either way, zero the snapshot(s) to avoid
+       // counter-wrap problems.
+       if (*snap && *snap == rcu_state.gp_seq_polled) {
+               rcu_seq_end(&rcu_state.gp_seq_polled);
+               rcu_state.gp_seq_polled_snap = 0;
+               rcu_state.gp_seq_polled_exp_snap = 0;
+       } else {
+               *snap = 0;
+       }
+}
+
+// Make the polled API aware of the beginning of a grace period, but
+// where the caller does not hold the root rcu_node structure's lock.
+static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap)
+{
+       struct rcu_node *rnp = rcu_get_root();
+
+       if (rcu_init_invoked()) {
+               lockdep_assert_irqs_enabled();
+               raw_spin_lock_irq_rcu_node(rnp);
+       }
+       rcu_poll_gp_seq_start(snap);
+       if (rcu_init_invoked())
+               raw_spin_unlock_irq_rcu_node(rnp);
+}
+
+// Make the polled API aware of the end of a grace period, but where
+// the caller does not hold the root rcu_node structure's lock.
+static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
+{
+       struct rcu_node *rnp = rcu_get_root();
+
+       if (rcu_init_invoked()) {
+               lockdep_assert_irqs_enabled();
+               raw_spin_lock_irq_rcu_node(rnp);
+       }
+       rcu_poll_gp_seq_end(snap);
+       if (rcu_init_invoked())
+               raw_spin_unlock_irq_rcu_node(rnp);
+}
+
 /*
  * Initialize a new grace period.  Return false if no grace period required.
  */
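
The helpers above drive rcu_state.gp_seq_polled with the rcu_seq sequence-counter
layout also used for rcu_state.gp_seq: the low two bits record whether a grace
period is in progress and the remaining bits count completed grace periods.  The
standalone sketch below models that arithmetic and the start/end pairing above
(simplified from the rcu_seq_*() helpers in kernel/rcu/rcu.h; the memory barriers,
WARN_ON_ONCE() checks, and locking are omitted, and the seq_*() names are invented
for the sketch):

#include <stdio.h>

#define SEQ_CTR_SHIFT  2
#define SEQ_STATE_MASK ((1UL << SEQ_CTR_SHIFT) - 1)

/* Low bits: grace-period phase; upper bits: completed-GP count. */
static unsigned long seq_state(unsigned long s) { return s & SEQ_STATE_MASK; }

/* Mark a grace period as having started. */
static void seq_start(unsigned long *sp) { *sp += 1; }

/* Mark the current grace period complete: round up to the next value
 * whose phase bits are zero. */
static void seq_end(unsigned long *sp) { *sp = (*sp | SEQ_STATE_MASK) + 1; }

/* Cookie that will be "done" once a full grace period has elapsed. */
static unsigned long seq_snap(unsigned long s)
{
        return (s + 2 * SEQ_STATE_MASK + 1) & ~SEQ_STATE_MASK;
}

static int seq_done(unsigned long s, unsigned long cookie)
{
        return (long)(s - cookie) >= 0; /* wrap-tolerant comparison */
}

int main(void)
{
        unsigned long gp_seq_polled = 0;
        unsigned long snap, cookie;

        /* rcu_poll_gp_seq_start(): note the GP start if the counter was idle. */
        if (!seq_state(gp_seq_polled))
                seq_start(&gp_seq_polled);
        snap = gp_seq_polled;                   /* rcu_state.gp_seq_polled_snap */

        cookie = seq_snap(gp_seq_polled);       /* get_state_synchronize_rcu() */
        printf("mid-GP:           done=%d\n", seq_done(gp_seq_polled, cookie)); /* 0 */

        /* rcu_poll_gp_seq_end(): the previously noted GP is still in effect. */
        if (snap && snap == gp_seq_polled)
                seq_end(&gp_seq_polled);
        printf("after 1st GP end: done=%d\n", seq_done(gp_seq_polled, cookie)); /* 0 */

        /* A cookie taken mid-GP needs one more full grace period. */
        seq_start(&gp_seq_polled);
        seq_end(&gp_seq_polled);
        printf("after 2nd GP end: done=%d\n", seq_done(gp_seq_polled, cookie)); /* 1 */
        return 0;
}

A cookie taken while a grace period is already in flight reports done only after a
subsequent full grace period, matching the semantics of get_state_synchronize_rcu().
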
@@ -1814,6 +1887,7 @@ static noinline_for_stack bool rcu_gp_init(void)
        rcu_seq_start(&rcu_state.gp_seq);
        ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
        trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
+       rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
        raw_spin_unlock_irq_rcu_node(rnp);
 
        /*
@@ -1975,19 +2049,23 @@ static void rcu_gp_fqs(bool first_time)
  */
 static noinline_for_stack void rcu_gp_fqs_loop(void)
 {
-       bool first_gp_fqs;
+       bool first_gp_fqs = true;
        int gf = 0;
        unsigned long j;
        int ret;
        struct rcu_node *rnp = rcu_get_root();
 
-       first_gp_fqs = true;
        j = READ_ONCE(jiffies_till_first_fqs);
        if (rcu_state.cbovld)
                gf = RCU_GP_FLAG_OVLD;
        ret = 0;
        for (;;) {
-               if (!ret) {
+               if (rcu_state.cbovld) {
+                       j = (j + 2) / 3;
+                       if (j <= 0)
+                               j = 1;
+               }
+               if (!ret || time_before(jiffies + j, rcu_state.jiffies_force_qs)) {
                        WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j);
                        /*
                         * jiffies_force_qs before RCU_GP_WAIT_FQS state
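
For the callback-overload case, the adjustment at the top of the loop trims the wait
before the next forced-quiescent-state scan to roughly a third of its prior value,
rounded up, with a floor of one jiffy.  A trivial standalone illustration of the
arithmetic (the 100-jiffy starting value is invented; the kernel derives the wait
from jiffies_till_first_fqs and jiffies_till_next_fqs):

#include <stdio.h>

int main(void)
{
        unsigned long j = 100;  /* illustrative starting wait, in jiffies */

        /* Repeatedly applying the ->cbovld adjustment converges on the
         * one-jiffy floor: 100 -> 34 -> 12 -> 4 -> 2 -> 1. */
        while (j > 1) {
                j = (j + 2) / 3;
                if (j <= 0)     /* j is unsigned, so this only catches zero */
                        j = 1;
                printf("%lu\n", j);
        }
        return 0;
}
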
@@ -2005,7 +2083,15 @@ static noinline_for_stack void rcu_gp_fqs_loop(void)
                rcu_gp_torture_wait();
                WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
                /* Locking provides needed memory barriers. */
-               /* If grace period done, leave loop. */
+               /*
+                * Exit the loop if the root rcu_node structure indicates that the grace
+                * period has ended.  The rcu_preempt_blocked_readers_cgp(rnp) check is
+                * required only for single-node rcu_node trees because readers blocking the
+                * current grace period are queued only on leaf rcu_node structures.  For
+                * multi-node trees, checking the root node's ->qsmask suffices, because a
+                * given root node's ->qsmask bit is cleared only when all CPUs and tasks
+                * from the corresponding leaf nodes have passed through their quiescent state.
+                */
                if (!READ_ONCE(rnp->qsmask) &&
                    !rcu_preempt_blocked_readers_cgp(rnp))
                        break;
@@ -2073,6 +2159,7 @@ static noinline void rcu_gp_cleanup(void)
         * safe for us to drop the lock in order to mark the grace
         * period as completed in all of the rcu_node structures.
         */
+       rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap);
        raw_spin_unlock_irq_rcu_node(rnp);
 
        /*
@@ -3215,7 +3302,6 @@ struct kfree_rcu_cpu_work {
  * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
  * @lock: Synchronize access to this structure
  * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
- * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
  * @initialized: The @rcu_work fields have been initialized
  * @count: Number of objects for which GP not started
  * @bkvcache:
@@ -3240,7 +3326,6 @@ struct kfree_rcu_cpu {
        struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
        raw_spinlock_t lock;
        struct delayed_work monitor_work;
-       bool monitor_todo;
        bool initialized;
        int count;
 
@@ -3420,6 +3505,18 @@ static void kfree_rcu_work(struct work_struct *work)
        }
 }
 
+static bool
+need_offload_krc(struct kfree_rcu_cpu *krcp)
+{
+       int i;
+
+       for (i = 0; i < FREE_N_CHANNELS; i++)
+               if (krcp->bkvhead[i])
+                       return true;
+
+       return !!krcp->head;
+}
+
 /*
  * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
  */
@@ -3476,9 +3573,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
        // of the channels that is still busy we should rearm the
        // work to repeat an attempt. Because previous batches are
        // still in progress.
-       if (!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head)
-               krcp->monitor_todo = false;
-       else
+       if (need_offload_krc(krcp))
                schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
 
        raw_spin_unlock_irqrestore(&krcp->lock, flags);
@@ -3666,11 +3761,8 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
        WRITE_ONCE(krcp->count, krcp->count + 1);
 
        // Set timer to drain after KFREE_DRAIN_JIFFIES.
-       if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
-           !krcp->monitor_todo) {
-               krcp->monitor_todo = true;
+       if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
                schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
-       }
 
 unlock_return:
        krc_this_cpu_unlock(krcp, flags);
@@ -3745,14 +3837,8 @@ void __init kfree_rcu_scheduler_running(void)
                struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
                raw_spin_lock_irqsave(&krcp->lock, flags);
-               if ((!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) ||
-                               krcp->monitor_todo) {
-                       raw_spin_unlock_irqrestore(&krcp->lock, flags);
-                       continue;
-               }
-               krcp->monitor_todo = true;
-               schedule_delayed_work_on(cpu, &krcp->monitor_work,
-                                        KFREE_DRAIN_JIFFIES);
+               if (need_offload_krc(krcp))
+                       schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES);
                raw_spin_unlock_irqrestore(&krcp->lock, flags);
        }
 }
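
The kvfree_rcu() hunks above all rely on the same property: scheduling a delayed
work item that is already pending is a no-op, so the ->monitor_todo flag that used
to guard against double-queuing can simply go away.  Producers now call
schedule_delayed_work() unconditionally, and kfree_rcu_monitor() rearms itself via
need_offload_krc() for as long as any channel still holds memory.  A minimal sketch
of that self-rearming pattern (illustrative only, not the kvfree_rcu() code itself;
struct drain_ctx, drain_fn(), and drain_enqueue() are invented names, and HZ / 50 is
just a short delay chosen for the sketch):

#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/jiffies.h>

struct drain_ctx {
        raw_spinlock_t lock;
        struct list_head items;         /* stand-in for the kfree_rcu channels */
        struct delayed_work dwork;      /* stand-in for ->monitor_work */
};

static void drain_fn(struct work_struct *work)
{
        struct drain_ctx *ctx = container_of(to_delayed_work(work),
                                             struct drain_ctx, dwork);
        unsigned long flags;

        raw_spin_lock_irqsave(&ctx->lock, flags);
        /* ... detach and hand off whatever can be processed now ... */
        if (!list_empty(&ctx->items))   /* need_offload_krc() analogue */
                schedule_delayed_work(&ctx->dwork, HZ / 50);
        raw_spin_unlock_irqrestore(&ctx->lock, flags);
}

static void drain_init(struct drain_ctx *ctx)
{
        raw_spin_lock_init(&ctx->lock);
        INIT_LIST_HEAD(&ctx->items);
        INIT_DELAYED_WORK(&ctx->dwork, drain_fn);
}

/* Producers schedule unconditionally: queuing an already-pending delayed
 * work does nothing, so no monitor_todo-style flag is needed. */
static void drain_enqueue(struct drain_ctx *ctx, struct list_head *item)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&ctx->lock, flags);
        list_add_tail(item, &ctx->items);
        schedule_delayed_work(&ctx->dwork, HZ / 50);
        raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
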
@@ -3841,8 +3927,18 @@ void synchronize_rcu(void)
                         lock_is_held(&rcu_lock_map) ||
                         lock_is_held(&rcu_sched_lock_map),
                         "Illegal synchronize_rcu() in RCU read-side critical section");
-       if (rcu_blocking_is_gp())
+       if (rcu_blocking_is_gp()) {
+               // Note well that this code runs with !PREEMPT && !SMP.
+               // In addition, all code that advances grace periods runs at
+               // process level.  Therefore, this normal GP overlaps with
+               // other normal GPs only by being fully nested within them,
+               // which allows reuse of ->gp_seq_polled_snap.
+               rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
+               rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
+               if (rcu_init_invoked())
+                       cond_resched_tasks_rcu_qs();
                return;  // Context allows vacuous grace periods.
+       }
        if (rcu_gp_is_expedited())
                synchronize_rcu_expedited();
        else
@@ -3864,7 +3960,7 @@ unsigned long get_state_synchronize_rcu(void)
         * before the load from ->gp_seq.
         */
        smp_mb();  /* ^^^ */
-       return rcu_seq_snap(&rcu_state.gp_seq);
+       return rcu_seq_snap(&rcu_state.gp_seq_polled);
 }
 EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
 
@@ -3893,7 +3989,13 @@ unsigned long start_poll_synchronize_rcu(void)
        rdp = this_cpu_ptr(&rcu_data);
        rnp = rdp->mynode;
        raw_spin_lock_rcu_node(rnp); // irqs already disabled.
-       needwake = rcu_start_this_gp(rnp, rdp, gp_seq);
+       // Note it is possible for a grace period to have elapsed between
+       // the above call to get_state_synchronize_rcu() and the below call
+       // to rcu_seq_snap().  This is OK: the worst that happens is that we
+       // get a grace period that no one needed.  These accesses are ordered
+       // by smp_mb(), and we are accessing them in the opposite order
+       // from which they are updated at grace-period start, as required.
+       needwake = rcu_start_this_gp(rnp, rdp, rcu_seq_snap(&rcu_state.gp_seq));
        raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
        if (needwake)
                rcu_gp_kthread_wake();
@@ -3915,7 +4017,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
  *
  * Yes, this function does not take counter wrap into account.
  * But counter wrap is harmless.  If the counter wraps, we have waited for
- * more than 2 billion grace periods (and way more on a 64-bit system!).
+ * more than a billion grace periods (and way more on a 64-bit system!).
  * Those needing to keep oldstate values for very long time periods
  * (many hours even on 32-bit systems) should check them occasionally
  * and either refresh them or set a flag indicating that the grace period
@@ -3928,7 +4030,8 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
  */
 bool poll_state_synchronize_rcu(unsigned long oldstate)
 {
-       if (rcu_seq_done(&rcu_state.gp_seq, oldstate)) {
+       if (oldstate == RCU_GET_STATE_COMPLETED ||
+           rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) {
                smp_mb(); /* Ensure GP ends before subsequent accesses. */
                return true;
        }
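
One intended use of RCU_GET_STATE_COMPLETED together with ->gp_seq_polled cookies is
lazy reuse of retired objects: initialize the stored cookie to the already-completed
value so a fresh object polls as safe immediately, record a real cookie when the
object is retired, and reuse it only once its grace period has elapsed.  A hedged
sketch of that pattern (struct cached_obj and its helpers are invented for the
example; get_completed_synchronize_rcu(), start_poll_synchronize_rcu(), and
poll_state_synchronize_rcu() are the existing polled-API entry points):

#include <linux/rcupdate.h>
#include <linux/types.h>

struct cached_obj {
        unsigned long gp_cookie;        /* cookie gating reuse of this object */
        /* ... payload ... */
};

static void cached_obj_init(struct cached_obj *p)
{
        /* Polls as completed immediately: no grace period is owed yet. */
        p->gp_cookie = get_completed_synchronize_rcu();
}

/* Called once the last RCU-visible reference has been unpublished. */
static void cached_obj_retire(struct cached_obj *p)
{
        /* Kick off a grace period and remember the cookie, without blocking. */
        p->gp_cookie = start_poll_synchronize_rcu();
}

/* Returns true once the object may be handed out again. */
static bool cached_obj_reusable(struct cached_obj *p)
{
        return poll_state_synchronize_rcu(p->gp_cookie);
}
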
@@ -3939,20 +4042,20 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
 /**
  * cond_synchronize_rcu - Conditionally wait for an RCU grace period
  *
- * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
+ * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
  *
  * If a full RCU grace period has elapsed since the earlier call to
  * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
  * Otherwise, invoke synchronize_rcu() to wait for a full grace period.
  *
- * Yes, this function does not take counter wrap into account.  But
- * counter wrap is harmless.  If the counter wraps, we have waited for
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless.  If the counter wraps, we have waited for
  * more than 2 billion grace periods (and way more on a 64-bit system!),
- * so waiting for one additional grace period should be just fine.
+ * so waiting for a couple of additional grace periods should be just fine.
  *
  * This function provides the same memory-ordering guarantees that
  * would be provided by a synchronize_rcu() that was invoked at the call
- * to the function that provided @oldstate, and that returned at the end
+ * to the function that provided @oldstate and that returned at the end
  * of this function.
  */
 void cond_synchronize_rcu(unsigned long oldstate)
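
For callers that would otherwise invoke synchronize_rcu() unconditionally, the usual
pattern is to snapshot a cookie when the old data is unpublished and let
cond_synchronize_rcu() skip the wait if a grace period has already elapsed by the
time teardown happens.  A sketch under stated assumptions (struct cfg, active_cfg,
and update_config() are placeholders; the caller is assumed to provide its own
update-side exclusion):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct cfg {
        /* ... configuration payload ... */
};

static struct cfg __rcu *active_cfg;

static void update_config(struct cfg *new_cfg)
{
        struct cfg *old;
        unsigned long cookie;

        /* Publish the new configuration; "true" stands in for the caller's
         * real update-side lockdep condition. */
        old = rcu_replace_pointer(active_cfg, new_cfg, true);
        cookie = get_state_synchronize_rcu();

        /* ... unrelated teardown work proceeds here ... */

        /* Waits only if a full grace period has not already elapsed
         * since the cookie was taken. */
        cond_synchronize_rcu(cookie);
        kfree(old);
}
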
@@ -4445,6 +4548,7 @@ void rcu_report_dead(unsigned int cpu)
        rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
        if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
                /* Report quiescent state -before- changing ->qsmaskinitnext! */
+               rcu_disable_urgency_upon_qs(rdp);
                rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
                raw_spin_lock_irqsave_rcu_node(rnp, flags);
        }
@@ -4490,6 +4594,7 @@ void rcutree_migrate_callbacks(int cpu)
        needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
        rcu_segcblist_disable(&rdp->cblist);
        WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist));
+       check_cb_ovld_locked(my_rdp, my_rnp);
        if (rcu_rdp_is_offloaded(my_rdp)) {
                raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
                __call_rcu_nocb_wake(my_rdp, true, flags);
@@ -4705,6 +4810,9 @@ static void __init rcu_init_one(void)
                        init_waitqueue_head(&rnp->exp_wq[3]);
                        spin_lock_init(&rnp->exp_lock);
                        mutex_init(&rnp->boost_kthread_mutex);
+                       raw_spin_lock_init(&rnp->exp_poll_lock);
+                       rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+                       INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp);
                }
        }
 
@@ -4930,6 +5038,10 @@ void __init rcu_init(void)
                qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
        else
                qovld_calc = qovld;
+
+       // Kick-start any polled grace periods that started early.
+       if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1))
+               (void)start_poll_synchronize_rcu_expedited();
 }
 
 #include "tree_stall.h"