Merge branches 'doc.2022.06.21a', 'fixes.2022.07.19a', 'nocb.2022.07.19a', 'poll...
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 3b9f45e..62e5147 100644
@@ -1779,6 +1779,79 @@ static void rcu_strict_gp_boundary(void *unused)
        invoke_rcu_core();
 }
 
+// Has rcu_init() been invoked?  This is used (for example) to determine
+// whether spinlocks may be acquired safely.
+static bool rcu_init_invoked(void)
+{
+       return !!rcu_state.n_online_cpus;
+}
+
+// Make the polled API aware of the beginning of a grace period.
+static void rcu_poll_gp_seq_start(unsigned long *snap)
+{
+       struct rcu_node *rnp = rcu_get_root();
+
+       if (rcu_init_invoked())
+               raw_lockdep_assert_held_rcu_node(rnp);
+
+       // If RCU was idle, note beginning of GP.
+       if (!rcu_seq_state(rcu_state.gp_seq_polled))
+               rcu_seq_start(&rcu_state.gp_seq_polled);
+
+       // Either way, record current state.
+       *snap = rcu_state.gp_seq_polled;
+}
+
+// Make the polled API aware of the end of a grace period.
+static void rcu_poll_gp_seq_end(unsigned long *snap)
+{
+       struct rcu_node *rnp = rcu_get_root();
+
+       if (rcu_init_invoked())
+               raw_lockdep_assert_held_rcu_node(rnp);
+
+       // If the previously noted GP is still in effect, record the
+       // end of that GP.  Either way, zero the snapshot(s) to avoid
+       // counter-wrap problems.
+       if (*snap && *snap == rcu_state.gp_seq_polled) {
+               rcu_seq_end(&rcu_state.gp_seq_polled);
+               rcu_state.gp_seq_polled_snap = 0;
+               rcu_state.gp_seq_polled_exp_snap = 0;
+       } else {
+               *snap = 0;
+       }
+}
+
+// Make the polled API aware of the beginning of a grace period, but
+// where the caller does not hold the root rcu_node structure's lock.
+static void rcu_poll_gp_seq_start_unlocked(unsigned long *snap)
+{
+       struct rcu_node *rnp = rcu_get_root();
+
+       if (rcu_init_invoked()) {
+               lockdep_assert_irqs_enabled();
+               raw_spin_lock_irq_rcu_node(rnp);
+       }
+       rcu_poll_gp_seq_start(snap);
+       if (rcu_init_invoked())
+               raw_spin_unlock_irq_rcu_node(rnp);
+}
+
+// Make the polled API aware of the end of a grace period, but where
+// the caller does not hold the root rcu_node structure's lock.
+static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
+{
+       struct rcu_node *rnp = rcu_get_root();
+
+       if (rcu_init_invoked()) {
+               lockdep_assert_irqs_enabled();
+               raw_spin_lock_irq_rcu_node(rnp);
+       }
+       rcu_poll_gp_seq_end(snap);
+       if (rcu_init_invoked())
+               raw_spin_unlock_irq_rcu_node(rnp);
+}
+
 /*
  * Initialize a new grace period.  Return false if no grace period required.
  */
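
The helpers above drive rcu_state.gp_seq_polled with the rcu_seq sequence-counter
layout also used for rcu_state.gp_seq: the low two bits record whether a grace
period is in progress and the remaining bits count completed grace periods.  The
standalone sketch below models that arithmetic and the start/end pairing above
(simplified from the rcu_seq_*() helpers in kernel/rcu/rcu.h; the memory barriers,
WARN_ON_ONCE() checks, and locking are omitted, and the seq_*() names are invented
for the sketch):

#include <stdio.h>

#define SEQ_CTR_SHIFT  2
#define SEQ_STATE_MASK ((1UL << SEQ_CTR_SHIFT) - 1)

/* Low bits: grace-period phase; upper bits: completed-GP count. */
static unsigned long seq_state(unsigned long s) { return s & SEQ_STATE_MASK; }

/* Mark a grace period as having started. */
static void seq_start(unsigned long *sp) { *sp += 1; }

/* Mark the current grace period complete: round up to the next value
 * whose phase bits are zero. */
static void seq_end(unsigned long *sp) { *sp = (*sp | SEQ_STATE_MASK) + 1; }

/* Cookie that will be "done" once a full grace period has elapsed. */
static unsigned long seq_snap(unsigned long s)
{
        return (s + 2 * SEQ_STATE_MASK + 1) & ~SEQ_STATE_MASK;
}

static int seq_done(unsigned long s, unsigned long cookie)
{
        return (long)(s - cookie) >= 0; /* wrap-tolerant comparison */
}

int main(void)
{
        unsigned long gp_seq_polled = 0;
        unsigned long snap, cookie;

        /* rcu_poll_gp_seq_start(): note the GP start if the counter was idle. */
        if (!seq_state(gp_seq_polled))
                seq_start(&gp_seq_polled);
        snap = gp_seq_polled;                   /* rcu_state.gp_seq_polled_snap */

        cookie = seq_snap(gp_seq_polled);       /* get_state_synchronize_rcu() */
        printf("mid-GP:           done=%d\n", seq_done(gp_seq_polled, cookie)); /* 0 */

        /* rcu_poll_gp_seq_end(): the previously noted GP is still in effect. */
        if (snap && snap == gp_seq_polled)
                seq_end(&gp_seq_polled);
        printf("after 1st GP end: done=%d\n", seq_done(gp_seq_polled, cookie)); /* 0 */

        /* A cookie taken mid-GP needs one more full grace period. */
        seq_start(&gp_seq_polled);
        seq_end(&gp_seq_polled);
        printf("after 2nd GP end: done=%d\n", seq_done(gp_seq_polled, cookie)); /* 1 */
        return 0;
}

A cookie taken while a grace period is already in flight reports done only after a
subsequent full grace period, matching the semantics of get_state_synchronize_rcu().
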
@@ -1814,6 +1887,7 @@ static noinline_for_stack bool rcu_gp_init(void)
        rcu_seq_start(&rcu_state.gp_seq);
        ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
        trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
+       rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
        raw_spin_unlock_irq_rcu_node(rnp);
 
        /*
@@ -1975,19 +2049,23 @@ static void rcu_gp_fqs(bool first_time)
  */
 static noinline_for_stack void rcu_gp_fqs_loop(void)
 {
-       bool first_gp_fqs;
+       bool first_gp_fqs = true;
        int gf = 0;
        unsigned long j;
        int ret;
        struct rcu_node *rnp = rcu_get_root();
 
-       first_gp_fqs = true;
        j = READ_ONCE(jiffies_till_first_fqs);
        if (rcu_state.cbovld)
                gf = RCU_GP_FLAG_OVLD;
        ret = 0;
        for (;;) {
-               if (!ret) {
+               if (rcu_state.cbovld) {
+                       j = (j + 2) / 3;
+                       if (j <= 0)
+                               j = 1;
+               }
+               if (!ret || time_before(jiffies + j, rcu_state.jiffies_force_qs)) {
                        WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j);
                        /*
                         * jiffies_force_qs before RCU_GP_WAIT_FQS state
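
For the callback-overload case, the adjustment at the top of the loop trims the wait
before the next forced-quiescent-state scan to roughly a third of its prior value,
rounded up, with a floor of one jiffy.  A trivial standalone illustration of the
arithmetic (the 100-jiffy starting value is invented; the kernel derives the wait
from jiffies_till_first_fqs and jiffies_till_next_fqs):

#include <stdio.h>

int main(void)
{
        unsigned long j = 100;  /* illustrative starting wait, in jiffies */

        /* Repeatedly applying the ->cbovld adjustment converges on the
         * one-jiffy floor: 100 -> 34 -> 12 -> 4 -> 2 -> 1. */
        while (j > 1) {
                j = (j + 2) / 3;
                if (j <= 0)     /* j is unsigned, so this only catches zero */
                        j = 1;
                printf("%lu\n", j);
        }
        return 0;
}
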
@@ -2005,7 +2083,15 @@ static noinline_for_stack void rcu_gp_fqs_loop(void)
                rcu_gp_torture_wait();
                WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
                /* Locking provides needed memory barriers. */
-               /* If grace period done, leave loop. */
+               /*
+                * Exit the loop if the root rcu_node structure indicates that the grace
+                * period has ended.  The rcu_preempt_blocked_readers_cgp(rnp) check is
+                * required only for single-node rcu_node trees because readers blocking the
+                * current grace period are queued only on leaf rcu_node structures.  For
+                * multi-node trees, checking the root node's ->qsmask suffices, because a
+                * given root node's ->qsmask bit is cleared only when all CPUs and tasks
+                * from the corresponding leaf nodes have passed through their quiescent state.
+                */
                if (!READ_ONCE(rnp->qsmask) &&
                    !rcu_preempt_blocked_readers_cgp(rnp))
                        break;
@@ -2073,6 +2159,7 @@ static noinline void rcu_gp_cleanup(void)
         * safe for us to drop the lock in order to mark the grace
         * period as completed in all of the rcu_node structures.
         */
+       rcu_poll_gp_seq_end(&rcu_state.gp_seq_polled_snap);
        raw_spin_unlock_irq_rcu_node(rnp);
 
        /*
@@ -3215,7 +3302,6 @@ struct kfree_rcu_cpu_work {
  * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
  * @lock: Synchronize access to this structure
  * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
- * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
  * @initialized: The @rcu_work fields have been initialized
  * @count: Number of objects for which GP not started
  * @bkvcache:
@@ -3240,7 +3326,6 @@ struct kfree_rcu_cpu {
        struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
        raw_spinlock_t lock;
        struct delayed_work monitor_work;
-       bool monitor_todo;
        bool initialized;
        int count;
 
@@ -3420,6 +3505,18 @@ static void kfree_rcu_work(struct work_struct *work)
        }
 }
 
+static bool
+need_offload_krc(struct kfree_rcu_cpu *krcp)
+{
+       int i;
+
+       for (i = 0; i < FREE_N_CHANNELS; i++)
+               if (krcp->bkvhead[i])
+                       return true;
+
+       return !!krcp->head;
+}
+
 /*
  * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
  */
@@ -3476,9 +3573,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
        // of the channels that is still busy we should rearm the
        // work to repeat an attempt. Because previous batches are
        // still in progress.
-       if (!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head)
-               krcp->monitor_todo = false;
-       else
+       if (need_offload_krc(krcp))
                schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
 
        raw_spin_unlock_irqrestore(&krcp->lock, flags);
@@ -3666,11 +3761,8 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
        WRITE_ONCE(krcp->count, krcp->count + 1);
 
        // Set timer to drain after KFREE_DRAIN_JIFFIES.
-       if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
-           !krcp->monitor_todo) {
-               krcp->monitor_todo = true;
+       if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
                schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
-       }
 
 unlock_return:
        krc_this_cpu_unlock(krcp, flags);
@@ -3745,14 +3837,8 @@ void __init kfree_rcu_scheduler_running(void)
                struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
                raw_spin_lock_irqsave(&krcp->lock, flags);
-               if ((!krcp->bkvhead[0] && !krcp->bkvhead[1] && !krcp->head) ||
-                               krcp->monitor_todo) {
-                       raw_spin_unlock_irqrestore(&krcp->lock, flags);
-                       continue;
-               }
-               krcp->monitor_todo = true;
-               schedule_delayed_work_on(cpu, &krcp->monitor_work,
-                                        KFREE_DRAIN_JIFFIES);
+               if (need_offload_krc(krcp))
+                       schedule_delayed_work_on(cpu, &krcp->monitor_work, KFREE_DRAIN_JIFFIES);
                raw_spin_unlock_irqrestore(&krcp->lock, flags);
        }
 }
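
The kvfree_rcu() hunks above all rely on the same property: scheduling a delayed
work item that is already pending is a no-op, so the ->monitor_todo flag that used
to guard against double-queuing can simply go away.  Producers now call
schedule_delayed_work() unconditionally, and kfree_rcu_monitor() rearms itself via
need_offload_krc() for as long as any channel still holds memory.  A minimal sketch
of that self-rearming pattern (illustrative only, not the kvfree_rcu() code itself;
struct drain_ctx, drain_fn(), and drain_enqueue() are invented names, and HZ / 50 is
just a short delay chosen for the sketch):

#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/jiffies.h>

struct drain_ctx {
        raw_spinlock_t lock;
        struct list_head items;         /* stand-in for the kfree_rcu channels */
        struct delayed_work dwork;      /* stand-in for ->monitor_work */
};

static void drain_fn(struct work_struct *work)
{
        struct drain_ctx *ctx = container_of(to_delayed_work(work),
                                             struct drain_ctx, dwork);
        unsigned long flags;

        raw_spin_lock_irqsave(&ctx->lock, flags);
        /* ... detach and hand off whatever can be processed now ... */
        if (!list_empty(&ctx->items))   /* need_offload_krc() analogue */
                schedule_delayed_work(&ctx->dwork, HZ / 50);
        raw_spin_unlock_irqrestore(&ctx->lock, flags);
}

static void drain_init(struct drain_ctx *ctx)
{
        raw_spin_lock_init(&ctx->lock);
        INIT_LIST_HEAD(&ctx->items);
        INIT_DELAYED_WORK(&ctx->dwork, drain_fn);
}

/* Producers schedule unconditionally: queuing an already-pending delayed
 * work does nothing, so no monitor_todo-style flag is needed. */
static void drain_enqueue(struct drain_ctx *ctx, struct list_head *item)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&ctx->lock, flags);
        list_add_tail(item, &ctx->items);
        schedule_delayed_work(&ctx->dwork, HZ / 50);
        raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
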
@@ -3841,8 +3927,18 @@ void synchronize_rcu(void)
                         lock_is_held(&rcu_lock_map) ||
                         lock_is_held(&rcu_sched_lock_map),
                         "Illegal synchronize_rcu() in RCU read-side critical section");
-       if (rcu_blocking_is_gp())
+       if (rcu_blocking_is_gp()) {
+               // Note well that this code runs with !PREEMPT && !SMP.
+               // In addition, all code that advances grace periods runs at
+               // process level.  Therefore, this normal GP overlaps with
+               // other normal GPs only by being fully nested within them,
+               // which allows reuse of ->gp_seq_polled_snap.
+               rcu_poll_gp_seq_start_unlocked(&rcu_state.gp_seq_polled_snap);
+               rcu_poll_gp_seq_end_unlocked(&rcu_state.gp_seq_polled_snap);
+               if (rcu_init_invoked())
+                       cond_resched_tasks_rcu_qs();
                return;  // Context allows vacuous grace periods.
+       }
        if (rcu_gp_is_expedited())
                synchronize_rcu_expedited();
        else
@@ -3864,7 +3960,7 @@ unsigned long get_state_synchronize_rcu(void)
         * before the load from ->gp_seq.
         */
        smp_mb();  /* ^^^ */
-       return rcu_seq_snap(&rcu_state.gp_seq);
+       return rcu_seq_snap(&rcu_state.gp_seq_polled);
 }
 EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
 
@@ -3893,7 +3989,13 @@ unsigned long start_poll_synchronize_rcu(void)
        rdp = this_cpu_ptr(&rcu_data);
        rnp = rdp->mynode;
        raw_spin_lock_rcu_node(rnp); // irqs already disabled.
-       needwake = rcu_start_this_gp(rnp, rdp, gp_seq);
+       // Note it is possible for a grace period to have elapsed between
+       // the above call to get_state_synchronize_rcu() and the below call
+       // to rcu_seq_snap().  This is OK: the worst that happens is that we
+       // get a grace period that no one needed.  These accesses are ordered
+       // by smp_mb(), and we are accessing them in the opposite order
+       // from which they are updated at grace-period start, as required.
+       needwake = rcu_start_this_gp(rnp, rdp, rcu_seq_snap(&rcu_state.gp_seq));
        raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
        if (needwake)
                rcu_gp_kthread_wake();
@@ -3915,7 +4017,7 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
  *
  * Yes, this function does not take counter wrap into account.
  * But counter wrap is harmless.  If the counter wraps, we have waited for
- * more than 2 billion grace periods (and way more on a 64-bit system!).
+ * more than a billion grace periods (and way more on a 64-bit system!).
  * Those needing to keep oldstate values for very long time periods
  * (many hours even on 32-bit systems) should check them occasionally
  * and either refresh them or set a flag indicating that the grace period
@@ -3928,7 +4030,8 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_rcu);
  */
 bool poll_state_synchronize_rcu(unsigned long oldstate)
 {
-       if (rcu_seq_done(&rcu_state.gp_seq, oldstate)) {
+       if (oldstate == RCU_GET_STATE_COMPLETED ||
+           rcu_seq_done_exact(&rcu_state.gp_seq_polled, oldstate)) {
                smp_mb(); /* Ensure GP ends before subsequent accesses. */
                return true;
        }
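
One intended use of RCU_GET_STATE_COMPLETED together with ->gp_seq_polled cookies is
lazy reuse of retired objects: initialize the stored cookie to the already-completed
value so a fresh object polls as safe immediately, record a real cookie when the
object is retired, and reuse it only once its grace period has elapsed.  A hedged
sketch of that pattern (struct cached_obj and its helpers are invented for the
example; get_completed_synchronize_rcu(), start_poll_synchronize_rcu(), and
poll_state_synchronize_rcu() are the existing polled-API entry points):

#include <linux/rcupdate.h>
#include <linux/types.h>

struct cached_obj {
        unsigned long gp_cookie;        /* cookie gating reuse of this object */
        /* ... payload ... */
};

static void cached_obj_init(struct cached_obj *p)
{
        /* Polls as completed immediately: no grace period is owed yet. */
        p->gp_cookie = get_completed_synchronize_rcu();
}

/* Called once the last RCU-visible reference has been unpublished. */
static void cached_obj_retire(struct cached_obj *p)
{
        /* Kick off a grace period and remember the cookie, without blocking. */
        p->gp_cookie = start_poll_synchronize_rcu();
}

/* Returns true once the object may be handed out again. */
static bool cached_obj_reusable(struct cached_obj *p)
{
        return poll_state_synchronize_rcu(p->gp_cookie);
}
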
@@ -3939,20 +4042,20 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
 /**
  * cond_synchronize_rcu - Conditionally wait for an RCU grace period
  *
- * @oldstate: value from get_state_synchronize_rcu() or start_poll_synchronize_rcu()
+ * @oldstate: value from get_state_synchronize_rcu(), start_poll_synchronize_rcu(), or start_poll_synchronize_rcu_expedited()
  *
  * If a full RCU grace period has elapsed since the earlier call to
  * get_state_synchronize_rcu() or start_poll_synchronize_rcu(), just return.
  * Otherwise, invoke synchronize_rcu() to wait for a full grace period.
  *
- * Yes, this function does not take counter wrap into account.  But
- * counter wrap is harmless.  If the counter wraps, we have waited for
+ * Yes, this function does not take counter wrap into account.
+ * But counter wrap is harmless.  If the counter wraps, we have waited for
  * more than 2 billion grace periods (and way more on a 64-bit system!),
- * so waiting for one additional grace period should be just fine.
+ * so waiting for a couple of additional grace periods should be just fine.
  *
  * This function provides the same memory-ordering guarantees that
  * would be provided by a synchronize_rcu() that was invoked at the call
- * to the function that provided @oldstate, and that returned at the end
+ * to the function that provided @oldstate and that returned at the end
  * of this function.
  */
 void cond_synchronize_rcu(unsigned long oldstate)
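
For callers that would otherwise invoke synchronize_rcu() unconditionally, the usual
pattern is to snapshot a cookie when the old data is unpublished and let
cond_synchronize_rcu() skip the wait if a grace period has already elapsed by the
time teardown happens.  A sketch under stated assumptions (struct cfg, active_cfg,
and update_config() are placeholders; the caller is assumed to provide its own
update-side exclusion):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct cfg {
        /* ... configuration payload ... */
};

static struct cfg __rcu *active_cfg;

static void update_config(struct cfg *new_cfg)
{
        struct cfg *old;
        unsigned long cookie;

        /* Publish the new configuration; "true" stands in for the caller's
         * real update-side lockdep condition. */
        old = rcu_replace_pointer(active_cfg, new_cfg, true);
        cookie = get_state_synchronize_rcu();

        /* ... unrelated teardown work proceeds here ... */

        /* Waits only if a full grace period has not already elapsed
         * since the cookie was taken. */
        cond_synchronize_rcu(cookie);
        kfree(old);
}
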
@@ -4445,6 +4548,7 @@ void rcu_report_dead(unsigned int cpu)
        rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
        if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
                /* Report quiescent state -before- changing ->qsmaskinitnext! */
+               rcu_disable_urgency_upon_qs(rdp);
                rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags);
                raw_spin_lock_irqsave_rcu_node(rnp, flags);
        }
@@ -4490,6 +4594,7 @@ void rcutree_migrate_callbacks(int cpu)
        needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
        rcu_segcblist_disable(&rdp->cblist);
        WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist));
+       check_cb_ovld_locked(my_rdp, my_rnp);
        if (rcu_rdp_is_offloaded(my_rdp)) {
                raw_spin_unlock_rcu_node(my_rnp); /* irqs remain disabled. */
                __call_rcu_nocb_wake(my_rdp, true, flags);
@@ -4705,6 +4810,9 @@ static void __init rcu_init_one(void)
                        init_waitqueue_head(&rnp->exp_wq[3]);
                        spin_lock_init(&rnp->exp_lock);
                        mutex_init(&rnp->boost_kthread_mutex);
+                       raw_spin_lock_init(&rnp->exp_poll_lock);
+                       rnp->exp_seq_poll_rq = RCU_GET_STATE_COMPLETED;
+                       INIT_WORK(&rnp->exp_poll_wq, sync_rcu_do_polled_gp);
                }
        }
 
@@ -4930,6 +5038,10 @@ void __init rcu_init(void)
                qovld_calc = DEFAULT_RCU_QOVLD_MULT * qhimark;
        else
                qovld_calc = qovld;
+
+       // Kick-start any polled grace periods that started early.
+       if (!(per_cpu_ptr(&rcu_data, cpu)->mynode->exp_seq_poll_rq & 0x1))
+               (void)start_poll_synchronize_rcu_expedited();
 }
 
 #include "tree_stall.h"