rcu: Break rcu_node_0 --> &rq->__lock order
author     Peter Zijlstra <peterz@infradead.org>
           Tue, 31 Oct 2023 08:53:08 +0000 (09:53 +0100)
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Wed, 10 Jan 2024 16:16:56 +0000 (17:16 +0100)
[ Upstream commit 85d68222ddc5f4522e456d97d201166acb50f716 ]

Commit 851a723e45d1 ("sched: Always clear user_cpus_ptr in
do_set_cpus_allowed()") added a kfree() call to free any user-provided
affinity mask, if present. This was later changed to kfree_rcu() in
commit 9a5418bc48ba ("sched/core: Use kfree_rcu() in
do_set_cpus_allowed()") to avoid a circular locking dependency.
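
For context, kfree_rcu() only queues the object for freeing after an
RCU grace period and returns immediately, so the caller takes no
scheduler or RCU-internal locks at free time. A minimal sketch of the
pattern; the struct and helper here are illustrative, not the
scheduler's actual types:

  struct mask_box {
          struct rcu_head rcu;    /* callback head used by kfree_rcu() */
          unsigned long bits[];
  };

  static void drop_mask(struct mask_box *box)
  {
          /* Defer kfree() until after a grace period; never blocks. */
          kfree_rcu(box, rcu);
  }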

It turns out that even kfree_rcu() isn't safe for avoiding the
circular locking problem. As reported by the kernel test robot,
the following circular locking dependency now exists:

  &rdp->nocb_lock --> rcu_node_0 --> &rq->__lock

Solve this by breaking the rcu_node_0 --> &rq->__lock link: move the
resched_cpu() call out from under the rcu_node lock.
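
In outline, the scan over CPUs still runs under the rcu_node lock, but
instead of calling resched_cpu() there it records which CPUs need a
kick and issues the resched only after the lock is dropped. A condensed
sketch of the resulting force_qs_rnp() flow (see the hunks below for
the real code):

  unsigned long rsmask = 0;

  raw_spin_lock_irqsave_rcu_node(rnp, flags);
  for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) {
          rdp = per_cpu_ptr(&rcu_data, cpu);
          if (f(rdp) < 0)         /* CPU needs a force resched */
                  rsmask |= rdp->grpmask;
  }
  raw_spin_unlock_irqrestore_rcu_node(rnp, flags);

  /* Now outside rnp->lock: rcu_node_0 --> &rq->__lock cannot nest. */
  for_each_leaf_node_cpu_mask(rnp, cpu, rsmask)
          resched_cpu(cpu);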

[peterz: heavily borrowed from Waiman's Changelog]
[paulmck: applied Zqiang feedback]

Fixes: 851a723e45d1 ("sched: Always clear user_cpus_ptr in do_set_cpus_allowed()")
Reported-by: kernel test robot <oliver.sang@intel.com>
Acked-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/oe-lkp/202310302207.a25f1a30-oliver.sang@intel.com
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
kernel/rcu/tree.c

index 7b4517d..92a090e 100644
@@ -755,14 +755,19 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
 }
 
 /*
- * Return true if the specified CPU has passed through a quiescent
- * state by virtue of being in or having passed through an dynticks
- * idle state since the last call to dyntick_save_progress_counter()
- * for this same CPU, or by virtue of having been offline.
+ * Returns positive if the specified CPU has passed through a quiescent state
+ * by virtue of being in or having passed through a dynticks idle state since
+ * the last call to dyntick_save_progress_counter() for this same CPU, or by
+ * virtue of having been offline.
+ *
+ * Returns negative if the specified CPU needs a force resched.
+ *
+ * Returns zero otherwise.
  */
 static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 {
        unsigned long jtsq;
+       int ret = 0;
        struct rcu_node *rnp = rdp->mynode;
 
        /*
@@ -848,8 +853,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
            (time_after(jiffies, READ_ONCE(rdp->last_fqs_resched) + jtsq * 3) ||
             rcu_state.cbovld)) {
                WRITE_ONCE(rdp->rcu_urgent_qs, true);
-               resched_cpu(rdp->cpu);
                WRITE_ONCE(rdp->last_fqs_resched, jiffies);
+               ret = -1;
        }
 
        /*
@@ -862,8 +867,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
        if (time_after(jiffies, rcu_state.jiffies_resched)) {
                if (time_after(jiffies,
                               READ_ONCE(rdp->last_fqs_resched) + jtsq)) {
-                       resched_cpu(rdp->cpu);
                        WRITE_ONCE(rdp->last_fqs_resched, jiffies);
+                       ret = -1;
                }
                if (IS_ENABLED(CONFIG_IRQ_WORK) &&
                    !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
@@ -892,7 +897,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
                }
        }
 
-       return 0;
+       return ret;
 }
 
 /* Trace-event wrapper function for trace_rcu_future_grace_period.  */
@@ -2270,15 +2275,15 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
 {
        int cpu;
        unsigned long flags;
-       unsigned long mask;
-       struct rcu_data *rdp;
        struct rcu_node *rnp;
 
        rcu_state.cbovld = rcu_state.cbovldnext;
        rcu_state.cbovldnext = false;
        rcu_for_each_leaf_node(rnp) {
+               unsigned long mask = 0;
+               unsigned long rsmask = 0;
+
                cond_resched_tasks_rcu_qs();
-               mask = 0;
                raw_spin_lock_irqsave_rcu_node(rnp, flags);
                rcu_state.cbovldnext |= !!rnp->cbovldmask;
                if (rnp->qsmask == 0) {
@@ -2296,11 +2301,17 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
                        continue;
                }
                for_each_leaf_node_cpu_mask(rnp, cpu, rnp->qsmask) {
+                       struct rcu_data *rdp;
+                       int ret;
+
                        rdp = per_cpu_ptr(&rcu_data, cpu);
-                       if (f(rdp)) {
+                       ret = f(rdp);
+                       if (ret > 0) {
                                mask |= rdp->grpmask;
                                rcu_disable_urgency_upon_qs(rdp);
                        }
+                       if (ret < 0)
+                               rsmask |= rdp->grpmask;
                }
                if (mask != 0) {
                        /* Idle/offline CPUs, report (releases rnp->lock). */
@@ -2309,6 +2320,9 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
                        /* Nothing to do here, so just drop the lock. */
                        raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
                }
+
+               for_each_leaf_node_cpu_mask(rnp, cpu, rsmask)
+                       resched_cpu(cpu);
        }
 }