sched/core: Fix ttwu() race
author     Peter Zijlstra <peterz@infradead.org>
           Mon, 22 Jun 2020 10:01:23 +0000 (12:01 +0200)
committer  Borislav Petkov <bp@suse.de>
           Sun, 28 Jun 2020 15:01:20 +0000 (17:01 +0200)

Paul reported rcutorture occasionally hitting a NULL deref:

  sched_ttwu_pending()
    ttwu_do_wakeup()
      check_preempt_curr() := check_preempt_wakeup()
        find_matching_se()
          is_same_group()
            if (se->cfs_rq == pse->cfs_rq) <-- *BOOM*

Debugging showed that this only appears to happen when we take the new
code-path from commit:

  2ebb17717550 ("sched/core: Offload wakee task activation if it the wakee is descheduling")

and only when @cpu == smp_processor_id(), something which should not
be possible, because p->on_cpu can only be true for remote tasks.
Similarly, without the new code-path from commit:

  c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")

this would've unconditionally hit:

  smp_cond_load_acquire(&p->on_cpu, !VAL);

and if 'cpu == smp_processor_id() && p->on_cpu' were possible, this
would result in an instant live-lock (with IRQs disabled), something
that hasn't been reported.
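
For reference, that spin pairs roughly like this (condensed; the
release store lives in finish_task()), which is why a local
'cpu == smp_processor_id() && p->on_cpu' could never make progress:

  // waker, in try_to_wake_up(), p->pi_lock held, IRQs disabled
  smp_cond_load_acquire(&p->on_cpu, !VAL);  // spin until p->on_cpu == 0

  // only the CPU currently running p clears it, from finish_task():
  smp_store_release(&prev->on_cpu, 0);

The waker would be waiting for a store that can only be issued by the
very CPU doing the waiting.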

The NULL deref can be explained, however, if the task_cpu(p) load at
the beginning of try_to_wake_up() returns an old value, and this old
value happens to be smp_processor_id(). Further assume that the
p->on_cpu load accurately returns 1: the task really is still running,
just not here.

Then, when we enqueue the task locally, we can crash in exactly the
observed manner: p->se.cfs_rq != rq->cfs_rq because p's cfs_rq is from
the wrong CPU, so we iterate into non-existent parents and NULL deref.
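
For context, the CONFIG_FAIR_GROUP_SCHED helpers involved look roughly
like this (paraphrased and trimmed from kernel/sched/fair.c); when se
and pse come from different CPUs' hierarchies they never share a
cfs_rq, so the walk runs off the top and the next load faults:

  static inline struct sched_entity *parent_entity(struct sched_entity *se)
  {
          return se->parent;  /* NULL for entities in the root cfs_rq */
  }

  static inline struct cfs_rq *
  is_same_group(struct sched_entity *se, struct sched_entity *pse)
  {
          if (se->cfs_rq == pse->cfs_rq)  /* <-- the faulting load */
                  return se->cfs_rq;
          return NULL;
  }

  static void find_matching_se(struct sched_entity **se, struct sched_entity **pse)
  {
          /* ... walk both entities up to equal depth ... */
          while (!is_same_group(*se, *pse)) {
                  *se = parent_entity(*se);   /* eventually NULL */
                  *pse = parent_entity(*pse);
          }
  }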

The closest semi-plausible scenario I've managed to contrive is
somewhat elaborate (then again, actual reproduction takes many CPU
hours of rcutorture, so it can't be anything obvious):

X->cpu = 1
rq(1)->curr = X

CPU0                            CPU1                            CPU2

                                // switch away from X
                                LOCK rq(1)->lock
                                smp_mb__after_spinlock
                                dequeue_task(X)
                                  X->on_rq = 0
                                switch_to(Z)
                                  X->on_cpu = 0
                                UNLOCK rq(1)->lock

                                                                // migrate X to cpu 0
                                                                LOCK rq(1)->lock
                                                                dequeue_task(X)
                                                                set_task_cpu(X, 0)
                                                                  X->cpu = 0
                                                                UNLOCK rq(1)->lock

                                                                LOCK rq(0)->lock
                                                                enqueue_task(X)
                                                                  X->on_rq = 1
                                                                UNLOCK rq(0)->lock

// switch to X
LOCK rq(0)->lock
smp_mb__after_spinlock
switch_to(X)
  X->on_cpu = 1
UNLOCK rq(0)->lock

// X goes sleep
X->state = TASK_UNINTERRUPTIBLE
smp_mb();                       // wake X
                                ttwu()
                                  LOCK X->pi_lock
                                  smp_mb__after_spinlock

                                  if (p->state)

                                  cpu = X->cpu; // =? 1

                                  smp_rmb()

// X calls schedule()
LOCK rq(0)->lock
smp_mb__after_spinlock
dequeue_task(X)
  X->on_rq = 0

                                  if (p->on_rq)

                                  smp_rmb();

                                  if (p->on_cpu && ttwu_queue_wakelist(..)) [*]

                                  smp_cond_load_acquire(&p->on_cpu, !VAL)

                                  cpu = select_task_rq(X, X->wake_cpu, ...)
                                  if (X->cpu != cpu)
switch_to(Y)
  X->on_cpu = 0
UNLOCK rq(0)->lock

However I'm having trouble convincing myself that's actually possible
on x86_64 -- after all, every LOCK implies an smp_mb() there, so if ttwu
observes ->state != RUNNING, it must also observe ->cpu != 1.

(Most of the previous ttwu() races were found on very large PowerPC
machines.)

Nevertheless, this fully explains the observed failure case.

Fix it by ordering the task_cpu(p) load after the p->on_cpu load,
which is easy since nothing actually uses @cpu before this.
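
Condensed, the try_to_wake_up() change (see the full hunk below) is:

  // before: @cpu loaded early in the function, possibly stale when used
  cpu = task_cpu(p);
  ...
  if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ))
          goto unlock;

  // after: only read task_cpu(p) once p->on_cpu has been observed, with
  // acquire ordering so it pairs with the scheduler's
  // "STORE p->cpu; ...; STORE p->on_cpu = 1" sequence
  if (smp_load_acquire(&p->on_cpu) &&
      ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_RQ))
          goto unlock;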

Fixes: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu")
Reported-by: Paul E. McKenney <paulmck@kernel.org>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20200622125649.GC576871@hirez.programming.kicks-ass.net
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c1ba2e5..60791b9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2293,8 +2293,15 @@ void sched_ttwu_pending(void *arg)
        rq_lock_irqsave(rq, &rf);
        update_rq_clock(rq);
 
-       llist_for_each_entry_safe(p, t, llist, wake_entry)
+       llist_for_each_entry_safe(p, t, llist, wake_entry) {
+               if (WARN_ON_ONCE(p->on_cpu))
+                       smp_cond_load_acquire(&p->on_cpu, !VAL);
+
+               if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
+                       set_task_cpu(p, cpu_of(rq));
+
                ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
+       }
 
        rq_unlock_irqrestore(rq, &rf);
 }
@@ -2378,6 +2385,9 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
 static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
 {
        if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
+               if (WARN_ON_ONCE(cpu == smp_processor_id()))
+                       return false;
+
                sched_clock_cpu(cpu); /* Sync clocks across CPUs */
                __ttwu_queue_wakelist(p, cpu, wake_flags);
                return true;
@@ -2528,7 +2538,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
                        goto out;
 
                success = 1;
-               cpu = task_cpu(p);
                trace_sched_waking(p);
                p->state = TASK_RUNNING;
                trace_sched_wakeup(p);
@@ -2550,7 +2559,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 
        /* We're going to change ->state: */
        success = 1;
-       cpu = task_cpu(p);
 
        /*
         * Ensure we load p->on_rq _after_ p->state, otherwise it would
@@ -2614,8 +2622,21 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         * which potentially sends an IPI instead of spinning on p->on_cpu to
         * let the waker make forward progress. This is safe because IRQs are
         * disabled and the IPI will deliver after on_cpu is cleared.
+        *
+        * Ensure we load task_cpu(p) after p->on_cpu:
+        *
+        * set_task_cpu(p, cpu);
+        *   STORE p->cpu = @cpu
+        * __schedule() (switch to task 'p')
+        *   LOCK rq->lock
+        *   smp_mb__after_spin_lock()          smp_cond_load_acquire(&p->on_cpu)
+        *   STORE p->on_cpu = 1                LOAD p->cpu
+        *
+        * to ensure we observe the correct CPU on which the task is currently
+        * scheduling.
         */
-       if (READ_ONCE(p->on_cpu) && ttwu_queue_wakelist(p, cpu, wake_flags | WF_ON_RQ))
+       if (smp_load_acquire(&p->on_cpu) &&
+           ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_RQ))
                goto unlock;
 
        /*
@@ -2635,6 +2656,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
                psi_ttwu_dequeue(p);
                set_task_cpu(p, cpu);
        }
+#else
+       cpu = task_cpu(p);
 #endif /* CONFIG_SMP */
 
        ttwu_queue(p, cpu, wake_flags);
@@ -2642,7 +2665,7 @@ unlock:
        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 out:
        if (success)
-               ttwu_stat(p, cpu, wake_flags);
+               ttwu_stat(p, task_cpu(p), wake_flags);
        preempt_enable();
 
        return success;