Merge tag 'hardening-v6.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees...

[platform/kernel/linux-starfive.git] / kernel / sched / core.c
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 285ef88..25b582b 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1392,7 +1392,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
         if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
                 return;
  
-       WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
+       uclamp_rq_set(rq, clamp_id, clamp_value);
  }
  
  static inline
@@ -1543,8 +1543,8 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
         if (bucket->tasks == 1 || uc_se->value > bucket->value)
                 bucket->value = uc_se->value;
  
-       if (uc_se->value > READ_ONCE(uc_rq->value))
-               WRITE_ONCE(uc_rq->value, uc_se->value);
+       if (uc_se->value > uclamp_rq_get(rq, clamp_id))
+               uclamp_rq_set(rq, clamp_id, uc_se->value);
  }
  
  /*
@@ -1610,7 +1610,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
         if (likely(bucket->tasks))
                 return;
  
-       rq_clamp = READ_ONCE(uc_rq->value);
+       rq_clamp = uclamp_rq_get(rq, clamp_id);
         /*
          * Defensive programming: this should never happen. If it happens,
          * e.g. due to future modification, warn and fixup the expected value.
@@ -1618,7 +1618,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
         SCHED_WARN_ON(bucket->value > rq_clamp);
         if (bucket->value >= rq_clamp) {
                 bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
-               WRITE_ONCE(uc_rq->value, bkt_clamp);
+               uclamp_rq_set(rq, clamp_id, bkt_clamp);
         }
  }
  
@@ -2053,7 +2053,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
  
         if (!(flags & ENQUEUE_RESTORE)) {
                 sched_info_enqueue(rq, p);
-               psi_enqueue(p, flags & ENQUEUE_WAKEUP);
+               psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
         }
  
         uclamp_rq_inc(rq, p);
@@ -2189,14 +2189,18 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
  #ifdef CONFIG_SMP
  
  static void
-__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
  
  static int __set_cpus_allowed_ptr(struct task_struct *p,
-                                 const struct cpumask *new_mask,
-                                 u32 flags);
+                                 struct affinity_context *ctx);
  
  static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
  {
+       struct affinity_context ac = {
+               .new_mask  = cpumask_of(rq->cpu),
+               .flags     = SCA_MIGRATE_DISABLE,
+       };
+
         if (likely(!p->migration_disabled))
                 return;
  
@@ -2206,7 +2210,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
         /*
          * Violates locking rules! see comment in __do_set_cpus_allowed().
          */
-       __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
+       __do_set_cpus_allowed(p, &ac);
  }
  
  void migrate_disable(void)
@@ -2228,6 +2232,10 @@ EXPORT_SYMBOL_GPL(migrate_disable);
  void migrate_enable(void)
  {
         struct task_struct *p = current;
+       struct affinity_context ac = {
+               .new_mask  = &p->cpus_mask,
+               .flags     = SCA_MIGRATE_ENABLE,
+       };
  
         if (p->migration_disabled > 1) {
                 p->migration_disabled--;
@@ -2243,7 +2251,7 @@ void migrate_enable(void)
          */
         preempt_disable();
         if (p->cpus_ptr != &p->cpus_mask)
-               __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+               __set_cpus_allowed_ptr(p, &ac);
         /*
          * Mustn't clear migration_disabled() until cpus_ptr points back at the
          * regular cpus_mask, otherwise things that race (eg.
@@ -2523,19 +2531,25 @@ out_unlock:
   * sched_class::set_cpus_allowed must do the below, but is not required to
   * actually call this function.
   */
-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
+void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx)
  {
-       if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
-               p->cpus_ptr = new_mask;
+       if (ctx->flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
+               p->cpus_ptr = ctx->new_mask;
                 return;
         }
  
-       cpumask_copy(&p->cpus_mask, new_mask);
-       p->nr_cpus_allowed = cpumask_weight(new_mask);
+       cpumask_copy(&p->cpus_mask, ctx->new_mask);
+       p->nr_cpus_allowed = cpumask_weight(ctx->new_mask);
+
+       /*
+        * Swap in a new user_cpus_ptr if the SCA_USER flag is set.
+        */
+       if (ctx->flags & SCA_USER)
+               swap(p->user_cpus_ptr, ctx->user_mask);
  }
  
  static void
-__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
+__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
  {
         struct rq *rq = task_rq(p);
         bool queued, running;
@@ -2552,7 +2566,7 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32
          *
          * XXX do further audits, this smells like something putrid.
          */
-       if (flags & SCA_MIGRATE_DISABLE)
+       if (ctx->flags & SCA_MIGRATE_DISABLE)
                 SCHED_WARN_ON(!p->on_cpu);
         else
                 lockdep_assert_held(&p->pi_lock);
@@ -2571,7 +2585,7 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32
         if (running)
                 put_prev_task(rq, p);
  
-       p->sched_class->set_cpus_allowed(p, new_mask, flags);
+       p->sched_class->set_cpus_allowed(p, ctx);
  
         if (queued)
                 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -2579,14 +2593,27 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32
                 set_next_task(rq, p);
  }
  
+/*
+ * Used for kthread_bind() and select_fallback_rq(), in both cases the user
+ * affinity (if any) should be destroyed too.
+ */
  void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
  {
-       __do_set_cpus_allowed(p, new_mask, 0);
+       struct affinity_context ac = {
+               .new_mask  = new_mask,
+               .user_mask = NULL,
+               .flags     = SCA_USER,  /* clear the user requested mask */
+       };
+
+       __do_set_cpus_allowed(p, &ac);
+       kfree(ac.user_mask);
  }
  
  int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
                       int node)
  {
+       unsigned long flags;
+
         if (!src->user_cpus_ptr)
                 return 0;
  
@@ -2594,7 +2621,10 @@ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
         if (!dst->user_cpus_ptr)
                 return -ENOMEM;
  
+       /* Use pi_lock to protect content of user_cpus_ptr */
+       raw_spin_lock_irqsave(&src->pi_lock, flags);
         cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+       raw_spin_unlock_irqrestore(&src->pi_lock, flags);
         return 0;
  }
  
@@ -2690,6 +2720,8 @@ void release_user_cpus_ptr(struct task_struct *p)
   */
  static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
                             int dest_cpu, unsigned int flags)
+       __releases(rq->lock)
+       __releases(p->pi_lock)
  {
         struct set_affinity_pending my_pending = { }, *pending = NULL;
         bool stop_pending, complete = false;
@@ -2832,8 +2864,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
   * Called with both p->pi_lock and rq->lock held; drops both before returning.
   */
  static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
-                                        const struct cpumask *new_mask,
-                                        u32 flags,
+                                        struct affinity_context *ctx,
                                          struct rq *rq,
                                          struct rq_flags *rf)
         __releases(rq->lock)
@@ -2842,7 +2873,6 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
         const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
         const struct cpumask *cpu_valid_mask = cpu_active_mask;
         bool kthread = p->flags & PF_KTHREAD;
-       struct cpumask *user_mask = NULL;
         unsigned int dest_cpu;
         int ret = 0;
  
@@ -2862,7 +2892,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
                 cpu_valid_mask = cpu_online_mask;
         }
  
-       if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) {
+       if (!kthread && !cpumask_subset(ctx->new_mask, cpu_allowed_mask)) {
                 ret = -EINVAL;
                 goto out;
         }
@@ -2871,18 +2901,18 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
          * Must re-check here, to close a race against __kthread_bind(),
          * sched_setaffinity() is not guaranteed to observe the flag.
          */
-       if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
+       if ((ctx->flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
                 ret = -EINVAL;
                 goto out;
         }
  
-       if (!(flags & SCA_MIGRATE_ENABLE)) {
-               if (cpumask_equal(&p->cpus_mask, new_mask))
+       if (!(ctx->flags & SCA_MIGRATE_ENABLE)) {
+               if (cpumask_equal(&p->cpus_mask, ctx->new_mask))
                         goto out;
  
                 if (WARN_ON_ONCE(p == current &&
                                  is_migration_disabled(p) &&
-                                !cpumask_test_cpu(task_cpu(p), new_mask))) {
+                                !cpumask_test_cpu(task_cpu(p), ctx->new_mask))) {
                         ret = -EBUSY;
                         goto out;
                 }
@@ -2893,22 +2923,15 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
          * for groups of tasks (ie. cpuset), so that load balancing is not
          * immediately required to distribute the tasks within their new mask.
          */
-       dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
+       dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ctx->new_mask);
         if (dest_cpu >= nr_cpu_ids) {
                 ret = -EINVAL;
                 goto out;
         }
  
-       __do_set_cpus_allowed(p, new_mask, flags);
-
-       if (flags & SCA_USER)
-               user_mask = clear_user_cpus_ptr(p);
-
-       ret = affine_move_task(rq, p, rf, dest_cpu, flags);
-
-       kfree(user_mask);
+       __do_set_cpus_allowed(p, ctx);
  
-       return ret;
+       return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);
  
  out:
         task_rq_unlock(rq, p, rf);
@@ -2926,25 +2949,41 @@ out:
   * call is not atomic; no spinlocks may be held.
   */
  static int __set_cpus_allowed_ptr(struct task_struct *p,
-                                 const struct cpumask *new_mask, u32 flags)
+                                 struct affinity_context *ctx)
  {
         struct rq_flags rf;
         struct rq *rq;
  
         rq = task_rq_lock(p, &rf);
-       return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
+       /*
+        * Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_*
+        * flags are set.
+        */
+       if (p->user_cpus_ptr &&
+           !(ctx->flags & (SCA_USER | SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) &&
+           cpumask_and(rq->scratch_mask, ctx->new_mask, p->user_cpus_ptr))
+               ctx->new_mask = rq->scratch_mask;
+
+       return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf);
  }
  
  int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
  {
-       return __set_cpus_allowed_ptr(p, new_mask, 0);
+       struct affinity_context ac = {
+               .new_mask  = new_mask,
+               .flags     = 0,
+       };
+
+       return __set_cpus_allowed_ptr(p, &ac);
  }
  EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
  
  /*
   * Change a given task's CPU affinity to the intersection of its current
- * affinity mask and @subset_mask, writing the resulting mask to @new_mask
- * and pointing @p->user_cpus_ptr to a copy of the old mask.
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
+ * If user_cpus_ptr is defined, use it as the basis for restricting CPU
+ * affinity or use cpu_online_mask instead.
+ *
   * If the resulting mask is empty, leave the affinity unchanged and return
   * -EINVAL.
   */
@@ -2952,17 +2991,14 @@ static int restrict_cpus_allowed_ptr(struct task_struct *p,
                                      struct cpumask *new_mask,
                                      const struct cpumask *subset_mask)
  {
-       struct cpumask *user_mask = NULL;
+       struct affinity_context ac = {
+               .new_mask  = new_mask,
+               .flags     = 0,
+       };
         struct rq_flags rf;
         struct rq *rq;
         int err;
  
-       if (!p->user_cpus_ptr) {
-               user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
-               if (!user_mask)
-                       return -ENOMEM;
-       }
-
         rq = task_rq_lock(p, &rf);
  
         /*
@@ -2975,31 +3011,21 @@ static int restrict_cpus_allowed_ptr(struct task_struct *p,
                 goto err_unlock;
         }
  
-       if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
+       if (!cpumask_and(new_mask, task_user_cpus(p), subset_mask)) {
                 err = -EINVAL;
                 goto err_unlock;
         }
  
-       /*
-        * We're about to butcher the task affinity, so keep track of what
-        * the user asked for in case we're able to restore it later on.
-        */
-       if (user_mask) {
-               cpumask_copy(user_mask, p->cpus_ptr);
-               p->user_cpus_ptr = user_mask;
-       }
-
-       return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
+       return __set_cpus_allowed_ptr_locked(p, &ac, rq, &rf);
  
  err_unlock:
         task_rq_unlock(rq, p, &rf);
-       kfree(user_mask);
         return err;
  }
  
  /*
   * Restrict the CPU affinity of task @p so that it is a subset of
- * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the
+ * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the
   * old affinity mask. If the resulting mask is empty, we warn and walk
   * up the cpuset hierarchy until we find a suitable mask.
   */
@@ -3043,34 +3069,29 @@ out_free_mask:
  }
  
  static int
-__sched_setaffinity(struct task_struct *p, const struct cpumask *mask);
+__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
  
  /*
   * Restore the affinity of a task @p which was previously restricted by a
- * call to force_compatible_cpus_allowed_ptr(). This will clear (and free)
- * @p->user_cpus_ptr.
+ * call to force_compatible_cpus_allowed_ptr().
   *
   * It is the caller's responsibility to serialise this with any calls to
   * force_compatible_cpus_allowed_ptr(@p).
   */
  void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
  {
-       struct cpumask *user_mask = p->user_cpus_ptr;
-       unsigned long flags;
+       struct affinity_context ac = {
+               .new_mask  = task_user_cpus(p),
+               .flags     = 0,
+       };
+       int ret;
  
         /*
-        * Try to restore the old affinity mask. If this fails, then
-        * we free the mask explicitly to avoid it being inherited across
-        * a subsequent fork().
+        * Try to restore the old affinity mask with __sched_setaffinity().
+        * Cpuset masking will be done there too.
          */
-       if (!user_mask || !__sched_setaffinity(p, user_mask))
-               return;
-
-       raw_spin_lock_irqsave(&p->pi_lock, flags);
-       user_mask = clear_user_cpus_ptr(p);
-       raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-
-       kfree(user_mask);
+       ret = __sched_setaffinity(p, &ac);
+       WARN_ON_ONCE(ret);
  }
  
  void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@ -3548,10 +3569,9 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
  #else /* CONFIG_SMP */
  
  static inline int __set_cpus_allowed_ptr(struct task_struct *p,
-                                        const struct cpumask *new_mask,
-                                        u32 flags)
+                                        struct affinity_context *ctx)
  {
-       return set_cpus_allowed_ptr(p, new_mask);
+       return set_cpus_allowed_ptr(p, ctx->new_mask);
  }
  
  static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }
@@ -3719,13 +3739,6 @@ void sched_ttwu_pending(void *arg)
         if (!llist)
                 return;
  
-       /*
-        * rq::ttwu_pending racy indication of out-standing wakeups.
-        * Races such that false-negatives are possible, since they
-        * are shorter lived that false-positives would be.
-        */
-       WRITE_ONCE(rq->ttwu_pending, 0);
-
         rq_lock_irqsave(rq, &rf);
         update_rq_clock(rq);
  
@@ -3739,6 +3752,17 @@ void sched_ttwu_pending(void *arg)
                 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
         }
  
+       /*
+        * Must be after enqueueing at least one task such that
+        * idle_cpu() does not observe a false-negative -- if it does,
+        * it is possible for select_idle_siblings() to stack a number
+        * of tasks on this CPU during that window.
+        *
+        * It is ok to clear ttwu_pending when another task is pending.
+        * We will receive an IPI once local irqs are enabled and then enqueue it.
+        * Since now nr_running > 0, idle_cpu() will always get the correct result.
+        */
+       WRITE_ONCE(rq->ttwu_pending, 0);
         rq_unlock_irqrestore(rq, &rf);
  }
  
@@ -4200,6 +4224,40 @@ out:
         return success;
  }
  
+static bool __task_needs_rq_lock(struct task_struct *p)
+{
+       unsigned int state = READ_ONCE(p->__state);
+
+       /*
+        * Since p->pi_lock blocks try_to_wake_up(), we don't need rq->lock when
+        * the task is blocked. Make sure to check @state since ttwu() can drop
+        * locks at the end, see ttwu_queue_wakelist().
+        */
+       if (state == TASK_RUNNING || state == TASK_WAKING)
+               return true;
+
+       /*
+        * Ensure we load p->on_rq after p->__state, otherwise it would be
+        * possible to, falsely, observe p->on_rq == 0.
+        *
+        * See try_to_wake_up() for a longer comment.
+        */
+       smp_rmb();
+       if (p->on_rq)
+               return true;
+
+#ifdef CONFIG_SMP
+       /*
+        * Ensure the task has finished __schedule() and will not be referenced
+        * anymore. Again, see try_to_wake_up() for a longer comment.
+        */
+       smp_rmb();
+       smp_cond_load_acquire(&p->on_cpu, !VAL);
+#endif
+
+       return false;
+}
+
  /**
   * task_call_func - Invoke a function on task in fixed state
   * @p: Process for which the function is to be invoked, can be @current.
@@ -4217,28 +4275,12 @@ out:
  int task_call_func(struct task_struct *p, task_call_f func, void *arg)
  {
         struct rq *rq = NULL;
-       unsigned int state;
         struct rq_flags rf;
         int ret;
  
         raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
  
-       state = READ_ONCE(p->__state);
-
-       /*
-        * Ensure we load p->on_rq after p->__state, otherwise it would be
-        * possible to, falsely, observe p->on_rq == 0.
-        *
-        * See try_to_wake_up() for a longer comment.
-        */
-       smp_rmb();
-
-       /*
-        * Since pi->lock blocks try_to_wake_up(), we don't need rq->lock when
-        * the task is blocked. Make sure to check @state since ttwu() can drop
-        * locks at the end, see ttwu_queue_wakelist().
-        */
-       if (state == TASK_RUNNING || state == TASK_WAKING || p->on_rq)
+       if (__task_needs_rq_lock(p))
                 rq = __task_rq_lock(p, &rf);
  
         /*
@@ -4401,7 +4443,7 @@ static void reset_memory_tiering(void)
         }
  }
  
-int sysctl_numa_balancing(struct ctl_table *table, int write,
+static int sysctl_numa_balancing(struct ctl_table *table, int write,
                           void *buffer, size_t *lenp, loff_t *ppos)
  {
         struct ctl_table t;
@@ -4528,6 +4570,17 @@ static struct ctl_table sched_core_sysctls[] = {
                 .proc_handler   = sysctl_sched_uclamp_handler,
         },
  #endif /* CONFIG_UCLAMP_TASK */
+#ifdef CONFIG_NUMA_BALANCING
+       {
+               .procname       = "numa_balancing",
+               .data           = NULL, /* filled in by handler */
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = sysctl_numa_balancing,
+               .extra1         = SYSCTL_ZERO,
+               .extra2         = SYSCTL_FOUR,
+       },
+#endif /* CONFIG_NUMA_BALANCING */
         {}
  };
  static int __init sched_core_sysctl_init(void)
@@ -4823,10 +4876,10 @@ static inline void finish_task(struct task_struct *prev)
  
  #ifdef CONFIG_SMP
  
-static void do_balance_callbacks(struct rq *rq, struct callback_head *head)
+static void do_balance_callbacks(struct rq *rq, struct balance_callback *head)
  {
         void (*func)(struct rq *rq);
-       struct callback_head *next;
+       struct balance_callback *next;
  
         lockdep_assert_rq_held(rq);
  
@@ -4853,15 +4906,15 @@ static void balance_push(struct rq *rq);
   * This abuse is tolerated because it places all the unlikely/odd cases behind
   * a single test, namely: rq->balance_callback == NULL.
   */
-struct callback_head balance_push_callback = {
+struct balance_callback balance_push_callback = {
         .next = NULL,
-       .func = (void (*)(struct callback_head *))balance_push,
+       .func = balance_push,
  };
  
-static inline struct callback_head *
+static inline struct balance_callback *
  __splice_balance_callbacks(struct rq *rq, bool split)
  {
-       struct callback_head *head = rq->balance_callback;
+       struct balance_callback *head = rq->balance_callback;
  
         if (likely(!head))
                 return NULL;
@@ -4883,7 +4936,7 @@ __splice_balance_callbacks(struct rq *rq, bool split)
         return head;
  }
  
-static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
  {
         return __splice_balance_callbacks(rq, true);
  }
@@ -4893,7 +4946,7 @@ static void __balance_callbacks(struct rq *rq)
         do_balance_callbacks(rq, __splice_balance_callbacks(rq, false));
  }
  
-static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
  {
         unsigned long flags;
  
@@ -4910,12 +4963,12 @@ static inline void __balance_callbacks(struct rq *rq)
  {
  }
  
-static inline struct callback_head *splice_balance_callbacks(struct rq *rq)
+static inline struct balance_callback *splice_balance_callbacks(struct rq *rq)
  {
         return NULL;
  }
  
-static inline void balance_callbacks(struct rq *rq, struct callback_head *head)
+static inline void balance_callbacks(struct rq *rq, struct balance_callback *head)
  {
  }
  
@@ -6187,7 +6240,7 @@ static void sched_core_balance(struct rq *rq)
         preempt_enable();
  }
  
-static DEFINE_PER_CPU(struct callback_head, core_balance_head);
+static DEFINE_PER_CPU(struct balance_callback, core_balance_head);
  
  static void queue_core_balance(struct rq *rq)
  {
@@ -7418,7 +7471,7 @@ static int __sched_setscheduler(struct task_struct *p,
         int oldpolicy = -1, policy = attr->sched_policy;
         int retval, oldprio, newprio, queued, running;
         const struct sched_class *prev_class;
-       struct callback_head *head;
+       struct balance_callback *head;
         struct rq_flags rf;
         int reset_on_fork;
         int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
@@ -8087,7 +8140,7 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
  #endif
  
  static int
-__sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
+__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
  {
         int retval;
         cpumask_var_t cpus_allowed, new_mask;
@@ -8101,13 +8154,16 @@ __sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
         }
  
         cpuset_cpus_allowed(p, cpus_allowed);
-       cpumask_and(new_mask, mask, cpus_allowed);
+       cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
+
+       ctx->new_mask = new_mask;
+       ctx->flags |= SCA_CHECK;
  
         retval = dl_task_check_affinity(p, new_mask);
         if (retval)
                 goto out_free_new_mask;
-again:
-       retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER);
+
+       retval = __set_cpus_allowed_ptr(p, ctx);
         if (retval)
                 goto out_free_new_mask;
  
@@ -8118,7 +8174,24 @@ again:
                  * Just reset the cpumask to the cpuset's cpus_allowed.
                  */
                 cpumask_copy(new_mask, cpus_allowed);
-               goto again;
+
+               /*
+                * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
+                * will restore the previous user_cpus_ptr value.
+                *
+                * In the unlikely event a previous user_cpus_ptr exists,
+                * we need to further restrict the mask to what is allowed
+                * by that old user_cpus_ptr.
+                */
+               if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
+                       bool empty = !cpumask_and(new_mask, new_mask,
+                                                 ctx->user_mask);
+
+                       if (WARN_ON_ONCE(empty))
+                               cpumask_copy(new_mask, cpus_allowed);
+               }
+               __set_cpus_allowed_ptr(p, ctx);
+               retval = -EINVAL;
         }
  
  out_free_new_mask:
@@ -8130,6 +8203,8 @@ out_free_cpus_allowed:
  
  long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
  {
+       struct affinity_context ac;
+       struct cpumask *user_mask;
         struct task_struct *p;
         int retval;
  
@@ -8164,7 +8239,21 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
         if (retval)
                 goto out_put_task;
  
-       retval = __sched_setaffinity(p, in_mask);
+       user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
+       if (!user_mask) {
+               retval = -ENOMEM;
+               goto out_put_task;
+       }
+       cpumask_copy(user_mask, in_mask);
+       ac = (struct affinity_context){
+               .new_mask  = in_mask,
+               .user_mask = user_mask,
+               .flags     = SCA_USER,
+       };
+
+       retval = __sched_setaffinity(p, &ac);
+       kfree(ac.user_mask);
+
  out_put_task:
         put_task_struct(p);
         return retval;
@@ -8945,6 +9034,12 @@ void show_state_filter(unsigned int state_filter)
   */
  void __init init_idle(struct task_struct *idle, int cpu)
  {
+#ifdef CONFIG_SMP
+       struct affinity_context ac = (struct affinity_context) {
+               .new_mask  = cpumask_of(cpu),
+               .flags     = 0,
+       };
+#endif
         struct rq *rq = cpu_rq(cpu);
         unsigned long flags;
  
@@ -8969,7 +9064,7 @@ void __init init_idle(struct task_struct *idle, int cpu)
          *
          * And since this is boot we can forgo the serialization.
          */
-       set_cpus_allowed_common(idle, cpumask_of(cpu), 0);
+       set_cpus_allowed_common(idle, &ac);
  #endif
         /*
          * We're having a chicken and egg problem, even though we are
@@ -9756,6 +9851,7 @@ void __init sched_init(void)
  
                 rq->core_cookie = 0UL;
  #endif
+               zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
         }
  
         set_load_weight(&init_task, false);