Merge tag 'sched-core-2022-08-01' of git://git.kernel.org/pub/scm/linux/kernel/git...
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index da0bf6f..5555e49 100644
@@ -873,15 +873,11 @@ static inline void hrtick_rq_init(struct rq *rq)
        ({                                                              \
                typeof(ptr) _ptr = (ptr);                               \
                typeof(mask) _mask = (mask);                            \
-               typeof(*_ptr) _old, _val = *_ptr;                       \
+               typeof(*_ptr) _val = *_ptr;                             \
                                                                        \
-               for (;;) {                                              \
-                       _old = cmpxchg(_ptr, _val, _val | _mask);       \
-                       if (_old == _val)                               \
-                               break;                                  \
-                       _val = _old;                                    \
-               }                                                       \
-       _old;                                                           \
+               do {                                                    \
+               } while (!try_cmpxchg(_ptr, &_val, _val | _mask));      \
+       _val;                                                           \
 })
 
 #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
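
The rewrite above leans on try_cmpxchg()'s calling convention: on failure it writes the value it actually found back through its second argument, so the explicit _val = _old reload of the old cmpxchg() loop disappears and the loop body can stay empty. A minimal standalone sketch of the same pattern, using GCC's __atomic builtins rather than the kernel primitives (names here are illustrative, not from the patch):

    #include <stdbool.h>

    /* Atomically OR `mask` into `*ptr` and return the previous value;
     * the same shape as the rewritten fetch_or() above. */
    static unsigned long fetch_or_sketch(unsigned long *ptr, unsigned long mask)
    {
        unsigned long val = *ptr;

        /* On failure, __atomic_compare_exchange_n() refreshes `val`
         * with the current contents of *ptr, so no manual reload. */
        do {
        } while (!__atomic_compare_exchange_n(ptr, &val, val | mask,
                                              false, __ATOMIC_SEQ_CST,
                                              __ATOMIC_RELAXED));
        return val;
    }
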
@@ -890,7 +886,7 @@ static inline void hrtick_rq_init(struct rq *rq)
  * this avoids any races wrt polling state changes and thereby avoids
  * spurious IPIs.
  */
-static bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct task_struct *p)
 {
        struct thread_info *ti = task_thread_info(p);
        return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
@@ -905,30 +901,28 @@ static bool set_nr_and_not_polling(struct task_struct *p)
 static bool set_nr_if_polling(struct task_struct *p)
 {
        struct thread_info *ti = task_thread_info(p);
-       typeof(ti->flags) old, val = READ_ONCE(ti->flags);
+       typeof(ti->flags) val = READ_ONCE(ti->flags);
 
        for (;;) {
                if (!(val & _TIF_POLLING_NRFLAG))
                        return false;
                if (val & _TIF_NEED_RESCHED)
                        return true;
-               old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
-               if (old == val)
+               if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED))
                        break;
-               val = old;
        }
        return true;
 }
 
 #else
-static bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct task_struct *p)
 {
        set_tsk_need_resched(p);
        return true;
 }
 
 #ifdef CONFIG_SMP
-static bool set_nr_if_polling(struct task_struct *p)
+static inline bool set_nr_if_polling(struct task_struct *p)
 {
        return false;
 }
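
set_nr_if_polling() keeps its for (;;) form because two early-return conditions must be re-evaluated each time try_cmpxchg() fails and refreshes val. A hedged user-space analogue of that control flow (illustrative flag names, GCC builtins):

    #include <stdbool.h>

    #define FLAG_POLLING    0x1
    #define FLAG_RESCHED    0x2

    /* Set FLAG_RESCHED only while FLAG_POLLING is still observed;
     * mirrors the loop in set_nr_if_polling() above. */
    static bool set_if_polling_sketch(unsigned long *flags)
    {
        unsigned long val = __atomic_load_n(flags, __ATOMIC_RELAXED);

        for (;;) {
            if (!(val & FLAG_POLLING))
                return false;
            if (val & FLAG_RESCHED)
                return true;
            /* On failure, val is refreshed and the checks rerun. */
            if (__atomic_compare_exchange_n(flags, &val, val | FLAG_RESCHED,
                                            false, __ATOMIC_SEQ_CST,
                                            __ATOMIC_RELAXED))
                break;
        }
        return true;
    }
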
@@ -3808,7 +3802,7 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
        return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
 
-static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+static inline bool ttwu_queue_cond(int cpu)
 {
        /*
         * Do not complicate things with the async wake_list while the CPU is
@@ -3824,13 +3818,21 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
        if (!cpus_share_cache(smp_processor_id(), cpu))
                return true;
 
+       if (cpu == smp_processor_id())
+               return false;
+
        /*
-        * If the task is descheduling and the only running task on the
-        * CPU then use the wakelist to offload the task activation to
-        * the soon-to-be-idle CPU as the current CPU is likely busy.
-        * nr_running is checked to avoid unnecessary task stacking.
+        * If the wakee cpu is idle, or the task is descheduling and the
+        * only running task on the CPU, then use the wakelist to offload
+        * the task activation to the idle (or soon-to-be-idle) CPU as
+        * the current CPU is likely busy. nr_running is checked to
+        * avoid unnecessary task stacking.
+        *
+        * Note that we can only get here with (wakee) p->on_rq=0,
+        * p->on_cpu can be whatever, we've done the dequeue, so
+        * the wakee has been accounted out of ->nr_running.
         */
-       if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
+       if (!cpu_rq(cpu)->nr_running)
                return true;
 
        return false;
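
Taken together, these hunks leave ttwu_queue_cond() with the decision order below. This is a sketch assembled only from the lines visible in this diff; checks the diff does not show are marked rather than guessed at:

    static inline bool ttwu_queue_cond(int cpu)
    {
        /* ... hotplug (CPU-active) check elided by this diff ... */

        /* Different LLC: queue remotely to avoid touching remote
         * run-queue data from this CPU. */
        if (!cpus_share_cache(smp_processor_id(), cpu))
            return true;

        /* Never queue to ourselves (previously a WARN in the caller). */
        if (cpu == smp_processor_id())
            return false;

        /* Wakee CPU idle or soon-to-be-idle: offload the activation. */
        if (!cpu_rq(cpu)->nr_running)
            return true;

        return false;
    }
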
@@ -3838,10 +3840,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
 
 static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
 {
-       if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
-               if (WARN_ON_ONCE(cpu == smp_processor_id()))
-                       return false;
-
+       if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu)) {
                sched_clock_cpu(cpu); /* Sync clocks across CPUs */
                __ttwu_queue_wakelist(p, cpu, wake_flags);
                return true;
@@ -4163,7 +4162,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         * scheduling.
         */
        if (smp_load_acquire(&p->on_cpu) &&
-           ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
+           ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
                goto unlock;
 
        /*
@@ -4753,7 +4752,8 @@ static inline void prepare_task(struct task_struct *next)
         * Claim the task as running, we do this before switching to it
         * such that any running task will have this set.
         *
-        * See the ttwu() WF_ON_CPU case and its ordering comment.
+        * See the smp_load_acquire(&p->on_cpu) case in ttwu() and
+        * its ordering comment.
         */
        WRITE_ONCE(next->on_cpu, 1);
 #endif
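
The pairing referred to here is between the scheduler's stores to on_cpu and the acquire load in try_to_wake_up(): a waker that observes on_cpu == 0 (written with smp_store_release() in finish_task()) is guaranteed to also observe everything the previous CPU did with the task beforehand. A loose user-space analogue in C11 atomics, not the kernel's primitives:

    #include <stdatomic.h>

    static _Atomic int on_cpu = 1;  /* task currently running */
    static int task_state;          /* stand-in for state the pairing protects */

    void finish_task_analogue(void)  /* previous CPU */
    {
        task_state = 42;             /* all prior work on the task */
        atomic_store_explicit(&on_cpu, 0, memory_order_release);
    }

    int waker_analogue(void)         /* ttwu() side */
    {
        if (atomic_load_explicit(&on_cpu, memory_order_acquire) == 0)
            return task_state;       /* sees 42, not a stale value */
        return -1;                   /* task still on its CPU */
    }
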
@@ -6500,8 +6500,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
                        io_wq_worker_sleeping(tsk);
        }
 
-       if (tsk_is_pi_blocked(tsk))
-               return;
+       /*
+        * spinlock and rwlock must not flush block requests.  This will
+        * deadlock if the callback attempts to acquire a lock which is
+        * already acquired.
+        */
+       SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);
 
        /*
         * If we are going to sleep and we have plugged IO queued,
@@ -6998,17 +7002,29 @@ out_unlock:
 EXPORT_SYMBOL(set_user_nice);
 
 /*
- * can_nice - check if a task can reduce its nice value
+ * is_nice_reduction - check if nice value is an actual reduction
+ *
+ * Similar to can_nice() but does not perform a capability check.
+ *
  * @p: task
  * @nice: nice value
  */
-int can_nice(const struct task_struct *p, const int nice)
+static bool is_nice_reduction(const struct task_struct *p, const int nice)
 {
        /* Convert nice value [19,-20] to rlimit style value [1,40]: */
        int nice_rlim = nice_to_rlimit(nice);
 
-       return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
-               capable(CAP_SYS_NICE));
+       return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
+}
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+       return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
 }
 
 #ifdef __ARCH_WANT_SYS_NICE
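
For reference, nice_to_rlimit() maps the nice range [19, -20] onto the rlimit-style range [1, 40] as MAX_NICE - nice + 1, with MAX_NICE == 19: nice 19 -> 1, nice 0 -> 20, nice -20 -> 40. A worked sketch of the arithmetic behind is_nice_reduction() (standalone reimplementation, not the kernel code):

    #include <stdbool.h>

    #define MAX_NICE 19

    static long nice_to_rlimit_sketch(long nice)
    {
        return MAX_NICE - nice + 1;  /* 19 -> 1, 0 -> 20, -20 -> 40 */
    }

    /* A nice value is an actual reduction when its rlimit-style
     * equivalent stays within RLIMIT_NICE. */
    static bool is_nice_reduction_sketch(long nice, unsigned long rlimit_nice)
    {
        return nice_to_rlimit_sketch(nice) <= (long)rlimit_nice;
    }
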
@@ -7137,12 +7153,14 @@ struct task_struct *idle_task(int cpu)
  * required to meet deadlines.
  */
 unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
-                                unsigned long max, enum cpu_util_type type,
+                                enum cpu_util_type type,
                                 struct task_struct *p)
 {
-       unsigned long dl_util, util, irq;
+       unsigned long dl_util, util, irq, max;
        struct rq *rq = cpu_rq(cpu);
 
+       max = arch_scale_cpu_capacity(cpu);
+
        if (!uclamp_is_used() &&
            type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
                return max;
@@ -7222,10 +7240,9 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs,
        return min(max, util);
 }
 
-unsigned long sched_cpu_util(int cpu, unsigned long max)
+unsigned long sched_cpu_util(int cpu)
 {
-       return effective_cpu_util(cpu, cpu_util_cfs(cpu), max,
-                                 ENERGY_UTIL, NULL);
+       return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENERGY_UTIL, NULL);
 }
 #endif /* CONFIG_SMP */
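
With the max parameter gone, effective_cpu_util() derives the capacity itself from arch_scale_cpu_capacity(), so external callers of sched_cpu_util() simply drop the argument. A hypothetical caller (not from the patch) showing the adapted API:

    /* Report a CPU's effective utilization as a percentage of its
     * capacity, using the simplified sched_cpu_util() signature. */
    static unsigned long cpu_util_percent_sketch(int cpu)
    {
        unsigned long util = sched_cpu_util(cpu);
        unsigned long max = arch_scale_cpu_capacity(cpu);

        return (100 * util) / max;
    }
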
 
@@ -7287,6 +7304,69 @@ static bool check_same_owner(struct task_struct *p)
        return match;
 }
 
+/*
+ * Allow unprivileged RT tasks to decrease priority.
+ * Only issue a capable test if needed and only once to avoid an audit
+ * event on permitted non-privileged operations:
+ */
+static int user_check_sched_setscheduler(struct task_struct *p,
+                                        const struct sched_attr *attr,
+                                        int policy, int reset_on_fork)
+{
+       if (fair_policy(policy)) {
+               if (attr->sched_nice < task_nice(p) &&
+                   !is_nice_reduction(p, attr->sched_nice))
+                       goto req_priv;
+       }
+
+       if (rt_policy(policy)) {
+               unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
+
+               /* Can't set/change the rt policy: */
+               if (policy != p->policy && !rlim_rtprio)
+                       goto req_priv;
+
+               /* Can't increase priority: */
+               if (attr->sched_priority > p->rt_priority &&
+                   attr->sched_priority > rlim_rtprio)
+                       goto req_priv;
+       }
+
+       /*
+        * Can't set/change SCHED_DEADLINE policy at all for now
+        * (safest behavior); in the future we would like to allow
+        * unprivileged DL tasks to increase their relative deadline
+        * or reduce their runtime (both ways reducing utilization)
+        */
+       if (dl_policy(policy))
+               goto req_priv;
+
+       /*
+        * Treat SCHED_IDLE as nice 20. Only allow a switch to
+        * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
+        */
+       if (task_has_idle_policy(p) && !idle_policy(policy)) {
+               if (!is_nice_reduction(p, task_nice(p)))
+                       goto req_priv;
+       }
+
+       /* Can't change other user's priorities: */
+       if (!check_same_owner(p))
+               goto req_priv;
+
+       /* Normal users shall not reset the sched_reset_on_fork flag: */
+       if (p->sched_reset_on_fork && !reset_on_fork)
+               goto req_priv;
+
+       return 0;
+
+req_priv:
+       if (!capable(CAP_SYS_NICE))
+               return -EPERM;
+
+       return 0;
+}
+
 static int __sched_setscheduler(struct task_struct *p,
                                const struct sched_attr *attr,
                                bool user, bool pi)
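
The req_priv funnel means capable(CAP_SYS_NICE) is evaluated at most once, and only when some check actually demands privilege, which avoids audit events for permitted unprivileged operations. One such permitted operation, as a hedged user-space demo (not from the patch): a SCHED_FIFO task lowering its own priority while keeping its policy, which none of the checks above escalate:

    #include <sched.h>
    #include <stdio.h>

    int main(void)
    {
        /* Assumes this task already runs SCHED_FIFO at a priority
         * above 5; lowering it does not require CAP_SYS_NICE. */
        struct sched_param sp = { .sched_priority = 5 };

        if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
            perror("sched_setscheduler");
            return 1;
        }
        printf("RT priority lowered to %d\n", sp.sched_priority);
        return 0;
    }
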
@@ -7328,58 +7408,11 @@ recheck:
            (rt_policy(policy) != (attr->sched_priority != 0)))
                return -EINVAL;
 
-       /*
-        * Allow unprivileged RT tasks to decrease priority:
-        */
-       if (user && !capable(CAP_SYS_NICE)) {
-               if (fair_policy(policy)) {
-                       if (attr->sched_nice < task_nice(p) &&
-                           !can_nice(p, attr->sched_nice))
-                               return -EPERM;
-               }
-
-               if (rt_policy(policy)) {
-                       unsigned long rlim_rtprio =
-                                       task_rlimit(p, RLIMIT_RTPRIO);
-
-                       /* Can't set/change the rt policy: */
-                       if (policy != p->policy && !rlim_rtprio)
-                               return -EPERM;
-
-                       /* Can't increase priority: */
-                       if (attr->sched_priority > p->rt_priority &&
-                           attr->sched_priority > rlim_rtprio)
-                               return -EPERM;
-               }
-
-                /*
-                 * Can't set/change SCHED_DEADLINE policy at all for now
-                 * (safest behavior); in the future we would like to allow
-                 * unprivileged DL tasks to increase their relative deadline
-                 * or reduce their runtime (both ways reducing utilization)
-                 */
-               if (dl_policy(policy))
-                       return -EPERM;
-
-               /*
-                * Treat SCHED_IDLE as nice 20. Only allow a switch to
-                * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
-                */
-               if (task_has_idle_policy(p) && !idle_policy(policy)) {
-                       if (!can_nice(p, task_nice(p)))
-                               return -EPERM;
-               }
-
-               /* Can't change other user's priorities: */
-               if (!check_same_owner(p))
-                       return -EPERM;
-
-               /* Normal users shall not reset the sched_reset_on_fork flag: */
-               if (p->sched_reset_on_fork && !reset_on_fork)
-                       return -EPERM;
-       }
-
        if (user) {
+               retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
+               if (retval)
+                       return retval;
+
                if (attr->sched_flags & SCHED_FLAG_SUGOV)
                        return -EINVAL;
 
@@ -9531,7 +9564,7 @@ static struct kmem_cache *task_group_cache __read_mostly;
 #endif
 
 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
-DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
+DECLARE_PER_CPU(cpumask_var_t, select_rq_mask);
 
 void __init sched_init(void)
 {
@@ -9580,7 +9613,7 @@ void __init sched_init(void)
        for_each_possible_cpu(i) {
                per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
                        cpumask_size(), GFP_KERNEL, cpu_to_node(i));
-               per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
+               per_cpu(select_rq_mask, i) = (cpumask_var_t)kzalloc_node(
                        cpumask_size(), GFP_KERNEL, cpu_to_node(i));
        }
 #endif /* CONFIG_CPUMASK_OFFSTACK */
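
The rename reflects that the mask is now generic scratch space for run-queue selection, not only for the idle-CPU search. Roughly how scheduler code uses such a per-CPU off-stack mask, sketched under scheduler-internal assumptions (pattern as in fair.c, function name hypothetical):

    /* Fetch this CPU's scratch mask and intersect a domain span with
     * the task's allowed CPUs; runs with preemption disabled. */
    static int first_allowed_cpu_sketch(struct task_struct *p,
                                        struct sched_domain *sd)
    {
        struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);

        cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
        return cpumask_first(cpus);
    }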